whiterose

Linux unikernel
git clone https://git.ne02ptzero.me/git/whiterose

commit 42b00f122cfbfed79fc29b0b3610f3abbb1e3864
parent 460023a5d1d2aa0f733b6708b2fae5ea9f9dfec0
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Wed, 26 Dec 2018 11:46:28 -0800

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Paolo Bonzini:
 "ARM:
   - selftests improvements
   - large PUD support for HugeTLB
   - single-stepping fixes
   - improved tracing
   - various timer and vGIC fixes

  x86:
   - Processor Tracing virtualization
   - STIBP support
   - some correctness fixes
   - refactorings and splitting of vmx.c
   - use the Hyper-V range TLB flush hypercall
   - reduce order of vcpu struct
   - WBNOINVD support
   - do not use -ftrace for __noclone functions
   - nested guest support for PAUSE filtering on AMD
   - more Hyper-V enlightenments (direct mode for synthetic timers)

  PPC:
   - nested VFIO

  s390:
   - bugfixes only this time"
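
A note on the userspace side: the Hyper-V enlightenment work above is accompanied
by a new vcpu ioctl, KVM_GET_SUPPORTED_HV_CPUID, documented in the
Documentation/virtual/kvm/api.txt hunk further down. A minimal sketch of how a
VMM might call it follows; the entry count and error handling are illustrative
assumptions, not part of this commit.

#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Query the Hyper-V CPUID leaves KVM would expose for this vcpu.
 * vcpu_fd is assumed to come from KVM_CREATE_VCPU elsewhere. */
static void dump_hv_cpuid(int vcpu_fd)
{
        unsigned int nent = 8;  /* the api.txt text lists 7 leaves today */
        struct kvm_cpuid2 *cpuid;

        cpuid = calloc(1, sizeof(*cpuid) + nent * sizeof(cpuid->entries[0]));
        if (!cpuid)
                return;
        cpuid->nent = nent;

        /* Fails with E2BIG if nent is too small for all Hyper-V leaves. */
        if (ioctl(vcpu_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid) < 0) {
                perror("KVM_GET_SUPPORTED_HV_CPUID");
                free(cpuid);
                return;
        }

        for (unsigned int i = 0; i < cpuid->nent; i++)
                printf("leaf %#x: eax=%#x ebx=%#x ecx=%#x edx=%#x\n",
                       cpuid->entries[i].function,
                       cpuid->entries[i].eax, cpuid->entries[i].ebx,
                       cpuid->entries[i].ecx, cpuid->entries[i].edx);
        free(cpuid);
}

On success the kernel adjusts nent to the number of valid entries, so the loop
above only touches filled-in leaves.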

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (171 commits)
  KVM: x86: Add CPUID support for new instruction WBNOINVD
  kvm: selftests: ucall: fix exit mmio address guessing
  Revert "compiler-gcc: disable -ftracer for __noclone functions"
  KVM: VMX: Move VM-Enter + VM-Exit handling to non-inline sub-routines
  KVM: VMX: Explicitly reference RCX as the vmx_vcpu pointer in asm blobs
  KVM: x86: Use jmp to invoke kvm_spurious_fault() from .fixup
  MAINTAINERS: Add arch/x86/kvm sub-directories to existing KVM/x86 entry
  KVM/x86: Use SVM assembly instruction mnemonics instead of .byte streams
  KVM/MMU: Flush tlb directly in the kvm_zap_gfn_range()
  KVM/MMU: Flush tlb directly in kvm_set_pte_rmapp()
  KVM/MMU: Move tlb flush in kvm_set_pte_rmapp() to kvm_mmu_notifier_change_pte()
  KVM: Make kvm_set_spte_hva() return int
  KVM: Replace old tlb flush function with new one to flush a specified range.
  KVM/MMU: Add tlb flush with range helper function
  KVM/VMX: Add hv tlb range flush support
  x86/hyper-v: Add HvFlushGuestAddressList hypercall support
  KVM: Add tlb_remote_flush_with_range callback in kvm_x86_ops
  KVM: x86: Disable Intel PT when VMXON in L1 guest
  KVM: x86: Set intercept for Intel PT MSRs read/write
  KVM: x86: Implement Intel PT MSRs read/write emulation
  ...
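
Several of the entries above add KVM_CAP_MANUAL_DIRTY_LOG_PROTECT and the
KVM_CLEAR_DIRTY_LOG vm ioctl described in the api.txt changes below. A rough
sketch of the intended userspace flow, assuming a single hypothetical 64 MiB
memslot and the constant names as introduced by this series:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define SLOT_ID    0            /* hypothetical memslot */
#define SLOT_PAGES 16384        /* 64 MiB of 4 KiB pages */

/* Enable manual re-protection once at VM setup time. */
static int enable_manual_protect(int vm_fd)
{
        struct kvm_enable_cap cap = {
                .cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT,
                .args[0] = 1,
        };

        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}

/* On every pass, fetch the dirty bitmap and hand it back so KVM
 * re-protects exactly the pages that were reported dirty. */
static int sync_dirty_log(int vm_fd, uint64_t *bitmap)
{
        struct kvm_dirty_log log = {
                .slot = SLOT_ID,
                .dirty_bitmap = bitmap,
        };
        struct kvm_clear_dirty_log clear = {
                .slot = SLOT_ID,
                .num_pages = SLOT_PAGES,        /* multiple of 64 */
                .first_page = 0,                /* multiple of 64 */
                .dirty_bitmap = bitmap,
        };

        /* With the cap enabled, GET no longer write-protects anything. */
        if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0)
                return -1;

        /* ... consume the dirty pages here, then re-protect them ... */

        return ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);
}

Per the documentation below, first_page and num_pages must be multiples of 64,
and every bit set in the bitmap passed to KVM_CLEAR_DIRTY_LOG marks the
corresponding page clean and re-enables dirty tracking for it.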

Diffstat:
MDocumentation/virtual/kvm/api.txt | 136++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
MMAINTAINERS | 1+
March/arm/include/asm/kvm_asm.h | 4++++
March/arm/include/asm/kvm_host.h | 7+------
March/arm/include/asm/kvm_mmu.h | 61+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
March/arm/include/asm/stage2_pgtable.h | 8++++++++
March/arm/kvm/coproc.c | 4++--
March/arm64/include/asm/kvm_arm.h | 6+-----
March/arm64/include/asm/kvm_asm.h | 7+++++++
March/arm64/include/asm/kvm_emulate.h | 35+++++++++++++++++++++++++++--------
March/arm64/include/asm/kvm_host.h | 5++---
March/arm64/include/asm/kvm_mmu.h | 48++++++++++++++++++++++++++++++++++++++++++++++++
March/arm64/include/asm/pgtable-hwdef.h | 4++++
March/arm64/include/asm/pgtable.h | 9+++++++++
March/arm64/include/asm/stage2_pgtable.h | 16+++++++---------
March/arm64/kvm/debug.c | 21---------------------
March/arm64/kvm/handle_exit.c | 14+-------------
March/arm64/kvm/hyp/switch.c | 43++++---------------------------------------
March/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c | 12+++++++++---
March/arm64/kvm/sys_regs.c | 12+++++++-----
March/arm64/kvm/sys_regs.h | 4++++
March/arm64/kvm/trace.h | 35+++++++++++++++++++++++++++++++++++
March/mips/include/asm/kvm_host.h | 2+-
March/mips/kvm/mips.c | 29++++++++++++++++++++++++++---
March/mips/kvm/mmu.c | 3++-
March/powerpc/include/asm/hvcall.h | 1+
March/powerpc/include/asm/kvm_book3s.h | 23++++++++++++++++-------
March/powerpc/include/asm/kvm_book3s_64.h | 18+++++++++++++++++-
March/powerpc/include/asm/kvm_host.h | 5++++-
March/powerpc/include/asm/kvm_ppc.h | 10++++++++--
March/powerpc/kernel/exceptions-64s.S | 9+++++++++
March/powerpc/kvm/book3s.c | 8+++++---
March/powerpc/kvm/book3s_64_mmu_hv.c | 12++++++++----
March/powerpc/kvm/book3s_64_mmu_radix.c | 160+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
March/powerpc/kvm/book3s_hv.c | 95+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
March/powerpc/kvm/book3s_hv_nested.c | 190+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
March/powerpc/kvm/book3s_hv_rm_mmu.c | 2+-
March/powerpc/kvm/book3s_pr.c | 4+++-
March/powerpc/kvm/book3s_xics.c | 12+-----------
March/powerpc/kvm/book3s_xive.c | 12+-----------
March/powerpc/kvm/booke.c | 3++-
March/powerpc/kvm/e500_mmu_host.c | 3++-
March/powerpc/kvm/powerpc.c | 47+++++++++++++++++++++++++++++++----------------
March/powerpc/mm/fault.c | 1+
March/s390/kvm/kvm-s390.c | 35++++++++++++++++++-----------------
March/x86/events/intel/pt.c | 60+++++++++++++++++++++++++++++++++++-------------------------
March/x86/events/intel/pt.h | 58----------------------------------------------------------
March/x86/hyperv/nested.c | 80+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
March/x86/include/asm/cpufeatures.h | 1+
March/x86/include/asm/hyperv-tlfs.h | 335++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
March/x86/include/asm/intel_pt.h | 26++++++++++++++++++++++++++
March/x86/include/asm/kvm_host.h | 25+++++++++++++++++--------
March/x86/include/asm/mshyperv.h | 15+++++++++++++++
March/x86/include/asm/msr-index.h | 37+++++++++++++++++++++++++++++++++++++
March/x86/include/asm/svm.h | 7-------
March/x86/include/asm/trace/hyperv.h | 14++++++++++++++
March/x86/include/asm/vmx.h | 9+++++++++
March/x86/kernel/kvmclock.c | 15+--------------
March/x86/kvm/Makefile | 2+-
March/x86/kvm/cpuid.c | 31+++++++++++++++++++++++--------
March/x86/kvm/hyperv.c | 305++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
March/x86/kvm/hyperv.h | 4++++
March/x86/kvm/kvm_cache_regs.h | 2++
March/x86/kvm/lapic.c | 5++---
March/x86/kvm/mmu.c | 98+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
March/x86/kvm/paging_tmpl.h | 3++-
March/x86/kvm/svm.c | 68+++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
March/x86/kvm/trace.h | 10++++++----
Darch/x86/kvm/vmx.c | 15252-------------------------------------------------------------------------------
Aarch/x86/kvm/vmx/capabilities.h | 343+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/kvm/vmx/evmcs.c | 352+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/kvm/vmx/evmcs.h | 202+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/kvm/vmx/nested.c | 5721+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/kvm/vmx/nested.h | 282+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/kvm/vmx/ops.h | 285+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Rarch/x86/kvm/pmu_intel.c -> arch/x86/kvm/vmx/pmu_intel.c | 0
Aarch/x86/kvm/vmx/vmcs.h | 136+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/kvm/vmx/vmcs12.c | 157+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/kvm/vmx/vmcs12.h | 462+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Rarch/x86/kvm/vmx_shadow_fields.h -> arch/x86/kvm/vmx/vmcs_shadow_fields.h | 0
Aarch/x86/kvm/vmx/vmenter.S | 57+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/kvm/vmx/vmx.c | 7935+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/kvm/vmx/vmx.h | 519+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Darch/x86/kvm/vmx_evmcs.h | 324-------------------------------------------------------------------------------
March/x86/kvm/x86.c | 161+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
Mdrivers/hv/hv.c | 2+-
Mdrivers/hv/hyperv_vmbus.h | 68--------------------------------------------------------------------
Minclude/kvm/arm_arch_timer.h | 4----
Minclude/linux/compiler_attributes.h | 9+--------
Minclude/linux/kvm_host.h | 12++++++++++--
Minclude/uapi/linux/kvm.h | 19+++++++++++++++++++
Mtools/kvm/kvm_stat/kvm_stat | 2+-
Mtools/testing/selftests/android/Makefile | 2+-
Mtools/testing/selftests/futex/functional/Makefile | 1+
Mtools/testing/selftests/gpio/Makefile | 6+++---
Mtools/testing/selftests/kvm/Makefile | 5++++-
Atools/testing/selftests/kvm/clear_dirty_log_test.c | 2++
Mtools/testing/selftests/kvm/dirty_log_test.c | 165++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
Mtools/testing/selftests/kvm/include/kvm_util.h | 8++++++++
Mtools/testing/selftests/kvm/lib/aarch64/processor.c | 18++++++++++++------
Mtools/testing/selftests/kvm/lib/kvm_util.c | 67++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
Mtools/testing/selftests/kvm/lib/kvm_util_internal.h | 1+
Mtools/testing/selftests/kvm/lib/ucall.c | 36+++++++++++++++++++++---------------
Mtools/testing/selftests/kvm/x86_64/evmcs_test.c | 4++--
Atools/testing/selftests/kvm/x86_64/hyperv_cpuid.c | 157+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtools/testing/selftests/kvm/x86_64/state_test.c | 4++--
Mtools/testing/selftests/lib.mk | 8++++----
Mtools/testing/selftests/networking/timestamping/Makefile | 1+
Mtools/testing/selftests/tc-testing/bpf/Makefile | 1+
Mtools/testing/selftests/vm/Makefile | 1+
Mvirt/kvm/arm/arch_timer.c | 35++++++-----------------------------
Mvirt/kvm/arm/arm.c | 47++++++++++++++++++++++++++++++-----------------
Mvirt/kvm/arm/hyp/vgic-v3-sr.c | 6+++++-
Mvirt/kvm/arm/mmio.c | 11++++++-----
Mvirt/kvm/arm/mmu.c | 390+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
Mvirt/kvm/arm/trace.h | 18+++++++++---------
Mvirt/kvm/arm/vgic/vgic-mmio.c | 44++++++++++++++++++++------------------------
Mvirt/kvm/arm/vgic/vgic.c | 13+++++++++----
Mvirt/kvm/async_pf.c | 2+-
Mvirt/kvm/kvm_main.c | 208++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
120 files changed, 19323 insertions(+), 16628 deletions(-)
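
The diffstat above also shows the one-line cpufeatures.h addition and the
cpuid.c changes behind "KVM: x86: Add CPUID support for new instruction
WBNOINVD" from the shortlog. A sketch of how userspace could check whether KVM
advertises the bit; the leaf/bit position (0x80000008, EBX bit 9) is taken from
the CPUID documentation rather than from this page and should be treated as an
assumption:

#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Returns 1 if KVM_GET_SUPPORTED_CPUID reports WBNOINVD, 0 if not,
 * -1 on error. The system ioctl is issued on the /dev/kvm fd itself. */
static int kvm_reports_wbnoinvd(void)
{
        unsigned int nent = 128;
        int found = 0;
        struct kvm_cpuid2 *cpuid;
        int kvm_fd = open("/dev/kvm", O_RDWR);

        if (kvm_fd < 0)
                return -1;
        cpuid = calloc(1, sizeof(*cpuid) + nent * sizeof(cpuid->entries[0]));
        if (!cpuid) {
                close(kvm_fd);
                return -1;
        }
        cpuid->nent = nent;

        if (ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid) == 0) {
                for (unsigned int i = 0; i < cpuid->nent; i++)
                        if (cpuid->entries[i].function == 0x80000008 &&
                            (cpuid->entries[i].ebx & (1u << 9)))
                                found = 1;
        }

        free(cpuid);
        close(kvm_fd);
        return found;
}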

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt @@ -305,6 +305,9 @@ the address space for which you want to return the dirty bitmap. They must be less than the value that KVM_CHECK_EXTENSION returns for the KVM_CAP_MULTI_ADDRESS_SPACE capability. +The bits in the dirty bitmap are cleared before the ioctl returns, unless +KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is enabled. For more information, +see the description of the capability. 4.9 KVM_SET_MEMORY_ALIAS @@ -1129,10 +1132,15 @@ documentation when it pops into existence). 4.37 KVM_ENABLE_CAP -Capability: KVM_CAP_ENABLE_CAP, KVM_CAP_ENABLE_CAP_VM -Architectures: x86 (only KVM_CAP_ENABLE_CAP_VM), - mips (only KVM_CAP_ENABLE_CAP), ppc, s390 -Type: vcpu ioctl, vm ioctl (with KVM_CAP_ENABLE_CAP_VM) +Capability: KVM_CAP_ENABLE_CAP +Architectures: mips, ppc, s390 +Type: vcpu ioctl +Parameters: struct kvm_enable_cap (in) +Returns: 0 on success; -1 on error + +Capability: KVM_CAP_ENABLE_CAP_VM +Architectures: all +Type: vcpu ioctl Parameters: struct kvm_enable_cap (in) Returns: 0 on success; -1 on error @@ -3753,6 +3761,102 @@ Coalesced pio is based on coalesced mmio. There is little difference between coalesced mmio and pio except that coalesced pio records accesses to I/O ports. +4.117 KVM_CLEAR_DIRTY_LOG (vm ioctl) + +Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_dirty_log (in) +Returns: 0 on success, -1 on error + +/* for KVM_CLEAR_DIRTY_LOG */ +struct kvm_clear_dirty_log { + __u32 slot; + __u32 num_pages; + __u64 first_page; + union { + void __user *dirty_bitmap; /* one bit per page */ + __u64 padding; + }; +}; + +The ioctl clears the dirty status of pages in a memory slot, according to +the bitmap that is passed in struct kvm_clear_dirty_log's dirty_bitmap +field. Bit 0 of the bitmap corresponds to page "first_page" in the +memory slot, and num_pages is the size in bits of the input bitmap. +Both first_page and num_pages must be a multiple of 64. For each bit +that is set in the input bitmap, the corresponding page is marked "clean" +in KVM's dirty bitmap, and dirty tracking is re-enabled for that page +(for example via write-protection, or by clearing the dirty bit in +a page table entry). + +If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specifies +the address space for which you want to return the dirty bitmap. +They must be less than the value that KVM_CHECK_EXTENSION returns for +the KVM_CAP_MULTI_ADDRESS_SPACE capability. + +This ioctl is mostly useful when KVM_CAP_MANUAL_DIRTY_LOG_PROTECT +is enabled; for more information, see the description of the capability. +However, it can always be used as long as KVM_CHECK_EXTENSION confirms +that KVM_CAP_MANUAL_DIRTY_LOG_PROTECT is present. + +4.118 KVM_GET_SUPPORTED_HV_CPUID + +Capability: KVM_CAP_HYPERV_CPUID +Architectures: x86 +Type: vcpu ioctl +Parameters: struct kvm_cpuid2 (in/out) +Returns: 0 on success, -1 on error + +struct kvm_cpuid2 { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry2 entries[0]; +}; + +struct kvm_cpuid_entry2 { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; +}; + +This ioctl returns x86 cpuid features leaves related to Hyper-V emulation in +KVM. Userspace can use the information returned by this ioctl to construct +cpuid information presented to guests consuming Hyper-V enlightenments (e.g. +Windows or Hyper-V guests). 
+ +CPUID feature leaves returned by this ioctl are defined by Hyper-V Top Level +Functional Specification (TLFS). These leaves can't be obtained with +KVM_GET_SUPPORTED_CPUID ioctl because some of them intersect with KVM feature +leaves (0x40000000, 0x40000001). + +Currently, the following list of CPUID leaves are returned: + HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS + HYPERV_CPUID_INTERFACE + HYPERV_CPUID_VERSION + HYPERV_CPUID_FEATURES + HYPERV_CPUID_ENLIGHTMENT_INFO + HYPERV_CPUID_IMPLEMENT_LIMITS + HYPERV_CPUID_NESTED_FEATURES + +HYPERV_CPUID_NESTED_FEATURES leaf is only exposed when Enlightened VMCS was +enabled on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS). + +Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure +with the 'nent' field indicating the number of entries in the variable-size +array 'entries'. If the number of entries is too low to describe all Hyper-V +feature leaves, an error (E2BIG) is returned. If the number is more or equal +to the number of Hyper-V feature leaves, the 'nent' field is adjusted to the +number of valid entries in the 'entries' array, which is then filled. + +'index' and 'flags' fields in 'struct kvm_cpuid_entry2' are currently reserved, +userspace should not expect to get any particular value there. + 5. The kvm_run structure ------------------------ @@ -4647,6 +4751,30 @@ and injected exceptions. * For the new DR6 bits, note that bit 16 is set iff the #DB exception will clear DR6.RTM. +7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT + +Architectures: all +Parameters: args[0] whether feature should be enabled or not + +With this capability enabled, KVM_GET_DIRTY_LOG will not automatically +clear and write-protect all pages that are returned as dirty. +Rather, userspace will have to do this operation separately using +KVM_CLEAR_DIRTY_LOG. + +At the cost of a slightly more complicated operation, this provides better +scalability and responsiveness for two reasons. First, +KVM_CLEAR_DIRTY_LOG ioctl can operate on a 64-page granularity rather +than requiring to sync a full memslot; this ensures that KVM does not +take spinlocks for an extended period of time. Second, in some cases a +large amount of time can pass between a call to KVM_GET_DIRTY_LOG and +userspace actually using the data in the page. Pages can be modified +during this time, which is inefficint for both the guest and userspace: +the guest will incur a higher penalty due to write protection faults, +while userspace can see false reports of dirty pages. Manual reprotection +helps reducing this time, improving guest performance and reducing the +number of dirty log false positives. + + 8. Other capabilities. 
---------------------- diff --git a/MAINTAINERS b/MAINTAINERS @@ -8309,6 +8309,7 @@ W: http://www.linux-kvm.org T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git S: Supported F: arch/x86/kvm/ +F: arch/x86/kvm/*/ F: arch/x86/include/uapi/asm/kvm* F: arch/x86/include/asm/kvm* F: arch/x86/include/asm/pvclock-abi.h diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h @@ -23,6 +23,10 @@ #define ARM_EXIT_WITH_ABORT_BIT 31 #define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_ABORT_BIT)) +#define ARM_EXCEPTION_IS_TRAP(x) \ + (ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_PREF_ABORT || \ + ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_DATA_ABORT || \ + ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_HVC) #define ARM_ABORT_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_ABORT_BIT)) #define ARM_EXCEPTION_RESET 0 diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h @@ -225,7 +225,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); @@ -296,11 +296,6 @@ static inline void kvm_arm_init_debug(void) {} static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {} -static inline bool kvm_arm_handle_step_debug(struct kvm_vcpu *vcpu, - struct kvm_run *run) -{ - return false; -} int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h @@ -82,6 +82,67 @@ void kvm_clear_hyp_idmap(void); #define kvm_mk_pud(pmdp) __pud(__pa(pmdp) | PMD_TYPE_TABLE) #define kvm_mk_pgd(pudp) ({ BUILD_BUG(); 0; }) +#define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot) +#define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot) +#define kvm_pfn_pud(pfn, prot) (__pud(0)) + +#define kvm_pud_pfn(pud) ({ WARN_ON(1); 0; }) + + +#define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd) +/* No support for pud hugepages */ +#define kvm_pud_mkhuge(pud) ( {WARN_ON(1); pud; }) + +/* + * The following kvm_*pud*() functions are provided strictly to allow + * sharing code with arm64. They should never be called in practice. 
+ */ +static inline void kvm_set_s2pud_readonly(pud_t *pud) +{ + WARN_ON(1); +} + +static inline bool kvm_s2pud_readonly(pud_t *pud) +{ + WARN_ON(1); + return false; +} + +static inline void kvm_set_pud(pud_t *pud, pud_t new_pud) +{ + WARN_ON(1); +} + +static inline pud_t kvm_s2pud_mkwrite(pud_t pud) +{ + WARN_ON(1); + return pud; +} + +static inline pud_t kvm_s2pud_mkexec(pud_t pud) +{ + WARN_ON(1); + return pud; +} + +static inline bool kvm_s2pud_exec(pud_t *pud) +{ + WARN_ON(1); + return false; +} + +static inline pud_t kvm_s2pud_mkyoung(pud_t pud) +{ + BUG(); + return pud; +} + +static inline bool kvm_s2pud_young(pud_t pud) +{ + WARN_ON(1); + return false; +} + static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { pte_val(pte) |= L_PTE_S2_RDWR; diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h @@ -68,4 +68,12 @@ stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) #define stage2_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) #define stage2_pud_table_empty(kvm, pudp) false +static inline bool kvm_stage2_has_pud(struct kvm *kvm) +{ + return false; +} + +#define S2_PMD_MASK PMD_MASK +#define S2_PMD_SIZE PMD_SIZE + #endif /* __ARM_S2_PGTABLE_H_ */ diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c @@ -602,8 +602,8 @@ static int emulate_cp15(struct kvm_vcpu *vcpu, } } else { /* If access function fails, it should complain. */ - kvm_err("Unsupported guest CP15 access at: %08lx\n", - *vcpu_pc(vcpu)); + kvm_err("Unsupported guest CP15 access at: %08lx [%08lx]\n", + *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); print_cp_instr(params); kvm_inject_undefined(vcpu); } diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h @@ -107,7 +107,7 @@ TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK) /* VTCR_EL2 Registers bits */ -#define VTCR_EL2_RES1 (1 << 31) +#define VTCR_EL2_RES1 (1U << 31) #define VTCR_EL2_HD (1 << 22) #define VTCR_EL2_HA (1 << 21) #define VTCR_EL2_PS_SHIFT TCR_EL2_PS_SHIFT @@ -323,10 +323,6 @@ #define PAR_TO_HPFAR(par) \ (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8) -#define kvm_arm_exception_type \ - {0, "IRQ" }, \ - {1, "TRAP" } - #define ECN(x) { ESR_ELx_EC_##x, #x } #define kvm_arm_exception_class \ diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h @@ -25,6 +25,7 @@ #define ARM_EXIT_WITH_SERROR_BIT 31 #define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT)) +#define ARM_EXCEPTION_IS_TRAP(x) (ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_TRAP) #define ARM_SERROR_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_SERROR_BIT)) #define ARM_EXCEPTION_IRQ 0 @@ -34,6 +35,12 @@ /* The hyp-stub will return this for any kvm_call_hyp() call */ #define ARM_EXCEPTION_HYP_GONE HVC_STUB_ERR +#define kvm_arm_exception_type \ + {ARM_EXCEPTION_IRQ, "IRQ" }, \ + {ARM_EXCEPTION_EL1_SERROR, "SERROR" }, \ + {ARM_EXCEPTION_TRAP, "TRAP" }, \ + {ARM_EXCEPTION_HYP_GONE, "HYP_GONE" } + #ifndef __ASSEMBLY__ #include <linux/mm.h> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h @@ -24,6 +24,7 @@ #include <linux/kvm_host.h> +#include <asm/debug-monitors.h> #include <asm/esr.h> #include <asm/kvm_arm.h> #include <asm/kvm_hyp.h> @@ -147,14 +148,6 @@ static inline bool kvm_condition_valid(const struct kvm_vcpu *vcpu) return true; } -static inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr) -{ - if (vcpu_mode_is_32bit(vcpu)) - kvm_skip_instr32(vcpu, is_wide_instr); - else - *vcpu_pc(vcpu) += 4; -} - static inline void 
vcpu_set_thumb(struct kvm_vcpu *vcpu) { *vcpu_cpsr(vcpu) |= PSR_AA32_T_BIT; @@ -424,4 +417,30 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, return data; /* Leave LE untouched */ } +static inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr) +{ + if (vcpu_mode_is_32bit(vcpu)) + kvm_skip_instr32(vcpu, is_wide_instr); + else + *vcpu_pc(vcpu) += 4; + + /* advance the singlestep state machine */ + *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; +} + +/* + * Skip an instruction which has been emulated at hyp while most guest sysregs + * are live. + */ +static inline void __hyp_text __kvm_skip_instr(struct kvm_vcpu *vcpu) +{ + *vcpu_pc(vcpu) = read_sysreg_el2(elr); + vcpu->arch.ctxt.gp_regs.regs.pstate = read_sysreg_el2(spsr); + + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + + write_sysreg_el2(vcpu->arch.ctxt.gp_regs.regs.pstate, spsr); + write_sysreg_el2(*vcpu_pc(vcpu), elr); +} + #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h @@ -319,7 +319,7 @@ struct kvm_vcpu_arch { */ #define __vcpu_sys_reg(v,r) ((v)->arch.ctxt.sys_regs[(r)]) -u64 vcpu_read_sys_reg(struct kvm_vcpu *vcpu, int reg); +u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg); void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg); /* @@ -360,7 +360,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); @@ -449,7 +449,6 @@ void kvm_arm_init_debug(void); void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); -bool kvm_arm_handle_step_debug(struct kvm_vcpu *vcpu, struct kvm_run *run); int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h @@ -184,6 +184,17 @@ void kvm_clear_hyp_idmap(void); #define kvm_mk_pgd(pudp) \ __pgd(__phys_to_pgd_val(__pa(pudp)) | PUD_TYPE_TABLE) +#define kvm_set_pud(pudp, pud) set_pud(pudp, pud) + +#define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot) +#define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot) +#define kvm_pfn_pud(pfn, prot) pfn_pud(pfn, prot) + +#define kvm_pud_pfn(pud) pud_pfn(pud) + +#define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd) +#define kvm_pud_mkhuge(pud) pud_mkhuge(pud) + static inline pte_t kvm_s2pte_mkwrite(pte_t pte) { pte_val(pte) |= PTE_S2_RDWR; @@ -196,6 +207,12 @@ static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd) return pmd; } +static inline pud_t kvm_s2pud_mkwrite(pud_t pud) +{ + pud_val(pud) |= PUD_S2_RDWR; + return pud; +} + static inline pte_t kvm_s2pte_mkexec(pte_t pte) { pte_val(pte) &= ~PTE_S2_XN; @@ -208,6 +225,12 @@ static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd) return pmd; } +static inline pud_t kvm_s2pud_mkexec(pud_t pud) +{ + pud_val(pud) &= ~PUD_S2_XN; + return pud; +} + static inline void kvm_set_s2pte_readonly(pte_t *ptep) { pteval_t old_pteval, pteval; @@ -246,6 +269,31 @@ static inline bool kvm_s2pmd_exec(pmd_t *pmdp) return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN); } +static inline void 
kvm_set_s2pud_readonly(pud_t *pudp) +{ + kvm_set_s2pte_readonly((pte_t *)pudp); +} + +static inline bool kvm_s2pud_readonly(pud_t *pudp) +{ + return kvm_s2pte_readonly((pte_t *)pudp); +} + +static inline bool kvm_s2pud_exec(pud_t *pudp) +{ + return !(READ_ONCE(pud_val(*pudp)) & PUD_S2_XN); +} + +static inline pud_t kvm_s2pud_mkyoung(pud_t pud) +{ + return pud_mkyoung(pud); +} + +static inline bool kvm_s2pud_young(pud_t pud) +{ + return pud_young(pud); +} + #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) #ifdef __PAGETABLE_PMD_FOLDED diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h @@ -193,6 +193,10 @@ #define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ #define PMD_S2_XN (_AT(pmdval_t, 2) << 53) /* XN[1:0] */ +#define PUD_S2_RDONLY (_AT(pudval_t, 1) << 6) /* HAP[2:1] */ +#define PUD_S2_RDWR (_AT(pudval_t, 3) << 6) /* HAP[2:1] */ +#define PUD_S2_XN (_AT(pudval_t, 2) << 53) /* XN[1:0] */ + /* * Memory Attribute override for Stage-2 (MemAttr[3:0]) */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h @@ -315,6 +315,11 @@ static inline pte_t pud_pte(pud_t pud) return __pte(pud_val(pud)); } +static inline pud_t pte_pud(pte_t pte) +{ + return __pud(pte_val(pte)); +} + static inline pmd_t pud_pmd(pud_t pud) { return __pmd(pud_val(pud)); @@ -382,8 +387,12 @@ static inline int pmd_protnone(pmd_t pmd) #define pfn_pmd(pfn,prot) __pmd(__phys_to_pmd_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) +#define pud_young(pud) pte_young(pud_pte(pud)) +#define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) #define pud_write(pud) pte_write(pud_pte(pud)) +#define pud_mkhuge(pud) (__pud(pud_val(pud) & ~PUD_TABLE_BIT)) + #define __pud_to_phys(pud) __pte_to_phys(pud_pte(pud)) #define __phys_to_pud_val(phys) __phys_to_pte_val(phys) #define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT) diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h @@ -30,16 +30,14 @@ #define pt_levels_pgdir_shift(lvls) ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls)) /* - * The hardware supports concatenation of up to 16 tables at stage2 entry level - * and we use the feature whenever possible. + * The hardware supports concatenation of up to 16 tables at stage2 entry + * level and we use the feature whenever possible, which means we resolve 4 + * additional bits of address at the entry level. * - * Now, the minimum number of bits resolved at any level is (PAGE_SHIFT - 3). - * On arm64, the smallest PAGE_SIZE supported is 4k, which means - * (PAGE_SHIFT - 3) > 4 holds for all page sizes. - * This implies, the total number of page table levels at stage2 expected - * by the hardware is actually the number of levels required for (IPA_SHIFT - 4) - * in normal translations(e.g, stage1), since we cannot have another level in - * the range (IPA_SHIFT, IPA_SHIFT - 4). + * This implies, the total number of page table levels required for + * IPA_SHIFT at stage2 expected by the hardware can be calculated using + * the same logic used for the (non-collapsable) stage1 page tables but for + * (IPA_SHIFT - 4). 
*/ #define stage2_pgtable_levels(ipa) ARM64_HW_PGTABLE_LEVELS((ipa) - 4) #define kvm_stage2_levels(kvm) VTCR_EL2_LVLS(kvm->arch.vtcr) diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c @@ -236,24 +236,3 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) } } } - - -/* - * After successfully emulating an instruction, we might want to - * return to user space with a KVM_EXIT_DEBUG. We can only do this - * once the emulation is complete, though, so for userspace emulations - * we have to wait until we have re-entered KVM before calling this - * helper. - * - * Return true (and set exit_reason) to return to userspace or false - * if no further action is required. - */ -bool kvm_arm_handle_step_debug(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - run->exit_reason = KVM_EXIT_DEBUG; - run->debug.arch.hsr = ESR_ELx_EC_SOFTSTP_LOW << ESR_ELx_EC_SHIFT; - return true; - } - return false; -} diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c @@ -247,13 +247,6 @@ static int handle_trap_exceptions(struct kvm_vcpu *vcpu, struct kvm_run *run) handled = exit_handler(vcpu, run); } - /* - * kvm_arm_handle_step_debug() sets the exit_reason on the kvm_run - * structure if we need to return to userspace. - */ - if (handled > 0 && kvm_arm_handle_step_debug(vcpu, run)) - handled = 0; - return handled; } @@ -287,12 +280,7 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, case ARM_EXCEPTION_IRQ: return 1; case ARM_EXCEPTION_EL1_SERROR: - /* We may still need to return for single-step */ - if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS) - && kvm_arm_handle_step_debug(vcpu, run)) - return 0; - else - return 1; + return 1; case ARM_EXCEPTION_TRAP: return handle_trap_exceptions(vcpu, run); case ARM_EXCEPTION_HYP_GONE: diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c @@ -313,33 +313,6 @@ static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu) return true; } -/* Skip an instruction which has been emulated. Returns true if - * execution can continue or false if we need to exit hyp mode because - * single-step was in effect. - */ -static bool __hyp_text __skip_instr(struct kvm_vcpu *vcpu) -{ - *vcpu_pc(vcpu) = read_sysreg_el2(elr); - - if (vcpu_mode_is_32bit(vcpu)) { - vcpu->arch.ctxt.gp_regs.regs.pstate = read_sysreg_el2(spsr); - kvm_skip_instr32(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - write_sysreg_el2(vcpu->arch.ctxt.gp_regs.regs.pstate, spsr); - } else { - *vcpu_pc(vcpu) += 4; - } - - write_sysreg_el2(*vcpu_pc(vcpu), elr); - - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - vcpu->arch.fault.esr_el2 = - (ESR_ELx_EC_SOFTSTP_LOW << ESR_ELx_EC_SHIFT) | 0x22; - return false; - } else { - return true; - } -} - static bool __hyp_text __hyp_switch_fpsimd(struct kvm_vcpu *vcpu) { struct user_fpsimd_state *host_fpsimd = vcpu->arch.host_fpsimd_state; @@ -428,20 +401,12 @@ static bool __hyp_text fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) if (valid) { int ret = __vgic_v2_perform_cpuif_access(vcpu); - if (ret == 1 && __skip_instr(vcpu)) + if (ret == 1) return true; - if (ret == -1) { - /* Promote an illegal access to an - * SError. If we would be returning - * due to single-step clear the SS - * bit so handle_exit knows what to - * do after dealing with the error. 
- */ - if (!__skip_instr(vcpu)) - *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; + /* Promote an illegal access to an SError.*/ + if (ret == -1) *exit_code = ARM_EXCEPTION_EL1_SERROR; - } goto exit; } @@ -452,7 +417,7 @@ static bool __hyp_text fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_CP15_32)) { int ret = __vgic_v3_perform_cpuif_access(vcpu); - if (ret == 1 && __skip_instr(vcpu)) + if (ret == 1) return true; } diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c @@ -41,7 +41,7 @@ static bool __hyp_text __is_be(struct kvm_vcpu *vcpu) * Returns: * 1: GICV access successfully performed * 0: Not a GICV access - * -1: Illegal GICV access + * -1: Illegal GICV access successfully performed */ int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) { @@ -61,12 +61,16 @@ int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) return 0; /* Reject anything but a 32bit access */ - if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32)) + if (kvm_vcpu_dabt_get_as(vcpu) != sizeof(u32)) { + __kvm_skip_instr(vcpu); return -1; + } /* Not aligned? Don't bother */ - if (fault_ipa & 3) + if (fault_ipa & 3) { + __kvm_skip_instr(vcpu); return -1; + } rd = kvm_vcpu_dabt_get_rd(vcpu); addr = hyp_symbol_addr(kvm_vgic_global_state)->vcpu_hyp_va; @@ -88,5 +92,7 @@ int __hyp_text __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu) vcpu_set_reg(vcpu, rd, data); } + __kvm_skip_instr(vcpu); + return 1; } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c @@ -76,7 +76,7 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu, return false; } -u64 vcpu_read_sys_reg(struct kvm_vcpu *vcpu, int reg) +u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) { if (!vcpu->arch.sysregs_loaded_on_cpu) goto immediate_read; @@ -1858,6 +1858,8 @@ static void perform_access(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r) { + trace_kvm_sys_access(*vcpu_pc(vcpu), params, r); + /* * Not having an accessor means that we have configured a trap * that we don't know how to handle. This certainly qualifies @@ -1920,8 +1922,8 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu, WARN_ON(1); } - kvm_err("Unsupported guest CP%d access at: %08lx\n", - cp, *vcpu_pc(vcpu)); + kvm_err("Unsupported guest CP%d access at: %08lx [%08lx]\n", + cp, *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); print_sys_reg_instr(params); kvm_inject_undefined(vcpu); } @@ -2071,8 +2073,8 @@ static int emulate_sys_reg(struct kvm_vcpu *vcpu, if (likely(r)) { perform_access(vcpu, params, r); } else { - kvm_err("Unsupported guest sys_reg access at: %lx\n", - *vcpu_pc(vcpu)); + kvm_err("Unsupported guest sys_reg access at: %lx [%08lx]\n", + *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); print_sys_reg_instr(params); kvm_inject_undefined(vcpu); } diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h @@ -35,6 +35,9 @@ struct sys_reg_params { }; struct sys_reg_desc { + /* Sysreg string for debug */ + const char *name; + /* MRS/MSR instruction which accesses it. 
*/ u8 Op0; u8 Op1; @@ -130,6 +133,7 @@ const struct sys_reg_desc *find_reg_by_id(u64 id, #define Op2(_x) .Op2 = _x #define SYS_DESC(reg) \ + .name = #reg, \ Op0(sys_reg_Op0(reg)), Op1(sys_reg_Op1(reg)), \ CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ Op2(sys_reg_Op2(reg)) diff --git a/arch/arm64/kvm/trace.h b/arch/arm64/kvm/trace.h @@ -3,6 +3,7 @@ #define _TRACE_ARM64_KVM_H #include <linux/tracepoint.h> +#include "sys_regs.h" #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm @@ -152,6 +153,40 @@ TRACE_EVENT(kvm_handle_sys_reg, TP_printk("HSR 0x%08lx", __entry->hsr) ); +TRACE_EVENT(kvm_sys_access, + TP_PROTO(unsigned long vcpu_pc, struct sys_reg_params *params, const struct sys_reg_desc *reg), + TP_ARGS(vcpu_pc, params, reg), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_pc) + __field(bool, is_write) + __field(const char *, name) + __field(u8, Op0) + __field(u8, Op1) + __field(u8, CRn) + __field(u8, CRm) + __field(u8, Op2) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->is_write = params->is_write; + __entry->name = reg->name; + __entry->Op0 = reg->Op0; + __entry->Op0 = reg->Op0; + __entry->Op1 = reg->Op1; + __entry->CRn = reg->CRn; + __entry->CRm = reg->CRm; + __entry->Op2 = reg->Op2; + ), + + TP_printk("PC: %lx %s (%d,%d,%d,%d,%d) %s", + __entry->vcpu_pc, __entry->name ?: "UNKN", + __entry->Op0, __entry->Op1, __entry->CRn, + __entry->CRm, __entry->Op2, + __entry->is_write ? "write" : "read") +); + TRACE_EVENT(kvm_set_guest_debug, TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), TP_ARGS(vcpu, guest_debug), diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h @@ -936,7 +936,7 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c @@ -1004,14 +1004,37 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - bool is_dirty = false; + bool flush = false; int r; mutex_lock(&kvm->slots_lock); - r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); + r = kvm_get_dirty_log_protect(kvm, log, &flush); - if (is_dirty) { + if (flush) { + slots = kvm_memslots(kvm); + memslot = id_to_memslot(slots, log->slot); + + /* Let implementation handle TLB/GVA invalidation */ + kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot); + } + + mutex_unlock(&kvm->slots_lock); + return r; +} + +int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + bool flush = false; + int r; + + mutex_lock(&kvm->slots_lock); + + r = kvm_clear_dirty_log_protect(kvm, log, &flush); + + if (flush) { slots = kvm_memslots(kvm); memslot = id_to_memslot(slots, log->slot); diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c @@ -551,7 +551,7 @@ static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, (pte_dirty(old_pte) && !pte_dirty(hva_pte)); } -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { unsigned long end = hva + PAGE_SIZE; int ret; @@ 
-559,6 +559,7 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte); if (ret) kvm_mips_callbacks->flush_shadow_all(kvm); + return 0; } static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h @@ -335,6 +335,7 @@ #define H_SET_PARTITION_TABLE 0xF800 #define H_ENTER_NESTED 0xF804 #define H_TLB_INVALIDATE 0xF808 +#define H_COPY_TOFROM_GUEST 0xF80C /* Values for 2nd argument to H_SET_MODE */ #define H_SET_MODE_RESOURCE_SET_CIABR 1 diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h @@ -188,6 +188,13 @@ extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc); extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long ea, unsigned long dsisr); +extern unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid, + gva_t eaddr, void *to, void *from, + unsigned long n); +extern long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, + void *to, unsigned long n); +extern long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, + void *from, unsigned long n); extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *gpte, u64 root, u64 *pte_ret_p); @@ -196,8 +203,11 @@ extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr, int table_index, u64 *pte_ret_p); extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *gpte, bool data, bool iswrite); +extern void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, + unsigned int pshift, unsigned int lpid); extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa, - unsigned int shift, struct kvm_memory_slot *memslot, + unsigned int shift, + const struct kvm_memory_slot *memslot, unsigned int lpid); extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing, unsigned long gpa, @@ -215,16 +225,14 @@ extern int kvmppc_radix_init(void); extern void kvmppc_radix_exit(void); extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); -extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, - unsigned long gpa, unsigned int shift, - struct kvm_memory_slot *memslot, - unsigned int lpid); extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map); +extern void kvmppc_radix_flush_memslot(struct kvm *kvm, + const struct kvm_memory_slot *memslot); extern int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info); /* XXX remove this export when load_last_inst() is generic */ @@ -242,7 +250,7 @@ extern kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing, bool *writable); extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, unsigned long *rmap, long pte_index, int realmode); -extern void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot, +extern void kvmppc_update_dirty_map(const struct kvm_memory_slot *memslot, unsigned long gfn, unsigned long psize); extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep, unsigned long pte_index); @@ -298,6 +306,7 @@ long 
kvmhv_nested_init(void); void kvmhv_nested_exit(void); void kvmhv_vm_nested_init(struct kvm *kvm); long kvmhv_set_partition_table(struct kvm_vcpu *vcpu); +long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu); void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1); void kvmhv_release_all_nested(struct kvm *kvm); long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu); @@ -307,7 +316,7 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu, void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr); void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu, struct hv_guest_state *hr); -long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu); +long int kvmhv_nested_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu); void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac); diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -55,6 +55,7 @@ struct kvm_nested_guest { cpumask_t need_tlb_flush; cpumask_t cpu_in_guest; short prev_cpu[NR_CPUS]; + u8 radix; /* is this nested guest radix */ }; /* @@ -150,6 +151,18 @@ static inline bool kvm_is_radix(struct kvm *kvm) return kvm->arch.radix; } +static inline bool kvmhv_vcpu_is_radix(struct kvm_vcpu *vcpu) +{ + bool radix; + + if (vcpu->arch.nested) + radix = vcpu->arch.nested->radix; + else + radix = kvm_is_radix(vcpu->kvm); + + return radix; +} + #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ #endif @@ -624,8 +637,11 @@ extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, unsigned long *rmapp, struct rmap_nested **n_rmap); extern void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp, struct rmap_nested **n_rmap); +extern void kvmhv_update_nest_rmap_rc_list(struct kvm *kvm, unsigned long *rmapp, + unsigned long clr, unsigned long set, + unsigned long hpa, unsigned long nbytes); extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm, - struct kvm_memory_slot *memslot, + const struct kvm_memory_slot *memslot, unsigned long gpa, unsigned long hpa, unsigned long nbytes); diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h @@ -72,7 +72,7 @@ extern int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +extern int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); #define HPTEG_CACHE_NUM (1 << 15) #define HPTEG_HASH_BITS_PTE 13 @@ -793,6 +793,7 @@ struct kvm_vcpu_arch { /* For support of nested guests */ struct kvm_nested_guest *nested; u32 nested_vcpu_id; + gpa_t nested_io_gpr; #endif #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING @@ -827,6 +828,8 @@ struct kvm_vcpu_arch { #define KVM_MMIO_REG_FQPR 0x00c0 #define KVM_MMIO_REG_VSX 0x0100 #define KVM_MMIO_REG_VMX 0x0180 +#define KVM_MMIO_REG_NESTED_GPR 0xffc0 + #define __KVM_HAVE_ARCH_WQP #define __KVM_HAVE_CREATE_DEVICE diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h @@ -224,7 +224,8 @@ extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, extern void kvmppc_core_commit_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, - const struct kvm_memory_slot *new); + const struct kvm_memory_slot *new, + enum kvm_mr_change change); extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, 
struct kvm_ppc_smmu_info *info); extern void kvmppc_core_flush_memslot(struct kvm *kvm, @@ -294,7 +295,8 @@ struct kvmppc_ops { void (*commit_memory_region)(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, - const struct kvm_memory_slot *new); + const struct kvm_memory_slot *new, + enum kvm_mr_change change); int (*unmap_hva_range)(struct kvm *kvm, unsigned long start, unsigned long end); int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end); @@ -326,6 +328,10 @@ struct kvmppc_ops { unsigned long flags); void (*giveup_ext)(struct kvm_vcpu *vcpu, ulong msr); int (*enable_nested)(struct kvm *kvm); + int (*load_from_eaddr)(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr, + int size); + int (*store_to_eaddr)(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr, + int size); }; extern struct kvmppc_ops *kvmppc_hv_ops; diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S @@ -995,7 +995,16 @@ EXC_COMMON_BEGIN(h_data_storage_common) bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD +BEGIN_MMU_FTR_SECTION + ld r4,PACA_EXGEN+EX_DAR(r13) + lwz r5,PACA_EXGEN+EX_DSISR(r13) + std r4,_DAR(r1) + std r5,_DSISR(r1) + li r5,SIGSEGV + bl bad_page_fault +MMU_FTR_SECTION_ELSE bl unknown_exception +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_TYPE_RADIX) b ret_from_except diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c @@ -830,9 +830,10 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, void kvmppc_core_commit_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, - const struct kvm_memory_slot *new) + const struct kvm_memory_slot *new, + enum kvm_mr_change change) { - kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new); + kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new, change); } int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) @@ -850,9 +851,10 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) return kvm->arch.kvm_ops->test_age_hva(kvm, hva); } -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte); + return 0; } void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -743,12 +743,15 @@ void kvmppc_rmap_reset(struct kvm *kvm) srcu_idx = srcu_read_lock(&kvm->srcu); slots = kvm_memslots(kvm); kvm_for_each_memslot(memslot, slots) { + /* Mutual exclusion with kvm_unmap_hva_range etc. */ + spin_lock(&kvm->mmu_lock); /* * This assumes it is acceptable to lose reference and * change bits across a reset. 
*/ memset(memslot->arch.rmap, 0, memslot->npages * sizeof(*memslot->arch.rmap)); + spin_unlock(&kvm->mmu_lock); } srcu_read_unlock(&kvm->srcu, srcu_idx); } @@ -896,11 +899,12 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm, gfn = memslot->base_gfn; rmapp = memslot->arch.rmap; + if (kvm_is_radix(kvm)) { + kvmppc_radix_flush_memslot(kvm, memslot); + return; + } + for (n = memslot->npages; n; --n, ++gfn) { - if (kvm_is_radix(kvm)) { - kvm_unmap_radix(kvm, memslot, gfn); - continue; - } /* * Testing the present bit without locking is OK because * the memslot has been marked invalid already, and hence diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -29,6 +29,103 @@ */ static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 }; +unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid, + gva_t eaddr, void *to, void *from, + unsigned long n) +{ + unsigned long quadrant, ret = n; + int old_pid, old_lpid; + bool is_load = !!to; + + /* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */ + if (kvmhv_on_pseries()) + return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr, + __pa(to), __pa(from), n); + + quadrant = 1; + if (!pid) + quadrant = 2; + if (is_load) + from = (void *) (eaddr | (quadrant << 62)); + else + to = (void *) (eaddr | (quadrant << 62)); + + preempt_disable(); + + /* switch the lpid first to avoid running host with unallocated pid */ + old_lpid = mfspr(SPRN_LPID); + if (old_lpid != lpid) + mtspr(SPRN_LPID, lpid); + if (quadrant == 1) { + old_pid = mfspr(SPRN_PID); + if (old_pid != pid) + mtspr(SPRN_PID, pid); + } + isync(); + + pagefault_disable(); + if (is_load) + ret = raw_copy_from_user(to, from, n); + else + ret = raw_copy_to_user(to, from, n); + pagefault_enable(); + + /* switch the pid first to avoid running host with unallocated pid */ + if (quadrant == 1 && pid != old_pid) + mtspr(SPRN_PID, old_pid); + if (lpid != old_lpid) + mtspr(SPRN_LPID, old_lpid); + isync(); + + preempt_enable(); + + return ret; +} +EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix); + +static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, + void *to, void *from, unsigned long n) +{ + int lpid = vcpu->kvm->arch.lpid; + int pid = vcpu->arch.pid; + + /* This would cause a data segment intr so don't allow the access */ + if (eaddr & (0x3FFUL << 52)) + return -EINVAL; + + /* Should we be using the nested lpid */ + if (vcpu->arch.nested) + lpid = vcpu->arch.nested->shadow_lpid; + + /* If accessing quadrant 3 then pid is expected to be 0 */ + if (((eaddr >> 62) & 0x3) == 0x3) + pid = 0; + + eaddr &= ~(0xFFFUL << 52); + + return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n); +} + +long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to, + unsigned long n) +{ + long ret; + + ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n); + if (ret > 0) + memset(to + (n - ret), 0, ret); + + return ret; +} +EXPORT_SYMBOL_GPL(kvmhv_copy_from_guest_radix); + +long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from, + unsigned long n) +{ + return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n); +} +EXPORT_SYMBOL_GPL(kvmhv_copy_to_guest_radix); + int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *gpte, u64 root, u64 *pte_ret_p) @@ -197,8 +294,8 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, return 0; } -static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, - unsigned int 
pshift, unsigned int lpid) +void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, + unsigned int pshift, unsigned int lpid) { unsigned long psize = PAGE_SIZE; int psi; @@ -284,7 +381,8 @@ static void kvmppc_pmd_free(pmd_t *pmdp) /* Called with kvm->mmu_lock held */ void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa, - unsigned int shift, struct kvm_memory_slot *memslot, + unsigned int shift, + const struct kvm_memory_slot *memslot, unsigned int lpid) { @@ -683,6 +781,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, pte_t pte, *ptep; unsigned int shift, level; int ret; + bool large_enable; /* used to check for invalidations in progress */ mmu_seq = kvm->mmu_notifier_seq; @@ -732,12 +831,15 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, pte = *ptep; local_irq_enable(); + /* If we're logging dirty pages, always map single pages */ + large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES); + /* Get pte level from shift/size */ - if (shift == PUD_SHIFT && + if (large_enable && shift == PUD_SHIFT && (gpa & (PUD_SIZE - PAGE_SIZE)) == (hva & (PUD_SIZE - PAGE_SIZE))) { level = 2; - } else if (shift == PMD_SHIFT && + } else if (large_enable && shift == PMD_SHIFT && (gpa & (PMD_SIZE - PAGE_SIZE)) == (hva & (PMD_SIZE - PAGE_SIZE))) { level = 1; @@ -857,7 +959,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, return ret; } -/* Called with kvm->lock held */ +/* Called with kvm->mmu_lock held */ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { @@ -872,7 +974,7 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, return 0; } -/* Called with kvm->lock held */ +/* Called with kvm->mmu_lock held */ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { @@ -880,18 +982,24 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gpa = gfn << PAGE_SHIFT; unsigned int shift; int ref = 0; + unsigned long old, *rmapp; ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep) && pte_young(*ptep)) { - kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0, - gpa, shift); + old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0, + gpa, shift); /* XXX need to flush tlb here? 
*/ + /* Also clear bit in ptes in shadow pgtable for nested guests */ + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; + kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0, + old & PTE_RPN_MASK, + 1UL << shift); ref = 1; } return ref; } -/* Called with kvm->lock held */ +/* Called with kvm->mmu_lock held */ int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { @@ -915,15 +1023,23 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm, pte_t *ptep; unsigned int shift; int ret = 0; + unsigned long old, *rmapp; ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) { ret = 1; if (shift) ret = 1 << (shift - PAGE_SHIFT); - kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0, - gpa, shift); + spin_lock(&kvm->mmu_lock); + old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0, + gpa, shift); kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid); + /* Also clear bit in ptes in shadow pgtable for nested guests */ + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; + kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0, + old & PTE_RPN_MASK, + 1UL << shift); + spin_unlock(&kvm->mmu_lock); } return ret; } @@ -953,6 +1069,26 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, return 0; } +void kvmppc_radix_flush_memslot(struct kvm *kvm, + const struct kvm_memory_slot *memslot) +{ + unsigned long n; + pte_t *ptep; + unsigned long gpa; + unsigned int shift; + + gpa = memslot->base_gfn << PAGE_SHIFT; + spin_lock(&kvm->mmu_lock); + for (n = memslot->npages; n; --n) { + ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); + if (ptep && pte_present(*ptep)) + kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, + kvm->arch.lpid); + gpa += PAGE_SIZE; + } + spin_unlock(&kvm->mmu_lock); +} + static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info, int psize, int *indexp) { diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c @@ -985,6 +985,10 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) kvmppc_set_gpr(vcpu, 3, 0); vcpu->arch.hcall_needed = 0; return -EINTR; + } else if (ret == H_TOO_HARD) { + kvmppc_set_gpr(vcpu, 3, 0); + vcpu->arch.hcall_needed = 0; + return RESUME_HOST; } break; case H_TLB_INVALIDATE: @@ -992,7 +996,11 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) if (nesting_enabled(vcpu->kvm)) ret = kvmhv_do_nested_tlbie(vcpu); break; - + case H_COPY_TOFROM_GUEST: + ret = H_FUNCTION; + if (nesting_enabled(vcpu->kvm)) + ret = kvmhv_copy_tofrom_guest_nested(vcpu); + break; default: return RESUME_HOST; } @@ -1336,7 +1344,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, return r; } -static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) +static int kvmppc_handle_nested_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) { int r; int srcu_idx; @@ -1394,7 +1402,7 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) */ case BOOK3S_INTERRUPT_H_DATA_STORAGE: srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvmhv_nested_page_fault(vcpu); + r = kvmhv_nested_page_fault(run, vcpu); srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); break; case BOOK3S_INTERRUPT_H_INST_STORAGE: @@ -1404,7 +1412,7 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE) vcpu->arch.fault_dsisr |= DSISR_ISSTORE; srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvmhv_nested_page_fault(vcpu); + r = kvmhv_nested_page_fault(run, vcpu); srcu_read_unlock(&vcpu->kvm->srcu, 
srcu_idx); break; @@ -4059,7 +4067,7 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, if (!nested) r = kvmppc_handle_exit_hv(kvm_run, vcpu, current); else - r = kvmppc_handle_nested_exit(vcpu); + r = kvmppc_handle_nested_exit(kvm_run, vcpu); } vcpu->arch.ret = r; @@ -4371,7 +4379,8 @@ static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm, static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, - const struct kvm_memory_slot *new) + const struct kvm_memory_slot *new, + enum kvm_mr_change change) { unsigned long npages = mem->memory_size >> PAGE_SHIFT; @@ -4383,6 +4392,23 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, */ if (npages) atomic64_inc(&kvm->arch.mmio_update); + + /* + * For change == KVM_MR_MOVE or KVM_MR_DELETE, higher levels + * have already called kvm_arch_flush_shadow_memslot() to + * flush shadow mappings. For KVM_MR_CREATE we have no + * previous mappings. So the only case to handle is + * KVM_MR_FLAGS_ONLY when the KVM_MEM_LOG_DIRTY_PAGES bit + * has been changed. + * For radix guests, we flush on setting KVM_MEM_LOG_DIRTY_PAGES + * to get rid of any THP PTEs in the partition-scoped page tables + * so we can track dirtiness at the page level; we flush when + * clearing KVM_MEM_LOG_DIRTY_PAGES so that we can go back to + * using THP PTEs. + */ + if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) && + ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES)) + kvmppc_radix_flush_memslot(kvm, old); } /* @@ -4532,12 +4558,15 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) { if (nesting_enabled(kvm)) kvmhv_release_all_nested(kvm); + kvmppc_rmap_reset(kvm); + kvm->arch.process_table = 0; + /* Mutual exclusion with kvm_unmap_hva_range etc. */ + spin_lock(&kvm->mmu_lock); + kvm->arch.radix = 0; + spin_unlock(&kvm->mmu_lock); kvmppc_free_radix(kvm); kvmppc_update_lpcr(kvm, LPCR_VPM1, LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); - kvmppc_rmap_reset(kvm); - kvm->arch.radix = 0; - kvm->arch.process_table = 0; return 0; } @@ -4549,12 +4578,14 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm) err = kvmppc_init_vm_radix(kvm); if (err) return err; - + kvmppc_rmap_reset(kvm); + /* Mutual exclusion with kvm_unmap_hva_range etc. 
*/ + spin_lock(&kvm->mmu_lock); + kvm->arch.radix = 1; + spin_unlock(&kvm->mmu_lock); kvmppc_free_hpt(&kvm->arch.hpt); kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR, LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); - kvmppc_rmap_reset(kvm); - kvm->arch.radix = 1; return 0; } @@ -5214,6 +5245,44 @@ static int kvmhv_enable_nested(struct kvm *kvm) return 0; } +static int kvmhv_load_from_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr, + int size) +{ + int rc = -EINVAL; + + if (kvmhv_vcpu_is_radix(vcpu)) { + rc = kvmhv_copy_from_guest_radix(vcpu, *eaddr, ptr, size); + + if (rc > 0) + rc = -EINVAL; + } + + /* For now quadrants are the only way to access nested guest memory */ + if (rc && vcpu->arch.nested) + rc = -EAGAIN; + + return rc; +} + +static int kvmhv_store_to_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr, + int size) +{ + int rc = -EINVAL; + + if (kvmhv_vcpu_is_radix(vcpu)) { + rc = kvmhv_copy_to_guest_radix(vcpu, *eaddr, ptr, size); + + if (rc > 0) + rc = -EINVAL; + } + + /* For now quadrants are the only way to access nested guest memory */ + if (rc && vcpu->arch.nested) + rc = -EAGAIN; + + return rc; +} + static struct kvmppc_ops kvm_ops_hv = { .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, @@ -5254,6 +5323,8 @@ static struct kvmppc_ops kvm_ops_hv = { .get_rmmu_info = kvmhv_get_rmmu_info, .set_smt_mode = kvmhv_set_smt_mode, .enable_nested = kvmhv_enable_nested, + .load_from_eaddr = kvmhv_load_from_eaddr, + .store_to_eaddr = kvmhv_store_to_eaddr, }; static int kvm_init_subcore_bitmap(void) diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c @@ -195,6 +195,26 @@ void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu, vcpu->arch.ppr = hr->ppr; } +static void kvmhv_nested_mmio_needed(struct kvm_vcpu *vcpu, u64 regs_ptr) +{ + /* No need to reflect the page fault to L1, we've handled it */ + vcpu->arch.trap = 0; + + /* + * Since the L2 gprs have already been written back into L1 memory when + * we complete the mmio, store the L1 memory location of the L2 gpr + * being loaded into by the mmio so that the loaded value can be + * written there in kvmppc_complete_mmio_load() + */ + if (((vcpu->arch.io_gpr & KVM_MMIO_REG_EXT_MASK) == KVM_MMIO_REG_GPR) + && (vcpu->mmio_is_write == 0)) { + vcpu->arch.nested_io_gpr = (gpa_t) regs_ptr + + offsetof(struct pt_regs, + gpr[vcpu->arch.io_gpr]); + vcpu->arch.io_gpr = KVM_MMIO_REG_NESTED_GPR; + } +} + long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) { long int err, r; @@ -316,6 +336,11 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) if (r == -EINTR) return H_INTERRUPT; + if (vcpu->mmio_needed) { + kvmhv_nested_mmio_needed(vcpu, regs_ptr); + return H_TOO_HARD; + } + return vcpu->arch.trap; } @@ -437,6 +462,81 @@ long kvmhv_set_partition_table(struct kvm_vcpu *vcpu) } /* + * Handle the H_COPY_TOFROM_GUEST hcall. 
+ * r4 = L1 lpid of nested guest + * r5 = pid + * r6 = eaddr to access + * r7 = to buffer (L1 gpa) + * r8 = from buffer (L1 gpa) + * r9 = n bytes to copy + */ +long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu) +{ + struct kvm_nested_guest *gp; + int l1_lpid = kvmppc_get_gpr(vcpu, 4); + int pid = kvmppc_get_gpr(vcpu, 5); + gva_t eaddr = kvmppc_get_gpr(vcpu, 6); + gpa_t gp_to = (gpa_t) kvmppc_get_gpr(vcpu, 7); + gpa_t gp_from = (gpa_t) kvmppc_get_gpr(vcpu, 8); + void *buf; + unsigned long n = kvmppc_get_gpr(vcpu, 9); + bool is_load = !!gp_to; + long rc; + + if (gp_to && gp_from) /* One must be NULL to determine the direction */ + return H_PARAMETER; + + if (eaddr & (0xFFFUL << 52)) + return H_PARAMETER; + + buf = kzalloc(n, GFP_KERNEL); + if (!buf) + return H_NO_MEM; + + gp = kvmhv_get_nested(vcpu->kvm, l1_lpid, false); + if (!gp) { + rc = H_PARAMETER; + goto out_free; + } + + mutex_lock(&gp->tlb_lock); + + if (is_load) { + /* Load from the nested guest into our buffer */ + rc = __kvmhv_copy_tofrom_guest_radix(gp->shadow_lpid, pid, + eaddr, buf, NULL, n); + if (rc) + goto not_found; + + /* Write what was loaded into our buffer back to the L1 guest */ + rc = kvm_vcpu_write_guest(vcpu, gp_to, buf, n); + if (rc) + goto not_found; + } else { + /* Load the data to be stored from the L1 guest into our buf */ + rc = kvm_vcpu_read_guest(vcpu, gp_from, buf, n); + if (rc) + goto not_found; + + /* Store from our buffer into the nested guest */ + rc = __kvmhv_copy_tofrom_guest_radix(gp->shadow_lpid, pid, + eaddr, NULL, buf, n); + if (rc) + goto not_found; + } + +out_unlock: + mutex_unlock(&gp->tlb_lock); + kvmhv_put_nested(gp); +out_free: + kfree(buf); + return rc; +not_found: + rc = H_NOT_FOUND; + goto out_unlock; +} + +/* * Reload the partition table entry for a guest. * Caller must hold gp->tlb_lock. */ @@ -480,6 +580,7 @@ struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid) if (shadow_lpid < 0) goto out_free2; gp->shadow_lpid = shadow_lpid; + gp->radix = 1; memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu)); @@ -687,6 +788,57 @@ void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp, *n_rmap = NULL; } +static void kvmhv_update_nest_rmap_rc(struct kvm *kvm, u64 n_rmap, + unsigned long clr, unsigned long set, + unsigned long hpa, unsigned long mask) +{ + struct kvm_nested_guest *gp; + unsigned long gpa; + unsigned int shift, lpid; + pte_t *ptep; + + gpa = n_rmap & RMAP_NESTED_GPA_MASK; + lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT; + gp = kvmhv_find_nested(kvm, lpid); + if (!gp) + return; + + /* Find the pte */ + ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift); + /* + * If the pte is present and the pfn is still the same, update the pte. + * If the pfn has changed then this is a stale rmap entry, the nested + * gpa actually points somewhere else now, and there is nothing to do. + * XXX A future optimisation would be to remove the rmap entry here. + */ + if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa)) { + __radix_pte_update(ptep, clr, set); + kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid); + } +} + +/* + * For a given list of rmap entries, update the rc bits in all ptes in shadow + * page tables for nested guests which are referenced by the rmap list. 
+ */ +void kvmhv_update_nest_rmap_rc_list(struct kvm *kvm, unsigned long *rmapp, + unsigned long clr, unsigned long set, + unsigned long hpa, unsigned long nbytes) +{ + struct llist_node *entry = ((struct llist_head *) rmapp)->first; + struct rmap_nested *cursor; + unsigned long rmap, mask; + + if ((clr | set) & ~(_PAGE_DIRTY | _PAGE_ACCESSED)) + return; + + mask = PTE_RPN_MASK & ~(nbytes - 1); + hpa &= mask; + + for_each_nest_rmap_safe(cursor, entry, &rmap) + kvmhv_update_nest_rmap_rc(kvm, rmap, clr, set, hpa, mask); +} + static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap, unsigned long hpa, unsigned long mask) { @@ -723,7 +875,7 @@ static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmapp, /* called with kvm->mmu_lock held */ void kvmhv_remove_nest_rmap_range(struct kvm *kvm, - struct kvm_memory_slot *memslot, + const struct kvm_memory_slot *memslot, unsigned long gpa, unsigned long hpa, unsigned long nbytes) { @@ -1049,7 +1201,7 @@ static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu, struct kvm *kvm = vcpu->kvm; bool writing = !!(dsisr & DSISR_ISSTORE); u64 pgflags; - bool ret; + long ret; /* Are the rc bits set in the L1 partition scoped pte? */ pgflags = _PAGE_ACCESSED; @@ -1062,16 +1214,22 @@ static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu, /* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */ ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing, gpte.raddr, kvm->arch.lpid); - spin_unlock(&kvm->mmu_lock); - if (!ret) - return -EINVAL; + if (!ret) { + ret = -EINVAL; + goto out_unlock; + } /* Set the rc bit in the pte of the shadow_pgtable for the nest guest */ ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa, gp->shadow_lpid); if (!ret) - return -EINVAL; - return 0; + ret = -EINVAL; + else + ret = 0; + +out_unlock: + spin_unlock(&kvm->mmu_lock); + return ret; } static inline int kvmppc_radix_level_to_shift(int level) @@ -1099,7 +1257,8 @@ static inline int kvmppc_radix_shift_to_level(int shift) } /* called with gp->tlb_lock held */ -static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, +static long int __kvmhv_nested_page_fault(struct kvm_run *run, + struct kvm_vcpu *vcpu, struct kvm_nested_guest *gp) { struct kvm *kvm = vcpu->kvm; @@ -1180,9 +1339,9 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, kvmppc_core_queue_data_storage(vcpu, ea, dsisr); return RESUME_GUEST; } - /* passthrough of emulated MMIO case... */ - pr_err("emulated MMIO passthrough?\n"); - return -EINVAL; + + /* passthrough of emulated MMIO case */ + return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing); } if (memslot->flags & KVM_MEM_READONLY) { if (writing) { @@ -1220,6 +1379,8 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, return ret; shift = kvmppc_radix_level_to_shift(level); } + /* Align gfn to the start of the page */ + gfn = (gpa & ~((1UL << shift) - 1)) >> PAGE_SHIFT; /* 3. Compute the pte we need to insert for nest_gpa -> host r_addr */ @@ -1227,6 +1388,9 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, perm |= gpte.may_read ? 0UL : _PAGE_READ; perm |= gpte.may_write ? 0UL : _PAGE_WRITE; perm |= gpte.may_execute ? 0UL : _PAGE_EXEC; + /* Only set accessed/dirty (rc) bits if set in host and l1 guest ptes */ + perm |= (gpte.rc & _PAGE_ACCESSED) ? 0UL : _PAGE_ACCESSED; + perm |= ((gpte.rc & _PAGE_DIRTY) && writing) ? 0UL : _PAGE_DIRTY; pte = __pte(pte_val(pte) & ~perm); /* What size pte can we insert? 
*/ @@ -1264,13 +1428,13 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, return RESUME_GUEST; } -long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu) +long int kvmhv_nested_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu) { struct kvm_nested_guest *gp = vcpu->arch.nested; long int ret; mutex_lock(&gp->tlb_lock); - ret = __kvmhv_nested_page_fault(vcpu, gp); + ret = __kvmhv_nested_page_fault(run, vcpu, gp); mutex_unlock(&gp->tlb_lock); return ret; } diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -107,7 +107,7 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); /* Update the dirty bitmap of a memslot */ -void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot, +void kvmppc_update_dirty_map(const struct kvm_memory_slot *memslot, unsigned long gfn, unsigned long psize) { unsigned long npages; diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c @@ -587,6 +587,7 @@ void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr) case PVR_POWER8: case PVR_POWER8E: case PVR_POWER8NVL: + case PVR_POWER9: vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE | BOOK3S_HFLAG_NEW_TLBIE; break; @@ -1913,7 +1914,8 @@ static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm, static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, - const struct kvm_memory_slot *new) + const struct kvm_memory_slot *new, + enum kvm_mr_change change) { return; } diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c @@ -1015,17 +1015,7 @@ static int xics_debug_show(struct seq_file *m, void *private) return 0; } -static int xics_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, xics_debug_show, inode->i_private); -} - -static const struct file_operations xics_debug_fops = { - .open = xics_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(xics_debug); static void xics_debugfs_init(struct kvmppc_xics *xics) { diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c @@ -1968,17 +1968,7 @@ static int xive_debug_show(struct seq_file *m, void *private) return 0; } -static int xive_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, xive_debug_show, inode->i_private); -} - -static const struct file_operations xive_debug_fops = { - .open = xive_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(xive_debug); static void xive_debugfs_init(struct kvmppc_xive *xive) { diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c @@ -1833,7 +1833,8 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, void kvmppc_core_commit_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, - const struct kvm_memory_slot *new) + const struct kvm_memory_slot *new, + enum kvm_mr_change change) { } diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c @@ -757,10 +757,11 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) return 0; } -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { /* The page will get remapped properly on its next fault */ kvm_unmap_hva(kvm, hva); + return 0; } 
/*****************************************/ diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c @@ -331,10 +331,17 @@ int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, { ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK; struct kvmppc_pte pte; - int r; + int r = -EINVAL; vcpu->stat.st++; + if (vcpu->kvm->arch.kvm_ops && vcpu->kvm->arch.kvm_ops->store_to_eaddr) + r = vcpu->kvm->arch.kvm_ops->store_to_eaddr(vcpu, eaddr, ptr, + size); + + if ((!r) || (r == -EAGAIN)) + return r; + r = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST, XLATE_WRITE, &pte); if (r < 0) @@ -367,10 +374,17 @@ int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, { ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK; struct kvmppc_pte pte; - int rc; + int rc = -EINVAL; vcpu->stat.ld++; + if (vcpu->kvm->arch.kvm_ops && vcpu->kvm->arch.kvm_ops->load_from_eaddr) + rc = vcpu->kvm->arch.kvm_ops->load_from_eaddr(vcpu, eaddr, ptr, + size); + + if ((!rc) || (rc == -EAGAIN)) + return rc; + rc = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST, XLATE_READ, &pte); if (rc) @@ -518,7 +532,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PPC_UNSET_IRQ: case KVM_CAP_PPC_IRQ_LEVEL: case KVM_CAP_ENABLE_CAP: - case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_ONE_REG: case KVM_CAP_IOEVENTFD: case KVM_CAP_DEVICE_CTRL: @@ -543,8 +556,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_SPAPR_TCE: case KVM_CAP_SPAPR_TCE_64: - /* fallthrough */ + r = 1; + break; case KVM_CAP_SPAPR_TCE_VFIO: + r = !!cpu_has_feature(CPU_FTR_HVMODE); + break; case KVM_CAP_PPC_RTAS: case KVM_CAP_PPC_FIXUP_HCALL: case KVM_CAP_PPC_ENABLE_HCALL: @@ -696,7 +712,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *new, enum kvm_mr_change change) { - kvmppc_core_commit_memory_region(kvm, mem, old, new); + kvmppc_core_commit_memory_region(kvm, mem, old, new, change); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, @@ -1192,6 +1208,14 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, kvmppc_set_vmx_byte(vcpu, gpr); break; #endif +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + case KVM_MMIO_REG_NESTED_GPR: + if (kvmppc_need_byteswap(vcpu)) + gpr = swab64(gpr); + kvm_vcpu_write_guest(vcpu, vcpu->arch.nested_io_gpr, &gpr, + sizeof(gpr)); + break; +#endif default: BUG(); } @@ -2084,8 +2108,8 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, } -static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, - struct kvm_enable_cap *cap) +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) { int r; @@ -2273,15 +2297,6 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } - case KVM_ENABLE_CAP: - { - struct kvm_enable_cap cap; - r = -EFAULT; - if (copy_from_user(&cap, argp, sizeof(cap))) - goto out; - r = kvm_vm_ioctl_enable_cap(kvm, &cap); - break; - } #ifdef CONFIG_SPAPR_TCE_IOMMU case KVM_CREATE_SPAPR_TCE_64: { struct kvm_create_spapr_tce_64 create_tce_64; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c @@ -636,6 +636,7 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) switch (TRAP(regs)) { case 0x300: case 0x380: + case 0xe00: printk(KERN_ALERT "Unable to handle kernel paging request for " "data at address 0x%08lx\n", regs->dar); break; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c @@ -11,6 +11,9 @@ * Jason J. 
Herne <jjherne@us.ibm.com> */ +#define KMSG_COMPONENT "kvm-s390" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + #include <linux/compiler.h> #include <linux/err.h> #include <linux/fs.h> @@ -44,10 +47,6 @@ #include "kvm-s390.h" #include "gaccess.h" -#define KMSG_COMPONENT "kvm-s390" -#undef pr_fmt -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - #define CREATE_TRACE_POINTS #include "trace.h" #include "trace-s390.h" @@ -417,19 +416,30 @@ static void kvm_s390_cpu_feat_init(void) int kvm_arch_init(void *opaque) { + int rc; + kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long)); if (!kvm_s390_dbf) return -ENOMEM; if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view)) { - debug_unregister(kvm_s390_dbf); - return -ENOMEM; + rc = -ENOMEM; + goto out_debug_unreg; } kvm_s390_cpu_feat_init(); /* Register floating interrupt controller interface. */ - return kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); + rc = kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); + if (rc) { + pr_err("Failed to register FLIC rc=%d\n", rc); + goto out_debug_unreg; + } + return 0; + +out_debug_unreg: + debug_unregister(kvm_s390_dbf); + return rc; } void kvm_arch_exit(void) @@ -464,7 +474,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_CSS_SUPPORT: case KVM_CAP_IOEVENTFD: case KVM_CAP_DEVICE_CTRL: - case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_S390_IRQCHIP: case KVM_CAP_VM_ATTRIBUTES: case KVM_CAP_MP_STATE: @@ -607,7 +616,7 @@ static void icpt_operexc_on_all_vcpus(struct kvm *kvm) } } -static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { int r; @@ -1933,14 +1942,6 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_s390_inject_vm(kvm, &s390int); break; } - case KVM_ENABLE_CAP: { - struct kvm_enable_cap cap; - r = -EFAULT; - if (copy_from_user(&cap, argp, sizeof(cap))) - break; - r = kvm_vm_ioctl_enable_cap(kvm, &cap); - break; - } case KVM_CREATE_IRQCHIP: { struct kvm_irq_routing_entry routing; diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c @@ -68,6 +68,7 @@ static struct pt_cap_desc { PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)), PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)), PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)), + PT_CAP(output_subsys, 0, CPUID_ECX, BIT(3)), PT_CAP(payloads_lip, 0, CPUID_ECX, BIT(31)), PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3), PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000), @@ -75,14 +76,21 @@ static struct pt_cap_desc { PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000), }; -static u32 pt_cap_get(enum pt_capabilities cap) +u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability) { - struct pt_cap_desc *cd = &pt_caps[cap]; - u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; + struct pt_cap_desc *cd = &pt_caps[capability]; + u32 c = caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; unsigned int shift = __ffs(cd->mask); return (c & cd->mask) >> shift; } +EXPORT_SYMBOL_GPL(intel_pt_validate_cap); + +u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) +{ + return intel_pt_validate_cap(pt_pmu.caps, cap); +} +EXPORT_SYMBOL_GPL(intel_pt_validate_hw_cap); static ssize_t pt_cap_show(struct device *cdev, struct device_attribute *attr, @@ -92,7 +100,7 @@ static ssize_t pt_cap_show(struct device *cdev, container_of(attr, struct dev_ext_attribute, attr); enum pt_capabilities cap = (long)ea->var; - return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap)); + return snprintf(buf, PAGE_SIZE, 
"%x\n", intel_pt_validate_hw_cap(cap)); } static struct attribute_group pt_cap_group __ro_after_init = { @@ -310,16 +318,16 @@ static bool pt_event_valid(struct perf_event *event) return false; if (config & RTIT_CTL_CYC_PSB) { - if (!pt_cap_get(PT_CAP_psb_cyc)) + if (!intel_pt_validate_hw_cap(PT_CAP_psb_cyc)) return false; - allowed = pt_cap_get(PT_CAP_psb_periods); + allowed = intel_pt_validate_hw_cap(PT_CAP_psb_periods); requested = (config & RTIT_CTL_PSB_FREQ) >> RTIT_CTL_PSB_FREQ_OFFSET; if (requested && (!(allowed & BIT(requested)))) return false; - allowed = pt_cap_get(PT_CAP_cycle_thresholds); + allowed = intel_pt_validate_hw_cap(PT_CAP_cycle_thresholds); requested = (config & RTIT_CTL_CYC_THRESH) >> RTIT_CTL_CYC_THRESH_OFFSET; if (requested && (!(allowed & BIT(requested)))) @@ -334,10 +342,10 @@ static bool pt_event_valid(struct perf_event *event) * Spec says that setting mtc period bits while mtc bit in * CPUID is 0 will #GP, so better safe than sorry. */ - if (!pt_cap_get(PT_CAP_mtc)) + if (!intel_pt_validate_hw_cap(PT_CAP_mtc)) return false; - allowed = pt_cap_get(PT_CAP_mtc_periods); + allowed = intel_pt_validate_hw_cap(PT_CAP_mtc_periods); if (!allowed) return false; @@ -349,11 +357,11 @@ static bool pt_event_valid(struct perf_event *event) } if (config & RTIT_CTL_PWR_EVT_EN && - !pt_cap_get(PT_CAP_power_event_trace)) + !intel_pt_validate_hw_cap(PT_CAP_power_event_trace)) return false; if (config & RTIT_CTL_PTW) { - if (!pt_cap_get(PT_CAP_ptwrite)) + if (!intel_pt_validate_hw_cap(PT_CAP_ptwrite)) return false; /* FUPonPTW without PTW doesn't make sense */ @@ -598,7 +606,7 @@ static struct topa *topa_alloc(int cpu, gfp_t gfp) * In case of singe-entry ToPA, always put the self-referencing END * link as the 2nd entry in the table */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) { + if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) { TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT; TOPA_ENTRY(topa, 1)->end = 1; } @@ -638,7 +646,7 @@ static void topa_insert_table(struct pt_buffer *buf, struct topa *topa) topa->offset = last->offset + last->size; buf->last = topa; - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) + if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) return; BUG_ON(last->last != TENTS_PER_PAGE - 1); @@ -654,7 +662,7 @@ static void topa_insert_table(struct pt_buffer *buf, struct topa *topa) static bool topa_table_full(struct topa *topa) { /* single-entry ToPA is a special case */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) + if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) return !!topa->last; return topa->last == TENTS_PER_PAGE - 1; @@ -690,7 +698,8 @@ static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp) TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT; TOPA_ENTRY(topa, -1)->size = order; - if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) { + if (!buf->snapshot && + !intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) { TOPA_ENTRY(topa, -1)->intr = 1; TOPA_ENTRY(topa, -1)->stop = 1; } @@ -725,7 +734,7 @@ static void pt_topa_dump(struct pt_buffer *buf) topa->table[i].intr ? 'I' : ' ', topa->table[i].stop ? 'S' : ' ', *(u64 *)&topa->table[i]); - if ((pt_cap_get(PT_CAP_topa_multiple_entries) && + if ((intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) && topa->table[i].stop) || topa->table[i].end) break; @@ -828,7 +837,7 @@ static void pt_handle_status(struct pt *pt) * means we are already losing data; need to let the decoder * know. 
*/ - if (!pt_cap_get(PT_CAP_topa_multiple_entries) || + if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) || buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_TRUNCATED); @@ -840,7 +849,8 @@ static void pt_handle_status(struct pt *pt) * Also on single-entry ToPA implementations, interrupt will come * before the output reaches its output region's boundary. */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot && + if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) && + !buf->snapshot && pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) { void *head = pt_buffer_region(buf); @@ -931,7 +941,7 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) + if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) return 0; /* clear STOP and INT from current entry */ @@ -1082,7 +1092,7 @@ static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages, pt_buffer_setup_topa_index(buf); /* link last table to the first one, unless we're double buffering */ - if (pt_cap_get(PT_CAP_topa_multiple_entries)) { + if (intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) { TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT; TOPA_ENTRY(buf->last, -1)->end = 1; } @@ -1153,7 +1163,7 @@ static int pt_addr_filters_init(struct perf_event *event) struct pt_filters *filters; int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu); - if (!pt_cap_get(PT_CAP_num_address_ranges)) + if (!intel_pt_validate_hw_cap(PT_CAP_num_address_ranges)) return 0; filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node); @@ -1202,7 +1212,7 @@ static int pt_event_addr_filters_validate(struct list_head *filters) return -EINVAL; } - if (++range > pt_cap_get(PT_CAP_num_address_ranges)) + if (++range > intel_pt_validate_hw_cap(PT_CAP_num_address_ranges)) return -EOPNOTSUPP; } @@ -1507,12 +1517,12 @@ static __init int pt_init(void) if (ret) return ret; - if (!pt_cap_get(PT_CAP_topa_output)) { + if (!intel_pt_validate_hw_cap(PT_CAP_topa_output)) { pr_warn("ToPA output is not supported on this CPU\n"); return -ENODEV; } - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) + if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF; @@ -1530,7 +1540,7 @@ static __init int pt_init(void) pt_pmu.pmu.addr_filters_sync = pt_event_addr_filters_sync; pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate; pt_pmu.pmu.nr_addr_filters = - pt_cap_get(PT_CAP_num_address_ranges); + intel_pt_validate_hw_cap(PT_CAP_num_address_ranges); ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1); diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h @@ -20,43 +20,6 @@ #define __INTEL_PT_H__ /* - * PT MSR bit definitions - */ -#define RTIT_CTL_TRACEEN BIT(0) -#define RTIT_CTL_CYCLEACC BIT(1) -#define RTIT_CTL_OS BIT(2) -#define RTIT_CTL_USR BIT(3) -#define RTIT_CTL_PWR_EVT_EN BIT(4) -#define RTIT_CTL_FUP_ON_PTW BIT(5) -#define RTIT_CTL_CR3EN BIT(7) -#define RTIT_CTL_TOPA BIT(8) -#define RTIT_CTL_MTC_EN BIT(9) -#define RTIT_CTL_TSC_EN BIT(10) -#define RTIT_CTL_DISRETC BIT(11) -#define RTIT_CTL_PTW_EN BIT(12) -#define RTIT_CTL_BRANCH_EN BIT(13) -#define RTIT_CTL_MTC_RANGE_OFFSET 14 -#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) -#define RTIT_CTL_CYC_THRESH_OFFSET 19 
-#define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET) -#define RTIT_CTL_PSB_FREQ_OFFSET 24 -#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) -#define RTIT_CTL_ADDR0_OFFSET 32 -#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET) -#define RTIT_CTL_ADDR1_OFFSET 36 -#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET) -#define RTIT_CTL_ADDR2_OFFSET 40 -#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET) -#define RTIT_CTL_ADDR3_OFFSET 44 -#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET) -#define RTIT_STATUS_FILTEREN BIT(0) -#define RTIT_STATUS_CONTEXTEN BIT(1) -#define RTIT_STATUS_TRIGGEREN BIT(2) -#define RTIT_STATUS_BUFFOVF BIT(3) -#define RTIT_STATUS_ERROR BIT(4) -#define RTIT_STATUS_STOPPED BIT(5) - -/* * Single-entry ToPA: when this close to region boundary, switch * buffers to avoid losing data. */ @@ -82,30 +45,9 @@ struct topa_entry { u64 rsvd4 : 16; }; -#define PT_CPUID_LEAVES 2 -#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ - /* TSC to Core Crystal Clock Ratio */ #define CPUID_TSC_LEAF 0x15 -enum pt_capabilities { - PT_CAP_max_subleaf = 0, - PT_CAP_cr3_filtering, - PT_CAP_psb_cyc, - PT_CAP_ip_filtering, - PT_CAP_mtc, - PT_CAP_ptwrite, - PT_CAP_power_event_trace, - PT_CAP_topa_output, - PT_CAP_topa_multiple_entries, - PT_CAP_single_range_output, - PT_CAP_payloads_lip, - PT_CAP_num_address_ranges, - PT_CAP_mtc_periods, - PT_CAP_cycle_thresholds, - PT_CAP_psb_periods, -}; - struct pt_pmu { struct pmu pmu; u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c @@ -7,6 +7,7 @@ * * Author : Lan Tianyu <Tianyu.Lan@microsoft.com> */ +#define pr_fmt(fmt) "Hyper-V: " fmt #include <linux/types.h> @@ -54,3 +55,82 @@ fault: return ret; } EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping); + +int hyperv_fill_flush_guest_mapping_list( + struct hv_guest_mapping_flush_list *flush, + u64 start_gfn, u64 pages) +{ + u64 cur = start_gfn; + u64 additional_pages; + int gpa_n = 0; + + do { + /* + * If flush requests exceed max flush count, go back to + * flush tlbs without range. 
+ */ + if (gpa_n >= HV_MAX_FLUSH_REP_COUNT) + return -ENOSPC; + + additional_pages = min_t(u64, pages, HV_MAX_FLUSH_PAGES) - 1; + + flush->gpa_list[gpa_n].page.additional_pages = additional_pages; + flush->gpa_list[gpa_n].page.largepage = false; + flush->gpa_list[gpa_n].page.basepfn = cur; + + pages -= additional_pages + 1; + cur += additional_pages + 1; + gpa_n++; + } while (pages > 0); + + return gpa_n; +} +EXPORT_SYMBOL_GPL(hyperv_fill_flush_guest_mapping_list); + +int hyperv_flush_guest_mapping_range(u64 as, + hyperv_fill_flush_list_func fill_flush_list_func, void *data) +{ + struct hv_guest_mapping_flush_list **flush_pcpu; + struct hv_guest_mapping_flush_list *flush; + u64 status = 0; + unsigned long flags; + int ret = -ENOTSUPP; + int gpa_n = 0; + + if (!hv_hypercall_pg || !fill_flush_list_func) + goto fault; + + local_irq_save(flags); + + flush_pcpu = (struct hv_guest_mapping_flush_list **) + this_cpu_ptr(hyperv_pcpu_input_arg); + + flush = *flush_pcpu; + if (unlikely(!flush)) { + local_irq_restore(flags); + goto fault; + } + + flush->address_space = as; + flush->flags = 0; + + gpa_n = fill_flush_list_func(flush, data); + if (gpa_n < 0) { + local_irq_restore(flags); + goto fault; + } + + status = hv_do_rep_hypercall(HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST, + gpa_n, 0, flush, NULL); + + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + ret = 0; + else + ret = status; +fault: + trace_hyperv_nested_flush_guest_mapping_range(as, ret); + return ret; +} +EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping_range); diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h @@ -281,6 +281,7 @@ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ +#define X86_FEATURE_WBNOINVD (13*32+ 9) /* WBNOINVD instruction */ #define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */ #define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */ #define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */ diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h @@ -10,6 +10,7 @@ #define _ASM_X86_HYPERV_TLFS_H #include <linux/types.h> +#include <asm/page.h> /* * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent @@ -30,158 +31,150 @@ /* * Feature identification. EAX indicates which features are available * to the partition based upon the current partition privileges. + * These are HYPERV_CPUID_FEATURES.EAX bits. */ /* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */ -#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0) +#define HV_X64_MSR_VP_RUNTIME_AVAILABLE BIT(0) /* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ -#define HV_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) -/* Partition reference TSC MSR is available */ -#define HV_MSR_REFERENCE_TSC_AVAILABLE (1 << 9) -/* Partition Guest IDLE MSR is available */ -#define HV_X64_MSR_GUEST_IDLE_AVAILABLE (1 << 10) - -/* A partition's reference time stamp counter (TSC) page */ -#define HV_X64_MSR_REFERENCE_TSC 0x40000021 - -/* - * There is a single feature flag that signifies if the partition has access - * to MSRs with local APIC and TSC frequencies. 
- */ -#define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11) - -/* AccessReenlightenmentControls privilege */ -#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13) - +#define HV_MSR_TIME_REF_COUNT_AVAILABLE BIT(1) /* * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available */ -#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2) +#define HV_X64_MSR_SYNIC_AVAILABLE BIT(2) /* * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through * HV_X64_MSR_STIMER3_COUNT) available */ -#define HV_MSR_SYNTIMER_AVAILABLE (1 << 3) +#define HV_MSR_SYNTIMER_AVAILABLE BIT(3) /* * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) * are available */ -#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4) +#define HV_X64_MSR_APIC_ACCESS_AVAILABLE BIT(4) /* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/ -#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5) +#define HV_X64_MSR_HYPERCALL_AVAILABLE BIT(5) /* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/ -#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6) +#define HV_X64_MSR_VP_INDEX_AVAILABLE BIT(6) /* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/ -#define HV_X64_MSR_RESET_AVAILABLE (1 << 7) - /* - * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, - * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE, - * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available - */ -#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8) - -/* Frequency MSRs available */ -#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE (1 << 8) - -/* Crash MSR available */ -#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE (1 << 10) - -/* stimer Direct Mode is available */ -#define HV_STIMER_DIRECT_MODE_AVAILABLE (1 << 19) +#define HV_X64_MSR_RESET_AVAILABLE BIT(7) +/* + * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, + * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE, + * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available + */ +#define HV_X64_MSR_STAT_PAGES_AVAILABLE BIT(8) +/* Partition reference TSC MSR is available */ +#define HV_MSR_REFERENCE_TSC_AVAILABLE BIT(9) +/* Partition Guest IDLE MSR is available */ +#define HV_X64_MSR_GUEST_IDLE_AVAILABLE BIT(10) +/* + * There is a single feature flag that signifies if the partition has access + * to MSRs with local APIC and TSC frequencies. + */ +#define HV_X64_ACCESS_FREQUENCY_MSRS BIT(11) +/* AccessReenlightenmentControls privilege */ +#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13) /* - * Feature identification: EBX indicates which flags were specified at - * partition creation. The format is the same as the partition creation - * flag structure defined in section Partition Creation Flags. + * Feature identification: indicates which flags were specified at partition + * creation. The format is the same as the partition creation flag structure + * defined in section Partition Creation Flags. + * These are HYPERV_CPUID_FEATURES.EBX bits. 
*/ -#define HV_X64_CREATE_PARTITIONS (1 << 0) -#define HV_X64_ACCESS_PARTITION_ID (1 << 1) -#define HV_X64_ACCESS_MEMORY_POOL (1 << 2) -#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3) -#define HV_X64_POST_MESSAGES (1 << 4) -#define HV_X64_SIGNAL_EVENTS (1 << 5) -#define HV_X64_CREATE_PORT (1 << 6) -#define HV_X64_CONNECT_PORT (1 << 7) -#define HV_X64_ACCESS_STATS (1 << 8) -#define HV_X64_DEBUGGING (1 << 11) -#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12) -#define HV_X64_CONFIGURE_PROFILER (1 << 13) +#define HV_X64_CREATE_PARTITIONS BIT(0) +#define HV_X64_ACCESS_PARTITION_ID BIT(1) +#define HV_X64_ACCESS_MEMORY_POOL BIT(2) +#define HV_X64_ADJUST_MESSAGE_BUFFERS BIT(3) +#define HV_X64_POST_MESSAGES BIT(4) +#define HV_X64_SIGNAL_EVENTS BIT(5) +#define HV_X64_CREATE_PORT BIT(6) +#define HV_X64_CONNECT_PORT BIT(7) +#define HV_X64_ACCESS_STATS BIT(8) +#define HV_X64_DEBUGGING BIT(11) +#define HV_X64_CPU_POWER_MANAGEMENT BIT(12) /* * Feature identification. EDX indicates which miscellaneous features * are available to the partition. + * These are HYPERV_CPUID_FEATURES.EDX bits. */ /* The MWAIT instruction is available (per section MONITOR / MWAIT) */ -#define HV_X64_MWAIT_AVAILABLE (1 << 0) +#define HV_X64_MWAIT_AVAILABLE BIT(0) /* Guest debugging support is available */ -#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1) +#define HV_X64_GUEST_DEBUGGING_AVAILABLE BIT(1) /* Performance Monitor support is available*/ -#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2) +#define HV_X64_PERF_MONITOR_AVAILABLE BIT(2) /* Support for physical CPU dynamic partitioning events is available*/ -#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3) +#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE BIT(3) /* * Support for passing hypercall input parameter block via XMM * registers is available */ -#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4) +#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE BIT(4) /* Support for a virtual guest idle state is available */ -#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5) -/* Guest crash data handler available */ -#define HV_X64_GUEST_CRASH_MSR_AVAILABLE (1 << 10) +#define HV_X64_GUEST_IDLE_STATE_AVAILABLE BIT(5) +/* Frequency MSRs available */ +#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE BIT(8) +/* Crash MSR available */ +#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10) +/* stimer Direct Mode is available */ +#define HV_STIMER_DIRECT_MODE_AVAILABLE BIT(19) /* * Implementation recommendations. Indicates which behaviors the hypervisor * recommends the OS implement for optimal performance. + * These are HYPERV_CPUID_ENLIGHTMENT_INFO.EAX bits. 
+ */ +/* + * Recommend using hypercall for address space switches rather + * than MOV to CR3 instruction */ - /* - * Recommend using hypercall for address space switches rather - * than MOV to CR3 instruction - */ -#define HV_X64_AS_SWITCH_RECOMMENDED (1 << 0) +#define HV_X64_AS_SWITCH_RECOMMENDED BIT(0) /* Recommend using hypercall for local TLB flushes rather * than INVLPG or MOV to CR3 instructions */ -#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1) +#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED BIT(1) /* * Recommend using hypercall for remote TLB flushes rather * than inter-processor interrupts */ -#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2) +#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED BIT(2) /* * Recommend using MSRs for accessing APIC registers * EOI, ICR and TPR rather than their memory-mapped counterparts */ -#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3) +#define HV_X64_APIC_ACCESS_RECOMMENDED BIT(3) /* Recommend using the hypervisor-provided MSR to initiate a system RESET */ -#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4) +#define HV_X64_SYSTEM_RESET_RECOMMENDED BIT(4) /* * Recommend using relaxed timing for this partition. If used, * the VM should disable any watchdog timeouts that rely on the * timely delivery of external interrupts */ -#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5) +#define HV_X64_RELAXED_TIMING_RECOMMENDED BIT(5) /* * Recommend not using Auto End-Of-Interrupt feature */ -#define HV_DEPRECATING_AEOI_RECOMMENDED (1 << 9) +#define HV_DEPRECATING_AEOI_RECOMMENDED BIT(9) /* * Recommend using cluster IPI hypercalls. */ -#define HV_X64_CLUSTER_IPI_RECOMMENDED (1 << 10) +#define HV_X64_CLUSTER_IPI_RECOMMENDED BIT(10) /* Recommend using the newer ExProcessorMasks interface */ -#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11) +#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED BIT(11) /* Recommend using enlightened VMCS */ -#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED (1 << 14) +#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14) -/* - * Crash notification flags. - */ -#define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62) -#define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63) +/* Nested features. These are HYPERV_CPUID_NESTED_FEATURES.EAX bits. */ +#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) +#define HV_X64_NESTED_MSR_BITMAP BIT(19) + +/* Hyper-V specific model specific registers (MSRs) */ /* MSR used to identify the guest OS. */ #define HV_X64_MSR_GUEST_OS_ID 0x40000000 @@ -201,6 +194,9 @@ /* MSR used to read the per-partition time reference counter */ #define HV_X64_MSR_TIME_REF_COUNT 0x40000020 +/* A partition's reference time stamp counter (TSC) page */ +#define HV_X64_MSR_REFERENCE_TSC 0x40000021 + /* MSR used to retrieve the TSC frequency */ #define HV_X64_MSR_TSC_FREQUENCY 0x40000022 @@ -258,9 +254,11 @@ #define HV_X64_MSR_CRASH_P3 0x40000103 #define HV_X64_MSR_CRASH_P4 0x40000104 #define HV_X64_MSR_CRASH_CTL 0x40000105 -#define HV_X64_MSR_CRASH_CTL_NOTIFY (1ULL << 63) -#define HV_X64_MSR_CRASH_PARAMS \ - (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) + +/* TSC emulation after migration */ +#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 +#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 +#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 /* * Declare the MSR used to setup pages used to communicate with the hypervisor. 
@@ -271,7 +269,7 @@ union hv_x64_msr_hypercall_contents { u64 enable:1; u64 reserved:11; u64 guest_physical_address:52; - }; + } __packed; }; /* @@ -283,7 +281,7 @@ struct ms_hyperv_tsc_page { volatile u64 tsc_scale; volatile s64 tsc_offset; u64 reserved2[509]; -}; +} __packed; /* * The guest OS needs to register the guest ID with the hypervisor. @@ -311,39 +309,37 @@ struct ms_hyperv_tsc_page { #define HV_LINUX_VENDOR_ID 0x8100 -/* TSC emulation after migration */ -#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 - -/* Nested features (CPUID 0x4000000A) EAX */ -#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) -#define HV_X64_NESTED_MSR_BITMAP BIT(19) - struct hv_reenlightenment_control { __u64 vector:8; __u64 reserved1:8; __u64 enabled:1; __u64 reserved2:15; __u64 target_vp:32; -}; - -#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 -#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 +} __packed; struct hv_tsc_emulation_control { __u64 enabled:1; __u64 reserved:63; -}; +} __packed; struct hv_tsc_emulation_status { __u64 inprogress:1; __u64 reserved:63; -}; +} __packed; #define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) +/* + * Crash notification (HV_X64_MSR_CRASH_CTL) flags. + */ +#define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62) +#define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63) +#define HV_X64_MSR_CRASH_PARAMS \ + (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) + #define HV_IPI_LOW_VECTOR 0x10 #define HV_IPI_HIGH_VECTOR 0xff @@ -358,6 +354,7 @@ struct hv_tsc_emulation_status { #define HVCALL_POST_MESSAGE 0x005c #define HVCALL_SIGNAL_EVENT 0x005d #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af +#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 #define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 @@ -409,7 +406,7 @@ typedef struct _HV_REFERENCE_TSC_PAGE { __u32 res1; __u64 tsc_scale; __s64 tsc_offset; -} HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE; +} __packed HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE; /* Define the number of synthetic interrupt sources. */ #define HV_SYNIC_SINT_COUNT (16) @@ -466,7 +463,7 @@ union hv_message_flags { struct { __u8 msg_pending:1; __u8 reserved:7; - }; + } __packed; }; /* Define port identifier type. */ @@ -475,7 +472,7 @@ union hv_port_id { struct { __u32 id:24; __u32 reserved:8; - } u; + } __packed u; }; /* Define synthetic interrupt controller message header. */ @@ -488,7 +485,7 @@ struct hv_message_header { __u64 sender; union hv_port_id port; }; -}; +} __packed; /* Define synthetic interrupt controller message format. */ struct hv_message { @@ -496,12 +493,12 @@ struct hv_message { union { __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; } u; -}; +} __packed; /* Define the synthetic interrupt message page layout. */ struct hv_message_page { struct hv_message sint_message[HV_SYNIC_SINT_COUNT]; -}; +} __packed; /* Define timer message payload structure. */ struct hv_timer_message_payload { @@ -509,7 +506,7 @@ struct hv_timer_message_payload { __u32 reserved; __u64 expiration_time; /* When the timer expired */ __u64 delivery_time; /* When the message was delivered */ -}; +} __packed; /* Define virtual processor assist page structure. 
*/ struct hv_vp_assist_page { @@ -518,8 +515,9 @@ struct hv_vp_assist_page { __u64 vtl_control[2]; __u64 nested_enlightenments_control[2]; __u32 enlighten_vmentry; + __u32 padding; __u64 current_nested_vmcs; -}; +} __packed; struct hv_enlightened_vmcs { u32 revision_id; @@ -533,6 +531,8 @@ struct hv_enlightened_vmcs { u16 host_gs_selector; u16 host_tr_selector; + u16 padding16_1; + u64 host_ia32_pat; u64 host_ia32_efer; @@ -651,7 +651,7 @@ struct hv_enlightened_vmcs { u64 ept_pointer; u16 virtual_processor_id; - u16 padding16[3]; + u16 padding16_2[3]; u64 padding64_2[5]; u64 guest_physical_address; @@ -693,7 +693,7 @@ struct hv_enlightened_vmcs { u32 nested_flush_hypercall:1; u32 msr_bitmap:1; u32 reserved:30; - } hv_enlightenments_control; + } __packed hv_enlightenments_control; u32 hv_vp_id; u64 hv_vm_id; @@ -703,7 +703,7 @@ struct hv_enlightened_vmcs { u64 padding64_5[7]; u64 xss_exit_bitmap; u64 padding64_6[7]; -}; +} __packed; #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0 #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0) @@ -725,36 +725,129 @@ struct hv_enlightened_vmcs { #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF -#define HV_STIMER_ENABLE (1ULL << 0) -#define HV_STIMER_PERIODIC (1ULL << 1) -#define HV_STIMER_LAZY (1ULL << 2) -#define HV_STIMER_AUTOENABLE (1ULL << 3) -#define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F) +/* Define synthetic interrupt controller flag constants. */ +#define HV_EVENT_FLAGS_COUNT (256 * 8) +#define HV_EVENT_FLAGS_LONG_COUNT (256 / sizeof(unsigned long)) + +/* + * Synthetic timer configuration. + */ +union hv_stimer_config { + u64 as_uint64; + struct { + u64 enable:1; + u64 periodic:1; + u64 lazy:1; + u64 auto_enable:1; + u64 apic_vector:8; + u64 direct_mode:1; + u64 reserved_z0:3; + u64 sintx:4; + u64 reserved_z1:44; + } __packed; +}; + + +/* Define the synthetic interrupt controller event flags format. */ +union hv_synic_event_flags { + unsigned long flags[HV_EVENT_FLAGS_LONG_COUNT]; +}; + +/* Define SynIC control register. */ +union hv_synic_scontrol { + u64 as_uint64; + struct { + u64 enable:1; + u64 reserved:63; + } __packed; +}; + +/* Define synthetic interrupt source. */ +union hv_synic_sint { + u64 as_uint64; + struct { + u64 vector:8; + u64 reserved1:8; + u64 masked:1; + u64 auto_eoi:1; + u64 reserved2:46; + } __packed; +}; + +/* Define the format of the SIMP register */ +union hv_synic_simp { + u64 as_uint64; + struct { + u64 simp_enabled:1; + u64 preserved:11; + u64 base_simp_gpa:52; + } __packed; +}; + +/* Define the format of the SIEFP register */ +union hv_synic_siefp { + u64 as_uint64; + struct { + u64 siefp_enabled:1; + u64 preserved:11; + u64 base_siefp_gpa:52; + } __packed; +}; struct hv_vpset { u64 format; u64 valid_bank_mask; u64 bank_contents[]; -}; +} __packed; /* HvCallSendSyntheticClusterIpi hypercall */ struct hv_send_ipi { u32 vector; u32 reserved; u64 cpu_mask; -}; +} __packed; /* HvCallSendSyntheticClusterIpiEx hypercall */ struct hv_send_ipi_ex { u32 vector; u32 reserved; struct hv_vpset vp_set; -}; +} __packed; /* HvFlushGuestPhysicalAddressSpace hypercalls */ struct hv_guest_mapping_flush { u64 address_space; u64 flags; +} __packed; + +/* + * HV_MAX_FLUSH_PAGES = "additional_pages" + 1. It's limited + * by the bitwidth of "additional_pages" in union hv_gpa_page_range. 
+ */ +#define HV_MAX_FLUSH_PAGES (2048) + +/* HvFlushGuestPhysicalAddressList hypercall */ +union hv_gpa_page_range { + u64 address_space; + struct { + u64 additional_pages:11; + u64 largepage:1; + u64 basepfn:52; + } page; +}; + +/* + * All input flush parameters should be in single page. The max flush + * count is equal with how many entries of union hv_gpa_page_range can + * be populated into the input parameter page. + */ +#define HV_MAX_FLUSH_REP_COUNT (PAGE_SIZE - 2 * sizeof(u64) / \ + sizeof(union hv_gpa_page_range)) + +struct hv_guest_mapping_flush_list { + u64 address_space; + u64 flags; + union hv_gpa_page_range gpa_list[HV_MAX_FLUSH_REP_COUNT]; }; /* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ @@ -763,7 +856,7 @@ struct hv_tlb_flush { u64 flags; u64 processor_mask; u64 gva_list[]; -}; +} __packed; /* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ struct hv_tlb_flush_ex { @@ -771,6 +864,6 @@ struct hv_tlb_flush_ex { u64 flags; struct hv_vpset hv_vp_set; u64 gva_list[]; -}; +} __packed; #endif diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h @@ -2,10 +2,36 @@ #ifndef _ASM_X86_INTEL_PT_H #define _ASM_X86_INTEL_PT_H +#define PT_CPUID_LEAVES 2 +#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ + +enum pt_capabilities { + PT_CAP_max_subleaf = 0, + PT_CAP_cr3_filtering, + PT_CAP_psb_cyc, + PT_CAP_ip_filtering, + PT_CAP_mtc, + PT_CAP_ptwrite, + PT_CAP_power_event_trace, + PT_CAP_topa_output, + PT_CAP_topa_multiple_entries, + PT_CAP_single_range_output, + PT_CAP_output_subsys, + PT_CAP_payloads_lip, + PT_CAP_num_address_ranges, + PT_CAP_mtc_periods, + PT_CAP_cycle_thresholds, + PT_CAP_psb_periods, +}; + #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) void cpu_emergency_stop_pt(void); +extern u32 intel_pt_validate_hw_cap(enum pt_capabilities cap); +extern u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities cap); #else static inline void cpu_emergency_stop_pt(void) {} +static inline u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) { return 0; } +static inline u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability) { return 0; } #endif #endif /* _ASM_X86_INTEL_PT_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h @@ -439,6 +439,11 @@ struct kvm_mmu { u64 pdptrs[4]; /* pae */ }; +struct kvm_tlb_range { + u64 start_gfn; + u64 pages; +}; + enum pmc_type { KVM_PMC_GP = 0, KVM_PMC_FIXED, @@ -497,7 +502,7 @@ struct kvm_mtrr { struct kvm_vcpu_hv_stimer { struct hrtimer timer; int index; - u64 config; + union hv_stimer_config config; u64 count; u64 exp_time; struct hv_message msg; @@ -601,17 +606,16 @@ struct kvm_vcpu_arch { /* * QEMU userspace and the guest each have their own FPU state. - * In vcpu_run, we switch between the user and guest FPU contexts. - * While running a VCPU, the VCPU thread will have the guest FPU - * context. + * In vcpu_run, we switch between the user, maintained in the + * task_struct struct, and guest FPU contexts. While running a VCPU, + * the VCPU thread will have the guest FPU context. * * Note that while the PKRU state lives inside the fpu registers, * it is switched out separately at VMENTER and VMEXIT time. The * "guest_fpu" state here contains the guest FPU context, with the * host PRKU bits. 
*/ - struct fpu user_fpu; - struct fpu guest_fpu; + struct fpu *guest_fpu; u64 xcr0; u64 guest_supported_xcr0; @@ -1042,6 +1046,8 @@ struct kvm_x86_ops { void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa); int (*tlb_remote_flush)(struct kvm *kvm); + int (*tlb_remote_flush_with_range)(struct kvm *kvm, + struct kvm_tlb_range *range); /* * Flush any TLB entries associated with the given GVA. @@ -1106,6 +1112,7 @@ struct kvm_x86_ops { bool (*mpx_supported)(void); bool (*xsaves_supported)(void); bool (*umip_emulated)(void); + bool (*pt_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); void (*request_immediate_exit)(struct kvm_vcpu *vcpu); @@ -1186,6 +1193,7 @@ struct kvm_x86_ops { int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); + uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { @@ -1196,6 +1204,7 @@ struct kvm_arch_async_pf { }; extern struct kvm_x86_ops *kvm_x86_ops; +extern struct kmem_cache *x86_fpu_cache; #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) @@ -1492,7 +1501,7 @@ asmlinkage void kvm_spurious_fault(void); "cmpb $0, kvm_rebooting \n\t" \ "jne 668b \n\t" \ __ASM_SIZE(push) " $666b \n\t" \ - "call kvm_spurious_fault \n\t" \ + "jmp kvm_spurious_fault \n\t" \ ".popsection \n\t" \ _ASM_EXTABLE(666b, 667b) @@ -1503,7 +1512,7 @@ asmlinkage void kvm_spurious_fault(void); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h @@ -22,6 +22,11 @@ struct ms_hyperv_info { extern struct ms_hyperv_info ms_hyperv; + +typedef int (*hyperv_fill_flush_list_func)( + struct hv_guest_mapping_flush_list *flush, + void *data); + /* * Generate the guest ID. 
*/ @@ -348,6 +353,11 @@ void set_hv_tscchange_cb(void (*cb)(void)); void clear_hv_tscchange_cb(void); void hyperv_stop_tsc_emulation(void); int hyperv_flush_guest_mapping(u64 as); +int hyperv_flush_guest_mapping_range(u64 as, + hyperv_fill_flush_list_func fill_func, void *data); +int hyperv_fill_flush_guest_mapping_list( + struct hv_guest_mapping_flush_list *flush, + u64 start_gfn, u64 end_gfn); #ifdef CONFIG_X86_64 void hv_apic_init(void); @@ -370,6 +380,11 @@ static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu) return NULL; } static inline int hyperv_flush_guest_mapping(u64 as) { return -1; } +static inline int hyperv_flush_guest_mapping_range(u64 as, + hyperv_fill_flush_list_func fill_func, void *data) +{ + return -1; +} #endif /* CONFIG_HYPERV */ #ifdef CONFIG_HYPERV_TSCPAGE diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h @@ -121,7 +121,43 @@ #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 #define MSR_IA32_RTIT_CTL 0x00000570 +#define RTIT_CTL_TRACEEN BIT(0) +#define RTIT_CTL_CYCLEACC BIT(1) +#define RTIT_CTL_OS BIT(2) +#define RTIT_CTL_USR BIT(3) +#define RTIT_CTL_PWR_EVT_EN BIT(4) +#define RTIT_CTL_FUP_ON_PTW BIT(5) +#define RTIT_CTL_FABRIC_EN BIT(6) +#define RTIT_CTL_CR3EN BIT(7) +#define RTIT_CTL_TOPA BIT(8) +#define RTIT_CTL_MTC_EN BIT(9) +#define RTIT_CTL_TSC_EN BIT(10) +#define RTIT_CTL_DISRETC BIT(11) +#define RTIT_CTL_PTW_EN BIT(12) +#define RTIT_CTL_BRANCH_EN BIT(13) +#define RTIT_CTL_MTC_RANGE_OFFSET 14 +#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) +#define RTIT_CTL_CYC_THRESH_OFFSET 19 +#define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET) +#define RTIT_CTL_PSB_FREQ_OFFSET 24 +#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) +#define RTIT_CTL_ADDR0_OFFSET 32 +#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET) +#define RTIT_CTL_ADDR1_OFFSET 36 +#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET) +#define RTIT_CTL_ADDR2_OFFSET 40 +#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET) +#define RTIT_CTL_ADDR3_OFFSET 44 +#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET) #define MSR_IA32_RTIT_STATUS 0x00000571 +#define RTIT_STATUS_FILTEREN BIT(0) +#define RTIT_STATUS_CONTEXTEN BIT(1) +#define RTIT_STATUS_TRIGGEREN BIT(2) +#define RTIT_STATUS_BUFFOVF BIT(3) +#define RTIT_STATUS_ERROR BIT(4) +#define RTIT_STATUS_STOPPED BIT(5) +#define RTIT_STATUS_BYTECNT_OFFSET 32 +#define RTIT_STATUS_BYTECNT (0x1ffffull << RTIT_STATUS_BYTECNT_OFFSET) #define MSR_IA32_RTIT_ADDR0_A 0x00000580 #define MSR_IA32_RTIT_ADDR0_B 0x00000581 #define MSR_IA32_RTIT_ADDR1_A 0x00000582 @@ -772,6 +808,7 @@ #define VMX_BASIC_INOUT 0x0040000000000000LLU /* MSR_IA32_VMX_MISC bits */ +#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14) #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) #define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F /* AMD-V MSRs */ diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h @@ -290,11 +290,4 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) -#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" -#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" -#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb" -#define SVM_CLGI ".byte 0x0f, 0x01, 0xdd" -#define SVM_STGI ".byte 0x0f, 0x01, 0xdc" -#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf" - #endif diff --git a/arch/x86/include/asm/trace/hyperv.h b/arch/x86/include/asm/trace/hyperv.h @@ -42,6 +42,20 @@ TRACE_EVENT(hyperv_nested_flush_guest_mapping, 
TP_printk("address space %llx ret %d", __entry->as, __entry->ret) ); +TRACE_EVENT(hyperv_nested_flush_guest_mapping_range, + TP_PROTO(u64 as, int ret), + TP_ARGS(as, ret), + + TP_STRUCT__entry( + __field(u64, as) + __field(int, ret) + ), + TP_fast_assign(__entry->as = as; + __entry->ret = ret; + ), + TP_printk("address space %llx ret %d", __entry->as, __entry->ret) + ); + TRACE_EVENT(hyperv_send_ipi_mask, TP_PROTO(const struct cpumask *cpus, int vector), diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h @@ -77,7 +77,10 @@ #define SECONDARY_EXEC_ENCLS_EXITING 0x00008000 #define SECONDARY_EXEC_RDSEED_EXITING 0x00010000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 +#define SECONDARY_EXEC_PT_CONCEAL_VMX 0x00080000 #define SECONDARY_EXEC_XSAVES 0x00100000 +#define SECONDARY_EXEC_PT_USE_GPA 0x01000000 +#define SECONDARY_EXEC_MODE_BASED_EPT_EXEC 0x00400000 #define SECONDARY_EXEC_TSC_SCALING 0x02000000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 @@ -98,6 +101,8 @@ #define VM_EXIT_LOAD_IA32_EFER 0x00200000 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 #define VM_EXIT_CLEAR_BNDCFGS 0x00800000 +#define VM_EXIT_PT_CONCEAL_PIP 0x01000000 +#define VM_EXIT_CLEAR_IA32_RTIT_CTL 0x02000000 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff @@ -109,6 +114,8 @@ #define VM_ENTRY_LOAD_IA32_PAT 0x00004000 #define VM_ENTRY_LOAD_IA32_EFER 0x00008000 #define VM_ENTRY_LOAD_BNDCFGS 0x00010000 +#define VM_ENTRY_PT_CONCEAL_PIP 0x00020000 +#define VM_ENTRY_LOAD_IA32_RTIT_CTL 0x00040000 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff @@ -240,6 +247,8 @@ enum vmcs_field { GUEST_PDPTR3_HIGH = 0x00002811, GUEST_BNDCFGS = 0x00002812, GUEST_BNDCFGS_HIGH = 0x00002813, + GUEST_IA32_RTIT_CTL = 0x00002814, + GUEST_IA32_RTIT_CTL_HIGH = 0x00002815, HOST_IA32_PAT = 0x00002c00, HOST_IA32_PAT_HIGH = 0x00002c01, HOST_IA32_EFER = 0x00002c02, diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* KVM paravirtual clock driver. A clocksource implementation Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include <linux/clocksource.h> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile @@ -16,7 +16,7 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o page_track.o debugfs.o -kvm-intel-y += vmx.o pmu_intel.o +kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o kvm-amd-y += svm.o pmu_amd.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c @@ -67,9 +67,6 @@ u64 kvm_supported_xcr0(void) #define F(x) bit(X86_FEATURE_##x) -/* For scattered features from cpufeatures.h; we currently expose none */ -#define KF(x) bit(KVM_CPUID_BIT_##x) - int kvm_update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -337,6 +334,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; + unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; /* cpuid 1.edx */ const u32 kvm_cpuid_1_edx_x86_features = @@ -380,8 +378,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = - F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | - F(AMD_SSB_NO); + F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | + F(AMD_SSB_NO) | F(AMD_STIBP); /* cpuid 0xC0000001.edx */ const u32 kvm_cpuid_C000_0001_edx_x86_features = @@ -395,7 +393,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(SHA_NI) | F(AVX512BW) | F(AVX512VL); + F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt; /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = @@ -411,7 +409,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | - F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES); + F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -426,7 +424,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, switch (function) { case 0: - entry->eax = min(entry->eax, (u32)0xd); + entry->eax = min(entry->eax, (u32)(f_intel_pt ? 
0x14 : 0xd)); break; case 1: entry->edx &= kvm_cpuid_1_edx_x86_features; @@ -603,6 +601,23 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + /* Intel PT */ + case 0x14: { + int t, times = entry->eax; + + if (!f_intel_pt) + break; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + for (t = 1; t <= times; ++t) { + if (*nent >= maxnent) + goto out; + do_cpuid_1_ent(&entry[t], function, t); + entry[t].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } case KVM_CPUID_SIGNATURE: { static const char signature[12] = "KVMKVMKVM\0\0"; const u32 *sigptr = (const u32 *)signature; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c @@ -38,6 +38,9 @@ #define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64) +static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, + bool vcpu_kick); + static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint) { return atomic64_read(&synic->sint[sint]); @@ -158,59 +161,24 @@ static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx) return (synic->active) ? synic : NULL; } -static void synic_clear_sint_msg_pending(struct kvm_vcpu_hv_synic *synic, - u32 sint) -{ - struct kvm_vcpu *vcpu = synic_to_vcpu(synic); - struct page *page; - gpa_t gpa; - struct hv_message *msg; - struct hv_message_page *msg_page; - - gpa = synic->msg_page & PAGE_MASK; - page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); - if (is_error_page(page)) { - vcpu_err(vcpu, "Hyper-V SynIC can't get msg page, gpa 0x%llx\n", - gpa); - return; - } - msg_page = kmap_atomic(page); - - msg = &msg_page->sint_message[sint]; - msg->header.message_flags.msg_pending = 0; - - kunmap_atomic(msg_page); - kvm_release_page_dirty(page); - kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); -} - static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) { struct kvm *kvm = vcpu->kvm; struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); struct kvm_vcpu_hv_stimer *stimer; - int gsi, idx, stimers_pending; + int gsi, idx; trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint); - if (synic->msg_page & HV_SYNIC_SIMP_ENABLE) - synic_clear_sint_msg_pending(synic, sint); - /* Try to deliver pending Hyper-V SynIC timers messages */ - stimers_pending = 0; for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) { stimer = &hv_vcpu->stimer[idx]; - if (stimer->msg_pending && - (stimer->config & HV_STIMER_ENABLE) && - HV_STIMER_SINT(stimer->config) == sint) { - set_bit(stimer->index, - hv_vcpu->stimer_pending_bitmap); - stimers_pending++; - } + if (stimer->msg_pending && stimer->config.enable && + !stimer->config.direct_mode && + stimer->config.sintx == sint) + stimer_mark_pending(stimer, false); } - if (stimers_pending) - kvm_make_request(KVM_REQ_HV_STIMER, vcpu); idx = srcu_read_lock(&kvm->irq_srcu); gsi = atomic_read(&synic->sint_to_gsi[sint]); @@ -497,7 +465,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm); ktime_now = ktime_get(); - if (stimer->config & HV_STIMER_PERIODIC) { + if (stimer->config.periodic) { if (stimer->exp_time) { if (time_now >= stimer->exp_time) { u64 remainder; @@ -546,13 +514,18 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, bool host) { + union hv_stimer_config new_config = {.as_uint64 = config}, + old_config = {.as_uint64 = stimer->config.as_uint64}; + 
trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id, stimer->index, config, host); stimer_cleanup(stimer); - if ((stimer->config & HV_STIMER_ENABLE) && HV_STIMER_SINT(config) == 0) - config &= ~HV_STIMER_ENABLE; - stimer->config = config; + if (old_config.enable && + !new_config.direct_mode && new_config.sintx == 0) + new_config.enable = 0; + stimer->config.as_uint64 = new_config.as_uint64; + stimer_mark_pending(stimer, false); return 0; } @@ -566,16 +539,16 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, stimer_cleanup(stimer); stimer->count = count; if (stimer->count == 0) - stimer->config &= ~HV_STIMER_ENABLE; - else if (stimer->config & HV_STIMER_AUTOENABLE) - stimer->config |= HV_STIMER_ENABLE; + stimer->config.enable = 0; + else if (stimer->config.auto_enable) + stimer->config.enable = 1; stimer_mark_pending(stimer, false); return 0; } static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig) { - *pconfig = stimer->config; + *pconfig = stimer->config.as_uint64; return 0; } @@ -586,44 +559,60 @@ static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount) } static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, - struct hv_message *src_msg) + struct hv_message *src_msg, bool no_retry) { struct kvm_vcpu *vcpu = synic_to_vcpu(synic); - struct page *page; - gpa_t gpa; - struct hv_message *dst_msg; + int msg_off = offsetof(struct hv_message_page, sint_message[sint]); + gfn_t msg_page_gfn; + struct hv_message_header hv_hdr; int r; - struct hv_message_page *msg_page; if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE)) return -ENOENT; - gpa = synic->msg_page & PAGE_MASK; - page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); - if (is_error_page(page)) - return -EFAULT; + msg_page_gfn = synic->msg_page >> PAGE_SHIFT; - msg_page = kmap_atomic(page); - dst_msg = &msg_page->sint_message[sint]; - if (sync_cmpxchg(&dst_msg->header.message_type, HVMSG_NONE, - src_msg->header.message_type) != HVMSG_NONE) { - dst_msg->header.message_flags.msg_pending = 1; - r = -EAGAIN; - } else { - memcpy(&dst_msg->u.payload, &src_msg->u.payload, - src_msg->header.payload_size); - dst_msg->header.message_type = src_msg->header.message_type; - dst_msg->header.payload_size = src_msg->header.payload_size; - r = synic_set_irq(synic, sint); - if (r >= 1) - r = 0; - else if (r == 0) - r = -EFAULT; + /* + * Strictly following the spec-mandated ordering would assume setting + * .msg_pending before checking .message_type. However, this function + * is only called in vcpu context so the entire update is atomic from + * guest POV and thus the exact order here doesn't matter. 
+ */ + r = kvm_vcpu_read_guest_page(vcpu, msg_page_gfn, &hv_hdr.message_type, + msg_off + offsetof(struct hv_message, + header.message_type), + sizeof(hv_hdr.message_type)); + if (r < 0) + return r; + + if (hv_hdr.message_type != HVMSG_NONE) { + if (no_retry) + return 0; + + hv_hdr.message_flags.msg_pending = 1; + r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn, + &hv_hdr.message_flags, + msg_off + + offsetof(struct hv_message, + header.message_flags), + sizeof(hv_hdr.message_flags)); + if (r < 0) + return r; + return -EAGAIN; } - kunmap_atomic(msg_page); - kvm_release_page_dirty(page); - kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); - return r; + + r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn, src_msg, msg_off, + sizeof(src_msg->header) + + src_msg->header.payload_size); + if (r < 0) + return r; + + r = synic_set_irq(synic, sint); + if (r < 0) + return r; + if (r == 0) + return -EFAULT; + return 0; } static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) @@ -633,24 +622,45 @@ static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) struct hv_timer_message_payload *payload = (struct hv_timer_message_payload *)&msg->u.payload; + /* + * To avoid piling up periodic ticks, don't retry message + * delivery for them (within "lazy" lost ticks policy). + */ + bool no_retry = stimer->config.periodic; + payload->expiration_time = stimer->exp_time; payload->delivery_time = get_time_ref_counter(vcpu->kvm); return synic_deliver_msg(vcpu_to_synic(vcpu), - HV_STIMER_SINT(stimer->config), msg); + stimer->config.sintx, msg, + no_retry); +} + +static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer) +{ + struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_lapic_irq irq = { + .delivery_mode = APIC_DM_FIXED, + .vector = stimer->config.apic_vector + }; + + return !kvm_apic_set_irq(vcpu, &irq, NULL); } static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) { - int r; + int r, direct = stimer->config.direct_mode; stimer->msg_pending = true; - r = stimer_send_msg(stimer); + if (!direct) + r = stimer_send_msg(stimer); + else + r = stimer_notify_direct(stimer); trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id, - stimer->index, r); + stimer->index, direct, r); if (!r) { stimer->msg_pending = false; - if (!(stimer->config & HV_STIMER_PERIODIC)) - stimer->config &= ~HV_STIMER_ENABLE; + if (!(stimer->config.periodic)) + stimer->config.enable = 0; } } @@ -664,7 +674,7 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) { stimer = &hv_vcpu->stimer[i]; - if (stimer->config & HV_STIMER_ENABLE) { + if (stimer->config.enable) { exp_time = stimer->exp_time; if (exp_time) { @@ -674,7 +684,7 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) stimer_expiration(stimer); } - if ((stimer->config & HV_STIMER_ENABLE) && + if ((stimer->config.enable) && stimer->count) { if (!stimer->msg_pending) stimer_start(stimer); @@ -815,9 +825,9 @@ static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host) struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; if (host) - hv->hv_crash_ctl = data & HV_X64_MSR_CRASH_CTL_NOTIFY; + hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY; - if (!host && (data & HV_X64_MSR_CRASH_CTL_NOTIFY)) { + if (!host && (data & HV_CRASH_CTL_CRASH_NOTIFY)) { vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n", hv->hv_crash_param[0], @@ -1758,3 +1768,124 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct 
kvm_hyperv_eventfd *args) return kvm_hv_eventfd_deassign(kvm, args->conn_id); return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd); } + +int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) +{ + uint16_t evmcs_ver = kvm_x86_ops->nested_get_evmcs_version(vcpu); + struct kvm_cpuid_entry2 cpuid_entries[] = { + { .function = HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS }, + { .function = HYPERV_CPUID_INTERFACE }, + { .function = HYPERV_CPUID_VERSION }, + { .function = HYPERV_CPUID_FEATURES }, + { .function = HYPERV_CPUID_ENLIGHTMENT_INFO }, + { .function = HYPERV_CPUID_IMPLEMENT_LIMITS }, + { .function = HYPERV_CPUID_NESTED_FEATURES }, + }; + int i, nent = ARRAY_SIZE(cpuid_entries); + + /* Skip NESTED_FEATURES if eVMCS is not supported */ + if (!evmcs_ver) + --nent; + + if (cpuid->nent < nent) + return -E2BIG; + + if (cpuid->nent > nent) + cpuid->nent = nent; + + for (i = 0; i < nent; i++) { + struct kvm_cpuid_entry2 *ent = &cpuid_entries[i]; + u32 signature[3]; + + switch (ent->function) { + case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS: + memcpy(signature, "Linux KVM Hv", 12); + + ent->eax = HYPERV_CPUID_NESTED_FEATURES; + ent->ebx = signature[0]; + ent->ecx = signature[1]; + ent->edx = signature[2]; + break; + + case HYPERV_CPUID_INTERFACE: + memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12); + ent->eax = signature[0]; + break; + + case HYPERV_CPUID_VERSION: + /* + * We implement some Hyper-V 2016 functions so let's use + * this version. + */ + ent->eax = 0x00003839; + ent->ebx = 0x000A0000; + break; + + case HYPERV_CPUID_FEATURES: + ent->eax |= HV_X64_MSR_VP_RUNTIME_AVAILABLE; + ent->eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE; + ent->eax |= HV_X64_MSR_SYNIC_AVAILABLE; + ent->eax |= HV_MSR_SYNTIMER_AVAILABLE; + ent->eax |= HV_X64_MSR_APIC_ACCESS_AVAILABLE; + ent->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE; + ent->eax |= HV_X64_MSR_VP_INDEX_AVAILABLE; + ent->eax |= HV_X64_MSR_RESET_AVAILABLE; + ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE; + ent->eax |= HV_X64_MSR_GUEST_IDLE_AVAILABLE; + ent->eax |= HV_X64_ACCESS_FREQUENCY_MSRS; + ent->eax |= HV_X64_ACCESS_REENLIGHTENMENT; + + ent->ebx |= HV_X64_POST_MESSAGES; + ent->ebx |= HV_X64_SIGNAL_EVENTS; + + ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; + ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; + ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; + + break; + + case HYPERV_CPUID_ENLIGHTMENT_INFO: + ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; + ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED; + ent->eax |= HV_X64_SYSTEM_RESET_RECOMMENDED; + ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED; + ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; + ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; + ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; + + /* + * Default number of spinlock retry attempts, matches + * HyperV 2016. + */ + ent->ebx = 0x00000FFF; + + break; + + case HYPERV_CPUID_IMPLEMENT_LIMITS: + /* Maximum number of virtual processors */ + ent->eax = KVM_MAX_VCPUS; + /* + * Maximum number of logical processors, matches + * HyperV 2016. 
+ */ + ent->ebx = 64; + + break; + + case HYPERV_CPUID_NESTED_FEATURES: + ent->eax = evmcs_ver; + + break; + + default: + break; + } + } + + if (copy_to_user(entries, cpuid_entries, + nent * sizeof(struct kvm_cpuid_entry2))) + return -EFAULT; + + return 0; +} diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h @@ -24,6 +24,8 @@ #ifndef __ARCH_X86_KVM_HYPERV_H__ #define __ARCH_X86_KVM_HYPERV_H__ +#include <linux/kvm_host.h> + static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu) { return &vcpu->arch.hyperv; @@ -95,5 +97,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, void kvm_hv_init_vm(struct kvm *kvm); void kvm_hv_destroy_vm(struct kvm *kvm); int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args); +int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries); #endif diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h @@ -2,6 +2,8 @@ #ifndef ASM_KVM_CACHE_REGS_H #define ASM_KVM_CACHE_REGS_H +#include <linux/kvm_host.h> + #define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS #define KVM_POSSIBLE_CR4_GUEST_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c @@ -251,10 +251,9 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) if (enabled != apic->sw_enabled) { apic->sw_enabled = enabled; - if (enabled) { + if (enabled) static_key_slow_dec_deferred(&apic_sw_disabled); - recalculate_apic_map(apic->vcpu->kvm); - } else + else static_key_slow_inc(&apic_sw_disabled.key); } } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c @@ -264,6 +264,35 @@ static void mmu_spte_set(u64 *sptep, u64 spte); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); + +static inline bool kvm_available_flush_tlb_with_range(void) +{ + return kvm_x86_ops->tlb_remote_flush_with_range; +} + +static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm, + struct kvm_tlb_range *range) +{ + int ret = -ENOTSUPP; + + if (range && kvm_x86_ops->tlb_remote_flush_with_range) + ret = kvm_x86_ops->tlb_remote_flush_with_range(kvm, range); + + if (ret) + kvm_flush_remote_tlbs(kvm); +} + +static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, + u64 start_gfn, u64 pages) +{ + struct kvm_tlb_range range; + + range.start_gfn = start_gfn; + range.pages = pages; + + kvm_flush_remote_tlbs_with_range(kvm, &range); +} + void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) { BUG_ON((mmio_mask & mmio_value) != mmio_value); @@ -1456,8 +1485,12 @@ static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) { - if (__drop_large_spte(vcpu->kvm, sptep)) - kvm_flush_remote_tlbs(vcpu->kvm); + if (__drop_large_spte(vcpu->kvm, sptep)) { + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + + kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, + KVM_PAGES_PER_HPAGE(sp->role.level)); + } } /* @@ -1743,10 +1776,12 @@ restart: } } - if (need_flush) - kvm_flush_remote_tlbs(kvm); + if (need_flush && kvm_available_flush_tlb_with_range()) { + kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); + return 0; + } - return 0; + return need_flush; } struct slot_rmap_walk_iterator { @@ -1880,9 +1915,9 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); } -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +int 
kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { - kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); + return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); } static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -1925,7 +1960,8 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0); - kvm_flush_remote_tlbs(vcpu->kvm); + kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, + KVM_PAGES_PER_HPAGE(sp->role.level)); } int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) @@ -2441,7 +2477,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, account_shadowed(vcpu->kvm, sp); if (level == PT_PAGE_TABLE_LEVEL && rmap_write_protect(vcpu, gfn)) - kvm_flush_remote_tlbs(vcpu->kvm); + kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); if (level > PT_PAGE_TABLE_LEVEL && need_sync) flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); @@ -2561,7 +2597,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, return; drop_parent_pte(child, sptep); - kvm_flush_remote_tlbs(vcpu->kvm); + kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1); } } @@ -2985,8 +3021,10 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, ret = RET_PF_EMULATE; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } + if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush) - kvm_flush_remote_tlbs(vcpu->kvm); + kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, + KVM_PAGES_PER_HPAGE(level)); if (unlikely(is_mmio_spte(*sptep))) ret = RET_PF_EMULATE; @@ -5586,8 +5624,13 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; + bool flush_tlb = true; + bool flush = false; int i; + if (kvm_available_flush_tlb_with_range()) + flush_tlb = false; + spin_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); @@ -5599,12 +5642,17 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (start >= end) continue; - slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, - start, end - 1, true); + flush |= slot_handle_level_range(kvm, memslot, + kvm_zap_rmapp, PT_PAGE_TABLE_LEVEL, + PT_MAX_HUGEPAGE_LEVEL, start, + end - 1, flush_tlb); } } + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, + gfn_end - gfn_start + 1); + spin_unlock(&kvm->mmu_lock); } @@ -5638,12 +5686,13 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, * spte from present to present (changing the spte from present * to nonpresent will flush all the TLBs immediately), in other * words, the only case we care is mmu_spte_update() where we - * haved checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE + * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE * instead of PT_WRITABLE_MASK, that means it does not depend * on PT_WRITABLE_MASK anymore. 
*/ if (flush) - kvm_flush_remote_tlbs(kvm); + kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, + memslot->npages); } static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, @@ -5671,7 +5720,13 @@ restart: !kvm_is_reserved_pfn(pfn) && PageTransCompoundMap(pfn_to_page(pfn))) { pte_list_remove(rmap_head, sptep); - need_tlb_flush = 1; + + if (kvm_available_flush_tlb_with_range()) + kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, + KVM_PAGES_PER_HPAGE(sp->role.level)); + else + need_tlb_flush = 1; + goto restart; } } @@ -5707,7 +5762,8 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, * dirty_bitmap. */ if (flush) - kvm_flush_remote_tlbs(kvm); + kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, + memslot->npages); } EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); @@ -5725,7 +5781,8 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, lockdep_assert_held(&kvm->slots_lock); if (flush) - kvm_flush_remote_tlbs(kvm); + kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, + memslot->npages); } EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); @@ -5742,7 +5799,8 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, /* see kvm_mmu_slot_leaf_clear_dirty */ if (flush) - kvm_flush_remote_tlbs(kvm); + kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, + memslot->npages); } EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h @@ -894,7 +894,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); if (mmu_page_zap_pte(vcpu->kvm, sp, sptep)) - kvm_flush_remote_tlbs(vcpu->kvm); + kvm_flush_remote_tlbs_with_address(vcpu->kvm, + sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); if (!rmap_can_add(vcpu)) break; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c @@ -675,11 +675,6 @@ struct svm_cpu_data { static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); -struct svm_init_data { - int cpu; - int r; -}; - static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) @@ -711,17 +706,17 @@ static u32 svm_msrpm_offset(u32 msr) static inline void clgi(void) { - asm volatile (__ex(SVM_CLGI)); + asm volatile (__ex("clgi")); } static inline void stgi(void) { - asm volatile (__ex(SVM_STGI)); + asm volatile (__ex("stgi")); } static inline void invlpga(unsigned long addr, u32 asid) { - asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); + asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr)); } static int get_npt_level(struct kvm_vcpu *vcpu) @@ -1456,10 +1451,11 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) g_tsc_offset = svm->vmcb->control.tsc_offset - svm->nested.hsave->control.tsc_offset; svm->nested.hsave->control.tsc_offset = offset; - } else - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - svm->vmcb->control.tsc_offset, - offset); + } + + trace_kvm_write_tsc_offset(vcpu->vcpu_id, + svm->vmcb->control.tsc_offset - g_tsc_offset, + offset); svm->vmcb->control.tsc_offset = offset + g_tsc_offset; @@ -2129,6 +2125,13 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto out; } + svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL); + if (!svm->vcpu.arch.guest_fpu) { + printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); + err = -ENOMEM; + goto free_partial_svm; + } + err = kvm_vcpu_init(&svm->vcpu, kvm, id); if (err) goto free_svm; @@ -2188,6 +2191,8 @@ free_page1: uninit: 
kvm_vcpu_uninit(&svm->vcpu); free_svm: + kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); +free_partial_svm: kmem_cache_free(kvm_vcpu_cache, svm); out: return ERR_PTR(err); @@ -2217,6 +2222,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_page(virt_to_page(svm->nested.hsave)); __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); + kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); kmem_cache_free(kvm_vcpu_cache, svm); } @@ -3278,6 +3284,8 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr dst->event_inj_err = from->event_inj_err; dst->nested_cr3 = from->nested_cr3; dst->virt_ext = from->virt_ext; + dst->pause_filter_count = from->pause_filter_count; + dst->pause_filter_thresh = from->pause_filter_thresh; } static int nested_svm_vmexit(struct vcpu_svm *svm) @@ -3356,6 +3364,11 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->control.event_inj = 0; nested_vmcb->control.event_inj_err = 0; + nested_vmcb->control.pause_filter_count = + svm->vmcb->control.pause_filter_count; + nested_vmcb->control.pause_filter_thresh = + svm->vmcb->control.pause_filter_thresh; + /* We always set V_INTR_MASKING and remember the old value in hflags */ if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; @@ -3532,6 +3545,11 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; + svm->vmcb->control.pause_filter_count = + nested_vmcb->control.pause_filter_count; + svm->vmcb->control.pause_filter_thresh = + nested_vmcb->control.pause_filter_thresh; + nested_svm_unmap(page); /* Enter Guest-Mode */ @@ -5636,9 +5654,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) /* Enter guest mode */ "push %%" _ASM_AX " \n\t" "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t" - __ex(SVM_VMLOAD) "\n\t" - __ex(SVM_VMRUN) "\n\t" - __ex(SVM_VMSAVE) "\n\t" + __ex("vmload %%" _ASM_AX) "\n\t" + __ex("vmrun %%" _ASM_AX) "\n\t" + __ex("vmsave %%" _ASM_AX) "\n\t" "pop %%" _ASM_AX " \n\t" /* Save guest registers, load host registers */ @@ -5836,6 +5854,13 @@ static bool svm_cpu_has_accelerated_tpr(void) static bool svm_has_emulated_msr(int index) { + switch (index) { + case MSR_IA32_MCG_EXT_CTL: + return false; + default: + break; + } + return true; } @@ -5924,6 +5949,11 @@ static bool svm_umip_emulated(void) return false; } +static bool svm_pt_supported(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -7053,6 +7083,12 @@ failed: return ret; } +static uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) +{ + /* Not supported */ + return 0; +} + static int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version) { @@ -7159,6 +7195,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .mpx_supported = svm_mpx_supported, .xsaves_supported = svm_xsaves_supported, .umip_emulated = svm_umip_emulated, + .pt_supported = svm_pt_supported, .set_supported_cpuid = svm_set_supported_cpuid, @@ -7191,6 +7228,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .mem_enc_unreg_region = svm_unregister_enc_region, .nested_enable_evmcs = nested_enable_evmcs, + .nested_get_evmcs_version = nested_get_evmcs_version, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h @@ -1254,24 +1254,26 @@ TRACE_EVENT(kvm_hv_stimer_callback, * Tracepoint for 
stimer_expiration. */ TRACE_EVENT(kvm_hv_stimer_expiration, - TP_PROTO(int vcpu_id, int timer_index, int msg_send_result), - TP_ARGS(vcpu_id, timer_index, msg_send_result), + TP_PROTO(int vcpu_id, int timer_index, int direct, int msg_send_result), + TP_ARGS(vcpu_id, timer_index, direct, msg_send_result), TP_STRUCT__entry( __field(int, vcpu_id) __field(int, timer_index) + __field(int, direct) __field(int, msg_send_result) ), TP_fast_assign( __entry->vcpu_id = vcpu_id; __entry->timer_index = timer_index; + __entry->direct = direct; __entry->msg_send_result = msg_send_result; ), - TP_printk("vcpu_id %d timer %d msg send result %d", + TP_printk("vcpu_id %d timer %d direct %d send result %d", __entry->vcpu_id, __entry->timer_index, - __entry->msg_send_result) + __entry->direct, __entry->msg_send_result) ); /* diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c @@ -1,15252 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Authors: - * Avi Kivity <avi@qumranet.com> - * Yaniv Kamay <yaniv@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include "irq.h" -#include "mmu.h" -#include "cpuid.h" -#include "lapic.h" -#include "hyperv.h" - -#include <linux/kvm_host.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/sched.h> -#include <linux/moduleparam.h> -#include <linux/mod_devicetable.h> -#include <linux/trace_events.h> -#include <linux/slab.h> -#include <linux/tboot.h> -#include <linux/hrtimer.h> -#include <linux/frame.h> -#include <linux/nospec.h> -#include "kvm_cache_regs.h" -#include "x86.h" - -#include <asm/asm.h> -#include <asm/cpu.h> -#include <asm/io.h> -#include <asm/desc.h> -#include <asm/vmx.h> -#include <asm/virtext.h> -#include <asm/mce.h> -#include <asm/fpu/internal.h> -#include <asm/perf_event.h> -#include <asm/debugreg.h> -#include <asm/kexec.h> -#include <asm/apic.h> -#include <asm/irq_remapping.h> -#include <asm/mmu_context.h> -#include <asm/spec-ctrl.h> -#include <asm/mshyperv.h> - -#include "trace.h" -#include "pmu.h" -#include "vmx_evmcs.h" - -#define __ex(x) __kvm_handle_fault_on_reboot(x) -#define __ex_clear(x, reg) \ - ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg) - -MODULE_AUTHOR("Qumranet"); -MODULE_LICENSE("GPL"); - -static const struct x86_cpu_id vmx_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_VMX), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); - -static bool __read_mostly enable_vpid = 1; -module_param_named(vpid, enable_vpid, bool, 0444); - -static bool __read_mostly enable_vnmi = 1; -module_param_named(vnmi, enable_vnmi, bool, S_IRUGO); - -static bool __read_mostly flexpriority_enabled = 1; -module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); - -static bool __read_mostly enable_ept = 1; -module_param_named(ept, enable_ept, bool, S_IRUGO); - -static bool __read_mostly enable_unrestricted_guest = 1; -module_param_named(unrestricted_guest, - enable_unrestricted_guest, bool, S_IRUGO); - -static bool __read_mostly enable_ept_ad_bits = 1; -module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); - -static bool __read_mostly emulate_invalid_guest_state = true; -module_param(emulate_invalid_guest_state, bool, S_IRUGO); 
- -static bool __read_mostly fasteoi = 1; -module_param(fasteoi, bool, S_IRUGO); - -static bool __read_mostly enable_apicv = 1; -module_param(enable_apicv, bool, S_IRUGO); - -static bool __read_mostly enable_shadow_vmcs = 1; -module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); -/* - * If nested=1, nested virtualization is supported, i.e., guests may use - * VMX and be a hypervisor for its own guests. If nested=0, guests may not - * use VMX instructions. - */ -static bool __read_mostly nested = 1; -module_param(nested, bool, S_IRUGO); - -static bool __read_mostly nested_early_check = 0; -module_param(nested_early_check, bool, S_IRUGO); - -static u64 __read_mostly host_xss; - -static bool __read_mostly enable_pml = 1; -module_param_named(pml, enable_pml, bool, S_IRUGO); - -#define MSR_TYPE_R 1 -#define MSR_TYPE_W 2 -#define MSR_TYPE_RW 3 - -#define MSR_BITMAP_MODE_X2APIC 1 -#define MSR_BITMAP_MODE_X2APIC_APICV 2 - -#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL - -/* Guest_tsc -> host_tsc conversion requires 64-bit division. */ -static int __read_mostly cpu_preemption_timer_multi; -static bool __read_mostly enable_preemption_timer = 1; -#ifdef CONFIG_X86_64 -module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); -#endif - -#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) -#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE -#define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ - X86_CR0_WP | X86_CR0_PG | X86_CR0_PE) -#define KVM_CR4_GUEST_OWNED_BITS \ - (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD) - -#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE -#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) -#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) - -#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) - -#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 - -/* - * Hyper-V requires all of these, so mark them as supported even though - * they are just treated the same as all-context. - */ -#define VMX_VPID_EXTENT_SUPPORTED_MASK \ - (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ - VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ - VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ - VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) - -/* - * These 2 parameters are used to config the controls for Pause-Loop Exiting: - * ple_gap: upper bound on the amount of time between two successive - * executions of PAUSE in a loop. Also indicate if ple enabled. - * According to test, this time is usually smaller than 128 cycles. - * ple_window: upper bound on the amount of time a guest is allowed to execute - * in a PAUSE loop. Tests indicate that most spinlocks are held for - * less than 2^12 cycles - * Time is measured based on a counter that runs at the same rate as the TSC, - * refer SDM volume 3b section 21.6.13 & 22.1.3. - */ -static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; -module_param(ple_gap, uint, 0444); - -static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; -module_param(ple_window, uint, 0444); - -/* Default doubles per-vcpu window every exit. */ -static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW; -module_param(ple_window_grow, uint, 0444); - -/* Default resets per-vcpu window every exit to ple_window. 
*/ -static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; -module_param(ple_window_shrink, uint, 0444); - -/* Default is to compute the maximum so we can never overflow. */ -static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; -module_param(ple_window_max, uint, 0444); - -extern const ulong vmx_return; -extern const ulong vmx_early_consistency_check_return; - -static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); -static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); -static DEFINE_MUTEX(vmx_l1d_flush_mutex); - -/* Storage for pre module init parameter parsing */ -static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; - -static const struct { - const char *option; - bool for_parse; -} vmentry_l1d_param[] = { - [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, - [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, - [VMENTER_L1D_FLUSH_COND] = {"cond", true}, - [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, - [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, - [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, -}; - -#define L1D_CACHE_ORDER 4 -static void *vmx_l1d_flush_pages; - -static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) -{ - struct page *page; - unsigned int i; - - if (!enable_ept) { - l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; - return 0; - } - - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { - u64 msr; - - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); - if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { - l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; - return 0; - } - } - - /* If set to auto use the default l1tf mitigation method */ - if (l1tf == VMENTER_L1D_FLUSH_AUTO) { - switch (l1tf_mitigation) { - case L1TF_MITIGATION_OFF: - l1tf = VMENTER_L1D_FLUSH_NEVER; - break; - case L1TF_MITIGATION_FLUSH_NOWARN: - case L1TF_MITIGATION_FLUSH: - case L1TF_MITIGATION_FLUSH_NOSMT: - l1tf = VMENTER_L1D_FLUSH_COND; - break; - case L1TF_MITIGATION_FULL: - case L1TF_MITIGATION_FULL_FORCE: - l1tf = VMENTER_L1D_FLUSH_ALWAYS; - break; - } - } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { - l1tf = VMENTER_L1D_FLUSH_ALWAYS; - } - - if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && - !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { - page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); - if (!page) - return -ENOMEM; - vmx_l1d_flush_pages = page_address(page); - - /* - * Initialize each page with a different pattern in - * order to protect against KSM in the nested - * virtualization case. - */ - for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { - memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, - PAGE_SIZE); - } - } - - l1tf_vmx_mitigation = l1tf; - - if (l1tf != VMENTER_L1D_FLUSH_NEVER) - static_branch_enable(&vmx_l1d_should_flush); - else - static_branch_disable(&vmx_l1d_should_flush); - - if (l1tf == VMENTER_L1D_FLUSH_COND) - static_branch_enable(&vmx_l1d_flush_cond); - else - static_branch_disable(&vmx_l1d_flush_cond); - return 0; -} - -static int vmentry_l1d_flush_parse(const char *s) -{ - unsigned int i; - - if (s) { - for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { - if (vmentry_l1d_param[i].for_parse && - sysfs_streq(s, vmentry_l1d_param[i].option)) - return i; - } - } - return -EINVAL; -} - -static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) -{ - int l1tf, ret; - - l1tf = vmentry_l1d_flush_parse(s); - if (l1tf < 0) - return l1tf; - - if (!boot_cpu_has(X86_BUG_L1TF)) - return 0; - - /* - * Has vmx_init() run already? 
If not then this is the pre init - * parameter parsing. In that case just store the value and let - * vmx_init() do the proper setup after enable_ept has been - * established. - */ - if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { - vmentry_l1d_flush_param = l1tf; - return 0; - } - - mutex_lock(&vmx_l1d_flush_mutex); - ret = vmx_setup_l1d_flush(l1tf); - mutex_unlock(&vmx_l1d_flush_mutex); - return ret; -} - -static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) -{ - if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) - return sprintf(s, "???\n"); - - return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); -} - -static const struct kernel_param_ops vmentry_l1d_flush_ops = { - .set = vmentry_l1d_flush_set, - .get = vmentry_l1d_flush_get, -}; -module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); - -enum ept_pointers_status { - EPT_POINTERS_CHECK = 0, - EPT_POINTERS_MATCH = 1, - EPT_POINTERS_MISMATCH = 2 -}; - -struct kvm_vmx { - struct kvm kvm; - - unsigned int tss_addr; - bool ept_identity_pagetable_done; - gpa_t ept_identity_map_addr; - - enum ept_pointers_status ept_pointers_match; - spinlock_t ept_pointer_lock; -}; - -#define NR_AUTOLOAD_MSRS 8 - -struct vmcs_hdr { - u32 revision_id:31; - u32 shadow_vmcs:1; -}; - -struct vmcs { - struct vmcs_hdr hdr; - u32 abort; - char data[0]; -}; - -/* - * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT - * and whose values change infrequently, but are not constant. I.e. this is - * used as a write-through cache of the corresponding VMCS fields. - */ -struct vmcs_host_state { - unsigned long cr3; /* May not match real cr3 */ - unsigned long cr4; /* May not match real cr4 */ - unsigned long gs_base; - unsigned long fs_base; - - u16 fs_sel, gs_sel, ldt_sel; -#ifdef CONFIG_X86_64 - u16 ds_sel, es_sel; -#endif -}; - -/* - * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also - * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs - * loaded on this CPU (so we can clear them if the CPU goes down). - */ -struct loaded_vmcs { - struct vmcs *vmcs; - struct vmcs *shadow_vmcs; - int cpu; - bool launched; - bool nmi_known_unmasked; - bool hv_timer_armed; - /* Support for vnmi-less CPUs */ - int soft_vnmi_blocked; - ktime_t entry_time; - s64 vnmi_blocked_time; - unsigned long *msr_bitmap; - struct list_head loaded_vmcss_on_cpu_link; - struct vmcs_host_state host_state; -}; - -struct shared_msr_entry { - unsigned index; - u64 data; - u64 mask; -}; - -/* - * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a - * single nested guest (L2), hence the name vmcs12. Any VMX implementation has - * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is - * stored in guest memory specified by VMPTRLD, but is opaque to the guest, - * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. - * More than one of these structures may exist, if L1 runs multiple L2 guests. - * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the - * underlying hardware which will be used to run L2. - * This structure is packed to ensure that its layout is identical across - * machines (necessary for live migration). - * - * IMPORTANT: Changing the layout of existing fields in this structure - * will break save/restore compatibility with older kvm releases. 
When - * adding new fields, either use space in the reserved padding* arrays - * or add the new fields to the end of the structure. - */ -typedef u64 natural_width; -struct __packed vmcs12 { - /* According to the Intel spec, a VMCS region must start with the - * following two fields. Then follow implementation-specific data. - */ - struct vmcs_hdr hdr; - u32 abort; - - u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ - u32 padding[7]; /* room for future expansion */ - - u64 io_bitmap_a; - u64 io_bitmap_b; - u64 msr_bitmap; - u64 vm_exit_msr_store_addr; - u64 vm_exit_msr_load_addr; - u64 vm_entry_msr_load_addr; - u64 tsc_offset; - u64 virtual_apic_page_addr; - u64 apic_access_addr; - u64 posted_intr_desc_addr; - u64 ept_pointer; - u64 eoi_exit_bitmap0; - u64 eoi_exit_bitmap1; - u64 eoi_exit_bitmap2; - u64 eoi_exit_bitmap3; - u64 xss_exit_bitmap; - u64 guest_physical_address; - u64 vmcs_link_pointer; - u64 guest_ia32_debugctl; - u64 guest_ia32_pat; - u64 guest_ia32_efer; - u64 guest_ia32_perf_global_ctrl; - u64 guest_pdptr0; - u64 guest_pdptr1; - u64 guest_pdptr2; - u64 guest_pdptr3; - u64 guest_bndcfgs; - u64 host_ia32_pat; - u64 host_ia32_efer; - u64 host_ia32_perf_global_ctrl; - u64 vmread_bitmap; - u64 vmwrite_bitmap; - u64 vm_function_control; - u64 eptp_list_address; - u64 pml_address; - u64 padding64[3]; /* room for future expansion */ - /* - * To allow migration of L1 (complete with its L2 guests) between - * machines of different natural widths (32 or 64 bit), we cannot have - * unsigned long fields with no explict size. We use u64 (aliased - * natural_width) instead. Luckily, x86 is little-endian. - */ - natural_width cr0_guest_host_mask; - natural_width cr4_guest_host_mask; - natural_width cr0_read_shadow; - natural_width cr4_read_shadow; - natural_width cr3_target_value0; - natural_width cr3_target_value1; - natural_width cr3_target_value2; - natural_width cr3_target_value3; - natural_width exit_qualification; - natural_width guest_linear_address; - natural_width guest_cr0; - natural_width guest_cr3; - natural_width guest_cr4; - natural_width guest_es_base; - natural_width guest_cs_base; - natural_width guest_ss_base; - natural_width guest_ds_base; - natural_width guest_fs_base; - natural_width guest_gs_base; - natural_width guest_ldtr_base; - natural_width guest_tr_base; - natural_width guest_gdtr_base; - natural_width guest_idtr_base; - natural_width guest_dr7; - natural_width guest_rsp; - natural_width guest_rip; - natural_width guest_rflags; - natural_width guest_pending_dbg_exceptions; - natural_width guest_sysenter_esp; - natural_width guest_sysenter_eip; - natural_width host_cr0; - natural_width host_cr3; - natural_width host_cr4; - natural_width host_fs_base; - natural_width host_gs_base; - natural_width host_tr_base; - natural_width host_gdtr_base; - natural_width host_idtr_base; - natural_width host_ia32_sysenter_esp; - natural_width host_ia32_sysenter_eip; - natural_width host_rsp; - natural_width host_rip; - natural_width paddingl[8]; /* room for future expansion */ - u32 pin_based_vm_exec_control; - u32 cpu_based_vm_exec_control; - u32 exception_bitmap; - u32 page_fault_error_code_mask; - u32 page_fault_error_code_match; - u32 cr3_target_count; - u32 vm_exit_controls; - u32 vm_exit_msr_store_count; - u32 vm_exit_msr_load_count; - u32 vm_entry_controls; - u32 vm_entry_msr_load_count; - u32 vm_entry_intr_info_field; - u32 vm_entry_exception_error_code; - u32 vm_entry_instruction_len; - u32 tpr_threshold; - u32 secondary_vm_exec_control; - u32 
vm_instruction_error; - u32 vm_exit_reason; - u32 vm_exit_intr_info; - u32 vm_exit_intr_error_code; - u32 idt_vectoring_info_field; - u32 idt_vectoring_error_code; - u32 vm_exit_instruction_len; - u32 vmx_instruction_info; - u32 guest_es_limit; - u32 guest_cs_limit; - u32 guest_ss_limit; - u32 guest_ds_limit; - u32 guest_fs_limit; - u32 guest_gs_limit; - u32 guest_ldtr_limit; - u32 guest_tr_limit; - u32 guest_gdtr_limit; - u32 guest_idtr_limit; - u32 guest_es_ar_bytes; - u32 guest_cs_ar_bytes; - u32 guest_ss_ar_bytes; - u32 guest_ds_ar_bytes; - u32 guest_fs_ar_bytes; - u32 guest_gs_ar_bytes; - u32 guest_ldtr_ar_bytes; - u32 guest_tr_ar_bytes; - u32 guest_interruptibility_info; - u32 guest_activity_state; - u32 guest_sysenter_cs; - u32 host_ia32_sysenter_cs; - u32 vmx_preemption_timer_value; - u32 padding32[7]; /* room for future expansion */ - u16 virtual_processor_id; - u16 posted_intr_nv; - u16 guest_es_selector; - u16 guest_cs_selector; - u16 guest_ss_selector; - u16 guest_ds_selector; - u16 guest_fs_selector; - u16 guest_gs_selector; - u16 guest_ldtr_selector; - u16 guest_tr_selector; - u16 guest_intr_status; - u16 host_es_selector; - u16 host_cs_selector; - u16 host_ss_selector; - u16 host_ds_selector; - u16 host_fs_selector; - u16 host_gs_selector; - u16 host_tr_selector; - u16 guest_pml_index; -}; - -/* - * For save/restore compatibility, the vmcs12 field offsets must not change. - */ -#define CHECK_OFFSET(field, loc) \ - BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \ - "Offset of " #field " in struct vmcs12 has changed.") - -static inline void vmx_check_vmcs12_offsets(void) { - CHECK_OFFSET(hdr, 0); - CHECK_OFFSET(abort, 4); - CHECK_OFFSET(launch_state, 8); - CHECK_OFFSET(io_bitmap_a, 40); - CHECK_OFFSET(io_bitmap_b, 48); - CHECK_OFFSET(msr_bitmap, 56); - CHECK_OFFSET(vm_exit_msr_store_addr, 64); - CHECK_OFFSET(vm_exit_msr_load_addr, 72); - CHECK_OFFSET(vm_entry_msr_load_addr, 80); - CHECK_OFFSET(tsc_offset, 88); - CHECK_OFFSET(virtual_apic_page_addr, 96); - CHECK_OFFSET(apic_access_addr, 104); - CHECK_OFFSET(posted_intr_desc_addr, 112); - CHECK_OFFSET(ept_pointer, 120); - CHECK_OFFSET(eoi_exit_bitmap0, 128); - CHECK_OFFSET(eoi_exit_bitmap1, 136); - CHECK_OFFSET(eoi_exit_bitmap2, 144); - CHECK_OFFSET(eoi_exit_bitmap3, 152); - CHECK_OFFSET(xss_exit_bitmap, 160); - CHECK_OFFSET(guest_physical_address, 168); - CHECK_OFFSET(vmcs_link_pointer, 176); - CHECK_OFFSET(guest_ia32_debugctl, 184); - CHECK_OFFSET(guest_ia32_pat, 192); - CHECK_OFFSET(guest_ia32_efer, 200); - CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208); - CHECK_OFFSET(guest_pdptr0, 216); - CHECK_OFFSET(guest_pdptr1, 224); - CHECK_OFFSET(guest_pdptr2, 232); - CHECK_OFFSET(guest_pdptr3, 240); - CHECK_OFFSET(guest_bndcfgs, 248); - CHECK_OFFSET(host_ia32_pat, 256); - CHECK_OFFSET(host_ia32_efer, 264); - CHECK_OFFSET(host_ia32_perf_global_ctrl, 272); - CHECK_OFFSET(vmread_bitmap, 280); - CHECK_OFFSET(vmwrite_bitmap, 288); - CHECK_OFFSET(vm_function_control, 296); - CHECK_OFFSET(eptp_list_address, 304); - CHECK_OFFSET(pml_address, 312); - CHECK_OFFSET(cr0_guest_host_mask, 344); - CHECK_OFFSET(cr4_guest_host_mask, 352); - CHECK_OFFSET(cr0_read_shadow, 360); - CHECK_OFFSET(cr4_read_shadow, 368); - CHECK_OFFSET(cr3_target_value0, 376); - CHECK_OFFSET(cr3_target_value1, 384); - CHECK_OFFSET(cr3_target_value2, 392); - CHECK_OFFSET(cr3_target_value3, 400); - CHECK_OFFSET(exit_qualification, 408); - CHECK_OFFSET(guest_linear_address, 416); - CHECK_OFFSET(guest_cr0, 424); - CHECK_OFFSET(guest_cr3, 432); - 
CHECK_OFFSET(guest_cr4, 440); - CHECK_OFFSET(guest_es_base, 448); - CHECK_OFFSET(guest_cs_base, 456); - CHECK_OFFSET(guest_ss_base, 464); - CHECK_OFFSET(guest_ds_base, 472); - CHECK_OFFSET(guest_fs_base, 480); - CHECK_OFFSET(guest_gs_base, 488); - CHECK_OFFSET(guest_ldtr_base, 496); - CHECK_OFFSET(guest_tr_base, 504); - CHECK_OFFSET(guest_gdtr_base, 512); - CHECK_OFFSET(guest_idtr_base, 520); - CHECK_OFFSET(guest_dr7, 528); - CHECK_OFFSET(guest_rsp, 536); - CHECK_OFFSET(guest_rip, 544); - CHECK_OFFSET(guest_rflags, 552); - CHECK_OFFSET(guest_pending_dbg_exceptions, 560); - CHECK_OFFSET(guest_sysenter_esp, 568); - CHECK_OFFSET(guest_sysenter_eip, 576); - CHECK_OFFSET(host_cr0, 584); - CHECK_OFFSET(host_cr3, 592); - CHECK_OFFSET(host_cr4, 600); - CHECK_OFFSET(host_fs_base, 608); - CHECK_OFFSET(host_gs_base, 616); - CHECK_OFFSET(host_tr_base, 624); - CHECK_OFFSET(host_gdtr_base, 632); - CHECK_OFFSET(host_idtr_base, 640); - CHECK_OFFSET(host_ia32_sysenter_esp, 648); - CHECK_OFFSET(host_ia32_sysenter_eip, 656); - CHECK_OFFSET(host_rsp, 664); - CHECK_OFFSET(host_rip, 672); - CHECK_OFFSET(pin_based_vm_exec_control, 744); - CHECK_OFFSET(cpu_based_vm_exec_control, 748); - CHECK_OFFSET(exception_bitmap, 752); - CHECK_OFFSET(page_fault_error_code_mask, 756); - CHECK_OFFSET(page_fault_error_code_match, 760); - CHECK_OFFSET(cr3_target_count, 764); - CHECK_OFFSET(vm_exit_controls, 768); - CHECK_OFFSET(vm_exit_msr_store_count, 772); - CHECK_OFFSET(vm_exit_msr_load_count, 776); - CHECK_OFFSET(vm_entry_controls, 780); - CHECK_OFFSET(vm_entry_msr_load_count, 784); - CHECK_OFFSET(vm_entry_intr_info_field, 788); - CHECK_OFFSET(vm_entry_exception_error_code, 792); - CHECK_OFFSET(vm_entry_instruction_len, 796); - CHECK_OFFSET(tpr_threshold, 800); - CHECK_OFFSET(secondary_vm_exec_control, 804); - CHECK_OFFSET(vm_instruction_error, 808); - CHECK_OFFSET(vm_exit_reason, 812); - CHECK_OFFSET(vm_exit_intr_info, 816); - CHECK_OFFSET(vm_exit_intr_error_code, 820); - CHECK_OFFSET(idt_vectoring_info_field, 824); - CHECK_OFFSET(idt_vectoring_error_code, 828); - CHECK_OFFSET(vm_exit_instruction_len, 832); - CHECK_OFFSET(vmx_instruction_info, 836); - CHECK_OFFSET(guest_es_limit, 840); - CHECK_OFFSET(guest_cs_limit, 844); - CHECK_OFFSET(guest_ss_limit, 848); - CHECK_OFFSET(guest_ds_limit, 852); - CHECK_OFFSET(guest_fs_limit, 856); - CHECK_OFFSET(guest_gs_limit, 860); - CHECK_OFFSET(guest_ldtr_limit, 864); - CHECK_OFFSET(guest_tr_limit, 868); - CHECK_OFFSET(guest_gdtr_limit, 872); - CHECK_OFFSET(guest_idtr_limit, 876); - CHECK_OFFSET(guest_es_ar_bytes, 880); - CHECK_OFFSET(guest_cs_ar_bytes, 884); - CHECK_OFFSET(guest_ss_ar_bytes, 888); - CHECK_OFFSET(guest_ds_ar_bytes, 892); - CHECK_OFFSET(guest_fs_ar_bytes, 896); - CHECK_OFFSET(guest_gs_ar_bytes, 900); - CHECK_OFFSET(guest_ldtr_ar_bytes, 904); - CHECK_OFFSET(guest_tr_ar_bytes, 908); - CHECK_OFFSET(guest_interruptibility_info, 912); - CHECK_OFFSET(guest_activity_state, 916); - CHECK_OFFSET(guest_sysenter_cs, 920); - CHECK_OFFSET(host_ia32_sysenter_cs, 924); - CHECK_OFFSET(vmx_preemption_timer_value, 928); - CHECK_OFFSET(virtual_processor_id, 960); - CHECK_OFFSET(posted_intr_nv, 962); - CHECK_OFFSET(guest_es_selector, 964); - CHECK_OFFSET(guest_cs_selector, 966); - CHECK_OFFSET(guest_ss_selector, 968); - CHECK_OFFSET(guest_ds_selector, 970); - CHECK_OFFSET(guest_fs_selector, 972); - CHECK_OFFSET(guest_gs_selector, 974); - CHECK_OFFSET(guest_ldtr_selector, 976); - CHECK_OFFSET(guest_tr_selector, 978); - CHECK_OFFSET(guest_intr_status, 980); - CHECK_OFFSET(host_es_selector, 
982); - CHECK_OFFSET(host_cs_selector, 984); - CHECK_OFFSET(host_ss_selector, 986); - CHECK_OFFSET(host_ds_selector, 988); - CHECK_OFFSET(host_fs_selector, 990); - CHECK_OFFSET(host_gs_selector, 992); - CHECK_OFFSET(host_tr_selector, 994); - CHECK_OFFSET(guest_pml_index, 996); -} - -/* - * VMCS12_REVISION is an arbitrary id that should be changed if the content or - * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and - * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. - * - * IMPORTANT: Changing this value will break save/restore compatibility with - * older kvm releases. - */ -#define VMCS12_REVISION 0x11e57ed0 - -/* - * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region - * and any VMCS region. Although only sizeof(struct vmcs12) are used by the - * current implementation, 4K are reserved to avoid future complications. - */ -#define VMCS12_SIZE 0x1000 - -/* - * VMCS12_MAX_FIELD_INDEX is the highest index value used in any - * supported VMCS12 field encoding. - */ -#define VMCS12_MAX_FIELD_INDEX 0x17 - -struct nested_vmx_msrs { - /* - * We only store the "true" versions of the VMX capability MSRs. We - * generate the "non-true" versions by setting the must-be-1 bits - * according to the SDM. - */ - u32 procbased_ctls_low; - u32 procbased_ctls_high; - u32 secondary_ctls_low; - u32 secondary_ctls_high; - u32 pinbased_ctls_low; - u32 pinbased_ctls_high; - u32 exit_ctls_low; - u32 exit_ctls_high; - u32 entry_ctls_low; - u32 entry_ctls_high; - u32 misc_low; - u32 misc_high; - u32 ept_caps; - u32 vpid_caps; - u64 basic; - u64 cr0_fixed0; - u64 cr0_fixed1; - u64 cr4_fixed0; - u64 cr4_fixed1; - u64 vmcs_enum; - u64 vmfunc_controls; -}; - -/* - * The nested_vmx structure is part of vcpu_vmx, and holds information we need - * for correct emulation of VMX (i.e., nested VMX) on this vcpu. - */ -struct nested_vmx { - /* Has the level1 guest done vmxon? */ - bool vmxon; - gpa_t vmxon_ptr; - bool pml_full; - - /* The guest-physical address of the current VMCS L1 keeps for L2 */ - gpa_t current_vmptr; - /* - * Cache of the guest's VMCS, existing outside of guest memory. - * Loaded from guest memory during VMPTRLD. Flushed to guest - * memory during VMCLEAR and VMPTRLD. - */ - struct vmcs12 *cached_vmcs12; - /* - * Cache of the guest's shadow VMCS, existing outside of guest - * memory. Loaded from guest memory during VM entry. Flushed - * to guest memory during VM exit. - */ - struct vmcs12 *cached_shadow_vmcs12; - /* - * Indicates if the shadow vmcs or enlightened vmcs must be updated - * with the data held by struct vmcs12. - */ - bool need_vmcs12_sync; - bool dirty_vmcs12; - - /* - * vmcs02 has been initialized, i.e. state that is constant for - * vmcs02 has been written to the backing VMCS. Initialization - * is delayed until L1 actually attempts to run a nested VM. - */ - bool vmcs02_initialized; - - bool change_vmcs01_virtual_apic_mode; - - /* - * Enlightened VMCS has been enabled. It does not mean that L1 has to - * use it. However, VMX features available to L1 will be limited based - * on what the enlightened VMCS supports. - */ - bool enlightened_vmcs_enabled; - - /* L2 must run next, and mustn't decide to exit to L1. */ - bool nested_run_pending; - - struct loaded_vmcs vmcs02; - - /* - * Guest pages referred to in the vmcs02 with host-physical - * pointers, so we must keep them pinned while L2 runs. 
- */ - struct page *apic_access_page; - struct page *virtual_apic_page; - struct page *pi_desc_page; - struct pi_desc *pi_desc; - bool pi_pending; - u16 posted_intr_nv; - - struct hrtimer preemption_timer; - bool preemption_timer_expired; - - /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ - u64 vmcs01_debugctl; - u64 vmcs01_guest_bndcfgs; - - u16 vpid02; - u16 last_vpid; - - struct nested_vmx_msrs msrs; - - /* SMM related state */ - struct { - /* in VMX operation on SMM entry? */ - bool vmxon; - /* in guest mode on SMM entry? */ - bool guest_mode; - } smm; - - gpa_t hv_evmcs_vmptr; - struct page *hv_evmcs_page; - struct hv_enlightened_vmcs *hv_evmcs; -}; - -#define POSTED_INTR_ON 0 -#define POSTED_INTR_SN 1 - -/* Posted-Interrupt Descriptor */ -struct pi_desc { - u32 pir[8]; /* Posted interrupt requested */ - union { - struct { - /* bit 256 - Outstanding Notification */ - u16 on : 1, - /* bit 257 - Suppress Notification */ - sn : 1, - /* bit 271:258 - Reserved */ - rsvd_1 : 14; - /* bit 279:272 - Notification Vector */ - u8 nv; - /* bit 287:280 - Reserved */ - u8 rsvd_2; - /* bit 319:288 - Notification Destination */ - u32 ndst; - }; - u64 control; - }; - u32 rsvd[6]; -} __aligned(64); - -static bool pi_test_and_set_on(struct pi_desc *pi_desc) -{ - return test_and_set_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static bool pi_test_and_clear_on(struct pi_desc *pi_desc) -{ - return test_and_clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) -{ - return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); -} - -static inline void pi_clear_sn(struct pi_desc *pi_desc) -{ - return clear_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_set_sn(struct pi_desc *pi_desc) -{ - return set_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_clear_on(struct pi_desc *pi_desc) -{ - clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_on(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_sn(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -struct vmx_msrs { - unsigned int nr; - struct vmx_msr_entry val[NR_AUTOLOAD_MSRS]; -}; - -struct vcpu_vmx { - struct kvm_vcpu vcpu; - unsigned long host_rsp; - u8 fail; - u8 msr_bitmap_mode; - u32 exit_intr_info; - u32 idt_vectoring_info; - ulong rflags; - struct shared_msr_entry *guest_msrs; - int nmsrs; - int save_nmsrs; - bool guest_msrs_dirty; - unsigned long host_idt_base; -#ifdef CONFIG_X86_64 - u64 msr_host_kernel_gs_base; - u64 msr_guest_kernel_gs_base; -#endif - - u64 arch_capabilities; - u64 spec_ctrl; - - u32 vm_entry_controls_shadow; - u32 vm_exit_controls_shadow; - u32 secondary_exec_control; - - /* - * loaded_vmcs points to the VMCS currently used in this vcpu. For a - * non-nested (L1) guest, it always points to vmcs01. For a nested - * guest (L2), it points to a different VMCS. loaded_cpu_state points - * to the VMCS whose state is loaded into the CPU registers that only - * need to be switched when transitioning to/from the kernel; a NULL - * value indicates that host state is loaded. 
- */ - struct loaded_vmcs vmcs01; - struct loaded_vmcs *loaded_vmcs; - struct loaded_vmcs *loaded_cpu_state; - bool __launched; /* temporary, used in vmx_vcpu_run */ - struct msr_autoload { - struct vmx_msrs guest; - struct vmx_msrs host; - } msr_autoload; - - struct { - int vm86_active; - ulong save_rflags; - struct kvm_segment segs[8]; - } rmode; - struct { - u32 bitmask; /* 4 bits per segment (1 bit per field) */ - struct kvm_save_segment { - u16 selector; - unsigned long base; - u32 limit; - u32 ar; - } seg[8]; - } segment_cache; - int vpid; - bool emulation_required; - - u32 exit_reason; - - /* Posted interrupt descriptor */ - struct pi_desc pi_desc; - - /* Support for a guest hypervisor (nested VMX) */ - struct nested_vmx nested; - - /* Dynamic PLE window. */ - int ple_window; - bool ple_window_dirty; - - bool req_immediate_exit; - - /* Support for PML */ -#define PML_ENTITY_NUM 512 - struct page *pml_pg; - - /* apic deadline value in host tsc */ - u64 hv_deadline_tsc; - - u64 current_tsc_ratio; - - u32 host_pkru; - - unsigned long host_debugctlmsr; - - /* - * Only bits masked by msr_ia32_feature_control_valid_bits can be set in - * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included - * in msr_ia32_feature_control_valid_bits. - */ - u64 msr_ia32_feature_control; - u64 msr_ia32_feature_control_valid_bits; - u64 ept_pointer; -}; - -enum segment_cache_field { - SEG_FIELD_SEL = 0, - SEG_FIELD_BASE = 1, - SEG_FIELD_LIMIT = 2, - SEG_FIELD_AR = 3, - - SEG_FIELD_NR = 4 -}; - -static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) -{ - return container_of(kvm, struct kvm_vmx, kvm); -} - -static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) -{ - return container_of(vcpu, struct vcpu_vmx, vcpu); -} - -static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) -{ - return &(to_vmx(vcpu)->pi_desc); -} - -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) -#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) -#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name) -#define FIELD64(number, name) \ - FIELD(number, name), \ - [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32) - - -static u16 shadow_read_only_fields[] = { -#define SHADOW_FIELD_RO(x) x, -#include "vmx_shadow_fields.h" -}; -static int max_shadow_read_only_fields = - ARRAY_SIZE(shadow_read_only_fields); - -static u16 shadow_read_write_fields[] = { -#define SHADOW_FIELD_RW(x) x, -#include "vmx_shadow_fields.h" -}; -static int max_shadow_read_write_fields = - ARRAY_SIZE(shadow_read_write_fields); - -static const unsigned short vmcs_field_to_offset_table[] = { - FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), - FIELD(POSTED_INTR_NV, posted_intr_nv), - FIELD(GUEST_ES_SELECTOR, guest_es_selector), - FIELD(GUEST_CS_SELECTOR, guest_cs_selector), - FIELD(GUEST_SS_SELECTOR, guest_ss_selector), - FIELD(GUEST_DS_SELECTOR, guest_ds_selector), - FIELD(GUEST_FS_SELECTOR, guest_fs_selector), - FIELD(GUEST_GS_SELECTOR, guest_gs_selector), - FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), - FIELD(GUEST_TR_SELECTOR, guest_tr_selector), - FIELD(GUEST_INTR_STATUS, guest_intr_status), - FIELD(GUEST_PML_INDEX, guest_pml_index), - FIELD(HOST_ES_SELECTOR, host_es_selector), - FIELD(HOST_CS_SELECTOR, host_cs_selector), - FIELD(HOST_SS_SELECTOR, host_ss_selector), - FIELD(HOST_DS_SELECTOR, host_ds_selector), - FIELD(HOST_FS_SELECTOR, host_fs_selector), - FIELD(HOST_GS_SELECTOR, host_gs_selector), - FIELD(HOST_TR_SELECTOR, host_tr_selector), - FIELD64(IO_BITMAP_A, io_bitmap_a), 
- FIELD64(IO_BITMAP_B, io_bitmap_b), - FIELD64(MSR_BITMAP, msr_bitmap), - FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr), - FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr), - FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr), - FIELD64(PML_ADDRESS, pml_address), - FIELD64(TSC_OFFSET, tsc_offset), - FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), - FIELD64(APIC_ACCESS_ADDR, apic_access_addr), - FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), - FIELD64(VM_FUNCTION_CONTROL, vm_function_control), - FIELD64(EPT_POINTER, ept_pointer), - FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0), - FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), - FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), - FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), - FIELD64(EPTP_LIST_ADDRESS, eptp_list_address), - FIELD64(VMREAD_BITMAP, vmread_bitmap), - FIELD64(VMWRITE_BITMAP, vmwrite_bitmap), - FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), - FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), - FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), - FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), - FIELD64(GUEST_IA32_PAT, guest_ia32_pat), - FIELD64(GUEST_IA32_EFER, guest_ia32_efer), - FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl), - FIELD64(GUEST_PDPTR0, guest_pdptr0), - FIELD64(GUEST_PDPTR1, guest_pdptr1), - FIELD64(GUEST_PDPTR2, guest_pdptr2), - FIELD64(GUEST_PDPTR3, guest_pdptr3), - FIELD64(GUEST_BNDCFGS, guest_bndcfgs), - FIELD64(HOST_IA32_PAT, host_ia32_pat), - FIELD64(HOST_IA32_EFER, host_ia32_efer), - FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), - FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control), - FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control), - FIELD(EXCEPTION_BITMAP, exception_bitmap), - FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask), - FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match), - FIELD(CR3_TARGET_COUNT, cr3_target_count), - FIELD(VM_EXIT_CONTROLS, vm_exit_controls), - FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count), - FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count), - FIELD(VM_ENTRY_CONTROLS, vm_entry_controls), - FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count), - FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field), - FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code), - FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len), - FIELD(TPR_THRESHOLD, tpr_threshold), - FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control), - FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error), - FIELD(VM_EXIT_REASON, vm_exit_reason), - FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info), - FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code), - FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field), - FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code), - FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len), - FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info), - FIELD(GUEST_ES_LIMIT, guest_es_limit), - FIELD(GUEST_CS_LIMIT, guest_cs_limit), - FIELD(GUEST_SS_LIMIT, guest_ss_limit), - FIELD(GUEST_DS_LIMIT, guest_ds_limit), - FIELD(GUEST_FS_LIMIT, guest_fs_limit), - FIELD(GUEST_GS_LIMIT, guest_gs_limit), - FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit), - FIELD(GUEST_TR_LIMIT, guest_tr_limit), - FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit), - FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit), - FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes), - FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes), - FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes), - FIELD(GUEST_DS_AR_BYTES, 
guest_ds_ar_bytes), - FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes), - FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes), - FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes), - FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes), - FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info), - FIELD(GUEST_ACTIVITY_STATE, guest_activity_state), - FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), - FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), - FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value), - FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), - FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), - FIELD(CR0_READ_SHADOW, cr0_read_shadow), - FIELD(CR4_READ_SHADOW, cr4_read_shadow), - FIELD(CR3_TARGET_VALUE0, cr3_target_value0), - FIELD(CR3_TARGET_VALUE1, cr3_target_value1), - FIELD(CR3_TARGET_VALUE2, cr3_target_value2), - FIELD(CR3_TARGET_VALUE3, cr3_target_value3), - FIELD(EXIT_QUALIFICATION, exit_qualification), - FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address), - FIELD(GUEST_CR0, guest_cr0), - FIELD(GUEST_CR3, guest_cr3), - FIELD(GUEST_CR4, guest_cr4), - FIELD(GUEST_ES_BASE, guest_es_base), - FIELD(GUEST_CS_BASE, guest_cs_base), - FIELD(GUEST_SS_BASE, guest_ss_base), - FIELD(GUEST_DS_BASE, guest_ds_base), - FIELD(GUEST_FS_BASE, guest_fs_base), - FIELD(GUEST_GS_BASE, guest_gs_base), - FIELD(GUEST_LDTR_BASE, guest_ldtr_base), - FIELD(GUEST_TR_BASE, guest_tr_base), - FIELD(GUEST_GDTR_BASE, guest_gdtr_base), - FIELD(GUEST_IDTR_BASE, guest_idtr_base), - FIELD(GUEST_DR7, guest_dr7), - FIELD(GUEST_RSP, guest_rsp), - FIELD(GUEST_RIP, guest_rip), - FIELD(GUEST_RFLAGS, guest_rflags), - FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions), - FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp), - FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip), - FIELD(HOST_CR0, host_cr0), - FIELD(HOST_CR3, host_cr3), - FIELD(HOST_CR4, host_cr4), - FIELD(HOST_FS_BASE, host_fs_base), - FIELD(HOST_GS_BASE, host_gs_base), - FIELD(HOST_TR_BASE, host_tr_base), - FIELD(HOST_GDTR_BASE, host_gdtr_base), - FIELD(HOST_IDTR_BASE, host_idtr_base), - FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp), - FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), - FIELD(HOST_RSP, host_rsp), - FIELD(HOST_RIP, host_rip), -}; - -static inline short vmcs_field_to_offset(unsigned long field) -{ - const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table); - unsigned short offset; - unsigned index; - - if (field >> 15) - return -ENOENT; - - index = ROL16(field, 6); - if (index >= size) - return -ENOENT; - - index = array_index_nospec(index, size); - offset = vmcs_field_to_offset_table[index]; - if (offset == 0) - return -ENOENT; - return offset; -} - -static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->nested.cached_vmcs12; -} - -static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->nested.cached_shadow_vmcs12; -} - -static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); -static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); -static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); -static bool vmx_xsaves_supported(void); -static void vmx_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg); -static void vmx_get_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg); -static bool guest_state_valid(struct kvm_vcpu *vcpu); -static u32 vmx_segment_access_rights(struct kvm_segment *var); -static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); -static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); 
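The FIELD()/FIELD64() table and vmcs_field_to_offset() above map each supported 16-bit VMCS field encoding to a byte offset in struct vmcs12, keyed by ROL16(encoding, 6), with a zero slot marking an unsupported field. A minimal self-contained sketch of that lookup in plain user-space C (the demo_ names are made up for illustration; the two encodings and byte offsets mirror virtual_processor_id and guest_es_selector from the CHECK_OFFSET list above; the real helper's high-bit check and array_index_nospec() hardening are omitted):

#include <stdio.h>
#include <stdint.h>

#define ROL16(val, n)  ((uint16_t)(((uint16_t)(val) << (n)) | ((uint16_t)(val) >> (16 - (n)))))
#define ARRAY_SIZE(a)  (sizeof(a) / sizeof((a)[0]))

/*
 * Index = ROL16(field encoding, 6), value = byte offset into the struct;
 * a zero entry marks a hole (field not supported).  The offsets 960 and
 * 964 mirror virtual_processor_id and guest_es_selector in the layout
 * verified by the CHECK_OFFSET list above.
 */
static const unsigned short demo_field_to_offset_table[4] = {
    [ROL16(0x0000, 6)] = 960,   /* VIRTUAL_PROCESSOR_ID (illustrative) */
    [ROL16(0x0800, 6)] = 964,   /* GUEST_ES_SELECTOR    (illustrative) */
};

static short demo_field_to_offset(unsigned long field)
{
    unsigned int index = ROL16(field, 6);

    if (index >= ARRAY_SIZE(demo_field_to_offset_table) ||
        demo_field_to_offset_table[index] == 0)
        return -1;              /* stands in for -ENOENT */
    return demo_field_to_offset_table[index];
}

int main(void)
{
    printf("0x0000 -> %d\n", demo_field_to_offset(0x0000));  /* 960 */
    printf("0x0800 -> %d\n", demo_field_to_offset(0x0800));  /* 964 */
    printf("0x0c00 -> %d\n", demo_field_to_offset(0x0c00));  /* -1  */
    return 0;
}

An encoding whose rotated index falls outside the table, or whose slot is zero, fails the lookup in the same way the real helper returns -ENOENT.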
-static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); -static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, - u16 error_code); -static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); -static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type); - -static DEFINE_PER_CPU(struct vmcs *, vmxarea); -static DEFINE_PER_CPU(struct vmcs *, current_vmcs); -/* - * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed - * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. - */ -static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); - -/* - * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we - * can find which vCPU should be waken up. - */ -static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); -static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); - -enum { - VMX_VMREAD_BITMAP, - VMX_VMWRITE_BITMAP, - VMX_BITMAP_NR -}; - -static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; - -#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) -#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) - -static bool cpu_has_load_ia32_efer; -static bool cpu_has_load_perf_global_ctrl; - -static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); -static DEFINE_SPINLOCK(vmx_vpid_lock); - -static struct vmcs_config { - int size; - int order; - u32 basic_cap; - u32 revision_id; - u32 pin_based_exec_ctrl; - u32 cpu_based_exec_ctrl; - u32 cpu_based_2nd_exec_ctrl; - u32 vmexit_ctrl; - u32 vmentry_ctrl; - struct nested_vmx_msrs nested; -} vmcs_config; - -static struct vmx_capability { - u32 ept; - u32 vpid; -} vmx_capability; - -#define VMX_SEGMENT_FIELD(seg) \ - [VCPU_SREG_##seg] = { \ - .selector = GUEST_##seg##_SELECTOR, \ - .base = GUEST_##seg##_BASE, \ - .limit = GUEST_##seg##_LIMIT, \ - .ar_bytes = GUEST_##seg##_AR_BYTES, \ - } - -static const struct kvm_vmx_segment_field { - unsigned selector; - unsigned base; - unsigned limit; - unsigned ar_bytes; -} kvm_vmx_segment_fields[] = { - VMX_SEGMENT_FIELD(CS), - VMX_SEGMENT_FIELD(DS), - VMX_SEGMENT_FIELD(ES), - VMX_SEGMENT_FIELD(FS), - VMX_SEGMENT_FIELD(GS), - VMX_SEGMENT_FIELD(SS), - VMX_SEGMENT_FIELD(TR), - VMX_SEGMENT_FIELD(LDTR), -}; - -static u64 host_efer; - -static void ept_save_pdptrs(struct kvm_vcpu *vcpu); - -/* - * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it - * away by decrementing the array size. 
- */ -static const u32 vmx_msr_index[] = { -#ifdef CONFIG_X86_64 - MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, -#endif - MSR_EFER, MSR_TSC_AUX, MSR_STAR, -}; - -DEFINE_STATIC_KEY_FALSE(enable_evmcs); - -#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs)) - -#define KVM_EVMCS_VERSION 1 - -/* - * Enlightened VMCSv1 doesn't support these: - * - * POSTED_INTR_NV = 0x00000002, - * GUEST_INTR_STATUS = 0x00000810, - * APIC_ACCESS_ADDR = 0x00002014, - * POSTED_INTR_DESC_ADDR = 0x00002016, - * EOI_EXIT_BITMAP0 = 0x0000201c, - * EOI_EXIT_BITMAP1 = 0x0000201e, - * EOI_EXIT_BITMAP2 = 0x00002020, - * EOI_EXIT_BITMAP3 = 0x00002022, - * GUEST_PML_INDEX = 0x00000812, - * PML_ADDRESS = 0x0000200e, - * VM_FUNCTION_CONTROL = 0x00002018, - * EPTP_LIST_ADDRESS = 0x00002024, - * VMREAD_BITMAP = 0x00002026, - * VMWRITE_BITMAP = 0x00002028, - * - * TSC_MULTIPLIER = 0x00002032, - * PLE_GAP = 0x00004020, - * PLE_WINDOW = 0x00004022, - * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, - * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, - * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, - * - * Currently unsupported in KVM: - * GUEST_IA32_RTIT_CTL = 0x00002814, - */ -#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ - PIN_BASED_VMX_PREEMPTION_TIMER) -#define EVMCS1_UNSUPPORTED_2NDEXEC \ - (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ - SECONDARY_EXEC_APIC_REGISTER_VIRT | \ - SECONDARY_EXEC_ENABLE_PML | \ - SECONDARY_EXEC_ENABLE_VMFUNC | \ - SECONDARY_EXEC_SHADOW_VMCS | \ - SECONDARY_EXEC_TSC_SCALING | \ - SECONDARY_EXEC_PAUSE_LOOP_EXITING) -#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) -#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) -#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING) - -#if IS_ENABLED(CONFIG_HYPERV) -static bool __read_mostly enlightened_vmcs = true; -module_param(enlightened_vmcs, bool, 0444); - -static inline void evmcs_write64(unsigned long field, u64 value) -{ - u16 clean_field; - int offset = get_evmcs_offset(field, &clean_field); - - if (offset < 0) - return; - - *(u64 *)((char *)current_evmcs + offset) = value; - - current_evmcs->hv_clean_fields &= ~clean_field; -} - -static inline void evmcs_write32(unsigned long field, u32 value) -{ - u16 clean_field; - int offset = get_evmcs_offset(field, &clean_field); - - if (offset < 0) - return; - - *(u32 *)((char *)current_evmcs + offset) = value; - current_evmcs->hv_clean_fields &= ~clean_field; -} - -static inline void evmcs_write16(unsigned long field, u16 value) -{ - u16 clean_field; - int offset = get_evmcs_offset(field, &clean_field); - - if (offset < 0) - return; - - *(u16 *)((char *)current_evmcs + offset) = value; - current_evmcs->hv_clean_fields &= ~clean_field; -} - -static inline u64 evmcs_read64(unsigned long field) -{ - int offset = get_evmcs_offset(field, NULL); - - if (offset < 0) - return 0; - - return *(u64 *)((char *)current_evmcs + offset); -} - -static inline u32 evmcs_read32(unsigned long field) -{ - int offset = get_evmcs_offset(field, NULL); - - if (offset < 0) - return 0; - - return *(u32 *)((char *)current_evmcs + offset); -} - -static inline u16 evmcs_read16(unsigned long field) -{ - int offset = get_evmcs_offset(field, NULL); - - if (offset < 0) - return 0; - - return *(u16 *)((char *)current_evmcs + offset); -} - -static inline void evmcs_touch_msr_bitmap(void) -{ - if (unlikely(!current_evmcs)) - return; - - if (current_evmcs->hv_enlightenments_control.msr_bitmap) - 
current_evmcs->hv_clean_fields &= - ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; -} - -static void evmcs_load(u64 phys_addr) -{ - struct hv_vp_assist_page *vp_ap = - hv_get_vp_assist_page(smp_processor_id()); - - vp_ap->current_nested_vmcs = phys_addr; - vp_ap->enlighten_vmentry = 1; -} - -static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) -{ - vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; - vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC; - - vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; - vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; - -} - -/* check_ept_pointer() should be under protection of ept_pointer_lock. */ -static void check_ept_pointer_match(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - u64 tmp_eptp = INVALID_PAGE; - int i; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!VALID_PAGE(tmp_eptp)) { - tmp_eptp = to_vmx(vcpu)->ept_pointer; - } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) { - to_kvm_vmx(kvm)->ept_pointers_match - = EPT_POINTERS_MISMATCH; - return; - } - } - - to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH; -} - -static int vmx_hv_remote_flush_tlb(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - int ret = -ENOTSUPP, i; - - spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); - - if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK) - check_ept_pointer_match(kvm); - - /* - * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address of the - * base of EPT PML4 table, strip off EPT configuration information. - */ - if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { - kvm_for_each_vcpu(i, vcpu, kvm) - ret |= hyperv_flush_guest_mapping( - to_vmx(kvm_get_vcpu(kvm, i))->ept_pointer & PAGE_MASK); - } else { - ret = hyperv_flush_guest_mapping( - to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK); - } - - spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); - return ret; -} -#else /* !IS_ENABLED(CONFIG_HYPERV) */ -static inline void evmcs_write64(unsigned long field, u64 value) {} -static inline void evmcs_write32(unsigned long field, u32 value) {} -static inline void evmcs_write16(unsigned long field, u16 value) {} -static inline u64 evmcs_read64(unsigned long field) { return 0; } -static inline u32 evmcs_read32(unsigned long field) { return 0; } -static inline u16 evmcs_read16(unsigned long field) { return 0; } -static inline void evmcs_load(u64 phys_addr) {} -static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {} -static inline void evmcs_touch_msr_bitmap(void) {} -#endif /* IS_ENABLED(CONFIG_HYPERV) */ - -static int nested_enable_evmcs(struct kvm_vcpu *vcpu, - uint16_t *vmcs_version) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * vmcs_version represents the range of supported Enlightened VMCS - * versions: lower 8 bits is the minimal version, higher 8 bits is the - * maximum supported version. KVM supports versions from 1 to - * KVM_EVMCS_VERSION. - */ - if (vmcs_version) - *vmcs_version = (KVM_EVMCS_VERSION << 8) | 1; - - /* We don't support disabling the feature for simplicity. 
*/ - if (vmx->nested.enlightened_vmcs_enabled) - return 0; - - vmx->nested.enlightened_vmcs_enabled = true; - - vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; - vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; - vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; - vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC; - vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC; - - return 0; -} - -static inline bool is_exception_n(u32 intr_info, u8 vector) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK); -} - -static inline bool is_debug(u32 intr_info) -{ - return is_exception_n(intr_info, DB_VECTOR); -} - -static inline bool is_breakpoint(u32 intr_info) -{ - return is_exception_n(intr_info, BP_VECTOR); -} - -static inline bool is_page_fault(u32 intr_info) -{ - return is_exception_n(intr_info, PF_VECTOR); -} - -static inline bool is_invalid_opcode(u32 intr_info) -{ - return is_exception_n(intr_info, UD_VECTOR); -} - -static inline bool is_gp_fault(u32 intr_info) -{ - return is_exception_n(intr_info, GP_VECTOR); -} - -static inline bool is_machine_check(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); -} - -/* Undocumented: icebp/int1 */ -static inline bool is_icebp(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) - == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK); -} - -static inline bool cpu_has_vmx_msr_bitmap(void) -{ - return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; -} - -static inline bool cpu_has_vmx_tpr_shadow(void) -{ - return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; -} - -static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu) -{ - return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu); -} - -static inline bool cpu_has_secondary_exec_ctrls(void) -{ - return vmcs_config.cpu_based_exec_ctrl & - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; -} - -static inline bool cpu_has_vmx_virtualize_apic_accesses(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; -} - -static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; -} - -static inline bool cpu_has_vmx_apic_register_virt(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_APIC_REGISTER_VIRT; -} - -static inline bool cpu_has_vmx_virtual_intr_delivery(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; -} - -static inline bool cpu_has_vmx_encls_vmexit(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENCLS_EXITING; -} - -/* - * Comment's format: document - errata name - stepping - processor name. 
- * Refer from - * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp - */ -static u32 vmx_preemption_cpu_tfms[] = { -/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ -0x000206E6, -/* 323056.pdf - AAX65 - C2 - Xeon L3406 */ -/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ -/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ -0x00020652, -/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ -0x00020655, -/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ -/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ -/* - * 320767.pdf - AAP86 - B1 - - * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile - */ -0x000106E5, -/* 321333.pdf - AAM126 - C0 - Xeon 3500 */ -0x000106A0, -/* 321333.pdf - AAM126 - C1 - Xeon 3500 */ -0x000106A1, -/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ -0x000106A4, - /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ - /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ - /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ -0x000106A5, -}; - -static inline bool cpu_has_broken_vmx_preemption_timer(void) -{ - u32 eax = cpuid_eax(0x00000001), i; - - /* Clear the reserved bits */ - eax &= ~(0x3U << 14 | 0xfU << 28); - for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) - if (eax == vmx_preemption_cpu_tfms[i]) - return true; - - return false; -} - -static inline bool cpu_has_vmx_preemption_timer(void) -{ - return vmcs_config.pin_based_exec_ctrl & - PIN_BASED_VMX_PREEMPTION_TIMER; -} - -static inline bool cpu_has_vmx_posted_intr(void) -{ - return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && - vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; -} - -static inline bool cpu_has_vmx_apicv(void) -{ - return cpu_has_vmx_apic_register_virt() && - cpu_has_vmx_virtual_intr_delivery() && - cpu_has_vmx_posted_intr(); -} - -static inline bool cpu_has_vmx_flexpriority(void) -{ - return cpu_has_vmx_tpr_shadow() && - cpu_has_vmx_virtualize_apic_accesses(); -} - -static inline bool cpu_has_vmx_ept_execute_only(void) -{ - return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; -} - -static inline bool cpu_has_vmx_ept_2m_page(void) -{ - return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; -} - -static inline bool cpu_has_vmx_ept_1g_page(void) -{ - return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; -} - -static inline bool cpu_has_vmx_ept_4levels(void) -{ - return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; -} - -static inline bool cpu_has_vmx_ept_mt_wb(void) -{ - return vmx_capability.ept & VMX_EPTP_WB_BIT; -} - -static inline bool cpu_has_vmx_ept_5levels(void) -{ - return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT; -} - -static inline bool cpu_has_vmx_ept_ad_bits(void) -{ - return vmx_capability.ept & VMX_EPT_AD_BIT; -} - -static inline bool cpu_has_vmx_invept_context(void) -{ - return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; -} - -static inline bool cpu_has_vmx_invept_global(void) -{ - return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; -} - -static inline bool cpu_has_vmx_invvpid_individual_addr(void) -{ - return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT; -} - -static inline bool cpu_has_vmx_invvpid_single(void) -{ - return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; -} - -static inline bool cpu_has_vmx_invvpid_global(void) -{ - return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; -} - -static inline bool cpu_has_vmx_invvpid(void) -{ - return vmx_capability.vpid & VMX_VPID_INVVPID_BIT; -} - -static inline bool 
cpu_has_vmx_ept(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENABLE_EPT; -} - -static inline bool cpu_has_vmx_unrestricted_guest(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_UNRESTRICTED_GUEST; -} - -static inline bool cpu_has_vmx_ple(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_PAUSE_LOOP_EXITING; -} - -static inline bool cpu_has_vmx_basic_inout(void) -{ - return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); -} - -static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) -{ - return flexpriority_enabled && lapic_in_kernel(vcpu); -} - -static inline bool cpu_has_vmx_vpid(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENABLE_VPID; -} - -static inline bool cpu_has_vmx_rdtscp(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_RDTSCP; -} - -static inline bool cpu_has_vmx_invpcid(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENABLE_INVPCID; -} - -static inline bool cpu_has_virtual_nmis(void) -{ - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; -} - -static inline bool cpu_has_vmx_wbinvd_exit(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_WBINVD_EXITING; -} - -static inline bool cpu_has_vmx_shadow_vmcs(void) -{ - u64 vmx_msr; - rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); - /* check if the cpu supports writing r/o exit information fields */ - if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) - return false; - - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_SHADOW_VMCS; -} - -static inline bool cpu_has_vmx_pml(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; -} - -static inline bool cpu_has_vmx_tsc_scaling(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_TSC_SCALING; -} - -static inline bool cpu_has_vmx_vmfunc(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENABLE_VMFUNC; -} - -static bool vmx_umip_emulated(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_DESC; -} - -static inline bool report_flexpriority(void) -{ - return flexpriority_enabled; -} - -static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu) -{ - return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low); -} - -/* - * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE - * to modify any valid field of the VMCS, or are the VM-exit - * information fields read-only? 
- */ -static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->nested.msrs.misc_low & - MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS; -} - -static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS; -} - -static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->nested.msrs.procbased_ctls_high & - CPU_BASED_MONITOR_TRAP_FLAG; -} - -static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->nested.msrs.secondary_ctls_high & - SECONDARY_EXEC_SHADOW_VMCS; -} - -static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) -{ - return vmcs12->cpu_based_vm_exec_control & bit; -} - -static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) -{ - return (vmcs12->cpu_based_vm_exec_control & - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && - (vmcs12->secondary_vm_exec_control & bit); -} - -static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) -{ - return vmcs12->pin_based_vm_exec_control & - PIN_BASED_VMX_PREEMPTION_TIMER; -} - -static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12) -{ - return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING; -} - -static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) -{ - return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; -} - -static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); -} - -static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); -} - -static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML); -} - -static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); -} - -static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID); -} - -static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); -} - -static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); -} - -static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) -{ - return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; -} - -static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC); -} - -static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) -{ - return nested_cpu_has_vmfunc(vmcs12) && - (vmcs12->vm_function_control & - VMX_VMFUNC_EPTP_SWITCHING); -} - -static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12) -{ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS); -} - -static inline bool is_nmi(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) - == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK); -} - -static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, - u32 exit_intr_info, - unsigned long exit_qualification); - -static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) -{ - int i; - - for (i = 0; i < vmx->nmsrs; ++i) - if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) - return i; - return -1; -} - 
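nested_cpu_has2() above gates every secondary execution control behind CPU_BASED_ACTIVATE_SECONDARY_CONTROLS in the primary controls, and the wrappers built on it (EPT, VPID, PML, XSAVES, and so on) inherit that behaviour. A minimal standalone sketch of the same gating in plain user-space C (the demo_ names and bit positions are illustrative stand-ins, not the kernel's definitions):

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Illustrative bit positions chosen for the demo only. */
#define DEMO_ACTIVATE_SECONDARY_CONTROLS  (1u << 31)
#define DEMO_SECONDARY_EXEC_ENABLE_EPT    (1u << 1)

struct demo_vmcs12 {
    uint32_t cpu_based_vm_exec_control;   /* primary controls   */
    uint32_t secondary_vm_exec_control;   /* secondary controls */
};

/* A secondary control bit only counts when secondary controls are activated. */
static bool demo_nested_cpu_has2(const struct demo_vmcs12 *vmcs12, uint32_t bit)
{
    return (vmcs12->cpu_based_vm_exec_control & DEMO_ACTIVATE_SECONDARY_CONTROLS) &&
           (vmcs12->secondary_vm_exec_control & bit);
}

int main(void)
{
    struct demo_vmcs12 on  = { DEMO_ACTIVATE_SECONDARY_CONTROLS,
                               DEMO_SECONDARY_EXEC_ENABLE_EPT };
    struct demo_vmcs12 off = { 0,
                               DEMO_SECONDARY_EXEC_ENABLE_EPT };

    /* "off" has the EPT bit set, but it is ignored: secondary controls are inactive. */
    printf("secondary active:   EPT=%d\n",
           demo_nested_cpu_has2(&on, DEMO_SECONDARY_EXEC_ENABLE_EPT));
    printf("secondary inactive: EPT=%d\n",
           demo_nested_cpu_has2(&off, DEMO_SECONDARY_EXEC_ENABLE_EPT));
    return 0;
}

This mirrors the hardware rule that the secondary processor-based controls are consulted only when the activating bit in the primary controls is set.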
-static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) -{ - struct { - u64 vpid : 16; - u64 rsvd : 48; - u64 gva; - } operand = { vpid, 0, gva }; - bool error; - - asm volatile (__ex("invvpid %2, %1") CC_SET(na) - : CC_OUT(na) (error) : "r"(ext), "m"(operand)); - BUG_ON(error); -} - -static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) -{ - struct { - u64 eptp, gpa; - } operand = {eptp, gpa}; - bool error; - - asm volatile (__ex("invept %2, %1") CC_SET(na) - : CC_OUT(na) (error) : "r"(ext), "m"(operand)); - BUG_ON(error); -} - -static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) -{ - int i; - - i = __find_msr_index(vmx, msr); - if (i >= 0) - return &vmx->guest_msrs[i]; - return NULL; -} - -static void vmcs_clear(struct vmcs *vmcs) -{ - u64 phys_addr = __pa(vmcs); - bool error; - - asm volatile (__ex("vmclear %1") CC_SET(na) - : CC_OUT(na) (error) : "m"(phys_addr)); - if (unlikely(error)) - printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", - vmcs, phys_addr); -} - -static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) -{ - vmcs_clear(loaded_vmcs->vmcs); - if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) - vmcs_clear(loaded_vmcs->shadow_vmcs); - loaded_vmcs->cpu = -1; - loaded_vmcs->launched = 0; -} - -static void vmcs_load(struct vmcs *vmcs) -{ - u64 phys_addr = __pa(vmcs); - bool error; - - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_load(phys_addr); - - asm volatile (__ex("vmptrld %1") CC_SET(na) - : CC_OUT(na) (error) : "m"(phys_addr)); - if (unlikely(error)) - printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", - vmcs, phys_addr); -} - -#ifdef CONFIG_KEXEC_CORE -/* - * This bitmap is used to indicate whether the vmclear - * operation is enabled on all cpus. All disabled by - * default. - */ -static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; - -static inline void crash_enable_local_vmclear(int cpu) -{ - cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - -static inline void crash_disable_local_vmclear(int cpu) -{ - cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - -static inline int crash_local_vmclear_enabled(int cpu) -{ - return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - -static void crash_vmclear_local_loaded_vmcss(void) -{ - int cpu = raw_smp_processor_id(); - struct loaded_vmcs *v; - - if (!crash_local_vmclear_enabled(cpu)) - return; - - list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), - loaded_vmcss_on_cpu_link) - vmcs_clear(v->vmcs); -} -#else -static inline void crash_enable_local_vmclear(int cpu) { } -static inline void crash_disable_local_vmclear(int cpu) { } -#endif /* CONFIG_KEXEC_CORE */ - -static void __loaded_vmcs_clear(void *arg) -{ - struct loaded_vmcs *loaded_vmcs = arg; - int cpu = raw_smp_processor_id(); - - if (loaded_vmcs->cpu != cpu) - return; /* vcpu migration can race with cpu offline */ - if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) - per_cpu(current_vmcs, cpu) = NULL; - crash_disable_local_vmclear(cpu); - list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); - - /* - * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link - * is before setting loaded_vmcs->vcpu to -1 which is done in - * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist - * then adds the vmcs into percpu list before it is deleted. 
- */ - smp_wmb(); - - loaded_vmcs_init(loaded_vmcs); - crash_enable_local_vmclear(cpu); -} - -static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) -{ - int cpu = loaded_vmcs->cpu; - - if (cpu != -1) - smp_call_function_single(cpu, - __loaded_vmcs_clear, loaded_vmcs, 1); -} - -static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr) -{ - if (vpid == 0) - return true; - - if (cpu_has_vmx_invvpid_individual_addr()) { - __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); - return true; - } - - return false; -} - -static inline void vpid_sync_vcpu_single(int vpid) -{ - if (vpid == 0) - return; - - if (cpu_has_vmx_invvpid_single()) - __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0); -} - -static inline void vpid_sync_vcpu_global(void) -{ - if (cpu_has_vmx_invvpid_global()) - __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); -} - -static inline void vpid_sync_context(int vpid) -{ - if (cpu_has_vmx_invvpid_single()) - vpid_sync_vcpu_single(vpid); - else - vpid_sync_vcpu_global(); -} - -static inline void ept_sync_global(void) -{ - __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); -} - -static inline void ept_sync_context(u64 eptp) -{ - if (cpu_has_vmx_invept_context()) - __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); - else - ept_sync_global(); -} - -static __always_inline void vmcs_check16(unsigned long field) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, - "16-bit accessor invalid for 64-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, - "16-bit accessor invalid for 64-bit high field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, - "16-bit accessor invalid for 32-bit high field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, - "16-bit accessor invalid for natural width field"); -} - -static __always_inline void vmcs_check32(unsigned long field) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, - "32-bit accessor invalid for 16-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, - "32-bit accessor invalid for natural width field"); -} - -static __always_inline void vmcs_check64(unsigned long field) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, - "64-bit accessor invalid for 16-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, - "64-bit accessor invalid for 64-bit high field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, - "64-bit accessor invalid for 32-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, - "64-bit accessor invalid for natural width field"); -} - -static __always_inline void vmcs_checkl(unsigned long field) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0, - "Natural width accessor invalid for 16-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000, - "Natural width accessor invalid for 64-bit field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, - "Natural width accessor invalid for 64-bit high field"); - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, - "Natural width accessor invalid for 32-bit field"); -} - -static __always_inline unsigned long __vmcs_readl(unsigned long field) -{ - unsigned long value; - - asm volatile (__ex_clear("vmread %1, %0", "%k0") - : "=r"(value) : 
"r"(field)); - return value; -} - -static __always_inline u16 vmcs_read16(unsigned long field) -{ - vmcs_check16(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_read16(field); - return __vmcs_readl(field); -} - -static __always_inline u32 vmcs_read32(unsigned long field) -{ - vmcs_check32(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_read32(field); - return __vmcs_readl(field); -} - -static __always_inline u64 vmcs_read64(unsigned long field) -{ - vmcs_check64(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_read64(field); -#ifdef CONFIG_X86_64 - return __vmcs_readl(field); -#else - return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32); -#endif -} - -static __always_inline unsigned long vmcs_readl(unsigned long field) -{ - vmcs_checkl(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_read64(field); - return __vmcs_readl(field); -} - -static noinline void vmwrite_error(unsigned long field, unsigned long value) -{ - printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", - field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); - dump_stack(); -} - -static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) -{ - bool error; - - asm volatile (__ex("vmwrite %2, %1") CC_SET(na) - : CC_OUT(na) (error) : "r"(field), "rm"(value)); - if (unlikely(error)) - vmwrite_error(field, value); -} - -static __always_inline void vmcs_write16(unsigned long field, u16 value) -{ - vmcs_check16(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_write16(field, value); - - __vmcs_writel(field, value); -} - -static __always_inline void vmcs_write32(unsigned long field, u32 value) -{ - vmcs_check32(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_write32(field, value); - - __vmcs_writel(field, value); -} - -static __always_inline void vmcs_write64(unsigned long field, u64 value) -{ - vmcs_check64(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_write64(field, value); - - __vmcs_writel(field, value); -#ifndef CONFIG_X86_64 - asm volatile (""); - __vmcs_writel(field+1, value >> 32); -#endif -} - -static __always_inline void vmcs_writel(unsigned long field, unsigned long value) -{ - vmcs_checkl(field); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_write64(field, value); - - __vmcs_writel(field, value); -} - -static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, - "vmcs_clear_bits does not support 64-bit fields"); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_write32(field, evmcs_read32(field) & ~mask); - - __vmcs_writel(field, __vmcs_readl(field) & ~mask); -} - -static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) -{ - BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000, - "vmcs_set_bits does not support 64-bit fields"); - if (static_branch_unlikely(&enable_evmcs)) - return evmcs_write32(field, evmcs_read32(field) | mask); - - __vmcs_writel(field, __vmcs_readl(field) | mask); -} - -static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx) -{ - vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS); -} - -static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) -{ - vmcs_write32(VM_ENTRY_CONTROLS, val); - vmx->vm_entry_controls_shadow = val; -} - -static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val) -{ - if 
(vmx->vm_entry_controls_shadow != val) - vm_entry_controls_init(vmx, val); -} - -static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx) -{ - return vmx->vm_entry_controls_shadow; -} - - -static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val) -{ - vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val); -} - -static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) -{ - vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); -} - -static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx) -{ - vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS); -} - -static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) -{ - vmcs_write32(VM_EXIT_CONTROLS, val); - vmx->vm_exit_controls_shadow = val; -} - -static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val) -{ - if (vmx->vm_exit_controls_shadow != val) - vm_exit_controls_init(vmx, val); -} - -static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx) -{ - return vmx->vm_exit_controls_shadow; -} - - -static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val) -{ - vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val); -} - -static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val) -{ - vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val); -} - -static void vmx_segment_cache_clear(struct vcpu_vmx *vmx) -{ - vmx->segment_cache.bitmask = 0; -} - -static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, - unsigned field) -{ - bool ret; - u32 mask = 1 << (seg * SEG_FIELD_NR + field); - - if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) { - vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS); - vmx->segment_cache.bitmask = 0; - } - ret = vmx->segment_cache.bitmask & mask; - vmx->segment_cache.bitmask |= mask; - return ret; -} - -static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) -{ - u16 *p = &vmx->segment_cache.seg[seg].selector; - - if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) - *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); - return *p; -} - -static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) -{ - ulong *p = &vmx->segment_cache.seg[seg].base; - - if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) - *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); - return *p; -} - -static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) -{ - u32 *p = &vmx->segment_cache.seg[seg].limit; - - if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) - *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); - return *p; -} - -static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) -{ - u32 *p = &vmx->segment_cache.seg[seg].ar; - - if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) - *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); - return *p; -} - -static void update_exception_bitmap(struct kvm_vcpu *vcpu) -{ - u32 eb; - - eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | - (1u << DB_VECTOR) | (1u << AC_VECTOR); - /* - * Guest access to VMware backdoor ports could legitimately - * trigger #GP because of TSS I/O permission bitmap. - * We intercept those #GP and allow access to them anyway - * as VMware does. 
- */ - if (enable_vmware_backdoor) - eb |= (1u << GP_VECTOR); - if ((vcpu->guest_debug & - (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == - (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) - eb |= 1u << BP_VECTOR; - if (to_vmx(vcpu)->rmode.vm86_active) - eb = ~0; - if (enable_ept) - eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ - - /* When we are running a nested L2 guest and L1 specified for it a - * certain exception bitmap, we must trap the same exceptions and pass - * them to L1. When running L2, we will only handle the exceptions - * specified above if L1 did not want them. - */ - if (is_guest_mode(vcpu)) - eb |= get_vmcs12(vcpu)->exception_bitmap; - - vmcs_write32(EXCEPTION_BITMAP, eb); -} - -/* - * Check if MSR is intercepted for currently loaded MSR bitmap. - */ -static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) -{ - unsigned long *msr_bitmap; - int f = sizeof(unsigned long); - - if (!cpu_has_vmx_msr_bitmap()) - return true; - - msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap; - - if (msr <= 0x1fff) { - return !!test_bit(msr, msr_bitmap + 0x800 / f); - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - return !!test_bit(msr, msr_bitmap + 0xc00 / f); - } - - return true; -} - -/* - * Check if MSR is intercepted for L01 MSR bitmap. - */ -static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) -{ - unsigned long *msr_bitmap; - int f = sizeof(unsigned long); - - if (!cpu_has_vmx_msr_bitmap()) - return true; - - msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; - - if (msr <= 0x1fff) { - return !!test_bit(msr, msr_bitmap + 0x800 / f); - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - return !!test_bit(msr, msr_bitmap + 0xc00 / f); - } - - return true; -} - -static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, - unsigned long entry, unsigned long exit) -{ - vm_entry_controls_clearbit(vmx, entry); - vm_exit_controls_clearbit(vmx, exit); -} - -static int find_msr(struct vmx_msrs *m, unsigned int msr) -{ - unsigned int i; - - for (i = 0; i < m->nr; ++i) { - if (m->val[i].index == msr) - return i; - } - return -ENOENT; -} - -static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) -{ - int i; - struct msr_autoload *m = &vmx->msr_autoload; - - switch (msr) { - case MSR_EFER: - if (cpu_has_load_ia32_efer) { - clear_atomic_switch_msr_special(vmx, - VM_ENTRY_LOAD_IA32_EFER, - VM_EXIT_LOAD_IA32_EFER); - return; - } - break; - case MSR_CORE_PERF_GLOBAL_CTRL: - if (cpu_has_load_perf_global_ctrl) { - clear_atomic_switch_msr_special(vmx, - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); - return; - } - break; - } - i = find_msr(&m->guest, msr); - if (i < 0) - goto skip_guest; - --m->guest.nr; - m->guest.val[i] = m->guest.val[m->guest.nr]; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); - -skip_guest: - i = find_msr(&m->host, msr); - if (i < 0) - return; - - --m->host.nr; - m->host.val[i] = m->host.val[m->host.nr]; - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); -} - -static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, - unsigned long entry, unsigned long exit, - unsigned long guest_val_vmcs, unsigned long host_val_vmcs, - u64 guest_val, u64 host_val) -{ - vmcs_write64(guest_val_vmcs, guest_val); - if (host_val_vmcs != HOST_IA32_EFER) - vmcs_write64(host_val_vmcs, host_val); - vm_entry_controls_setbit(vmx, entry); - vm_exit_controls_setbit(vmx, exit); -} - -static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, - u64 
guest_val, u64 host_val, bool entry_only) -{ - int i, j = 0; - struct msr_autoload *m = &vmx->msr_autoload; - - switch (msr) { - case MSR_EFER: - if (cpu_has_load_ia32_efer) { - add_atomic_switch_msr_special(vmx, - VM_ENTRY_LOAD_IA32_EFER, - VM_EXIT_LOAD_IA32_EFER, - GUEST_IA32_EFER, - HOST_IA32_EFER, - guest_val, host_val); - return; - } - break; - case MSR_CORE_PERF_GLOBAL_CTRL: - if (cpu_has_load_perf_global_ctrl) { - add_atomic_switch_msr_special(vmx, - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, - GUEST_IA32_PERF_GLOBAL_CTRL, - HOST_IA32_PERF_GLOBAL_CTRL, - guest_val, host_val); - return; - } - break; - case MSR_IA32_PEBS_ENABLE: - /* PEBS needs a quiescent period after being disabled (to write - * a record). Disabling PEBS through VMX MSR swapping doesn't - * provide that period, so a CPU could write host's record into - * guest's memory. - */ - wrmsrl(MSR_IA32_PEBS_ENABLE, 0); - } - - i = find_msr(&m->guest, msr); - if (!entry_only) - j = find_msr(&m->host, msr); - - if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) { - printk_once(KERN_WARNING "Not enough msr switch entries. " - "Can't add msr %x\n", msr); - return; - } - if (i < 0) { - i = m->guest.nr++; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); - } - m->guest.val[i].index = msr; - m->guest.val[i].value = guest_val; - - if (entry_only) - return; - - if (j < 0) { - j = m->host.nr++; - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); - } - m->host.val[j].index = msr; - m->host.val[j].value = host_val; -} - -static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) -{ - u64 guest_efer = vmx->vcpu.arch.efer; - u64 ignore_bits = 0; - - if (!enable_ept) { - /* - * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing - * host CPUID is more efficient than testing guest CPUID - * or CR4. Host SMEP is anyway a requirement for guest SMEP. - */ - if (boot_cpu_has(X86_FEATURE_SMEP)) - guest_efer |= EFER_NX; - else if (!(guest_efer & EFER_NX)) - ignore_bits |= EFER_NX; - } - - /* - * LMA and LME handled by hardware; SCE meaningless outside long mode. - */ - ignore_bits |= EFER_SCE; -#ifdef CONFIG_X86_64 - ignore_bits |= EFER_LMA | EFER_LME; - /* SCE is meaningful only in long mode on Intel */ - if (guest_efer & EFER_LMA) - ignore_bits &= ~(u64)EFER_SCE; -#endif - - /* - * On EPT, we can't emulate NX, so we must switch EFER atomically. - * On CPUs that support "load IA32_EFER", always switch EFER - * atomically, since it's faster than switching it manually. - */ - if (cpu_has_load_ia32_efer || - (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { - if (!(guest_efer & EFER_LMA)) - guest_efer &= ~EFER_LME; - if (guest_efer != host_efer) - add_atomic_switch_msr(vmx, MSR_EFER, - guest_efer, host_efer, false); - else - clear_atomic_switch_msr(vmx, MSR_EFER); - return false; - } else { - clear_atomic_switch_msr(vmx, MSR_EFER); - - guest_efer &= ~ignore_bits; - guest_efer |= host_efer & ignore_bits; - - vmx->guest_msrs[efer_offset].data = guest_efer; - vmx->guest_msrs[efer_offset].mask = ~ignore_bits; - - return true; - } -} - -#ifdef CONFIG_X86_32 -/* - * On 32-bit kernels, VM exits still load the FS and GS bases from the - * VMCS rather than the segment table. KVM uses this helper to figure - * out the current bases to poke them into the VMCS before entry. 
- */ -static unsigned long segment_base(u16 selector) -{ - struct desc_struct *table; - unsigned long v; - - if (!(selector & ~SEGMENT_RPL_MASK)) - return 0; - - table = get_current_gdt_ro(); - - if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { - u16 ldt_selector = kvm_read_ldt(); - - if (!(ldt_selector & ~SEGMENT_RPL_MASK)) - return 0; - - table = (struct desc_struct *)segment_base(ldt_selector); - } - v = get_desc_base(&table[selector >> 3]); - return v; -} -#endif - -static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs_host_state *host_state; -#ifdef CONFIG_X86_64 - int cpu = raw_smp_processor_id(); -#endif - unsigned long fs_base, gs_base; - u16 fs_sel, gs_sel; - int i; - - vmx->req_immediate_exit = false; - - /* - * Note that guest MSRs to be saved/restored can also be changed - * when guest state is loaded. This happens when guest transitions - * to/from long-mode by setting MSR_EFER.LMA. - */ - if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) { - vmx->guest_msrs_dirty = false; - for (i = 0; i < vmx->save_nmsrs; ++i) - kvm_set_shared_msr(vmx->guest_msrs[i].index, - vmx->guest_msrs[i].data, - vmx->guest_msrs[i].mask); - - } - - if (vmx->loaded_cpu_state) - return; - - vmx->loaded_cpu_state = vmx->loaded_vmcs; - host_state = &vmx->loaded_cpu_state->host_state; - - /* - * Set host fs and gs selectors. Unfortunately, 22.2.3 does not - * allow segment selectors with cpl > 0 or ti == 1. - */ - host_state->ldt_sel = kvm_read_ldt(); - -#ifdef CONFIG_X86_64 - savesegment(ds, host_state->ds_sel); - savesegment(es, host_state->es_sel); - - gs_base = cpu_kernelmode_gs_base(cpu); - if (likely(is_64bit_mm(current->mm))) { - save_fsgs_for_kvm(); - fs_sel = current->thread.fsindex; - gs_sel = current->thread.gsindex; - fs_base = current->thread.fsbase; - vmx->msr_host_kernel_gs_base = current->thread.gsbase; - } else { - savesegment(fs, fs_sel); - savesegment(gs, gs_sel); - fs_base = read_msr(MSR_FS_BASE); - vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); - } - - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); -#else - savesegment(fs, fs_sel); - savesegment(gs, gs_sel); - fs_base = segment_base(fs_sel); - gs_base = segment_base(gs_sel); -#endif - - if (unlikely(fs_sel != host_state->fs_sel)) { - if (!(fs_sel & 7)) - vmcs_write16(HOST_FS_SELECTOR, fs_sel); - else - vmcs_write16(HOST_FS_SELECTOR, 0); - host_state->fs_sel = fs_sel; - } - if (unlikely(gs_sel != host_state->gs_sel)) { - if (!(gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, gs_sel); - else - vmcs_write16(HOST_GS_SELECTOR, 0); - host_state->gs_sel = gs_sel; - } - if (unlikely(fs_base != host_state->fs_base)) { - vmcs_writel(HOST_FS_BASE, fs_base); - host_state->fs_base = fs_base; - } - if (unlikely(gs_base != host_state->gs_base)) { - vmcs_writel(HOST_GS_BASE, gs_base); - host_state->gs_base = gs_base; - } -} - -static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) -{ - struct vmcs_host_state *host_state; - - if (!vmx->loaded_cpu_state) - return; - - WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs); - host_state = &vmx->loaded_cpu_state->host_state; - - ++vmx->vcpu.stat.host_state_reload; - vmx->loaded_cpu_state = NULL; - -#ifdef CONFIG_X86_64 - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); -#endif - if (host_state->ldt_sel || (host_state->gs_sel & 7)) { - kvm_load_ldt(host_state->ldt_sel); -#ifdef CONFIG_X86_64 - load_gs_index(host_state->gs_sel); -#else - loadsegment(gs, host_state->gs_sel); -#endif - } - if 
(host_state->fs_sel & 7) - loadsegment(fs, host_state->fs_sel); -#ifdef CONFIG_X86_64 - if (unlikely(host_state->ds_sel | host_state->es_sel)) { - loadsegment(ds, host_state->ds_sel); - loadsegment(es, host_state->es_sel); - } -#endif - invalidate_tss_limit(); -#ifdef CONFIG_X86_64 - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); -#endif - load_fixmap_gdt(raw_smp_processor_id()); -} - -#ifdef CONFIG_X86_64 -static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) -{ - preempt_disable(); - if (vmx->loaded_cpu_state) - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); - preempt_enable(); - return vmx->msr_guest_kernel_gs_base; -} - -static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) -{ - preempt_disable(); - if (vmx->loaded_cpu_state) - wrmsrl(MSR_KERNEL_GS_BASE, data); - preempt_enable(); - vmx->msr_guest_kernel_gs_base = data; -} -#endif - -static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - - /* - * In case of hot-plug or hot-unplug, we may have to undo - * vmx_vcpu_pi_put even if there is no assigned device. And we - * always keep PI.NDST up to date for simplicity: it makes the - * code easier, and CPU migration is not a fast path. - */ - if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) - return; - - /* - * First handle the simple case where no cmpxchg is necessary; just - * allow posting non-urgent interrupts. - * - * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change - * PI.NDST: pi_post_block will do it for us and the wakeup_handler - * expects the VCPU to be on the blocked_vcpu_list that matches - * PI.NDST. - */ - if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || - vcpu->cpu == cpu) { - pi_clear_sn(pi_desc); - return; - } - - /* The full case. */ - do { - old.control = new.control = pi_desc->control; - - dest = cpu_physical_id(cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - new.sn = 0; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); -} - -static void decache_tsc_multiplier(struct vcpu_vmx *vmx) -{ - vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; - vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); -} - -/* - * Switches to specified vcpu, until a matching vcpu_put(), but assumes - * vcpu mutex is already taken. - */ -static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - bool already_loaded = vmx->loaded_vmcs->cpu == cpu; - - if (!already_loaded) { - loaded_vmcs_clear(vmx->loaded_vmcs); - local_irq_disable(); - crash_disable_local_vmclear(cpu); - - /* - * Read loaded_vmcs->cpu should be before fetching - * loaded_vmcs->loaded_vmcss_on_cpu_link. - * See the comments in __loaded_vmcs_clear(). - */ - smp_rmb(); - - list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, - &per_cpu(loaded_vmcss_on_cpu, cpu)); - crash_enable_local_vmclear(cpu); - local_irq_enable(); - } - - if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { - per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; - vmcs_load(vmx->loaded_vmcs->vmcs); - indirect_branch_prediction_barrier(); - } - - if (!already_loaded) { - void *gdt = get_current_gdt_ro(); - unsigned long sysenter_esp; - - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - - /* - * Linux uses per-cpu TSS and GDT, so set these when switching - * processors. See 22.2.4. 
- */ - vmcs_writel(HOST_TR_BASE, - (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); - vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ - - /* - * VM exits change the host TR limit to 0x67 after a VM - * exit. This is okay, since 0x67 covers everything except - * the IO bitmap and have have code to handle the IO bitmap - * being lost after a VM exit. - */ - BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67); - - rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); - vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ - - vmx->loaded_vmcs->cpu = cpu; - } - - /* Setup TSC multiplier */ - if (kvm_has_tsc_control && - vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) - decache_tsc_multiplier(vmx); - - vmx_vcpu_pi_load(vcpu, cpu); - vmx->host_pkru = read_pkru(); - vmx->host_debugctlmsr = get_debugctlmsr(); -} - -static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return; - - /* Set SN when the vCPU is preempted */ - if (vcpu->preempted) - pi_set_sn(pi_desc); -} - -static void vmx_vcpu_put(struct kvm_vcpu *vcpu) -{ - vmx_vcpu_pi_put(vcpu); - - vmx_prepare_switch_to_host(to_vmx(vcpu)); -} - -static bool emulation_required(struct kvm_vcpu *vcpu) -{ - return emulate_invalid_guest_state && !guest_state_valid(vcpu); -} - -static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); - -/* - * Return the cr0 value that a nested guest would read. This is a combination - * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by - * its hypervisor (cr0_read_shadow). - */ -static inline unsigned long nested_read_cr0(struct vmcs12 *fields) -{ - return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) | - (fields->cr0_read_shadow & fields->cr0_guest_host_mask); -} -static inline unsigned long nested_read_cr4(struct vmcs12 *fields) -{ - return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) | - (fields->cr4_read_shadow & fields->cr4_guest_host_mask); -} - -static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) -{ - unsigned long rflags, save_rflags; - - if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { - __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); - rflags = vmcs_readl(GUEST_RFLAGS); - if (to_vmx(vcpu)->rmode.vm86_active) { - rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; - save_rflags = to_vmx(vcpu)->rmode.save_rflags; - rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; - } - to_vmx(vcpu)->rflags = rflags; - } - return to_vmx(vcpu)->rflags; -} - -static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) -{ - unsigned long old_rflags = vmx_get_rflags(vcpu); - - __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); - to_vmx(vcpu)->rflags = rflags; - if (to_vmx(vcpu)->rmode.vm86_active) { - to_vmx(vcpu)->rmode.save_rflags = rflags; - rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; - } - vmcs_writel(GUEST_RFLAGS, rflags); - - if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM) - to_vmx(vcpu)->emulation_required = emulation_required(vcpu); -} - -static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) -{ - u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - int ret = 0; - - if (interruptibility & GUEST_INTR_STATE_STI) - ret |= KVM_X86_SHADOW_INT_STI; - if (interruptibility & GUEST_INTR_STATE_MOV_SS) - ret |= KVM_X86_SHADOW_INT_MOV_SS; - - return ret; -} - -static void vmx_set_interrupt_shadow(struct kvm_vcpu 
*vcpu, int mask) -{ - u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - u32 interruptibility = interruptibility_old; - - interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); - - if (mask & KVM_X86_SHADOW_INT_MOV_SS) - interruptibility |= GUEST_INTR_STATE_MOV_SS; - else if (mask & KVM_X86_SHADOW_INT_STI) - interruptibility |= GUEST_INTR_STATE_STI; - - if ((interruptibility != interruptibility_old)) - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); -} - -static void skip_emulated_instruction(struct kvm_vcpu *vcpu) -{ - unsigned long rip; - - rip = kvm_rip_read(vcpu); - rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - kvm_rip_write(vcpu, rip); - - /* skipping an emulated instruction also counts */ - vmx_set_interrupt_shadow(vcpu, 0); -} - -static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, - unsigned long exit_qual) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - unsigned int nr = vcpu->arch.exception.nr; - u32 intr_info = nr | INTR_INFO_VALID_MASK; - - if (vcpu->arch.exception.has_error_code) { - vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; - intr_info |= INTR_INFO_DELIVER_CODE_MASK; - } - - if (kvm_exception_is_soft(nr)) - intr_info |= INTR_TYPE_SOFT_EXCEPTION; - else - intr_info |= INTR_TYPE_HARD_EXCEPTION; - - if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && - vmx_get_nmi_mask(vcpu)) - intr_info |= INTR_INFO_UNBLOCK_NMI; - - nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); -} - -/* - * KVM wants to inject page-faults which it got to the guest. This function - * checks whether in a nested guest, we need to inject them to L1 or L2. - */ -static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - unsigned int nr = vcpu->arch.exception.nr; - bool has_payload = vcpu->arch.exception.has_payload; - unsigned long payload = vcpu->arch.exception.payload; - - if (nr == PF_VECTOR) { - if (vcpu->arch.exception.nested_apf) { - *exit_qual = vcpu->arch.apf.nested_apf_token; - return 1; - } - if (nested_vmx_is_page_fault_vmexit(vmcs12, - vcpu->arch.exception.error_code)) { - *exit_qual = has_payload ? payload : vcpu->arch.cr2; - return 1; - } - } else if (vmcs12->exception_bitmap & (1u << nr)) { - if (nr == DB_VECTOR) { - if (!has_payload) { - payload = vcpu->arch.dr6; - payload &= ~(DR6_FIXED_1 | DR6_BT); - payload ^= DR6_RTM; - } - *exit_qual = payload; - } else - *exit_qual = 0; - return 1; - } - - return 0; -} - -static void vmx_clear_hlt(struct kvm_vcpu *vcpu) -{ - /* - * Ensure that we clear the HLT state in the VMCS. We don't need to - * explicitly skip the instruction because if the HLT state is set, - * then the instruction is already executing and RIP has already been - * advanced. 
- */ - if (kvm_hlt_in_guest(vcpu->kvm) && - vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) - vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); -} - -static void vmx_queue_exception(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned nr = vcpu->arch.exception.nr; - bool has_error_code = vcpu->arch.exception.has_error_code; - u32 error_code = vcpu->arch.exception.error_code; - u32 intr_info = nr | INTR_INFO_VALID_MASK; - - kvm_deliver_exception_payload(vcpu); - - if (has_error_code) { - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); - intr_info |= INTR_INFO_DELIVER_CODE_MASK; - } - - if (vmx->rmode.vm86_active) { - int inc_eip = 0; - if (kvm_exception_is_soft(nr)) - inc_eip = vcpu->arch.event_exit_inst_len; - if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - return; - } - - WARN_ON_ONCE(vmx->emulation_required); - - if (kvm_exception_is_soft(nr)) { - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmx->vcpu.arch.event_exit_inst_len); - intr_info |= INTR_TYPE_SOFT_EXCEPTION; - } else - intr_info |= INTR_TYPE_HARD_EXCEPTION; - - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); - - vmx_clear_hlt(vcpu); -} - -static bool vmx_rdtscp_supported(void) -{ - return cpu_has_vmx_rdtscp(); -} - -static bool vmx_invpcid_supported(void) -{ - return cpu_has_vmx_invpcid(); -} - -/* - * Swap MSR entry in host/guest MSR entry array. - */ -static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) -{ - struct shared_msr_entry tmp; - - tmp = vmx->guest_msrs[to]; - vmx->guest_msrs[to] = vmx->guest_msrs[from]; - vmx->guest_msrs[from] = tmp; -} - -/* - * Set up the vmcs to automatically save and restore system - * msrs. Don't touch the 64-bit msrs if the guest is in legacy - * mode, as fiddling with msrs is very expensive. - */ -static void setup_msrs(struct vcpu_vmx *vmx) -{ - int save_nmsrs, index; - - save_nmsrs = 0; -#ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) { - index = __find_msr_index(vmx, MSR_SYSCALL_MASK); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_LSTAR); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_CSTAR); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_TSC_AUX); - if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) - move_msr_up(vmx, index, save_nmsrs++); - /* - * MSR_STAR is only needed on long mode guests, and only - * if efer.sce is enabled. - */ - index = __find_msr_index(vmx, MSR_STAR); - if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) - move_msr_up(vmx, index, save_nmsrs++); - } -#endif - index = __find_msr_index(vmx, MSR_EFER); - if (index >= 0 && update_transition_efer(vmx, index)) - move_msr_up(vmx, index, save_nmsrs++); - - vmx->save_nmsrs = save_nmsrs; - vmx->guest_msrs_dirty = true; - - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(&vmx->vcpu); -} - -static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (is_guest_mode(vcpu) && - (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) - return vcpu->arch.tsc_offset - vmcs12->tsc_offset; - - return vcpu->arch.tsc_offset; -} - -static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) -{ - u64 active_offset = offset; - if (is_guest_mode(vcpu)) { - /* - * We're here if L1 chose not to trap WRMSR to TSC. 
According - * to the spec, this should set L1's TSC; The offset that L1 - * set for L2 remains unchanged, and still needs to be added - * to the newly set TSC to get L2's TSC. - */ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING)) - active_offset += vmcs12->tsc_offset; - } else { - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - vmcs_read64(TSC_OFFSET), offset); - } - - vmcs_write64(TSC_OFFSET, active_offset); - return active_offset; -} - -/* - * nested_vmx_allowed() checks whether a guest should be allowed to use VMX - * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for - * all guests if the "nested" module option is off, and can also be disabled - * for a single guest by disabling its VMX cpuid bit. - */ -static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) -{ - return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX); -} - -/* - * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be - * returned for the various VMX controls MSRs when nested VMX is enabled. - * The same values should also be used to verify that vmcs12 control fields are - * valid during nested entry from L1 to L2. - * Each of these control msrs has a low and high 32-bit half: A low bit is on - * if the corresponding bit in the (32-bit) control field *must* be on, and a - * bit in the high half is on if the corresponding bit in the control field - * may be on. See also vmx_control_verify(). - */ -static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) -{ - if (!nested) { - memset(msrs, 0, sizeof(*msrs)); - return; - } - - /* - * Note that as a general rule, the high half of the MSRs (bits in - * the control fields which may be 1) should be initialized by the - * intersection of the underlying hardware's MSR (i.e., features which - * can be supported) and the list of features we want to expose - - * because they are known to be properly supported in our code. - * Also, usually, the low half of the MSRs (bits which must be 1) can - * be set to 0, meaning that L1 may turn off any of these bits. The - * reason is that if one of these bits is necessary, it will appear - * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control - * fields of vmcs01 and vmcs02, will turn these bits off - and - * nested_vmx_exit_reflected() will not pass related exits to L1. - * These rules have exceptions below. - */ - - /* pin-based controls */ - rdmsr(MSR_IA32_VMX_PINBASED_CTLS, - msrs->pinbased_ctls_low, - msrs->pinbased_ctls_high); - msrs->pinbased_ctls_low |= - PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - msrs->pinbased_ctls_high &= - PIN_BASED_EXT_INTR_MASK | - PIN_BASED_NMI_EXITING | - PIN_BASED_VIRTUAL_NMIS | - (apicv ? PIN_BASED_POSTED_INTR : 0); - msrs->pinbased_ctls_high |= - PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | - PIN_BASED_VMX_PREEMPTION_TIMER; - - /* exit controls */ - rdmsr(MSR_IA32_VMX_EXIT_CTLS, - msrs->exit_ctls_low, - msrs->exit_ctls_high); - msrs->exit_ctls_low = - VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; - - msrs->exit_ctls_high &= -#ifdef CONFIG_X86_64 - VM_EXIT_HOST_ADDR_SPACE_SIZE | -#endif - VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; - msrs->exit_ctls_high |= - VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | - VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | - VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; - - /* We support free control of debug control saving. 
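Restating the low/high convention described at the top of nested_vmx_setup_ctls_msrs() as a worked check (a sketch only; the authoritative test is fixed_bits_valid()/vmx_control_verify() further down, and control_ok() here is an invented name):

	#include <stdbool.h>
	#include <stdint.h>

	/* low  = bits that must be 1 in the control field,
	 * high = bits that may  be 1 in the control field. */
	static bool control_ok(uint32_t ctl, uint32_t low, uint32_t high)
	{
		return (ctl & low) == low && (ctl & ~(high | low)) == 0;
	}

With low = 0x16 and high = 0xff, a requested value of 0x96 passes, 0x80 fails (a must-be-1 bit is clear) and 0x116 fails (a bit outside the may-be-1 mask is set).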
*/ - msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; - - /* entry controls */ - rdmsr(MSR_IA32_VMX_ENTRY_CTLS, - msrs->entry_ctls_low, - msrs->entry_ctls_high); - msrs->entry_ctls_low = - VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; - msrs->entry_ctls_high &= -#ifdef CONFIG_X86_64 - VM_ENTRY_IA32E_MODE | -#endif - VM_ENTRY_LOAD_IA32_PAT; - msrs->entry_ctls_high |= - (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); - - /* We support free control of debug control loading. */ - msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; - - /* cpu-based controls */ - rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, - msrs->procbased_ctls_low, - msrs->procbased_ctls_high); - msrs->procbased_ctls_low = - CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - msrs->procbased_ctls_high &= - CPU_BASED_VIRTUAL_INTR_PENDING | - CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | - CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | - CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING | -#ifdef CONFIG_X86_64 - CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | -#endif - CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | - CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | - CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | - CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | - CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; - /* - * We can allow some features even when not supported by the - * hardware. For example, L1 can specify an MSR bitmap - and we - * can use it to avoid exits to L1 - even when L0 runs L2 - * without MSR bitmaps. - */ - msrs->procbased_ctls_high |= - CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | - CPU_BASED_USE_MSR_BITMAPS; - - /* We support free control of CR3 access interception. */ - msrs->procbased_ctls_low &= - ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); - - /* - * secondary cpu-based controls. Do not include those that - * depend on CPUID bits, they are added later by vmx_cpuid_update. - */ - rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, - msrs->secondary_ctls_low, - msrs->secondary_ctls_high); - msrs->secondary_ctls_low = 0; - msrs->secondary_ctls_high &= - SECONDARY_EXEC_DESC | - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_WBINVD_EXITING; - - /* - * We can emulate "VMCS shadowing," even if the hardware - * doesn't support it. - */ - msrs->secondary_ctls_high |= - SECONDARY_EXEC_SHADOW_VMCS; - - if (enable_ept) { - /* nested EPT: emulate EPT also to L1 */ - msrs->secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_EPT; - msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT | - VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; - if (cpu_has_vmx_ept_execute_only()) - msrs->ept_caps |= - VMX_EPT_EXECUTE_ONLY_BIT; - msrs->ept_caps &= vmx_capability.ept; - msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | - VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | - VMX_EPT_1GB_PAGE_BIT; - if (enable_ept_ad_bits) { - msrs->secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_PML; - msrs->ept_caps |= VMX_EPT_AD_BIT; - } - } - - if (cpu_has_vmx_vmfunc()) { - msrs->secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_VMFUNC; - /* - * Advertise EPTP switching unconditionally - * since we emulate it - */ - if (enable_ept) - msrs->vmfunc_controls = - VMX_VMFUNC_EPTP_SWITCHING; - } - - /* - * Old versions of KVM use the single-context version without - * checking for support, so declare that it is supported even - * though it is treated as global context. 
The alternative is - * not failing the single-context invvpid, and it is worse. - */ - if (enable_vpid) { - msrs->secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_VPID; - msrs->vpid_caps = VMX_VPID_INVVPID_BIT | - VMX_VPID_EXTENT_SUPPORTED_MASK; - } - - if (enable_unrestricted_guest) - msrs->secondary_ctls_high |= - SECONDARY_EXEC_UNRESTRICTED_GUEST; - - if (flexpriority_enabled) - msrs->secondary_ctls_high |= - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - - /* miscellaneous data */ - rdmsr(MSR_IA32_VMX_MISC, - msrs->misc_low, - msrs->misc_high); - msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; - msrs->misc_low |= - MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | - VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | - VMX_MISC_ACTIVITY_HLT; - msrs->misc_high = 0; - - /* - * This MSR reports some information about VMX support. We - * should return information about the VMX we emulate for the - * guest, and the VMCS structure we give it - not about the - * VMX support of the underlying hardware. - */ - msrs->basic = - VMCS12_REVISION | - VMX_BASIC_TRUE_CTLS | - ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | - (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); - - if (cpu_has_vmx_basic_inout()) - msrs->basic |= VMX_BASIC_INOUT; - - /* - * These MSRs specify bits which the guest must keep fixed on - * while L1 is in VMXON mode (in L1's root mode, or running an L2). - * We picked the standard core2 setting. - */ -#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) -#define VMXON_CR4_ALWAYSON X86_CR4_VMXE - msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; - msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; - - /* These MSRs specify bits which the guest must keep fixed off. */ - rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); - rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); - - /* highest index: VMX_PREEMPTION_TIMER_VALUE */ - msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; -} - -/* - * if fixed0[i] == 1: val[i] must be 1 - * if fixed1[i] == 0: val[i] must be 0 - */ -static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1) -{ - return ((val & fixed1) | fixed0) == val; -} - -static inline bool vmx_control_verify(u32 control, u32 low, u32 high) -{ - return fixed_bits_valid(control, low, high); -} - -static inline u64 vmx_control_msr(u32 low, u32 high) -{ - return low | ((u64)high << 32); -} - -static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) -{ - superset &= mask; - subset &= mask; - - return (superset | subset) == superset; -} - -static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) -{ - const u64 feature_and_reserved = - /* feature (except bit 48; see below) */ - BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | - /* reserved */ - BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); - u64 vmx_basic = vmx->nested.msrs.basic; - - if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) - return -EINVAL; - - /* - * KVM does not emulate a version of VMX that constrains physical - * addresses of VMX structures (e.g. VMCS) to 32-bits. 
- */ - if (data & BIT_ULL(48)) - return -EINVAL; - - if (vmx_basic_vmcs_revision_id(vmx_basic) != - vmx_basic_vmcs_revision_id(data)) - return -EINVAL; - - if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) - return -EINVAL; - - vmx->nested.msrs.basic = data; - return 0; -} - -static int -vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) -{ - u64 supported; - u32 *lowp, *highp; - - switch (msr_index) { - case MSR_IA32_VMX_TRUE_PINBASED_CTLS: - lowp = &vmx->nested.msrs.pinbased_ctls_low; - highp = &vmx->nested.msrs.pinbased_ctls_high; - break; - case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - lowp = &vmx->nested.msrs.procbased_ctls_low; - highp = &vmx->nested.msrs.procbased_ctls_high; - break; - case MSR_IA32_VMX_TRUE_EXIT_CTLS: - lowp = &vmx->nested.msrs.exit_ctls_low; - highp = &vmx->nested.msrs.exit_ctls_high; - break; - case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - lowp = &vmx->nested.msrs.entry_ctls_low; - highp = &vmx->nested.msrs.entry_ctls_high; - break; - case MSR_IA32_VMX_PROCBASED_CTLS2: - lowp = &vmx->nested.msrs.secondary_ctls_low; - highp = &vmx->nested.msrs.secondary_ctls_high; - break; - default: - BUG(); - } - - supported = vmx_control_msr(*lowp, *highp); - - /* Check must-be-1 bits are still 1. */ - if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) - return -EINVAL; - - /* Check must-be-0 bits are still 0. */ - if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) - return -EINVAL; - - *lowp = data; - *highp = data >> 32; - return 0; -} - -static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) -{ - const u64 feature_and_reserved_bits = - /* feature */ - BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | - BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | - /* reserved */ - GENMASK_ULL(13, 9) | BIT_ULL(31); - u64 vmx_misc; - - vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, - vmx->nested.msrs.misc_high); - - if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) - return -EINVAL; - - if ((vmx->nested.msrs.pinbased_ctls_high & - PIN_BASED_VMX_PREEMPTION_TIMER) && - vmx_misc_preemption_timer_rate(data) != - vmx_misc_preemption_timer_rate(vmx_misc)) - return -EINVAL; - - if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) - return -EINVAL; - - if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) - return -EINVAL; - - if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) - return -EINVAL; - - vmx->nested.msrs.misc_low = data; - vmx->nested.msrs.misc_high = data >> 32; - - /* - * If L1 has read-only VM-exit information fields, use the - * less permissive vmx_vmwrite_bitmap to specify write - * permissions for the shadow VMCS. - */ - if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu)) - vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); - - return 0; -} - -static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) -{ - u64 vmx_ept_vpid_cap; - - vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps, - vmx->nested.msrs.vpid_caps); - - /* Every bit is either reserved or a feature bit. 
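The test that follows, like the control-MSR restores above, leans on is_bitwise_subset(): userspace may clear advertised capability bits but never invent new ones. A small self-contained example with made-up values:

	#include <stdbool.h>
	#include <stdint.h>

	static bool is_bitwise_subset(uint64_t superset, uint64_t subset, uint64_t mask)
	{
		superset &= mask;
		subset &= mask;
		return (superset | subset) == superset;
	}

	int main(void)
	{
		uint64_t kvm_caps = 0x0f07;	/* what KVM advertises (invented) */

		bool drop = is_bitwise_subset(kvm_caps, 0x0f01, ~0ULL);	/* true: only narrows */
		bool add  = is_bitwise_subset(kvm_caps, 0x1f07, ~0ULL);	/* false: adds bit 12 */

		return (drop && !add) ? 0 : 1;
	}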
*/ - if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) - return -EINVAL; - - vmx->nested.msrs.ept_caps = data; - vmx->nested.msrs.vpid_caps = data >> 32; - return 0; -} - -static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) -{ - u64 *msr; - - switch (msr_index) { - case MSR_IA32_VMX_CR0_FIXED0: - msr = &vmx->nested.msrs.cr0_fixed0; - break; - case MSR_IA32_VMX_CR4_FIXED0: - msr = &vmx->nested.msrs.cr4_fixed0; - break; - default: - BUG(); - } - - /* - * 1 bits (which indicates bits which "must-be-1" during VMX operation) - * must be 1 in the restored value. - */ - if (!is_bitwise_subset(data, *msr, -1ULL)) - return -EINVAL; - - *msr = data; - return 0; -} - -/* - * Called when userspace is restoring VMX MSRs. - * - * Returns 0 on success, non-0 otherwise. - */ -static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * Don't allow changes to the VMX capability MSRs while the vCPU - * is in VMX operation. - */ - if (vmx->nested.vmxon) - return -EBUSY; - - switch (msr_index) { - case MSR_IA32_VMX_BASIC: - return vmx_restore_vmx_basic(vmx, data); - case MSR_IA32_VMX_PINBASED_CTLS: - case MSR_IA32_VMX_PROCBASED_CTLS: - case MSR_IA32_VMX_EXIT_CTLS: - case MSR_IA32_VMX_ENTRY_CTLS: - /* - * The "non-true" VMX capability MSRs are generated from the - * "true" MSRs, so we do not support restoring them directly. - * - * If userspace wants to emulate VMX_BASIC[55]=0, userspace - * should restore the "true" MSRs with the must-be-1 bits - * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND - * DEFAULT SETTINGS". - */ - return -EINVAL; - case MSR_IA32_VMX_TRUE_PINBASED_CTLS: - case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - case MSR_IA32_VMX_TRUE_EXIT_CTLS: - case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - case MSR_IA32_VMX_PROCBASED_CTLS2: - return vmx_restore_control_msr(vmx, msr_index, data); - case MSR_IA32_VMX_MISC: - return vmx_restore_vmx_misc(vmx, data); - case MSR_IA32_VMX_CR0_FIXED0: - case MSR_IA32_VMX_CR4_FIXED0: - return vmx_restore_fixed0_msr(vmx, msr_index, data); - case MSR_IA32_VMX_CR0_FIXED1: - case MSR_IA32_VMX_CR4_FIXED1: - /* - * These MSRs are generated based on the vCPU's CPUID, so we - * do not support restoring them directly. - */ - return -EINVAL; - case MSR_IA32_VMX_EPT_VPID_CAP: - return vmx_restore_vmx_ept_vpid_cap(vmx, data); - case MSR_IA32_VMX_VMCS_ENUM: - vmx->nested.msrs.vmcs_enum = data; - return 0; - default: - /* - * The rest of the VMX capability MSRs do not support restore. - */ - return -EINVAL; - } -} - -/* Returns 0 on success, non-0 otherwise. 
*/ -static int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) -{ - switch (msr_index) { - case MSR_IA32_VMX_BASIC: - *pdata = msrs->basic; - break; - case MSR_IA32_VMX_TRUE_PINBASED_CTLS: - case MSR_IA32_VMX_PINBASED_CTLS: - *pdata = vmx_control_msr( - msrs->pinbased_ctls_low, - msrs->pinbased_ctls_high); - if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) - *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - break; - case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - case MSR_IA32_VMX_PROCBASED_CTLS: - *pdata = vmx_control_msr( - msrs->procbased_ctls_low, - msrs->procbased_ctls_high); - if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) - *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; - break; - case MSR_IA32_VMX_TRUE_EXIT_CTLS: - case MSR_IA32_VMX_EXIT_CTLS: - *pdata = vmx_control_msr( - msrs->exit_ctls_low, - msrs->exit_ctls_high); - if (msr_index == MSR_IA32_VMX_EXIT_CTLS) - *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; - break; - case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - case MSR_IA32_VMX_ENTRY_CTLS: - *pdata = vmx_control_msr( - msrs->entry_ctls_low, - msrs->entry_ctls_high); - if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) - *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; - break; - case MSR_IA32_VMX_MISC: - *pdata = vmx_control_msr( - msrs->misc_low, - msrs->misc_high); - break; - case MSR_IA32_VMX_CR0_FIXED0: - *pdata = msrs->cr0_fixed0; - break; - case MSR_IA32_VMX_CR0_FIXED1: - *pdata = msrs->cr0_fixed1; - break; - case MSR_IA32_VMX_CR4_FIXED0: - *pdata = msrs->cr4_fixed0; - break; - case MSR_IA32_VMX_CR4_FIXED1: - *pdata = msrs->cr4_fixed1; - break; - case MSR_IA32_VMX_VMCS_ENUM: - *pdata = msrs->vmcs_enum; - break; - case MSR_IA32_VMX_PROCBASED_CTLS2: - *pdata = vmx_control_msr( - msrs->secondary_ctls_low, - msrs->secondary_ctls_high); - break; - case MSR_IA32_VMX_EPT_VPID_CAP: - *pdata = msrs->ept_caps | - ((u64)msrs->vpid_caps << 32); - break; - case MSR_IA32_VMX_VMFUNC: - *pdata = msrs->vmfunc_controls; - break; - default: - return 1; - } - - return 0; -} - -static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, - uint64_t val) -{ - uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits; - - return !(val & ~valid_bits); -} - -static int vmx_get_msr_feature(struct kvm_msr_entry *msr) -{ - switch (msr->index) { - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: - if (!nested) - return 1; - return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); - default: - return 1; - } - - return 0; -} - -/* - * Reads an msr value (of 'msr_index') into 'pdata'. - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. 
- */ -static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr; - - switch (msr_info->index) { -#ifdef CONFIG_X86_64 - case MSR_FS_BASE: - msr_info->data = vmcs_readl(GUEST_FS_BASE); - break; - case MSR_GS_BASE: - msr_info->data = vmcs_readl(GUEST_GS_BASE); - break; - case MSR_KERNEL_GS_BASE: - msr_info->data = vmx_read_guest_kernel_gs_base(vmx); - break; -#endif - case MSR_EFER: - return kvm_get_msr_common(vcpu, msr_info); - case MSR_IA32_SPEC_CTRL: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) - return 1; - - msr_info->data = to_vmx(vcpu)->spec_ctrl; - break; - case MSR_IA32_ARCH_CAPABILITIES: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) - return 1; - msr_info->data = to_vmx(vcpu)->arch_capabilities; - break; - case MSR_IA32_SYSENTER_CS: - msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); - break; - case MSR_IA32_SYSENTER_EIP: - msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); - break; - case MSR_IA32_SYSENTER_ESP: - msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); - break; - case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported() || - (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) - return 1; - msr_info->data = vmcs_read64(GUEST_BNDCFGS); - break; - case MSR_IA32_MCG_EXT_CTL: - if (!msr_info->host_initiated && - !(vmx->msr_ia32_feature_control & - FEATURE_CONTROL_LMCE)) - return 1; - msr_info->data = vcpu->arch.mcg_ext_ctl; - break; - case MSR_IA32_FEATURE_CONTROL: - msr_info->data = vmx->msr_ia32_feature_control; - break; - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: - if (!nested_vmx_allowed(vcpu)) - return 1; - return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, - &msr_info->data); - case MSR_IA32_XSS: - if (!vmx_xsaves_supported()) - return 1; - msr_info->data = vcpu->arch.ia32_xss; - break; - case MSR_TSC_AUX: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) - return 1; - /* Otherwise falls through */ - default: - msr = find_msr_entry(vmx, msr_info->index); - if (msr) { - msr_info->data = msr->data; - break; - } - return kvm_get_msr_common(vcpu, msr_info); - } - - return 0; -} - -static void vmx_leave_nested(struct kvm_vcpu *vcpu); - -/* - * Writes msr value into into the appropriate "register". - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. 
- */ -static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr; - int ret = 0; - u32 msr_index = msr_info->index; - u64 data = msr_info->data; - - switch (msr_index) { - case MSR_EFER: - ret = kvm_set_msr_common(vcpu, msr_info); - break; -#ifdef CONFIG_X86_64 - case MSR_FS_BASE: - vmx_segment_cache_clear(vmx); - vmcs_writel(GUEST_FS_BASE, data); - break; - case MSR_GS_BASE: - vmx_segment_cache_clear(vmx); - vmcs_writel(GUEST_GS_BASE, data); - break; - case MSR_KERNEL_GS_BASE: - vmx_write_guest_kernel_gs_base(vmx, data); - break; -#endif - case MSR_IA32_SYSENTER_CS: - vmcs_write32(GUEST_SYSENTER_CS, data); - break; - case MSR_IA32_SYSENTER_EIP: - vmcs_writel(GUEST_SYSENTER_EIP, data); - break; - case MSR_IA32_SYSENTER_ESP: - vmcs_writel(GUEST_SYSENTER_ESP, data); - break; - case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported() || - (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) - return 1; - if (is_noncanonical_address(data & PAGE_MASK, vcpu) || - (data & MSR_IA32_BNDCFGS_RSVD)) - return 1; - vmcs_write64(GUEST_BNDCFGS, data); - break; - case MSR_IA32_SPEC_CTRL: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) - return 1; - - /* The STIBP bit doesn't fault even if it's not advertised */ - if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) - return 1; - - vmx->spec_ctrl = data; - - if (!data) - break; - - /* - * For non-nested: - * When it's written (to non-zero) for the first time, pass - * it through. - * - * For nested: - * The handling of the MSR bitmap for L2 guests is done in - * nested_vmx_merge_msr_bitmap. We should not touch the - * vmcs02.msr_bitmap here since it gets completely overwritten - * in the merging. We update the vmcs01 here for L1 as well - * since it will end up touching the MSR anyway now. - */ - vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, - MSR_IA32_SPEC_CTRL, - MSR_TYPE_RW); - break; - case MSR_IA32_PRED_CMD: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) - return 1; - - if (data & ~PRED_CMD_IBPB) - return 1; - - if (!data) - break; - - wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); - - /* - * For non-nested: - * When it's written (to non-zero) for the first time, pass - * it through. - * - * For nested: - * The handling of the MSR bitmap for L2 guests is done in - * nested_vmx_merge_msr_bitmap. We should not touch the - * vmcs02.msr_bitmap here since it gets completely overwritten - * in the merging. 
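vmx_disable_intercept_for_msr() below clears one bit in the VMX MSR bitmap so that guest writes to the MSR stop causing exits. As a rough sketch of the bitmap layout (per the SDM: a 4 KiB page with read intercepts for MSRs 0x0-0x1fff at offset 0x000 and for 0xc0000000-0xc0001fff at 0x400, then the matching write-intercept halves at 0x800 and 0xc00; the helper name is illustrative):

	#include <stdint.h>

	/* Allow guest WRMSR to 'msr' without a VM exit. 'bitmap' points at
	 * the 4 KiB MSR bitmap page referenced by the VMCS. */
	static void msr_bitmap_allow_write(uint8_t *bitmap, uint32_t msr)
	{
		uint8_t *p;

		if (msr <= 0x1fff) {
			p = bitmap + 0x800;		/* write bitmap, low range  */
		} else if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
			p = bitmap + 0xc00;		/* write bitmap, high range */
			msr &= 0x1fff;
		} else {
			return;				/* everything else always exits */
		}
		p[msr / 8] &= (uint8_t)~(1u << (msr % 8));
	}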
- */ - vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, - MSR_TYPE_W); - break; - case MSR_IA32_ARCH_CAPABILITIES: - if (!msr_info->host_initiated) - return 1; - vmx->arch_capabilities = data; - break; - case MSR_IA32_CR_PAT: - if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) - return 1; - vmcs_write64(GUEST_IA32_PAT, data); - vcpu->arch.pat = data; - break; - } - ret = kvm_set_msr_common(vcpu, msr_info); - break; - case MSR_IA32_TSC_ADJUST: - ret = kvm_set_msr_common(vcpu, msr_info); - break; - case MSR_IA32_MCG_EXT_CTL: - if ((!msr_info->host_initiated && - !(to_vmx(vcpu)->msr_ia32_feature_control & - FEATURE_CONTROL_LMCE)) || - (data & ~MCG_EXT_CTL_LMCE_EN)) - return 1; - vcpu->arch.mcg_ext_ctl = data; - break; - case MSR_IA32_FEATURE_CONTROL: - if (!vmx_feature_control_msr_valid(vcpu, data) || - (to_vmx(vcpu)->msr_ia32_feature_control & - FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) - return 1; - vmx->msr_ia32_feature_control = data; - if (msr_info->host_initiated && data == 0) - vmx_leave_nested(vcpu); - break; - case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: - if (!msr_info->host_initiated) - return 1; /* they are read-only */ - if (!nested_vmx_allowed(vcpu)) - return 1; - return vmx_set_vmx_msr(vcpu, msr_index, data); - case MSR_IA32_XSS: - if (!vmx_xsaves_supported()) - return 1; - /* - * The only supported bit as of Skylake is bit 8, but - * it is not supported on KVM. - */ - if (data != 0) - return 1; - vcpu->arch.ia32_xss = data; - if (vcpu->arch.ia32_xss != host_xss) - add_atomic_switch_msr(vmx, MSR_IA32_XSS, - vcpu->arch.ia32_xss, host_xss, false); - else - clear_atomic_switch_msr(vmx, MSR_IA32_XSS); - break; - case MSR_TSC_AUX: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) - return 1; - /* Check reserved bit, higher 32 bits should be zero */ - if ((data >> 32) != 0) - return 1; - /* Otherwise falls through */ - default: - msr = find_msr_entry(vmx, msr_index); - if (msr) { - u64 old_msr_data = msr->data; - msr->data = data; - if (msr - vmx->guest_msrs < vmx->save_nmsrs) { - preempt_disable(); - ret = kvm_set_shared_msr(msr->index, msr->data, - msr->mask); - preempt_enable(); - if (ret) - msr->data = old_msr_data; - } - break; - } - ret = kvm_set_msr_common(vcpu, msr_info); - } - - return ret; -} - -static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) -{ - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); - switch (reg) { - case VCPU_REGS_RSP: - vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); - break; - case VCPU_REGS_RIP: - vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); - break; - case VCPU_EXREG_PDPTR: - if (enable_ept) - ept_save_pdptrs(vcpu); - break; - default: - break; - } -} - -static __init int cpu_has_kvm_support(void) -{ - return cpu_has_vmx(); -} - -static __init int vmx_disabled_by_bios(void) -{ - u64 msr; - - rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); - if (msr & FEATURE_CONTROL_LOCKED) { - /* launched w/ TXT and VMX disabled */ - if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) - && tboot_enabled()) - return 1; - /* launched w/o TXT and VMX only enabled w/ TXT */ - if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) - && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) - && !tboot_enabled()) { - printk(KERN_WARNING "kvm: disable TXT in the BIOS or " - "activate TXT before enabling KVM\n"); - return 1; - } - /* launched w/o TXT and VMX disabled */ - if (!(msr & 
FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) - && !tboot_enabled()) - return 1; - } - - return 0; -} - -static void kvm_cpu_vmxon(u64 addr) -{ - cr4_set_bits(X86_CR4_VMXE); - intel_pt_handle_vmx(1); - - asm volatile ("vmxon %0" : : "m"(addr)); -} - -static int hardware_enable(void) -{ - int cpu = raw_smp_processor_id(); - u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - u64 old, test_bits; - - if (cr4_read_shadow() & X86_CR4_VMXE) - return -EBUSY; - - /* - * This can happen if we hot-added a CPU but failed to allocate - * VP assist page for it. - */ - if (static_branch_unlikely(&enable_evmcs) && - !hv_get_vp_assist_page(cpu)) - return -EFAULT; - - INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); - INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); - spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - - /* - * Now we can enable the vmclear operation in kdump - * since the loaded_vmcss_on_cpu list on this cpu - * has been initialized. - * - * Though the cpu is not in VMX operation now, there - * is no problem to enable the vmclear operation - * for the loaded_vmcss_on_cpu list is empty! - */ - crash_enable_local_vmclear(cpu); - - rdmsrl(MSR_IA32_FEATURE_CONTROL, old); - - test_bits = FEATURE_CONTROL_LOCKED; - test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - if (tboot_enabled()) - test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; - - if ((old & test_bits) != test_bits) { - /* enable and lock */ - wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); - } - kvm_cpu_vmxon(phys_addr); - if (enable_ept) - ept_sync_global(); - - return 0; -} - -static void vmclear_local_loaded_vmcss(void) -{ - int cpu = raw_smp_processor_id(); - struct loaded_vmcs *v, *n; - - list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), - loaded_vmcss_on_cpu_link) - __loaded_vmcs_clear(v); -} - - -/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() - * tricks. - */ -static void kvm_cpu_vmxoff(void) -{ - asm volatile (__ex("vmxoff")); - - intel_pt_handle_vmx(0); - cr4_clear_bits(X86_CR4_VMXE); -} - -static void hardware_disable(void) -{ - vmclear_local_loaded_vmcss(); - kvm_cpu_vmxoff(); -} - -static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, - u32 msr, u32 *result) -{ - u32 vmx_msr_low, vmx_msr_high; - u32 ctl = ctl_min | ctl_opt; - - rdmsr(msr, vmx_msr_low, vmx_msr_high); - - ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ - ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ - - /* Ensure minimum (required) set of control bits are supported. 
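A small numeric illustration of what adjust_vmx_controls() has computed by this point (all values invented for the example):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t ctl_min  = 0x00000088;	/* controls KVM requires     */
		uint32_t ctl_opt  = 0x00200000;	/* controls KVM merely wants */
		uint32_t msr_low  = 0x00000016;	/* allowed-0: must-be-1 bits */
		uint32_t msr_high = 0x002000ff;	/* allowed-1: may-be-1 bits  */

		uint32_t ctl = ctl_min | ctl_opt;
		ctl &= msr_high;	/* drop whatever the CPU cannot set  */
		ctl |= msr_low;		/* force whatever must always be set */

		/* Only a lost *required* bit makes the setup fail. */
		printf("ctl=%#x %s\n", ctl,
		       (ctl_min & ~ctl) ? "-EIO" : "ok");
		return 0;
	}

Optional bits that the hardware cannot provide are silently dropped; the -EIO path that follows fires only when a bit from ctl_min is missing.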
*/ - if (ctl_min & ~ctl) - return -EIO; - - *result = ctl; - return 0; -} - -static __init bool allow_1_setting(u32 msr, u32 ctl) -{ - u32 vmx_msr_low, vmx_msr_high; - - rdmsr(msr, vmx_msr_low, vmx_msr_high); - return vmx_msr_high & ctl; -} - -static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) -{ - u32 vmx_msr_low, vmx_msr_high; - u32 min, opt, min2, opt2; - u32 _pin_based_exec_control = 0; - u32 _cpu_based_exec_control = 0; - u32 _cpu_based_2nd_exec_control = 0; - u32 _vmexit_control = 0; - u32 _vmentry_control = 0; - - memset(vmcs_conf, 0, sizeof(*vmcs_conf)); - min = CPU_BASED_HLT_EXITING | -#ifdef CONFIG_X86_64 - CPU_BASED_CR8_LOAD_EXITING | - CPU_BASED_CR8_STORE_EXITING | -#endif - CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_UNCOND_IO_EXITING | - CPU_BASED_MOV_DR_EXITING | - CPU_BASED_USE_TSC_OFFSETING | - CPU_BASED_MWAIT_EXITING | - CPU_BASED_MONITOR_EXITING | - CPU_BASED_INVLPG_EXITING | - CPU_BASED_RDPMC_EXITING; - - opt = CPU_BASED_TPR_SHADOW | - CPU_BASED_USE_MSR_BITMAPS | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, - &_cpu_based_exec_control) < 0) - return -EIO; -#ifdef CONFIG_X86_64 - if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) - _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & - ~CPU_BASED_CR8_STORE_EXITING; -#endif - if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { - min2 = 0; - opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | - SECONDARY_EXEC_WBINVD_EXITING | - SECONDARY_EXEC_ENABLE_VPID | - SECONDARY_EXEC_ENABLE_EPT | - SECONDARY_EXEC_UNRESTRICTED_GUEST | - SECONDARY_EXEC_PAUSE_LOOP_EXITING | - SECONDARY_EXEC_DESC | - SECONDARY_EXEC_RDTSCP | - SECONDARY_EXEC_ENABLE_INVPCID | - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_SHADOW_VMCS | - SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_RDSEED_EXITING | - SECONDARY_EXEC_RDRAND_EXITING | - SECONDARY_EXEC_ENABLE_PML | - SECONDARY_EXEC_TSC_SCALING | - SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_ENCLS_EXITING; - if (adjust_vmx_controls(min2, opt2, - MSR_IA32_VMX_PROCBASED_CTLS2, - &_cpu_based_2nd_exec_control) < 0) - return -EIO; - } -#ifndef CONFIG_X86_64 - if (!(_cpu_based_2nd_exec_control & - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) - _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; -#endif - - if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) - _cpu_based_2nd_exec_control &= ~( - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - - rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, - &vmx_capability.ept, &vmx_capability.vpid); - - if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { - /* CR3 accesses and invlpg don't need to cause VM Exits when EPT - enabled */ - _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_INVLPG_EXITING); - } else if (vmx_capability.ept) { - vmx_capability.ept = 0; - pr_warn_once("EPT CAP should not exist if not support " - "1-setting enable EPT VM-execution control\n"); - } - if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && - vmx_capability.vpid) { - vmx_capability.vpid = 0; - pr_warn_once("VPID CAP should not exist if not support " - "1-setting enable VPID VM-execution control\n"); - } - - min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; -#ifdef CONFIG_X86_64 - min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; -#endif - opt = 
VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | - VM_EXIT_CLEAR_BNDCFGS; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, - &_vmexit_control) < 0) - return -EIO; - - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | - PIN_BASED_VMX_PREEMPTION_TIMER; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, - &_pin_based_exec_control) < 0) - return -EIO; - - if (cpu_has_broken_vmx_preemption_timer()) - _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; - if (!(_cpu_based_2nd_exec_control & - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) - _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; - - min = VM_ENTRY_LOAD_DEBUG_CONTROLS; - opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, - &_vmentry_control) < 0) - return -EIO; - - rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); - - /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ - if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) - return -EIO; - -#ifdef CONFIG_X86_64 - /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ - if (vmx_msr_high & (1u<<16)) - return -EIO; -#endif - - /* Require Write-Back (WB) memory type for VMCS accesses. */ - if (((vmx_msr_high >> 18) & 15) != 6) - return -EIO; - - vmcs_conf->size = vmx_msr_high & 0x1fff; - vmcs_conf->order = get_order(vmcs_conf->size); - vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; - - vmcs_conf->revision_id = vmx_msr_low; - - vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; - vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; - vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; - vmcs_conf->vmexit_ctrl = _vmexit_control; - vmcs_conf->vmentry_ctrl = _vmentry_control; - - if (static_branch_unlikely(&enable_evmcs)) - evmcs_sanitize_exec_ctrls(vmcs_conf); - - cpu_has_load_ia32_efer = - allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, - VM_ENTRY_LOAD_IA32_EFER) - && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, - VM_EXIT_LOAD_IA32_EFER); - - cpu_has_load_perf_global_ctrl = - allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) - && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); - - /* - * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL - * but due to errata below it can't be used. Workaround is to use - * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL. - * - * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32] - * - * AAK155 (model 26) - * AAP115 (model 30) - * AAT100 (model 37) - * BC86,AAY89,BD102 (model 44) - * BA97 (model 46) - * - */ - if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) { - switch (boot_cpu_data.x86_model) { - case 26: - case 30: - case 37: - case 44: - case 46: - cpu_has_load_perf_global_ctrl = false; - printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " - "does not work properly. 
Using workaround\n"); - break; - default: - break; - } - } - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - - return 0; -} - -static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu) -{ - int node = cpu_to_node(cpu); - struct page *pages; - struct vmcs *vmcs; - - pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); - if (!pages) - return NULL; - vmcs = page_address(pages); - memset(vmcs, 0, vmcs_config.size); - - /* KVM supports Enlightened VMCS v1 only */ - if (static_branch_unlikely(&enable_evmcs)) - vmcs->hdr.revision_id = KVM_EVMCS_VERSION; - else - vmcs->hdr.revision_id = vmcs_config.revision_id; - - if (shadow) - vmcs->hdr.shadow_vmcs = 1; - return vmcs; -} - -static void free_vmcs(struct vmcs *vmcs) -{ - free_pages((unsigned long)vmcs, vmcs_config.order); -} - -/* - * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded - */ -static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) -{ - if (!loaded_vmcs->vmcs) - return; - loaded_vmcs_clear(loaded_vmcs); - free_vmcs(loaded_vmcs->vmcs); - loaded_vmcs->vmcs = NULL; - if (loaded_vmcs->msr_bitmap) - free_page((unsigned long)loaded_vmcs->msr_bitmap); - WARN_ON(loaded_vmcs->shadow_vmcs != NULL); -} - -static struct vmcs *alloc_vmcs(bool shadow) -{ - return alloc_vmcs_cpu(shadow, raw_smp_processor_id()); -} - -static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) -{ - loaded_vmcs->vmcs = alloc_vmcs(false); - if (!loaded_vmcs->vmcs) - return -ENOMEM; - - loaded_vmcs->shadow_vmcs = NULL; - loaded_vmcs_init(loaded_vmcs); - - if (cpu_has_vmx_msr_bitmap()) { - loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!loaded_vmcs->msr_bitmap) - goto out_vmcs; - memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); - - if (IS_ENABLED(CONFIG_HYPERV) && - static_branch_unlikely(&enable_evmcs) && - (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { - struct hv_enlightened_vmcs *evmcs = - (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs; - - evmcs->hv_enlightenments_control.msr_bitmap = 1; - } - } - - memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); - - return 0; - -out_vmcs: - free_loaded_vmcs(loaded_vmcs); - return -ENOMEM; -} - -static void free_kvm_area(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - free_vmcs(per_cpu(vmxarea, cpu)); - per_cpu(vmxarea, cpu) = NULL; - } -} - -enum vmcs_field_width { - VMCS_FIELD_WIDTH_U16 = 0, - VMCS_FIELD_WIDTH_U64 = 1, - VMCS_FIELD_WIDTH_U32 = 2, - VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3 -}; - -static inline int vmcs_field_width(unsigned long field) -{ - if (0x1 & field) /* the *_HIGH fields are all 32 bit */ - return VMCS_FIELD_WIDTH_U32; - return (field >> 13) & 0x3 ; -} - -static inline int vmcs_field_readonly(unsigned long field) -{ - return (((field >> 10) & 0x3) == 1); -} - -static void init_vmcs_shadow_fields(void) -{ - int i, j; - - for (i = j = 0; i < max_shadow_read_only_fields; i++) { - u16 field = shadow_read_only_fields[i]; - if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && - (i + 1 == max_shadow_read_only_fields || - shadow_read_only_fields[i + 1] != field + 1)) - pr_err("Missing field from shadow_read_only_field %x\n", - field + 1); - - clear_bit(field, vmx_vmread_bitmap); -#ifdef CONFIG_X86_64 - if (field & 1) - continue; -#endif - if (j < i) - shadow_read_only_fields[j] = field; - j++; - } - max_shadow_read_only_fields = j; - - for (i = j = 0; i < max_shadow_read_write_fields; i++) { - u16 field = shadow_read_write_fields[i]; - if (vmcs_field_width(field) == 
VMCS_FIELD_WIDTH_U64 && - (i + 1 == max_shadow_read_write_fields || - shadow_read_write_fields[i + 1] != field + 1)) - pr_err("Missing field from shadow_read_write_field %x\n", - field + 1); - - /* - * PML and the preemption timer can be emulated, but the - * processor cannot vmwrite to fields that don't exist - * on bare metal. - */ - switch (field) { - case GUEST_PML_INDEX: - if (!cpu_has_vmx_pml()) - continue; - break; - case VMX_PREEMPTION_TIMER_VALUE: - if (!cpu_has_vmx_preemption_timer()) - continue; - break; - case GUEST_INTR_STATUS: - if (!cpu_has_vmx_apicv()) - continue; - break; - default: - break; - } - - clear_bit(field, vmx_vmwrite_bitmap); - clear_bit(field, vmx_vmread_bitmap); -#ifdef CONFIG_X86_64 - if (field & 1) - continue; -#endif - if (j < i) - shadow_read_write_fields[j] = field; - j++; - } - max_shadow_read_write_fields = j; -} - -static __init int alloc_kvm_area(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct vmcs *vmcs; - - vmcs = alloc_vmcs_cpu(false, cpu); - if (!vmcs) { - free_kvm_area(); - return -ENOMEM; - } - - /* - * When eVMCS is enabled, alloc_vmcs_cpu() sets - * vmcs->revision_id to KVM_EVMCS_VERSION instead of - * revision_id reported by MSR_IA32_VMX_BASIC. - * - * However, even though not explictly documented by - * TLFS, VMXArea passed as VMXON argument should - * still be marked with revision_id reported by - * physical CPU. - */ - if (static_branch_unlikely(&enable_evmcs)) - vmcs->hdr.revision_id = vmcs_config.revision_id; - - per_cpu(vmxarea, cpu) = vmcs; - } - return 0; -} - -static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, - struct kvm_segment *save) -{ - if (!emulate_invalid_guest_state) { - /* - * CS and SS RPL should be equal during guest entry according - * to VMX spec, but in reality it is not always so. Since vcpu - * is in the middle of the transition from real mode to - * protected mode it is safe to assume that RPL 0 is a good - * default value. - */ - if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) - save->selector &= ~SEGMENT_RPL_MASK; - save->dpl = save->selector & SEGMENT_RPL_MASK; - save->s = 1; - } - vmx_set_segment(vcpu, save, seg); -} - -static void enter_pmode(struct kvm_vcpu *vcpu) -{ - unsigned long flags; - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * Update real mode segment cache. It may be not up-to-date if sement - * register was written while vcpu was in a guest mode. 
- */ - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); - - vmx->rmode.vm86_active = 0; - - vmx_segment_cache_clear(vmx); - - vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); - - flags = vmcs_readl(GUEST_RFLAGS); - flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; - flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; - vmcs_writel(GUEST_RFLAGS, flags); - - vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | - (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); - - update_exception_bitmap(vcpu); - - fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); - fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); - fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); - fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); - fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); - fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); -} - -static void fix_rmode_seg(int seg, struct kvm_segment *save) -{ - const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - struct kvm_segment var = *save; - - var.dpl = 0x3; - if (seg == VCPU_SREG_CS) - var.type = 0x3; - - if (!emulate_invalid_guest_state) { - var.selector = var.base >> 4; - var.base = var.base & 0xffff0; - var.limit = 0xffff; - var.g = 0; - var.db = 0; - var.present = 1; - var.s = 1; - var.l = 0; - var.unusable = 0; - var.type = 0x3; - var.avl = 0; - if (save->base & 0xf) - printk_once(KERN_WARNING "kvm: segment base is not " - "paragraph aligned when entering " - "protected mode (seg=%d)", seg); - } - - vmcs_write16(sf->selector, var.selector); - vmcs_writel(sf->base, var.base); - vmcs_write32(sf->limit, var.limit); - vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); -} - -static void enter_rmode(struct kvm_vcpu *vcpu) -{ - unsigned long flags; - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); - - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); - vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); - - vmx->rmode.vm86_active = 1; - - /* - * Very old userspace does not call KVM_SET_TSS_ADDR before entering - * vcpu. Warn the user that an update is overdue. 
- */ - if (!kvm_vmx->tss_addr) - printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " - "called before entering vcpu\n"); - - vmx_segment_cache_clear(vmx); - - vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); - vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); - vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); - - flags = vmcs_readl(GUEST_RFLAGS); - vmx->rmode.save_rflags = flags; - - flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; - - vmcs_writel(GUEST_RFLAGS, flags); - vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); - update_exception_bitmap(vcpu); - - fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); - fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); - fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); - fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); - fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); - fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); - - kvm_mmu_reset_context(vcpu); -} - -static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); - - if (!msr) - return; - - vcpu->arch.efer = efer; - if (efer & EFER_LMA) { - vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); - msr->data = efer; - } else { - vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); - - msr->data = efer & ~EFER_LME; - } - setup_msrs(vmx); -} - -#ifdef CONFIG_X86_64 - -static void enter_lmode(struct kvm_vcpu *vcpu) -{ - u32 guest_tr_ar; - - vmx_segment_cache_clear(to_vmx(vcpu)); - - guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); - if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { - pr_debug_ratelimited("%s: tss fixup for long mode. \n", - __func__); - vmcs_write32(GUEST_TR_AR_BYTES, - (guest_tr_ar & ~VMX_AR_TYPE_MASK) - | VMX_AR_TYPE_BUSY_64_TSS); - } - vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); -} - -static void exit_lmode(struct kvm_vcpu *vcpu) -{ - vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); - vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); -} - -#endif - -static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid, - bool invalidate_gpa) -{ - if (enable_ept && (invalidate_gpa || !enable_vpid)) { - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - return; - ept_sync_context(construct_eptp(vcpu, - vcpu->arch.mmu->root_hpa)); - } else { - vpid_sync_context(vpid); - } -} - -static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) -{ - __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); -} - -static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) -{ - int vpid = to_vmx(vcpu)->vpid; - - if (!vpid_sync_vcpu_addr(vpid, addr)) - vpid_sync_context(vpid); - - /* - * If VPIDs are not supported or enabled, then the above is a no-op. - * But we don't really need a TLB flush in that case anyway, because - * each VM entry/exit includes an implicit flush when VPID is 0. 
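/*
 * Illustrative sketch (userspace C, not part of this patch): the choice made
 * by __vmx_flush_tlb() above, reduced to a pure function.  The enum and the
 * function name are made up for the example; the conditions mirror the code.
 */
#include <stdbool.h>
#include <stdio.h>

enum flush_kind { FLUSH_EPT_CONTEXT, FLUSH_VPID_CONTEXT };

static enum flush_kind pick_flush(bool enable_ept, bool enable_vpid,
                                  bool invalidate_gpa)
{
    /*
     * Guest-physical mappings can only be dropped via the EPT (INVEPT)
     * path; otherwise a per-VPID linear flush is enough.  With VPID
     * disabled every VM entry/exit already flushes linear translations,
     * so the VPID path is effectively a no-op there.
     */
    if (enable_ept && (invalidate_gpa || !enable_vpid))
        return FLUSH_EPT_CONTEXT;
    return FLUSH_VPID_CONTEXT;
}

int main(void)
{
    printf("%d\n", pick_flush(true, true, false));  /* FLUSH_VPID_CONTEXT */
    printf("%d\n", pick_flush(true, true, true));   /* FLUSH_EPT_CONTEXT */
    return 0;
}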
- */ -} - -static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) -{ - ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; - - vcpu->arch.cr0 &= ~cr0_guest_owned_bits; - vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; -} - -static void vmx_decache_cr3(struct kvm_vcpu *vcpu) -{ - if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) - vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); -} - -static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) -{ - ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; - - vcpu->arch.cr4 &= ~cr4_guest_owned_bits; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; -} - -static void ept_load_pdptrs(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - - if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty)) - return; - - if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); - vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); - vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); - vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); - } -} - -static void ept_save_pdptrs(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - - if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); - mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); - mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); - mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); - } - - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail); - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty); -} - -static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; - u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high & - SECONDARY_EXEC_UNRESTRICTED_GUEST && - nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) - fixed0 &= ~(X86_CR0_PE | X86_CR0_PG); - - return fixed_bits_valid(val, fixed0, fixed1); -} - -static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; - u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; - - return fixed_bits_valid(val, fixed0, fixed1); -} - -static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0; - u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1; - - return fixed_bits_valid(val, fixed0, fixed1); -} - -/* No difference in the restrictions on guest and host CR4 in VMX operation. 
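/*
 * Illustrative sketch (userspace C, not part of this patch): the fixed-bit
 * test behind nested_guest_cr0_valid()/nested_cr4_valid() above.  Per the
 * IA32_VMX_CR{0,4}_FIXED{0,1} capability MSRs, bits set in FIXED0 must be 1
 * and bits clear in FIXED1 must be 0.  The helper name comes from the code
 * above; its body and the sample MSR values here are assumptions.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool fixed_bits_valid(uint64_t val, uint64_t fixed0, uint64_t fixed1)
{
    return ((val & fixed1) == val) && ((val & fixed0) == fixed0);
}

int main(void)
{
    /* A typical CR0 pair: PE, NE and PG forced on, everything else free.
     * The code above clears PE/PG from fixed0 when L1 may use
     * "unrestricted guest". */
    uint64_t cr0_fixed0 = 0x80000021ULL;
    uint64_t cr0_fixed1 = 0xffffffffULL;

    assert(fixed_bits_valid(0x80000031ULL, cr0_fixed0, cr0_fixed1));
    assert(!fixed_bits_valid(0x00000031ULL, cr0_fixed0, cr0_fixed1));
    return 0;
}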
*/ -#define nested_guest_cr4_valid nested_cr4_valid -#define nested_host_cr4_valid nested_cr4_valid - -static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); - -static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, - unsigned long cr0, - struct kvm_vcpu *vcpu) -{ - if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) - vmx_decache_cr3(vcpu); - if (!(cr0 & X86_CR0_PG)) { - /* From paging/starting to nonpaging */ - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, - vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | - (CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING)); - vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); - } else if (!is_paging(vcpu)) { - /* From nonpaging to paging */ - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, - vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & - ~(CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING)); - vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); - } - - if (!(cr0 & X86_CR0_WP)) - *hw_cr0 &= ~X86_CR0_WP; -} - -static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long hw_cr0; - - hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); - if (enable_unrestricted_guest) - hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; - else { - hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; - - if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) - enter_pmode(vcpu); - - if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) - enter_rmode(vcpu); - } - -#ifdef CONFIG_X86_64 - if (vcpu->arch.efer & EFER_LME) { - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) - enter_lmode(vcpu); - if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) - exit_lmode(vcpu); - } -#endif - - if (enable_ept && !enable_unrestricted_guest) - ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); - - vmcs_writel(CR0_READ_SHADOW, cr0); - vmcs_writel(GUEST_CR0, hw_cr0); - vcpu->arch.cr0 = cr0; - - /* depends on vcpu->arch.cr0 to be set to a new value */ - vmx->emulation_required = emulation_required(vcpu); -} - -static int get_ept_level(struct kvm_vcpu *vcpu) -{ - if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) - return 5; - return 4; -} - -static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) -{ - u64 eptp = VMX_EPTP_MT_WB; - - eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; - - if (enable_ept_ad_bits && - (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) - eptp |= VMX_EPTP_AD_ENABLE_BIT; - eptp |= (root_hpa & PAGE_MASK); - - return eptp; -} - -static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) -{ - struct kvm *kvm = vcpu->kvm; - unsigned long guest_cr3; - u64 eptp; - - guest_cr3 = cr3; - if (enable_ept) { - eptp = construct_eptp(vcpu, cr3); - vmcs_write64(EPT_POINTER, eptp); - - if (kvm_x86_ops->tlb_remote_flush) { - spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); - to_vmx(vcpu)->ept_pointer = eptp; - to_kvm_vmx(kvm)->ept_pointers_match - = EPT_POINTERS_CHECK; - spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); - } - - if (enable_unrestricted_guest || is_paging(vcpu) || - is_guest_mode(vcpu)) - guest_cr3 = kvm_read_cr3(vcpu); - else - guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; - ept_load_pdptrs(vcpu); - } - - vmcs_writel(GUEST_CR3, guest_cr3); -} - -static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) -{ - /* - * Pass through host's Machine Check Enable value to hw_cr4, which - * is in force while we are in guest mode. Do not let guests control - * this bit, even if host CR4.MCE == 0. 
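/*
 * Illustrative sketch (userspace C, not part of this patch): the EPTP layout
 * that construct_eptp() above assembles.  Field positions follow the SDM
 * description (bits 2:0 memory type, 6 = write-back; bits 5:3 page-walk
 * length minus one; bit 6 enables accessed/dirty flags), with the rest being
 * the 4KiB-aligned root table address.  The macro names are local to the
 * sketch, not the kernel's.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_EPTP_MT_WB       6ULL
#define SKETCH_EPTP_PWL(l)      (((uint64_t)(l) - 1) << 3)
#define SKETCH_EPTP_AD_ENABLE   (1ULL << 6)
#define SKETCH_PAGE_MASK_4K     (~0xfffULL)

static uint64_t sketch_eptp(uint64_t root_hpa, int levels, bool ad_bits)
{
    uint64_t eptp = SKETCH_EPTP_MT_WB | SKETCH_EPTP_PWL(levels);

    if (ad_bits)
        eptp |= SKETCH_EPTP_AD_ENABLE;
    return eptp | (root_hpa & SKETCH_PAGE_MASK_4K);
}

int main(void)
{
    /* 4-level walk, A/D enabled, root at 0x12345000 -> 0x1234505e */
    printf("%#llx\n",
           (unsigned long long)sketch_eptp(0x12345000ULL, 4, true));
    return 0;
}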
- */ - unsigned long hw_cr4; - - hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); - if (enable_unrestricted_guest) - hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; - else if (to_vmx(vcpu)->rmode.vm86_active) - hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; - else - hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; - - if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) { - if (cr4 & X86_CR4_UMIP) { - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_DESC); - hw_cr4 &= ~X86_CR4_UMIP; - } else if (!is_guest_mode(vcpu) || - !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_DESC); - } - - if (cr4 & X86_CR4_VMXE) { - /* - * To use VMXON (and later other VMX instructions), a guest - * must first be able to turn on cr4.VMXE (see handle_vmon()). - * So basically the check on whether to allow nested VMX - * is here. We operate under the default treatment of SMM, - * so VMX cannot be enabled under SMM. - */ - if (!nested_vmx_allowed(vcpu) || is_smm(vcpu)) - return 1; - } - - if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) - return 1; - - vcpu->arch.cr4 = cr4; - - if (!enable_unrestricted_guest) { - if (enable_ept) { - if (!is_paging(vcpu)) { - hw_cr4 &= ~X86_CR4_PAE; - hw_cr4 |= X86_CR4_PSE; - } else if (!(cr4 & X86_CR4_PAE)) { - hw_cr4 &= ~X86_CR4_PAE; - } - } - - /* - * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in - * hardware. To emulate this behavior, SMEP/SMAP/PKU needs - * to be manually disabled when guest switches to non-paging - * mode. - * - * If !enable_unrestricted_guest, the CPU is always running - * with CR0.PG=1 and CR4 needs to be modified. - * If enable_unrestricted_guest, the CPU automatically - * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. - */ - if (!is_paging(vcpu)) - hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); - } - - vmcs_writel(CR4_READ_SHADOW, cr4); - vmcs_writel(GUEST_CR4, hw_cr4); - return 0; -} - -static void vmx_get_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 ar; - - if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { - *var = vmx->rmode.segs[seg]; - if (seg == VCPU_SREG_TR - || var->selector == vmx_read_guest_seg_selector(vmx, seg)) - return; - var->base = vmx_read_guest_seg_base(vmx, seg); - var->selector = vmx_read_guest_seg_selector(vmx, seg); - return; - } - var->base = vmx_read_guest_seg_base(vmx, seg); - var->limit = vmx_read_guest_seg_limit(vmx, seg); - var->selector = vmx_read_guest_seg_selector(vmx, seg); - ar = vmx_read_guest_seg_ar(vmx, seg); - var->unusable = (ar >> 16) & 1; - var->type = ar & 15; - var->s = (ar >> 4) & 1; - var->dpl = (ar >> 5) & 3; - /* - * Some userspaces do not preserve unusable property. Since usable - * segment has to be present according to VMX spec we can use present - * property to amend userspace bug by making unusable segment always - * nonpresent. vmx_segment_access_rights() already marks nonpresent - * segment as unusable. 
- */ - var->present = !var->unusable; - var->avl = (ar >> 12) & 1; - var->l = (ar >> 13) & 1; - var->db = (ar >> 14) & 1; - var->g = (ar >> 15) & 1; -} - -static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) -{ - struct kvm_segment s; - - if (to_vmx(vcpu)->rmode.vm86_active) { - vmx_get_segment(vcpu, &s, seg); - return s.base; - } - return vmx_read_guest_seg_base(to_vmx(vcpu), seg); -} - -static int vmx_get_cpl(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (unlikely(vmx->rmode.vm86_active)) - return 0; - else { - int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); - return VMX_AR_DPL(ar); - } -} - -static u32 vmx_segment_access_rights(struct kvm_segment *var) -{ - u32 ar; - - if (var->unusable || !var->present) - ar = 1 << 16; - else { - ar = var->type & 15; - ar |= (var->s & 1) << 4; - ar |= (var->dpl & 3) << 5; - ar |= (var->present & 1) << 7; - ar |= (var->avl & 1) << 12; - ar |= (var->l & 1) << 13; - ar |= (var->db & 1) << 14; - ar |= (var->g & 1) << 15; - } - - return ar; -} - -static void vmx_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - - vmx_segment_cache_clear(vmx); - - if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { - vmx->rmode.segs[seg] = *var; - if (seg == VCPU_SREG_TR) - vmcs_write16(sf->selector, var->selector); - else if (var->s) - fix_rmode_seg(seg, &vmx->rmode.segs[seg]); - goto out; - } - - vmcs_writel(sf->base, var->base); - vmcs_write32(sf->limit, var->limit); - vmcs_write16(sf->selector, var->selector); - - /* - * Fix the "Accessed" bit in AR field of segment registers for older - * qemu binaries. - * IA32 arch specifies that at the time of processor reset the - * "Accessed" bit in the AR field of segment registers is 1. And qemu - * is setting it to 0 in the userland code. This causes invalid guest - * state vmexit when "unrestricted guest" mode is turned on. - * Fix for this setup issue in cpu_reset is being pushed in the qemu - * tree. Newer qemu binaries with that qemu fix would not need this - * kvm hack. 
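/*
 * Illustrative sketch (userspace C, not part of this patch): the access-rights
 * packing done by vmx_segment_access_rights() and the matching decode in
 * vmx_get_segment() above.  Bit positions: type 3:0, S 4, DPL 6:5, P 7,
 * AVL 12, L 13, D/B 14, G 15, and the VMX-only "unusable" bit 16.
 * struct seg_attr is a local stand-in for the relevant kvm_segment fields.
 */
#include <stdint.h>
#include <stdio.h>

struct seg_attr {
    unsigned type:4, s:1, dpl:2, present:1, avl:1, l:1, db:1, g:1, unusable:1;
};

static uint32_t pack_ar(const struct seg_attr *a)
{
    if (a->unusable || !a->present)
        return 1u << 16;
    return a->type | (a->s << 4) | (a->dpl << 5) | (a->present << 7) |
           (a->avl << 12) | (a->l << 13) | (a->db << 14) | (a->g << 15);
}

static struct seg_attr unpack_ar(uint32_t ar)
{
    struct seg_attr a = {
        .type = ar & 15,          .s   = (ar >> 4) & 1,
        .dpl  = (ar >> 5) & 3,    .avl = (ar >> 12) & 1,
        .l    = (ar >> 13) & 1,   .db  = (ar >> 14) & 1,
        .g    = (ar >> 15) & 1,   .unusable = (ar >> 16) & 1,
    };

    a.present = !a.unusable;    /* the userspace-bug workaround above */
    return a;
}

int main(void)
{
    struct seg_attr cs = { .type = 0xb, .s = 1, .present = 1, .db = 1, .g = 1 };

    printf("%#x\n", (unsigned)pack_ar(&cs));  /* 0xc09b: 32-bit code segment */
    return unpack_ar(pack_ar(&cs)).type != 0xb;
}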
- */ - if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) - var->type |= 0x1; /* Accessed */ - - vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); - -out: - vmx->emulation_required = emulation_required(vcpu); -} - -static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) -{ - u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); - - *db = (ar >> 14) & 1; - *l = (ar >> 13) & 1; -} - -static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) -{ - dt->size = vmcs_read32(GUEST_IDTR_LIMIT); - dt->address = vmcs_readl(GUEST_IDTR_BASE); -} - -static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) -{ - vmcs_write32(GUEST_IDTR_LIMIT, dt->size); - vmcs_writel(GUEST_IDTR_BASE, dt->address); -} - -static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) -{ - dt->size = vmcs_read32(GUEST_GDTR_LIMIT); - dt->address = vmcs_readl(GUEST_GDTR_BASE); -} - -static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) -{ - vmcs_write32(GUEST_GDTR_LIMIT, dt->size); - vmcs_writel(GUEST_GDTR_BASE, dt->address); -} - -static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) -{ - struct kvm_segment var; - u32 ar; - - vmx_get_segment(vcpu, &var, seg); - var.dpl = 0x3; - if (seg == VCPU_SREG_CS) - var.type = 0x3; - ar = vmx_segment_access_rights(&var); - - if (var.base != (var.selector << 4)) - return false; - if (var.limit != 0xffff) - return false; - if (ar != 0xf3) - return false; - - return true; -} - -static bool code_segment_valid(struct kvm_vcpu *vcpu) -{ - struct kvm_segment cs; - unsigned int cs_rpl; - - vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); - cs_rpl = cs.selector & SEGMENT_RPL_MASK; - - if (cs.unusable) - return false; - if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) - return false; - if (!cs.s) - return false; - if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { - if (cs.dpl > cs_rpl) - return false; - } else { - if (cs.dpl != cs_rpl) - return false; - } - if (!cs.present) - return false; - - /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ - return true; -} - -static bool stack_segment_valid(struct kvm_vcpu *vcpu) -{ - struct kvm_segment ss; - unsigned int ss_rpl; - - vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); - ss_rpl = ss.selector & SEGMENT_RPL_MASK; - - if (ss.unusable) - return true; - if (ss.type != 3 && ss.type != 7) - return false; - if (!ss.s) - return false; - if (ss.dpl != ss_rpl) /* DPL != RPL */ - return false; - if (!ss.present) - return false; - - return true; -} - -static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) -{ - struct kvm_segment var; - unsigned int rpl; - - vmx_get_segment(vcpu, &var, seg); - rpl = var.selector & SEGMENT_RPL_MASK; - - if (var.unusable) - return true; - if (!var.s) - return false; - if (!var.present) - return false; - if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { - if (var.dpl < rpl) /* DPL < RPL */ - return false; - } - - /* TODO: Add other members to kvm_segment_field to allow checking for other access - * rights flags - */ - return true; -} - -static bool tr_valid(struct kvm_vcpu *vcpu) -{ - struct kvm_segment tr; - - vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); - - if (tr.unusable) - return false; - if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ - return false; - if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ - return false; - if (!tr.present) - return false; - - return true; -} - -static bool ldtr_valid(struct kvm_vcpu *vcpu) -{ - struct 
kvm_segment ldtr; - - vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); - - if (ldtr.unusable) - return true; - if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ - return false; - if (ldtr.type != 2) - return false; - if (!ldtr.present) - return false; - - return true; -} - -static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) -{ - struct kvm_segment cs, ss; - - vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); - vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); - - return ((cs.selector & SEGMENT_RPL_MASK) == - (ss.selector & SEGMENT_RPL_MASK)); -} - -/* - * Check if guest state is valid. Returns true if valid, false if - * not. - * We assume that registers are always usable - */ -static bool guest_state_valid(struct kvm_vcpu *vcpu) -{ - if (enable_unrestricted_guest) - return true; - - /* real mode guest state checks */ - if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { - if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) - return false; - if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) - return false; - if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) - return false; - if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) - return false; - if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) - return false; - if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) - return false; - } else { - /* protected mode guest state checks */ - if (!cs_ss_rpl_check(vcpu)) - return false; - if (!code_segment_valid(vcpu)) - return false; - if (!stack_segment_valid(vcpu)) - return false; - if (!data_segment_valid(vcpu, VCPU_SREG_DS)) - return false; - if (!data_segment_valid(vcpu, VCPU_SREG_ES)) - return false; - if (!data_segment_valid(vcpu, VCPU_SREG_FS)) - return false; - if (!data_segment_valid(vcpu, VCPU_SREG_GS)) - return false; - if (!tr_valid(vcpu)) - return false; - if (!ldtr_valid(vcpu)) - return false; - } - /* TODO: - * - Add checks on RIP - * - Add checks on RFLAGS - */ - - return true; -} - -static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); -} - -static int init_rmode_tss(struct kvm *kvm) -{ - gfn_t fn; - u16 data = 0; - int idx, r; - - idx = srcu_read_lock(&kvm->srcu); - fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT; - r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); - if (r < 0) - goto out; - data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; - r = kvm_write_guest_page(kvm, fn++, &data, - TSS_IOPB_BASE_OFFSET, sizeof(u16)); - if (r < 0) - goto out; - r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); - if (r < 0) - goto out; - r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); - if (r < 0) - goto out; - data = ~0; - r = kvm_write_guest_page(kvm, fn, &data, - RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, - sizeof(u8)); -out: - srcu_read_unlock(&kvm->srcu, idx); - return r; -} - -static int init_rmode_identity_map(struct kvm *kvm) -{ - struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); - int i, idx, r = 0; - kvm_pfn_t identity_map_pfn; - u32 tmp; - - /* Protect kvm_vmx->ept_identity_pagetable_done. 
*/ - mutex_lock(&kvm->slots_lock); - - if (likely(kvm_vmx->ept_identity_pagetable_done)) - goto out2; - - if (!kvm_vmx->ept_identity_map_addr) - kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; - identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT; - - r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, - kvm_vmx->ept_identity_map_addr, PAGE_SIZE); - if (r < 0) - goto out2; - - idx = srcu_read_lock(&kvm->srcu); - r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); - if (r < 0) - goto out; - /* Set up identity-mapping pagetable for EPT in real mode */ - for (i = 0; i < PT32_ENT_PER_PAGE; i++) { - tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | - _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); - r = kvm_write_guest_page(kvm, identity_map_pfn, - &tmp, i * sizeof(tmp), sizeof(tmp)); - if (r < 0) - goto out; - } - kvm_vmx->ept_identity_pagetable_done = true; - -out: - srcu_read_unlock(&kvm->srcu, idx); - -out2: - mutex_unlock(&kvm->slots_lock); - return r; -} - -static void seg_setup(int seg) -{ - const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - unsigned int ar; - - vmcs_write16(sf->selector, 0); - vmcs_writel(sf->base, 0); - vmcs_write32(sf->limit, 0xffff); - ar = 0x93; - if (seg == VCPU_SREG_CS) - ar |= 0x08; /* code segment */ - - vmcs_write32(sf->ar_bytes, ar); -} - -static int alloc_apic_access_page(struct kvm *kvm) -{ - struct page *page; - int r = 0; - - mutex_lock(&kvm->slots_lock); - if (kvm->arch.apic_access_page_done) - goto out; - r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, - APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); - if (r) - goto out; - - page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (is_error_page(page)) { - r = -EFAULT; - goto out; - } - - /* - * Do not pin the page in memory, so that memory hot-unplug - * is able to migrate it. - */ - put_page(page); - kvm->arch.apic_access_page_done = true; -out: - mutex_unlock(&kvm->slots_lock); - return r; -} - -static int allocate_vpid(void) -{ - int vpid; - - if (!enable_vpid) - return 0; - spin_lock(&vmx_vpid_lock); - vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); - if (vpid < VMX_NR_VPIDS) - __set_bit(vpid, vmx_vpid_bitmap); - else - vpid = 0; - spin_unlock(&vmx_vpid_lock); - return vpid; -} - -static void free_vpid(int vpid) -{ - if (!enable_vpid || vpid == 0) - return; - spin_lock(&vmx_vpid_lock); - __clear_bit(vpid, vmx_vpid_bitmap); - spin_unlock(&vmx_vpid_lock); -} - -static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type) -{ - int f = sizeof(unsigned long); - - if (!cpu_has_vmx_msr_bitmap()) - return; - - if (static_branch_unlikely(&enable_evmcs)) - evmcs_touch_msr_bitmap(); - - /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 
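/*
 * Illustrative sketch (userspace C, not part of this patch): the 32-bit page
 * directory that init_rmode_identity_map() above writes into guest memory.
 * Each of the 1024 entries maps a 4MiB PSE page at (i << 22) with the
 * standard present/rw/user/accessed/dirty/PSE bits; the PDE_* names are local
 * to the sketch.
 */
#include <stdint.h>
#include <stdio.h>

#define PDE_PRESENT  0x001u
#define PDE_RW       0x002u
#define PDE_USER     0x004u
#define PDE_ACCESSED 0x020u
#define PDE_DIRTY    0x040u
#define PDE_PSE      0x080u

static void fill_identity_pd(uint32_t pd[1024])
{
    for (int i = 0; i < 1024; i++)
        pd[i] = ((uint32_t)i << 22) | PDE_PRESENT | PDE_RW | PDE_USER |
                PDE_ACCESSED | PDE_DIRTY | PDE_PSE;
}

int main(void)
{
    static uint32_t pd[1024];

    fill_identity_pd(pd);
    printf("pd[1] = %#x\n", (unsigned)pd[1]);  /* 0x4000e7: 4MiB page at 4MiB */
    return 0;
}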
- */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R) - /* read-low */ - __clear_bit(msr, msr_bitmap + 0x000 / f); - - if (type & MSR_TYPE_W) - /* write-low */ - __clear_bit(msr, msr_bitmap + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R) - /* read-high */ - __clear_bit(msr, msr_bitmap + 0x400 / f); - - if (type & MSR_TYPE_W) - /* write-high */ - __clear_bit(msr, msr_bitmap + 0xc00 / f); - - } -} - -static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type) -{ - int f = sizeof(unsigned long); - - if (!cpu_has_vmx_msr_bitmap()) - return; - - if (static_branch_unlikely(&enable_evmcs)) - evmcs_touch_msr_bitmap(); - - /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R) - /* read-low */ - __set_bit(msr, msr_bitmap + 0x000 / f); - - if (type & MSR_TYPE_W) - /* write-low */ - __set_bit(msr, msr_bitmap + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R) - /* read-high */ - __set_bit(msr, msr_bitmap + 0x400 / f); - - if (type & MSR_TYPE_W) - /* write-high */ - __set_bit(msr, msr_bitmap + 0xc00 / f); - - } -} - -static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type, bool value) -{ - if (value) - vmx_enable_intercept_for_msr(msr_bitmap, msr, type); - else - vmx_disable_intercept_for_msr(msr_bitmap, msr, type); -} - -/* - * If a msr is allowed by L0, we should check whether it is allowed by L1. - * The corresponding bit will be cleared unless both of L0 and L1 allow it. - */ -static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, - unsigned long *msr_bitmap_nested, - u32 msr, int type) -{ - int f = sizeof(unsigned long); - - /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 
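/*
 * Illustrative sketch (userspace C, not part of this patch): where a given
 * MSR's intercept bit lives inside the 4KiB MSR bitmap, using the offsets in
 * vmx_disable_intercept_for_msr()/vmx_enable_intercept_for_msr() above:
 * read-low 0x000, read-high 0x400, write-low 0x800, write-high 0xc00, with
 * the bit index being msr & 0x1fff.  The helper itself is made up.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Bit number within the 32768-bit bitmap, or -1 outside the two ranges. */
static int msr_bitmap_bit(uint32_t msr, bool write)
{
    uint32_t byte_base;

    if (msr <= 0x1fff)
        byte_base = write ? 0x800 : 0x000;
    else if (msr >= 0xc0000000 && msr <= 0xc0001fff)
        byte_base = write ? 0xc00 : 0x400;
    else
        return -1;

    return byte_base * 8 + (msr & 0x1fff);
}

int main(void)
{
    printf("%d\n", msr_bitmap_bit(0x00000010, false));  /* IA32_TSC, read */
    printf("%d\n", msr_bitmap_bit(0xc0000080, true));   /* IA32_EFER, write */
    return 0;
}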
- */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R && - !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) - /* read-low */ - __clear_bit(msr, msr_bitmap_nested + 0x000 / f); - - if (type & MSR_TYPE_W && - !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) - /* write-low */ - __clear_bit(msr, msr_bitmap_nested + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R && - !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) - /* read-high */ - __clear_bit(msr, msr_bitmap_nested + 0x400 / f); - - if (type & MSR_TYPE_W && - !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) - /* write-high */ - __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); - - } -} - -static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) -{ - u8 mode = 0; - - if (cpu_has_secondary_exec_ctrls() && - (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { - mode |= MSR_BITMAP_MODE_X2APIC; - if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) - mode |= MSR_BITMAP_MODE_X2APIC_APICV; - } - - return mode; -} - -#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) - -static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, - u8 mode) -{ - int msr; - - for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { - unsigned word = msr / BITS_PER_LONG; - msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; - msr_bitmap[word + (0x800 / sizeof(long))] = ~0; - } - - if (mode & MSR_BITMAP_MODE_X2APIC) { - /* - * TPR reads and writes can be virtualized even if virtual interrupt - * delivery is not in use. - */ - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); - if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { - vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); - } - } -} - -static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; - u8 mode = vmx_msr_bitmap_mode(vcpu); - u8 changed = mode ^ vmx->msr_bitmap_mode; - - if (!changed) - return; - - if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) - vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); - - vmx->msr_bitmap_mode = mode; -} - -static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) -{ - return enable_apicv; -} - -static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - gfn_t gfn; - - /* - * Don't need to mark the APIC access page dirty; it is never - * written to by the CPU during APIC virtualization. 
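/*
 * Illustrative sketch (userspace C, not part of this patch): the X2APIC_MSR()
 * mapping used above.  In x2APIC mode each MMIO APIC register at offset 0xNN0
 * is reachable as MSR 0x800 + (offset >> 4), so the TPR at offset 0x80 becomes
 * MSR 0x808, EOI at 0xb0 becomes 0x80b, and the current-count timer register
 * at 0x390 becomes 0x839 - the registers whose intercepts are toggled above.
 */
#include <stdio.h>

#define APIC_BASE_MSR   0x800u
#define X2APIC_MSR(r)   (APIC_BASE_MSR + ((r) >> 4))

int main(void)
{
    printf("TPR   -> %#x\n", X2APIC_MSR(0x80u));   /* 0x808 */
    printf("EOI   -> %#x\n", X2APIC_MSR(0xb0u));   /* 0x80b */
    printf("TMCCT -> %#x\n", X2APIC_MSR(0x390u));  /* 0x839 */
    return 0;
}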
- */ - - if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { - gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; - kvm_vcpu_mark_page_dirty(vcpu, gfn); - } - - if (nested_cpu_has_posted_intr(vmcs12)) { - gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; - kvm_vcpu_mark_page_dirty(vcpu, gfn); - } -} - - -static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int max_irr; - void *vapic_page; - u16 status; - - if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) - return; - - vmx->nested.pi_pending = false; - if (!pi_test_and_clear_on(vmx->nested.pi_desc)) - return; - - max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); - if (max_irr != 256) { - vapic_page = kmap(vmx->nested.virtual_apic_page); - __kvm_apic_update_irr(vmx->nested.pi_desc->pir, - vapic_page, &max_irr); - kunmap(vmx->nested.virtual_apic_page); - - status = vmcs_read16(GUEST_INTR_STATUS); - if ((u8)max_irr > ((u8)status & 0xff)) { - status &= ~0xff; - status |= (u8)max_irr; - vmcs_write16(GUEST_INTR_STATUS, status); - } - } - - nested_mark_vmcs12_pages_dirty(vcpu); -} - -static u8 vmx_get_rvi(void) -{ - return vmcs_read16(GUEST_INTR_STATUS) & 0xff; -} - -static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - void *vapic_page; - u32 vppr; - int rvi; - - if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || - !nested_cpu_has_vid(get_vmcs12(vcpu)) || - WARN_ON_ONCE(!vmx->nested.virtual_apic_page)) - return false; - - rvi = vmx_get_rvi(); - - vapic_page = kmap(vmx->nested.virtual_apic_page); - vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); - kunmap(vmx->nested.virtual_apic_page); - - return ((rvi & 0xf0) > (vppr & 0xf0)); -} - -static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, - bool nested) -{ -#ifdef CONFIG_SMP - int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; - - if (vcpu->mode == IN_GUEST_MODE) { - /* - * The vector of interrupt to be delivered to vcpu had - * been set in PIR before this function. - * - * Following cases will be reached in this block, and - * we always send a notification event in all cases as - * explained below. - * - * Case 1: vcpu keeps in non-root mode. Sending a - * notification event posts the interrupt to vcpu. - * - * Case 2: vcpu exits to root mode and is still - * runnable. PIR will be synced to vIRR before the - * next vcpu entry. Sending a notification event in - * this case has no effect, as vcpu is not in root - * mode. - * - * Case 3: vcpu exits to root mode and is blocked. - * vcpu_block() has already synced PIR to vIRR and - * never blocks vcpu if vIRR is not cleared. Therefore, - * a blocked vcpu here does not wait for any requested - * interrupts in PIR, and sending a notification event - * which has no effect is safe here. - */ - - apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); - return true; - } -#endif - return false; -} - -static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, - int vector) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (is_guest_mode(vcpu) && - vector == vmx->nested.posted_intr_nv) { - /* - * If a posted intr is not recognized by hardware, - * we will accomplish it in the next vmentry. - */ - vmx->nested.pi_pending = true; - kvm_make_request(KVM_REQ_EVENT, vcpu); - /* the PIR and ON have been set by L1. */ - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) - kvm_vcpu_kick(vcpu); - return 0; - } - return -1; -} -/* - * Send interrupt to vcpu via posted interrupt way. - * 1. 
If target vcpu is running(non-root mode), send posted interrupt - * notification to vcpu and hardware will sync PIR to vIRR atomically. - * 2. If target vcpu isn't running(root mode), kick it to pick up the - * interrupt from PIR in next vmentry. - */ -static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int r; - - r = vmx_deliver_nested_posted_interrupt(vcpu, vector); - if (!r) - return; - - if (pi_test_and_set_pir(vector, &vmx->pi_desc)) - return; - - /* If a previous notification has sent the IPI, nothing to do. */ - if (pi_test_and_set_on(&vmx->pi_desc)) - return; - - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) - kvm_vcpu_kick(vcpu); -} - -/* - * Set up the vmcs's constant host-state fields, i.e., host-state fields that - * will not change in the lifetime of the guest. - * Note that host-state that does change is set elsewhere. E.g., host-state - * that is set differently for each CPU is set in vmx_vcpu_load(), not here. - */ -static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) -{ - u32 low32, high32; - unsigned long tmpl; - struct desc_ptr dt; - unsigned long cr0, cr3, cr4; - - cr0 = read_cr0(); - WARN_ON(cr0 & X86_CR0_TS); - vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ - - /* - * Save the most likely value for this task's CR3 in the VMCS. - * We can't use __get_current_cr3_fast() because we're not atomic. - */ - cr3 = __read_cr3(); - vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ - vmx->loaded_vmcs->host_state.cr3 = cr3; - - /* Save the most likely value for this task's CR4 in the VMCS. */ - cr4 = cr4_read_shadow(); - vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ - vmx->loaded_vmcs->host_state.cr4 = cr4; - - vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ -#ifdef CONFIG_X86_64 - /* - * Load null selectors, so we can avoid reloading them in - * vmx_prepare_switch_to_host(), in case userspace uses - * the null selectors too (the expected case). 
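/*
 * Illustrative sketch (userspace C11, not part of this patch): the two-step
 * protocol vmx_deliver_posted_interrupt() above relies on - set the vector in
 * the 256-bit PIR, then test-and-set the ON bit, and only notify (IPI/kick)
 * if ON was previously clear.  struct pi_desc_sketch is a simplified stand-in
 * for struct pi_desc, and the early-out when the PIR bit was already set is
 * omitted here.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct pi_desc_sketch {
    atomic_uint_fast64_t pir[4];   /* 256 posted-interrupt request bits */
    atomic_uint_fast64_t control;  /* bit 0: ON, outstanding notification */
};

/* Returns true when the caller should send the notification event. */
static bool post_interrupt(struct pi_desc_sketch *pi, unsigned int vector)
{
    uint_fast64_t old_control;

    atomic_fetch_or(&pi->pir[vector / 64], (uint_fast64_t)1 << (vector % 64));
    old_control = atomic_fetch_or(&pi->control, 1);  /* test-and-set ON */
    return (old_control & 1) == 0;
}

int main(void)
{
    struct pi_desc_sketch pi = { 0 };

    printf("%d\n", post_interrupt(&pi, 0x31));  /* 1: notify */
    printf("%d\n", post_interrupt(&pi, 0x32));  /* 0: ON already set */
    return 0;
}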
- */ - vmcs_write16(HOST_DS_SELECTOR, 0); - vmcs_write16(HOST_ES_SELECTOR, 0); -#else - vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ -#endif - vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ - - store_idt(&dt); - vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ - vmx->host_idt_base = dt.address; - - vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ - - rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); - vmcs_write32(HOST_IA32_SYSENTER_CS, low32); - rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); - vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ - - if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { - rdmsr(MSR_IA32_CR_PAT, low32, high32); - vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); - } - - if (cpu_has_load_ia32_efer) - vmcs_write64(HOST_IA32_EFER, host_efer); -} - -static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) -{ - vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; - if (enable_ept) - vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; - if (is_guest_mode(&vmx->vcpu)) - vmx->vcpu.arch.cr4_guest_owned_bits &= - ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; - vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); -} - -static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) -{ - u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; - - if (!kvm_vcpu_apicv_active(&vmx->vcpu)) - pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; - - if (!enable_vnmi) - pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; - - /* Enable the preemption timer dynamically */ - pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; - return pin_based_exec_ctrl; -} - -static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); - if (cpu_has_secondary_exec_ctrls()) { - if (kvm_vcpu_apicv_active(vcpu)) - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - else - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - } - - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); -} - -static u32 vmx_exec_control(struct vcpu_vmx *vmx) -{ - u32 exec_control = vmcs_config.cpu_based_exec_ctrl; - - if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) - exec_control &= ~CPU_BASED_MOV_DR_EXITING; - - if (!cpu_need_tpr_shadow(&vmx->vcpu)) { - exec_control &= ~CPU_BASED_TPR_SHADOW; -#ifdef CONFIG_X86_64 - exec_control |= CPU_BASED_CR8_STORE_EXITING | - CPU_BASED_CR8_LOAD_EXITING; -#endif - } - if (!enable_ept) - exec_control |= CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_INVLPG_EXITING; - if (kvm_mwait_in_guest(vmx->vcpu.kvm)) - exec_control &= ~(CPU_BASED_MWAIT_EXITING | - CPU_BASED_MONITOR_EXITING); - if (kvm_hlt_in_guest(vmx->vcpu.kvm)) - exec_control &= ~CPU_BASED_HLT_EXITING; - return exec_control; -} - -static bool vmx_rdrand_supported(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_RDRAND_EXITING; -} - -static bool vmx_rdseed_supported(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_RDSEED_EXITING; -} - -static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) -{ - struct kvm_vcpu *vcpu = &vmx->vcpu; - - u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; - - if 
(!cpu_need_virtualize_apic_accesses(vcpu)) - exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - if (vmx->vpid == 0) - exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; - if (!enable_ept) { - exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; - enable_unrestricted_guest = 0; - } - if (!enable_unrestricted_guest) - exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; - if (kvm_pause_in_guest(vmx->vcpu.kvm)) - exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; - if (!kvm_vcpu_apicv_active(vcpu)) - exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; - - /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, - * in vmx_set_cr4. */ - exec_control &= ~SECONDARY_EXEC_DESC; - - /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD - (handle_vmptrld). - We can NOT enable shadow_vmcs here because we don't have yet - a current VMCS12 - */ - exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - - if (!enable_pml) - exec_control &= ~SECONDARY_EXEC_ENABLE_PML; - - if (vmx_xsaves_supported()) { - /* Exposing XSAVES only when XSAVE is exposed */ - bool xsaves_enabled = - guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); - - if (!xsaves_enabled) - exec_control &= ~SECONDARY_EXEC_XSAVES; - - if (nested) { - if (xsaves_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_XSAVES; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_XSAVES; - } - } - - if (vmx_rdtscp_supported()) { - bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP); - if (!rdtscp_enabled) - exec_control &= ~SECONDARY_EXEC_RDTSCP; - - if (nested) { - if (rdtscp_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_RDTSCP; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_RDTSCP; - } - } - - if (vmx_invpcid_supported()) { - /* Exposing INVPCID only when PCID is exposed */ - bool invpcid_enabled = - guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) && - guest_cpuid_has(vcpu, X86_FEATURE_PCID); - - if (!invpcid_enabled) { - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID); - } - - if (nested) { - if (invpcid_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_INVPCID; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_ENABLE_INVPCID; - } - } - - if (vmx_rdrand_supported()) { - bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND); - if (rdrand_enabled) - exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING; - - if (nested) { - if (rdrand_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_RDRAND_EXITING; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_RDRAND_EXITING; - } - } - - if (vmx_rdseed_supported()) { - bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED); - if (rdseed_enabled) - exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING; - - if (nested) { - if (rdseed_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_RDSEED_EXITING; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_RDSEED_EXITING; - } - } - - vmx->secondary_exec_control = exec_control; -} - -static void ept_set_mmio_spte_mask(void) -{ - /* - * EPT Misconfigurations can be generated if the value of bits 2:0 - * of an EPT paging-structure entry is 110b (write/execute). 
- */ - kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK, - VMX_EPT_MISCONFIG_WX_VALUE); -} - -#define VMX_XSS_EXIT_BITMAP 0 -/* - * Sets up the vmcs for emulated real mode. - */ -static void vmx_vcpu_setup(struct vcpu_vmx *vmx) -{ - int i; - - if (enable_shadow_vmcs) { - /* - * At vCPU creation, "VMWRITE to any supported field - * in the VMCS" is supported, so use the more - * permissive vmx_vmread_bitmap to specify both read - * and write permissions for the shadow VMCS. - */ - vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); - vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap)); - } - if (cpu_has_vmx_msr_bitmap()) - vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); - - vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ - - /* Control */ - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); - vmx->hv_deadline_tsc = -1; - - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); - - if (cpu_has_secondary_exec_ctrls()) { - vmx_compute_secondary_exec_control(vmx); - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - vmx->secondary_exec_control); - } - - if (kvm_vcpu_apicv_active(&vmx->vcpu)) { - vmcs_write64(EOI_EXIT_BITMAP0, 0); - vmcs_write64(EOI_EXIT_BITMAP1, 0); - vmcs_write64(EOI_EXIT_BITMAP2, 0); - vmcs_write64(EOI_EXIT_BITMAP3, 0); - - vmcs_write16(GUEST_INTR_STATUS, 0); - - vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); - vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); - } - - if (!kvm_pause_in_guest(vmx->vcpu.kvm)) { - vmcs_write32(PLE_GAP, ple_gap); - vmx->ple_window = ple_window; - vmx->ple_window_dirty = true; - } - - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); - vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ - - vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ - vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ - vmx_set_constant_host_state(vmx); - vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ - vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ - - if (cpu_has_vmx_vmfunc()) - vmcs_write64(VM_FUNCTION_CONTROL, 0); - - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); - vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); - vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); - - if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) - vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); - - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { - u32 index = vmx_msr_index[i]; - u32 data_low, data_high; - int j = vmx->nmsrs; - - if (rdmsr_safe(index, &data_low, &data_high) < 0) - continue; - if (wrmsr_safe(index, data_low, data_high) < 0) - continue; - vmx->guest_msrs[j].index = i; - vmx->guest_msrs[j].data = 0; - vmx->guest_msrs[j].mask = -1ull; - ++vmx->nmsrs; - } - - vmx->arch_capabilities = kvm_get_arch_capabilities(); - - vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); - - /* 22.2.1, 20.8.1 */ - vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); - - vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS; - vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS); - - set_cr4_guest_host_mask(vmx); - - if (vmx_xsaves_supported()) - vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); - - if (enable_pml) { - vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); - } - - if (cpu_has_vmx_encls_vmexit()) - vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); -} - -static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) -{ - struct vcpu_vmx *vmx = 
to_vmx(vcpu); - struct msr_data apic_base_msr; - u64 cr0; - - vmx->rmode.vm86_active = 0; - vmx->spec_ctrl = 0; - - vcpu->arch.microcode_version = 0x100000000ULL; - vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); - kvm_set_cr8(vcpu, 0); - - if (!init_event) { - apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(vcpu)) - apic_base_msr.data |= MSR_IA32_APICBASE_BSP; - apic_base_msr.host_initiated = true; - kvm_set_apic_base(vcpu, &apic_base_msr); - } - - vmx_segment_cache_clear(vmx); - - seg_setup(VCPU_SREG_CS); - vmcs_write16(GUEST_CS_SELECTOR, 0xf000); - vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); - - seg_setup(VCPU_SREG_DS); - seg_setup(VCPU_SREG_ES); - seg_setup(VCPU_SREG_FS); - seg_setup(VCPU_SREG_GS); - seg_setup(VCPU_SREG_SS); - - vmcs_write16(GUEST_TR_SELECTOR, 0); - vmcs_writel(GUEST_TR_BASE, 0); - vmcs_write32(GUEST_TR_LIMIT, 0xffff); - vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); - - vmcs_write16(GUEST_LDTR_SELECTOR, 0); - vmcs_writel(GUEST_LDTR_BASE, 0); - vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); - vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); - - if (!init_event) { - vmcs_write32(GUEST_SYSENTER_CS, 0); - vmcs_writel(GUEST_SYSENTER_ESP, 0); - vmcs_writel(GUEST_SYSENTER_EIP, 0); - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - } - - kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); - kvm_rip_write(vcpu, 0xfff0); - - vmcs_writel(GUEST_GDTR_BASE, 0); - vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); - - vmcs_writel(GUEST_IDTR_BASE, 0); - vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); - - vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); - vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); - if (kvm_mpx_supported()) - vmcs_write64(GUEST_BNDCFGS, 0); - - setup_msrs(vmx); - - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ - - if (cpu_has_vmx_tpr_shadow() && !init_event) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (cpu_need_tpr_shadow(vcpu)) - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - __pa(vcpu->arch.apic->regs)); - vmcs_write32(TPR_THRESHOLD, 0); - } - - kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - - if (vmx->vpid != 0) - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - - cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx->vcpu.arch.cr0 = cr0; - vmx_set_cr0(vcpu, cr0); /* enter rmode */ - vmx_set_cr4(vcpu, 0); - vmx_set_efer(vcpu, 0); - - update_exception_bitmap(vcpu); - - vpid_sync_context(vmx->vpid); - if (init_event) - vmx_clear_hlt(vcpu); -} - -/* - * In nested virtualization, check if L1 asked to exit on external interrupts. - * For most existing hypervisors, this will always return true. 
- */ -static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) -{ - return get_vmcs12(vcpu)->pin_based_vm_exec_control & - PIN_BASED_EXT_INTR_MASK; -} - -/* - * In nested virtualization, check if L1 has set - * VM_EXIT_ACK_INTR_ON_EXIT - */ -static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) -{ - return get_vmcs12(vcpu)->vm_exit_controls & - VM_EXIT_ACK_INTR_ON_EXIT; -} - -static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) -{ - return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu)); -} - -static void enable_irq_window(struct kvm_vcpu *vcpu) -{ - vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_VIRTUAL_INTR_PENDING); -} - -static void enable_nmi_window(struct kvm_vcpu *vcpu) -{ - if (!enable_vnmi || - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { - enable_irq_window(vcpu); - return; - } - - vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_VIRTUAL_NMI_PENDING); -} - -static void vmx_inject_irq(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - uint32_t intr; - int irq = vcpu->arch.interrupt.nr; - - trace_kvm_inj_virq(irq); - - ++vcpu->stat.irq_injections; - if (vmx->rmode.vm86_active) { - int inc_eip = 0; - if (vcpu->arch.interrupt.soft) - inc_eip = vcpu->arch.event_exit_inst_len; - if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - return; - } - intr = irq | INTR_INFO_VALID_MASK; - if (vcpu->arch.interrupt.soft) { - intr |= INTR_TYPE_SOFT_INTR; - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmx->vcpu.arch.event_exit_inst_len); - } else - intr |= INTR_TYPE_EXT_INTR; - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); - - vmx_clear_hlt(vcpu); -} - -static void vmx_inject_nmi(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!enable_vnmi) { - /* - * Tracking the NMI-blocked state in software is built upon - * finding the next open IRQ window. This, in turn, depends on - * well-behaving guests: They have to keep IRQs disabled at - * least as long as the NMI handler runs. Otherwise we may - * cause NMI nesting, maybe breaking the guest. But as this is - * highly unlikely, we can live with the residual risk. 
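/*
 * Illustrative sketch (userspace C, not part of this patch): how
 * vmx_inject_irq() and vmx_inject_nmi() above assemble the value written to
 * VM_ENTRY_INTR_INFO_FIELD - vector in bits 7:0, event type in bits 10:8
 * (0 external interrupt, 2 NMI, 4 software interrupt), valid bit 31.  The
 * macro values follow the SDM event-type encoding and are restated locally.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_INTR_INFO_VALID   (1u << 31)
#define SKETCH_TYPE_EXT_INTR     (0u << 8)
#define SKETCH_TYPE_NMI          (2u << 8)
#define SKETCH_TYPE_SOFT_INTR    (4u << 8)

static uint32_t entry_intr_info(uint8_t vector, bool nmi, bool soft)
{
    uint32_t type = nmi ? SKETCH_TYPE_NMI
                        : (soft ? SKETCH_TYPE_SOFT_INTR : SKETCH_TYPE_EXT_INTR);

    return vector | type | SKETCH_INTR_INFO_VALID;
}

int main(void)
{
    printf("%#x\n", (unsigned)entry_intr_info(0x20, false, false)); /* 0x80000020 */
    printf("%#x\n", (unsigned)entry_intr_info(2, true, false));     /* 0x80000202 */
    return 0;
}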
- */ - vmx->loaded_vmcs->soft_vnmi_blocked = 1; - vmx->loaded_vmcs->vnmi_blocked_time = 0; - } - - ++vcpu->stat.nmi_injections; - vmx->loaded_vmcs->nmi_known_unmasked = false; - - if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - return; - } - - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); - - vmx_clear_hlt(vcpu); -} - -static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - bool masked; - - if (!enable_vnmi) - return vmx->loaded_vmcs->soft_vnmi_blocked; - if (vmx->loaded_vmcs->nmi_known_unmasked) - return false; - masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; - vmx->loaded_vmcs->nmi_known_unmasked = !masked; - return masked; -} - -static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!enable_vnmi) { - if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { - vmx->loaded_vmcs->soft_vnmi_blocked = masked; - vmx->loaded_vmcs->vnmi_blocked_time = 0; - } - } else { - vmx->loaded_vmcs->nmi_known_unmasked = !masked; - if (masked) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - } -} - -static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) -{ - if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; - - if (!enable_vnmi && - to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) - return 0; - - return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI - | GUEST_INTR_STATE_NMI)); -} - -static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) -{ - return (!to_vmx(vcpu)->nested.nested_run_pending && - vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && - !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); -} - -static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) -{ - int ret; - - if (enable_unrestricted_guest) - return 0; - - ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, - PAGE_SIZE * 3); - if (ret) - return ret; - to_kvm_vmx(kvm)->tss_addr = addr; - return init_rmode_tss(kvm); -} - -static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) -{ - to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; - return 0; -} - -static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) -{ - switch (vec) { - case BP_VECTOR: - /* - * Update instruction length as we may reinject the exception - * from user space while in guest debugging mode. - */ - to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) - return false; - /* fall through */ - case DB_VECTOR: - if (vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - return false; - /* fall through */ - case DE_VECTOR: - case OF_VECTOR: - case BR_VECTOR: - case UD_VECTOR: - case DF_VECTOR: - case SS_VECTOR: - case GP_VECTOR: - case MF_VECTOR: - return true; - break; - } - return false; -} - -static int handle_rmode_exception(struct kvm_vcpu *vcpu, - int vec, u32 err_code) -{ - /* - * Instruction with address size override prefix opcode 0x67 - * Cause the #SS fault with 0 error code in VM86 mode. 
- */ - if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { - if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) { - if (vcpu->arch.halt_request) { - vcpu->arch.halt_request = 0; - return kvm_vcpu_halt(vcpu); - } - return 1; - } - return 0; - } - - /* - * Forward all other exceptions that are valid in real mode. - * FIXME: Breaks guest debugging in real mode, needs to be fixed with - * the required debugging infrastructure rework. - */ - kvm_queue_exception(vcpu, vec); - return 1; -} - -/* - * Trigger machine check on the host. We assume all the MSRs are already set up - * by the CPU and that we still run on the same CPU as the MCE occurred on. - * We pass a fake environment to the machine check handler because we want - * the guest to be always treated like user space, no matter what context - * it used internally. - */ -static void kvm_machine_check(void) -{ -#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) - struct pt_regs regs = { - .cs = 3, /* Fake ring 3 no matter what the guest ran on */ - .flags = X86_EFLAGS_IF, - }; - - do_machine_check(&regs, 0); -#endif -} - -static int handle_machine_check(struct kvm_vcpu *vcpu) -{ - /* already handled by vcpu_run */ - return 1; -} - -static int handle_exception(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_run *kvm_run = vcpu->run; - u32 intr_info, ex_no, error_code; - unsigned long cr2, rip, dr6; - u32 vect_info; - enum emulation_result er; - - vect_info = vmx->idt_vectoring_info; - intr_info = vmx->exit_intr_info; - - if (is_machine_check(intr_info)) - return handle_machine_check(vcpu); - - if (is_nmi(intr_info)) - return 1; /* already handled by vmx_vcpu_run() */ - - if (is_invalid_opcode(intr_info)) - return handle_ud(vcpu); - - error_code = 0; - if (intr_info & INTR_INFO_DELIVER_CODE_MASK) - error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); - - if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { - WARN_ON_ONCE(!enable_vmware_backdoor); - er = kvm_emulate_instruction(vcpu, - EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); - if (er == EMULATE_USER_EXIT) - return 0; - else if (er != EMULATE_DONE) - kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); - return 1; - } - - /* - * The #PF with PFEC.RSVD = 1 indicates the guest is accessing - * MMIO, it is better to report an internal error. - * See the comments in vmx_handle_exit. 
- */ - if ((vect_info & VECTORING_INFO_VALID_MASK) && - !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; - vcpu->run->internal.ndata = 3; - vcpu->run->internal.data[0] = vect_info; - vcpu->run->internal.data[1] = intr_info; - vcpu->run->internal.data[2] = error_code; - return 0; - } - - if (is_page_fault(intr_info)) { - cr2 = vmcs_readl(EXIT_QUALIFICATION); - /* EPT won't cause page fault directly */ - WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); - return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); - } - - ex_no = intr_info & INTR_INFO_VECTOR_MASK; - - if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) - return handle_rmode_exception(vcpu, ex_no, error_code); - - switch (ex_no) { - case AC_VECTOR: - kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); - return 1; - case DB_VECTOR: - dr6 = vmcs_readl(EXIT_QUALIFICATION); - if (!(vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { - vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= dr6 | DR6_RTM; - if (is_icebp(intr_info)) - skip_emulated_instruction(vcpu); - - kvm_queue_exception(vcpu, DB_VECTOR); - return 1; - } - kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; - kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); - /* fall through */ - case BP_VECTOR: - /* - * Update instruction length as we may reinject #BP from - * user space while in guest debugging mode. Reading it for - * #DB as well causes no harm, it is not used in that case. - */ - vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - kvm_run->exit_reason = KVM_EXIT_DEBUG; - rip = kvm_rip_read(vcpu); - kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; - kvm_run->debug.arch.exception = ex_no; - break; - default: - kvm_run->exit_reason = KVM_EXIT_EXCEPTION; - kvm_run->ex.exception = ex_no; - kvm_run->ex.error_code = error_code; - break; - } - return 0; -} - -static int handle_external_interrupt(struct kvm_vcpu *vcpu) -{ - ++vcpu->stat.irq_exits; - return 1; -} - -static int handle_triple_fault(struct kvm_vcpu *vcpu) -{ - vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; - vcpu->mmio_needed = 0; - return 0; -} - -static int handle_io(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification; - int size, in, string; - unsigned port; - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - string = (exit_qualification & 16) != 0; - - ++vcpu->stat.io_exits; - - if (string) - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; - - port = exit_qualification >> 16; - size = (exit_qualification & 7) + 1; - in = (exit_qualification & 8) != 0; - - return kvm_fast_pio(vcpu, size, port, in); -} - -static void -vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) -{ - /* - * Patch in the VMCALL instruction: - */ - hypercall[0] = 0x0f; - hypercall[1] = 0x01; - hypercall[2] = 0xc1; -} - -/* called to set cr0 as appropriate for a mov-to-cr0 exit. */ -static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) -{ - if (is_guest_mode(vcpu)) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - unsigned long orig_val = val; - - /* - * We get here when L2 changed cr0 in a way that did not change - * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), - * but did change L0 shadowed bits. So we first calculate the - * effective cr0 value that L1 would like to write into the - * hardware. 
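/*
 * Illustrative sketch (userspace C, not part of this patch): the exit
 * qualification fields that handle_io() above pulls apart - access size in
 * bits 2:0 (size minus one), direction in bit 3 (1 = IN), string instruction
 * in bit 4, port number in bits 31:16.  The helper and struct are made up.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct io_exit {
    uint16_t port;
    uint8_t  size;
    bool     in, string;
};

static struct io_exit decode_io_qual(uint64_t qual)
{
    struct io_exit io = {
        .size   = (qual & 7) + 1,
        .in     = (qual & 8) != 0,
        .string = (qual & 16) != 0,
        .port   = (uint16_t)(qual >> 16),
    };
    return io;
}

int main(void)
{
    /* e.g. "out %al, $0x70": one byte, OUT, port 0x70 */
    struct io_exit io = decode_io_qual(0x700000);

    printf("port=%#x size=%d in=%d string=%d\n",
           (unsigned)io.port, io.size, io.in, io.string);
    return 0;
}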
It consists of the L2-owned bits from the new - * value combined with the L1-owned bits from L1's guest_cr0. - */ - val = (val & ~vmcs12->cr0_guest_host_mask) | - (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); - - if (!nested_guest_cr0_valid(vcpu, val)) - return 1; - - if (kvm_set_cr0(vcpu, val)) - return 1; - vmcs_writel(CR0_READ_SHADOW, orig_val); - return 0; - } else { - if (to_vmx(vcpu)->nested.vmxon && - !nested_host_cr0_valid(vcpu, val)) - return 1; - - return kvm_set_cr0(vcpu, val); - } -} - -static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) -{ - if (is_guest_mode(vcpu)) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - unsigned long orig_val = val; - - /* analogously to handle_set_cr0 */ - val = (val & ~vmcs12->cr4_guest_host_mask) | - (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); - if (kvm_set_cr4(vcpu, val)) - return 1; - vmcs_writel(CR4_READ_SHADOW, orig_val); - return 0; - } else - return kvm_set_cr4(vcpu, val); -} - -static int handle_desc(struct kvm_vcpu *vcpu) -{ - WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; -} - -static int handle_cr(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification, val; - int cr; - int reg; - int err; - int ret; - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - cr = exit_qualification & 15; - reg = (exit_qualification >> 8) & 15; - switch ((exit_qualification >> 4) & 3) { - case 0: /* mov to cr */ - val = kvm_register_readl(vcpu, reg); - trace_kvm_cr_write(cr, val); - switch (cr) { - case 0: - err = handle_set_cr0(vcpu, val); - return kvm_complete_insn_gp(vcpu, err); - case 3: - WARN_ON_ONCE(enable_unrestricted_guest); - err = kvm_set_cr3(vcpu, val); - return kvm_complete_insn_gp(vcpu, err); - case 4: - err = handle_set_cr4(vcpu, val); - return kvm_complete_insn_gp(vcpu, err); - case 8: { - u8 cr8_prev = kvm_get_cr8(vcpu); - u8 cr8 = (u8)val; - err = kvm_set_cr8(vcpu, cr8); - ret = kvm_complete_insn_gp(vcpu, err); - if (lapic_in_kernel(vcpu)) - return ret; - if (cr8_prev <= cr8) - return ret; - /* - * TODO: we might be squashing a - * KVM_GUESTDBG_SINGLESTEP-triggered - * KVM_EXIT_DEBUG here. 
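handle_set_cr0() above, in the nested case, builds the value L1 expects to see: bits owned by L1 (set in vmcs12->cr0_guest_host_mask) come from vmcs12->guest_cr0, everything else from what L2 just wrote. A small worked example of that merge, using hypothetical mask and register values:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical values: L1 owns PE and PG (set in the guest/host mask),
 * everything else is passed through from L2's write. */
int main(void)
{
        uint64_t cr0_guest_host_mask = 0x80000001;   /* PG | PE, owned by L1 */
        uint64_t guest_cr0           = 0x80000033;   /* L1's view of CR0 */
        uint64_t l2_write            = 0x0000003b;   /* value L2 tried to load */

        uint64_t effective = (l2_write & ~cr0_guest_host_mask) |
                             (guest_cr0 & cr0_guest_host_mask);

        /* prints 0x8000003b: PG/PE taken from L1's view, low bits from L2 */
        printf("effective cr0 = 0x%llx\n", (unsigned long long)effective);
        return 0;
}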
- */ - vcpu->run->exit_reason = KVM_EXIT_SET_TPR; - return 0; - } - } - break; - case 2: /* clts */ - WARN_ONCE(1, "Guest should always own CR0.TS"); - vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); - trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); - return kvm_skip_emulated_instruction(vcpu); - case 1: /*mov from cr*/ - switch (cr) { - case 3: - WARN_ON_ONCE(enable_unrestricted_guest); - val = kvm_read_cr3(vcpu); - kvm_register_write(vcpu, reg, val); - trace_kvm_cr_read(cr, val); - return kvm_skip_emulated_instruction(vcpu); - case 8: - val = kvm_get_cr8(vcpu); - kvm_register_write(vcpu, reg, val); - trace_kvm_cr_read(cr, val); - return kvm_skip_emulated_instruction(vcpu); - } - break; - case 3: /* lmsw */ - val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; - trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); - kvm_lmsw(vcpu, val); - - return kvm_skip_emulated_instruction(vcpu); - default: - break; - } - vcpu->run->exit_reason = 0; - vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", - (int)(exit_qualification >> 4) & 3, cr); - return 0; -} - -static int handle_dr(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification; - int dr, dr7, reg; - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - dr = exit_qualification & DEBUG_REG_ACCESS_NUM; - - /* First, if DR does not exist, trigger UD */ - if (!kvm_require_dr(vcpu, dr)) - return 1; - - /* Do not handle if the CPL > 0, will trigger GP on re-entry */ - if (!kvm_require_cpl(vcpu, 0)) - return 1; - dr7 = vmcs_readl(GUEST_DR7); - if (dr7 & DR7_GD) { - /* - * As the vm-exit takes precedence over the debug trap, we - * need to emulate the latter, either for the host or the - * guest debugging itself. - */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { - vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; - vcpu->run->debug.arch.dr7 = dr7; - vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); - vcpu->run->debug.arch.exception = DB_VECTOR; - vcpu->run->exit_reason = KVM_EXIT_DEBUG; - return 0; - } else { - vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= DR6_BD | DR6_RTM; - kvm_queue_exception(vcpu, DB_VECTOR); - return 1; - } - } - - if (vcpu->guest_debug == 0) { - vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_MOV_DR_EXITING); - - /* - * No more DR vmexits; force a reload of the debug registers - * and reenter on this instruction. The next vmexit will - * retrieve the full state of the debug registers. 
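handle_cr() above dispatches on fields packed into the exit qualification. A standalone sketch of that unpacking, using the same shifts as the handler (the access-type names are mine):

#include <stdio.h>

/* Mirrors handle_cr(): CR number in bits 3:0, access type in bits 5:4,
 * MOV source/destination register in bits 11:8, LMSW data at bit 16. */
static const char *access_names[] = { "mov to cr", "mov from cr", "clts", "lmsw" };

static void decode_cr_exit(unsigned long exit_qualification)
{
        int cr   = exit_qualification & 15;
        int type = (exit_qualification >> 4) & 3;
        int reg  = (exit_qualification >> 8) & 15;
        unsigned lmsw_src = (exit_qualification >> 16) & 0xf;

        printf("%s, cr%d, reg %d, lmsw nibble 0x%x\n",
               access_names[type], cr, reg, lmsw_src);
}

int main(void)
{
        decode_cr_exit((3u << 8) | (0u << 4) | 4);   /* hypothetical: mov reg3 to cr4 */
        return 0;
}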
- */ - vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; - return 1; - } - - reg = DEBUG_REG_ACCESS_REG(exit_qualification); - if (exit_qualification & TYPE_MOV_FROM_DR) { - unsigned long val; - - if (kvm_get_dr(vcpu, dr, &val)) - return 1; - kvm_register_write(vcpu, reg, val); - } else - if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) - return 1; - - return kvm_skip_emulated_instruction(vcpu); -} - -static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.dr6; -} - -static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) -{ -} - -static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) -{ - get_debugreg(vcpu->arch.db[0], 0); - get_debugreg(vcpu->arch.db[1], 1); - get_debugreg(vcpu->arch.db[2], 2); - get_debugreg(vcpu->arch.db[3], 3); - get_debugreg(vcpu->arch.dr6, 6); - vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); - - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; - vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING); -} - -static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) -{ - vmcs_writel(GUEST_DR7, val); -} - -static int handle_cpuid(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_cpuid(vcpu); -} - -static int handle_rdmsr(struct kvm_vcpu *vcpu) -{ - u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - struct msr_data msr_info; - - msr_info.index = ecx; - msr_info.host_initiated = false; - if (vmx_get_msr(vcpu, &msr_info)) { - trace_kvm_msr_read_ex(ecx); - kvm_inject_gp(vcpu, 0); - return 1; - } - - trace_kvm_msr_read(ecx, msr_info.data); - - /* FIXME: handling of bits 32:63 of rax, rdx */ - vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; - vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; - return kvm_skip_emulated_instruction(vcpu); -} - -static int handle_wrmsr(struct kvm_vcpu *vcpu) -{ - struct msr_data msr; - u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) - | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - - msr.data = data; - msr.index = ecx; - msr.host_initiated = false; - if (kvm_set_msr(vcpu, &msr) != 0) { - trace_kvm_msr_write_ex(ecx, data); - kvm_inject_gp(vcpu, 0); - return 1; - } - - trace_kvm_msr_write(ecx, data); - return kvm_skip_emulated_instruction(vcpu); -} - -static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) -{ - kvm_apic_update_ppr(vcpu); - return 1; -} - -static int handle_interrupt_window(struct kvm_vcpu *vcpu) -{ - vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_VIRTUAL_INTR_PENDING); - - kvm_make_request(KVM_REQ_EVENT, vcpu); - - ++vcpu->stat.irq_window_exits; - return 1; -} - -static int handle_halt(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_halt(vcpu); -} - -static int handle_vmcall(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_hypercall(vcpu); -} - -static int handle_invd(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; -} - -static int handle_invlpg(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - kvm_mmu_invlpg(vcpu, exit_qualification); - return kvm_skip_emulated_instruction(vcpu); -} - -static int handle_rdpmc(struct kvm_vcpu *vcpu) -{ - int err; - - err = kvm_rdpmc(vcpu); - return kvm_complete_insn_gp(vcpu, err); -} - -static int handle_wbinvd(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_wbinvd(vcpu); -} - -static int handle_xsetbv(struct kvm_vcpu *vcpu) -{ - u64 new_bv = kvm_read_edx_eax(vcpu); - u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); - - if (kvm_set_xcr(vcpu, index, new_bv) == 0) - return 
kvm_skip_emulated_instruction(vcpu); - return 1; -} - -static int handle_xsaves(struct kvm_vcpu *vcpu) -{ - kvm_skip_emulated_instruction(vcpu); - WARN(1, "this should never happen\n"); - return 1; -} - -static int handle_xrstors(struct kvm_vcpu *vcpu) -{ - kvm_skip_emulated_instruction(vcpu); - WARN(1, "this should never happen\n"); - return 1; -} - -static int handle_apic_access(struct kvm_vcpu *vcpu) -{ - if (likely(fasteoi)) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - int access_type, offset; - - access_type = exit_qualification & APIC_ACCESS_TYPE; - offset = exit_qualification & APIC_ACCESS_OFFSET; - /* - * Sane guest uses MOV to write EOI, with written value - * not cared. So make a short-circuit here by avoiding - * heavy instruction emulation. - */ - if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && - (offset == APIC_EOI)) { - kvm_lapic_set_eoi(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } - } - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; -} - -static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - int vector = exit_qualification & 0xff; - - /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ - kvm_apic_set_eoi_accelerated(vcpu, vector); - return 1; -} - -static int handle_apic_write(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 offset = exit_qualification & 0xfff; - - /* APIC-write VM exit is trap-like and thus no need to adjust IP */ - kvm_apic_write_nodecode(vcpu, offset); - return 1; -} - -static int handle_task_switch(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long exit_qualification; - bool has_error_code = false; - u32 error_code = 0; - u16 tss_selector; - int reason, type, idt_v, idt_index; - - idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); - idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); - type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - reason = (u32)exit_qualification >> 30; - if (reason == TASK_SWITCH_GATE && idt_v) { - switch (type) { - case INTR_TYPE_NMI_INTR: - vcpu->arch.nmi_injected = false; - vmx_set_nmi_mask(vcpu, true); - break; - case INTR_TYPE_EXT_INTR: - case INTR_TYPE_SOFT_INTR: - kvm_clear_interrupt_queue(vcpu); - break; - case INTR_TYPE_HARD_EXCEPTION: - if (vmx->idt_vectoring_info & - VECTORING_INFO_DELIVER_CODE_MASK) { - has_error_code = true; - error_code = - vmcs_read32(IDT_VECTORING_ERROR_CODE); - } - /* fall through */ - case INTR_TYPE_SOFT_EXCEPTION: - kvm_clear_exception_queue(vcpu); - break; - default: - break; - } - } - tss_selector = exit_qualification; - - if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && - type != INTR_TYPE_EXT_INTR && - type != INTR_TYPE_NMI_INTR)) - skip_emulated_instruction(vcpu); - - if (kvm_task_switch(vcpu, tss_selector, - type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, - has_error_code, error_code) == EMULATE_FAIL) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return 0; - } - - /* - * TODO: What about debug traps on tss switch? - * Are we supposed to inject them and update dr6? 
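handle_rdmsr() and handle_wrmsr() earlier in this hunk split and reassemble the 64-bit MSR value across EDX:EAX. A tiny round-trip sketch of that packing:

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

int main(void)
{
        uint64_t msr_value = 0x123456789abcdef0ull;

        /* WRMSR direction: the guest supplies the value in EDX:EAX. */
        uint32_t eax = msr_value & 0xffffffffu;
        uint32_t edx = msr_value >> 32;

        /* RDMSR direction: reassemble the halves, as handle_wrmsr() does. */
        uint64_t merged = (uint64_t)eax | ((uint64_t)edx << 32);

        assert(merged == msr_value);
        printf("eax=0x%x edx=0x%x merged=0x%llx\n",
               eax, edx, (unsigned long long)merged);
        return 0;
}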
- */ - - return 1; -} - -static int handle_ept_violation(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification; - gpa_t gpa; - u64 error_code; - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - /* - * EPT violation happened while executing iret from NMI, - * "blocked by NMI" bit has to be set before next VM entry. - * There are errata that may cause this bit to not be set: - * AAK134, BY25. - */ - if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && - enable_vnmi && - (exit_qualification & INTR_INFO_UNBLOCK_NMI)) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); - - gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); - trace_kvm_page_fault(gpa, exit_qualification); - - /* Is it a read fault? */ - error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) - ? PFERR_USER_MASK : 0; - /* Is it a write fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) - ? PFERR_WRITE_MASK : 0; - /* Is it a fetch fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) - ? PFERR_FETCH_MASK : 0; - /* ept page table entry is present? */ - error_code |= (exit_qualification & - (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE | - EPT_VIOLATION_EXECUTABLE)) - ? PFERR_PRESENT_MASK : 0; - - error_code |= (exit_qualification & 0x100) != 0 ? - PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; - - vcpu->arch.exit_qualification = exit_qualification; - return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); -} - -static int handle_ept_misconfig(struct kvm_vcpu *vcpu) -{ - gpa_t gpa; - - /* - * A nested guest cannot optimize MMIO vmexits, because we have an - * nGPA here instead of the required GPA. - */ - gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); - if (!is_guest_mode(vcpu) && - !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { - trace_kvm_fast_mmio(gpa); - /* - * Doing kvm_skip_emulated_instruction() depends on undefined - * behavior: Intel's manual doesn't mandate - * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG - * occurs and while on real hardware it was observed to be set, - * other hypervisors (namely Hyper-V) don't set it, we end up - * advancing IP with some random value. Disable fast mmio when - * running nested and keep it for real hardware in hope that - * VM_EXIT_INSTRUCTION_LEN will always be set correctly. - */ - if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) - return kvm_skip_emulated_instruction(vcpu); - else - return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) == - EMULATE_DONE; - } - - return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); -} - -static int handle_nmi_window(struct kvm_vcpu *vcpu) -{ - WARN_ON_ONCE(!enable_vnmi); - vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, - CPU_BASED_VIRTUAL_NMI_PENDING); - ++vcpu->stat.nmi_window_exits; - kvm_make_request(KVM_REQ_EVENT, vcpu); - - return 1; -} - -static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - enum emulation_result err = EMULATE_DONE; - int ret = 1; - u32 cpu_exec_ctrl; - bool intr_window_requested; - unsigned count = 130; - - /* - * We should never reach the point where we are emulating L2 - * due to invalid guest state as that means we incorrectly - * allowed a nested VMEntry with an invalid vmcs12. 
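handle_ept_violation() above translates exit-qualification bits into page-fault error-code bits before handing the fault to the MMU. A standalone sketch of the same translation; the constants are local stand-ins chosen for illustration, mirroring the handler's logic rather than the kernel's EPT_VIOLATION_* and PFERR_* headers:

#include <stdint.h>
#include <stdio.h>

#define ACC_READ    (1u << 0)   /* access was a data read */
#define ACC_WRITE   (1u << 1)   /* access was a data write */
#define ACC_INSTR   (1u << 2)   /* access was an instruction fetch */
#define ENTRY_RWX   (7u << 3)   /* EPT entry had any of R/W/X set */

#define PF_PRESENT  (1u << 0)
#define PF_WRITE    (1u << 1)
#define PF_USER     (1u << 2)
#define PF_FETCH    (1u << 4)

static uint64_t ept_qual_to_pferr(uint64_t qual)
{
        uint64_t err = 0;

        err |= (qual & ACC_READ)  ? PF_USER  : 0;
        err |= (qual & ACC_WRITE) ? PF_WRITE : 0;
        err |= (qual & ACC_INSTR) ? PF_FETCH : 0;
        err |= (qual & ENTRY_RWX) ? PF_PRESENT : 0;   /* entry was present */
        return err;
}

int main(void)
{
        /* hypothetical: write to a page whose EPT entry is readable only */
        printf("pferr = 0x%llx\n",
               (unsigned long long)ept_qual_to_pferr(ACC_WRITE | (1u << 3)));
        return 0;
}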
- */ - WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending); - - cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; - - while (vmx->emulation_required && count-- != 0) { - if (intr_window_requested && vmx_interrupt_allowed(vcpu)) - return handle_interrupt_window(&vmx->vcpu); - - if (kvm_test_request(KVM_REQ_EVENT, vcpu)) - return 1; - - err = kvm_emulate_instruction(vcpu, 0); - - if (err == EMULATE_USER_EXIT) { - ++vcpu->stat.mmio_exits; - ret = 0; - goto out; - } - - if (err != EMULATE_DONE) - goto emulation_error; - - if (vmx->emulation_required && !vmx->rmode.vm86_active && - vcpu->arch.exception.pending) - goto emulation_error; - - if (vcpu->arch.halt_request) { - vcpu->arch.halt_request = 0; - ret = kvm_vcpu_halt(vcpu); - goto out; - } - - if (signal_pending(current)) - goto out; - if (need_resched()) - schedule(); - } - -out: - return ret; - -emulation_error: - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return 0; -} - -static void grow_ple_window(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int old = vmx->ple_window; - - vmx->ple_window = __grow_ple_window(old, ple_window, - ple_window_grow, - ple_window_max); - - if (vmx->ple_window != old) - vmx->ple_window_dirty = true; - - trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old); -} - -static void shrink_ple_window(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int old = vmx->ple_window; - - vmx->ple_window = __shrink_ple_window(old, ple_window, - ple_window_shrink, - ple_window); - - if (vmx->ple_window != old) - vmx->ple_window_dirty = true; - - trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old); -} - -/* - * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. - */ -static void wakeup_handler(void) -{ - struct kvm_vcpu *vcpu; - int cpu = smp_processor_id(); - - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), - blocked_vcpu_list) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (pi_test_on(pi_desc) == 1) - kvm_vcpu_kick(vcpu); - } - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); -} - -static void vmx_enable_tdp(void) -{ - kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, - enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, - enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK, - cpu_has_vmx_ept_execute_only() ? 
0ull : VMX_EPT_READABLE_MASK, - VMX_EPT_RWX_MASK, 0ull); - - ept_set_mmio_spte_mask(); - kvm_enable_tdp(); -} - -static __init int hardware_setup(void) -{ - unsigned long host_bndcfgs; - int r = -ENOMEM, i; - - rdmsrl_safe(MSR_EFER, &host_efer); - - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) - kvm_define_shared_msr(i, vmx_msr_index[i]); - - for (i = 0; i < VMX_BITMAP_NR; i++) { - vmx_bitmap[i] = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_bitmap[i]) - goto out; - } - - memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); - memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); - - if (setup_vmcs_config(&vmcs_config) < 0) { - r = -EIO; - goto out; - } - - if (boot_cpu_has(X86_FEATURE_NX)) - kvm_enable_efer_bits(EFER_NX); - - if (boot_cpu_has(X86_FEATURE_MPX)) { - rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); - WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); - } - - if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || - !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) - enable_vpid = 0; - - if (!cpu_has_vmx_ept() || - !cpu_has_vmx_ept_4levels() || - !cpu_has_vmx_ept_mt_wb() || - !cpu_has_vmx_invept_global()) - enable_ept = 0; - - if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) - enable_ept_ad_bits = 0; - - if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) - enable_unrestricted_guest = 0; - - if (!cpu_has_vmx_flexpriority()) - flexpriority_enabled = 0; - - if (!cpu_has_virtual_nmis()) - enable_vnmi = 0; - - /* - * set_apic_access_page_addr() is used to reload apic access - * page upon invalidation. No need to do anything if not - * using the APIC_ACCESS_ADDR VMCS field. - */ - if (!flexpriority_enabled) - kvm_x86_ops->set_apic_access_page_addr = NULL; - - if (!cpu_has_vmx_tpr_shadow()) - kvm_x86_ops->update_cr8_intercept = NULL; - - if (enable_ept && !cpu_has_vmx_ept_2m_page()) - kvm_disable_largepages(); - -#if IS_ENABLED(CONFIG_HYPERV) - if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH - && enable_ept) - kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb; -#endif - - if (!cpu_has_vmx_ple()) { - ple_gap = 0; - ple_window = 0; - ple_window_grow = 0; - ple_window_max = 0; - ple_window_shrink = 0; - } - - if (!cpu_has_vmx_apicv()) { - enable_apicv = 0; - kvm_x86_ops->sync_pir_to_irr = NULL; - } - - if (cpu_has_vmx_tsc_scaling()) { - kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; - kvm_tsc_scaling_ratio_frac_bits = 48; - } - - set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ - - if (enable_ept) - vmx_enable_tdp(); - else - kvm_disable_tdp(); - - if (!nested) { - kvm_x86_ops->get_nested_state = NULL; - kvm_x86_ops->set_nested_state = NULL; - } - - /* - * Only enable PML when hardware supports PML feature, and both EPT - * and EPT A/D bit features are enabled -- PML depends on them to work. 
- */ - if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) - enable_pml = 0; - - if (!enable_pml) { - kvm_x86_ops->slot_enable_log_dirty = NULL; - kvm_x86_ops->slot_disable_log_dirty = NULL; - kvm_x86_ops->flush_log_dirty = NULL; - kvm_x86_ops->enable_log_dirty_pt_masked = NULL; - } - - if (!cpu_has_vmx_preemption_timer()) - kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; - - if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { - u64 vmx_msr; - - rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); - cpu_preemption_timer_multi = - vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; - } else { - kvm_x86_ops->set_hv_timer = NULL; - kvm_x86_ops->cancel_hv_timer = NULL; - } - - if (!cpu_has_vmx_shadow_vmcs()) - enable_shadow_vmcs = 0; - if (enable_shadow_vmcs) - init_vmcs_shadow_fields(); - - kvm_set_posted_intr_wakeup_handler(wakeup_handler); - nested_vmx_setup_ctls_msrs(&vmcs_config.nested, enable_apicv); - - kvm_mce_cap_supported |= MCG_LMCE_P; - - return alloc_kvm_area(); - -out: - for (i = 0; i < VMX_BITMAP_NR; i++) - free_page((unsigned long)vmx_bitmap[i]); - - return r; -} - -static __exit void hardware_unsetup(void) -{ - int i; - - for (i = 0; i < VMX_BITMAP_NR; i++) - free_page((unsigned long)vmx_bitmap[i]); - - free_kvm_area(); -} - -/* - * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE - * exiting, so only get here on cpu with PAUSE-Loop-Exiting. - */ -static int handle_pause(struct kvm_vcpu *vcpu) -{ - if (!kvm_pause_in_guest(vcpu->kvm)) - grow_ple_window(vcpu); - - /* - * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting" - * VM-execution control is ignored if CPL > 0. OTOH, KVM - * never set PAUSE_EXITING and just set PLE if supported, - * so the vcpu must be CPL=0 if it gets a PAUSE exit. - */ - kvm_vcpu_on_spin(vcpu, true); - return kvm_skip_emulated_instruction(vcpu); -} - -static int handle_nop(struct kvm_vcpu *vcpu) -{ - return kvm_skip_emulated_instruction(vcpu); -} - -static int handle_mwait(struct kvm_vcpu *vcpu) -{ - printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); - return handle_nop(vcpu); -} - -static int handle_invalid_op(struct kvm_vcpu *vcpu) -{ - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; -} - -static int handle_monitor_trap(struct kvm_vcpu *vcpu) -{ - return 1; -} - -static int handle_monitor(struct kvm_vcpu *vcpu) -{ - printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); - return handle_nop(vcpu); -} - -/* - * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), - * set the success or error code of an emulated VMX instruction (as specified - * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated - * instruction. - */ -static int nested_vmx_succeed(struct kvm_vcpu *vcpu) -{ - vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | - X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); - return kvm_skip_emulated_instruction(vcpu); -} - -static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) -{ - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | - X86_EFLAGS_SF | X86_EFLAGS_OF)) - | X86_EFLAGS_CF); - return kvm_skip_emulated_instruction(vcpu); -} - -static int nested_vmx_failValid(struct kvm_vcpu *vcpu, - u32 vm_instruction_error) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * failValid writes the error number to the current VMCS, which - * can't be done if there isn't a current VMCS. 
- */ - if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) - return nested_vmx_failInvalid(vcpu); - - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | - X86_EFLAGS_SF | X86_EFLAGS_OF)) - | X86_EFLAGS_ZF); - get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; - /* - * We don't need to force a shadow sync because - * VM_INSTRUCTION_ERROR is not shadowed - */ - return kvm_skip_emulated_instruction(vcpu); -} - -static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) -{ - /* TODO: not to reset guest simply here. */ - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); -} - -static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) -{ - struct vcpu_vmx *vmx = - container_of(timer, struct vcpu_vmx, nested.preemption_timer); - - vmx->nested.preemption_timer_expired = true; - kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); - kvm_vcpu_kick(&vmx->vcpu); - - return HRTIMER_NORESTART; -} - -/* - * Decode the memory-address operand of a vmx instruction, as recorded on an - * exit caused by such an instruction (run by a guest hypervisor). - * On success, returns 0. When the operand is invalid, returns 1 and throws - * #UD or #GP. - */ -static int get_vmx_mem_address(struct kvm_vcpu *vcpu, - unsigned long exit_qualification, - u32 vmx_instruction_info, bool wr, gva_t *ret) -{ - gva_t off; - bool exn; - struct kvm_segment s; - - /* - * According to Vol. 3B, "Information for VM Exits Due to Instruction - * Execution", on an exit, vmx_instruction_info holds most of the - * addressing components of the operand. Only the displacement part - * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). - * For how an actual address is calculated from all these components, - * refer to Vol. 1, "Operand Addressing". - */ - int scaling = vmx_instruction_info & 3; - int addr_size = (vmx_instruction_info >> 7) & 7; - bool is_reg = vmx_instruction_info & (1u << 10); - int seg_reg = (vmx_instruction_info >> 15) & 7; - int index_reg = (vmx_instruction_info >> 18) & 0xf; - bool index_is_valid = !(vmx_instruction_info & (1u << 22)); - int base_reg = (vmx_instruction_info >> 23) & 0xf; - bool base_is_valid = !(vmx_instruction_info & (1u << 27)); - - if (is_reg) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - /* Addr = segment_base + offset */ - /* offset = base + [index * scale] + displacement */ - off = exit_qualification; /* holds the displacement */ - if (base_is_valid) - off += kvm_register_read(vcpu, base_reg); - if (index_is_valid) - off += kvm_register_read(vcpu, index_reg)<<scaling; - vmx_get_segment(vcpu, &s, seg_reg); - *ret = s.base + off; - - if (addr_size == 1) /* 32 bit */ - *ret &= 0xffffffff; - - /* Checks for #GP/#SS exceptions. */ - exn = false; - if (is_long_mode(vcpu)) { - /* Long mode: #GP(0)/#SS(0) if the memory address is in a - * non-canonical form. This is the only check on the memory - * destination for long mode! - */ - exn = is_noncanonical_address(*ret, vcpu); - } else if (is_protmode(vcpu)) { - /* Protected mode: apply checks for segment validity in the - * following order: - * - segment type check (#GP(0) may be thrown) - * - usability check (#GP(0)/#SS(0)) - * - limit check (#GP(0)/#SS(0)) - */ - if (wr) - /* #GP(0) if the destination operand is located in a - * read-only data segment or any code segment. 
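nested_vmx_succeed(), nested_vmx_failInvalid() and nested_vmx_failValid() above encode the VMX "Conventions" in RFLAGS: success clears CF/PF/AF/ZF/SF/OF, VMfailInvalid sets only CF, and VMfailValid sets only ZF while storing an error number in the current VMCS. A small sketch of the same flag arithmetic on a plain rflags value (the bit positions are the standard x86 flag bits, written out locally):

#include <stdint.h>
#include <stdio.h>

#define FL_CF (1u << 0)
#define FL_PF (1u << 2)
#define FL_AF (1u << 4)
#define FL_ZF (1u << 6)
#define FL_SF (1u << 7)
#define FL_OF (1u << 11)
#define FL_ALL (FL_CF | FL_PF | FL_AF | FL_ZF | FL_SF | FL_OF)

static uint64_t vmx_succeed(uint64_t rflags)      { return rflags & ~FL_ALL; }
static uint64_t vmx_fail_invalid(uint64_t rflags) { return (rflags & ~FL_ALL) | FL_CF; }
static uint64_t vmx_fail_valid(uint64_t rflags)   { return (rflags & ~FL_ALL) | FL_ZF; }

int main(void)
{
        uint64_t rflags = 0x246;   /* hypothetical starting value */
        printf("succeed: 0x%llx  failInvalid: 0x%llx  failValid: 0x%llx\n",
               (unsigned long long)vmx_succeed(rflags),
               (unsigned long long)vmx_fail_invalid(rflags),
               (unsigned long long)vmx_fail_valid(rflags));
        return 0;
}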
- */ - exn = ((s.type & 0xa) == 0 || (s.type & 8)); - else - /* #GP(0) if the source operand is located in an - * execute-only code segment - */ - exn = ((s.type & 0xa) == 8); - if (exn) { - kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return 1; - } - /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. - */ - exn = (s.unusable != 0); - /* Protected mode: #GP(0)/#SS(0) if the memory - * operand is outside the segment limit. - */ - exn = exn || (off + sizeof(u64) > s.limit); - } - if (exn) { - kvm_queue_exception_e(vcpu, - seg_reg == VCPU_SREG_SS ? - SS_VECTOR : GP_VECTOR, - 0); - return 1; - } - - return 0; -} - -static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) -{ - gva_t gva; - struct x86_exception e; - - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva)) - return 1; - - if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - - return 0; -} - -/* - * Allocate a shadow VMCS and associate it with the currently loaded - * VMCS, unless such a shadow VMCS already exists. The newly allocated - * VMCS is also VMCLEARed, so that it is ready for use. - */ -static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; - - /* - * We should allocate a shadow vmcs for vmcs01 only when L1 - * executes VMXON and free it when L1 executes VMXOFF. - * As it is invalid to execute VMXON twice, we shouldn't reach - * here when vmcs01 already have an allocated shadow vmcs. - */ - WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); - - if (!loaded_vmcs->shadow_vmcs) { - loaded_vmcs->shadow_vmcs = alloc_vmcs(true); - if (loaded_vmcs->shadow_vmcs) - vmcs_clear(loaded_vmcs->shadow_vmcs); - } - return loaded_vmcs->shadow_vmcs; -} - -static int enter_vmx_operation(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int r; - - r = alloc_loaded_vmcs(&vmx->nested.vmcs02); - if (r < 0) - goto out_vmcs02; - - vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); - if (!vmx->nested.cached_vmcs12) - goto out_cached_vmcs12; - - vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); - if (!vmx->nested.cached_shadow_vmcs12) - goto out_cached_shadow_vmcs12; - - if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) - goto out_shadow_vmcs; - - hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED); - vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; - - vmx->nested.vpid02 = allocate_vpid(); - - vmx->nested.vmcs02_initialized = false; - vmx->nested.vmxon = true; - return 0; - -out_shadow_vmcs: - kfree(vmx->nested.cached_shadow_vmcs12); - -out_cached_shadow_vmcs12: - kfree(vmx->nested.cached_vmcs12); - -out_cached_vmcs12: - free_loaded_vmcs(&vmx->nested.vmcs02); - -out_vmcs02: - return -ENOMEM; -} - -/* - * Emulate the VMXON instruction. - * Currently, we just remember that VMX is active, and do not save or even - * inspect the argument to VMXON (the so-called "VMXON pointer") because we - * do not currently need to store anything in that guest-allocated memory - * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their - * argument is different from the VMXON pointer (which the spec says they do). 
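get_vmx_mem_address() above pulls the operand's addressing components out of the VMX-instruction-information field. A standalone sketch of that unpacking, using the same shifts as the function (the address-size legend beyond the 32-bit case is my reading of the SDM):

#include <stdint.h>
#include <stdio.h>

/* Mirrors the field extraction in get_vmx_mem_address(). */
static void decode_vmx_insn_info(uint32_t info)
{
        int scaling     = info & 3;
        int addr_size   = (info >> 7) & 7;    /* 0=16-bit, 1=32-bit, 2=64-bit */
        int is_reg      = (info >> 10) & 1;
        int seg_reg     = (info >> 15) & 7;
        int index_reg   = (info >> 18) & 0xf;
        int index_valid = !((info >> 22) & 1);
        int base_reg    = (info >> 23) & 0xf;
        int base_valid  = !((info >> 27) & 1);

        printf("scale=%d addr_size=%d reg-operand=%d seg=%d "
               "index=%d(%s) base=%d(%s)\n",
               1 << scaling, addr_size, is_reg, seg_reg,
               index_reg, index_valid ? "valid" : "invalid",
               base_reg, base_valid ? "valid" : "invalid");
}

int main(void)
{
        decode_vmx_insn_info((1u << 23) | (1u << 22) | (2u << 7));  /* hypothetical encoding */
        return 0;
}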
- */ -static int handle_vmon(struct kvm_vcpu *vcpu) -{ - int ret; - gpa_t vmptr; - struct page *page; - struct vcpu_vmx *vmx = to_vmx(vcpu); - const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED - | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - - /* - * The Intel VMX Instruction Reference lists a bunch of bits that are - * prerequisite to running VMXON, most notably cr4.VMXE must be set to - * 1 (see vmx_set_cr4() for when we allow the guest to set this). - * Otherwise, we should fail with #UD. But most faulting conditions - * have already been checked by hardware, prior to the VM-exit for - * VMXON. We do test guest cr4.VMXE because processor CR4 always has - * that bit set to 1 in non-root mode. - */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - /* CPL=0 must be checked manually. */ - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - - if (vmx->nested.vmxon) - return nested_vmx_failValid(vcpu, - VMXERR_VMXON_IN_VMX_ROOT_OPERATION); - - if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) - != VMXON_NEEDED_FEATURES) { - kvm_inject_gp(vcpu, 0); - return 1; - } - - if (nested_vmx_get_vmptr(vcpu, &vmptr)) - return 1; - - /* - * SDM 3: 24.11.5 - * The first 4 bytes of VMXON region contain the supported - * VMCS revision identifier - * - * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; - * which replaces physical address width with 32 - */ - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) - return nested_vmx_failInvalid(vcpu); - - page = kvm_vcpu_gpa_to_page(vcpu, vmptr); - if (is_error_page(page)) - return nested_vmx_failInvalid(vcpu); - - if (*(u32 *)kmap(page) != VMCS12_REVISION) { - kunmap(page); - kvm_release_page_clean(page); - return nested_vmx_failInvalid(vcpu); - } - kunmap(page); - kvm_release_page_clean(page); - - vmx->nested.vmxon_ptr = vmptr; - ret = enter_vmx_operation(vcpu); - if (ret) - return ret; - - return nested_vmx_succeed(vcpu); -} - -/* - * Intel's VMX Instruction Reference specifies a common set of prerequisites - * for running VMX instructions (except VMXON, whose prerequisites are - * slightly different). It also specifies what exception to inject otherwise. - * Note that many of these exceptions have priority over VM exits, so they - * don't have to be checked again here. 
- */ -static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) -{ - if (!to_vmx(vcpu)->nested.vmxon) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 0; - } - - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); - return 0; - } - - return 1; -} - -static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) -{ - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); - vmcs_write64(VMCS_LINK_POINTER, -1ull); -} - -static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!vmx->nested.hv_evmcs) - return; - - kunmap(vmx->nested.hv_evmcs_page); - kvm_release_page_dirty(vmx->nested.hv_evmcs_page); - vmx->nested.hv_evmcs_vmptr = -1ull; - vmx->nested.hv_evmcs_page = NULL; - vmx->nested.hv_evmcs = NULL; -} - -static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (vmx->nested.current_vmptr == -1ull) - return; - - if (enable_shadow_vmcs) { - /* copy to memory all shadowed fields in case - they were modified */ - copy_shadow_to_vmcs12(vmx); - vmx->nested.need_vmcs12_sync = false; - vmx_disable_shadow_vmcs(vmx); - } - vmx->nested.posted_intr_nv = -1; - - /* Flush VMCS12 to guest memory */ - kvm_vcpu_write_guest_page(vcpu, - vmx->nested.current_vmptr >> PAGE_SHIFT, - vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); - - kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); - - vmx->nested.current_vmptr = -1ull; -} - -/* - * Free whatever needs to be freed from vmx->nested when L1 goes down, or - * just stops using VMX. - */ -static void free_nested(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) - return; - - vmx->nested.vmxon = false; - vmx->nested.smm.vmxon = false; - free_vpid(vmx->nested.vpid02); - vmx->nested.posted_intr_nv = -1; - vmx->nested.current_vmptr = -1ull; - if (enable_shadow_vmcs) { - vmx_disable_shadow_vmcs(vmx); - vmcs_clear(vmx->vmcs01.shadow_vmcs); - free_vmcs(vmx->vmcs01.shadow_vmcs); - vmx->vmcs01.shadow_vmcs = NULL; - } - kfree(vmx->nested.cached_vmcs12); - kfree(vmx->nested.cached_shadow_vmcs12); - /* Unpin physical memory we referred to in the vmcs02 */ - if (vmx->nested.apic_access_page) { - kvm_release_page_dirty(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } - if (vmx->nested.virtual_apic_page) { - kvm_release_page_dirty(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = NULL; - } - if (vmx->nested.pi_desc_page) { - kunmap(vmx->nested.pi_desc_page); - kvm_release_page_dirty(vmx->nested.pi_desc_page); - vmx->nested.pi_desc_page = NULL; - vmx->nested.pi_desc = NULL; - } - - kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); - - nested_release_evmcs(vcpu); - - free_loaded_vmcs(&vmx->nested.vmcs02); -} - -/* Emulate the VMXOFF instruction */ -static int handle_vmoff(struct kvm_vcpu *vcpu) -{ - if (!nested_vmx_check_permission(vcpu)) - return 1; - free_nested(vcpu); - return nested_vmx_succeed(vcpu); -} - -/* Emulate the VMCLEAR instruction */ -static int handle_vmclear(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 zero = 0; - gpa_t vmptr; - - if (!nested_vmx_check_permission(vcpu)) - return 1; - - if (nested_vmx_get_vmptr(vcpu, &vmptr)) - return 1; - - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) - return nested_vmx_failValid(vcpu, - VMXERR_VMCLEAR_INVALID_ADDRESS); - - if (vmptr == vmx->nested.vmxon_ptr) - return nested_vmx_failValid(vcpu, - 
VMXERR_VMCLEAR_VMXON_POINTER); - - if (vmx->nested.hv_evmcs_page) { - if (vmptr == vmx->nested.hv_evmcs_vmptr) - nested_release_evmcs(vcpu); - } else { - if (vmptr == vmx->nested.current_vmptr) - nested_release_vmcs12(vcpu); - - kvm_vcpu_write_guest(vcpu, - vmptr + offsetof(struct vmcs12, - launch_state), - &zero, sizeof(zero)); - } - - return nested_vmx_succeed(vcpu); -} - -static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); - -/* Emulate the VMLAUNCH instruction */ -static int handle_vmlaunch(struct kvm_vcpu *vcpu) -{ - return nested_vmx_run(vcpu, true); -} - -/* Emulate the VMRESUME instruction */ -static int handle_vmresume(struct kvm_vcpu *vcpu) -{ - - return nested_vmx_run(vcpu, false); -} - -/* - * Read a vmcs12 field. Since these can have varying lengths and we return - * one type, we chose the biggest type (u64) and zero-extend the return value - * to that size. Note that the caller, handle_vmread, might need to use only - * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of - * 64-bit fields are to be returned). - */ -static inline int vmcs12_read_any(struct vmcs12 *vmcs12, - unsigned long field, u64 *ret) -{ - short offset = vmcs_field_to_offset(field); - char *p; - - if (offset < 0) - return offset; - - p = (char *)vmcs12 + offset; - - switch (vmcs_field_width(field)) { - case VMCS_FIELD_WIDTH_NATURAL_WIDTH: - *ret = *((natural_width *)p); - return 0; - case VMCS_FIELD_WIDTH_U16: - *ret = *((u16 *)p); - return 0; - case VMCS_FIELD_WIDTH_U32: - *ret = *((u32 *)p); - return 0; - case VMCS_FIELD_WIDTH_U64: - *ret = *((u64 *)p); - return 0; - default: - WARN_ON(1); - return -ENOENT; - } -} - - -static inline int vmcs12_write_any(struct vmcs12 *vmcs12, - unsigned long field, u64 field_value){ - short offset = vmcs_field_to_offset(field); - char *p = (char *)vmcs12 + offset; - if (offset < 0) - return offset; - - switch (vmcs_field_width(field)) { - case VMCS_FIELD_WIDTH_U16: - *(u16 *)p = field_value; - return 0; - case VMCS_FIELD_WIDTH_U32: - *(u32 *)p = field_value; - return 0; - case VMCS_FIELD_WIDTH_U64: - *(u64 *)p = field_value; - return 0; - case VMCS_FIELD_WIDTH_NATURAL_WIDTH: - *(natural_width *)p = field_value; - return 0; - default: - WARN_ON(1); - return -ENOENT; - } - -} - -static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) -{ - struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; - struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; - - /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ - vmcs12->tpr_threshold = evmcs->tpr_threshold; - vmcs12->guest_rip = evmcs->guest_rip; - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { - vmcs12->guest_rsp = evmcs->guest_rsp; - vmcs12->guest_rflags = evmcs->guest_rflags; - vmcs12->guest_interruptibility_info = - evmcs->guest_interruptibility_info; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { - vmcs12->cpu_based_vm_exec_control = - evmcs->cpu_based_vm_exec_control; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { - vmcs12->exception_bitmap = evmcs->exception_bitmap; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { - vmcs12->vm_entry_controls = evmcs->vm_entry_controls; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { - vmcs12->vm_entry_intr_info_field = - evmcs->vm_entry_intr_info_field; - vmcs12->vm_entry_exception_error_code = - 
evmcs->vm_entry_exception_error_code; - vmcs12->vm_entry_instruction_len = - evmcs->vm_entry_instruction_len; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { - vmcs12->host_ia32_pat = evmcs->host_ia32_pat; - vmcs12->host_ia32_efer = evmcs->host_ia32_efer; - vmcs12->host_cr0 = evmcs->host_cr0; - vmcs12->host_cr3 = evmcs->host_cr3; - vmcs12->host_cr4 = evmcs->host_cr4; - vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; - vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; - vmcs12->host_rip = evmcs->host_rip; - vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; - vmcs12->host_es_selector = evmcs->host_es_selector; - vmcs12->host_cs_selector = evmcs->host_cs_selector; - vmcs12->host_ss_selector = evmcs->host_ss_selector; - vmcs12->host_ds_selector = evmcs->host_ds_selector; - vmcs12->host_fs_selector = evmcs->host_fs_selector; - vmcs12->host_gs_selector = evmcs->host_gs_selector; - vmcs12->host_tr_selector = evmcs->host_tr_selector; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { - vmcs12->pin_based_vm_exec_control = - evmcs->pin_based_vm_exec_control; - vmcs12->vm_exit_controls = evmcs->vm_exit_controls; - vmcs12->secondary_vm_exec_control = - evmcs->secondary_vm_exec_control; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { - vmcs12->io_bitmap_a = evmcs->io_bitmap_a; - vmcs12->io_bitmap_b = evmcs->io_bitmap_b; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { - vmcs12->msr_bitmap = evmcs->msr_bitmap; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { - vmcs12->guest_es_base = evmcs->guest_es_base; - vmcs12->guest_cs_base = evmcs->guest_cs_base; - vmcs12->guest_ss_base = evmcs->guest_ss_base; - vmcs12->guest_ds_base = evmcs->guest_ds_base; - vmcs12->guest_fs_base = evmcs->guest_fs_base; - vmcs12->guest_gs_base = evmcs->guest_gs_base; - vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; - vmcs12->guest_tr_base = evmcs->guest_tr_base; - vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; - vmcs12->guest_idtr_base = evmcs->guest_idtr_base; - vmcs12->guest_es_limit = evmcs->guest_es_limit; - vmcs12->guest_cs_limit = evmcs->guest_cs_limit; - vmcs12->guest_ss_limit = evmcs->guest_ss_limit; - vmcs12->guest_ds_limit = evmcs->guest_ds_limit; - vmcs12->guest_fs_limit = evmcs->guest_fs_limit; - vmcs12->guest_gs_limit = evmcs->guest_gs_limit; - vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; - vmcs12->guest_tr_limit = evmcs->guest_tr_limit; - vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; - vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; - vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; - vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; - vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; - vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; - vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; - vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; - vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; - vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; - vmcs12->guest_es_selector = evmcs->guest_es_selector; - vmcs12->guest_cs_selector = evmcs->guest_cs_selector; - vmcs12->guest_ss_selector = evmcs->guest_ss_selector; - vmcs12->guest_ds_selector = evmcs->guest_ds_selector; - vmcs12->guest_fs_selector = evmcs->guest_fs_selector; - vmcs12->guest_gs_selector = evmcs->guest_gs_selector; - 
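copy_enlightened_to_vmcs12() above copies each group of fields only when that group's bit is clear in evmcs->hv_clean_fields, i.e. only when the group may have been touched since the last sync. A minimal sketch of that dirty-group pattern with two made-up groups and field names:

#include <stdint.h>
#include <stdio.h>

/* Made-up clean-field bits for illustration; the real enlightened VMCS
 * defines one bit per group of fields. */
#define CLEAN_GROUP_A (1u << 0)
#define CLEAN_GROUP_B (1u << 1)

struct cache  { uint32_t a, b; };
struct shared { uint32_t a, b; uint32_t clean_fields; };

static void sync_dirty_groups(struct cache *dst, const struct shared *src)
{
        if (!(src->clean_fields & CLEAN_GROUP_A))
                dst->a = src->a;        /* group A may have changed: copy it */
        if (!(src->clean_fields & CLEAN_GROUP_B))
                dst->b = src->b;        /* group B is skipped when marked clean */
}

int main(void)
{
        struct cache  cache  = { .a = 1,  .b = 2 };
        struct shared shared = { .a = 10, .b = 20, .clean_fields = CLEAN_GROUP_B };

        sync_dirty_groups(&cache, &shared);
        printf("a=%u (copied) b=%u (kept, group marked clean)\n", cache.a, cache.b);
        return 0;
}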
vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; - vmcs12->guest_tr_selector = evmcs->guest_tr_selector; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { - vmcs12->tsc_offset = evmcs->tsc_offset; - vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; - vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { - vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; - vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; - vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; - vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; - vmcs12->guest_cr0 = evmcs->guest_cr0; - vmcs12->guest_cr3 = evmcs->guest_cr3; - vmcs12->guest_cr4 = evmcs->guest_cr4; - vmcs12->guest_dr7 = evmcs->guest_dr7; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { - vmcs12->host_fs_base = evmcs->host_fs_base; - vmcs12->host_gs_base = evmcs->host_gs_base; - vmcs12->host_tr_base = evmcs->host_tr_base; - vmcs12->host_gdtr_base = evmcs->host_gdtr_base; - vmcs12->host_idtr_base = evmcs->host_idtr_base; - vmcs12->host_rsp = evmcs->host_rsp; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { - vmcs12->ept_pointer = evmcs->ept_pointer; - vmcs12->virtual_processor_id = evmcs->virtual_processor_id; - } - - if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { - vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; - vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; - vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; - vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; - vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; - vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; - vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; - vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; - vmcs12->guest_pending_dbg_exceptions = - evmcs->guest_pending_dbg_exceptions; - vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; - vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; - vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; - vmcs12->guest_activity_state = evmcs->guest_activity_state; - vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; - } - - /* - * Not used? 
- * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; - * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; - * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; - * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0; - * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1; - * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2; - * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3; - * vmcs12->page_fault_error_code_mask = - * evmcs->page_fault_error_code_mask; - * vmcs12->page_fault_error_code_match = - * evmcs->page_fault_error_code_match; - * vmcs12->cr3_target_count = evmcs->cr3_target_count; - * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; - * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; - * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; - */ - - /* - * Read only fields: - * vmcs12->guest_physical_address = evmcs->guest_physical_address; - * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; - * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; - * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; - * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; - * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; - * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; - * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; - * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; - * vmcs12->exit_qualification = evmcs->exit_qualification; - * vmcs12->guest_linear_address = evmcs->guest_linear_address; - * - * Not present in struct vmcs12: - * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; - * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; - * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; - * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; - */ - - return 0; -} - -static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) -{ - struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; - struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; - - /* - * Should not be changed by KVM: - * - * evmcs->host_es_selector = vmcs12->host_es_selector; - * evmcs->host_cs_selector = vmcs12->host_cs_selector; - * evmcs->host_ss_selector = vmcs12->host_ss_selector; - * evmcs->host_ds_selector = vmcs12->host_ds_selector; - * evmcs->host_fs_selector = vmcs12->host_fs_selector; - * evmcs->host_gs_selector = vmcs12->host_gs_selector; - * evmcs->host_tr_selector = vmcs12->host_tr_selector; - * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; - * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; - * evmcs->host_cr0 = vmcs12->host_cr0; - * evmcs->host_cr3 = vmcs12->host_cr3; - * evmcs->host_cr4 = vmcs12->host_cr4; - * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; - * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; - * evmcs->host_rip = vmcs12->host_rip; - * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; - * evmcs->host_fs_base = vmcs12->host_fs_base; - * evmcs->host_gs_base = vmcs12->host_gs_base; - * evmcs->host_tr_base = vmcs12->host_tr_base; - * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; - * evmcs->host_idtr_base = vmcs12->host_idtr_base; - * evmcs->host_rsp = vmcs12->host_rsp; - * sync_vmcs12() doesn't read these: - * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; - * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; - * evmcs->msr_bitmap = vmcs12->msr_bitmap; - * evmcs->ept_pointer = vmcs12->ept_pointer; - * 
evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; - * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; - * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; - * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; - * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0; - * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1; - * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2; - * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3; - * evmcs->tpr_threshold = vmcs12->tpr_threshold; - * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; - * evmcs->exception_bitmap = vmcs12->exception_bitmap; - * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; - * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; - * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; - * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; - * evmcs->page_fault_error_code_mask = - * vmcs12->page_fault_error_code_mask; - * evmcs->page_fault_error_code_match = - * vmcs12->page_fault_error_code_match; - * evmcs->cr3_target_count = vmcs12->cr3_target_count; - * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; - * evmcs->tsc_offset = vmcs12->tsc_offset; - * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; - * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; - * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; - * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; - * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; - * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; - * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; - * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; - * - * Not present in struct vmcs12: - * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; - * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; - * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; - * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; - */ - - evmcs->guest_es_selector = vmcs12->guest_e