whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit d8242d22d501b8403c780f306762fbdf889ec0c5
parent ef78e5ec9214376c5cb989f5da70b02d0c117b66
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Wed, 28 Nov 2018 08:29:18 -0800

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "Bugfixes, many of them reported by syzkaller and mostly predating the
  merge window"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  kvm: svm: Ensure an IBPB on all affected CPUs when freeing a vmcb
  kvm: mmu: Fix race in emulated page table writes
  KVM: nVMX: vmcs12 revision_id is always VMCS12_REVISION even when copied from eVMCS
  KVM: nVMX: Verify eVMCS revision id match supported eVMCS version on eVMCS VMPTRLD
  KVM: nVMX/nSVM: Fix bug which sets vcpu->arch.tsc_offset to L1 tsc_offset
  x86/kvm/vmx: fix old-style function declaration
  KVM: x86: fix empty-body warnings
  KVM: VMX: Update shared MSRs to be saved/restored on MSR_EFER.LMA changes
  KVM: x86: Fix kernel info-leak in KVM_HC_CLOCK_PAIRING hypercall
  KVM: nVMX: Fix kernel info-leak when enabling KVM_CAP_HYPERV_ENLIGHTENED_VMCS more than once
  svm: Add mutex_lock to protect apic_access_page_done on AMD systems
  KVM: X86: Fix scan ioapic use-before-initialization
  KVM: LAPIC: Fix pv ipis use-before-initialization
  KVM: VMX: re-add ple_gap module parameter
  KVM: PPC: Book3S HV: Fix handling for interrupted H_ENTER_NESTED

Diffstat:
March/powerpc/kvm/book3s_hv.c | 1+
March/x86/include/asm/kvm_host.h | 3++-
March/x86/kvm/lapic.c | 7++++++-
March/x86/kvm/mmu.c | 27+++++++++------------------
March/x86/kvm/svm.c | 44+++++++++++++++++++++++++++++---------------
March/x86/kvm/vmx.c | 98++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
March/x86/kvm/x86.c | 10++++++----
7 files changed, 118 insertions(+), 72 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c @@ -983,6 +983,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) ret = kvmhv_enter_nested_guest(vcpu); if (ret == H_INTERRUPT) { kvmppc_set_gpr(vcpu, 3, 0); + vcpu->arch.hcall_needed = 0; return -EINTR; } break; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h @@ -1094,7 +1094,8 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu); - void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); + /* Returns actual tsc_offset set in active VMCS */ + u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c @@ -55,7 +55,7 @@ #define PRIo64 "o" /* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ -#define apic_debug(fmt, arg...) +#define apic_debug(fmt, arg...) do {} while (0) /* 14 is the version for Xeon and Pentium 8.4.8*/ #define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16)) @@ -576,6 +576,11 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); + if (unlikely(!map)) { + count = -EOPNOTSUPP; + goto out; + } + if (min > map->max_apic_id) goto out; /* Bits above cluster_size are masked in the caller. */ diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c @@ -5074,9 +5074,9 @@ static bool need_remote_flush(u64 old, u64 new) } static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, - const u8 *new, int *bytes) + int *bytes) { - u64 gentry; + u64 gentry = 0; int r; /* @@ -5088,22 +5088,12 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ *gpa &= ~(gpa_t)7; *bytes = 8; - r = kvm_vcpu_read_guest(vcpu, *gpa, &gentry, 8); - if (r) - gentry = 0; - new = (const u8 *)&gentry; } - switch (*bytes) { - case 4: - gentry = *(const u32 *)new; - break; - case 8: - gentry = *(const u64 *)new; - break; - default: - gentry = 0; - break; + if (*bytes == 4 || *bytes == 8) { + r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes); + if (r) + gentry = 0; } return gentry; @@ -5207,8 +5197,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); - gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes); - /* * No need to care whether allocation memory is successful * or not since pte prefetch is skiped if it does not have @@ -5217,6 +5205,9 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mmu_topup_memory_caches(vcpu); spin_lock(&vcpu->kvm->mmu_lock); + + gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); + ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c @@ -1446,7 +1446,7 @@ static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu) return vcpu->arch.tsc_offset; } -static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); u64 g_tsc_offset = 0; @@ -1464,6 +1464,7 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) svm->vmcb->control.tsc_offset = offset + g_tsc_offset; mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + return svm->vmcb->control.tsc_offset; } static void avic_init_vmcb(struct vcpu_svm *svm) @@ -1664,20 +1665,23 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, static int avic_init_access_page(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; - int ret; + int ret = 0; + mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_page_done) - return 0; + goto out; - ret = x86_set_memory_region(kvm, - APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, - APIC_DEFAULT_PHYS_BASE, - PAGE_SIZE); + ret = __x86_set_memory_region(kvm, + APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + APIC_DEFAULT_PHYS_BASE, + PAGE_SIZE); if (ret) - return ret; + goto out; kvm->arch.apic_access_page_done = true; - return 0; +out: + mutex_unlock(&kvm->slots_lock); + return ret; } static int avic_init_backing_page(struct kvm_vcpu *vcpu) @@ -2189,21 +2193,31 @@ out: return ERR_PTR(err); } +static void svm_clear_current_vmcb(struct vmcb *vmcb) +{ + int i; + + for_each_online_cpu(i) + cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL); +} + static void svm_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + /* + * The vmcb page can be recycled, causing a false negative in + * svm_vcpu_load(). So, ensure that no logical CPU has this + * vmcb page recorded as its current vmcb. + */ + svm_clear_current_vmcb(svm->vmcb); + __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); __free_page(virt_to_page(svm->nested.hsave)); __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, svm); - /* - * The vmcb page can be recycled, causing a false negative in - * svm_vcpu_load(). So do a full IBPB now. - */ - indirect_branch_prediction_barrier(); } static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -7149,7 +7163,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .has_wbinvd_exit = svm_has_wbinvd_exit, .read_l1_tsc_offset = svm_read_l1_tsc_offset, - .write_tsc_offset = svm_write_tsc_offset, + .write_l1_tsc_offset = svm_write_l1_tsc_offset, .set_tdp_cr3 = set_tdp_cr3, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c @@ -174,6 +174,7 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); * refer SDM volume 3b section 21.6.13 & 22.1.3. */ static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; +module_param(ple_gap, uint, 0444); static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; module_param(ple_window, uint, 0444); @@ -984,6 +985,7 @@ struct vcpu_vmx { struct shared_msr_entry *guest_msrs; int nmsrs; int save_nmsrs; + bool guest_msrs_dirty; unsigned long host_idt_base; #ifdef CONFIG_X86_64 u64 msr_host_kernel_gs_base; @@ -1306,7 +1308,7 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, u16 error_code); static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); -static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr, int type); static DEFINE_PER_CPU(struct vmcs *, vmxarea); @@ -1610,12 +1612,6 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); - /* We don't support disabling the feature for simplicity. */ - if (vmx->nested.enlightened_vmcs_enabled) - return 0; - - vmx->nested.enlightened_vmcs_enabled = true; - /* * vmcs_version represents the range of supported Enlightened VMCS * versions: lower 8 bits is the minimal version, higher 8 bits is the @@ -1625,6 +1621,12 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu, if (vmcs_version) *vmcs_version = (KVM_EVMCS_VERSION << 8) | 1; + /* We don't support disabling the feature for simplicity. */ + if (vmx->nested.enlightened_vmcs_enabled) + return 0; + + vmx->nested.enlightened_vmcs_enabled = true; + vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; @@ -2897,6 +2899,20 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) vmx->req_immediate_exit = false; + /* + * Note that guest MSRs to be saved/restored can also be changed + * when guest state is loaded. This happens when guest transitions + * to/from long-mode by setting MSR_EFER.LMA. + */ + if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) { + vmx->guest_msrs_dirty = false; + for (i = 0; i < vmx->save_nmsrs; ++i) + kvm_set_shared_msr(vmx->guest_msrs[i].index, + vmx->guest_msrs[i].data, + vmx->guest_msrs[i].mask); + + } + if (vmx->loaded_cpu_state) return; @@ -2957,11 +2973,6 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) vmcs_writel(HOST_GS_BASE, gs_base); host_state->gs_base = gs_base; } - - for (i = 0; i < vmx->save_nmsrs; ++i) - kvm_set_shared_msr(vmx->guest_msrs[i].index, - vmx->guest_msrs[i].data, - vmx->guest_msrs[i].mask); } static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) @@ -3436,6 +3447,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) move_msr_up(vmx, index, save_nmsrs++); vmx->save_nmsrs = save_nmsrs; + vmx->guest_msrs_dirty = true; if (cpu_has_vmx_msr_bitmap()) vmx_update_msr_bitmap(&vmx->vcpu); @@ -3452,11 +3464,9 @@ static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) return vcpu->arch.tsc_offset; } -/* - * writes 'offset' into guest's timestamp counter offset register - */ -static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { + u64 active_offset = offset; if (is_guest_mode(vcpu)) { /* * We're here if L1 chose not to trap WRMSR to TSC. According @@ -3464,17 +3474,16 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) * set for L2 remains unchanged, and still needs to be added * to the newly set TSC to get L2's TSC. */ - struct vmcs12 *vmcs12; - /* recalculate vmcs02.TSC_OFFSET: */ - vmcs12 = get_vmcs12(vcpu); - vmcs_write64(TSC_OFFSET, offset + - (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ? - vmcs12->tsc_offset : 0)); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING)) + active_offset += vmcs12->tsc_offset; } else { trace_kvm_write_tsc_offset(vcpu->vcpu_id, vmcs_read64(TSC_OFFSET), offset); - vmcs_write64(TSC_OFFSET, offset); } + + vmcs_write64(TSC_OFFSET, active_offset); + return active_offset; } /* @@ -5944,7 +5953,7 @@ static void free_vpid(int vpid) spin_unlock(&vmx_vpid_lock); } -static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr, int type) { int f = sizeof(unsigned long); @@ -5982,7 +5991,7 @@ static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bit } } -static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr, int type) { int f = sizeof(unsigned long); @@ -6020,7 +6029,7 @@ static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitm } } -static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap, u32 msr, int type, bool value) { if (value) @@ -8664,8 +8673,6 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; - vmcs12->hdr.revision_id = evmcs->revision_id; - /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ vmcs12->tpr_threshold = evmcs->tpr_threshold; vmcs12->guest_rip = evmcs->guest_rip; @@ -9369,7 +9376,30 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page); - if (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION) { + /* + * Currently, KVM only supports eVMCS version 1 + * (== KVM_EVMCS_VERSION) and thus we expect guest to set this + * value to first u32 field of eVMCS which should specify eVMCS + * VersionNumber. + * + * Guest should be aware of supported eVMCS versions by host by + * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is + * expected to set this CPUID leaf according to the value + * returned in vmcs_version from nested_enable_evmcs(). + * + * However, it turns out that Microsoft Hyper-V fails to comply + * to their own invented interface: When Hyper-V use eVMCS, it + * just sets first u32 field of eVMCS to revision_id specified + * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number + * which is one of the supported versions specified in + * CPUID.0x4000000A.EAX[0:15]. + * + * To overcome Hyper-V bug, we accept here either a supported + * eVMCS version or VMCS12 revision_id as valid values for first + * u32 field of eVMCS. + */ + if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && + (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { nested_release_evmcs(vcpu); return 0; } @@ -9390,9 +9420,11 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, * present in struct hv_enlightened_vmcs, ...). Make sure there * are no leftovers. */ - if (from_launch) - memset(vmx->nested.cached_vmcs12, 0, - sizeof(*vmx->nested.cached_vmcs12)); + if (from_launch) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + memset(vmcs12, 0, sizeof(*vmcs12)); + vmcs12->hdr.revision_id = VMCS12_REVISION; + } } return 1; @@ -15062,7 +15094,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, .read_l1_tsc_offset = vmx_read_l1_tsc_offset, - .write_tsc_offset = vmx_write_tsc_offset, + .write_l1_tsc_offset = vmx_write_l1_tsc_offset, .set_tdp_cr3 = vmx_set_cr3, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c @@ -1665,8 +1665,7 @@ EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { - kvm_x86_ops->write_tsc_offset(vcpu, offset); - vcpu->arch.tsc_offset = offset; + vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset); } static inline bool kvm_check_tsc_unstable(void) @@ -1794,7 +1793,8 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc); static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { - kvm_vcpu_write_tsc_offset(vcpu, vcpu->arch.tsc_offset + adjustment); + u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); + kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment); } static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) @@ -6918,6 +6918,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, clock_pairing.nsec = ts.tv_nsec; clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle); clock_pairing.flags = 0; + memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad)); ret = 0; if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing, @@ -7455,7 +7456,8 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) else { if (vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); - kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); + if (ioapic_in_kernel(vcpu->kvm)) + kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); } if (is_guest_mode(vcpu))