whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit f8f65382c98a28e3c2b20df9dd4231dca5a11682
parent 0f3aa48ad4c307fb72b8eb43add26c8f314d396a
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Sat,  8 Sep 2018 15:52:45 -0700

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM fixes from Radim Krčmář:
 "ARM:
   - Fix a VFP corruption in 32-bit guest
   - Add missing cache invalidation for CoW pages
   - Two small cleanups

  s390:
   - Fallout from the hugetlbfs support: pfmf interpretion and locking
   - VSIE: fix keywrapping for nested guests

  PPC:
   - Fix a bug where pages might not get marked dirty, causing guest
     memory corruption on migration
   - Fix a bug causing reads from guest memory to use the wrong guest
     real address for very large HPT guests (>256G of memory), leading
     to failures in instruction emulation.

  x86:
   - Fix out of bound access from malicious pv ipi hypercalls
     (introduced in rc1)
   - Fix delivery of pending interrupts when entering a nested guest,
     preventing arbitrarily late injection
   - Sanitize kvm_stat output after destroying a guest
   - Fix infinite loop when emulating a nested guest page fault and
     improve the surrounding emulation code
   - Two minor cleanups"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (28 commits)
  KVM: LAPIC: Fix pv ipis out-of-bounds access
  KVM: nVMX: Fix loss of pending IRQ/NMI before entering L2
  arm64: KVM: Remove pgd_lock
  KVM: Remove obsolete kvm_unmap_hva notifier backend
  arm64: KVM: Only force FPEXC32_EL2.EN if trapping FPSIMD
  KVM: arm/arm64: Clean dcache to PoC when changing PTE due to CoW
  KVM: s390: Properly lock mm context allow_gmap_hpage_1m setting
  KVM: s390: vsie: copy wrapping keys to right place
  KVM: s390: Fix pfmf and conditional skey emulation
  tools/kvm_stat: re-animate display of dead guests
  tools/kvm_stat: indicate dead guests as such
  tools/kvm_stat: handle guest removals more gracefully
  tools/kvm_stat: don't reset stats when setting PID filter for debugfs
  tools/kvm_stat: fix updates for dead guests
  tools/kvm_stat: fix handling of invalid paths in debugfs provider
  tools/kvm_stat: fix python3 issues
  KVM: x86: Unexport x86_emulate_instruction()
  KVM: x86: Rename emulate_instruction() to kvm_emulate_instruction()
  KVM: x86: Do not re-{try,execute} after failed emulation in L2
  KVM: x86: Default to not allowing emulation retry in kvm_mmu_page_fault
  ...

Diffstat:
March/arm/include/asm/kvm_host.h | 1-
March/arm64/include/asm/kvm_host.h | 4+---
March/arm64/kvm/hyp/switch.c | 9++++++---
March/mips/include/asm/kvm_host.h | 1-
March/mips/kvm/mmu.c | 10----------
March/powerpc/kvm/book3s_64_mmu_hv.c | 2+-
March/powerpc/kvm/book3s_64_mmu_radix.c | 6+++---
March/s390/include/asm/mmu.h | 8+++++++-
March/s390/kvm/kvm-s390.c | 2++
March/s390/kvm/priv.c | 30++++++++++++++++++------------
March/s390/kvm/vsie.c | 3++-
March/x86/include/asm/kvm_host.h | 22+++++++---------------
March/x86/kvm/lapic.c | 27++++++++++++++++++++-------
March/x86/kvm/mmu.c | 26+++++++++++++++-----------
March/x86/kvm/svm.c | 19+++++++++----------
March/x86/kvm/vmx.c | 43+++++++++++++++++++++++++++++++------------
March/x86/kvm/x86.c | 28+++++++++++++++++++++++-----
March/x86/kvm/x86.h | 2++
Mtools/kvm/kvm_stat/kvm_stat | 59+++++++++++++++++++++++++++++++++++++++++++++++++----------
Mvirt/kvm/arm/mmu.c | 21++++++++-------------
Mvirt/kvm/arm/trace.h | 15---------------
21 files changed, 204 insertions(+), 134 deletions(-)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h @@ -223,7 +223,6 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h @@ -61,8 +61,7 @@ struct kvm_arch { u64 vmid_gen; u32 vmid; - /* 1-level 2nd stage table and lock */ - spinlock_t pgd_lock; + /* 1-level 2nd stage table, protected by kvm->mmu_lock */ pgd_t *pgd; /* VTTBR value associated with above pgd and vmid */ @@ -357,7 +356,6 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c @@ -98,8 +98,10 @@ static void activate_traps_vhe(struct kvm_vcpu *vcpu) val = read_sysreg(cpacr_el1); val |= CPACR_EL1_TTA; val &= ~CPACR_EL1_ZEN; - if (!update_fp_enabled(vcpu)) + if (!update_fp_enabled(vcpu)) { val &= ~CPACR_EL1_FPEN; + __activate_traps_fpsimd32(vcpu); + } write_sysreg(val, cpacr_el1); @@ -114,8 +116,10 @@ static void __hyp_text __activate_traps_nvhe(struct kvm_vcpu *vcpu) val = CPTR_EL2_DEFAULT; val |= CPTR_EL2_TTA | CPTR_EL2_TZ; - if (!update_fp_enabled(vcpu)) + if (!update_fp_enabled(vcpu)) { val |= CPTR_EL2_TFP; + __activate_traps_fpsimd32(vcpu); + } write_sysreg(val, cptr_el2); } @@ -129,7 +133,6 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu) if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN) && (hcr & HCR_VSE)) write_sysreg_s(vcpu->arch.vsesr_el2, SYS_VSESR_EL2); - __activate_traps_fpsimd32(vcpu); if (has_vhe()) activate_traps_vhe(vcpu); else diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h @@ -931,7 +931,6 @@ enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, bool write); #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c @@ -512,16 +512,6 @@ static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, return 1; } -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) -{ - unsigned long end = hva + PAGE_SIZE; - - handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL); - - kvm_mips_callbacks->flush_shadow_all(kvm); - return 0; -} - int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) { handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -358,7 +358,7 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, unsigned long pp, key; unsigned long v, orig_v, gr; __be64 *hptep; - int index; + long int index; int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR); if (kvm_is_radix(vcpu->kvm)) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -725,10 +725,10 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, gpa, shift); kvmppc_radix_tlbie_page(kvm, gpa, shift); if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) { - unsigned long npages = 1; + unsigned long psize = PAGE_SIZE; if (shift) - npages = 1ul << (shift - PAGE_SHIFT); - kvmppc_update_dirty_map(memslot, gfn, npages); + psize = 1ul << shift; + kvmppc_update_dirty_map(memslot, gfn, psize); } } return 0; diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h @@ -16,7 +16,13 @@ typedef struct { unsigned long asce; unsigned long asce_limit; unsigned long vdso_base; - /* The mmu context allocates 4K page tables. */ + /* + * The following bitfields need a down_write on the mm + * semaphore when they are written to. As they are only + * written once, they can be read without a lock. + * + * The mmu context allocates 4K page tables. + */ unsigned int alloc_pgste:1; /* The mmu context uses extended page tables. */ unsigned int has_pgste:1; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c @@ -695,7 +695,9 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) r = -EINVAL; else { r = 0; + down_write(&kvm->mm->mmap_sem); kvm->mm->context.allow_gmap_hpage_1m = 1; + up_write(&kvm->mm->mmap_sem); /* * We might have to create fake 4k page * tables. To avoid that the hardware works on diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c @@ -280,9 +280,11 @@ retry: goto retry; } } - if (rc) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); up_read(&current->mm->mmap_sem); + if (rc == -EFAULT) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + if (rc < 0) + return rc; vcpu->run->s.regs.gprs[reg1] &= ~0xff; vcpu->run->s.regs.gprs[reg1] |= key; return 0; @@ -324,9 +326,11 @@ retry: goto retry; } } - if (rc < 0) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); up_read(&current->mm->mmap_sem); + if (rc == -EFAULT) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + if (rc < 0) + return rc; kvm_s390_set_psw_cc(vcpu, rc); return 0; } @@ -390,12 +394,12 @@ static int handle_sske(struct kvm_vcpu *vcpu) FAULT_FLAG_WRITE, &unlocked); rc = !rc ? -EAGAIN : rc; } + up_read(&current->mm->mmap_sem); if (rc == -EFAULT) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - - up_read(&current->mm->mmap_sem); - if (rc >= 0) - start += PAGE_SIZE; + if (rc < 0) + return rc; + start += PAGE_SIZE; } if (m3 & (SSKE_MC | SSKE_MR)) { @@ -1002,13 +1006,15 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) FAULT_FLAG_WRITE, &unlocked); rc = !rc ? -EAGAIN : rc; } + up_read(&current->mm->mmap_sem); if (rc == -EFAULT) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - - up_read(&current->mm->mmap_sem); - if (rc >= 0) - start += PAGE_SIZE; + if (rc == -EAGAIN) + continue; + if (rc < 0) + return rc; } + start += PAGE_SIZE; } if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) { if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_BITS_AMODE_64BIT) { diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c @@ -173,7 +173,8 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) return set_validity_icpt(scb_s, 0x0039U); /* copy only the wrapping keys */ - if (read_guest_real(vcpu, crycb_addr + 72, &vsie_page->crycb, 56)) + if (read_guest_real(vcpu, crycb_addr + 72, + vsie_page->crycb.dea_wrapping_key_mask, 56)) return set_validity_icpt(scb_s, 0x0035U); scb_s->ecb3 |= ecb3_flags; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h @@ -1237,19 +1237,12 @@ enum emulation_result { #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) -#define EMULTYPE_RETRY (1 << 3) -#define EMULTYPE_NO_REEXECUTE (1 << 4) -#define EMULTYPE_NO_UD_ON_FAIL (1 << 5) -#define EMULTYPE_VMWARE (1 << 6) -int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, - int emulation_type, void *insn, int insn_len); - -static inline int emulate_instruction(struct kvm_vcpu *vcpu, - int emulation_type) -{ - return x86_emulate_instruction(vcpu, 0, - emulation_type | EMULTYPE_NO_REEXECUTE, NULL, 0); -} +#define EMULTYPE_ALLOW_RETRY (1 << 3) +#define EMULTYPE_NO_UD_ON_FAIL (1 << 4) +#define EMULTYPE_VMWARE (1 << 5) +int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type); +int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, + void *insn, int insn_len); void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); @@ -1450,7 +1443,6 @@ asmlinkage void kvm_spurious_fault(void); ____kvm_handle_fault_on_reboot(insn, "") #define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); @@ -1463,7 +1455,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, - unsigned long ipi_bitmap_high, int min, + unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit); u64 kvm_get_arch_capabilities(void); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c @@ -548,7 +548,7 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, } int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, - unsigned long ipi_bitmap_high, int min, + unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit) { int i; @@ -571,18 +571,31 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); + if (min > map->max_apic_id) + goto out; /* Bits above cluster_size are masked in the caller. */ - for_each_set_bit(i, &ipi_bitmap_low, BITS_PER_LONG) { - vcpu = map->phys_map[min + i]->vcpu; - count += kvm_apic_set_irq(vcpu, &irq, NULL); + for_each_set_bit(i, &ipi_bitmap_low, + min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { + if (map->phys_map[min + i]) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, &irq, NULL); + } } min += cluster_size; - for_each_set_bit(i, &ipi_bitmap_high, BITS_PER_LONG) { - vcpu = map->phys_map[min + i]->vcpu; - count += kvm_apic_set_irq(vcpu, &irq, NULL); + + if (min > map->max_apic_id) + goto out; + + for_each_set_bit(i, &ipi_bitmap_high, + min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { + if (map->phys_map[min + i]) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, &irq, NULL); + } } +out: rcu_read_unlock(); return count; } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c @@ -1853,11 +1853,6 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); } -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) -{ - return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); -} - int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) { return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); @@ -5217,7 +5212,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, void *insn, int insn_len) { - int r, emulation_type = EMULTYPE_RETRY; + int r, emulation_type = 0; enum emulation_result er; bool direct = vcpu->arch.mmu.direct_map; @@ -5230,10 +5225,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { r = handle_mmio_page_fault(vcpu, cr2, direct); - if (r == RET_PF_EMULATE) { - emulation_type = 0; + if (r == RET_PF_EMULATE) goto emulate; - } } if (r == RET_PF_INVALID) { @@ -5260,8 +5253,19 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, return 1; } - if (mmio_info_in_cache(vcpu, cr2, direct)) - emulation_type = 0; + /* + * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still + * optimistically try to just unprotect the page and let the processor + * re-execute the instruction that caused the page fault. Do not allow + * retrying MMIO emulation, as it's not only pointless but could also + * cause us to enter an infinite loop because the processor will keep + * faulting on the non-existent MMIO address. Retrying an instruction + * from a nested guest is also pointless and dangerous as we are only + * explicitly shadowing L1's page tables, i.e. unprotecting something + * for L1 isn't going to magically fix whatever issue cause L2 to fail. + */ + if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu)) + emulation_type = EMULTYPE_ALLOW_RETRY; emulate: /* * On AMD platforms, under certain conditions insn_len may be zero on #NPF. diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c @@ -776,7 +776,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) } if (!svm->next_rip) { - if (emulate_instruction(vcpu, EMULTYPE_SKIP) != + if (kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) != EMULATE_DONE) printk(KERN_DEBUG "%s: NOP\n", __func__); return; @@ -2715,7 +2715,7 @@ static int gp_interception(struct vcpu_svm *svm) WARN_ON_ONCE(!enable_vmware_backdoor); - er = emulate_instruction(vcpu, + er = kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); if (er == EMULATE_USER_EXIT) return 0; @@ -2819,7 +2819,7 @@ static int io_interception(struct vcpu_svm *svm) string = (io_info & SVM_IOIO_STR_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; if (string) - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; @@ -3861,7 +3861,7 @@ static int iret_interception(struct vcpu_svm *svm) static int invlpg_interception(struct vcpu_svm *svm) { if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) - return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); return kvm_skip_emulated_instruction(&svm->vcpu); @@ -3869,13 +3869,13 @@ static int invlpg_interception(struct vcpu_svm *svm) static int emulate_on_interception(struct vcpu_svm *svm) { - return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; } static int rsm_interception(struct vcpu_svm *svm) { - return x86_emulate_instruction(&svm->vcpu, 0, 0, - rsm_ins_bytes, 2) == EMULATE_DONE; + return kvm_emulate_instruction_from_buffer(&svm->vcpu, + rsm_ins_bytes, 2) == EMULATE_DONE; } static int rdpmc_interception(struct vcpu_svm *svm) @@ -4700,7 +4700,7 @@ static int avic_unaccelerated_access_interception(struct vcpu_svm *svm) ret = avic_unaccel_trap_write(svm); } else { /* Handling Fault */ - ret = (emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE); + ret = (kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE); } return ret; @@ -6747,7 +6747,7 @@ e_free: static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) { unsigned long vaddr, vaddr_end, next_vaddr; - unsigned long dst_vaddr, dst_vaddr_end; + unsigned long dst_vaddr; struct page **src_p, **dst_p; struct kvm_sev_dbg debug; unsigned long n; @@ -6763,7 +6763,6 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) size = debug.len; vaddr_end = vaddr + size; dst_vaddr = debug.dst_uaddr; - dst_vaddr_end = dst_vaddr + size; for (; vaddr < vaddr_end; vaddr = next_vaddr) { int len, s_off, d_off; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c @@ -6983,7 +6983,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, * Cause the #SS fault with 0 error code in VM86 mode. */ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { - if (emulate_instruction(vcpu, 0) == EMULATE_DONE) { + if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) { if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; return kvm_vcpu_halt(vcpu); @@ -7054,7 +7054,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { WARN_ON_ONCE(!enable_vmware_backdoor); - er = emulate_instruction(vcpu, + er = kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); if (er == EMULATE_USER_EXIT) return 0; @@ -7157,7 +7157,7 @@ static int handle_io(struct kvm_vcpu *vcpu) ++vcpu->stat.io_exits; if (string) - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; @@ -7231,7 +7231,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) static int handle_desc(struct kvm_vcpu *vcpu) { WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; } static int handle_cr(struct kvm_vcpu *vcpu) @@ -7480,7 +7480,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu) static int handle_invd(struct kvm_vcpu *vcpu) { - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; } static int handle_invlpg(struct kvm_vcpu *vcpu) @@ -7547,7 +7547,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } } - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; } static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) @@ -7704,8 +7704,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) return kvm_skip_emulated_instruction(vcpu); else - return x86_emulate_instruction(vcpu, gpa, EMULTYPE_SKIP, - NULL, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) == + EMULATE_DONE; } return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); @@ -7748,7 +7748,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; - err = emulate_instruction(vcpu, 0); + err = kvm_emulate_instruction(vcpu, 0); if (err == EMULATE_USER_EXIT) { ++vcpu->stat.mmio_exits; @@ -12537,8 +12537,11 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); bool from_vmentry = !!exit_qual; u32 dummy_exit_qual; + u32 vmcs01_cpu_exec_ctrl; int r = 0; + vmcs01_cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + enter_guest_mode(vcpu); if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) @@ -12575,6 +12578,25 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) } /* + * If L1 had a pending IRQ/NMI until it executed + * VMLAUNCH/VMRESUME which wasn't delivered because it was + * disallowed (e.g. interrupts disabled), L0 needs to + * evaluate if this pending event should cause an exit from L2 + * to L1 or delivered directly to L2 (e.g. In case L1 don't + * intercept EXTERNAL_INTERRUPT). + * + * Usually this would be handled by L0 requesting a + * IRQ/NMI window by setting VMCS accordingly. However, + * this setting was done on VMCS01 and now VMCS02 is active + * instead. Thus, we force L0 to perform pending event + * evaluation by requesting a KVM_REQ_EVENT. + */ + if (vmcs01_cpu_exec_ctrl & + (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING)) { + kvm_make_request(KVM_REQ_EVENT, vcpu); + } + + /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet * returned as far as L1 is concerned. It will only return (and set @@ -13988,9 +14010,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) return -EINVAL; - if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) - vmx->nested.nested_run_pending = 1; - vmx->nested.dirty_vmcs12 = true; ret = enter_vmx_non_root_mode(vcpu, NULL); if (ret) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c @@ -4987,7 +4987,7 @@ int handle_ud(struct kvm_vcpu *vcpu) emul_type = 0; } - er = emulate_instruction(vcpu, emul_type); + er = kvm_emulate_instruction(vcpu, emul_type); if (er == EMULATE_USER_EXIT) return 0; if (er != EMULATE_DONE) @@ -5870,7 +5870,10 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, gpa_t gpa = cr2; kvm_pfn_t pfn; - if (emulation_type & EMULTYPE_NO_REEXECUTE) + if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) + return false; + + if (WARN_ON_ONCE(is_guest_mode(vcpu))) return false; if (!vcpu->arch.mmu.direct_map) { @@ -5958,7 +5961,10 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, */ vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; - if (!(emulation_type & EMULTYPE_RETRY)) + if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) + return false; + + if (WARN_ON_ONCE(is_guest_mode(vcpu))) return false; if (x86_page_table_writing_insn(ctxt)) @@ -6276,7 +6282,19 @@ restart: return r; } -EXPORT_SYMBOL_GPL(x86_emulate_instruction); + +int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type) +{ + return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); +} +EXPORT_SYMBOL_GPL(kvm_emulate_instruction); + +int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, + void *insn, int insn_len) +{ + return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len); +} +EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer); static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) @@ -7734,7 +7752,7 @@ static inline int complete_emulated_io(struct kvm_vcpu *vcpu) { int r; vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); + r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (r != EMULATE_DONE) return 0; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h @@ -274,6 +274,8 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num); bool kvm_vector_hashing_enabled(void); +int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, + int emulation_type, void *insn, int insn_len); #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat @@ -759,12 +759,18 @@ class DebugfsProvider(Provider): if len(vms) == 0: self.do_read = False - self.paths = filter(lambda x: "{}-".format(pid) in x, vms) + self.paths = list(filter(lambda x: "{}-".format(pid) in x, vms)) else: self.paths = [] self.do_read = True - self.reset() + + def _verify_paths(self): + """Remove invalid paths""" + for path in self.paths: + if not os.path.exists(os.path.join(PATH_DEBUGFS_KVM, path)): + self.paths.remove(path) + continue def read(self, reset=0, by_guest=0): """Returns a dict with format:'file name / field -> current value'. @@ -780,6 +786,7 @@ class DebugfsProvider(Provider): # If no debugfs filtering support is available, then don't read. if not self.do_read: return results + self._verify_paths() paths = self.paths if self._pid == 0: @@ -1096,15 +1103,16 @@ class Tui(object): pid = self.stats.pid_filter self.screen.erase() gname = self.get_gname_from_pid(pid) + self._gname = gname if gname: gname = ('({})'.format(gname[:MAX_GUEST_NAME_LEN] + '...' if len(gname) > MAX_GUEST_NAME_LEN else gname)) if pid > 0: - self.screen.addstr(0, 0, 'kvm statistics - pid {0} {1}' - .format(pid, gname), curses.A_BOLD) + self._headline = 'kvm statistics - pid {0} {1}'.format(pid, gname) else: - self.screen.addstr(0, 0, 'kvm statistics - summary', curses.A_BOLD) + self._headline = 'kvm statistics - summary' + self.screen.addstr(0, 0, self._headline, curses.A_BOLD) if self.stats.fields_filter: regex = self.stats.fields_filter if len(regex) > MAX_REGEX_LEN: @@ -1162,6 +1170,19 @@ class Tui(object): return sorted_items + if not self._is_running_guest(self.stats.pid_filter): + if self._gname: + try: # ...to identify the guest by name in case it's back + pids = self.get_pid_from_gname(self._gname) + if len(pids) == 1: + self._refresh_header(pids[0]) + self._update_pid(pids[0]) + return + except: + pass + self._display_guest_dead() + # leave final data on screen + return row = 3 self.screen.move(row, 0) self.screen.clrtobot() @@ -1184,6 +1205,7 @@ class Tui(object): # print events tavg = 0 tcur = 0 + guest_removed = False for key, values in get_sorted_events(self, stats): if row >= self.screen.getmaxyx()[0] - 1 or values == (0, 0): break @@ -1191,7 +1213,10 @@ class Tui(object): key = self.get_gname_from_pid(key) if not key: continue - cur = int(round(values.delta / sleeptime)) if values.delta else '' + cur = int(round(values.delta / sleeptime)) if values.delta else 0 + if cur < 0: + guest_removed = True + continue if key[0] != ' ': if values.delta: tcur += values.delta @@ -1204,13 +1229,21 @@ class Tui(object): values.value * 100 / float(ltotal), cur)) row += 1 if row == 3: - self.screen.addstr(4, 1, 'No matching events reported yet') + if guest_removed: + self.screen.addstr(4, 1, 'Guest removed, updating...') + else: + self.screen.addstr(4, 1, 'No matching events reported yet') if row > 4: tavg = int(round(tcur / sleeptime)) if tcur > 0 else '' self.screen.addstr(row, 1, '%-40s %10d %8s' % ('Total', total, tavg), curses.A_BOLD) self.screen.refresh() + def _display_guest_dead(self): + marker = ' Guest is DEAD ' + y = min(len(self._headline), 80 - len(marker)) + self.screen.addstr(0, y, marker, curses.A_BLINK | curses.A_STANDOUT) + def _show_msg(self, text): """Display message centered text and exit on key press""" hint = 'Press any key to continue' @@ -1219,10 +1252,10 @@ class Tui(object): (x, term_width) = self.screen.getmaxyx() row = 2 for line in text: - start = (term_width - len(line)) / 2 + start = (term_width - len(line)) // 2 self.screen.addstr(row, start, line) row += 1 - self.screen.addstr(row + 1, (term_width - len(hint)) / 2, hint, + self.screen.addstr(row + 1, (term_width - len(hint)) // 2, hint, curses.A_STANDOUT) self.screen.getkey() @@ -1319,6 +1352,12 @@ class Tui(object): msg = '"' + str(val) + '": Invalid value' self._refresh_header() + def _is_running_guest(self, pid): + """Check if pid is still a running process.""" + if not pid: + return True + return os.path.isdir(os.path.join('/proc/', str(pid))) + def _show_vm_selection_by_guest(self): """Draws guest selection mask. @@ -1346,7 +1385,7 @@ class Tui(object): if not guest or guest == '0': break if guest.isdigit(): - if not os.path.isdir(os.path.join('/proc/', guest)): + if not self._is_running_guest(guest): msg = '"' + guest + '": Not a running process' continue pid = int(guest) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c @@ -1817,18 +1817,6 @@ static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *dat return 0; } -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) -{ - unsigned long end = hva + PAGE_SIZE; - - if (!kvm->arch.pgd) - return 0; - - trace_kvm_unmap_hva(hva); - handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL); - return 0; -} - int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) { @@ -1860,13 +1848,20 @@ static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { unsigned long end = hva + PAGE_SIZE; + kvm_pfn_t pfn = pte_pfn(pte); pte_t stage2_pte; if (!kvm->arch.pgd) return; trace_kvm_set_spte_hva(hva); - stage2_pte = pfn_pte(pte_pfn(pte), PAGE_S2); + + /* + * We've moved a page around, probably through CoW, so let's treat it + * just like a translation fault and clean the cache to the PoC. + */ + clean_dcache_guest_page(pfn, PAGE_SIZE); + stage2_pte = pfn_pte(pfn, PAGE_S2); handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); } diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h @@ -134,21 +134,6 @@ TRACE_EVENT(kvm_mmio_emulate, __entry->vcpu_pc, __entry->instr, __entry->cpsr) ); -TRACE_EVENT(kvm_unmap_hva, - TP_PROTO(unsigned long hva), - TP_ARGS(hva), - - TP_STRUCT__entry( - __field( unsigned long, hva ) - ), - - TP_fast_assign( - __entry->hva = hva; - ), - - TP_printk("mmu notifier unmap hva: %#08lx", __entry->hva) -); - TRACE_EVENT(kvm_unmap_hva_range, TP_PROTO(unsigned long start, unsigned long end), TP_ARGS(start, end),