whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 99792e0cea1ed733cdc8d0758677981e0cbebfed
parent 382d72a9aa525b56ab8453ce61751fa712414d3d
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Tue, 23 Oct 2018 17:05:28 +0100

Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "Lots of changes in this cycle:

   - Lots of CPA (change page attribute) optimizations and related
     cleanups (Thomas Gleixner, Peter Zijlstra)

   - Make lazy TLB mode even lazier (Rik van Riel)

   - Fault handler cleanups and improvements (Dave Hansen)

   - kdump, vmcore: Enable kdumping encrypted memory with AMD SME
     enabled (Lianbo Jiang)

   - Clean up VM layout documentation (Baoquan He, Ingo Molnar)

   - ... plus misc other fixes and enhancements"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (51 commits)
  x86/stackprotector: Remove the call to boot_init_stack_canary() from cpu_startup_entry()
  x86/mm: Kill stray kernel fault handling comment
  x86/mm: Do not warn about PCI BIOS W+X mappings
  resource: Clean it up a bit
  resource: Fix find_next_iomem_res() iteration issue
  resource: Include resource end in walk_*() interfaces
  x86/kexec: Correct KEXEC_BACKUP_SRC_END off-by-one error
  x86/mm: Remove spurious fault pkey check
  x86/mm/vsyscall: Consider vsyscall page part of user address space
  x86/mm: Add vsyscall address helper
  x86/mm: Fix exception table comments
  x86/mm: Add clarifying comments for user addr space
  x86/mm: Break out user address space handling
  x86/mm: Break out kernel address space handling
  x86/mm: Clarify hardware vs. software "error_code"
  x86/mm/tlb: Make lazy TLB mode lazier
  x86/mm/tlb: Add freed_tables element to flush_tlb_info
  x86/mm/tlb: Add freed_tables argument to flush_tlb_mm_range
  smp,cpumask: introduce on_each_cpu_cond_mask
  smp: use __cpumask_set_cpu in on_each_cpu_cond
  ...

Diffstat:
M Documentation/x86/x86_64/mm.txt | 171 +++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
M arch/x86/Kconfig | 8 ++++++++
M arch/x86/include/asm/io.h | 3 ++-
M arch/x86/include/asm/kexec.h | 2 +-
M arch/x86/include/asm/page_64_types.h | 15 +++++++++------
M arch/x86/include/asm/tlb.h | 21 ++++++++++++++-------
M arch/x86/include/asm/tlbflush.h | 33 ++++++++++++---------------------
M arch/x86/kernel/crash_dump_64.c | 60 +++++++++++++++++++++++++++++++++++++++++-------------------
M arch/x86/kernel/ldt.c | 2 +-
M arch/x86/kernel/vm86_32.c | 2 +-
M arch/x86/mm/dump_pagetables.c | 35 +++++++++++++++++++++++++++--------
M arch/x86/mm/fault.c | 288 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
M arch/x86/mm/init_32.c | 23 ++++-------------------
M arch/x86/mm/ioremap.c | 24 ++++++++++++++++--------
M arch/x86/mm/pageattr.c | 627 +++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
M arch/x86/mm/tlb.c | 167 ++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
M arch/x86/xen/smp_pv.c | 2 ++
M drivers/iommu/amd_iommu_init.c | 14 ++++++++++++--
M fs/proc/vmcore.c | 34 ++++++++++++++++++++++++++++------
M include/linux/crash_dump.h | 4 ++++
M include/linux/smp.h | 4 ++++
M kernel/kexec_core.c | 6 ++++++
M kernel/resource.c | 141 +++++++++++++++++++++++++++++++++++--------------------------------------------
M kernel/sched/idle.c | 15 ---------------
M kernel/sched/sched.h | 1 -
M kernel/smp.c | 19 ++++++++++++++-----
M kernel/up.c | 14 +++++++++++---
M mm/pgtable-generic.c | 1 +
28 files changed, 1117 insertions(+), 619 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt @@ -1,55 +1,124 @@ +==================================================== +Complete virtual memory map with 4-level page tables +==================================================== -Virtual memory map with 4 level page tables: - -0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm -hole caused by [47:63] sign extension -ffff800000000000 - ffff87ffffffffff (=43 bits) guard hole, reserved for hypervisor -ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory -ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole -ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space -ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole -ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) -... unused hole ... -ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) -... unused hole ... - vaddr_end for KASLR -fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping -fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI -ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks -... unused hole ... -ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space -... unused hole ... -ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 -ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space -[fixmap start] - ffffffffff5fffff kernel-internal fixmap range -ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI -ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole - -Virtual memory map with 5 level page tables: - -0000000000000000 - 00ffffffffffffff (=56 bits) user space, different per mm -hole caused by [56:63] sign extension -ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor -ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. 
memory -ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI -ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB) -ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole -ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) -... unused hole ... -ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) -... unused hole ... - vaddr_end for KASLR -fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping -... unused hole ... -ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks -... unused hole ... -ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space -... unused hole ... -ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 -ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space -[fixmap start] - ffffffffff5fffff kernel-internal fixmap range -ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI -ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole +Notes: + + - Negative addresses such as "-23 TB" are absolute addresses in bytes, counted down + from the top of the 64-bit address space. It's easier to understand the layout + when seen both in absolute addresses and in distance-from-top notation. + + For example 0xffffe90000000000 == -23 TB, it's 23 TB lower than the top of the + 64-bit address space (ffffffffffffffff). + + Note that as we get closer to the top of the address space, the notation changes + from TB to GB and then MB/KB. + + - "16M TB" might look weird at first sight, but it's an easier to visualize size + notation than "16 EB", which few will recognize at first sight as 16 exabytes. + It also shows it nicely how incredibly large 64-bit address space is. 
+ +======================================================================================================================== + Start addr | Offset | End addr | Size | VM area description +======================================================================================================================== + | | | | + 0000000000000000 | 0 | 00007fffffffffff | 128 TB | user-space virtual memory, different per mm +__________________|____________|__________________|_________|___________________________________________________________ + | | | | + 0000800000000000 | +128 TB | ffff7fffffffffff | ~16M TB | ... huge, almost 64 bits wide hole of non-canonical + | | | | virtual memory addresses up to the -128 TB + | | | | starting offset of kernel mappings. +__________________|____________|__________________|_________|___________________________________________________________ + | + | Kernel-space virtual memory, shared between all processes: +____________________________________________________________|___________________________________________________________ + | | | | + ffff800000000000 | -128 TB | ffff87ffffffffff | 8 TB | ... guard hole, also reserved for hypervisor + ffff880000000000 | -120 TB | ffffc7ffffffffff | 64 TB | direct mapping of all physical memory (page_offset_base) + ffffc80000000000 | -56 TB | ffffc8ffffffffff | 1 TB | ... unused hole + ffffc90000000000 | -55 TB | ffffe8ffffffffff | 32 TB | vmalloc/ioremap space (vmalloc_base) + ffffe90000000000 | -23 TB | ffffe9ffffffffff | 1 TB | ... unused hole + ffffea0000000000 | -22 TB | ffffeaffffffffff | 1 TB | virtual memory map (vmemmap_base) + ffffeb0000000000 | -21 TB | ffffebffffffffff | 1 TB | ... unused hole + ffffec0000000000 | -20 TB | fffffbffffffffff | 16 TB | KASAN shadow memory + fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... 
unused hole + | | | | vaddr_end for KASLR + fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping + fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | LDT remap for PTI + ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks +__________________|____________|__________________|_________|____________________________________________________________ + | + | Identical layout to the 47-bit one from here on: +____________________________________________________________|____________________________________________________________ + | | | | + ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole + ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space + ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole + ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping, mapped to physical address 0 + ffffffff80000000 |-2048 MB | | | + ffffffffa0000000 |-1536 MB | fffffffffeffffff | 1520 MB | module mapping space + ffffffffff000000 | -16 MB | | | + FIXADDR_START | ~-11 MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset + ffffffffff600000 | -10 MB | ffffffffff600fff | 4 kB | legacy vsyscall ABI + ffffffffffe00000 | -2 MB | ffffffffffffffff | 2 MB | ... unused hole +__________________|____________|__________________|_________|___________________________________________________________ + + +==================================================== +Complete virtual memory map with 5-level page tables +==================================================== + +Notes: + + - With 56-bit addresses, user-space memory gets expanded by a factor of 512x, + from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PT starting + offset and many of the regions expand to support the much larger physical + memory supported. 
+ +======================================================================================================================== + Start addr | Offset | End addr | Size | VM area description +======================================================================================================================== + | | | | + 0000000000000000 | 0 | 00ffffffffffffff | 64 PB | user-space virtual memory, different per mm +__________________|____________|__________________|_________|___________________________________________________________ + | | | | + 0000800000000000 | +64 PB | ffff7fffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical + | | | | virtual memory addresses up to the -128 TB + | | | | starting offset of kernel mappings. +__________________|____________|__________________|_________|___________________________________________________________ + | + | Kernel-space virtual memory, shared between all processes: +____________________________________________________________|___________________________________________________________ + | | | | + ff00000000000000 | -64 PB | ff0fffffffffffff | 4 PB | ... guard hole, also reserved for hypervisor + ff10000000000000 | -60 PB | ff8fffffffffffff | 32 PB | direct mapping of all physical memory (page_offset_base) + ff90000000000000 | -28 PB | ff9fffffffffffff | 4 PB | LDT remap for PTI + ffa0000000000000 | -24 PB | ffd1ffffffffffff | 12.5 PB | vmalloc/ioremap space (vmalloc_base) + ffd2000000000000 | -11.5 PB | ffd3ffffffffffff | 0.5 PB | ... unused hole + ffd4000000000000 | -11 PB | ffd5ffffffffffff | 0.5 PB | virtual memory map (vmemmap_base) + ffd6000000000000 | -10.5 PB | ffdeffffffffffff | 2.25 PB | ... unused hole + ffdf000000000000 | -8.25 PB | fffffdffffffffff | ~8 PB | KASAN shadow memory + fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... 
unused hole + | | | | vaddr_end for KASLR + fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping + fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | ... unused hole + ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks +__________________|____________|__________________|_________|____________________________________________________________ + | + | Identical layout to the 47-bit one from here on: +____________________________________________________________|____________________________________________________________ + | | | | + ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole + ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space + ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole + ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping, mapped to physical address 0 + ffffffff80000000 |-2048 MB | | | + ffffffffa0000000 |-1536 MB | fffffffffeffffff | 1520 MB | module mapping space + ffffffffff000000 | -16 MB | | | + FIXADDR_START | ~-11 MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset + ffffffffff600000 | -10 MB | ffffffffff600fff | 4 kB | legacy vsyscall ABI + ffffffffffe00000 | -2 MB | ffffffffffffffff | 2 MB | ... unused hole +__________________|____________|__________________|_________|___________________________________________________________ Architecture defines a 64-bit virtual address. Implementations can support less. Currently supported are 48- and 57-bit virtual addresses. Bits 63 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig @@ -1487,6 +1487,14 @@ config X86_DIRECT_GBPAGES supports them), so don't confuse the user by printing that we have them enabled. 
+config X86_CPA_STATISTICS + bool "Enable statistic for Change Page Attribute" + depends on DEBUG_FS + ---help--- + Expose statistics about the Change Page Attribute mechanims, which + helps to determine the effectivness of preserving large and huge + page mappings when mapping protections are changed. + config ARCH_HAS_MEM_ENCRYPT def_bool y diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h @@ -187,11 +187,12 @@ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size) #define ioremap_nocache ioremap_nocache extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); #define ioremap_uc ioremap_uc - extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); #define ioremap_cache ioremap_cache extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); #define ioremap_prot ioremap_prot +extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size); +#define ioremap_encrypted ioremap_encrypted /** * ioremap - map bus memory into CPU space diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h @@ -67,7 +67,7 @@ struct kimage; /* Memory to backup during crash kdump */ #define KEXEC_BACKUP_SRC_START (0UL) -#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */ +#define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */ /* * CPU does not save ss and sp on stack if execution is already diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h @@ -59,13 +59,16 @@ #endif /* - * Kernel image size is limited to 1GiB due to the fixmap living in the - * next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use - * 512MiB by default, leaving 1.5GiB for modules once the page tables - * are fully set up. If kernel ASLR is configured, it can extend the - * kernel page table mapping, reducing the size of the modules area. 
+ * Maximum kernel image size is limited to 1 GiB, due to the fixmap living + * in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). + * + * On KASLR use 1 GiB by default, leaving 1 GiB for modules once the + * page tables are fully set up. + * + * If KASLR is disabled we can shrink it to 0.5 GiB and increase the size + * of the modules area to 1.5 GiB. */ -#if defined(CONFIG_RANDOMIZE_BASE) +#ifdef CONFIG_RANDOMIZE_BASE #define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024) #else #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h @@ -6,16 +6,23 @@ #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) -#define tlb_flush(tlb) \ -{ \ - if (!tlb->fullmm && !tlb->need_flush_all) \ - flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL); \ - else \ - flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL); \ -} +static inline void tlb_flush(struct mmu_gather *tlb); #include <asm-generic/tlb.h> +static inline void tlb_flush(struct mmu_gather *tlb) +{ + unsigned long start = 0UL, end = TLB_FLUSH_ALL; + unsigned int stride_shift = tlb_get_unmap_shift(tlb); + + if (!tlb->fullmm && !tlb->need_flush_all) { + start = tlb->start; + end = tlb->end; + } + + flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables); +} + /* * While x86 architecture in general requires an IPI to perform TLB * shootdown, enablement code for several hypervisors overrides diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h @@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr) #endif -static inline bool tlb_defer_switch_to_init_mm(void) -{ - /* - * If we have PCID, then switching to init_mm is reasonably - * fast. 
If we don't have PCID, then switching to init_mm is - * quite slow, so we try to defer it in the hopes that we can - * avoid it entirely. The latter approach runs the risk of - * receiving otherwise unnecessary IPIs. - * - * This choice is just a heuristic. The tlb code can handle this - * function returning true or false regardless of whether we have - * PCID. - */ - return !static_cpu_has(X86_FEATURE_PCID); -} - struct tlb_context { u64 ctx_id; u64 tlb_gen; @@ -547,23 +531,30 @@ struct flush_tlb_info { unsigned long start; unsigned long end; u64 new_tlb_gen; + unsigned int stride_shift; + bool freed_tables; }; #define local_flush_tlb() __flush_tlb() -#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL) +#define flush_tlb_mm(mm) \ + flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true) -#define flush_tlb_range(vma, start, end) \ - flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) +#define flush_tlb_range(vma, start, end) \ + flush_tlb_mm_range((vma)->vm_mm, start, end, \ + ((vma)->vm_flags & VM_HUGETLB) \ + ? 
huge_page_shift(hstate_vma(vma)) \ + : PAGE_SHIFT, false) extern void flush_tlb_all(void); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned long vmflag); + unsigned long end, unsigned int stride_shift, + bool freed_tables); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) { - flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE); + flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false); } void native_flush_tlb_others(const struct cpumask *cpumask, diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c @@ -11,40 +11,62 @@ #include <linux/uaccess.h> #include <linux/io.h> -/** - * copy_oldmem_page - copy one page from "oldmem" - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. We stitch up a pte, similar to kmap_atomic. 
- */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf, + bool encrypted) { void *vaddr; if (!csize) return 0; - vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE); + if (encrypted) + vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE); + else + vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE); + if (!vaddr) return -ENOMEM; if (userbuf) { - if (copy_to_user(buf, vaddr + offset, csize)) { - iounmap(vaddr); + if (copy_to_user((void __user *)buf, vaddr + offset, csize)) { + iounmap((void __iomem *)vaddr); return -EFAULT; } } else memcpy(buf, vaddr + offset, csize); set_iounmap_nonlazy(); - iounmap(vaddr); + iounmap((void __iomem *)vaddr); return csize; } + +/** + * copy_oldmem_page - copy one page of memory + * @pfn: page frame number to be copied + * @buf: target memory address for the copy; this can be in kernel address + * space or user address space (see @userbuf) + * @csize: number of bytes to copy + * @offset: offset in bytes into the page (based on pfn) to begin the copy + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * + * Copy a page from the old kernel's memory. For this page, there is no pte + * mapped in the current kernel. We stitch up a pte, similar to kmap_atomic. + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf) +{ + return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false); +} + +/** + * copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the + * memory with the encryption mask set to accomodate kdump on SME-enabled + * machines. 
+ */ +ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf) +{ + return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true); +} diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c @@ -273,7 +273,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) map_ldt_struct_to_user(mm); va = (unsigned long)ldt_slot_va(slot); - flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); + flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT, false); ldt->slot = slot; return 0; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c @@ -199,7 +199,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) pte_unmap_unlock(pte, ptl); out: up_write(&mm->mmap_sem); - flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL); + flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false); } diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c @@ -19,7 +19,9 @@ #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/highmem.h> +#include <linux/pci.h> +#include <asm/e820/types.h> #include <asm/pgtable.h> /* @@ -241,6 +243,29 @@ static unsigned long normalize_addr(unsigned long u) return (signed long)(u << shift) >> shift; } +static void note_wx(struct pg_state *st) +{ + unsigned long npages; + + npages = (st->current_address - st->start_address) / PAGE_SIZE; + +#ifdef CONFIG_PCI_BIOS + /* + * If PCI BIOS is enabled, the PCI BIOS area is forced to WX. + * Inform about it, but avoid the warning. 
+ */ + if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN && + st->current_address <= PAGE_OFFSET + BIOS_END) { + pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages); + return; + } +#endif + /* Account the WX pages */ + st->wx_pages += npages; + WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %pS\n", + (void *)st->start_address); +} + /* * This function gets called on a break in a continuous series * of PTE entries; the next one is different so we need to @@ -276,14 +301,8 @@ static void note_page(struct seq_file *m, struct pg_state *st, unsigned long delta; int width = sizeof(unsigned long) * 2; - if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) { - WARN_ONCE(1, - "x86/mm: Found insecure W+X mapping at address %p/%pS\n", - (void *)st->start_address, - (void *)st->start_address); - st->wx_pages += (st->current_address - - st->start_address) / PAGE_SIZE; - } + if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) + note_wx(st); /* * Now print the actual finished series diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c @@ -851,6 +851,15 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, show_opcodes(regs, loglvl); } +/* + * The (legacy) vsyscall page is the long page in the kernel portion + * of the address space that has user-accessible permissions. + */ +static bool is_vsyscall_vaddr(unsigned long vaddr) +{ + return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR); +} + static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, unsigned long address, u32 *pkey, int si_code) @@ -874,18 +883,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (is_errata100(regs, address)) return; -#ifdef CONFIG_X86_64 - /* - * Instruction fetch faults in the vsyscall page might need - * emulation. 
- */ - if (unlikely((error_code & X86_PF_INSTR) && - ((address & ~0xfff) == VSYSCALL_ADDR))) { - if (emulate_vsyscall(regs, address)) - return; - } -#endif - /* * To avoid leaking information about the kernel page table * layout, pretend that user-mode accesses to kernel addresses @@ -1043,19 +1040,13 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, } } -static int spurious_fault_check(unsigned long error_code, pte_t *pte) +static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) return 0; if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) return 0; - /* - * Note: We do not do lazy flushing on protection key - * changes, so no spurious fault will ever set X86_PF_PK. - */ - if ((error_code & X86_PF_PK)) - return 1; return 1; } @@ -1082,7 +1073,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) * (Optional Invalidation). */ static noinline int -spurious_fault(unsigned long error_code, unsigned long address) +spurious_kernel_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; p4d_t *p4d; @@ -1113,27 +1104,27 @@ spurious_fault(unsigned long error_code, unsigned long address) return 0; if (p4d_large(*p4d)) - return spurious_fault_check(error_code, (pte_t *) p4d); + return spurious_kernel_fault_check(error_code, (pte_t *) p4d); pud = pud_offset(p4d, address); if (!pud_present(*pud)) return 0; if (pud_large(*pud)) - return spurious_fault_check(error_code, (pte_t *) pud); + return spurious_kernel_fault_check(error_code, (pte_t *) pud); pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return 0; if (pmd_large(*pmd)) - return spurious_fault_check(error_code, (pte_t *) pmd); + return spurious_kernel_fault_check(error_code, (pte_t *) pmd); pte = pte_offset_kernel(pmd, address); if (!pte_present(*pte)) return 0; - ret = spurious_fault_check(error_code, pte); + ret = spurious_kernel_fault_check(error_code, pte); if (!ret) return 0; @@ 
-1141,12 +1132,12 @@ spurious_fault(unsigned long error_code, unsigned long address) * Make sure we have permissions in PMD. * If not, then there's a bug in the page tables: */ - ret = spurious_fault_check(error_code, (pte_t *) pmd); + ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd); WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); return ret; } -NOKPROBE_SYMBOL(spurious_fault); +NOKPROBE_SYMBOL(spurious_kernel_fault); int show_unhandled_signals = 1; @@ -1193,6 +1184,14 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) static int fault_in_kernel_space(unsigned long address) { + /* + * On 64-bit systems, the vsyscall page is at an address above + * TASK_SIZE_MAX, but is not considered part of the kernel + * address space. + */ + if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address)) + return false; + return address >= TASK_SIZE_MAX; } @@ -1214,31 +1213,23 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) } /* - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. + * Called for all faults where 'address' is part of the kernel address + * space. Might get called for faults that originate from *code* that + * ran in userspace or the kernel. */ -static noinline void -__do_page_fault(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +static void +do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, + unsigned long address) { - struct vm_area_struct *vma; - struct task_struct *tsk; - struct mm_struct *mm; - vm_fault_t fault, major = 0; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; - u32 pkey; - - tsk = current; - mm = tsk->mm; - - prefetchw(&mm->mmap_sem); - - if (unlikely(kmmio_fault(regs, address))) - return; + /* + * Protection keys exceptions only happen on user pages. 
We + * have no user pages in the kernel portion of the address + * space, so do not expect them here. + */ + WARN_ON_ONCE(hw_error_code & X86_PF_PK); /* - * We fault-in kernel-space virtual memory on-demand. The + * We can fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may @@ -1246,41 +1237,74 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * only copy the information from the master page table, * nothing more. * - * This verifies that the fault happens in kernel space - * (error_code & 4) == 0, and that the fault was not a - * protection error (error_code & 9) == 0. + * Before doing this on-demand faulting, ensure that the + * fault is not any of the following: + * 1. A fault on a PTE with a reserved bit set. + * 2. A fault caused by a user-mode access. (Do not demand- + * fault kernel memory due to user-mode accesses). + * 3. A fault caused by a page-level protection violation. + * (A demand fault would be on a non-present page which + * would have X86_PF_PROT==0). */ - if (unlikely(fault_in_kernel_space(address))) { - if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { - if (vmalloc_fault(address) >= 0) - return; - } - - /* Can handle a stale RO->RW TLB: */ - if (spurious_fault(error_code, address)) + if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { + if (vmalloc_fault(address) >= 0) return; + } - /* kprobes don't want to hook the spurious faults: */ - if (kprobes_fault(regs)) - return; - /* - * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock: - */ - bad_area_nosemaphore(regs, error_code, address, NULL); + /* Was the fault spurious, caused by lazy TLB invalidation? 
*/ + if (spurious_kernel_fault(hw_error_code, address)) + return; + /* kprobes don't want to hook the spurious faults: */ + if (kprobes_fault(regs)) return; - } + + /* + * Note, despite being a "bad area", there are quite a few + * acceptable reasons to get here, such as erratum fixups + * and handling kernel code that can fault, like get_user(). + * + * Don't take the mm semaphore here. If we fixup a prefetch + * fault we could otherwise deadlock: + */ + bad_area_nosemaphore(regs, hw_error_code, address, NULL); +} +NOKPROBE_SYMBOL(do_kern_addr_fault); + +/* Handle faults in the user portion of the address space */ +static inline +void do_user_addr_fault(struct pt_regs *regs, + unsigned long hw_error_code, + unsigned long address) +{ + unsigned long sw_error_code; + struct vm_area_struct *vma; + struct task_struct *tsk; + struct mm_struct *mm; + vm_fault_t fault, major = 0; + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + u32 pkey; + + tsk = current; + mm = tsk->mm; /* kprobes don't want to hook the spurious faults: */ if (unlikely(kprobes_fault(regs))) return; - if (unlikely(error_code & X86_PF_RSVD)) - pgtable_bad(regs, error_code, address); + /* + * Reserved bits are never expected to be set on + * entries in the user portion of the page tables. + */ + if (unlikely(hw_error_code & X86_PF_RSVD)) + pgtable_bad(regs, hw_error_code, address); - if (unlikely(smap_violation(error_code, regs))) { - bad_area_nosemaphore(regs, error_code, address, NULL); + /* + * Check for invalid kernel (supervisor) access to user + * pages in the user address space. 
+ */ + if (unlikely(smap_violation(hw_error_code, regs))) { + bad_area_nosemaphore(regs, hw_error_code, address, NULL); return; } @@ -1289,11 +1313,18 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * in a region with pagefaults disabled then we must not take the fault */ if (unlikely(faulthandler_disabled() || !mm)) { - bad_area_nosemaphore(regs, error_code, address, NULL); + bad_area_nosemaphore(regs, hw_error_code, address, NULL); return; } /* + * hw_error_code is literally the "page fault error code" passed to + * the kernel directly from the hardware. But, we will shortly be + * modifying it in software, so give it a new name. + */ + sw_error_code = hw_error_code; + + /* * It's safe to allow irq's after cr2 has been saved and the * vmalloc fault has been handled. * @@ -1302,7 +1333,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, */ if (user_mode(regs)) { local_irq_enable(); - error_code |= X86_PF_USER; + /* + * Up to this point, X86_PF_USER set in hw_error_code + * indicated a user-mode access. But, after this, + * X86_PF_USER in sw_error_code will indicate either + * that, *or* an implicit kernel(supervisor)-mode access + * which originated from user mode. + */ + if (!(hw_error_code & X86_PF_USER)) { + /* + * The CPU was in user mode, but the CPU says + * the fault was not a user-mode access. + * Must be an implicit kernel-mode access, + * which we do not expect to happen in the + * user address space. 
+ */ + pr_warn_once("kernel-mode error from user-mode: %lx\n", + hw_error_code); + + sw_error_code |= X86_PF_USER; + } flags |= FAULT_FLAG_USER; } else { if (regs->flags & X86_EFLAGS_IF) @@ -1311,31 +1361,49 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - if (error_code & X86_PF_WRITE) + if (sw_error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; - if (error_code & X86_PF_INSTR) + if (sw_error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; +#ifdef CONFIG_X86_64 /* - * When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in - * the kernel and should generate an OOPS. Unfortunately, in the - * case of an erroneous fault occurring in a code path which already - * holds mmap_sem we will deadlock attempting to validate the fault - * against the address space. Luckily the kernel only validly - * references user space from well defined areas of code, which are - * listed in the exceptions table. + * Instruction fetch faults in the vsyscall page might need + * emulation. The vsyscall page is at a high address + * (>PAGE_OFFSET), but is considered to be part of the user + * address space. * - * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a - * deadlock. Attempt to lock the address space, if we cannot we then - * validate the source. If this is invalid we can skip the address - * space check, thus avoiding the deadlock: + * The vsyscall page does not have a "real" VMA, so do this + * emulation before we go searching for VMAs. + */ + if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) { + if (emulate_vsyscall(regs, address)) + return; + } +#endif + + /* + * Kernel-mode access to the user address space should only occur + * on well-defined single instructions listed in the exception + * tables. 
But, an erroneous kernel fault occurring outside one of + * those areas which also holds mmap_sem might deadlock attempting + * to validate the fault against the address space. + * + * Only do the expensive exception table search when we might be at + * risk of a deadlock. This happens if we + * 1. Failed to acquire mmap_sem, and + * 2. The access did not originate in userspace. Note: either the + * hardware or earlier page fault code may set X86_PF_USER + * in sw_error_code. */ if (unlikely(!down_read_trylock(&mm->mmap_sem))) { - if (!(error_code & X86_PF_USER) && + if (!(sw_error_code & X86_PF_USER) && !search_exception_tables(regs->ip)) { - bad_area_nosemaphore(regs, error_code, address, NULL); + /* + * Fault from code in kernel from + * which we do not expect faults. + */ + bad_area_nosemaphore(regs, sw_error_code, address, NULL); return; } retry: @@ -1351,16 +1419,16 @@ retry: vma = find_vma(mm, address); if (unlikely(!vma)) { - bad_area(regs, error_code, address); + bad_area(regs, sw_error_code, address); return; } if (likely(vma->vm_start <= address)) goto good_area; if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - bad_area(regs, error_code, address); + bad_area(regs, sw_error_code, address); return; } - if (error_code & X86_PF_USER) { + if (sw_error_code & X86_PF_USER) { /* * Accessing the stack below %sp is always a bug. * The large cushion allows instructions like enter @@ -1368,12 +1436,12 @@ retry: * 32 pointers and then decrements %sp by 65535.) */ if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { - bad_area(regs, error_code, address); + bad_area(regs, sw_error_code, address); return; } } if (unlikely(expand_stack(vma, address))) { - bad_area(regs, error_code, address); + bad_area(regs, sw_error_code, address); return; } @@ -1382,8 +1450,8 @@ retry: * we can handle it.. 
*/ good_area: - if (unlikely(access_error(error_code, vma))) { - bad_area_access_error(regs, error_code, address, vma); + if (unlikely(access_error(sw_error_code, vma))) { + bad_area_access_error(regs, sw_error_code, address, vma); return; } @@ -1425,13 +1493,13 @@ good_area: return; /* Not returning to user mode? Handle exceptions or die: */ - no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); + no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR); return; } up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, error_code, address, &pkey, fault); + mm_fault_error(regs, sw_error_code, address, &pkey, fault); return; } @@ -1449,6 +1517,28 @@ good_area: check_v8086_mode(regs, address, tsk); } +NOKPROBE_SYMBOL(do_user_addr_fault); + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. + */ +static noinline void +__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, + unsigned long address) +{ + prefetchw(&current->mm->mmap_sem); + + if (unlikely(kmmio_fault(regs, address))) + return; + + /* Was the fault on kernel-controlled part of the address space? 
*/ + if (unlikely(fault_in_kernel_space(address))) + do_kern_addr_fault(regs, hw_error_code, address); + else + do_user_addr_fault(regs, hw_error_code, address); +} NOKPROBE_SYMBOL(__do_page_fault); static nokprobe_inline void diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c @@ -923,34 +923,19 @@ static void mark_nxdata_nx(void) void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_text); - unsigned long size = PFN_ALIGN(_etext) - start; + unsigned long size = (unsigned long)__end_rodata - start; set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); - printk(KERN_INFO "Write protecting the kernel text: %luk\n", + pr_info("Write protecting kernel text and read-only data: %luk\n", size >> 10); kernel_set_to_readonly = 1; #ifdef CONFIG_CPA_DEBUG - printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", - start, start+size); - set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); - - printk(KERN_INFO "Testing CPA: write protecting again\n"); - set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); -#endif - - start += size; - size = (unsigned long)__end_rodata - start; - set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); - printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", - size >> 10); - -#ifdef CONFIG_CPA_DEBUG - printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size); + pr_info("Testing CPA: Reverting %lx-%lx\n", start, start + size); set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); - printk(KERN_INFO "Testing CPA: write protecting again\n"); + pr_info("Testing CPA: write protecting again\n"); set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); #endif mark_nxdata_nx(); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c @@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size, * caller shouldn't need to know that small detail. 
*/ static void __iomem *__ioremap_caller(resource_size_t phys_addr, - unsigned long size, enum page_cache_mode pcm, void *caller) + unsigned long size, enum page_cache_mode pcm, + void *caller, bool encrypted) { unsigned long offset, vaddr; resource_size_t last_addr; @@ -199,7 +200,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * resulting mapping. */ prot = PAGE_KERNEL_IO; - if (sev_active() && mem_flags.desc_other) + if ((sev_active() && mem_flags.desc_other) || encrypted) prot = pgprot_encrypted(prot); switch (pcm) { @@ -291,7 +292,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS; return __ioremap_caller(phys_addr, size, pcm, - __builtin_return_address(0)); + __builtin_return_address(0), false); } EXPORT_SYMBOL(ioremap_nocache); @@ -324,7 +325,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size) enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC; return __ioremap_caller(phys_addr, size, pcm, - __builtin_return_address(0)); + __builtin_return_address(0), false); } EXPORT_SYMBOL_GPL(ioremap_uc); @@ -341,7 +342,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc); void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) { return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, - __builtin_return_address(0)); + __builtin_return_address(0), false); } EXPORT_SYMBOL(ioremap_wc); @@ -358,14 +359,21 @@ EXPORT_SYMBOL(ioremap_wc); void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size) { return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT, - __builtin_return_address(0)); + __builtin_return_address(0), false); } EXPORT_SYMBOL(ioremap_wt); +void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size) +{ + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB, + __builtin_return_address(0), true); +} +EXPORT_SYMBOL(ioremap_encrypted); + void __iomem *ioremap_cache(resource_size_t phys_addr, 
unsigned long size) { return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB, - __builtin_return_address(0)); + __builtin_return_address(0), false); } EXPORT_SYMBOL(ioremap_cache); @@ -374,7 +382,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, { return __ioremap_caller(phys_addr, size, pgprot2cachemode(__pgprot(prot_val)), - __builtin_return_address(0)); + __builtin_return_address(0), false); } EXPORT_SYMBOL(ioremap_prot); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c @@ -37,11 +37,20 @@ struct cpa_data { unsigned long numpages; int flags; unsigned long pfn; - unsigned force_split : 1; + unsigned force_split : 1, + force_static_prot : 1; int curpage; struct page **pages; }; +enum cpa_warn { + CPA_CONFLICT, + CPA_PROTECT, + CPA_DETECT, +}; + +static const int cpa_warn_level = CPA_PROTECT; + /* * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) * using cpa_lock. So that we don't allow any other cpu, with stale large tlb @@ -94,6 +103,87 @@ void arch_report_meminfo(struct seq_file *m) static inline void split_page_count(int level) { } #endif +#ifdef CONFIG_X86_CPA_STATISTICS + +static unsigned long cpa_1g_checked; +static unsigned long cpa_1g_sameprot; +static unsigned long cpa_1g_preserved; +static unsigned long cpa_2m_checked; +static unsigned long cpa_2m_sameprot; +static unsigned long cpa_2m_preserved; +static unsigned long cpa_4k_install; + +static inline void cpa_inc_1g_checked(void) +{ + cpa_1g_checked++; +} + +static inline void cpa_inc_2m_checked(void) +{ + cpa_2m_checked++; +} + +static inline void cpa_inc_4k_install(void) +{ + cpa_4k_install++; +} + +static inline void cpa_inc_lp_sameprot(int level) +{ + if (level == PG_LEVEL_1G) + cpa_1g_sameprot++; + else + cpa_2m_sameprot++; +} + +static inline void cpa_inc_lp_preserved(int level) +{ + if (level == PG_LEVEL_1G) + cpa_1g_preserved++; + else + cpa_2m_preserved++; +} + +static int cpastats_show(struct seq_file *m, void *p) +{ + 
seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked); + seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot); + seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved); + seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked); + seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot); + seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved); + seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install); + return 0; +} + +static int cpastats_open(struct inode *inode, struct file *file) +{ + return single_open(file, cpastats_show, NULL); +} + +static const struct file_operations cpastats_fops = { + .open = cpastats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init cpa_stats_init(void) +{ + debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL, + &cpastats_fops); + return 0; +} +late_initcall(cpa_stats_init); +#else +static inline void cpa_inc_1g_checked(void) { } +static inline void cpa_inc_2m_checked(void) { } +static inline void cpa_inc_4k_install(void) { } +static inline void cpa_inc_lp_sameprot(int level) { } +static inline void cpa_inc_lp_preserved(int level) { } +#endif + + static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -195,14 +285,20 @@ static void cpa_flush_all(unsigned long cache) on_each_cpu(__cpa_flush_all, (void *) cache, 1); } -static void __cpa_flush_range(void *arg) +static bool __cpa_flush_range(unsigned long start, int numpages, int cache) { - /* - * We could optimize that further and do individual per page - * tlb invalidates for a low number of pages. Caveat: we must - * flush the high aliases on 64bit as well. 
- */ - __flush_tlb_all(); + BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); + + WARN_ON(PAGE_ALIGN(start) != start); + + if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { + cpa_flush_all(cache); + return true; + } + + flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); + + return !cache; } static void cpa_flush_range(unsigned long start, int numpages, int cache) @@ -210,12 +306,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) unsigned int i, level; unsigned long addr; - BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); - WARN_ON(PAGE_ALIGN(start) != start); - - on_each_cpu(__cpa_flush_range, NULL, 1); - - if (!cache) + if (__cpa_flush_range(start, numpages, cache)) return; /* @@ -235,30 +326,13 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) } } -static void cpa_flush_array(unsigned long *start, int numpages, int cache, +static void cpa_flush_array(unsigned long baddr, unsigned long *start, + int numpages, int cache, int in_flags, struct page **pages) { unsigned int i, level; -#ifdef CONFIG_PREEMPT - /* - * Avoid wbinvd() because it causes latencies on all CPUs, - * regardless of any CPU isolation that may be in effect. - * - * This should be extended for CAT enabled systems independent of - * PREEMPT because wbinvd() does not respect the CAT partitions and - * this is exposed to unpriviledged users through the graphics - * subsystem. - */ - unsigned long do_wbinvd = 0; -#else - unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */ -#endif - - BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); - on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1); - - if (!cache || do_wbinvd) + if (__cpa_flush_range(baddr, numpages, cache)) return; /* @@ -286,84 +360,179 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache, } } -/* - * Certain areas of memory on x86 require very specific protection flags, - * for example the BIOS area or kernel text. 
Callers don't always get this - * right (again, ioremap() on BIOS memory is not uncommon) so this function - * checks and fixes these known static required protection bits. - */ -static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, - unsigned long pfn) +static bool overlaps(unsigned long r1_start, unsigned long r1_end, + unsigned long r2_start, unsigned long r2_end) { - pgprot_t forbidden = __pgprot(0); + return (r1_start <= r2_end && r1_end >= r2_start) || + (r2_start <= r1_end && r2_end >= r1_start); +} - /* - * The BIOS area between 640k and 1Mb needs to be executable for - * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. - */ #ifdef CONFIG_PCI_BIOS - if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) - pgprot_val(forbidden) |= _PAGE_NX; +/* + * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS + * based config access (CONFIG_PCI_GOBIOS) support. + */ +#define BIOS_PFN PFN_DOWN(BIOS_BEGIN) +#define BIOS_PFN_END PFN_DOWN(BIOS_END - 1) + +static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) +{ + if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END)) + return _PAGE_NX; + return 0; +} +#else +static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) +{ + return 0; +} #endif - /* - * The kernel text needs to be executable for obvious reasons - * Does not cover __inittext since that is gone later on. On - * 64bit we do not enforce !NX on the low mapping - */ - if (within(address, (unsigned long)_text, (unsigned long)_etext)) - pgprot_val(forbidden) |= _PAGE_NX; +/* + * The .rodata section needs to be read-only. Using the pfn catches all + * aliases. This also includes __ro_after_init, so do not enforce until + * kernel_set_to_readonly is true. 
+ */ +static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn) +{ + unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata)); /* - * The .rodata section needs to be read-only. Using the pfn - * catches all aliases. This also includes __ro_after_init, - * so do not enforce until kernel_set_to_readonly is true. + * Note: __end_rodata is at page aligned and not inclusive, so + * subtract 1 to get the last enforced PFN in the rodata area. */ - if (kernel_set_to_readonly && - within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, - __pa_symbol(__end_rodata) >> PAGE_SHIFT)) - pgprot_val(forbidden) |= _PAGE_RW; + epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1; + + if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro)) + return _PAGE_RW; + return 0; +} + +/* + * Protect kernel text against becoming non executable by forbidding + * _PAGE_NX. This protects only the high kernel mapping (_text -> _etext) + * out of which the kernel actually executes. Do not protect the low + * mapping. + * + * This does not cover __inittext since that is gone after boot. + */ +static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end) +{ + unsigned long t_end = (unsigned long)_etext - 1; + unsigned long t_start = (unsigned long)_text; + + if (overlaps(start, end, t_start, t_end)) + return _PAGE_NX; + return 0; +} #if defined(CONFIG_X86_64) +/* + * Once the kernel maps the text as RO (kernel_set_to_readonly is set), + * kernel text mappings for the large page aligned text, rodata sections + * will be always read-only. For the kernel identity mappings covering the + * holes caused by this alignment can be anything that user asks. + * + * This will preserve the large page mappings for kernel text/data at no + * extra cost. 
+ */ +static pgprotval_t protect_kernel_text_ro(unsigned long start, + unsigned long end) +{ + unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1; + unsigned long t_start = (unsigned long)_text; + unsigned int level; + + if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end)) + return 0; /* - * Once the kernel maps the text as RO (kernel_set_to_readonly is set), - * kernel text mappings for the large page aligned text, rodata sections - * will be always read-only. For the kernel identity mappings covering - * the holes caused by this alignment can be anything that user asks. + * Don't enforce the !RW mapping for the kernel text mapping, if + * the current mapping is already using small page mapping. No + * need to work hard to preserve large page mappings in this case. * - * This will preserve the large page mappings for kernel text/data - * at no extra cost. + * This also fixes the Linux Xen paravirt guest boot failure caused + * by unexpected read-only mappings for kernel identity + * mappings. In this paravirt guest case, the kernel text mapping + * and the kernel identity mapping share the same page-table pages, + * so the protections for kernel text and identity mappings have to + * be the same. */ - if (kernel_set_to_readonly && - within(address, (unsigned long)_text, - (unsigned long)__end_rodata_hpage_align)) { - unsigned int level; - - /* - * Don't enforce the !RW mapping for the kernel text mapping, - * if the current mapping is already using small page mapping. - * No need to work hard to preserve large page mappings in this - * case. - * - * This also fixes the Linux Xen paravirt guest boot failure - * (because of unexpected read-only mappings for kernel identity - * mappings). In this paravirt guest case, the kernel text - * mapping and the kernel identity mapping share the same - * page-table pages. Thus we can't really use different - * protections for the kernel text and identity mappings. 
Also, - * these shared mappings are made of small page mappings. - * Thus this don't enforce !RW mapping for small page kernel - * text mapping logic will help Linux Xen parvirt guest boot - * as well. - */ - if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) - pgprot_val(forbidden) |= _PAGE_RW; - } + if (lookup_address(start, &level) && (level != PG_LEVEL_4K)) + return _PAGE_RW; + return 0; +} +#else +static pgprotval_t protect_kernel_text_ro(unsigned long start, + unsigned long end) +{ + return 0; +} #endif - prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); +static inline bool conflicts(pgprot_t prot, pgprotval_t val) +{ + return (pgprot_val(prot) & ~val) != pgprot_val(prot); +} - return prot; +static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val, + unsigned long start, unsigned long end, + unsigned long pfn, const char *txt) +{ + static const char *lvltxt[] = { + [CPA_CONFLICT] = "conflict", + [CPA_PROTECT] = "protect", + [CPA_DETECT] = "detect", + }; + + if (warnlvl > cpa_warn_level || !conflicts(prot, val)) + return; + + pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n", + lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot), + (unsigned long long)val); +} + +/* + * Certain areas of memory on x86 require very specific protection flags, + * for example the BIOS area or kernel text. Callers don't always get this + * right (again, ioremap() on BIOS memory is not uncommon) so this function + * checks and fixes these known static required protection bits. + */ +static inline pgprot_t static_protections(pgprot_t prot, unsigned long start, + unsigned long pfn, unsigned long npg, + int warnlvl) +{ + pgprotval_t forbidden, res; + unsigned long end; + + /* + * There is no point in checking RW/NX conflicts when the requested + * mapping is setting the page !PRESENT. 
+ */ + if (!(pgprot_val(prot) & _PAGE_PRESENT)) + return prot; + + /* Operate on the virtual address */ + end = start + npg * PAGE_SIZE - 1; + + res = protect_kernel_text(start, end); + check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX"); + forbidden = res; + + res = protect_kernel_text_ro(start, end); + check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO"); + forbidden |= res; + + /* Check the PFN directly */ + res = protect_pci_bios(pfn, pfn + npg - 1); + check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX"); + forbidden |= res; + + res = protect_rodata(pfn, pfn + npg - 1); + check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO"); + forbidden |= res; + + return __pgprot(pgprot_val(prot) & ~forbidden); } /* @@ -421,18 +590,18 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, */ pte_t *lookup_address(unsigned long address, unsigned int *level) { - return lookup_address_in_pgd(pgd_offset_k(address), address, level); + return lookup_address_in_pgd(pgd_offset_k(address), address, level); } EXPORT_SYMBOL_GPL(lookup_address); static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, unsigned int *level) { - if (cpa->pgd) + if (cpa->pgd) return lookup_address_in_pgd(cpa->pgd + pgd_index(address), address, level); - return lookup_address(address, level); + return lookup_address(address, level); } /* @@ -549,40 +718,35 @@ static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot) return prot; } -static int -try_preserve_large_page(pte_t *kpte, unsigned long address, - struct cpa_data *cpa) +static int __should_split_large_page(pte_t *kpte, unsigned long address, + struct cpa_data *cpa) { - unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn; + unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn; + pgprot_t old_prot, new_prot, req_prot, chk_prot; pte_t new_pte, old_pte, *tmp; - pgprot_t old_prot, new_prot, req_prot; - int i, do_split = 1; enum pg_level level; - if 
(cpa->force_split) - return 1; - - spin_lock(&pgd_lock); /* * Check for races, another CPU might have split this page * up already: */ tmp = _lookup_address_cpa(cpa, address, &level); if (tmp != kpte) - goto out_unlock; + return 1; switch (level) { case PG_LEVEL_2M: old_prot = pmd_pgprot(*(pmd_t *)kpte); old_pfn = pmd_pfn(*(pmd_t *)kpte); + cpa_inc_2m_checked(); break; case PG_LEVEL_1G: old_prot = pud_pgprot(*(pud_t *)kpte); old_pfn = pud_pfn(*(pud_t *)kpte); + cpa_inc_1g_checked(); break; default: - do_split = -EINVAL; - goto out_unlock; + return -EINVAL; } psize = page_level_size(level); @@ -592,8 +756,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * Calculate the number of pages, which fit into this large * page starting at address: */ - nextpage_addr = (address + psize) & pmask; - numpages = (nextpage_addr - address) >> PAGE_SHIFT; + lpaddr = (address + psize) & pmask; + numpages = (lpaddr - address) >> PAGE_SHIFT; if (numpages < cpa->numpages) cpa->numpages = numpages; @@ -620,71 +784,142 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, pgprot_val(req_prot) |= _PAGE_PSE; /* - * old_pfn points to the large page base pfn. So we need - * to add the offset of the virtual address: + * old_pfn points to the large page base pfn. So we need to add the + * offset of the virtual address: */ pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); cpa->pfn = pfn; - new_prot = static_protections(req_prot, address, pfn); + /* + * Calculate the large page base address and the number of 4K pages + * in the large page + */ + lpaddr = address & pmask; + numpages = psize >> PAGE_SHIFT; /* - * We need to check the full range, whether - * static_protection() requires a different pgprot for one of - * the pages in the range we try to preserve: + * Sanity check that the existing mapping is correct versus the static + * protections. static_protections() guards against !PRESENT, so no + * extra conditional required here. 
*/ - addr = address & pmask; - pfn = old_pfn; - for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { - pgprot_t chk_prot = static_protections(req_prot, addr, pfn); + chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages, + CPA_CONFLICT); - if (pgprot_val(chk_prot) != pgprot_val(new_prot)) - goto out_unlock; + if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) { + /* + * Split the large page and tell the split code to + * enforce static protections. + */ + cpa->force_static_prot = 1; + return 1; } /* - * If there are no changes, return. maxpages has been updated - * above: + * Optimization: If the requested pgprot is the same as the current + * pgprot, then the large page can be preserved and no updates are + * required independent of alignment and length of the requested + * range. The above already established that the current pgprot is + * correct, which in consequence makes the requested pgprot correct + * as well if it is the same. The static protection scan below will + * not come to a different conclusion. */ - if (pgprot_val(new_prot) == pgprot_val(old_prot)) { - do_split = 0; - goto out_unlock; + if (pgprot_val(req_prot) == pgprot_val(old_prot)) { + cpa_inc_lp_sameprot(level); + return 0; } /* - * We need to change the attributes. Check, whether we can - * change the large page in one go. We request a split, when - * the address is not aligned and the number of pages is - * smaller than the number of pages in the large page. Note - * that we limited the number of possible pages already to - * the number of pages in the large page. + * If the requested range does not cover the full page, split it up */ - if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { - /* - * The address is aligned and the number of pages - * covers the full page. 
- */ - new_pte = pfn_pte(old_pfn, new_prot); - __set_pmd_pte(kpte, address, new_pte); - cpa->flags |= CPA_FLUSHTLB; - do_split = 0; - } + if (address != lpaddr || cpa->numpages != numpages) + return 1; -out_unlock: + /* + * Check whether the requested pgprot is conflicting with a static + * protection requirement in the large page. + */ + new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages, + CPA_DETECT); + + /* + * If there is a conflict, split the large page. + * + * There used to be a 4k wise evaluation trying really hard to + * preserve the large pages, but experimentation has shown, that this + * does not help at all. There might be corner cases which would + * preserve one large page occasionally, but it's really not worth the + * extra code and cycles for the common case. + */ + if (pgprot_val(req_prot) != pgprot_val(new_prot)) + return 1; + + /* All checks passed. Update the large page mapping. */ + new_pte = pfn_pte(old_pfn, new_prot); + __set_pmd_pte(kpte, address, new_pte); + cpa->flags |= CPA_FLUSHTLB; + cpa_inc_lp_preserved(level); + return 0; +} + +static int should_split_large_page(pte_t *kpte, unsigned long address, + struct cpa_data *cpa) +{ + int do_split; + + if (cpa->force_split) + return 1; + + spin_lock(&pgd_lock); + do_split = __should_split_large_page(kpte, address, cpa); spin_unlock(&pgd_lock); return do_split; } +static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn, + pgprot_t ref_prot, unsigned long address, + unsigned long size) +{ + unsigned int npg = PFN_DOWN(size); + pgprot_t prot; + + /* + * If should_split_large_page() discovered an inconsistent mapping, + * remove the invalid protection in the split mapping. + */ + if (!cpa->force_static_prot) + goto set; + + prot = static_protections(ref_prot, address, pfn, npg, CPA_PROTECT); + + if (pgprot_val(prot) == pgprot_val(ref_prot)) + goto set; + + /* + * If this is splitting a PMD, fix it up. 
PUD splits cannot be + * fixed trivially as that would require to rescan the newly + * installed PMD mappings after returning from split_large_page() + * so an eventual further split can allocate the necessary PTE + * pages. Warn for now and revisit it in case this actually + * happens. + */ + if (size == PAGE_SIZE) + ref_prot = prot; + else + pr_warn_once("CPA: Cannot fixup static protections for PUD split\n"); +set: + set_pte(pte, pfn_pte(pfn, ref_prot)); +} + static int __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, struct page *base) { + unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1; pte_t *pbase = (pte_t *)page_address(base); - unsigned long ref_pfn, pfn, pfninc = 1; unsigned int i, level; - pte_t *tmp; pgprot_t ref_prot; + pte_t *tmp; spin_lock(&pgd_lock); /* @@ -707,15 +942,17 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, * PAT bit to correct position. */ ref_prot = pgprot_large_2_4k(ref_prot); - ref_pfn = pmd_pfn(*(pmd_t *)kpte); + lpaddr = address & PMD_MASK; + lpinc = PAGE_SIZE; break; case PG_LEVEL_1G: ref_prot = pud_pgprot(*(pud_t *)kpte); ref_pfn = pud_pfn(*(pud_t *)kpte); pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; - + lpaddr = address & PUD_MASK; + lpinc = PMD_SIZE; /* * Clear the PSE flags if the PRESENT flag is not set * otherwise pmd_present/pmd_huge will return true @@ -736,8 +973,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, * Get the target pfn from the original entry: */ pfn = ref_pfn; - for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) - set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); + for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc) + split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc); if (virt_addr_valid(address)) { unsigned long pfn = PFN_DOWN(__pa(address)); @@ -756,14 +993,24 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, __set_pmd_pte(kpte, address, mk_pte(base, 
__pgprot(_KERNPG_TABLE))); /* - * Intel Atom errata AAH41 workaround. + * Do a global flush tlb after splitting the large page + * and before we do the actual change page attribute in the PTE. + * + * Without this, we violate the TLB application note, that says: + * "The TLBs may contain both ordinary and large-page + * translations for a 4-KByte range of linear addresses. This + * may occur if software modifies the paging structures so that + * the page size used for the address range changes. If the two + * translations differ with respect to page frame or attributes + * (e.g., permissions), processor behavior is undefined and may + * be implementation-specific." * - * The real fix should be in hw or in a microcode update, but - * we also probabilistically try to reduce the window of having - * a large TLB mixed with 4K TLBs while instruction fetches are - * going on. + * We do this global tlb flush inside the cpa_lock, so that we + * don't allow any other cpu, with stale tlb entries change the + * page attribute in parallel, that also falls into the + * just split large page entry. */ - __flush_tlb_all(); + flush_tlb_all(); spin_unlock(&pgd_lock); return 0; @@ -1247,7 +1494,9 @@ repeat: pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); - new_prot = static_protections(new_prot, address, pfn); + cpa_inc_4k_install(); + new_prot = static_protections(new_prot, address, pfn, 1, + CPA_PROTECT); new_prot = pgprot_clear_protnone_bits(new_prot); @@ -1273,7 +1522,7 @@ repeat: * Check, whether we can keep the large page intact * and just change the pte: */ - do_split = try_preserve_large_page(kpte, address, cpa); + do_split = should_split_large_page(kpte, address, cpa); /* * When the range fits into the existing large page, * return. 
cp->numpages and cpa->tlbflush have been updated in @@ -1286,28 +1535,8 @@ repeat: * We have to split the large page: */ err = split_large_page(cpa, kpte, address); - if (!err) { - /* - * Do a global flush tlb after splitting the large page - * and before we do the actual change page attribute in the PTE. - * - * With out this, we violate the TLB application note, that says - * "The TLBs may contain both ordinary and large-page - * translations for a 4-KByte range of linear addresses. This - * may occur if software modifies the paging structures so that - * the page size used for the address range changes. If the two - * translations differ with respect to page frame or attributes - * (e.g., permissions), processor behavior is undefined and may - * be implementation-specific." - * - * We do this global tlb flush inside the cpa_lock, so that we - * don't allow any other cpu, with stale tlb entries change the - * page attribute in parallel, that also falls into the - * just split large page entry. - */ - flush_tlb_all(); + if (!err) goto repeat; - } return err; } @@ -1529,19 +1758,19 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, cache = !!pgprot2cachemode(mask_set); /* - * On success we use CLFLUSH, when the CPU supports it to - * avoid the WBINVD. If the CPU does not support it and in the - * error case we fall back to cpa_flush_all (which uses - * WBINVD): + * On error; flush everything to be sure. 
*/ - if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) { - if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { - cpa_flush_array(addr, numpages, cache, - cpa.flags, pages); - } else - cpa_flush_range(baddr, numpages, cache); - } else + if (ret) { cpa_flush_all(cache); + goto out; + } + + if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { + cpa_flush_array(baddr, addr, numpages, cache, + cpa.flags, pages); + } else { + cpa_flush_range(baddr, numpages, cache); + } out: return ret; @@ -1856,10 +2085,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) /* * Before changing the encryption attribute, we need to flush caches. */ - if (static_cpu_has(X86_FEATURE_CLFLUSH)) - cpa_flush_range(start, numpages, 1); - else - cpa_flush_all(1); + cpa_flush_range(start, numpages, 1); ret = __change_page_attr_set_clr(&cpa, 1); @@ -1870,10 +2096,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) * in case TLB flushing gets optimized in the cpa_flush_range() * path use the same logic as above. */ - if (static_cpu_has(X86_FEATURE_CLFLUSH)) - cpa_flush_range(start, numpages, 0); - else - cpa_flush_all(0); + cpa_flush_range(start, numpages, 0); return ret; } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c @@ -185,8 +185,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, { struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy); unsigned cpu = smp_processor_id(); u64 next_tlb_gen; + bool need_flush; + u16 new_asid; /* * NB: The scheduler will call us with prev == next when switching @@ -240,20 +243,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, next->context.ctx_id); /* - * We don't currently support having a real mm loaded without - * our cpu set in mm_cpumask(). 
We have all the bookkeeping - * in place to figure out whether we would need to flush - * if our cpu were cleared in mm_cpumask(), but we don't - * currently use it. + * Even in lazy TLB mode, the CPU should stay set in the + * mm_cpumask. The TLB shootdown code can figure out from + * from cpu_tlbstate.is_lazy whether or not to send an IPI. */ if (WARN_ON_ONCE(real_prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); - return; + /* + * If the CPU is not in lazy TLB mode, we are just switching + * from one thread in a process to another thread in the same + * process. No TLB flush required. + */ + if (!was_lazy) + return; + + /* + * Read the tlb_gen to check whether a flush is needed. + * If the TLB is up to date, just use it. + * The barrier synchronizes with the tlb_gen increment in + * the TLB shootdown code. + */ + smp_mb(); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) == + next_tlb_gen) + return; + + /* + * TLB contents went out of date while we were in lazy + * mode. Fall through to the TLB switching code below. + */ + new_asid = prev_asid; + need_flush = true; } else { - u16 new_asid; - bool need_flush; u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); /* @@ -308,46 +332,48 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, /* Let nmi_uaccess_okay() know that we're changing CR3. */ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); barrier(); + } - if (need_flush) { - this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - load_new_mm_cr3(next->pgd, new_asid, true); - - /* - * NB: This gets called via leave_mm() in the idle path - * where RCU functions differently. Tracing normally - * uses RCU, so we need to use the _rcuidle variant. - * - * (There is no good reason for this. 
The idle code should - * be rearranged to call this before rcu_idle_enter().) - */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - } else { - /* The new ASID is already up to date. */ - load_new_mm_cr3(next->pgd, new_asid, false); - - /* See above wrt _rcuidle. */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); - } + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, new_asid, true); /* - * Record last user mm's context id, so we can avoid - * flushing branch buffer with IBPB if we switch back - * to the same user. + * NB: This gets called via leave_mm() in the idle path + * where RCU functions differently. Tracing normally + * uses RCU, so we need to use the _rcuidle variant. + * + * (There is no good reason for this. The idle code should + * be rearranged to call this before rcu_idle_enter().) */ - if (next != &init_mm) - this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); - - /* Make sure we write CR3 before loaded_mm. */ - barrier(); + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. */ + load_new_mm_cr3(next->pgd, new_asid, false); - this_cpu_write(cpu_tlbstate.loaded_mm, next); - this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + /* See above wrt _rcuidle. */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); } - load_mm_cr4(next); - switch_ldt(real_prev, next); + /* + * Record last user mm's context id, so we can avoid + * flushing branch buffer with IBPB if we switch back + * to the same user. + */ + if (next != &init_mm) + this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); + + /* Make sure we write CR3 before loaded_mm. 
*/ + barrier(); + + this_cpu_write(cpu_tlbstate.loaded_mm, next); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + + if (next != real_prev) { + load_mm_cr4(next); + switch_ldt(real_prev, next); + } } /* @@ -368,20 +394,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) return; - if (tlb_defer_switch_to_init_mm()) { - /* - * There's a significant optimization that may be possible - * here. We have accurate enough TLB flush tracking that we - * don't need to maintain coherence of TLB per se when we're - * lazy. We do, however, need to maintain coherence of - * paging-structure caches. We could, in principle, leave our - * old mm loaded and only switch to init_mm when - * tlb_remove_page() happens. - */ - this_cpu_write(cpu_tlbstate.is_lazy, true); - } else { - switch_mm(NULL, &init_mm, NULL); - } + this_cpu_write(cpu_tlbstate.is_lazy, true); } /* @@ -468,6 +481,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, * paging-structure cache to avoid speculatively reading * garbage into our TLB. Since switching to init_mm is barely * slower than a minimal flush, just switch to init_mm. + * + * This should be rare, with native_flush_tlb_others skipping + * IPIs to lazy TLB mode CPUs. 
*/ switch_mm_irqs_off(NULL, &init_mm, NULL); return; @@ -528,17 +544,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, f->new_tlb_gen == local_tlb_gen + 1 && f->new_tlb_gen == mm_tlb_gen) { /* Partial flush */ - unsigned long addr; - unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; + unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift; + unsigned long addr = f->start; - addr = f->start; while (addr < f->end) { __flush_tlb_one_user(addr); - addr += PAGE_SIZE; + addr += 1UL << f->stride_shift; } if (local) - count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); - trace_tlb_flush(reason, nr_pages); + count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate); + trace_tlb_flush(reason, nr_invalidate); } else { /* Full flush. */ local_flush_tlb(); @@ -571,6 +586,11 @@ static void flush_tlb_func_remote(void *info) flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN); } +static bool tlb_is_not_lazy(int cpu, void *data) +{ + return !per_cpu(cpu_tlbstate.is_lazy, cpu); +} + void native_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info) { @@ -606,8 +626,23 @@ void native_flush_tlb_others(const struct cpumask *cpumask, (void *)info, 1); return; } - smp_call_function_many(cpumask, flush_tlb_func_remote, + + /* + * If no page tables were freed, we can skip sending IPIs to + * CPUs in lazy TLB mode. They will flush the CPU themselves + * at the next context switch. + * + * However, if page tables are getting freed, we need to send the + * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping + * up on the new contents of what used to be page tables, while + * doing a speculative memory access. 
+ */ + if (info->freed_tables) + smp_call_function_many(cpumask, flush_tlb_func_remote, (void *)info, 1); + else + on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote, + (void *)info, 1, GFP_ATOMIC, cpumask); } /* @@ -623,12 +658,15 @@ void native_flush_tlb_others(const struct cpumask *cpumask, static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned long vmflag) + unsigned long end, unsigned int stride_shift, + bool freed_tables) { int cpu; struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = { .mm = mm, + .stride_shift = stride_shift, + .freed_tables = freed_tables, }; cpu = get_cpu(); @@ -638,8 +676,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, /* Should we flush just the requested range? */ if ((end != TLB_FLUSH_ALL) && - !(vmflag & VM_HUGETLB) && - ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) { + ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) { info.start = start; info.end = end; } else { diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c @@ -22,6 +22,7 @@ #include <linux/tick.h> #include <linux/nmi.h> #include <linux/cpuhotplug.h> +#include <linux/stackprotector.h> #include <asm/paravirt.h> #include <asm/desc.h> @@ -88,6 +89,7 @@ static void cpu_bringup(void) asmlinkage __visible void cpu_bringup_and_idle(void) { cpu_bringup(); + boot_init_stack_canary(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c @@ -902,12 +902,22 @@ static bool copy_device_table(void) } } - old_devtb_phys = entry & PAGE_MASK; + /* + * When SME is enabled in the first kernel, the entry includes the + * memory encryption mask(sme_me_mask), we must remove the memory + * encryption mask to obtain the true physical address in kdump kernel. 
+ */ + old_devtb_phys = __sme_clr(entry) & PAGE_MASK; + if (old_devtb_phys >= 0x100000000ULL) { pr_err("The address of old device table is above 4G, not trustworthy!\n"); return false; } - old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB); + old_devtb = (sme_active() && is_kdump_kernel()) + ? (__force void *)ioremap_encrypted(old_devtb_phys, + dev_table_size) + : memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB); + if (!old_devtb) return false; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c @@ -24,6 +24,8 @@ #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/uaccess.h> +#include <linux/mem_encrypt.h> +#include <asm/pgtable.h> #include <asm/io.h> #include "internal.h" @@ -98,7 +100,8 @@ static int pfn_is_ram(unsigned long pfn) /* Reads a page from the oldmem device from given offset. */ static ssize_t read_from_oldmem(char *buf, size_t count, - u64 *ppos, int userbuf) + u64 *ppos, int userbuf, + bool encrypted) { unsigned long pfn, offset; size_t nr_bytes; @@ -120,8 +123,15 @@ static ssize_t read_from_oldmem(char *buf, size_t count, if (pfn_is_ram(pfn) == 0) memset(buf, 0, nr_bytes); else { - tmp = copy_oldmem_page(pfn, buf, nr_bytes, - offset, userbuf); + if (encrypted) + tmp = copy_oldmem_page_encrypted(pfn, buf, + nr_bytes, + offset, + userbuf); + else + tmp = copy_oldmem_page(pfn, buf, nr_bytes, + offset, userbuf); + if (tmp < 0) return tmp; } @@ -155,7 +165,7 @@ void __weak elfcorehdr_free(unsigned long long addr) */ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0); + return read_from_oldmem(buf, count, ppos, 0, false); } /* @@ -163,7 +173,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) */ ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0); + return read_from_oldmem(buf, count, ppos, 0, sme_active()); } /* @@ -173,10 +183,21 @@ int __weak 
remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot) { + prot = pgprot_encrypted(prot); return remap_pfn_range(vma, from, pfn, size, prot); } /* + * Architectures which support memory encryption override this. + */ +ssize_t __weak +copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf) +{ + return copy_oldmem_page(pfn, buf, csize, offset, userbuf); +} + +/* * Copy to either kernel or user space */ static int copy_to(void *target, void *src, size_t size, int userbuf) @@ -351,7 +372,8 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, m->offset + m->size - *fpos, buflen); start = m->paddr + *fpos - m->offset; - tmp = read_from_oldmem(buffer, tsz, &start, userbuf); + tmp = read_from_oldmem(buffer, tsz, &start, + userbuf, sme_active()); if (tmp < 0) return tmp; buflen -= tsz; diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h @@ -26,6 +26,10 @@ extern int remap_oldmem_pfn_range(struct vm_area_struct *vma, extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, unsigned long, int); +extern ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, + int userbuf); + void vmcore_cleanup(void); /* Architecture code defines this if there are other possible ELF diff --git a/include/linux/smp.h b/include/linux/smp.h @@ -53,6 +53,10 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), smp_call_func_t func, void *info, bool wait, gfp_t gfp_flags); +void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags, const struct cpumask *mask); + int smp_call_function_single_async(int cpu, call_single_data_t *csd); #ifdef CONFIG_SMP diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c @@ -471,6 +471,10 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, } } + /* 
Ensure that these pages are decrypted if SME is enabled. */ + if (pages) + arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0); + return pages; } @@ -867,6 +871,7 @@ static int kimage_load_crash_segment(struct kimage *image, result = -ENOMEM; goto out; } + arch_kexec_post_alloc_pages(page_address(page), 1, 0); ptr = kmap(page); ptr += maddr & ~PAGE_MASK; mchunk = min_t(size_t, mbytes, @@ -884,6 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image, result = copy_from_user(ptr, buf, uchunk); kexec_flush_icache_page(page); kunmap(page); + arch_kexec_pre_free_pages(page_address(page), 1); if (result) { result = -EFAULT; goto out; diff --git a/kernel/resource.c b/kernel/resource.c @@ -318,33 +318,34 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -/* - * Finds the lowest iomem resource existing within [res->start.res->end). - * The caller must specify res->start, res->end, res->flags, and optionally - * desc. If found, returns 0, res is overwritten, if not found, returns -1. - * This function walks the whole tree and not just first level children until - * and unless first_level_children_only is true. +/** + * Finds the lowest iomem resource that covers part of [start..end]. The + * caller must specify start, end, flags, and desc (which may be + * IORES_DESC_NONE). + * + * If a resource is found, returns 0 and *res is overwritten with the part + * of the resource that's within [start..end]; if none is found, returns + * -1. + * + * This function walks the whole tree and not just first level children + * unless @first_lvl is true. 
*/ -static int find_next_iomem_res(struct resource *res, unsigned long desc, - bool first_level_children_only) +static int find_next_iomem_res(resource_size_t start, resource_size_t end, + unsigned long flags, unsigned long desc, + bool first_lvl, struct resource *res) { - resource_size_t start, end; struct resource *p; - bool sibling_only = false; - BUG_ON(!res); - - start = res->start; - end = res->end; - BUG_ON(start >= end); + if (!res) + return -EINVAL; - if (first_level_children_only) - sibling_only = true; + if (start >= end) + return -EINVAL; read_lock(&resource_lock); - for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { - if ((p->flags & res->flags) != res->flags) + for (p = iomem_resource.child; p; p = next_resource(p, first_lvl)) { + if ((p->flags & flags) != flags) continue; if ((desc != IORES_DESC_NONE) && (desc != p->desc)) continue; @@ -352,45 +353,43 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, p = NULL; break; } - if ((p->end >= start) && (p->start < end)) + if ((p->end >= start) && (p->start <= end)) break; } read_unlock(&resource_lock); if (!p) return -1; + /* copy data */ - if (res->start < p->start) - res->start = p->start; - if (res->end > p->end) - res->end = p->end; + res->start = max(start, p->start); + res->end = min(end, p->end); res->flags = p->flags; res->desc = p->desc; return 0; } -static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, - bool first_level_children_only, - void *arg, +static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, + unsigned long flags, unsigned long desc, + bool first_lvl, void *arg, int (*func)(struct resource *, void *)) { - u64 orig_end = res->end; + struct resource res; int ret = -1; - while ((res->start < res->end) && - !find_next_iomem_res(res, desc, first_level_children_only)) { - ret = (*func)(res, arg); + while (start < end && + !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) { + ret = 
(*func)(&res, arg); if (ret) break; - res->start = res->end + 1; - res->end = orig_end; + start = res.end + 1; } return ret; } -/* +/** * Walks through iomem resources and calls func() with matching resource * ranges. This walks through whole tree and not just first level children. * All the memory ranges which overlap start,end and also match flags and @@ -407,13 +406,7 @@ static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)) { - struct resource res; - - res.start = start; - res.end = end; - res.flags = flags; - - return __walk_iomem_res_desc(&res, desc, false, arg, func); + return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func); } EXPORT_SYMBOL_GPL(walk_iomem_res_desc); @@ -425,15 +418,11 @@ EXPORT_SYMBOL_GPL(walk_iomem_res_desc); * ranges. */ int walk_system_ram_res(u64 start, u64 end, void *arg, - int (*func)(struct resource *, void *)) + int (*func)(struct resource *, void *)) { - struct resource res; + unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - res.start = start; - res.end = end; - res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - - return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true, + return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true, arg, func); } @@ -444,13 +433,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, int walk_mem_res(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)) { - struct resource res; - - res.start = start; - res.end = end; - res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; - return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true, + return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true, arg, func); } @@ -462,27 +447,27 @@ int walk_mem_res(u64 start, u64 end, void *arg, * It is to be used only for System RAM. 
*/ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, - void *arg, int (*func)(unsigned long, unsigned long, void *)) + void *arg, int (*func)(unsigned long, unsigned long, void *)) { + resource_size_t start, end; + unsigned long flags; struct resource res; unsigned long pfn, end_pfn; - u64 orig_end; int ret = -1; - res.start = (u64) start_pfn << PAGE_SHIFT; - res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; - res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - orig_end = res.end; - while ((res.start < res.end) && - (find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) { + start = (u64) start_pfn << PAGE_SHIFT; + end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; + flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + while (start < end && + !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, + true, &res)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) ret = (*func)(pfn, end_pfn - pfn, arg); if (ret) break; - res.start = res.end + 1; - res.end = orig_end; + start = res.end + 1; } return ret; } @@ -658,8 +643,8 @@ static int find_resource(struct resource *root, struct resource *new, * @constraint: the size and alignment constraints to be met. */ static int reallocate_resource(struct resource *root, struct resource *old, - resource_size_t newsize, - struct resource_constraint *constraint) + resource_size_t newsize, + struct resource_constraint *constraint) { int err=0; struct resource new = *old; @@ -972,7 +957,7 @@ skip: * Existing children of the resource are assumed to be immutable. 
*/ int adjust_resource(struct resource *res, resource_size_t start, - resource_size_t size) + resource_size_t size) { int result; @@ -983,9 +968,9 @@ int adjust_resource(struct resource *res, resource_size_t start, } EXPORT_SYMBOL(adjust_resource); -static void __init __reserve_region_with_split(struct resource *root, - resource_size_t start, resource_size_t end, - const char *name) +static void __init +__reserve_region_with_split(struct resource *root, resource_size_t start, + resource_size_t end, const char *name) { struct resource *parent = root; struct resource *conflict; @@ -1044,9 +1029,9 @@ static void __init __reserve_region_with_split(struct resource *root, } -void __init reserve_region_with_split(struct resource *root, - resource_size_t start, resource_size_t end, - const char *name) +void __init +reserve_region_with_split(struct resource *root, resource_size_t start, + resource_size_t end, const char *name) { int abort = 0; @@ -1172,7 +1157,7 @@ EXPORT_SYMBOL(__request_region); * The described resource region must match a currently busy region. */ void __release_region(struct resource *parent, resource_size_t start, - resource_size_t n) + resource_size_t n) { struct resource **p; resource_size_t end; @@ -1234,7 +1219,7 @@ EXPORT_SYMBOL(__release_region); * simplicity. Enhance this logic when necessary. 
*/ int release_mem_region_adjustable(struct resource *parent, - resource_size_t start, resource_size_t size) + resource_size_t start, resource_size_t size) { struct resource **p; struct resource *res; @@ -1410,9 +1395,9 @@ static int devm_region_match(struct device *dev, void *res, void *match_data) this->start == match->start && this->n == match->n; } -struct resource * __devm_request_region(struct device *dev, - struct resource *parent, resource_size_t start, - resource_size_t n, const char *name) +struct resource * +__devm_request_region(struct device *dev, struct resource *parent, + resource_size_t start, resource_size_t n, const char *name) { struct region_devres *dr = NULL; struct resource *res; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c @@ -347,21 +347,6 @@ EXPORT_SYMBOL_GPL(play_idle); void cpu_startup_entry(enum cpuhp_state state) { - /* - * This #ifdef needs to die, but it's too late in the cycle to - * make this generic (ARM and SH have never invoked the canary - * init for the non boot CPUs!). Will be fixed in 3.11 - */ -#ifdef CONFIG_X86 - /* - * If we're the non-boot CPU, nothing set the stack canary up - * for us. The boot CPU already has it initialized but no harm - * in doing it again. This is a good place for updating it, as - * we wont ever return from this function (so the invalid - * canaries already on the stack wont ever trigger). - */ - boot_init_stack_canary(); -#endif arch_cpu_idle_prepare(); cpuhp_online_idle(state); while (1) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h @@ -56,7 +56,6 @@ #include <linux/profile.h> #include <linux/rcupdate_wait.h> #include <linux/security.h> -#include <linux/stackprotector.h> #include <linux/stop_machine.h> #include <linux/suspend.h> #include <linux/swait.h> diff --git a/kernel/smp.c b/kernel/smp.c @@ -669,9 +669,9 @@ EXPORT_SYMBOL(on_each_cpu_mask); * You must not call this function with disabled interrupts or * from a hardware interrupt handler or from a bottom half handler. 
*/ -void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), +void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), smp_call_func_t func, void *info, bool wait, - gfp_t gfp_flags) + gfp_t gfp_flags, const struct cpumask *mask) { cpumask_var_t cpus; int cpu, ret; @@ -680,9 +680,9 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { preempt_disable(); - for_each_online_cpu(cpu) + for_each_cpu(cpu, mask) if (cond_func(cpu, info)) - cpumask_set_cpu(cpu, cpus); + __cpumask_set_cpu(cpu, cpus); on_each_cpu_mask(cpus, func, info, wait); preempt_enable(); free_cpumask_var(cpus); @@ -692,7 +692,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), * just have to IPI them one by one. */ preempt_disable(); - for_each_online_cpu(cpu) + for_each_cpu(cpu, mask) if (cond_func(cpu, info)) { ret = smp_call_function_single(cpu, func, info, wait); @@ -701,6 +701,15 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), preempt_enable(); } } +EXPORT_SYMBOL(on_each_cpu_cond_mask); + +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags) +{ + on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, + cpu_online_mask); +} EXPORT_SYMBOL(on_each_cpu_cond); static void do_nothing(void *unused) diff --git a/kernel/up.c b/kernel/up.c @@ -68,9 +68,9 @@ EXPORT_SYMBOL(on_each_cpu_mask); * Preemption is disabled here to make sure the cond_func is called under the * same condtions in UP and SMP. 
*/ -void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), - smp_call_func_t func, void *info, bool wait, - gfp_t gfp_flags) +void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags, const struct cpumask *mask) { unsigned long flags; @@ -82,6 +82,14 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), } preempt_enable(); } +EXPORT_SYMBOL(on_each_cpu_cond_mask); + +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags) +{ + on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL); +} EXPORT_SYMBOL(on_each_cpu_cond); int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c @@ -8,6 +8,7 @@ */ #include <linux/pagemap.h> +#include <linux/hugetlb.h> #include <asm/tlb.h> #include <asm-generic/pgtable.h>