whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit e1d20beae70eb918cca7f07a77ce199fd148fdd2
parent cbbfb0ae2ca979222297062647ced653682a6cc7
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Tue, 23 Oct 2018 15:24:22 +0100

Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 asm updates from Ingo Molnar:
 "The main changes in this cycle were the fsgsbase related preparatory
  patches from Chang S. Bae - but there's also an optimized
  memcpy_flushcache() and a cleanup for the __cmpxchg_double() assembly
  glue"

* 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/fsgsbase/64: Clean up various details
  x86/segments: Introduce the 'CPUNODE' naming to better document the segment limit CPU/node NR trick
  x86/vdso: Initialize the CPU/node NR segment descriptor earlier
  x86/vdso: Introduce helper functions for CPU and node number
  x86/segments/64: Rename the GDT PER_CPU entry to CPU_NUMBER
  x86/fsgsbase/64: Factor out FS/GS segment loading from __switch_to()
  x86/fsgsbase/64: Convert the ELF core dump code to the new FSGSBASE helpers
  x86/fsgsbase/64: Make ptrace use the new FS/GS base helpers
  x86/fsgsbase/64: Introduce FS/GS base helper functions
  x86/fsgsbase/64: Fix ptrace() to read the FS/GS base accurately
  x86/asm: Use CC_SET()/CC_OUT() in __cmpxchg_double()
  x86/asm: Optimize memcpy_flushcache()

Diffstat:
March/x86/entry/vdso/vgetcpu.c | 8+-------
March/x86/entry/vdso/vma.c | 38+-------------------------------------
March/x86/include/asm/cmpxchg.h | 10++++++----
March/x86/include/asm/elf.h | 6+++---
Aarch/x86/include/asm/fsgsbase.h | 49+++++++++++++++++++++++++++++++++++++++++++++++++
March/x86/include/asm/segment.h | 46+++++++++++++++++++++++++++++++++++++++++++---
March/x86/include/asm/string_64.h | 20+++++++++++++++++++-
March/x86/include/asm/vgtod.h | 26--------------------------
March/x86/kernel/cpu/common.c | 24++++++++++++++++++++++++
March/x86/kernel/process_64.c | 183+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
March/x86/kernel/ptrace.c | 28+++++++++-------------------
March/x86/lib/usercopy_64.c | 4++--
12 files changed, 299 insertions(+), 143 deletions(-)

diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c @@ -13,14 +13,8 @@ notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { - unsigned int p; + vdso_read_cpunode(cpu, node); - p = __getcpu(); - - if (cpu) - *cpu = p & VGETCPU_CPU_MASK; - if (node) - *node = p >> 12; return 0; } diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c @@ -332,40 +332,6 @@ static __init int vdso_setup(char *s) return 0; } __setup("vdso=", vdso_setup); -#endif - -#ifdef CONFIG_X86_64 -static void vgetcpu_cpu_init(void *arg) -{ - int cpu = smp_processor_id(); - struct desc_struct d = { }; - unsigned long node = 0; -#ifdef CONFIG_NUMA - node = cpu_to_node(cpu); -#endif - if (static_cpu_has(X86_FEATURE_RDTSCP)) - write_rdtscp_aux((node << 12) | cpu); - - /* - * Store cpu number in limit so that it can be loaded - * quickly in user space in vgetcpu. (12 bits for the CPU - * and 8 bits for the node) - */ - d.limit0 = cpu | ((node & 0xf) << 12); - d.limit1 = node >> 4; - d.type = 5; /* RO data, expand down, accessed */ - d.dpl = 3; /* Visible to user code */ - d.s = 1; /* Not a system segment */ - d.p = 1; /* Present */ - d.d = 1; /* 32-bit */ - - write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); -} - -static int vgetcpu_online(unsigned int cpu) -{ - return smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1); -} static int __init init_vdso(void) { @@ -375,9 +341,7 @@ static int __init init_vdso(void) init_vdso_image(&vdso_image_x32); #endif - /* notifier priority > KVM */ - return cpuhp_setup_state(CPUHP_AP_X86_VDSO_VMA_ONLINE, - "x86/vdso/vma:online", vgetcpu_online, NULL); + return 0; } subsys_initcall(init_vdso); #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h @@ -242,10 +242,12 @@ extern void __add_wrong_size(void) BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long)); \ VM_BUG_ON((unsigned long)(p1) % (2 * sizeof(long))); \ VM_BUG_ON((unsigned long)((p1) + 1) != (unsigned long)(p2)); \ - asm volatile(pfx "cmpxchg%c4b %2; sete %0" \ - : "=a" (__ret), "+d" (__old2), \ - "+m" (*(p1)), "+m" (*(p2)) \ - : "i" (2 * sizeof(long)), "a" (__old1), \ + asm volatile(pfx "cmpxchg%c5b %1" \ + CC_SET(e) \ + : CC_OUT(e) (__ret), \ + "+m" (*(p1)), "+m" (*(p2)), \ + "+a" (__old1), "+d" (__old2) \ + : "i" (2 * sizeof(long)), \ "b" (__new1), "c" (__new2)); \ __ret; \ }) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h @@ -10,6 +10,7 @@ #include <asm/ptrace.h> #include <asm/user.h> #include <asm/auxvec.h> +#include <asm/fsgsbase.h> typedef unsigned long elf_greg_t; @@ -204,7 +205,6 @@ void set_personality_ia32(bool); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ do { \ - unsigned long base; \ unsigned v; \ (pr_reg)[0] = (regs)->r15; \ (pr_reg)[1] = (regs)->r14; \ @@ -227,8 +227,8 @@ do { \ (pr_reg)[18] = (regs)->flags; \ (pr_reg)[19] = (regs)->sp; \ (pr_reg)[20] = (regs)->ss; \ - rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \ - rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \ + (pr_reg)[21] = x86_fsbase_read_cpu(); \ + (pr_reg)[22] = x86_gsbase_read_cpu_inactive(); \ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_FSGSBASE_H +#define _ASM_FSGSBASE_H + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_X86_64 + +#include <asm/msr-index.h> + +/* + * Read/write a task's FSBASE or GSBASE. This returns the value that + * the FS/GS base would have (if the task were to be resumed). These + * work on the current task or on a non-running (typically stopped + * ptrace child) task. + */ +extern unsigned long x86_fsbase_read_task(struct task_struct *task); +extern unsigned long x86_gsbase_read_task(struct task_struct *task); +extern int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase); +extern int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase); + +/* Helper functions for reading/writing FS/GS base */ + +static inline unsigned long x86_fsbase_read_cpu(void) +{ + unsigned long fsbase; + + rdmsrl(MSR_FS_BASE, fsbase); + + return fsbase; +} + +static inline unsigned long x86_gsbase_read_cpu_inactive(void) +{ + unsigned long gsbase; + + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + + return gsbase; +} + +extern void x86_fsbase_write_cpu(unsigned long fsbase); +extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase); + +#endif /* CONFIG_X86_64 */ + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_FSGSBASE_H */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h @@ -186,8 +186,7 @@ #define GDT_ENTRY_TLS_MIN 12 #define GDT_ENTRY_TLS_MAX 14 -/* Abused to load per CPU data from limit */ -#define GDT_ENTRY_PER_CPU 15 +#define GDT_ENTRY_CPUNODE 15 /* * Number of entries in the GDT table: @@ -207,7 +206,7 @@ #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) #define __USER32_DS __USER_DS #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) -#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8 + 3) +#define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3) #endif @@ -225,6 +224,47 @@ #define GDT_ENTRY_TLS_ENTRIES 3 #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8) +#ifdef CONFIG_X86_64 + +/* Bit size and mask of CPU number stored in the per CPU data (and TSC_AUX) */ +#define VDSO_CPUNODE_BITS 12 +#define VDSO_CPUNODE_MASK 0xfff + +#ifndef __ASSEMBLY__ + +/* Helper functions to store/load CPU and node numbers */ + +static inline unsigned long vdso_encode_cpunode(int cpu, unsigned long node) +{ + return (node << VDSO_CPUNODE_BITS) | cpu; +} + +static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node) +{ + unsigned int p; + + /* + * Load CPU and node number from the GDT. LSL is faster than RDTSCP + * and works on all CPUs. This is volatile so that it orders + * correctly with respect to barrier() and to keep GCC from cleverly + * hoisting it out of the calling function. + * + * If RDPID is available, use it. + */ + alternative_io ("lsl %[seg],%[p]", + ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ + X86_FEATURE_RDPID, + [p] "=a" (p), [seg] "r" (__CPUNODE_SEG)); + + if (cpu) + *cpu = (p & VDSO_CPUNODE_MASK); + if (node) + *node = (p >> VDSO_CPUNODE_BITS); +} + +#endif /* !__ASSEMBLY__ */ +#endif /* CONFIG_X86_64 */ + #ifdef __KERNEL__ /* diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h @@ -149,7 +149,25 @@ memcpy_mcsafe(void *dst, const void *src, size_t cnt) #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE #define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1 -void memcpy_flushcache(void *dst, const void *src, size_t cnt); +void __memcpy_flushcache(void *dst, const void *src, size_t cnt); +static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt) +{ + if (__builtin_constant_p(cnt)) { + switch (cnt) { + case 4: + asm ("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(*(u32 *)src)); + return; + case 8: + asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src)); + return; + case 16: + asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src)); + asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) : "r"(*(u64 *)(src + 8))); + return; + } + } + __memcpy_flushcache(dst, src, cnt); +} #endif #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h @@ -77,30 +77,4 @@ static inline void gtod_write_end(struct vsyscall_gtod_data *s) ++s->seq; } -#ifdef CONFIG_X86_64 - -#define VGETCPU_CPU_MASK 0xfff - -static inline unsigned int __getcpu(void) -{ - unsigned int p; - - /* - * Load per CPU data from GDT. LSL is faster than RDTSCP and - * works on all CPUs. This is volatile so that it orders - * correctly wrt barrier() and to keep gcc from cleverly - * hoisting it out of the calling function. - * - * If RDPID is available, use it. - */ - alternative_io ("lsl %[seg],%[p]", - ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ - X86_FEATURE_RDPID, - [p] "=a" (p), [seg] "r" (__PER_CPU_SEG)); - - return p; -} - -#endif /* CONFIG_X86_64 */ - #endif /* _ASM_X86_VGTOD_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c @@ -1669,6 +1669,29 @@ static void wait_for_master_cpu(int cpu) #endif } +#ifdef CONFIG_X86_64 +static void setup_getcpu(int cpu) +{ + unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); + struct desc_struct d = { }; + + if (static_cpu_has(X86_FEATURE_RDTSCP)) + write_rdtscp_aux(cpudata); + + /* Store CPU and node number in limit. */ + d.limit0 = cpudata; + d.limit1 = cpudata >> 16; + + d.type = 5; /* RO data, expand down, accessed */ + d.dpl = 3; /* Visible to user code */ + d.s = 1; /* Not a system segment */ + d.p = 1; /* Present */ + d.d = 1; /* 32-bit */ + + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S); +} +#endif + /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT @@ -1706,6 +1729,7 @@ void cpu_init(void) early_cpu_to_node(cpu) != NUMA_NO_NODE) set_numa_node(early_cpu_to_node(cpu)); #endif + setup_getcpu(cpu); me = current; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c @@ -54,6 +54,7 @@ #include <asm/vdso.h> #include <asm/intel_rdt_sched.h> #include <asm/unistd.h> +#include <asm/fsgsbase.h> #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ #include <asm/unistd_32_ia32.h> @@ -286,6 +287,138 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, } } +static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, + struct thread_struct *next) +{ + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); +} + +static unsigned long x86_fsgsbase_read_task(struct task_struct *task, + unsigned short selector) +{ + unsigned short idx = selector >> 3; + unsigned long base; + + if (likely((selector & SEGMENT_TI_MASK) == 0)) { + if (unlikely(idx >= GDT_ENTRIES)) + return 0; + + /* + * There are no user segments in the GDT with nonzero bases + * other than the TLS segments. + */ + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return 0; + + idx -= GDT_ENTRY_TLS_MIN; + base = get_desc_base(&task->thread.tls_array[idx]); + } else { +#ifdef CONFIG_MODIFY_LDT_SYSCALL + struct ldt_struct *ldt; + + /* + * If performance here mattered, we could protect the LDT + * with RCU. This is a slow path, though, so we can just + * take the mutex. + */ + mutex_lock(&task->mm->context.lock); + ldt = task->mm->context.ldt; + if (unlikely(idx >= ldt->nr_entries)) + base = 0; + else + base = get_desc_base(ldt->entries + idx); + mutex_unlock(&task->mm->context.lock); +#else + base = 0; +#endif + } + + return base; +} + +void x86_fsbase_write_cpu(unsigned long fsbase) +{ + /* + * Set the selector to 0 as a notion, that the segment base is + * overwritten, which will be checked for skipping the segment load + * during context switch. + */ + loadseg(FS, 0); + wrmsrl(MSR_FS_BASE, fsbase); +} + +void x86_gsbase_write_cpu_inactive(unsigned long gsbase) +{ + /* Set the selector to 0 for the same reason as %fs above. */ + loadseg(GS, 0); + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); +} + +unsigned long x86_fsbase_read_task(struct task_struct *task) +{ + unsigned long fsbase; + + if (task == current) + fsbase = x86_fsbase_read_cpu(); + else if (task->thread.fsindex == 0) + fsbase = task->thread.fsbase; + else + fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); + + return fsbase; +} + +unsigned long x86_gsbase_read_task(struct task_struct *task) +{ + unsigned long gsbase; + + if (task == current) + gsbase = x86_gsbase_read_cpu_inactive(); + else if (task->thread.gsindex == 0) + gsbase = task->thread.gsbase; + else + gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); + + return gsbase; +} + +int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase) +{ + /* + * Not strictly needed for %fs, but do it for symmetry + * with %gs + */ + if (unlikely(fsbase >= TASK_SIZE_MAX)) + return -EPERM; + + preempt_disable(); + task->thread.fsbase = fsbase; + if (task == current) + x86_fsbase_write_cpu(fsbase); + task->thread.fsindex = 0; + preempt_enable(); + + return 0; +} + +int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase) +{ + if (unlikely(gsbase >= TASK_SIZE_MAX)) + return -EPERM; + + preempt_disable(); + task->thread.gsbase = gsbase; + if (task == current) + x86_gsbase_write_cpu_inactive(gsbase); + task->thread.gsindex = 0; + preempt_enable(); + + return 0; +} + int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { @@ -473,10 +606,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (unlikely(next->ds | prev->ds)) loadsegment(ds, next->ds); - load_seg_legacy(prev->fsindex, prev->fsbase, - next->fsindex, next->fsbase, FS); - load_seg_legacy(prev->gsindex, prev->gsbase, - next->gsindex, next->gsbase, GS); + x86_fsgsbase_load(prev, next); switch_fpu_finish(next_fpu, cpu); @@ -627,54 +757,25 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) { int ret = 0; - int doit = task == current; - int cpu; switch (option) { - case ARCH_SET_GS: - if (arg2 >= TASK_SIZE_MAX) - return -EPERM; - cpu = get_cpu(); - task->thread.gsindex = 0; - task->thread.gsbase = arg2; - if (doit) { - load_gs_index(0); - ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2); - } - put_cpu(); + case ARCH_SET_GS: { + ret = x86_gsbase_write_task(task, arg2); break; - case ARCH_SET_FS: - /* Not strictly needed for fs, but do it for symmetry - with gs */ - if (arg2 >= TASK_SIZE_MAX) - return -EPERM; - cpu = get_cpu(); - task->thread.fsindex = 0; - task->thread.fsbase = arg2; - if (doit) { - /* set the selector to 0 to not confuse __switch_to */ - loadsegment(fs, 0); - ret = wrmsrl_safe(MSR_FS_BASE, arg2); - } - put_cpu(); + } + case ARCH_SET_FS: { + ret = x86_fsbase_write_task(task, arg2); break; + } case ARCH_GET_FS: { - unsigned long base; + unsigned long base = x86_fsbase_read_task(task); - if (doit) - rdmsrl(MSR_FS_BASE, base); - else - base = task->thread.fsbase; ret = put_user(base, (unsigned long __user *)arg2); break; } case ARCH_GET_GS: { - unsigned long base; + unsigned long base = x86_gsbase_read_task(task); - if (doit) - rdmsrl(MSR_KERNEL_GS_BASE, base); - else - base = task->thread.gsbase; ret = put_user(base, (unsigned long __user *)arg2); break; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c @@ -39,6 +39,7 @@ #include <asm/hw_breakpoint.h> #include <asm/traps.h> #include <asm/syscall.h> +#include <asm/fsgsbase.h> #include "tls.h" @@ -396,12 +397,11 @@ static int putreg(struct task_struct *child, if (value >= TASK_SIZE_MAX) return -EIO; /* - * When changing the segment base, use do_arch_prctl_64 - * to set either thread.fs or thread.fsindex and the - * corresponding GDT slot. + * When changing the FS base, use the same + * mechanism as for do_arch_prctl_64(). */ if (child->thread.fsbase != value) - return do_arch_prctl_64(child, ARCH_SET_FS, value); + return x86_fsbase_write_task(child, value); return 0; case offsetof(struct user_regs_struct,gs_base): /* @@ -410,7 +410,7 @@ static int putreg(struct task_struct *child, if (value >= TASK_SIZE_MAX) return -EIO; if (child->thread.gsbase != value) - return do_arch_prctl_64(child, ARCH_SET_GS, value); + return x86_gsbase_write_task(child, value); return 0; #endif } @@ -434,20 +434,10 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset) return get_flags(task); #ifdef CONFIG_X86_64 - case offsetof(struct user_regs_struct, fs_base): { - /* - * XXX: This will not behave as expected if called on - * current or if fsindex != 0. - */ - return task->thread.fsbase; - } - case offsetof(struct user_regs_struct, gs_base): { - /* - * XXX: This will not behave as expected if called on - * current or if fsindex != 0. - */ - return task->thread.gsbase; - } + case offsetof(struct user_regs_struct, fs_base): + return x86_fsbase_read_task(task); + case offsetof(struct user_regs_struct, gs_base): + return x86_gsbase_read_task(task); #endif } diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c @@ -153,7 +153,7 @@ long __copy_user_flushcache(void *dst, const void __user *src, unsigned size) return rc; } -void memcpy_flushcache(void *_dst, const void *_src, size_t size) +void __memcpy_flushcache(void *_dst, const void *_src, size_t size) { unsigned long dest = (unsigned long) _dst; unsigned long source = (unsigned long) _src; @@ -216,7 +216,7 @@ void memcpy_flushcache(void *_dst, const void *_src, size_t size) clean_cache_range((void *) dest, size); } } -EXPORT_SYMBOL_GPL(memcpy_flushcache); +EXPORT_SYMBOL_GPL(__memcpy_flushcache); void memcpy_page_flushcache(char *to, struct page *page, size_t offset, size_t len)