whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit d82924c3b8d0607094b94fab290a33c5ad7d586c
parent d7197a5ad8528642cb70f1d27d4d5c7332a2b395
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Tue, 23 Oct 2018 18:43:04 +0100

Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 pti updates from Ingo Molnar:
 "The main changes:

   - Make the IBPB barrier more strict and add STIBP support (Jiri
     Kosina)

   - Micro-optimize and clean up the entry code (Andy Lutomirski)

   - ... plus misc other fixes"

* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/speculation: Propagate information about RSB filling mitigation to sysfs
  x86/speculation: Enable cross-hyperthread spectre v2 STIBP mitigation
  x86/speculation: Apply IBPB more strictly to avoid cross-process data leak
  x86/speculation: Add RETPOLINE_AMD support to the inline asm CALL_NOSPEC variant
  x86/CPU: Fix unused variable warning when !CONFIG_IA32_EMULATION
  x86/pti/64: Remove the SYSCALL64 entry trampoline
  x86/entry/64: Use the TSS sp2 slot for SYSCALL/SYSRET scratch space
  x86/entry/64: Document idtentry

Diffstat:
March/x86/entry/entry_64.S | 117+++++++++++++++++++++++++++++++-------------------------------------------------
March/x86/include/asm/cpu_entry_area.h | 2--
March/x86/include/asm/nospec-branch.h | 17+++++++++++++----
March/x86/include/asm/processor.h | 6++++++
March/x86/include/asm/sections.h | 1-
March/x86/kernel/asm-offsets.c | 5++---
March/x86/kernel/cpu/bugs.c | 58++++++++++++++++++++++++++++++++++++++++++++++++++++------
March/x86/kernel/cpu/common.c | 16+++-------------
March/x86/kernel/kprobes/core.c | 10+---------
March/x86/kernel/process_64.c | 2--
March/x86/kernel/traps.c | 4++++
March/x86/kernel/vmlinux.lds.S | 10----------
March/x86/mm/cpu_entry_area.c | 36------------------------------------
March/x86/mm/pti.c | 33++++++++++++++++++++++++++++++++-
March/x86/mm/tlb.c | 31++++++++++++++++++++-----------
March/x86/xen/xen-asm_64.S | 8+++++---
Minclude/linux/ptrace.h | 21+++++++++++++++++++--
Mkernel/cpu.c | 11++++++++++-
Mkernel/ptrace.c | 10++++++++++
19 files changed, 222 insertions(+), 176 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S @@ -142,67 +142,6 @@ END(native_usergs_sysret64) * with them due to bugs in both AMD and Intel CPUs. */ - .pushsection .entry_trampoline, "ax" - -/* - * The code in here gets remapped into cpu_entry_area's trampoline. This means - * that the assembler and linker have the wrong idea as to where this code - * lives (and, in fact, it's mapped more than once, so it's not even at a - * fixed address). So we can't reference any symbols outside the entry - * trampoline and expect it to work. - * - * Instead, we carefully abuse %rip-relative addressing. - * _entry_trampoline(%rip) refers to the start of the remapped) entry - * trampoline. We can thus find cpu_entry_area with this macro: - */ - -#define CPU_ENTRY_AREA \ - _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) - -/* The top word of the SYSENTER stack is hot and is usable as scratch space. */ -#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \ - SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA - -ENTRY(entry_SYSCALL_64_trampoline) - UNWIND_HINT_EMPTY - swapgs - - /* Stash the user RSP. */ - movq %rsp, RSP_SCRATCH - - /* Note: using %rsp as a scratch reg. */ - SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp - - /* Load the top of the task stack into RSP */ - movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp - - /* Start building the simulated IRET frame. */ - pushq $__USER_DS /* pt_regs->ss */ - pushq RSP_SCRATCH /* pt_regs->sp */ - pushq %r11 /* pt_regs->flags */ - pushq $__USER_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ - - /* - * x86 lacks a near absolute jump, and we can't jump to the real - * entry text with a relative jump. We could push the target - * address and then use retq, but this destroys the pipeline on - * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead, - * spill RDI and restore it in a second-stage trampoline. - */ - pushq %rdi - movq $entry_SYSCALL_64_stage2, %rdi - JMP_NOSPEC %rdi -END(entry_SYSCALL_64_trampoline) - - .popsection - -ENTRY(entry_SYSCALL_64_stage2) - UNWIND_HINT_EMPTY - popq %rdi - jmp entry_SYSCALL_64_after_hwframe -END(entry_SYSCALL_64_stage2) - ENTRY(entry_SYSCALL_64) UNWIND_HINT_EMPTY /* @@ -212,21 +151,19 @@ ENTRY(entry_SYSCALL_64) */ swapgs - /* - * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it - * is not required to switch CR3. - */ - movq %rsp, PER_CPU_VAR(rsp_scratch) + /* tss.sp2 is scratch space. */ + movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ - pushq $__USER_DS /* pt_regs->ss */ - pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ - pushq %r11 /* pt_regs->flags */ - pushq $__USER_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ + pushq $__USER_DS /* pt_regs->ss */ + pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ GLOBAL(entry_SYSCALL_64_after_hwframe) - pushq %rax /* pt_regs->orig_ax */ + pushq %rax /* pt_regs->orig_ax */ PUSH_AND_CLEAR_REGS rax=$-ENOSYS @@ -900,6 +837,42 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) +/** + * idtentry - Generate an IDT entry stub + * @sym: Name of the generated entry point + * @do_sym: C function to be called + * @has_error_code: True if this IDT vector has an error code on the stack + * @paranoid: non-zero means that this vector may be invoked from + * kernel mode with user GSBASE and/or user CR3. + * 2 is special -- see below. + * @shift_ist: Set to an IST index if entries from kernel mode should + * decrement the IST stack so that nested entries get a + * fresh stack. (This is for #DB, which has a nasty habit + * of recursing.) + * + * idtentry generates an IDT stub that sets up a usable kernel context, + * creates struct pt_regs, and calls @do_sym. The stub has the following + * special behaviors: + * + * On an entry from user mode, the stub switches from the trampoline or + * IST stack to the normal thread stack. On an exit to user mode, the + * normal exit-to-usermode path is invoked. + * + * On an exit to kernel mode, if @paranoid == 0, we check for preemption, + * whereas we omit the preemption check if @paranoid != 0. This is purely + * because the implementation is simpler this way. The kernel only needs + * to check for asynchronous kernel preemption when IRQ handlers return. + * + * If @paranoid == 0, then the stub will handle IRET faults by pretending + * that the fault came from user mode. It will handle gs_change faults by + * pretending that the fault happened with kernel GSBASE. Since this handling + * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have + * @paranoid == 0. This special handling will do the wrong thing for + * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0. + * + * @paranoid == 2 is special: the stub will never switch stacks. This is for + * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. + */ .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h @@ -30,8 +30,6 @@ struct cpu_entry_area { */ struct tss_struct tss; - char entry_trampoline[PAGE_SIZE]; - #ifdef CONFIG_X86_64 /* * Exception stacks used for IST entries. diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h @@ -170,11 +170,15 @@ */ # define CALL_NOSPEC \ ANNOTATE_NOSPEC_ALTERNATIVE \ - ALTERNATIVE( \ + ALTERNATIVE_2( \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ "call __x86_indirect_thunk_%V[thunk_target]\n", \ - X86_FEATURE_RETPOLINE) + X86_FEATURE_RETPOLINE, \ + "lfence;\n" \ + ANNOTATE_RETPOLINE_SAFE \ + "call *%[thunk_target]\n", \ + X86_FEATURE_RETPOLINE_AMD) # define THUNK_TARGET(addr) [thunk_target] "r" (addr) #elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE) @@ -184,7 +188,8 @@ * here, anyway. */ # define CALL_NOSPEC \ - ALTERNATIVE( \ + ANNOTATE_NOSPEC_ALTERNATIVE \ + ALTERNATIVE_2( \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ " jmp 904f;\n" \ @@ -199,7 +204,11 @@ " ret;\n" \ " .align 16\n" \ "904: call 901b;\n", \ - X86_FEATURE_RETPOLINE) + X86_FEATURE_RETPOLINE, \ + "lfence;\n" \ + ANNOTATE_RETPOLINE_SAFE \ + "call *%[thunk_target]\n", \ + X86_FEATURE_RETPOLINE_AMD) # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #else /* No retpoline for C / inline asm */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h @@ -316,7 +316,13 @@ struct x86_hw_tss { */ u64 sp1; + /* + * Since Linux does not use ring 2, the 'sp2' slot is unused by + * hardware. entry_SYSCALL_64 uses it as scratch space to stash + * the user RSP value. + */ u64 sp2; + u64 reserved2; u64 ist[7]; u32 reserved3; diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h @@ -11,7 +11,6 @@ extern char __end_rodata_aligned[]; #if defined(CONFIG_X86_64) extern char __end_rodata_hpage_align[]; -extern char __entry_trampoline_start[], __entry_trampoline_end[]; #endif #endif /* _ASM_X86_SECTIONS_H */ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c @@ -96,13 +96,12 @@ void common(void) { OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask); /* Layout info for cpu_entry_area */ - OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); - OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1))); - /* Offset for sp0 and sp1 into the tss_struct */ + /* Offset for fields in tss_struct */ OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); + OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c @@ -35,12 +35,10 @@ static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); -/* - * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any - * writes to SPEC_CTRL contain whatever reserved bits have been set. - */ -u64 __ro_after_init x86_spec_ctrl_base; +/* The base value of the SPEC_CTRL MSR that always has to be preserved. */ +u64 x86_spec_ctrl_base; EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); +static DEFINE_MUTEX(spec_ctrl_mutex); /* * The vendor and possibly platform specific bits which can be modified in @@ -326,6 +324,46 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return cmd; } +static bool stibp_needed(void) +{ + if (spectre_v2_enabled == SPECTRE_V2_NONE) + return false; + + if (!boot_cpu_has(X86_FEATURE_STIBP)) + return false; + + return true; +} + +static void update_stibp_msr(void *info) +{ + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); +} + +void arch_smt_update(void) +{ + u64 mask; + + if (!stibp_needed()) + return; + + mutex_lock(&spec_ctrl_mutex); + mask = x86_spec_ctrl_base; + if (cpu_smt_control == CPU_SMT_ENABLED) + mask |= SPEC_CTRL_STIBP; + else + mask &= ~SPEC_CTRL_STIBP; + + if (mask != x86_spec_ctrl_base) { + pr_info("Spectre v2 cross-process SMT mitigation: %s STIBP\n", + cpu_smt_control == CPU_SMT_ENABLED ? + "Enabling" : "Disabling"); + x86_spec_ctrl_base = mask; + on_each_cpu(update_stibp_msr, NULL, 1); + } + mutex_unlock(&spec_ctrl_mutex); +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -426,6 +464,9 @@ specv2_set_mode: setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } + + /* Enable STIBP if appropriate */ + arch_smt_update(); } #undef pr_fmt @@ -816,6 +857,8 @@ static ssize_t l1tf_show_state(char *buf) static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { + int ret; + if (!boot_cpu_has_bug(bug)) return sprintf(buf, "Not affected\n"); @@ -833,10 +876,13 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return sprintf(buf, "Mitigation: __user pointer sanitization\n"); case X86_BUG_SPECTRE_V2: - return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], + ret = sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + (x86_spec_ctrl_base & SPEC_CTRL_STIBP) ? ", STIBP" : "", + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", spectre_v2_module_string()); + return ret; case X86_BUG_SPEC_STORE_BYPASS: return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c @@ -1534,19 +1534,8 @@ EXPORT_PER_CPU_SYMBOL(__preempt_count); /* May not be marked __init: used by software suspend */ void syscall_init(void) { - extern char _entry_trampoline[]; - extern char entry_SYSCALL_64_trampoline[]; - - int cpu = smp_processor_id(); - unsigned long SYSCALL64_entry_trampoline = - (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline + - (entry_SYSCALL_64_trampoline - _entry_trampoline); - wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); - if (static_cpu_has(X86_FEATURE_PTI)) - wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); - else - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); @@ -1557,7 +1546,8 @@ void syscall_init(void) * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1)); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, + (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c @@ -1028,18 +1028,10 @@ NOKPROBE_SYMBOL(kprobe_fault_handler); bool arch_within_kprobe_blacklist(unsigned long addr) { - bool is_in_entry_trampoline_section = false; - -#ifdef CONFIG_X86_64 - is_in_entry_trampoline_section = - (addr >= (unsigned long)__entry_trampoline_start && - addr < (unsigned long)__entry_trampoline_end); -#endif return (addr >= (unsigned long)__kprobes_text_start && addr < (unsigned long)__kprobes_text_end) || (addr >= (unsigned long)__entry_text_start && - addr < (unsigned long)__entry_text_end) || - is_in_entry_trampoline_section; + addr < (unsigned long)__entry_text_end); } int __init arch_init_kprobes(void) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c @@ -60,8 +60,6 @@ #include <asm/unistd_32_ia32.h> #endif -__visible DEFINE_PER_CPU(unsigned long, rsp_scratch); - /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c @@ -383,6 +383,10 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) * we won't enable interupts or schedule before we invoke * general_protection, so nothing will clobber the stack * frame we just set up. + * + * We will enter general_protection with kernel GSBASE, + * which is what the stub expects, given that the faulting + * RIP will be the IRET instruction. */ regs->ip = (unsigned long)general_protection; regs->sp = (unsigned long)&gpregs->orig_ax; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S @@ -136,16 +136,6 @@ SECTIONS *(.fixup) *(.gnu.warning) -#ifdef CONFIG_X86_64 - . = ALIGN(PAGE_SIZE); - __entry_trampoline_start = .; - _entry_trampoline = .; - *(.entry_trampoline) - . = ALIGN(PAGE_SIZE); - __entry_trampoline_end = .; - ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); -#endif - #ifdef CONFIG_RETPOLINE __indirect_thunk_start = .; *(.text.__x86.indirect_thunk) diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c @@ -15,7 +15,6 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage) #ifdef CONFIG_X86_64 static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); -static DEFINE_PER_CPU(struct kcore_list, kcore_entry_trampoline); #endif struct cpu_entry_area *get_cpu_entry_area(int cpu) @@ -83,8 +82,6 @@ static void percpu_setup_debug_store(int cpu) static void __init setup_cpu_entry_area(int cpu) { #ifdef CONFIG_X86_64 - extern char _entry_trampoline[]; - /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ pgprot_t gdt_prot = PAGE_KERNEL_RO; pgprot_t tss_prot = PAGE_KERNEL_RO; @@ -146,43 +143,10 @@ static void __init setup_cpu_entry_area(int cpu) cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks, &per_cpu(exception_stacks, cpu), sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); - - cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, - __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); - /* - * The cpu_entry_area alias addresses are not in the kernel binary - * so they do not show up in /proc/kcore normally. This adds entries - * for them manually. - */ - kclist_add_remap(&per_cpu(kcore_entry_trampoline, cpu), - _entry_trampoline, - &get_cpu_entry_area(cpu)->entry_trampoline, PAGE_SIZE); #endif percpu_setup_debug_store(cpu); } -#ifdef CONFIG_X86_64 -int arch_get_kallsym(unsigned int symnum, unsigned long *value, char *type, - char *name) -{ - unsigned int cpu, ncpu = 0; - - if (symnum >= num_possible_cpus()) - return -EINVAL; - - for_each_possible_cpu(cpu) { - if (ncpu++ >= symnum) - break; - } - - *value = (unsigned long)&get_cpu_entry_area(cpu)->entry_trampoline; - *type = 't'; - strlcpy(name, "__entry_SYSCALL_64_trampoline", KSYM_NAME_LEN); - - return 0; -} -#endif - static __init void setup_cpu_entry_area_ptes(void) { #ifdef CONFIG_X86_32 diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c @@ -434,11 +434,42 @@ static void __init pti_clone_p4d(unsigned long addr) } /* - * Clone the CPU_ENTRY_AREA into the user space visible page table. + * Clone the CPU_ENTRY_AREA and associated data into the user space visible + * page table. */ static void __init pti_clone_user_shared(void) { + unsigned int cpu; + pti_clone_p4d(CPU_ENTRY_AREA_BASE); + + for_each_possible_cpu(cpu) { + /* + * The SYSCALL64 entry code needs to be able to find the + * thread stack and needs one word of scratch space in which + * to spill a register. All of this lives in the TSS, in + * the sp1 and sp2 slots. + * + * This is done for all possible CPUs during boot to ensure + * that it's propagated to all mms. If we were to add one of + * these mappings during CPU hotplug, we would need to take + * some measure to make sure that every mm that subsequently + * ran on that CPU would have the relevant PGD entry in its + * pagetables. The usual vmalloc_fault() mechanism would not + * work for page faults taken in entry_SYSCALL_64 before RSP + * is set up. + */ + + unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu); + phys_addr_t pa = per_cpu_ptr_to_phys((void *)va); + pte_t *target_pte; + + target_pte = pti_user_pagetable_walk_pte(va); + if (WARN_ON(!target_pte)) + return; + + *target_pte = pfn_pte(pa >> PAGE_SHIFT, PAGE_KERNEL); + } } #else /* CONFIG_X86_64 */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c @@ -7,6 +7,7 @@ #include <linux/export.h> #include <linux/cpu.h> #include <linux/debugfs.h> +#include <linux/ptrace.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -180,6 +181,19 @@ static void sync_current_stack_to_mm(struct mm_struct *mm) } } +static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id) +{ + /* + * Check if the current (previous) task has access to the memory + * of the @tsk (next) task. If access is denied, make sure to + * issue a IBPB to stop user->user Spectre-v2 attacks. + * + * Note: __ptrace_may_access() returns 0 or -ERRNO. + */ + return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id && + ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB)); +} + void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { @@ -286,18 +300,13 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * one process from doing Spectre-v2 attacks on another. * * As an optimization, flush indirect branches only when - * switching into processes that disable dumping. This - * protects high value processes like gpg, without having - * too high performance overhead. IBPB is *expensive*! - * - * This will not flush branches when switching into kernel - * threads. It will also not flush if we switch to idle - * thread and back to the same process. It will flush if we - * switch to a different non-dumpable process. + * switching into a processes that can't be ptrace by the + * current one (as in such case, attacker has much more + * convenient way how to tamper with the next process than + * branch buffer poisoning). */ - if (tsk && tsk->mm && - tsk->mm->context.ctx_id != last_ctx_id && - get_dumpable(tsk->mm) != SUID_DUMP_USER) + if (static_cpu_has(X86_FEATURE_USE_IBPB) && + ibpb_needed(tsk, last_ctx_id)) indirect_branch_prediction_barrier(); if (IS_ENABLED(CONFIG_VMAP_STACK)) { diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S @@ -91,13 +91,15 @@ ENTRY(xen_iret) ENTRY(xen_sysret64) /* * We're already on the usermode stack at this point, but - * still with the kernel gs, so we can easily switch back + * still with the kernel gs, so we can easily switch back. + * + * tss.sp2 is scratch space. */ - movq %rsp, PER_CPU_VAR(rsp_scratch) + movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp pushq $__USER_DS - pushq PER_CPU_VAR(rsp_scratch) + pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) pushq %r11 pushq $__USER_CS pushq %rcx diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h @@ -62,14 +62,17 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 #define PTRACE_MODE_ATTACH 0x02 #define PTRACE_MODE_NOAUDIT 0x04 -#define PTRACE_MODE_FSCREDS 0x08 -#define PTRACE_MODE_REALCREDS 0x10 +#define PTRACE_MODE_FSCREDS 0x08 +#define PTRACE_MODE_REALCREDS 0x10 +#define PTRACE_MODE_SCHED 0x20 +#define PTRACE_MODE_IBPB 0x40 /* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ #define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) #define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) #define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) #define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) +#define PTRACE_MODE_SPEC_IBPB (PTRACE_MODE_ATTACH_REALCREDS | PTRACE_MODE_IBPB) /** * ptrace_may_access - check whether the caller is permitted to access @@ -87,6 +90,20 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); */ extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); +/** + * ptrace_may_access - check whether the caller is permitted to access + * a target task. + * @task: target task + * @mode: selects type of access and caller credentials + * + * Returns true on success, false on denial. + * + * Similar to ptrace_may_access(). Only to be called from context switch + * code. Does not call into audit and the regular LSM hooks due to locking + * constraints. + */ +extern bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode); + static inline int ptrace_reparented(struct task_struct *child) { return !same_thread_group(child->real_parent, child->parent); diff --git a/kernel/cpu.c b/kernel/cpu.c @@ -2055,6 +2055,12 @@ static void cpuhp_online_cpu_device(unsigned int cpu) kobject_uevent(&dev->kobj, KOBJ_ONLINE); } +/* + * Architectures that need SMT-specific errata handling during SMT hotplug + * should override this. + */ +void __weak arch_smt_update(void) { }; + static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { int cpu, ret = 0; @@ -2081,8 +2087,10 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) */ cpuhp_offline_cpu_device(cpu); } - if (!ret) + if (!ret) { cpu_smt_control = ctrlval; + arch_smt_update(); + } cpu_maps_update_done(); return ret; } @@ -2093,6 +2101,7 @@ static int cpuhp_smt_enable(void) cpu_maps_update_begin(); cpu_smt_control = CPU_SMT_ENABLED; + arch_smt_update(); for_each_present_cpu(cpu) { /* Skip online CPUs and CPUs on offline nodes */ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) diff --git a/kernel/ptrace.c b/kernel/ptrace.c @@ -261,6 +261,9 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) { + if (mode & PTRACE_MODE_SCHED) + return false; + if (mode & PTRACE_MODE_NOAUDIT) return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE); else @@ -328,9 +331,16 @@ ok: !ptrace_has_cap(mm->user_ns, mode))) return -EPERM; + if (mode & PTRACE_MODE_SCHED) + return 0; return security_ptrace_access_check(task, mode); } +bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode) +{ + return __ptrace_may_access(task, mode | PTRACE_MODE_SCHED); +} + bool ptrace_may_access(struct task_struct *task, unsigned int mode) { int err;