whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit eed9688f8513189295887e5a27ec7f576754b60e
parent 72af84151fa7e98720a0482b59de2d90ec7f10a7
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Wed, 26 Dec 2018 13:03:47 -0800

Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 RAS updates from Borislav Petkov:
 "This time around we have a subsystem reorganization to offer, with the
  new directory being

    arch/x86/kernel/cpu/mce/

  and all compilation units' names streamlined under it"

* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mce: Restore MCE injector's module name
  x86/mce: Unify pr_* prefix
  x86/mce: Streamline MCE subsystem's naming

Diffstat:
March/x86/kernel/cpu/Makefile | 2+-
Aarch/x86/kernel/cpu/mce/Makefile | 16++++++++++++++++
Aarch/x86/kernel/cpu/mce/amd.c | 1437+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/kernel/cpu/mce/apei.c | 157+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/kernel/cpu/mce/core.c | 2497+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/kernel/cpu/mce/dev-mcelog.c | 358+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/kernel/cpu/mce/genpool.c | 145+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/kernel/cpu/mce/inject.c | 739+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/kernel/cpu/mce/intel.c | 518+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/kernel/cpu/mce/internal.h | 176+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/kernel/cpu/mce/p5.c | 71+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/kernel/cpu/mce/severity.c | 419+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/kernel/cpu/mce/therm_throt.c | 522+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/kernel/cpu/mce/threshold.c | 31+++++++++++++++++++++++++++++++
Aarch/x86/kernel/cpu/mce/winchip.c | 46++++++++++++++++++++++++++++++++++++++++++++++
Darch/x86/kernel/cpu/mcheck/Makefile | 14--------------
Darch/x86/kernel/cpu/mcheck/dev-mcelog.c | 360-------------------------------------------------------------------------------
Darch/x86/kernel/cpu/mcheck/mce-apei.c | 157-------------------------------------------------------------------------------
Darch/x86/kernel/cpu/mcheck/mce-genpool.c | 145-------------------------------------------------------------------------------
Darch/x86/kernel/cpu/mcheck/mce-inject.c | 739-------------------------------------------------------------------------------
Darch/x86/kernel/cpu/mcheck/mce-internal.h | 173-------------------------------------------------------------------------------
Darch/x86/kernel/cpu/mcheck/mce-severity.c | 419-------------------------------------------------------------------------------
Darch/x86/kernel/cpu/mcheck/mce.c | 2499-------------------------------------------------------------------------------
Darch/x86/kernel/cpu/mcheck/mce_amd.c | 1437-------------------------------------------------------------------------------
Darch/x86/kernel/cpu/mcheck/mce_intel.c | 518-------------------------------------------------------------------------------
Darch/x86/kernel/cpu/mcheck/p5.c | 69---------------------------------------------------------------------
Darch/x86/kernel/cpu/mcheck/therm_throt.c | 520-------------------------------------------------------------------------------
Darch/x86/kernel/cpu/mcheck/threshold.c | 29-----------------------------
Darch/x86/kernel/cpu/mcheck/winchip.c | 44--------------------------------------------
29 files changed, 7133 insertions(+), 7124 deletions(-)

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile @@ -36,7 +36,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_X86_MCE) += mcheck/ +obj-$(CONFIG_X86_MCE) += mce/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_MICROCODE) += microcode/ obj-$(CONFIG_RESCTRL) += resctrl/ diff --git a/arch/x86/kernel/cpu/mce/Makefile b/arch/x86/kernel/cpu/mce/Makefile @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-y = core.o severity.o genpool.o + +obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o +obj-$(CONFIG_X86_MCE_INTEL) += intel.o +obj-$(CONFIG_X86_MCE_AMD) += amd.o +obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o + +mce-inject-y := inject.o +obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o + +obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o + +obj-$(CONFIG_ACPI_APEI) += apei.o + +obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c @@ -0,0 +1,1437 @@ +/* + * (c) 2005-2016 Advanced Micro Devices, Inc. + * Your use of this code is subject to the terms and conditions of the + * GNU general public license version 2. See "COPYING" or + * http://www.gnu.org/licenses/gpl.html + * + * Written by Jacob Shin - AMD, Inc. + * Maintained by: Borislav Petkov <bp@alien8.de> + * + * All MC4_MISCi registers are shared between cores on a node. + */ +#include <linux/interrupt.h> +#include <linux/notifier.h> +#include <linux/kobject.h> +#include <linux/percpu.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/sysfs.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/cpu.h> +#include <linux/smp.h> +#include <linux/string.h> + +#include <asm/amd_nb.h> +#include <asm/apic.h> +#include <asm/mce.h> +#include <asm/msr.h> +#include <asm/trace/irq_vectors.h> + +#include "internal.h" + +#define NR_BLOCKS 5 +#define THRESHOLD_MAX 0xFFF +#define INT_TYPE_APIC 0x00020000 +#define MASK_VALID_HI 0x80000000 +#define MASK_CNTP_HI 0x40000000 +#define MASK_LOCKED_HI 0x20000000 +#define MASK_LVTOFF_HI 0x00F00000 +#define MASK_COUNT_EN_HI 0x00080000 +#define MASK_INT_TYPE_HI 0x00060000 +#define MASK_OVERFLOW_HI 0x00010000 +#define MASK_ERR_COUNT_HI 0x00000FFF +#define MASK_BLKPTR_LO 0xFF000000 +#define MCG_XBLK_ADDR 0xC0000400 + +/* Deferred error settings */ +#define MSR_CU_DEF_ERR 0xC0000410 +#define MASK_DEF_LVTOFF 0x000000F0 +#define MASK_DEF_INT_TYPE 0x00000006 +#define DEF_LVT_OFF 0x2 +#define DEF_INT_TYPE_APIC 0x2 + +/* Scalable MCA: */ + +/* Threshold LVT offset is at MSR0xC0000410[15:12] */ +#define SMCA_THR_LVT_OFF 0xF000 + +static bool thresholding_irq_en; + +static const char * const th_names[] = { + "load_store", + "insn_fetch", + "combined_unit", + "decode_unit", + "northbridge", + "execution_unit", +}; + +static const char * const smca_umc_block_names[] = { + "dram_ecc", + "misc_umc" +}; + +struct smca_bank_name { + const char *name; /* Short name for sysfs */ + const char *long_name; /* Long name for pretty-printing */ +}; + +static struct smca_bank_name smca_names[] = { + [SMCA_LS] = { "load_store", "Load Store Unit" }, + [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, + [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, + [SMCA_DE] = { "decode_unit", "Decode Unit" }, + [SMCA_RESERVED] = { "reserved", "Reserved" }, + [SMCA_EX] = { "execution_unit", "Execution Unit" }, + [SMCA_FP] = { "floating_point", "Floating Point Unit" }, + [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, + [SMCA_CS] = { "coherent_slave", "Coherent Slave" }, + [SMCA_PIE] = { "pie", "Power, Interrupts, etc." }, + [SMCA_UMC] = { "umc", "Unified Memory Controller" }, + [SMCA_PB] = { "param_block", "Parameter Block" }, + [SMCA_PSP] = { "psp", "Platform Security Processor" }, + [SMCA_SMU] = { "smu", "System Management Unit" }, +}; + +static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init = +{ + [0 ... MAX_NR_BANKS - 1] = { [0 ... NR_BLOCKS - 1] = -1 } +}; + +const char *smca_get_name(enum smca_bank_types t) +{ + if (t >= N_SMCA_BANK_TYPES) + return NULL; + + return smca_names[t].name; +} + +const char *smca_get_long_name(enum smca_bank_types t) +{ + if (t >= N_SMCA_BANK_TYPES) + return NULL; + + return smca_names[t].long_name; +} +EXPORT_SYMBOL_GPL(smca_get_long_name); + +static enum smca_bank_types smca_get_bank_type(unsigned int bank) +{ + struct smca_bank *b; + + if (bank >= MAX_NR_BANKS) + return N_SMCA_BANK_TYPES; + + b = &smca_banks[bank]; + if (!b->hwid) + return N_SMCA_BANK_TYPES; + + return b->hwid->bank_type; +} + +static struct smca_hwid smca_hwid_mcatypes[] = { + /* { bank_type, hwid_mcatype, xec_bitmap } */ + + /* Reserved type */ + { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0), 0x0 }, + + /* ZN Core (HWID=0xB0) MCA types */ + { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF }, + { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF }, + { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF }, + { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF }, + /* HWID 0xB0 MCATYPE 0x4 is Reserved */ + { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0x7FF }, + { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F }, + { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF }, + + /* Data Fabric MCA types */ + { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF }, + { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0xF }, + + /* Unified Memory Controller MCA type */ + { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0x3F }, + + /* Parameter Block MCA type */ + { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 }, + + /* Platform Security Processor MCA type */ + { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 }, + + /* System Management Unit MCA type */ + { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 }, +}; + +struct smca_bank smca_banks[MAX_NR_BANKS]; +EXPORT_SYMBOL_GPL(smca_banks); + +/* + * In SMCA enabled processors, we can have multiple banks for a given IP type. + * So to define a unique name for each bank, we use a temp c-string to append + * the MCA_IPID[InstanceId] to type's name in get_name(). + * + * InstanceId is 32 bits which is 8 characters. Make sure MAX_MCATYPE_NAME_LEN + * is greater than 8 plus 1 (for underscore) plus length of longest type name. + */ +#define MAX_MCATYPE_NAME_LEN 30 +static char buf_mcatype[MAX_MCATYPE_NAME_LEN]; + +static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); +static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */ + +static void amd_threshold_interrupt(void); +static void amd_deferred_error_interrupt(void); + +static void default_deferred_error_interrupt(void) +{ + pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR); +} +void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; + +static void smca_configure(unsigned int bank, unsigned int cpu) +{ + unsigned int i, hwid_mcatype; + struct smca_hwid *s_hwid; + u32 high, low; + u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank); + + /* Set appropriate bits in MCA_CONFIG */ + if (!rdmsr_safe(smca_config, &low, &high)) { + /* + * OS is required to set the MCAX bit to acknowledge that it is + * now using the new MSR ranges and new registers under each + * bank. It also means that the OS will configure deferred + * errors in the new MCx_CONFIG register. If the bit is not set, + * uncorrectable errors will cause a system panic. + * + * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.) + */ + high |= BIT(0); + + /* + * SMCA sets the Deferred Error Interrupt type per bank. + * + * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us + * if the DeferredIntType bit field is available. + * + * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the + * high portion of the MSR). OS should set this to 0x1 to enable + * APIC based interrupt. First, check that no interrupt has been + * set. + */ + if ((low & BIT(5)) && !((high >> 5) & 0x3)) + high |= BIT(5); + + wrmsr(smca_config, low, high); + } + + /* Return early if this bank was already initialized. */ + if (smca_banks[bank].hwid) + return; + + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { + pr_warn("Failed to read MCA_IPID for bank %d\n", bank); + return; + } + + hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID, + (high & MCI_IPID_MCATYPE) >> 16); + + for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { + s_hwid = &smca_hwid_mcatypes[i]; + if (hwid_mcatype == s_hwid->hwid_mcatype) { + smca_banks[bank].hwid = s_hwid; + smca_banks[bank].id = low; + smca_banks[bank].sysfs_id = s_hwid->count++; + break; + } + } +} + +struct thresh_restart { + struct threshold_block *b; + int reset; + int set_lvt_off; + int lvt_off; + u16 old_limit; +}; + +static inline bool is_shared_bank(int bank) +{ + /* + * Scalable MCA provides for only one core to have access to the MSRs of + * a shared bank. + */ + if (mce_flags.smca) + return false; + + /* Bank 4 is for northbridge reporting and is thus shared */ + return (bank == 4); +} + +static const char *bank4_names(const struct threshold_block *b) +{ + switch (b->address) { + /* MSR4_MISC0 */ + case 0x00000413: + return "dram"; + + case 0xc0000408: + return "ht_links"; + + case 0xc0000409: + return "l3_cache"; + + default: + WARN(1, "Funny MSR: 0x%08x\n", b->address); + return ""; + } +}; + + +static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) +{ + /* + * bank 4 supports APIC LVT interrupts implicitly since forever. + */ + if (bank == 4) + return true; + + /* + * IntP: interrupt present; if this bit is set, the thresholding + * bank can generate APIC LVT interrupts + */ + return msr_high_bits & BIT(28); +} + +static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) +{ + int msr = (hi & MASK_LVTOFF_HI) >> 20; + + if (apic < 0) { + pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, + b->bank, b->block, b->address, hi, lo); + return 0; + } + + if (apic != msr) { + /* + * On SMCA CPUs, LVT offset is programmed at a different MSR, and + * the BIOS provides the value. The original field where LVT offset + * was set is reserved. Return early here: + */ + if (mce_flags.smca) + return 0; + + pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", + b->cpu, apic, b->bank, b->block, b->address, hi, lo); + return 0; + } + + return 1; +}; + +/* Reprogram MCx_MISC MSR behind this threshold bank. */ +static void threshold_restart_bank(void *_tr) +{ + struct thresh_restart *tr = _tr; + u32 hi, lo; + + rdmsr(tr->b->address, lo, hi); + + if (tr->b->threshold_limit < (hi & THRESHOLD_MAX)) + tr->reset = 1; /* limit cannot be lower than err count */ + + if (tr->reset) { /* reset err count and overflow bit */ + hi = + (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | + (THRESHOLD_MAX - tr->b->threshold_limit); + } else if (tr->old_limit) { /* change limit w/o reset */ + int new_count = (hi & THRESHOLD_MAX) + + (tr->old_limit - tr->b->threshold_limit); + + hi = (hi & ~MASK_ERR_COUNT_HI) | + (new_count & THRESHOLD_MAX); + } + + /* clear IntType */ + hi &= ~MASK_INT_TYPE_HI; + + if (!tr->b->interrupt_capable) + goto done; + + if (tr->set_lvt_off) { + if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) { + /* set new lvt offset */ + hi &= ~MASK_LVTOFF_HI; + hi |= tr->lvt_off << 20; + } + } + + if (tr->b->interrupt_enable) + hi |= INT_TYPE_APIC; + + done: + + hi |= MASK_COUNT_EN_HI; + wrmsr(tr->b->address, lo, hi); +} + +static void mce_threshold_block_init(struct threshold_block *b, int offset) +{ + struct thresh_restart tr = { + .b = b, + .set_lvt_off = 1, + .lvt_off = offset, + }; + + b->threshold_limit = THRESHOLD_MAX; + threshold_restart_bank(&tr); +}; + +static int setup_APIC_mce_threshold(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; +} + +static int setup_APIC_deferred_error(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; +} + +static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) +{ + u32 low = 0, high = 0; + int def_offset = -1, def_new; + + if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high)) + return; + + def_new = (low & MASK_DEF_LVTOFF) >> 4; + if (!(low & MASK_DEF_LVTOFF)) { + pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n"); + def_new = DEF_LVT_OFF; + low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4); + } + + def_offset = setup_APIC_deferred_error(def_offset, def_new); + if ((def_offset == def_new) && + (deferred_error_int_vector != amd_deferred_error_interrupt)) + deferred_error_int_vector = amd_deferred_error_interrupt; + + if (!mce_flags.smca) + low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; + + wrmsr(MSR_CU_DEF_ERR, low, high); +} + +static u32 smca_get_block_address(unsigned int bank, unsigned int block) +{ + u32 low, high; + u32 addr = 0; + + if (smca_get_bank_type(bank) == SMCA_RESERVED) + return addr; + + if (!block) + return MSR_AMD64_SMCA_MCx_MISC(bank); + + /* Check our cache first: */ + if (smca_bank_addrs[bank][block] != -1) + return smca_bank_addrs[bank][block]; + + /* + * For SMCA enabled processors, BLKPTR field of the first MISC register + * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). + */ + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) + goto out; + + if (!(low & MCI_CONFIG_MCAX)) + goto out; + + if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && + (low & MASK_BLKPTR_LO)) + addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + +out: + smca_bank_addrs[bank][block] = addr; + return addr; +} + +static u32 get_block_address(u32 current_addr, u32 low, u32 high, + unsigned int bank, unsigned int block) +{ + u32 addr = 0, offset = 0; + + if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS)) + return addr; + + if (mce_flags.smca) + return smca_get_block_address(bank, block); + + /* Fall back to method we used for older processors: */ + switch (block) { + case 0: + addr = msr_ops.misc(bank); + break; + case 1: + offset = ((low & MASK_BLKPTR_LO) >> 21); + if (offset) + addr = MCG_XBLK_ADDR + offset; + break; + default: + addr = ++current_addr; + } + return addr; +} + +static int +prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, + int offset, u32 misc_high) +{ + unsigned int cpu = smp_processor_id(); + u32 smca_low, smca_high; + struct threshold_block b; + int new; + + if (!block) + per_cpu(bank_map, cpu) |= (1 << bank); + + memset(&b, 0, sizeof(b)); + b.cpu = cpu; + b.bank = bank; + b.block = block; + b.address = addr; + b.interrupt_capable = lvt_interrupt_supported(bank, misc_high); + + if (!b.interrupt_capable) + goto done; + + b.interrupt_enable = 1; + + if (!mce_flags.smca) { + new = (misc_high & MASK_LVTOFF_HI) >> 20; + goto set_offset; + } + + /* Gather LVT offset for thresholding: */ + if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) + goto out; + + new = (smca_low & SMCA_THR_LVT_OFF) >> 12; + +set_offset: + offset = setup_APIC_mce_threshold(offset, new); + if (offset == new) + thresholding_irq_en = true; + +done: + mce_threshold_block_init(&b, offset); + +out: + return offset; +} + +/* cpu init entry point, called from mce.c with preempt off */ +void mce_amd_feature_init(struct cpuinfo_x86 *c) +{ + u32 low = 0, high = 0, address = 0; + unsigned int bank, block, cpu = smp_processor_id(); + int offset = -1; + + for (bank = 0; bank < mca_cfg.banks; ++bank) { + if (mce_flags.smca) + smca_configure(bank, cpu); + + for (block = 0; block < NR_BLOCKS; ++block) { + address = get_block_address(address, low, high, bank, block); + if (!address) + break; + + if (rdmsr_safe(address, &low, &high)) + break; + + if (!(high & MASK_VALID_HI)) + continue; + + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) + continue; + + offset = prepare_threshold_block(bank, block, address, offset, high); + } + } + + if (mce_flags.succor) + deferred_error_interrupt_enable(c); +} + +int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) +{ + u64 dram_base_addr, dram_limit_addr, dram_hole_base; + /* We start from the normalized address */ + u64 ret_addr = norm_addr; + + u32 tmp; + + u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask; + u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets; + u8 intlv_addr_sel, intlv_addr_bit; + u8 num_intlv_bits, hashed_bit; + u8 lgcy_mmio_hole_en, base = 0; + u8 cs_mask, cs_id = 0; + bool hash_enabled = false; + + /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */ + if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp)) + goto out_err; + + /* Remove HiAddrOffset from normalized address, if enabled: */ + if (tmp & BIT(0)) { + u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8; + + if (norm_addr >= hi_addr_offset) { + ret_addr -= hi_addr_offset; + base = 1; + } + } + + /* Read D18F0x110 (DramBaseAddress). */ + if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp)) + goto out_err; + + /* Check if address range is valid. */ + if (!(tmp & BIT(0))) { + pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n", + __func__, tmp); + goto out_err; + } + + lgcy_mmio_hole_en = tmp & BIT(1); + intlv_num_chan = (tmp >> 4) & 0xF; + intlv_addr_sel = (tmp >> 8) & 0x7; + dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16; + + /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */ + if (intlv_addr_sel > 3) { + pr_err("%s: Invalid interleave address select %d.\n", + __func__, intlv_addr_sel); + goto out_err; + } + + /* Read D18F0x114 (DramLimitAddress). */ + if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp)) + goto out_err; + + intlv_num_sockets = (tmp >> 8) & 0x1; + intlv_num_dies = (tmp >> 10) & 0x3; + dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0); + + intlv_addr_bit = intlv_addr_sel + 8; + + /* Re-use intlv_num_chan by setting it equal to log2(#channels) */ + switch (intlv_num_chan) { + case 0: intlv_num_chan = 0; break; + case 1: intlv_num_chan = 1; break; + case 3: intlv_num_chan = 2; break; + case 5: intlv_num_chan = 3; break; + case 7: intlv_num_chan = 4; break; + + case 8: intlv_num_chan = 1; + hash_enabled = true; + break; + default: + pr_err("%s: Invalid number of interleaved channels %d.\n", + __func__, intlv_num_chan); + goto out_err; + } + + num_intlv_bits = intlv_num_chan; + + if (intlv_num_dies > 2) { + pr_err("%s: Invalid number of interleaved nodes/dies %d.\n", + __func__, intlv_num_dies); + goto out_err; + } + + num_intlv_bits += intlv_num_dies; + + /* Add a bit if sockets are interleaved. */ + num_intlv_bits += intlv_num_sockets; + + /* Assert num_intlv_bits <= 4 */ + if (num_intlv_bits > 4) { + pr_err("%s: Invalid interleave bits %d.\n", + __func__, num_intlv_bits); + goto out_err; + } + + if (num_intlv_bits > 0) { + u64 temp_addr_x, temp_addr_i, temp_addr_y; + u8 die_id_bit, sock_id_bit, cs_fabric_id; + + /* + * Read FabricBlockInstanceInformation3_CS[BlockFabricID]. + * This is the fabric id for this coherent slave. Use + * umc/channel# as instance id of the coherent slave + * for FICAA. + */ + if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp)) + goto out_err; + + cs_fabric_id = (tmp >> 8) & 0xFF; + die_id_bit = 0; + + /* If interleaved over more than 1 channel: */ + if (intlv_num_chan) { + die_id_bit = intlv_num_chan; + cs_mask = (1 << die_id_bit) - 1; + cs_id = cs_fabric_id & cs_mask; + } + + sock_id_bit = die_id_bit; + + /* Read D18F1x208 (SystemFabricIdMask). */ + if (intlv_num_dies || intlv_num_sockets) + if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp)) + goto out_err; + + /* If interleaved over more than 1 die. */ + if (intlv_num_dies) { + sock_id_bit = die_id_bit + intlv_num_dies; + die_id_shift = (tmp >> 24) & 0xF; + die_id_mask = (tmp >> 8) & 0xFF; + + cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit; + } + + /* If interleaved over more than 1 socket. */ + if (intlv_num_sockets) { + socket_id_shift = (tmp >> 28) & 0xF; + socket_id_mask = (tmp >> 16) & 0xFF; + + cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit; + } + + /* + * The pre-interleaved address consists of XXXXXXIIIYYYYY + * where III is the ID for this CS, and XXXXXXYYYYY are the + * address bits from the post-interleaved address. + * "num_intlv_bits" has been calculated to tell us how many "I" + * bits there are. "intlv_addr_bit" tells us how many "Y" bits + * there are (where "I" starts). + */ + temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0); + temp_addr_i = (cs_id << intlv_addr_bit); + temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits; + ret_addr = temp_addr_x | temp_addr_i | temp_addr_y; + } + + /* Add dram base address */ + ret_addr += dram_base_addr; + + /* If legacy MMIO hole enabled */ + if (lgcy_mmio_hole_en) { + if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp)) + goto out_err; + + dram_hole_base = tmp & GENMASK(31, 24); + if (ret_addr >= dram_hole_base) + ret_addr += (BIT_ULL(32) - dram_hole_base); + } + + if (hash_enabled) { + /* Save some parentheses and grab ls-bit at the end. */ + hashed_bit = (ret_addr >> 12) ^ + (ret_addr >> 18) ^ + (ret_addr >> 21) ^ + (ret_addr >> 30) ^ + cs_id; + + hashed_bit &= BIT(0); + + if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0))) + ret_addr ^= BIT(intlv_addr_bit); + } + + /* Is calculated system address is above DRAM limit address? */ + if (ret_addr > dram_limit_addr) + goto out_err; + + *sys_addr = ret_addr; + return 0; + +out_err: + return -EINVAL; +} +EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); + +bool amd_mce_is_memory_error(struct mce *m) +{ + /* ErrCodeExt[20:16] */ + u8 xec = (m->status >> 16) & 0x1f; + + if (mce_flags.smca) + return smca_get_bank_type(m->bank) == SMCA_UMC && xec == 0x0; + + return m->bank == 4 && xec == 0x8; +} + +static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) +{ + struct mce m; + + mce_setup(&m); + + m.status = status; + m.misc = misc; + m.bank = bank; + m.tsc = rdtsc(); + + if (m.status & MCI_STATUS_ADDRV) { + m.addr = addr; + + /* + * Extract [55:<lsb>] where lsb is the least significant + * *valid* bit of the address bits. + */ + if (mce_flags.smca) { + u8 lsb = (m.addr >> 56) & 0x3f; + + m.addr &= GENMASK_ULL(55, lsb); + } + } + + if (mce_flags.smca) { + rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid); + + if (m.status & MCI_STATUS_SYNDV) + rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd); + } + + mce_log(&m); +} + +asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void) +{ + entering_irq(); + trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); + inc_irq_stat(irq_deferred_error_count); + deferred_error_int_vector(); + trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); + exiting_ack_irq(); +} + +/* + * Returns true if the logged error is deferred. False, otherwise. + */ +static inline bool +_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) +{ + u64 status, addr = 0; + + rdmsrl(msr_stat, status); + if (!(status & MCI_STATUS_VAL)) + return false; + + if (status & MCI_STATUS_ADDRV) + rdmsrl(msr_addr, addr); + + __log_error(bank, status, addr, misc); + + wrmsrl(msr_stat, 0); + + return status & MCI_STATUS_DEFERRED; +} + +/* + * We have three scenarios for checking for Deferred errors: + * + * 1) Non-SMCA systems check MCA_STATUS and log error if found. + * 2) SMCA systems check MCA_STATUS. If error is found then log it and also + * clear MCA_DESTAT. + * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and + * log it. + */ +static void log_error_deferred(unsigned int bank) +{ + bool defrd; + + defrd = _log_error_bank(bank, msr_ops.status(bank), + msr_ops.addr(bank), 0); + + if (!mce_flags.smca) + return; + + /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */ + if (defrd) { + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); + return; + } + + /* + * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check + * for a valid error. + */ + _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank), + MSR_AMD64_SMCA_MCx_DEADDR(bank), 0); +} + +/* APIC interrupt handler for deferred errors */ +static void amd_deferred_error_interrupt(void) +{ + unsigned int bank; + + for (bank = 0; bank < mca_cfg.banks; ++bank) + log_error_deferred(bank); +} + +static void log_error_thresholding(unsigned int bank, u64 misc) +{ + _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc); +} + +static void log_and_reset_block(struct threshold_block *block) +{ + struct thresh_restart tr; + u32 low = 0, high = 0; + + if (!block) + return; + + if (rdmsr_safe(block->address, &low, &high)) + return; + + if (!(high & MASK_OVERFLOW_HI)) + return; + + /* Log the MCE which caused the threshold event. */ + log_error_thresholding(block->bank, ((u64)high << 32) | low); + + /* Reset threshold block after logging error. */ + memset(&tr, 0, sizeof(tr)); + tr.b = block; + threshold_restart_bank(&tr); +} + +/* + * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt + * goes off when error_count reaches threshold_limit. + */ +static void amd_threshold_interrupt(void) +{ + struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; + unsigned int bank, cpu = smp_processor_id(); + + for (bank = 0; bank < mca_cfg.banks; ++bank) { + if (!(per_cpu(bank_map, cpu) & (1 << bank))) + continue; + + first_block = per_cpu(threshold_banks, cpu)[bank]->blocks; + if (!first_block) + continue; + + /* + * The first block is also the head of the list. Check it first + * before iterating over the rest. + */ + log_and_reset_block(first_block); + list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj) + log_and_reset_block(block); + } +} + +/* + * Sysfs Interface + */ + +struct threshold_attr { + struct attribute attr; + ssize_t (*show) (struct threshold_block *, char *); + ssize_t (*store) (struct threshold_block *, const char *, size_t count); +}; + +#define SHOW_FIELDS(name) \ +static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ +{ \ + return sprintf(buf, "%lu\n", (unsigned long) b->name); \ +} +SHOW_FIELDS(interrupt_enable) +SHOW_FIELDS(threshold_limit) + +static ssize_t +store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) +{ + struct thresh_restart tr; + unsigned long new; + + if (!b->interrupt_capable) + return -EINVAL; + + if (kstrtoul(buf, 0, &new) < 0) + return -EINVAL; + + b->interrupt_enable = !!new; + + memset(&tr, 0, sizeof(tr)); + tr.b = b; + + smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); + + return size; +} + +static ssize_t +store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) +{ + struct thresh_restart tr; + unsigned long new; + + if (kstrtoul(buf, 0, &new) < 0) + return -EINVAL; + + if (new > THRESHOLD_MAX) + new = THRESHOLD_MAX; + if (new < 1) + new = 1; + + memset(&tr, 0, sizeof(tr)); + tr.old_limit = b->threshold_limit; + b->threshold_limit = new; + tr.b = b; + + smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); + + return size; +} + +static ssize_t show_error_count(struct threshold_block *b, char *buf) +{ + u32 lo, hi; + + rdmsr_on_cpu(b->cpu, b->address, &lo, &hi); + + return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) - + (THRESHOLD_MAX - b->threshold_limit))); +} + +static struct threshold_attr error_count = { + .attr = {.name = __stringify(error_count), .mode = 0444 }, + .show = show_error_count, +}; + +#define RW_ATTR(val) \ +static struct threshold_attr val = { \ + .attr = {.name = __stringify(val), .mode = 0644 }, \ + .show = show_## val, \ + .store = store_## val, \ +}; + +RW_ATTR(interrupt_enable); +RW_ATTR(threshold_limit); + +static struct attribute *default_attrs[] = { + &threshold_limit.attr, + &error_count.attr, + NULL, /* possibly interrupt_enable if supported, see below */ + NULL, +}; + +#define to_block(k) container_of(k, struct threshold_block, kobj) +#define to_attr(a) container_of(a, struct threshold_attr, attr) + +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct threshold_block *b = to_block(kobj); + struct threshold_attr *a = to_attr(attr); + ssize_t ret; + + ret = a->show ? a->show(b, buf) : -EIO; + + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct threshold_block *b = to_block(kobj); + struct threshold_attr *a = to_attr(attr); + ssize_t ret; + + ret = a->store ? a->store(b, buf, count) : -EIO; + + return ret; +} + +static const struct sysfs_ops threshold_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type threshold_ktype = { + .sysfs_ops = &threshold_ops, + .default_attrs = default_attrs, +}; + +static const char *get_name(unsigned int bank, struct threshold_block *b) +{ + enum smca_bank_types bank_type; + + if (!mce_flags.smca) { + if (b && bank == 4) + return bank4_names(b); + + return th_names[bank]; + } + + bank_type = smca_get_bank_type(bank); + if (bank_type >= N_SMCA_BANK_TYPES) + return NULL; + + if (b && bank_type == SMCA_UMC) { + if (b->block < ARRAY_SIZE(smca_umc_block_names)) + return smca_umc_block_names[b->block]; + return NULL; + } + + if (smca_banks[bank].hwid->count == 1) + return smca_get_name(bank_type); + + snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, + "%s_%x", smca_get_name(bank_type), + smca_banks[bank].sysfs_id); + return buf_mcatype; +} + +static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, + unsigned int block, u32 address) +{ + struct threshold_block *b = NULL; + u32 low, high; + int err; + + if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS)) + return 0; + + if (rdmsr_safe_on_cpu(cpu, address, &low, &high)) + return 0; + + if (!(high & MASK_VALID_HI)) { + if (block) + goto recurse; + else + return 0; + } + + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) + goto recurse; + + b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); + if (!b) + return -ENOMEM; + + b->block = block; + b->bank = bank; + b->cpu = cpu; + b->address = address; + b->interrupt_enable = 0; + b->interrupt_capable = lvt_interrupt_supported(bank, high); + b->threshold_limit = THRESHOLD_MAX; + + if (b->interrupt_capable) { + threshold_ktype.default_attrs[2] = &interrupt_enable.attr; + b->interrupt_enable = 1; + } else { + threshold_ktype.default_attrs[2] = NULL; + } + + INIT_LIST_HEAD(&b->miscj); + + if (per_cpu(threshold_banks, cpu)[bank]->blocks) { + list_add(&b->miscj, + &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); + } else { + per_cpu(threshold_banks, cpu)[bank]->blocks = b; + } + + err = kobject_init_and_add(&b->kobj, &threshold_ktype, + per_cpu(threshold_banks, cpu)[bank]->kobj, + get_name(bank, b)); + if (err) + goto out_free; +recurse: + address = get_block_address(address, low, high, bank, ++block); + if (!address) + return 0; + + err = allocate_threshold_blocks(cpu, bank, block, address); + if (err) + goto out_free; + + if (b) + kobject_uevent(&b->kobj, KOBJ_ADD); + + return err; + +out_free: + if (b) { + kobject_put(&b->kobj); + list_del(&b->miscj); + kfree(b); + } + return err; +} + +static int __threshold_add_blocks(struct threshold_bank *b) +{ + struct list_head *head = &b->blocks->miscj; + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + int err = 0; + + err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name); + if (err) + return err; + + list_for_each_entry_safe(pos, tmp, head, miscj) { + + err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name); + if (err) { + list_for_each_entry_safe_reverse(pos, tmp, head, miscj) + kobject_del(&pos->kobj); + + return err; + } + } + return err; +} + +static int threshold_create_bank(unsigned int cpu, unsigned int bank) +{ + struct device *dev = per_cpu(mce_device, cpu); + struct amd_northbridge *nb = NULL; + struct threshold_bank *b = NULL; + const char *name = get_name(bank, NULL); + int err = 0; + + if (!dev) + return -ENODEV; + + if (is_shared_bank(bank)) { + nb = node_to_amd_nb(amd_get_nb_id(cpu)); + + /* threshold descriptor already initialized on this node? */ + if (nb && nb->bank4) { + /* yes, use it */ + b = nb->bank4; + err = kobject_add(b->kobj, &dev->kobj, name); + if (err) + goto out; + + per_cpu(threshold_banks, cpu)[bank] = b; + refcount_inc(&b->cpus); + + err = __threshold_add_blocks(b); + + goto out; + } + } + + b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); + if (!b) { + err = -ENOMEM; + goto out; + } + + b->kobj = kobject_create_and_add(name, &dev->kobj); + if (!b->kobj) { + err = -EINVAL; + goto out_free; + } + + per_cpu(threshold_banks, cpu)[bank] = b; + + if (is_shared_bank(bank)) { + refcount_set(&b->cpus, 1); + + /* nb is already initialized, see above */ + if (nb) { + WARN_ON(nb->bank4); + nb->bank4 = b; + } + } + + err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank)); + if (!err) + goto out; + + out_free: + kfree(b); + + out: + return err; +} + +static void deallocate_threshold_block(unsigned int cpu, + unsigned int bank) +{ + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank]; + + if (!head) + return; + + list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { + kobject_put(&pos->kobj); + list_del(&pos->miscj); + kfree(pos); + } + + kfree(per_cpu(threshold_banks, cpu)[bank]->blocks); + per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; +} + +static void __threshold_remove_blocks(struct threshold_bank *b) +{ + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + + kobject_del(b->kobj); + + list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj) + kobject_del(&pos->kobj); +} + +static void threshold_remove_bank(unsigned int cpu, int bank) +{ + struct amd_northbridge *nb; + struct threshold_bank *b; + + b = per_cpu(threshold_banks, cpu)[bank]; + if (!b) + return; + + if (!b->blocks) + goto free_out; + + if (is_shared_bank(bank)) { + if (!refcount_dec_and_test(&b->cpus)) { + __threshold_remove_blocks(b); + per_cpu(threshold_banks, cpu)[bank] = NULL; + return; + } else { + /* + * the last CPU on this node using the shared bank is + * going away, remove that bank now. + */ + nb = node_to_amd_nb(amd_get_nb_id(cpu)); + nb->bank4 = NULL; + } + } + + deallocate_threshold_block(cpu, bank); + +free_out: + kobject_del(b->kobj); + kobject_put(b->kobj); + kfree(b); + per_cpu(threshold_banks, cpu)[bank] = NULL; +} + +int mce_threshold_remove_device(unsigned int cpu) +{ + unsigned int bank; + + for (bank = 0; bank < mca_cfg.banks; ++bank) { + if (!(per_cpu(bank_map, cpu) & (1 << bank))) + continue; + threshold_remove_bank(cpu, bank); + } + kfree(per_cpu(threshold_banks, cpu)); + per_cpu(threshold_banks, cpu) = NULL; + return 0; +} + +/* create dir/files for all valid threshold banks */ +int mce_threshold_create_device(unsigned int cpu) +{ + unsigned int bank; + struct threshold_bank **bp; + int err = 0; + + bp = per_cpu(threshold_banks, cpu); + if (bp) + return 0; + + bp = kcalloc(mca_cfg.banks, sizeof(struct threshold_bank *), + GFP_KERNEL); + if (!bp) + return -ENOMEM; + + per_cpu(threshold_banks, cpu) = bp; + + for (bank = 0; bank < mca_cfg.banks; ++bank) { + if (!(per_cpu(bank_map, cpu) & (1 << bank))) + continue; + err = threshold_create_bank(cpu, bank); + if (err) + goto err; + } + return err; +err: + mce_threshold_remove_device(cpu); + return err; +} + +static __init int threshold_init_device(void) +{ + unsigned lcpu = 0; + + /* to hit CPUs online before the notifier is up */ + for_each_online_cpu(lcpu) { + int err = mce_threshold_create_device(lcpu); + + if (err) + return err; + } + + if (thresholding_irq_en) + mce_threshold_vector = amd_threshold_interrupt; + + return 0; +} +/* + * there are 3 funcs which need to be _initcalled in a logic sequence: + * 1. xen_late_init_mcelog + * 2. mcheck_init_device + * 3. threshold_init_device + * + * xen_late_init_mcelog must register xen_mce_chrdev_device before + * native mce_chrdev_device registration if running under xen platform; + * + * mcheck_init_device should be inited before threshold_init_device to + * initialize mce_device, otherwise a NULL ptr dereference will cause panic. + * + * so we use following _initcalls + * 1. device_initcall(xen_late_init_mcelog); + * 2. device_initcall_sync(mcheck_init_device); + * 3. late_initcall(threshold_init_device); + * + * when running under xen, the initcall order is 1,2,3; + * on baremetal, we skip 1 and we do only 2 and 3. + */ +late_initcall(threshold_init_device); diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c @@ -0,0 +1,157 @@ +/* + * Bridge between MCE and APEI + * + * On some machine, corrected memory errors are reported via APEI + * generic hardware error source (GHES) instead of corrected Machine + * Check. These corrected memory errors can be reported to user space + * through /dev/mcelog via faking a corrected Machine Check, so that + * the error memory page can be offlined by /sbin/mcelog if the error + * count for one page is beyond the threshold. + * + * For fatal MCE, save MCE record into persistent storage via ERST, so + * that the MCE record can be logged after reboot via ERST. + * + * Copyright 2010 Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/acpi.h> +#include <linux/cper.h> +#include <acpi/apei.h> +#include <acpi/ghes.h> +#include <asm/mce.h> + +#include "internal.h" + +void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) +{ + struct mce m; + + if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) + return; + + mce_setup(&m); + m.bank = -1; + /* Fake a memory read error with unknown channel */ + m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; + + if (severity >= GHES_SEV_RECOVERABLE) + m.status |= MCI_STATUS_UC; + + if (severity >= GHES_SEV_PANIC) { + m.status |= MCI_STATUS_PCC; + m.tsc = rdtsc(); + } + + m.addr = mem_err->physical_addr; + mce_log(&m); +} +EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); + +#define CPER_CREATOR_MCE \ + UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \ + 0x64, 0x90, 0xb8, 0x9d) +#define CPER_SECTION_TYPE_MCE \ + UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \ + 0x04, 0x4a, 0x38, 0xfc) + +/* + * CPER specification (in UEFI specification 2.3 appendix N) requires + * byte-packed. + */ +struct cper_mce_record { + struct cper_record_header hdr; + struct cper_section_descriptor sec_hdr; + struct mce mce; +} __packed; + +int apei_write_mce(struct mce *m) +{ + struct cper_mce_record rcd; + + memset(&rcd, 0, sizeof(rcd)); + memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE); + rcd.hdr.revision = CPER_RECORD_REV; + rcd.hdr.signature_end = CPER_SIG_END; + rcd.hdr.section_count = 1; + rcd.hdr.error_severity = CPER_SEV_FATAL; + /* timestamp, platform_id, partition_id are all invalid */ + rcd.hdr.validation_bits = 0; + rcd.hdr.record_length = sizeof(rcd); + rcd.hdr.creator_id = CPER_CREATOR_MCE; + rcd.hdr.notification_type = CPER_NOTIFY_MCE; + rcd.hdr.record_id = cper_next_record_id(); + rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR; + + rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd; + rcd.sec_hdr.section_length = sizeof(rcd.mce); + rcd.sec_hdr.revision = CPER_SEC_REV; + /* fru_id and fru_text is invalid */ + rcd.sec_hdr.validation_bits = 0; + rcd.sec_hdr.flags = CPER_SEC_PRIMARY; + rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE; + rcd.sec_hdr.section_severity = CPER_SEV_FATAL; + + memcpy(&rcd.mce, m, sizeof(*m)); + + return erst_write(&rcd.hdr); +} + +ssize_t apei_read_mce(struct mce *m, u64 *record_id) +{ + struct cper_mce_record rcd; + int rc, pos; + + rc = erst_get_record_id_begin(&pos); + if (rc) + return rc; +retry: + rc = erst_get_record_id_next(&pos, record_id); + if (rc) + goto out; + /* no more record */ + if (*record_id == APEI_ERST_INVALID_RECORD_ID) + goto out; + rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd)); + /* someone else has cleared the record, try next one */ + if (rc == -ENOENT) + goto retry; + else if (rc < 0) + goto out; + /* try to skip other type records in storage */ + else if (rc != sizeof(rcd) || + uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) + goto retry; + memcpy(m, &rcd.mce, sizeof(*m)); + rc = sizeof(*m); +out: + erst_get_record_id_end(); + + return rc; +} + +/* Check whether there is record in ERST */ +int apei_check_mce(void) +{ + return erst_get_record_count(); +} + +int apei_clear_mce(u64 record_id) +{ + return erst_clear(record_id); +} diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c @@ -0,0 +1,2497 @@ +/* + * Machine check handler. + * + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen + */ + +#include <linux/thread_info.h> +#include <linux/capability.h> +#include <linux/miscdevice.h> +#include <linux/ratelimit.h> +#include <linux/rcupdate.h> +#include <linux/kobject.h> +#include <linux/uaccess.h> +#include <linux/kdebug.h> +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <linux/string.h> +#include <linux/device.h> +#include <linux/syscore_ops.h> +#include <linux/delay.h> +#include <linux/ctype.h> +#include <linux/sched.h> +#include <linux/sysfs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/kmod.h> +#include <linux/poll.h> +#include <linux/nmi.h> +#include <linux/cpu.h> +#include <linux/ras.h> +#include <linux/smp.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/debugfs.h> +#include <linux/irq_work.h> +#include <linux/export.h> +#include <linux/jump_label.h> +#include <linux/set_memory.h> + +#include <asm/intel-family.h> +#include <asm/processor.h> +#include <asm/traps.h> +#include <asm/tlbflush.h> +#include <asm/mce.h> +#include <asm/msr.h> +#include <asm/reboot.h> + +#include "internal.h" + +static DEFINE_MUTEX(mce_log_mutex); + +/* sysfs synchronization */ +static DEFINE_MUTEX(mce_sysfs_mutex); + +#define CREATE_TRACE_POINTS +#include <trace/events/mce.h> + +#define SPINUNIT 100 /* 100ns */ + +DEFINE_PER_CPU(unsigned, mce_exception_count); + +struct mce_bank *mce_banks __read_mostly; +struct mce_vendor_flags mce_flags __read_mostly; + +struct mca_config mca_cfg __read_mostly = { + .bootlog = -1, + /* + * Tolerant levels: + * 0: always panic on uncorrected errors, log corrected errors + * 1: panic or SIGBUS on uncorrected errors, log corrected errors + * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors + * 3: never panic or SIGBUS, log all errors (for testing only) + */ + .tolerant = 1, + .monarch_timeout = -1 +}; + +static DEFINE_PER_CPU(struct mce, mces_seen); +static unsigned long mce_need_notify; +static int cpu_missing; + +/* + * MCA banks polled by the period polling timer for corrected events. + * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). + */ +DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { + [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL +}; + +/* + * MCA banks controlled through firmware first for corrected errors. + * This is a global list of banks for which we won't enable CMCI and we + * won't poll. Firmware controls these banks and is responsible for + * reporting corrected errors through GHES. Uncorrected/recoverable + * errors are still notified through a machine check. + */ +mce_banks_t mce_banks_ce_disabled; + +static struct work_struct mce_work; +static struct irq_work mce_irq_work; + +static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); + +/* + * CPU/chipset specific EDAC code can register a notifier call here to print + * MCE errors in a human-readable form. + */ +BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); + +/* Do initial initialization of a struct mce */ +void mce_setup(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + m->cpu = m->extcpu = smp_processor_id(); + /* need the internal __ version to avoid deadlocks */ + m->time = __ktime_get_real_seconds(); + m->cpuvendor = boot_cpu_data.x86_vendor; + m->cpuid = cpuid_eax(1); + m->socketid = cpu_data(m->extcpu).phys_proc_id; + m->apicid = cpu_data(m->extcpu).initial_apicid; + rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); + + if (this_cpu_has(X86_FEATURE_INTEL_PPIN)) + rdmsrl(MSR_PPIN, m->ppin); + + m->microcode = boot_cpu_data.microcode; +} + +DEFINE_PER_CPU(struct mce, injectm); +EXPORT_PER_CPU_SYMBOL_GPL(injectm); + +void mce_log(struct mce *m) +{ + if (!mce_gen_pool_add(m)) + irq_work_queue(&mce_irq_work); +} + +void mce_inject_log(struct mce *m) +{ + mutex_lock(&mce_log_mutex); + mce_log(m); + mutex_unlock(&mce_log_mutex); +} +EXPORT_SYMBOL_GPL(mce_inject_log); + +static struct notifier_block mce_srao_nb; + +/* + * We run the default notifier if we have only the SRAO, the first and the + * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS + * notifiers registered on the chain. + */ +#define NUM_DEFAULT_NOTIFIERS 3 +static atomic_t num_notifiers; + +void mce_register_decode_chain(struct notifier_block *nb) +{ + if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC)) + return; + + atomic_inc(&num_notifiers); + + blocking_notifier_chain_register(&x86_mce_decoder_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_register_decode_chain); + +void mce_unregister_decode_chain(struct notifier_block *nb) +{ + atomic_dec(&num_notifiers); + + blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); + +static inline u32 ctl_reg(int bank) +{ + return MSR_IA32_MCx_CTL(bank); +} + +static inline u32 status_reg(int bank) +{ + return MSR_IA32_MCx_STATUS(bank); +} + +static inline u32 addr_reg(int bank) +{ + return MSR_IA32_MCx_ADDR(bank); +} + +static inline u32 misc_reg(int bank) +{ + return MSR_IA32_MCx_MISC(bank); +} + +static inline u32 smca_ctl_reg(int bank) +{ + return MSR_AMD64_SMCA_MCx_CTL(bank); +} + +static inline u32 smca_status_reg(int bank) +{ + return MSR_AMD64_SMCA_MCx_STATUS(bank); +} + +static inline u32 smca_addr_reg(int bank) +{ + return MSR_AMD64_SMCA_MCx_ADDR(bank); +} + +static inline u32 smca_misc_reg(int bank) +{ + return MSR_AMD64_SMCA_MCx_MISC(bank); +} + +struct mca_msr_regs msr_ops = { + .ctl = ctl_reg, + .status = status_reg, + .addr = addr_reg, + .misc = misc_reg +}; + +static void __print_mce(struct mce *m) +{ + pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", + m->extcpu, + (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""), + m->mcgstatus, m->bank, m->status); + + if (m->ip) { + pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", + !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", + m->cs, m->ip); + + if (m->cs == __KERNEL_CS) + pr_cont("{%pS}", (void *)(unsigned long)m->ip); + pr_cont("\n"); + } + + pr_emerg(HW_ERR "TSC %llx ", m->tsc); + if (m->addr) + pr_cont("ADDR %llx ", m->addr); + if (m->misc) + pr_cont("MISC %llx ", m->misc); + + if (mce_flags.smca) { + if (m->synd) + pr_cont("SYND %llx ", m->synd); + if (m->ipid) + pr_cont("IPID %llx ", m->ipid); + } + + pr_cont("\n"); + /* + * Note this output is parsed by external tools and old fields + * should not be changed. + */ + pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, + m->microcode); +} + +static void print_mce(struct mce *m) +{ + __print_mce(m); + + if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON) + pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); +} + +#define PANIC_TIMEOUT 5 /* 5 seconds */ + +static atomic_t mce_panicked; + +static int fake_panic; +static atomic_t mce_fake_panicked; + +/* Panic in progress. Enable interrupts and wait for final IPI */ +static void wait_for_panic(void) +{ + long timeout = PANIC_TIMEOUT*USEC_PER_SEC; + + preempt_disable(); + local_irq_enable(); + while (timeout-- > 0) + udelay(1); + if (panic_timeout == 0) + panic_timeout = mca_cfg.panic_timeout; + panic("Panicing machine check CPU died"); +} + +static void mce_panic(const char *msg, struct mce *final, char *exp) +{ + int apei_err = 0; + struct llist_node *pending; + struct mce_evt_llist *l; + + if (!fake_panic) { + /* + * Make sure only one CPU runs in machine check panic + */ + if (atomic_inc_return(&mce_panicked) > 1) + wait_for_panic(); + barrier(); + + bust_spinlocks(1); + console_verbose(); + } else { + /* Don't log too much for fake panic */ + if (atomic_inc_return(&mce_fake_panicked) > 1) + return; + } + pending = mce_gen_pool_prepare_records(); + /* First print corrected ones that are still unlogged */ + llist_for_each_entry(l, pending, llnode) { + struct mce *m = &l->mce; + if (!(m->status & MCI_STATUS_UC)) { + print_mce(m); + if (!apei_err) + apei_err = apei_write_mce(m); + } + } + /* Now print uncorrected but with the final one last */ + llist_for_each_entry(l, pending, llnode) { + struct mce *m = &l->mce; + if (!(m->status & MCI_STATUS_UC)) + continue; + if (!final || mce_cmp(m, final)) { + print_mce(m); + if (!apei_err) + apei_err = apei_write_mce(m); + } + } + if (final) { + print_mce(final); + if (!apei_err) + apei_err = apei_write_mce(final); + } + if (cpu_missing) + pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); + if (exp) + pr_emerg(HW_ERR "Machine check: %s\n", exp); + if (!fake_panic) { + if (panic_timeout == 0) + panic_timeout = mca_cfg.panic_timeout; + panic(msg); + } else + pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); +} + +/* Support code for software error injection */ + +static int msr_to_offset(u32 msr) +{ + unsigned bank = __this_cpu_read(injectm.bank); + + if (msr == mca_cfg.rip_msr) + return offsetof(struct mce, ip); + if (msr == msr_ops.status(bank)) + return offsetof(struct mce, status); + if (msr == msr_ops.addr(bank)) + return offsetof(struct mce, addr); + if (msr == msr_ops.misc(bank)) + return offsetof(struct mce, misc); + if (msr == MSR_IA32_MCG_STATUS) + return offsetof(struct mce, mcgstatus); + return -1; +} + +/* MSR access wrappers used for error injection */ +static u64 mce_rdmsrl(u32 msr) +{ + u64 v; + + if (__this_cpu_read(injectm.finished)) { + int offset = msr_to_offset(msr); + + if (offset < 0) + return 0; + return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); + } + + if (rdmsrl_safe(msr, &v)) { + WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr); + /* + * Return zero in case the access faulted. This should + * not happen normally but can happen if the CPU does + * something weird, or if the code is buggy. + */ + v = 0; + } + + return v; +} + +static void mce_wrmsrl(u32 msr, u64 v) +{ + if (__this_cpu_read(injectm.finished)) { + int offset = msr_to_offset(msr); + + if (offset >= 0) + *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v; + return; + } + wrmsrl(msr, v); +} + +/* + * Collect all global (w.r.t. this processor) status about this machine + * check into our "mce" struct so that we can use it later to assess + * the severity of the problem as we read per-bank specific details. + */ +static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) +{ + mce_setup(m); + + m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + if (regs) { + /* + * Get the address of the instruction at the time of + * the machine check error. + */ + if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { + m->ip = regs->ip; + m->cs = regs->cs; + + /* + * When in VM86 mode make the cs look like ring 3 + * always. This is a lie, but it's better than passing + * the additional vm86 bit around everywhere. + */ + if (v8086_mode(regs)) + m->cs |= 3; + } + /* Use accurate RIP reporting if available. */ + if (mca_cfg.rip_msr) + m->ip = mce_rdmsrl(mca_cfg.rip_msr); + } +} + +int mce_available(struct cpuinfo_x86 *c) +{ + if (mca_cfg.disabled) + return 0; + return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); +} + +static void mce_schedule_work(void) +{ + if (!mce_gen_pool_empty()) + schedule_work(&mce_work); +} + +static void mce_irq_work_cb(struct irq_work *entry) +{ + mce_schedule_work(); +} + +static void mce_report_event(struct pt_regs *regs) +{ + if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { + mce_notify_irq(); + /* + * Triggering the work queue here is just an insurance + * policy in case the syscall exit notify handler + * doesn't run soon enough or ends up running on the + * wrong CPU (can happen when audit sleeps) + */ + mce_schedule_work(); + return; + } + + irq_work_queue(&mce_irq_work); +} + +/* + * Check if the address reported by the CPU is in a format we can parse. + * It would be possible to add code for most other cases, but all would + * be somewhat complicated (e.g. segment offset would require an instruction + * parser). So only support physical addresses up to page granuality for now. + */ +int mce_usable_address(struct mce *m) +{ + if (!(m->status & MCI_STATUS_ADDRV)) + return 0; + + /* Checks after this one are Intel-specific: */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 1; + + if (!(m->status & MCI_STATUS_MISCV)) + return 0; + + if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) + return 0; + + if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) + return 0; + + return 1; +} +EXPORT_SYMBOL_GPL(mce_usable_address); + +bool mce_is_memory_error(struct mce *m) +{ + if (m->cpuvendor == X86_VENDOR_AMD || + m->cpuvendor == X86_VENDOR_HYGON) { + return amd_mce_is_memory_error(m); + } else if (m->cpuvendor == X86_VENDOR_INTEL) { + /* + * Intel SDM Volume 3B - 15.9.2 Compound Error Codes + * + * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for + * indicating a memory error. Bit 8 is used for indicating a + * cache hierarchy error. The combination of bit 2 and bit 3 + * is used for indicating a `generic' cache hierarchy error + * But we can't just blindly check the above bits, because if + * bit 11 is set, then it is a bus/interconnect error - and + * either way the above bits just gives more detail on what + * bus/interconnect error happened. Note that bit 12 can be + * ignored, as it's the "filter" bit. + */ + return (m->status & 0xef80) == BIT(7) || + (m->status & 0xef00) == BIT(8) || + (m->status & 0xeffc) == 0xc; + } + + return false; +} +EXPORT_SYMBOL_GPL(mce_is_memory_error); + +bool mce_is_correctable(struct mce *m) +{ + if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED) + return false; + + if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED) + return false; + + if (m->status & MCI_STATUS_UC) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(mce_is_correctable); + +static bool cec_add_mce(struct mce *m) +{ + if (!m) + return false; + + /* We eat only correctable DRAM errors with usable addresses. */ + if (mce_is_memory_error(m) && + mce_is_correctable(m) && + mce_usable_address(m)) + if (!cec_add_elem(m->addr >> PAGE_SHIFT)) + return true; + + return false; +} + +static int mce_first_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + + if (!m) + return NOTIFY_DONE; + + if (cec_add_mce(m)) + return NOTIFY_STOP; + + /* Emit the trace record: */ + trace_mce_record(m); + + set_bit(0, &mce_need_notify); + + mce_notify_irq(); + + return NOTIFY_DONE; +} + +static struct notifier_block first_nb = { + .notifier_call = mce_first_notifier, + .priority = MCE_PRIO_FIRST, +}; + +static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *mce = (struct mce *)data; + unsigned long pfn; + + if (!mce) + return NOTIFY_DONE; + + if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) { + pfn = mce->addr >> PAGE_SHIFT; + if (!memory_failure(pfn, 0)) + set_mce_nospec(pfn); + } + + return NOTIFY_OK; +} +static struct notifier_block mce_srao_nb = { + .notifier_call = srao_decode_notifier, + .priority = MCE_PRIO_SRAO, +}; + +static int mce_default_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + + if (!m) + return NOTIFY_DONE; + + if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS) + return NOTIFY_DONE; + + __print_mce(m); + + return NOTIFY_DONE; +} + +static struct notifier_block mce_default_nb = { + .notifier_call = mce_default_notifier, + /* lowest prio, we want it to run last. */ + .priority = MCE_PRIO_LOWEST, +}; + +/* + * Read ADDR and MISC registers. + */ +static void mce_read_aux(struct mce *m, int i) +{ + if (m->status & MCI_STATUS_MISCV) + m->misc = mce_rdmsrl(msr_ops.misc(i)); + + if (m->status & MCI_STATUS_ADDRV) { + m->addr = mce_rdmsrl(msr_ops.addr(i)); + + /* + * Mask the reported address by the reported granularity. + */ + if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) { + u8 shift = MCI_MISC_ADDR_LSB(m->misc); + m->addr >>= shift; + m->addr <<= shift; + } + + /* + * Extract [55:<lsb>] where lsb is the least significant + * *valid* bit of the address bits. + */ + if (mce_flags.smca) { + u8 lsb = (m->addr >> 56) & 0x3f; + + m->addr &= GENMASK_ULL(55, lsb); + } + } + + if (mce_flags.smca) { + m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); + + if (m->status & MCI_STATUS_SYNDV) + m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); + } +} + +DEFINE_PER_CPU(unsigned, mce_poll_count); + +/* + * Poll for corrected events or events that happened before reset. + * Those are just logged through /dev/mcelog. + * + * This is executed in standard interrupt context. + * + * Note: spec recommends to panic for fatal unsignalled + * errors here. However this would be quite problematic -- + * we would need to reimplement the Monarch handling and + * it would mess up the exclusion between exception handler + * and poll hander -- * so we skip this for now. + * These cases should not happen anyways, or only when the CPU + * is already totally * confused. In this case it's likely it will + * not fully execute the machine check handler either. + */ +bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) +{ + bool error_seen = false; + struct mce m; + int i; + + this_cpu_inc(mce_poll_count); + + mce_gather_info(&m, NULL); + + if (flags & MCP_TIMESTAMP) + m.tsc = rdtsc(); + + for (i = 0; i < mca_cfg.banks; i++) { + if (!mce_banks[i].ctl || !test_bit(i, *b)) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + + barrier(); + m.status = mce_rdmsrl(msr_ops.status(i)); + if (!(m.status & MCI_STATUS_VAL)) + continue; + + /* + * Uncorrected or signalled events are handled by the exception + * handler when it is enabled, so don't process those here. + * + * TBD do the same check for MCI_STATUS_EN here? + */ + if (!(flags & MCP_UC) && + (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC))) + continue; + + error_seen = true; + + mce_read_aux(&m, i); + + m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); + + /* + * Don't get the IP here because it's unlikely to + * have anything to do with the actual error location. + */ + if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) + mce_log(&m); + else if (mce_usable_address(&m)) { + /* + * Although we skipped logging this, we still want + * to take action. Add to the pool so the registered + * notifiers will see it. + */ + if (!mce_gen_pool_add(&m)) + mce_schedule_work(); + } + + /* + * Clear state for this bank. + */ + mce_wrmsrl(msr_ops.status(i), 0); + } + + /* + * Don't clear MCG_STATUS here because it's only defined for + * exceptions. + */ + + sync_core(); + + return error_seen; +} +EXPORT_SYMBOL_GPL(machine_check_poll); + +/* + * Do a quick check if any of the events requires a panic. + * This decides if we keep the events around or clear them. + */ +static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, + struct pt_regs *regs) +{ + char *tmp; + int i; + + for (i = 0; i < mca_cfg.banks; i++) { + m->status = mce_rdmsrl(msr_ops.status(i)); + if (!(m->status & MCI_STATUS_VAL)) + continue; + + __set_bit(i, validp); + if (quirk_no_way_out) + quirk_no_way_out(i, m, regs); + + if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { + mce_read_aux(m, i); + *msg = tmp; + return 1; + } + } + return 0; +} + +/* + * Variable to establish order between CPUs while scanning. + * Each CPU spins initially until executing is equal its number. + */ +static atomic_t mce_executing; + +/* + * Defines order of CPUs on entry. First CPU becomes Monarch. + */ +static atomic_t mce_callin; + +/* + * Check if a timeout waiting for other CPUs happened. + */ +static int mce_timed_out(u64 *t, const char *msg) +{ + /* + * The others already did panic for some reason. + * Bail out like in a timeout. + * rmb() to tell the compiler that system_state + * might have been modified by someone else. + */ + rmb(); + if (atomic_read(&mce_panicked)) + wait_for_panic(); + if (!mca_cfg.monarch_timeout) + goto out; + if ((s64)*t < SPINUNIT) { + if (mca_cfg.tolerant <= 1) + mce_panic(msg, NULL, NULL); + cpu_missing = 1; + return 1; + } + *t -= SPINUNIT; +out: + touch_nmi_watchdog(); + return 0; +} + +/* + * The Monarch's reign. The Monarch is the CPU who entered + * the machine check handler first. It waits for the others to + * raise the exception too and then grades them. When any + * error is fatal panic. Only then let the others continue. + * + * The other CPUs entering the MCE handler will be controlled by the + * Monarch. They are called Subjects. + * + * This way we prevent any potential data corruption in a unrecoverable case + * and also makes sure always all CPU's errors are examined. + * + * Also this detects the case of a machine check event coming from outer + * space (not detected by any CPUs) In this case some external agent wants + * us to shut down, so panic too. + * + * The other CPUs might still decide to panic if the handler happens + * in a unrecoverable place, but in this case the system is in a semi-stable + * state and won't corrupt anything by itself. It's ok to let the others + * continue for a bit first. + * + * All the spin loops have timeouts; when a timeout happens a CPU + * typically elects itself to be Monarch. + */ +static void mce_reign(void) +{ + int cpu; + struct mce *m = NULL; + int global_worst = 0; + char *msg = NULL; + char *nmsg = NULL; + + /* + * This CPU is the Monarch and the other CPUs have run + * through their handlers. + * Grade the severity of the errors of all the CPUs. + */ + for_each_possible_cpu(cpu) { + int severity = mce_severity(&per_cpu(mces_seen, cpu), + mca_cfg.tolerant, + &nmsg, true); + if (severity > global_worst) { + msg = nmsg; + global_worst = severity; + m = &per_cpu(mces_seen, cpu); + } + } + + /* + * Cannot recover? Panic here then. + * This dumps all the mces in the log buffer and stops the + * other CPUs. + */ + if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) + mce_panic("Fatal machine check", m, msg); + + /* + * For UC somewhere we let the CPU who detects it handle it. + * Also must let continue the others, otherwise the handling + * CPU could deadlock on a lock. + */ + + /* + * No machine check event found. Must be some external + * source or one CPU is hung. Panic. + */ + if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3) + mce_panic("Fatal machine check from unknown source", NULL, NULL); + + /* + * Now clear all the mces_seen so that they don't reappear on + * the next mce. + */ + for_each_possible_cpu(cpu) + memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); +} + +static atomic_t global_nwo; + +/* + * Start of Monarch synchronization. This waits until all CPUs have + * entered the exception handler and then determines if any of them + * saw a fatal event that requires panic. Then it executes them + * in the entry order. + * TBD double check parallel CPU hotunplug + */ +static int mce_start(int *no_way_out) +{ + int order; + int cpus = num_online_cpus(); + u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; + + if (!timeout) + return -1; + + atomic_add(*no_way_out, &global_nwo); + /* + * Rely on the implied barrier below, such that global_nwo + * is updated before mce_callin. + */ + order = atomic_inc_return(&mce_callin); + + /* + * Wait for everyone. + */ + while (atomic_read(&mce_callin) != cpus) { + if (mce_timed_out(&timeout, + "Timeout: Not all CPUs entered broadcast exception handler")) { + atomic_set(&global_nwo, 0); + return -1; + } + ndelay(SPINUNIT); + } + + /* + * mce_callin should be read before global_nwo + */ + smp_rmb(); + + if (order == 1) { + /* + * Monarch: Starts executing now, the others wait. + */ + atomic_set(&mce_executing, 1); + } else { + /* + * Subject: Now start the scanning loop one by one in + * the original callin order. + * This way when there are any shared banks it will be + * only seen by one CPU before cleared, avoiding duplicates. + */ + while (atomic_read(&mce_executing) < order) { + if (mce_timed_out(&timeout, + "Timeout: Subject CPUs unable to finish machine check processing")) { + atomic_set(&global_nwo, 0); + return -1; + } + ndelay(SPINUNIT); + } + } + + /* + * Cache the global no_way_out state. + */ + *no_way_out = atomic_read(&global_nwo); + + return order; +} + +/* + * Synchronize between CPUs after main scanning loop. + * This invokes the bulk of the Monarch processing. + */ +static int mce_end(int order) +{ + int ret = -1; + u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; + + if (!timeout) + goto reset; + if (order < 0) + goto reset; + + /* + * Allow others to run. + */ + atomic_inc(&mce_executing); + + if (order == 1) { + /* CHECKME: Can this race with a parallel hotplug? */ + int cpus = num_online_cpus(); + + /* + * Monarch: Wait for everyone to go through their scanning + * loops. + */ + while (atomic_read(&mce_executing) <= cpus) { + if (mce_timed_out(&timeout, + "Timeout: Monarch CPU unable to finish machine check processing")) + goto reset; + ndelay(SPINUNIT); + } + + mce_reign(); + barrier(); + ret = 0; + } else { + /* + * Subject: Wait for Monarch to finish. + */ + while (atomic_read(&mce_executing) != 0) { + if (mce_timed_out(&timeout, + "Timeout: Monarch CPU did not finish machine check processing")) + goto reset; + ndelay(SPINUNIT); + } + + /* + * Don't reset anything. That's done by the Monarch. + */ + return 0; + } + + /* + * Reset all global state. + */ +reset: + atomic_set(&global_nwo, 0); + atomic_set(&mce_callin, 0); + barrier(); + + /* + * Let others run again. + */ + atomic_set(&mce_executing, 0); + return ret; +} + +static void mce_clear_state(unsigned long *toclear) +{ + int i; + + for (i = 0; i < mca_cfg.banks; i++) { + if (test_bit(i, toclear)) + mce_wrmsrl(msr_ops.status(i), 0); + } +} + +static int do_memory_failure(struct mce *m) +{ + int flags = MF_ACTION_REQUIRED; + int ret; + + pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); + if (!(m->mcgstatus & MCG_STATUS_RIPV)) + flags |= MF_MUST_KILL; + ret = memory_failure(m->addr >> PAGE_SHIFT, flags); + if (ret) + pr_err("Memory error not recovered"); + else + set_mce_nospec(m->addr >> PAGE_SHIFT); + return ret; +} + + +/* + * Cases where we avoid rendezvous handler timeout: + * 1) If this CPU is offline. + * + * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to + * skip those CPUs which remain looping in the 1st kernel - see + * crash_nmi_callback(). + * + * Note: there still is a small window between kexec-ing and the new, + * kdump kernel establishing a new #MC handler where a broadcasted MCE + * might not get handled properly. + */ +static bool __mc_check_crashing_cpu(int cpu) +{ + if (cpu_is_offline(cpu) || + (crashing_cpu != -1 && crashing_cpu != cpu)) { + u64 mcgstatus; + + mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + if (mcgstatus & MCG_STATUS_RIPV) { + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + return true; + } + } + return false; +} + +static void __mc_scan_banks(struct mce *m, struct mce *final, + unsigned long *toclear, unsigned long *valid_banks, + int no_way_out, int *worst) +{ + struct mca_config *cfg = &mca_cfg; + int severity, i; + + for (i = 0; i < cfg->banks; i++) { + __clear_bit(i, toclear); + if (!test_bit(i, valid_banks)) + continue; + + if (!mce_banks[i].ctl) + continue; + + m->misc = 0; + m->addr = 0; + m->bank = i; + + m->status = mce_rdmsrl(msr_ops.status(i)); + if (!(m->status & MCI_STATUS_VAL)) + continue; + + /* + * Corrected or non-signaled errors are handled by + * machine_check_poll(). Leave them alone, unless this panics. + */ + if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && + !no_way_out) + continue; + + /* Set taint even when machine check was not enabled. */ + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + + severity = mce_severity(m, cfg->tolerant, NULL, true); + + /* + * When machine check was for corrected/deferred handler don't + * touch, unless we're panicking. + */ + if ((severity == MCE_KEEP_SEVERITY || + severity == MCE_UCNA_SEVERITY) && !no_way_out) + continue; + + __set_bit(i, toclear); + + /* Machine check event was not enabled. Clear, but ignore. */ + if (severity == MCE_NO_SEVERITY) + continue; + + mce_read_aux(m, i); + + /* assuming valid severity level != 0 */ + m->severity = severity; + + mce_log(m); + + if (severity > *worst) { + *final = *m; + *worst = severity; + } + } + + /* mce_clear_state will clear *final, save locally for use later */ + *m = *final; +} + +/* + * The actual machine check handler. This only handles real + * exceptions when something got corrupted coming in through int 18. + * + * This is executed in NMI context not subject to normal locking rules. This + * implies that most kernel services cannot be safely used. Don't even + * think about putting a printk in there! + * + * On Intel systems this is entered on all CPUs in parallel through + * MCE broadcast. However some CPUs might be broken beyond repair, + * so be always careful when synchronizing with others. + */ +void do_machine_check(struct pt_regs *regs, long error_code) +{ + DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); + DECLARE_BITMAP(toclear, MAX_NR_BANKS); + struct mca_config *cfg = &mca_cfg; + int cpu = smp_processor_id(); + char *msg = "Unknown"; + struct mce m, *final; + int worst = 0; + + /* + * Establish sequential order between the CPUs entering the machine + * check handler. + */ + int order = -1; + + /* + * If no_way_out gets set, there is no safe way to recover from this + * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. + */ + int no_way_out = 0; + + /* + * If kill_it gets set, there might be a way to recover from this + * error. + */ + int kill_it = 0; + + /* + * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES + * on Intel. + */ + int lmce = 1; + + if (__mc_check_crashing_cpu(cpu)) + return; + + ist_enter(regs); + + this_cpu_inc(mce_exception_count); + + mce_gather_info(&m, regs); + m.tsc = rdtsc(); + + final = this_cpu_ptr(&mces_seen); + *final = m; + + memset(valid_banks, 0, sizeof(valid_banks)); + no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); + + barrier(); + + /* + * When no restart IP might need to kill or panic. + * Assume the worst for now, but if we find the + * severity is MCE_AR_SEVERITY we have other options. + */ + if (!(m.mcgstatus & MCG_STATUS_RIPV)) + kill_it = 1; + + /* + * Check if this MCE is signaled to only this logical processor, + * on Intel only. + */ + if (m.cpuvendor == X86_VENDOR_INTEL) + lmce = m.mcgstatus & MCG_STATUS_LMCES; + + /* + * Local machine check may already know that we have to panic. + * Broadcast machine check begins rendezvous in mce_start() + * Go through all banks in exclusion of the other CPUs. This way we + * don't report duplicated events on shared banks because the first one + * to see it will clear it. + */ + if (lmce) { + if (no_way_out) + mce_panic("Fatal local machine check", &m, msg); + } else { + order = mce_start(&no_way_out); + } + + __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst); + + if (!no_way_out) + mce_clear_state(toclear); + + /* + * Do most of the synchronization with other CPUs. + * When there's any problem use only local no_way_out state. + */ + if (!lmce) { + if (mce_end(order) < 0) + no_way_out = worst >= MCE_PANIC_SEVERITY; + } else { + /* + * If there was a fatal machine check we should have + * already called mce_panic earlier in this function. + * Since we re-read the banks, we might have found + * something new. Check again to see if we found a + * fatal error. We call "mce_severity()" again to + * make sure we have the right "msg". + */ + if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { + mce_severity(&m, cfg->tolerant, &msg, true); + mce_panic("Local fatal machine check!", &m, msg); + } + } + + /* + * If tolerant is at an insane level we drop requests to kill + * processes and continue even when there is no way out. + */ + if (cfg->tolerant == 3) + kill_it = 0; + else if (no_way_out) + mce_panic("Fatal machine check on current CPU", &m, msg); + + if (worst > 0) + mce_report_event(regs); + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + + sync_core(); + + if (worst != MCE_AR_SEVERITY && !kill_it) + goto out_ist; + + /* Fault was in user mode and we need to take some action */ + if ((m.cs & 3) == 3) { + ist_begin_non_atomic(regs); + local_irq_enable(); + + if (kill_it || do_memory_failure(&m)) + force_sig(SIGBUS, current); + local_irq_disable(); + ist_end_non_atomic(); + } else { + if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) + mce_panic("Failed kernel mode recovery", &m, NULL); + } + +out_ist: + ist_exit(regs); +} +EXPORT_SYMBOL_GPL(do_machine_check); + +#ifndef CONFIG_MEMORY_FAILURE +int memory_failure(unsigned long pfn, int flags) +{ + /* mce_severity() should not hand us an ACTION_REQUIRED error */ + BUG_ON(flags & MF_ACTION_REQUIRED); + pr_err("Uncorrected memory error in page 0x%lx ignored\n" + "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", + pfn); + + return 0; +} +#endif + +/* + * Periodic polling timer for "silent" machine check errors. If the + * poller finds an MCE, poll 2x faster. When the poller finds no more + * errors, poll 2x slower (up to check_interval seconds). + */ +static unsigned long check_interval = INITIAL_CHECK_INTERVAL; + +static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ +static DEFINE_PER_CPU(struct timer_list, mce_timer); + +static unsigned long mce_adjust_timer_default(unsigned long interval) +{ + return interval; +} + +static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; + +static void __start_timer(struct timer_list *t, unsigned long interval) +{ + unsigned long when = jiffies + interval; + unsigned long flags; + + local_irq_save(flags); + + if (!timer_pending(t) || time_before(when, t->expires)) + mod_timer(t, round_jiffies(when)); + + local_irq_restore(flags); +} + +static void mce_timer_fn(struct timer_list *t) +{ + struct timer_list *cpu_t = this_cpu_ptr(&mce_timer); + unsigned long iv; + + WARN_ON(cpu_t != t); + + iv = __this_cpu_read(mce_next_interval); + + if (mce_available(this_cpu_ptr(&cpu_info))) { + machine_check_poll(0, this_cpu_ptr(&mce_poll_banks)); + + if (mce_intel_cmci_poll()) { + iv = mce_adjust_timer(iv); + goto done; + } + } + + /* + * Alert userspace if needed. If we logged an MCE, reduce the polling + * interval, otherwise increase the polling interval. + */ + if (mce_notify_irq()) + iv = max(iv / 2, (unsigned long) HZ/100); + else + iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); + +done: + __this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); +} + +/* + * Ensure that the timer is firing in @interval from now. + */ +void mce_timer_kick(unsigned long interval) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + unsigned long iv = __this_cpu_read(mce_next_interval); + + __start_timer(t, interval); + + if (interval < iv) + __this_cpu_write(mce_next_interval, interval); +} + +/* Must not be called in IRQ context where del_timer_sync() can deadlock */ +static void mce_timer_delete_all(void) +{ + int cpu; + + for_each_online_cpu(cpu) + del_timer_sync(&per_cpu(mce_timer, cpu)); +} + +/* + * Notify the user(s) about new machine check events. + * Can be called from interrupt context, but not from machine check/NMI + * context. + */ +int mce_notify_irq(void) +{ + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + + if (test_and_clear_bit(0, &mce_need_notify)) { + mce_work_trigger(); + + if (__ratelimit(&ratelimit)) + pr_info(HW_ERR "Machine check events logged\n"); + + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(mce_notify_irq); + +static int __mcheck_cpu_mce_banks_init(void) +{ + int i; + u8 num_banks = mca_cfg.banks; + + mce_banks = kcalloc(num_banks, sizeof(struct mce_bank), GFP_KERNEL); + if (!mce_banks) + return -ENOMEM; + + for (i = 0; i < num_banks; i++) { + struct mce_bank *b = &mce_banks[i]; + + b->ctl = -1ULL; + b->init = 1; + } + return 0; +} + +/* + * Initialize Machine Checks for a CPU. + */ +static int __mcheck_cpu_cap_init(void) +{ + unsigned b; + u64 cap; + + rdmsrl(MSR_IA32_MCG_CAP, cap); + + b = cap & MCG_BANKCNT_MASK; + if (!mca_cfg.banks) + pr_info("CPU supports %d MCE banks\n", b); + + if (b > MAX_NR_BANKS) { + pr_warn("Using only %u machine check banks out of %u\n", + MAX_NR_BANKS, b); + b = MAX_NR_BANKS; + } + + /* Don't support asymmetric configurations today */ + WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks); + mca_cfg.banks = b; + + if (!mce_banks) { + int err = __mcheck_cpu_mce_banks_init(); + + if (err) + return err; + } + + /* Use accurate RIP reporting if available. */ + if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) + mca_cfg.rip_msr = MSR_IA32_MCG_EIP; + + if (cap & MCG_SER_P) + mca_cfg.ser = 1; + + return 0; +} + +static void __mcheck_cpu_init_generic(void) +{ + enum mcp_flags m_fl = 0; + mce_banks_t all_banks; + u64 cap; + + if (!mca_cfg.bootlog) + m_fl = MCP_DONTLOG; + + /* + * Log the machine checks left over from the previous reset. + */ + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC | m_fl, &all_banks); + + cr4_set_bits(X86_CR4_MCE); + + rdmsrl(MSR_IA32_MCG_CAP, cap); + if (cap & MCG_CTL_P) + wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); +} + +static void __mcheck_cpu_init_clear_banks(void) +{ + int i; + + for (i = 0; i < mca_cfg.banks; i++) { + struct mce_bank *b = &mce_banks[i]; + + if (!b->init) + continue; + wrmsrl(msr_ops.ctl(i), b->ctl); + wrmsrl(msr_ops.status(i), 0); + } +} + +/* + * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and + * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM + * Vol 3B Table 15-20). But this confuses both the code that determines + * whether the machine check occurred in kernel or user mode, and also + * the severity assessment code. Pretend that EIPV was set, and take the + * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. + */ +static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) +{ + if (bank != 0) + return; + if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) + return; + if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| + MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| + MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| + MCACOD)) != + (MCI_STATUS_UC|MCI_STATUS_EN| + MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| + MCI_STATUS_AR|MCACOD_INSTR)) + return; + + m->mcgstatus |= MCG_STATUS_EIPV; + m->ip = regs->ip; + m->cs = regs->cs; +} + +/* Add per CPU specific workarounds here */ +static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +{ + struct mca_config *cfg = &mca_cfg; + + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { + pr_info("unknown CPU type - not enabling MCE support\n"); + return -EOPNOTSUPP; + } + + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86_vendor == X86_VENDOR_AMD) { + if (c->x86 == 15 && cfg->banks > 4) { + /* + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: + */ + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); + } + if (c->x86 < 0x11 && cfg->bootlog < 0) { + /* + * Lots of broken BIOS around that don't clear them + * by default and leave crap in there. Don't log: + */ + cfg->bootlog = 0; + } + /* + * Various K7s with broken bank 0 around. Always disable + * by default. + */ + if (c->x86 == 6 && cfg->banks > 0) + mce_banks[0].ctl = 0; + + /* + * overflow_recov is supported for F15h Models 00h-0fh + * even though we don't have a CPUID bit for it. + */ + if (c->x86 == 0x15 && c->x86_model <= 0xf) + mce_flags.overflow_recov = 1; + + /* + * Turn off MC4_MISC thresholding banks on those models since + * they're not supported there. + */ + if (c->x86 == 0x15 && + (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { + int i; + u64 hwcr; + bool need_toggle; + u32 msrs[] = { + 0x00000413, /* MC4_MISC0 */ + 0xc0000408, /* MC4_MISC1 */ + }; + + rdmsrl(MSR_K7_HWCR, hwcr); + + /* McStatusWrEn has to be set */ + need_toggle = !(hwcr & BIT(18)); + + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + + /* Clear CntP bit safely */ + for (i = 0; i < ARRAY_SIZE(msrs); i++) + msr_clear_bit(msrs[i], 62); + + /* restore old settings */ + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr); + } + } + + if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * SDM documents that on family 6 bank 0 should not be written + * because it aliases to another special BIOS controlled + * register. + * But it's not aliased anymore on model 0x1a+ + * Don't ignore bank 0 completely because there could be a + * valid event later, merely don't write CTL0. + */ + + if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0) + mce_banks[0].init = 0; + + /* + * All newer Intel systems support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && + cfg->monarch_timeout < 0) + cfg->monarch_timeout = USEC_PER_SEC; + + /* + * There are also broken BIOSes on some Pentium M and + * earlier systems: + */ + if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) + cfg->bootlog = 0; + + if (c->x86 == 6 && c->x86_model == 45) + quirk_no_way_out = quirk_sandybridge_ifu; + } + if (cfg->monarch_timeout < 0) + cfg->monarch_timeout = 0; + if (cfg->bootlog != 0) + cfg->panic_timeout = 30; + + return 0; +} + +static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) +{ + if (c->x86 != 5) + return 0; + + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + intel_p5_mcheck_init(c); + return 1; + break; + case X86_VENDOR_CENTAUR: + winchip_mcheck_init(c); + return 1; + break; + default: + return 0; + } + + return 0; +} + +/* + * Init basic CPU features needed for early decoding of MCEs. + */ +static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) +{ + if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { + mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); + mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); + mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); + + if (mce_flags.smca) { + msr_ops.ctl = smca_ctl_reg; + msr_ops.status = smca_status_reg; + msr_ops.addr = smca_addr_reg; + msr_ops.misc = smca_misc_reg; + } + } +} + +static void mce_centaur_feature_init(struct cpuinfo_x86 *c) +{ + struct mca_config *cfg = &mca_cfg; + + /* + * All newer Centaur CPUs support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) || + c->x86 > 6) { + if (cfg->monarch_timeout < 0) + cfg->monarch_timeout = USEC_PER_SEC; + } +} + +static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_init(c); + mce_adjust_timer = cmci_intel_adjust_timer; + break; + + case X86_VENDOR_AMD: { + mce_amd_feature_init(c); + break; + } + + case X86_VENDOR_HYGON: + mce_hygon_feature_init(c); + break; + + case X86_VENDOR_CENTAUR: + mce_centaur_feature_init(c); + break; + + default: + break; + } +} + +static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_clear(c); + break; + default: + break; + } +} + +static void mce_start_timer(struct timer_list *t) +{ + unsigned long iv = check_interval * HZ; + + if (mca_cfg.ignore_ce || !iv) + return; + + this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); +} + +static void __mcheck_cpu_setup_timer(void) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + + timer_setup(t, mce_timer_fn, TIMER_PINNED); +} + +static void __mcheck_cpu_init_timer(void) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + + timer_setup(t, mce_timer_fn, TIMER_PINNED); + mce_start_timer(t); +} + +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ + pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", + smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; + +dotraplinkage void do_mce(struct pt_regs *regs, long error_code) +{ + machine_check_vector(regs, error_code); +} + +/* + * Called for each booted CPU to set up machine checks. + * Must be called with preempt off: + */ +void mcheck_cpu_init(struct cpuinfo_x86 *c) +{ + if (mca_cfg.disabled) + return; + + if (__mcheck_cpu_ancient_init(c)) + return; + + if (!mce_available(c)) + return; + + if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { + mca_cfg.disabled = 1; + return; + } + + if (mce_gen_pool_init()) { + mca_cfg.disabled = 1; + pr_emerg("Couldn't allocate MCE records pool!\n"); + return; + } + + machine_check_vector = do_machine_check; + + __mcheck_cpu_init_early(c); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(c); + __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_setup_timer(); +} + +/* + * Called for each booted CPU to clear some machine checks opt-ins + */ +void mcheck_cpu_clear(struct cpuinfo_x86 *c) +{ + if (mca_cfg.disabled) + return; + + if (!mce_available(c)) + return; + + /* + * Possibly to clear general settings generic to x86 + * __mcheck_cpu_clear_generic(c); + */ + __mcheck_cpu_clear_vendor(c); + +} + +static void __mce_disable_bank(void *arg) +{ + int bank = *((int *)arg); + __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); + cmci_disable_bank(bank); +} + +void mce_disable_bank(int bank) +{ + if (bank >= mca_cfg.banks) { + pr_warn(FW_BUG + "Ignoring request to disable invalid MCA bank %d.\n", + bank); + return; + } + set_bit(bank, mce_banks_ce_disabled); + on_each_cpu(__mce_disable_bank, &bank, 1); +} + +/* + * mce=off Disables machine check + * mce=no_cmci Disables CMCI + * mce=no_lmce Disables LMCE + * mce=dont_log_ce Clears corrected events silently, no log created for CEs. + * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. + * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) + * monarchtimeout is how long to wait for other CPUs on machine + * check, or 0 to not wait + * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h + and older. + * mce=nobootlog Don't log MCEs from before booting. + * mce=bios_cmci_threshold Don't program the CMCI threshold + * mce=recovery force enable memcpy_mcsafe() + */ +static int __init mcheck_enable(char *str) +{ + struct mca_config *cfg = &mca_cfg; + + if (*str == 0) { + enable_p5_mce(); + return 1; + } + if (*str == '=') + str++; + if (!strcmp(str, "off")) + cfg->disabled = 1; + else if (!strcmp(str, "no_cmci")) + cfg->cmci_disabled = true; + else if (!strcmp(str, "no_lmce")) + cfg->lmce_disabled = 1; + else if (!strcmp(str, "dont_log_ce")) + cfg->dont_log_ce = true; + else if (!strcmp(str, "ignore_ce")) + cfg->ignore_ce = true; + else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) + cfg->bootlog = (str[0] == 'b'); + else if (!strcmp(str, "bios_cmci_threshold")) + cfg->bios_cmci_threshold = 1; + else if (!strcmp(str, "recovery")) + cfg->recovery = 1; + else if (isdigit(str[0])) { + if (get_option(&str, &cfg->tolerant) == 2) + get_option(&str, &(cfg->monarch_timeout)); + } else { + pr_info("mce argument %s ignored. Please use /sys\n", str); + return 0; + } + return 1; +} +__setup("mce", mcheck_enable); + +int __init mcheck_init(void) +{ + mcheck_intel_therm_init(); + mce_register_decode_chain(&first_nb); + mce_register_decode_chain(&mce_srao_nb); + mce_register_decode_chain(&mce_default_nb); + mcheck_vendor_init_severity(); + + INIT_WORK(&mce_work, mce_gen_pool_process); + init_irq_work(&mce_irq_work, mce_irq_work_cb); + + return 0; +} + +/* + * mce_syscore: PM support + */ + +/* + * Disable machine checks on suspend and shutdown. We can't really handle + * them later. + */ +static void mce_disable_error_reporting(void) +{ + int i; + + for (i = 0; i < mca_cfg.banks; i++) { + struct mce_bank *b = &mce_banks[i]; + + if (b->init) + wrmsrl(msr_ops.ctl(i), 0); + } + return; +} + +static void vendor_disable_error_reporting(void) +{ + /* + * Don't clear on Intel or AMD or Hygon CPUs. Some of these MSRs + * are socket-wide. + * Disabling them for just a single offlined CPU is bad, since it will + * inhibit reporting for all shared resources on the socket like the + * last level cache (LLC), the integrated memory controller (iMC), etc. + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON || + boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return; + + mce_disable_error_reporting(); +} + +static int mce_syscore_suspend(void) +{ + vendor_disable_error_reporting(); + return 0; +} + +static void mce_syscore_shutdown(void) +{ + vendor_disable_error_reporting(); +} + +/* + * On resume clear all MCE state. Don't want to see leftovers from the BIOS. + * Only one CPU is active at this time, the others get re-added later using + * CPU hotplug: + */ +static void mce_syscore_resume(void) +{ + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); + __mcheck_cpu_init_clear_banks(); +} + +static struct syscore_ops mce_syscore_ops = { + .suspend = mce_syscore_suspend, + .shutdown = mce_syscore_shutdown, + .resume = mce_syscore_resume, +}; + +/* + * mce_device: Sysfs support + */ + +static void mce_cpu_restart(void *data) +{ + if (!mce_available(raw_cpu_ptr(&cpu_info))) + return; + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_timer(); +} + +/* Reinit MCEs after user configuration changes */ +static void mce_restart(void) +{ + mce_timer_delete_all(); + on_each_cpu(mce_cpu_restart, NULL, 1); +} + +/* Toggle features for corrected errors */ +static void mce_disable_cmci(void *data) +{ + if (!mce_available(raw_cpu_ptr(&cpu_info))) + return; + cmci_clear(); +} + +static void mce_enable_ce(void *all) +{ + if (!mce_available(raw_cpu_ptr(&cpu_info))) + return; + cmci_reenable(); + cmci_recheck(); + if (all) + __mcheck_cpu_init_timer(); +} + +static struct bus_type mce_subsys = { + .name = "machinecheck", + .dev_name = "machinecheck", +}; + +DEFINE_PER_CPU(struct device *, mce_device); + +static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) +{ + return container_of(attr, struct mce_bank, attr); +} + +static ssize_t show_bank(struct device *s, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); +} + +static ssize_t set_bank(struct device *s, struct device_attribute *attr, + const char *buf, size_t size) +{ + u64 new; + + if (kstrtou64(buf, 0, &new) < 0) + return -EINVAL; + + attr_to_bank(attr)->ctl = new; + mce_restart(); + + return size; +} + +static ssize_t set_ignore_ce(struct device *s, + struct device_attribute *attr, + const char *buf, size_t size) +{ + u64 new; + + if (kstrtou64(buf, 0, &new) < 0) + return -EINVAL; + + mutex_lock(&mce_sysfs_mutex); + if (mca_cfg.ignore_ce ^ !!new) { + if (new) { + /* disable ce features */ + mce_timer_delete_all(); + on_each_cpu(mce_disable_cmci, NULL, 1); + mca_cfg.ignore_ce = true; + } else { + /* enable ce features */ + mca_cfg.ignore_ce = false; + on_each_cpu(mce_enable_ce, (void *)1, 1); + } + } + mutex_unlock(&mce_sysfs_mutex); + + return size; +} + +static ssize_t set_cmci_disabled(struct device *s, + struct device_attribute *attr, + const char *buf, size_t size) +{ + u64 new; + + if (kstrtou64(buf, 0, &new) < 0) + return -EINVAL; + + mutex_lock(&mce_sysfs_mutex); + if (mca_cfg.cmci_disabled ^ !!new) { + if (new) { + /* disable cmci */ + on_each_cpu(mce_disable_cmci, NULL, 1); + mca_cfg.cmci_disabled = true; + } else { + /* enable cmci */ + mca_cfg.cmci_disabled = false; + on_each_cpu(mce_enable_ce, NULL, 1); + } + } + mutex_unlock(&mce_sysfs_mutex); + + return size; +} + +static ssize_t store_int_with_restart(struct device *s, + struct device_attribute *attr, + const char *buf, size_t size) +{ + unsigned long old_check_interval = check_interval; + ssize_t ret = device_store_ulong(s, attr, buf, size); + + if (check_interval == old_check_interval) + return ret; + + mutex_lock(&mce_sysfs_mutex); + mce_restart(); + mutex_unlock(&mce_sysfs_mutex); + + return ret; +} + +static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); +static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); +static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); + +static struct dev_ext_attribute dev_attr_check_interval = { + __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), + &check_interval +}; + +static struct dev_ext_attribute dev_attr_ignore_ce = { + __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce), + &mca_cfg.ignore_ce +}; + +static struct dev_ext_attribute dev_attr_cmci_disabled = { + __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled), + &mca_cfg.cmci_disabled +}; + +static struct device_attribute *mce_device_attrs[] = { + &dev_attr_tolerant.attr, + &dev_attr_check_interval.attr, +#ifdef CONFIG_X86_MCELOG_LEGACY + &dev_attr_trigger, +#endif + &dev_attr_monarch_timeout.attr, + &dev_attr_dont_log_ce.attr, + &dev_attr_ignore_ce.attr, + &dev_attr_cmci_disabled.attr, + NULL +}; + +static cpumask_var_t mce_device_initialized; + +static void mce_device_release(struct device *dev) +{ + kfree(dev); +} + +/* Per cpu device init. All of the cpus still share the same ctrl bank: */ +static int mce_device_create(unsigned int cpu) +{ + struct device *dev; + int err; + int i, j; + + if (!mce_available(&boot_cpu_data)) + return -EIO; + + dev = per_cpu(mce_device, cpu); + if (dev) + return 0; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + dev->id = cpu; + dev->bus = &mce_subsys; + dev->release = &mce_device_release; + + err = device_register(dev); + if (err) { + put_device(dev); + return err; + } + + for (i = 0; mce_device_attrs[i]; i++) { + err = device_create_file(dev, mce_device_attrs[i]); + if (err) + goto error; + } + for (j = 0; j < mca_cfg.banks; j++) { + err = device_create_file(dev, &mce_banks[j].attr); + if (err) + goto error2; + } + cpumask_set_cpu(cpu, mce_device_initialized); + per_cpu(mce_device, cpu) = dev; + + return 0; +error2: + while (--j >= 0) + device_remove_file(dev, &mce_banks[j].attr); +error: + while (--i >= 0) + device_remove_file(dev, mce_device_attrs[i]); + + device_unregister(dev); + + return err; +} + +static void mce_device_remove(unsigned int cpu) +{ + struct device *dev = per_cpu(mce_device, cpu); + int i; + + if (!cpumask_test_cpu(cpu, mce_device_initialized)) + return; + + for (i = 0; mce_device_attrs[i]; i++) + device_remove_file(dev, mce_device_attrs[i]); + + for (i = 0; i < mca_cfg.banks; i++) + device_remove_file(dev, &mce_banks[i].attr); + + device_unregister(dev); + cpumask_clear_cpu(cpu, mce_device_initialized); + per_cpu(mce_device, cpu) = NULL; +} + +/* Make sure there are no machine checks on offlined CPUs. */ +static void mce_disable_cpu(void) +{ + if (!mce_available(raw_cpu_ptr(&cpu_info))) + return; + + if (!cpuhp_tasks_frozen) + cmci_clear(); + + vendor_disable_error_reporting(); +} + +static void mce_reenable_cpu(void) +{ + int i; + + if (!mce_available(raw_cpu_ptr(&cpu_info))) + return; + + if (!cpuhp_tasks_frozen) + cmci_reenable(); + for (i = 0; i < mca_cfg.banks; i++) { + struct mce_bank *b = &mce_banks[i]; + + if (b->init) + wrmsrl(msr_ops.ctl(i), b->ctl); + } +} + +static int mce_cpu_dead(unsigned int cpu) +{ + mce_intel_hcpu_update(cpu); + + /* intentionally ignoring frozen here */ + if (!cpuhp_tasks_frozen) + cmci_rediscover(); + return 0; +} + +static int mce_cpu_online(unsigned int cpu) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + int ret; + + mce_device_create(cpu); + + ret = mce_threshold_create_device(cpu); + if (ret) { + mce_device_remove(cpu); + return ret; + } + mce_reenable_cpu(); + mce_start_timer(t); + return 0; +} + +static int mce_cpu_pre_down(unsigned int cpu) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + + mce_disable_cpu(); + del_timer_sync(t); + mce_threshold_remove_device(cpu); + mce_device_remove(cpu); + return 0; +} + +static __init void mce_init_banks(void) +{ + int i; + + for (i = 0; i < mca_cfg.banks; i++) { + struct mce_bank *b = &mce_banks[i]; + struct device_attribute *a = &b->attr; + + sysfs_attr_init(&a->attr); + a->attr.name = b->attrname; + snprintf(b->attrname, ATTR_LEN, "bank%d", i); + + a->attr.mode = 0644; + a->show = show_bank; + a->store = set_bank; + } +} + +static __init int mcheck_init_device(void) +{ + int err; + + /* + * Check if we have a spare virtual bit. This will only become + * a problem if/when we move beyond 5-level page tables. + */ + MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63); + + if (!mce_available(&boot_cpu_data)) { + err = -EIO; + goto err_out; + } + + if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) { + err = -ENOMEM; + goto err_out; + } + + mce_init_banks(); + + err = subsys_system_register(&mce_subsys, NULL); + if (err) + goto err_out_mem; + + err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL, + mce_cpu_dead); + if (err) + goto err_out_mem; + + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online", + mce_cpu_online, mce_cpu_pre_down); + if (err < 0) + goto err_out_online; + + register_syscore_ops(&mce_syscore_ops); + + return 0; + +err_out_online: + cpuhp_remove_state(CPUHP_X86_MCE_DEAD); + +err_out_mem: + free_cpumask_var(mce_device_initialized); + +err_out: + pr_err("Unable to init MCE device (rc: %d)\n", err); + + return err; +} +device_initcall_sync(mcheck_init_device); + +/* + * Old style boot options parsing. Only for compatibility. + */ +static int __init mcheck_disable(char *str) +{ + mca_cfg.disabled = 1; + return 1; +} +__setup("nomce", mcheck_disable); + +#ifdef CONFIG_DEBUG_FS +struct dentry *mce_get_debugfs_dir(void) +{ + static struct dentry *dmce; + + if (!dmce) + dmce = debugfs_create_dir("mce", NULL); + + return dmce; +} + +static void mce_reset(void) +{ + cpu_missing = 0; + atomic_set(&mce_fake_panicked, 0); + atomic_set(&mce_executing, 0); + atomic_set(&mce_callin, 0); + atomic_set(&global_nwo, 0); +} + +static int fake_panic_get(void *data, u64 *val) +{ + *val = fake_panic; + return 0; +} + +static int fake_panic_set(void *data, u64 val) +{ + mce_reset(); + fake_panic = val; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, + fake_panic_set, "%llu\n"); + +static int __init mcheck_debugfs_init(void) +{ + struct dentry *dmce, *ffake_panic; + + dmce = mce_get_debugfs_dir(); + if (!dmce) + return -ENOMEM; + ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, + &fake_panic_fops); + if (!ffake_panic) + return -ENOMEM; + + return 0; +} +#else +static int __init mcheck_debugfs_init(void) { return -EINVAL; } +#endif + +DEFINE_STATIC_KEY_FALSE(mcsafe_key); +EXPORT_SYMBOL_GPL(mcsafe_key); + +static int __init mcheck_late_init(void) +{ + if (mca_cfg.recovery) + static_branch_inc(&mcsafe_key); + + mcheck_debugfs_init(); + cec_init(); + + /* + * Flush out everything that has been logged during early boot, now that + * everything has been initialized (workqueues, decoders, ...). + */ + mce_schedule_work(); + + return 0; +} +late_initcall(mcheck_late_init); diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -0,0 +1,358 @@ +/* + * /dev/mcelog driver + * + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen + */ + +#include <linux/miscdevice.h> +#include <linux/slab.h> +#include <linux/kmod.h> +#include <linux/poll.h> + +#include "internal.h" + +static BLOCKING_NOTIFIER_HEAD(mce_injector_chain); + +static DEFINE_MUTEX(mce_chrdev_read_mutex); + +static char mce_helper[128]; +static char *mce_helper_argv[2] = { mce_helper, NULL }; + +/* + * Lockless MCE logging infrastructure. + * This avoids deadlocks on printk locks without having to break locks. Also + * separate MCEs from kernel messages to avoid bogus bug reports. + */ + +static struct mce_log_buffer mcelog = { + .signature = MCE_LOG_SIGNATURE, + .len = MCE_LOG_LEN, + .recordlen = sizeof(struct mce), +}; + +static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); + +static int dev_mce_log(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *mce = (struct mce *)data; + unsigned int entry; + + mutex_lock(&mce_chrdev_read_mutex); + + entry = mcelog.next; + + /* + * When the buffer fills up discard new entries. Assume that the + * earlier errors are the more interesting ones: + */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); + goto unlock; + } + + mcelog.next = entry + 1; + + memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); + mcelog.entry[entry].finished = 1; + + /* wake processes polling /dev/mcelog */ + wake_up_interruptible(&mce_chrdev_wait); + +unlock: + mutex_unlock(&mce_chrdev_read_mutex); + + return NOTIFY_OK; +} + +static struct notifier_block dev_mcelog_nb = { + .notifier_call = dev_mce_log, + .priority = MCE_PRIO_MCELOG, +}; + +static void mce_do_trigger(struct work_struct *work) +{ + call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); +} + +static DECLARE_WORK(mce_trigger_work, mce_do_trigger); + + +void mce_work_trigger(void) +{ + if (mce_helper[0]) + schedule_work(&mce_trigger_work); +} + +static ssize_t +show_trigger(struct device *s, struct device_attribute *attr, char *buf) +{ + strcpy(buf, mce_helper); + strcat(buf, "\n"); + return strlen(mce_helper) + 1; +} + +static ssize_t set_trigger(struct device *s, struct device_attribute *attr, + const char *buf, size_t siz) +{ + char *p; + + strncpy(mce_helper, buf, sizeof(mce_helper)); + mce_helper[sizeof(mce_helper)-1] = 0; + p = strchr(mce_helper, '\n'); + + if (p) + *p = 0; + + return strlen(mce_helper) + !!p; +} + +DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); + +/* + * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. + */ + +static DEFINE_SPINLOCK(mce_chrdev_state_lock); +static int mce_chrdev_open_count; /* #times opened */ +static int mce_chrdev_open_exclu; /* already open exclusive? */ + +static int mce_chrdev_open(struct inode *inode, struct file *file) +{ + spin_lock(&mce_chrdev_state_lock); + + if (mce_chrdev_open_exclu || + (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&mce_chrdev_state_lock); + + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + mce_chrdev_open_exclu = 1; + mce_chrdev_open_count++; + + spin_unlock(&mce_chrdev_state_lock); + + return nonseekable_open(inode, file); +} + +static int mce_chrdev_release(struct inode *inode, struct file *file) +{ + spin_lock(&mce_chrdev_state_lock); + + mce_chrdev_open_count--; + mce_chrdev_open_exclu = 0; + + spin_unlock(&mce_chrdev_state_lock); + + return 0; +} + +static int mce_apei_read_done; + +/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ +static int __mce_read_apei(char __user **ubuf, size_t usize) +{ + int rc; + u64 record_id; + struct mce m; + + if (usize < sizeof(struct mce)) + return -EINVAL; + + rc = apei_read_mce(&m, &record_id); + /* Error or no more MCE record */ + if (rc <= 0) { + mce_apei_read_done = 1; + /* + * When ERST is disabled, mce_chrdev_read() should return + * "no record" instead of "no device." + */ + if (rc == -ENODEV) + return 0; + return rc; + } + rc = -EFAULT; + if (copy_to_user(*ubuf, &m, sizeof(struct mce))) + return rc; + /* + * In fact, we should have cleared the record after that has + * been flushed to the disk or sent to network in + * /sbin/mcelog, but we have no interface to support that now, + * so just clear it to avoid duplication. + */ + rc = apei_clear_mce(record_id); + if (rc) { + mce_apei_read_done = 1; + return rc; + } + *ubuf += sizeof(struct mce); + + return 0; +} + +static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, + size_t usize, loff_t *off) +{ + char __user *buf = ubuf; + unsigned next; + int i, err; + + mutex_lock(&mce_chrdev_read_mutex); + + if (!mce_apei_read_done) { + err = __mce_read_apei(&buf, usize); + if (err || buf != ubuf) + goto out; + } + + /* Only supports full reads right now */ + err = -EINVAL; + if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) + goto out; + + next = mcelog.next; + err = 0; + + for (i = 0; i < next; i++) { + struct mce *m = &mcelog.entry[i]; + + err |= copy_to_user(buf, m, sizeof(*m)); + buf += sizeof(*m); + } + + memset(mcelog.entry, 0, next * sizeof(struct mce)); + mcelog.next = 0; + + if (err) + err = -EFAULT; + +out: + mutex_unlock(&mce_chrdev_read_mutex); + + return err ? err : buf - ubuf; +} + +static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &mce_chrdev_wait, wait); + if (READ_ONCE(mcelog.next)) + return EPOLLIN | EPOLLRDNORM; + if (!mce_apei_read_done && apei_check_mce()) + return EPOLLIN | EPOLLRDNORM; + return 0; +} + +static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, + unsigned long arg) +{ + int __user *p = (int __user *)arg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct mce), p); + case MCE_GET_LOG_LEN: + return put_user(MCE_LOG_LEN, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + + do { + flags = mcelog.flags; + } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +void mce_register_injector_chain(struct notifier_block *nb) +{ + blocking_notifier_chain_register(&mce_injector_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_register_injector_chain); + +void mce_unregister_injector_chain(struct notifier_block *nb) +{ + blocking_notifier_chain_unregister(&mce_injector_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_unregister_injector_chain); + +static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) +{ + struct mce m; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* + * There are some cases where real MSR reads could slip + * through. + */ + if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA)) + return -EIO; + + if ((unsigned long)usize > sizeof(struct mce)) + usize = sizeof(struct mce); + if (copy_from_user(&m, ubuf, usize)) + return -EFAULT; + + if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) + return -EINVAL; + + /* + * Need to give user space some time to set everything up, + * so do it a jiffie or two later everywhere. + */ + schedule_timeout(2); + + blocking_notifier_call_chain(&mce_injector_chain, 0, &m); + + return usize; +} + +static const struct file_operations mce_chrdev_ops = { + .open = mce_chrdev_open, + .release = mce_chrdev_release, + .read = mce_chrdev_read, + .write = mce_chrdev_write, + .poll = mce_chrdev_poll, + .unlocked_ioctl = mce_chrdev_ioctl, + .llseek = no_llseek, +}; + +static struct miscdevice mce_chrdev_device = { + MISC_MCELOG_MINOR, + "mcelog", + &mce_chrdev_ops, +}; + +static __init int dev_mcelog_init_device(void) +{ + int err; + + /* register character device /dev/mcelog */ + err = misc_register(&mce_chrdev_device); + if (err) { + if (err == -EBUSY) + /* Xen dom0 might have registered the device already. */ + pr_info("Unable to init device /dev/mcelog, already registered"); + else + pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + + return err; + } + + mce_register_decode_chain(&dev_mcelog_nb); + return 0; +} +device_initcall_sync(dev_mcelog_init_device); diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c @@ -0,0 +1,145 @@ +/* + * MCE event pool management in MCE context + * + * Copyright (C) 2015 Intel Corp. + * Author: Chen, Gong <gong.chen@linux.intel.com> + * + * This file is licensed under GPLv2. + */ +#include <linux/smp.h> +#include <linux/mm.h> +#include <linux/genalloc.h> +#include <linux/llist.h> +#include "internal.h" + +/* + * printk() is not safe in MCE context. This is a lock-less memory allocator + * used to save error information organized in a lock-less list. + * + * This memory pool is only to be used to save MCE records in MCE context. + * MCE events are rare, so a fixed size memory pool should be enough. Use + * 2 pages to save MCE events for now (~80 MCE records at most). + */ +#define MCE_POOLSZ (2 * PAGE_SIZE) + +static struct gen_pool *mce_evt_pool; +static LLIST_HEAD(mce_event_llist); +static char gen_pool_buf[MCE_POOLSZ]; + +/* + * Compare the record "t" with each of the records on list "l" to see if + * an equivalent one is present in the list. + */ +static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l) +{ + struct mce_evt_llist *node; + struct mce *m1, *m2; + + m1 = &t->mce; + + llist_for_each_entry(node, &l->llnode, llnode) { + m2 = &node->mce; + + if (!mce_cmp(m1, m2)) + return true; + } + return false; +} + +/* + * The system has panicked - we'd like to peruse the list of MCE records + * that have been queued, but not seen by anyone yet. The list is in + * reverse time order, so we need to reverse it. While doing that we can + * also drop duplicate records (these were logged because some banks are + * shared between cores or by all threads on a socket). + */ +struct llist_node *mce_gen_pool_prepare_records(void) +{ + struct llist_node *head; + LLIST_HEAD(new_head); + struct mce_evt_llist *node, *t; + + head = llist_del_all(&mce_event_llist); + if (!head) + return NULL; + + /* squeeze out duplicates while reversing order */ + llist_for_each_entry_safe(node, t, head, llnode) { + if (!is_duplicate_mce_record(node, t)) + llist_add(&node->llnode, &new_head); + } + + return new_head.first; +} + +void mce_gen_pool_process(struct work_struct *__unused) +{ + struct llist_node *head; + struct mce_evt_llist *node, *tmp; + struct mce *mce; + + head = llist_del_all(&mce_event_llist); + if (!head) + return; + + head = llist_reverse_order(head); + llist_for_each_entry_safe(node, tmp, head, llnode) { + mce = &node->mce; + blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); + } +} + +bool mce_gen_pool_empty(void) +{ + return llist_empty(&mce_event_llist); +} + +int mce_gen_pool_add(struct mce *mce) +{ + struct mce_evt_llist *node; + + if (!mce_evt_pool) + return -EINVAL; + + node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node)); + if (!node) { + pr_warn_ratelimited("MCE records pool full!\n"); + return -ENOMEM; + } + + memcpy(&node->mce, mce, sizeof(*mce)); + llist_add(&node->llnode, &mce_event_llist); + + return 0; +} + +static int mce_gen_pool_create(void) +{ + struct gen_pool *tmpp; + int ret = -ENOMEM; + + tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1); + if (!tmpp) + goto out; + + ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1); + if (ret) { + gen_pool_destroy(tmpp); + goto out; + } + + mce_evt_pool = tmpp; + +out: + return ret; +} + +int mce_gen_pool_init(void) +{ + /* Just init mce_gen_pool once. */ + if (mce_evt_pool) + return 0; + + return mce_gen_pool_create(); +} diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c @@ -0,0 +1,739 @@ +/* + * Machine check injection support. + * Copyright 2008 Intel Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Authors: + * Andi Kleen + * Ying Huang + * + * The AMD part (from mce_amd_inj.c): a simple MCE injection facility + * for testing different aspects of the RAS code. This driver should be + * built as module so that it can be loaded on production kernels for + * testing purposes. + * + * This file may be distributed under the terms of the GNU General Public + * License version 2. + * + * Copyright (c) 2010-17: Borislav Petkov <bp@alien8.de> + * Advanced Micro Devices Inc. + */ + +#include <linux/cpu.h> +#include <linux/debugfs.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/notifier.h> +#include <linux/pci.h> +#include <linux/uaccess.h> + +#include <asm/amd_nb.h> +#include <asm/apic.h> +#include <asm/irq_vectors.h> +#include <asm/mce.h> +#include <asm/nmi.h> +#include <asm/smp.h> + +#include "internal.h" + +/* + * Collect all the MCi_XXX settings + */ +static struct mce i_mce; +static struct dentry *dfs_inj; + +static u8 n_banks; + +#define MAX_FLAG_OPT_SIZE 4 +#define NBCFG 0x44 + +enum injection_type { + SW_INJ = 0, /* SW injection, simply decode the error */ + HW_INJ, /* Trigger a #MC */ + DFR_INT_INJ, /* Trigger Deferred error interrupt */ + THR_INT_INJ, /* Trigger threshold interrupt */ + N_INJ_TYPES, +}; + +static const char * const flags_options[] = { + [SW_INJ] = "sw", + [HW_INJ] = "hw", + [DFR_INT_INJ] = "df", + [THR_INT_INJ] = "th", + NULL +}; + +/* Set default injection to SW_INJ */ +static enum injection_type inj_type = SW_INJ; + +#define MCE_INJECT_SET(reg) \ +static int inj_##reg##_set(void *data, u64 val) \ +{ \ + struct mce *m = (struct mce *)data; \ + \ + m->reg = val; \ + return 0; \ +} + +MCE_INJECT_SET(status); +MCE_INJECT_SET(misc); +MCE_INJECT_SET(addr); +MCE_INJECT_SET(synd); + +#define MCE_INJECT_GET(reg) \ +static int inj_##reg##_get(void *data, u64 *val) \ +{ \ + struct mce *m = (struct mce *)data; \ + \ + *val = m->reg; \ + return 0; \ +} + +MCE_INJECT_GET(status); +MCE_INJECT_GET(misc); +MCE_INJECT_GET(addr); +MCE_INJECT_GET(synd); + +DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); + +static void setup_inj_struct(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + + m->cpuvendor = boot_cpu_data.x86_vendor; + m->time = ktime_get_real_seconds(); + m->cpuid = cpuid_eax(1); + m->microcode = boot_cpu_data.microcode; +} + +/* Update fake mce registers on current CPU. */ +static void inject_mce(struct mce *m) +{ + struct mce *i = &per_cpu(injectm, m->extcpu); + + /* Make sure no one reads partially written injectm */ + i->finished = 0; + mb(); + m->finished = 0; + /* First set the fields after finished */ + i->extcpu = m->extcpu; + mb(); + /* Now write record in order, finished last (except above) */ + memcpy(i, m, sizeof(struct mce)); + /* Finally activate it */ + mb(); + i->finished = 1; +} + +static void raise_poll(struct mce *m) +{ + unsigned long flags; + mce_banks_t b; + + memset(&b, 0xff, sizeof(mce_banks_t)); + local_irq_save(flags); + machine_check_poll(0, &b); + local_irq_restore(flags); + m->finished = 0; +} + +static void raise_exception(struct mce *m, struct pt_regs *pregs) +{ + struct pt_regs regs; + unsigned long flags; + + if (!pregs) { + memset(&regs, 0, sizeof(struct pt_regs)); + regs.ip = m->ip; + regs.cs = m->cs; + pregs = &regs; + } + /* in mcheck exeception handler, irq will be disabled */ + local_irq_save(flags); + do_machine_check(pregs, 0); + local_irq_restore(flags); + m->finished = 0; +} + +static cpumask_var_t mce_inject_cpumask; +static DEFINE_MUTEX(mce_inject_mutex); + +static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + struct mce *m = this_cpu_ptr(&injectm); + if (!cpumask_test_cpu(cpu, mce_inject_cpumask)) + return NMI_DONE; + cpumask_clear_cpu(cpu, mce_inject_cpumask); + if (m->inject_flags & MCJ_EXCEPTION) + raise_exception(m, regs); + else if (m->status) + raise_poll(m); + return NMI_HANDLED; +} + +static void mce_irq_ipi(void *info) +{ + int cpu = smp_processor_id(); + struct mce *m = this_cpu_ptr(&injectm); + + if (cpumask_test_cpu(cpu, mce_inject_cpumask) && + m->inject_flags & MCJ_EXCEPTION) { + cpumask_clear_cpu(cpu, mce_inject_cpumask); + raise_exception(m, NULL); + } +} + +/* Inject mce on current CPU */ +static int raise_local(void) +{ + struct mce *m = this_cpu_ptr(&injectm); + int context = MCJ_CTX(m->inject_flags); + int ret = 0; + int cpu = m->extcpu; + + if (m->inject_flags & MCJ_EXCEPTION) { + pr_info("Triggering MCE exception on CPU %d\n", cpu); + switch (context) { + case MCJ_CTX_IRQ: + /* + * Could do more to fake interrupts like + * calling irq_enter, but the necessary + * machinery isn't exported currently. + */ + /*FALL THROUGH*/ + case MCJ_CTX_PROCESS: + raise_exception(m, NULL); + break; + default: + pr_info("Invalid MCE context\n"); + ret = -EINVAL; + } + pr_info("MCE exception done on CPU %d\n", cpu); + } else if (m->status) { + pr_info("Starting machine check poll CPU %d\n", cpu); + raise_poll(m); + mce_notify_irq(); + pr_info("Machine check poll done on CPU %d\n", cpu); + } else + m->finished = 0; + + return ret; +} + +static void __maybe_unused raise_mce(struct mce *m) +{ + int context = MCJ_CTX(m->inject_flags); + + inject_mce(m); + + if (context == MCJ_CTX_RANDOM) + return; + + if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) { + unsigned long start; + int cpu; + + get_online_cpus(); + cpumask_copy(mce_inject_cpumask, cpu_online_mask); + cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); + for_each_online_cpu(cpu) { + struct mce *mcpu = &per_cpu(injectm, cpu); + if (!mcpu->finished || + MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) + cpumask_clear_cpu(cpu, mce_inject_cpumask); + } + if (!cpumask_empty(mce_inject_cpumask)) { + if (m->inject_flags & MCJ_IRQ_BROADCAST) { + /* + * don't wait because mce_irq_ipi is necessary + * to be sync with following raise_local + */ + preempt_disable(); + smp_call_function_many(mce_inject_cpumask, + mce_irq_ipi, NULL, 0); + preempt_enable(); + } else if (m->inject_flags & MCJ_NMI_BROADCAST) + apic->send_IPI_mask(mce_inject_cpumask, + NMI_VECTOR); + } + start = jiffies; + while (!cpumask_empty(mce_inject_cpumask)) { + if (!time_before(jiffies, start + 2*HZ)) { + pr_err("Timeout waiting for mce inject %lx\n", + *cpumask_bits(mce_inject_cpumask)); + break; + } + cpu_relax(); + } + raise_local(); + put_cpu(); + put_online_cpus(); + } else { + preempt_disable(); + raise_local(); + preempt_enable(); + } +} + +static int mce_inject_raise(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + + if (!m) + return NOTIFY_DONE; + + mutex_lock(&mce_inject_mutex); + raise_mce(m); + mutex_unlock(&mce_inject_mutex); + + return NOTIFY_DONE; +} + +static struct notifier_block inject_nb = { + .notifier_call = mce_inject_raise, +}; + +/* + * Caller needs to be make sure this cpu doesn't disappear + * from under us, i.e.: get_cpu/put_cpu. + */ +static int toggle_hw_mce_inject(unsigned int cpu, bool enable) +{ + u32 l, h; + int err; + + err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h); + if (err) { + pr_err("%s: error reading HWCR\n", __func__); + return err; + } + + enable ? (l |= BIT(18)) : (l &= ~BIT(18)); + + err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h); + if (err) + pr_err("%s: error writing HWCR\n", __func__); + + return err; +} + +static int __set_inj(const char *buf) +{ + int i; + + for (i = 0; i < N_INJ_TYPES; i++) { + if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) { + inj_type = i; + return 0; + } + } + return -EINVAL; +} + +static ssize_t flags_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE]; + int n; + + n = sprintf(buf, "%s\n", flags_options[inj_type]); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); +} + +static ssize_t flags_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE], *__buf; + int err; + + if (cnt > MAX_FLAG_OPT_SIZE) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt - 1] = 0; + + /* strip whitespace */ + __buf = strstrip(buf); + + err = __set_inj(__buf); + if (err) { + pr_err("%s: Invalid flags value: %s\n", __func__, __buf); + return err; + } + + *ppos += cnt; + + return cnt; +} + +static const struct file_operations flags_fops = { + .read = flags_read, + .write = flags_write, + .llseek = generic_file_llseek, +}; + +/* + * On which CPU to inject? + */ +MCE_INJECT_GET(extcpu); + +static int inj_extcpu_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (val >= nr_cpu_ids || !cpu_online(val)) { + pr_err("%s: Invalid CPU: %llu\n", __func__, val); + return -EINVAL; + } + m->extcpu = val; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n"); + +static void trigger_mce(void *info) +{ + asm volatile("int $18"); +} + +static void trigger_dfr_int(void *info) +{ + asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR)); +} + +static void trigger_thr_int(void *info) +{ + asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR)); +} + +static u32 get_nbc_for_node(int node_id) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + u32 cores_per_node; + + cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket(); + + return cores_per_node * node_id; +} + +static void toggle_nb_mca_mst_cpu(u16 nid) +{ + struct amd_northbridge *nb; + struct pci_dev *F3; + u32 val; + int err; + + nb = node_to_amd_nb(nid); + if (!nb) + return; + + F3 = nb->misc; + if (!F3) + return; + + err = pci_read_config_dword(F3, NBCFG, &val); + if (err) { + pr_err("%s: Error reading F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); + return; + } + + if (val & BIT(27)) + return; + + pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n", + __func__); + + val |= BIT(27); + err = pci_write_config_dword(F3, NBCFG, val); + if (err) + pr_err("%s: Error writing F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); +} + +static void prepare_msrs(void *info) +{ + struct mce m = *(struct mce *)info; + u8 b = m.bank; + + wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + + if (boot_cpu_has(X86_FEATURE_SMCA)) { + if (m.inject_flags == DFR_INT_INJ) { + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); + wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); + } else { + wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); + wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); + } + + wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); + wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); + } else { + wrmsrl(MSR_IA32_MCx_STATUS(b), m.status); + wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr); + wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); + } +} + +static void do_inject(void) +{ + u64 mcg_status = 0; + unsigned int cpu = i_mce.extcpu; + u8 b = i_mce.bank; + + i_mce.tsc = rdtsc_ordered(); + + if (i_mce.misc) + i_mce.status |= MCI_STATUS_MISCV; + + if (i_mce.synd) + i_mce.status |= MCI_STATUS_SYNDV; + + if (inj_type == SW_INJ) { + mce_inject_log(&i_mce); + return; + } + + /* prep MCE global settings for the injection */ + mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV; + + if (!(i_mce.status & MCI_STATUS_PCC)) + mcg_status |= MCG_STATUS_RIPV; + + /* + * Ensure necessary status bits for deferred errors: + * - MCx_STATUS[Deferred]: make sure it is a deferred error + * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC + */ + if (inj_type == DFR_INT_INJ) { + i_mce.status |= MCI_STATUS_DEFERRED; + i_mce.status |= (i_mce.status & ~MCI_STATUS_UC); + } + + /* + * For multi node CPUs, logging and reporting of bank 4 errors happens + * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for + * Fam10h and later BKDGs. + */ + if (static_cpu_has(X86_FEATURE_AMD_DCM) && + b == 4 && + boot_cpu_data.x86 < 0x17) { + toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); + cpu = get_nbc_for_node(amd_get_nb_id(cpu)); + } + + get_online_cpus(); + if (!cpu_online(cpu)) + goto err; + + toggle_hw_mce_inject(cpu, true); + + i_mce.mcgstatus = mcg_status; + i_mce.inject_flags = inj_type; + smp_call_function_single(cpu, prepare_msrs, &i_mce, 0); + + toggle_hw_mce_inject(cpu, false); + + switch (inj_type) { + case DFR_INT_INJ: + smp_call_function_single(cpu, trigger_dfr_int, NULL, 0); + break; + case THR_INT_INJ: + smp_call_function_single(cpu, trigger_thr_int, NULL, 0); + break; + default: + smp_call_function_single(cpu, trigger_mce, NULL, 0); + } + +err: + put_online_cpus(); + +} + +/* + * This denotes into which bank we're injecting and triggers + * the injection, at the same time. + */ +static int inj_bank_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (val >= n_banks) { + pr_err("Non-existent MCE bank: %llu\n", val); + return -EINVAL; + } + + m->bank = val; + do_inject(); + + /* Reset injection struct */ + setup_inj_struct(&i_mce); + + return 0; +} + +MCE_INJECT_GET(bank); + +DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n"); + +static const char readme_msg[] = +"Description of the files and their usages:\n" +"\n" +"Note1: i refers to the bank number below.\n" +"Note2: See respective BKDGs for the exact bit definitions of the files below\n" +"as they mirror the hardware registers.\n" +"\n" +"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n" +"\t attributes of the error which caused the MCE.\n" +"\n" +"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n" +"\t used for error thresholding purposes and its validity is indicated by\n" +"\t MCi_STATUS[MiscV].\n" +"\n" +"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n" +"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n" +"\n" +"addr:\t Error address value to be written to MCi_ADDR. Log address information\n" +"\t associated with the error.\n" +"\n" +"cpu:\t The CPU to inject the error on.\n" +"\n" +"bank:\t Specify the bank you want to inject the error into: the number of\n" +"\t banks in a processor varies and is family/model-specific, therefore, the\n" +"\t supplied value is sanity-checked. Setting the bank value also triggers the\n" +"\t injection.\n" +"\n" +"flags:\t Injection type to be performed. Writing to this file will trigger a\n" +"\t real machine check, an APIC interrupt or invoke the error decoder routines\n" +"\t for AMD processors.\n" +"\n" +"\t Allowed error injection types:\n" +"\t - \"sw\": Software error injection. Decode error to a human-readable \n" +"\t format only. Safe to use.\n" +"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n" +"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n" +"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n" +"\t before injecting.\n" +"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n" +"\t error APIC interrupt handler to handle the error if the feature is \n" +"\t is present in hardware. \n" +"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n" +"\t APIC interrupt handler to handle the error. \n" +"\n"; + +static ssize_t +inj_readme_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_read_from_buffer(ubuf, cnt, ppos, + readme_msg, strlen(readme_msg)); +} + +static const struct file_operations readme_fops = { + .read = inj_readme_read, +}; + +static struct dfs_node { + char *name; + struct dentry *d; + const struct file_operations *fops; + umode_t perm; +} dfs_fls[] = { + { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH }, +}; + +static int __init debugfs_init(void) +{ + unsigned int i; + u64 cap; + + rdmsrl(MSR_IA32_MCG_CAP, cap); + n_banks = cap & MCG_BANKCNT_MASK; + + dfs_inj = debugfs_create_dir("mce-inject", NULL); + if (!dfs_inj) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) { + dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name, + dfs_fls[i].perm, + dfs_inj, + &i_mce, + dfs_fls[i].fops); + + if (!dfs_fls[i].d) + goto err_dfs_add; + } + + return 0; + +err_dfs_add: + while (i-- > 0) + debugfs_remove(dfs_fls[i].d); + + debugfs_remove(dfs_inj); + dfs_inj = NULL; + + return -ENODEV; +} + +static int __init inject_init(void) +{ + int err; + + if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) + return -ENOMEM; + + err = debugfs_init(); + if (err) { + free_cpumask_var(mce_inject_cpumask); + return err; + } + + register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); + mce_register_injector_chain(&inject_nb); + + setup_inj_struct(&i_mce); + + pr_info("Machine check injector initialized\n"); + + return 0; +} + +static void __exit inject_exit(void) +{ + + mce_unregister_injector_chain(&inject_nb); + unregister_nmi_handler(NMI_LOCAL, "mce_notify"); + + debugfs_remove_recursive(dfs_inj); + dfs_inj = NULL; + + memset(&dfs_fls, 0, sizeof(dfs_fls)); + + free_cpumask_var(mce_inject_cpumask); +} + +module_init(inject_init); +module_exit(inject_exit); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c @@ -0,0 +1,518 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel specific MCE features. + * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> + * Copyright (C) 2008, 2009 Intel Corporation + * Author: Andi Kleen + */ + +#include <linux/gfp.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/sched.h> +#include <linux/cpumask.h> +#include <asm/apic.h> +#include <asm/cpufeature.h> +#include <asm/intel-family.h> +#include <asm/processor.h> +#include <asm/msr.h> +#include <asm/mce.h> + +#include "internal.h" + +/* + * Support for Intel Correct Machine Check Interrupts. This allows + * the CPU to raise an interrupt when a corrected machine check happened. + * Normally we pick those up using a regular polling timer. + * Also supports reliable discovery of shared banks. + */ + +/* + * CMCI can be delivered to multiple cpus that share a machine check bank + * so we need to designate a single cpu to process errors logged in each bank + * in the interrupt handler (otherwise we would have many races and potential + * double reporting of the same error). + * Note that this can change when a cpu is offlined or brought online since + * some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear() + * disables CMCI on all banks owned by the cpu and clears this bitfield. At + * this point, cmci_rediscover() kicks in and a different cpu may end up + * taking ownership of some of the shared MCA banks that were previously + * owned by the offlined cpu. + */ +static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); + +/* + * CMCI storm detection backoff counter + * + * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've + * encountered an error. If not, we decrement it by one. We signal the end of + * the CMCI storm when it reaches 0. + */ +static DEFINE_PER_CPU(int, cmci_backoff_cnt); + +/* + * cmci_discover_lock protects against parallel discovery attempts + * which could race against each other. + */ +static DEFINE_RAW_SPINLOCK(cmci_discover_lock); + +#define CMCI_THRESHOLD 1 +#define CMCI_POLL_INTERVAL (30 * HZ) +#define CMCI_STORM_INTERVAL (HZ) +#define CMCI_STORM_THRESHOLD 15 + +static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); +static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt); +static DEFINE_PER_CPU(unsigned int, cmci_storm_state); + +enum { + CMCI_STORM_NONE, + CMCI_STORM_ACTIVE, + CMCI_STORM_SUBSIDED, +}; + +static atomic_t cmci_storm_on_cpus; + +static int cmci_supported(int *banks) +{ + u64 cap; + + if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce) + return 0; + + /* + * Vendor check is not strictly needed, but the initial + * initialization is vendor keyed and this + * makes sure none of the backdoors are entered otherwise. + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) + return 0; + rdmsrl(MSR_IA32_MCG_CAP, cap); + *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); + return !!(cap & MCG_CMCI_P); +} + +static bool lmce_supported(void) +{ + u64 tmp; + + if (mca_cfg.lmce_disabled) + return false; + + rdmsrl(MSR_IA32_MCG_CAP, tmp); + + /* + * LMCE depends on recovery support in the processor. Hence both + * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP. + */ + if ((tmp & (MCG_SER_P | MCG_LMCE_P)) != + (MCG_SER_P | MCG_LMCE_P)) + return false; + + /* + * BIOS should indicate support for LMCE by setting bit 20 in + * IA32_FEATURE_CONTROL without which touching MCG_EXT_CTL will + * generate a #GP fault. + */ + rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp); + if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) == + (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) + return true; + + return false; +} + +bool mce_intel_cmci_poll(void) +{ + if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) + return false; + + /* + * Reset the counter if we've logged an error in the last poll + * during the storm. + */ + if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned))) + this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); + else + this_cpu_dec(cmci_backoff_cnt); + + return true; +} + +void mce_intel_hcpu_update(unsigned long cpu) +{ + if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE) + atomic_dec(&cmci_storm_on_cpus); + + per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; +} + +static void cmci_toggle_interrupt_mode(bool on) +{ + unsigned long flags, *owned; + int bank; + u64 val; + + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + owned = this_cpu_ptr(mce_banks_owned); + for_each_set_bit(bank, owned, MAX_NR_BANKS) { + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + + if (on) + val |= MCI_CTL2_CMCI_EN; + else + val &= ~MCI_CTL2_CMCI_EN; + + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + } + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + +unsigned long cmci_intel_adjust_timer(unsigned long interval) +{ + if ((this_cpu_read(cmci_backoff_cnt) > 0) && + (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) { + mce_notify_irq(); + return CMCI_STORM_INTERVAL; + } + + switch (__this_cpu_read(cmci_storm_state)) { + case CMCI_STORM_ACTIVE: + + /* + * We switch back to interrupt mode once the poll timer has + * silenced itself. That means no events recorded and the timer + * interval is back to our poll interval. + */ + __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); + if (!atomic_sub_return(1, &cmci_storm_on_cpus)) + pr_notice("CMCI storm subsided: switching to interrupt mode\n"); + + /* FALLTHROUGH */ + + case CMCI_STORM_SUBSIDED: + /* + * We wait for all CPUs to go back to SUBSIDED state. When that + * happens we switch back to interrupt mode. + */ + if (!atomic_read(&cmci_storm_on_cpus)) { + __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); + cmci_toggle_interrupt_mode(true); + cmci_recheck(); + } + return CMCI_POLL_INTERVAL; + default: + + /* We have shiny weather. Let the poll do whatever it thinks. */ + return interval; + } +} + +static bool cmci_storm_detect(void) +{ + unsigned int cnt = __this_cpu_read(cmci_storm_cnt); + unsigned long ts = __this_cpu_read(cmci_time_stamp); + unsigned long now = jiffies; + int r; + + if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE) + return true; + + if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) { + cnt++; + } else { + cnt = 1; + __this_cpu_write(cmci_time_stamp, now); + } + __this_cpu_write(cmci_storm_cnt, cnt); + + if (cnt <= CMCI_STORM_THRESHOLD) + return false; + + cmci_toggle_interrupt_mode(false); + __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); + r = atomic_add_return(1, &cmci_storm_on_cpus); + mce_timer_kick(CMCI_STORM_INTERVAL); + this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); + + if (r == 1) + pr_notice("CMCI storm detected: switching to poll mode\n"); + return true; +} + +/* + * The interrupt handler. This is called on every event. + * Just call the poller directly to log any events. + * This could in theory increase the threshold under high load, + * but doesn't for now. + */ +static void intel_threshold_interrupt(void) +{ + if (cmci_storm_detect()) + return; + + machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); +} + +/* + * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks + * on this CPU. Use the algorithm recommended in the SDM to discover shared + * banks. + */ +static void cmci_discover(int banks) +{ + unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned); + unsigned long flags; + int i; + int bios_wrong_thresh = 0; + + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + for (i = 0; i < banks; i++) { + u64 val; + int bios_zero_thresh = 0; + + if (test_bit(i, owned)) + continue; + + /* Skip banks in firmware first mode */ + if (test_bit(i, mce_banks_ce_disabled)) + continue; + + rdmsrl(MSR_IA32_MCx_CTL2(i), val); + + /* Already owned by someone else? */ + if (val & MCI_CTL2_CMCI_EN) { + clear_bit(i, owned); + __clear_bit(i, this_cpu_ptr(mce_poll_banks)); + continue; + } + + if (!mca_cfg.bios_cmci_threshold) { + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; + val |= CMCI_THRESHOLD; + } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { + /* + * If bios_cmci_threshold boot option was specified + * but the threshold is zero, we'll try to initialize + * it to 1. + */ + bios_zero_thresh = 1; + val |= CMCI_THRESHOLD; + } + + val |= MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(i), val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); + + /* Did the enable bit stick? -- the bank supports CMCI */ + if (val & MCI_CTL2_CMCI_EN) { + set_bit(i, owned); + __clear_bit(i, this_cpu_ptr(mce_poll_banks)); + /* + * We are able to set thresholds for some banks that + * had a threshold of 0. This means the BIOS has not + * set the thresholds properly or does not work with + * this boot option. Note down now and report later. + */ + if (mca_cfg.bios_cmci_threshold && bios_zero_thresh && + (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) + bios_wrong_thresh = 1; + } else { + WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks))); + } + } + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { + pr_info_once( + "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); + pr_info_once( + "bios_cmci_threshold: Make sure your BIOS supports this boot option\n"); + } +} + +/* + * Just in case we missed an event during initialization check + * all the CMCI owned banks. + */ +void cmci_recheck(void) +{ + unsigned long flags; + int banks; + + if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks)) + return; + + local_irq_save(flags); + machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)); + local_irq_restore(flags); +} + +/* Caller must hold the lock on cmci_discover_lock */ +static void __cmci_disable_bank(int bank) +{ + u64 val; + + if (!test_bit(bank, this_cpu_ptr(mce_banks_owned))) + return; + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + __clear_bit(bank, this_cpu_ptr(mce_banks_owned)); +} + +/* + * Disable CMCI on this CPU for all banks it owns when it goes down. + * This allows other CPUs to claim the banks on rediscovery. + */ +void cmci_clear(void) +{ + unsigned long flags; + int i; + int banks; + + if (!cmci_supported(&banks)) + return; + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + for (i = 0; i < banks; i++) + __cmci_disable_bank(i); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + +static void cmci_rediscover_work_func(void *arg) +{ + int banks; + + /* Recheck banks in case CPUs don't all have the same */ + if (cmci_supported(&banks)) + cmci_discover(banks); +} + +/* After a CPU went down cycle through all the others and rediscover */ +void cmci_rediscover(void) +{ + int banks; + + if (!cmci_supported(&banks)) + return; + + on_each_cpu(cmci_rediscover_work_func, NULL, 1); +} + +/* + * Reenable CMCI on this CPU in case a CPU down failed. + */ +void cmci_reenable(void) +{ + int banks; + if (cmci_supported(&banks)) + cmci_discover(banks); +} + +void cmci_disable_bank(int bank) +{ + int banks; + unsigned long flags; + + if (!cmci_supported(&banks)) + return; + + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + __cmci_disable_bank(bank); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + +static void intel_init_cmci(void) +{ + int banks; + + if (!cmci_supported(&banks)) + return; + + mce_threshold_vector = intel_threshold_interrupt; + cmci_discover(banks); + /* + * For CPU #0 this runs with still disabled APIC, but that's + * ok because only the vector is set up. We still do another + * check for the banks later for CPU #0 just to make sure + * to not miss any events. + */ + apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); + cmci_recheck(); +} + +static void intel_init_lmce(void) +{ + u64 val; + + if (!lmce_supported()) + return; + + rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + + if (!(val & MCG_EXT_CTL_LMCE_EN)) + wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); +} + +static void intel_clear_lmce(void) +{ + u64 val; + + if (!lmce_supported()) + return; + + rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + val &= ~MCG_EXT_CTL_LMCE_EN; + wrmsrl(MSR_IA32_MCG_EXT_CTL, val); +} + +static void intel_ppin_init(struct cpuinfo_x86 *c) +{ + unsigned long long val; + + /* + * Even if testing the presence of the MSR would be enough, we don't + * want to risk the situation where other models reuse this MSR for + * other purposes. + */ + switch (c->x86_model) { + case INTEL_FAM6_IVYBRIDGE_X: + case INTEL_FAM6_HASWELL_X: + case INTEL_FAM6_BROADWELL_XEON_D: + case INTEL_FAM6_BROADWELL_X: + case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_XEON_PHI_KNL: + case INTEL_FAM6_XEON_PHI_KNM: + + if (rdmsrl_safe(MSR_PPIN_CTL, &val)) + return; + + if ((val & 3UL) == 1UL) { + /* PPIN available but disabled: */ + return; + } + + /* If PPIN is disabled, but not locked, try to enable: */ + if (!(val & 3UL)) { + wrmsrl_safe(MSR_PPIN_CTL, val | 2UL); + rdmsrl_safe(MSR_PPIN_CTL, &val); + } + + if ((val & 3UL) == 2UL) + set_cpu_cap(c, X86_FEATURE_INTEL_PPIN); + } +} + +void mce_intel_feature_init(struct cpuinfo_x86 *c) +{ + intel_init_thermal(c); + intel_init_cmci(); + intel_init_lmce(); + intel_ppin_init(c); +} + +void mce_intel_feature_clear(struct cpuinfo_x86 *c) +{ + intel_clear_lmce(); +} diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_MCE_INTERNAL_H__ +#define __X86_MCE_INTERNAL_H__ + +#undef pr_fmt +#define pr_fmt(fmt) "mce: " fmt + +#include <linux/device.h> +#include <asm/mce.h> + +enum severity_level { + MCE_NO_SEVERITY, + MCE_DEFERRED_SEVERITY, + MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY, + MCE_KEEP_SEVERITY, + MCE_SOME_SEVERITY, + MCE_AO_SEVERITY, + MCE_UC_SEVERITY, + MCE_AR_SEVERITY, + MCE_PANIC_SEVERITY, +}; + +extern struct blocking_notifier_head x86_mce_decoder_chain; + +#define ATTR_LEN 16 +#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ + +/* One object for each MCE bank, shared by all CPUs */ +struct mce_bank { + u64 ctl; /* subevents to enable */ + unsigned char init; /* initialise bank? */ + struct device_attribute attr; /* device attribute */ + char attrname[ATTR_LEN]; /* attribute name */ +}; + +struct mce_evt_llist { + struct llist_node llnode; + struct mce mce; +}; + +void mce_gen_pool_process(struct work_struct *__unused); +bool mce_gen_pool_empty(void); +int mce_gen_pool_add(struct mce *mce); +int mce_gen_pool_init(void); +struct llist_node *mce_gen_pool_prepare_records(void); + +extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); +struct dentry *mce_get_debugfs_dir(void); + +extern struct mce_bank *mce_banks; +extern mce_banks_t mce_banks_ce_disabled; + +#ifdef CONFIG_X86_MCE_INTEL +unsigned long cmci_intel_adjust_timer(unsigned long interval); +bool mce_intel_cmci_poll(void); +void mce_intel_hcpu_update(unsigned long cpu); +void cmci_disable_bank(int bank); +#else +# define cmci_intel_adjust_timer mce_adjust_timer_default +static inline bool mce_intel_cmci_poll(void) { return false; } +static inline void mce_intel_hcpu_update(unsigned long cpu) { } +static inline void cmci_disable_bank(int bank) { } +#endif + +void mce_timer_kick(unsigned long interval); + +#ifdef CONFIG_ACPI_APEI +int apei_write_mce(struct mce *m); +ssize_t apei_read_mce(struct mce *m, u64 *record_id); +int apei_check_mce(void); +int apei_clear_mce(u64 record_id); +#else +static inline int apei_write_mce(struct mce *m) +{ + return -EINVAL; +} +static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id) +{ + return 0; +} +static inline int apei_check_mce(void) +{ + return 0; +} +static inline int apei_clear_mce(u64 record_id) +{ + return -EINVAL; +} +#endif + +void mce_inject_log(struct mce *m); + +/* + * We consider records to be equivalent if bank+status+addr+misc all match. + * This is only used when the system is going down because of a fatal error + * to avoid cluttering the console log with essentially repeated information. + * In normal processing all errors seen are logged. + */ +static inline bool mce_cmp(struct mce *m1, struct mce *m2) +{ + return m1->bank != m2->bank || + m1->status != m2->status || + m1->addr != m2->addr || + m1->misc != m2->misc; +} + +extern struct device_attribute dev_attr_trigger; + +#ifdef CONFIG_X86_MCELOG_LEGACY +void mce_work_trigger(void); +void mce_register_injector_chain(struct notifier_block *nb); +void mce_unregister_injector_chain(struct notifier_block *nb); +#else +static inline void mce_work_trigger(void) { } +static inline void mce_register_injector_chain(struct notifier_block *nb) { } +static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } +#endif + +struct mca_config { + bool dont_log_ce; + bool cmci_disabled; + bool ignore_ce; + + __u64 lmce_disabled : 1, + disabled : 1, + ser : 1, + recovery : 1, + bios_cmci_threshold : 1, + __reserved : 59; + + u8 banks; + s8 bootlog; + int tolerant; + int monarch_timeout; + int panic_timeout; + u32 rip_msr; +}; + +extern struct mca_config mca_cfg; + +struct mce_vendor_flags { + /* + * Indicates that overflow conditions are not fatal, when set. + */ + __u64 overflow_recov : 1, + + /* + * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and + * Recovery. It indicates support for data poisoning in HW and deferred + * error interrupts. + */ + succor : 1, + + /* + * (AMD) SMCA: This bit indicates support for Scalable MCA which expands + * the register space for each MCA bank and also increases number of + * banks. Also, to accommodate the new banks and registers, the MCA + * register space is moved to a new MSR range. + */ + smca : 1, + + __reserved_0 : 61; +}; + +extern struct mce_vendor_flags mce_flags; + +struct mca_msr_regs { + u32 (*ctl) (int bank); + u32 (*status) (int bank); + u32 (*addr) (int bank); + u32 (*misc) (int bank); +}; + +extern struct mca_msr_regs msr_ops; + +#endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * P5 specific Machine Check Exception Reporting + * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> + */ +#include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/smp.h> + +#include <asm/processor.h> +#include <asm/traps.h> +#include <asm/tlbflush.h> +#include <asm/mce.h> +#include <asm/msr.h> + +#include "internal.h" + +/* By default disabled */ +int mce_p5_enabled __read_mostly; + +/* Machine check handler for Pentium class Intel CPUs: */ +static void pentium_machine_check(struct pt_regs *regs, long error_code) +{ + u32 loaddr, hi, lotype; + + ist_enter(regs); + + rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); + rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); + + pr_emerg("CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", + smp_processor_id(), loaddr, lotype); + + if (lotype & (1<<5)) { + pr_emerg("CPU#%d: Possible thermal failure (CPU on fire ?).\n", + smp_processor_id()); + } + + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + + ist_exit(regs); +} + +/* Set up machine check reporting for processors with Intel style MCE: */ +void intel_p5_mcheck_init(struct cpuinfo_x86 *c) +{ + u32 l, h; + + /* Default P5 to off as its often misconnected: */ + if (!mce_p5_enabled) + return; + + /* Check for MCE support: */ + if (!cpu_has(c, X86_FEATURE_MCE)) + return; + + machine_check_vector = pentium_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ + wmb(); + + /* Read registers before enabling: */ + rdmsr(MSR_IA32_P5_MC_ADDR, l, h); + rdmsr(MSR_IA32_P5_MC_TYPE, l, h); + pr_info("Intel old style machine check architecture supported.\n"); + + /* Enable MCE: */ + cr4_set_bits(X86_CR4_MCE); + pr_info("Intel old style machine check reporting enabled on CPU#%d.\n", + smp_processor_id()); +} diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c @@ -0,0 +1,419 @@ +/* + * MCE grading rules. + * Copyright 2008, 2009 Intel Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Author: Andi Kleen + */ +#include <linux/kernel.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <linux/debugfs.h> +#include <asm/mce.h> +#include <linux/uaccess.h> + +#include "internal.h" + +/* + * Grade an mce by severity. In general the most severe ones are processed + * first. Since there are quite a lot of combinations test the bits in a + * table-driven way. The rules are simply processed in order, first + * match wins. + * + * Note this is only used for machine check exceptions, the corrected + * errors use much simpler rules. The exceptions still check for the corrected + * errors, but only to leave them alone for the CMCI handler (except for + * panic situations) + */ + +enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 }; +enum ser { SER_REQUIRED = 1, NO_SER = 2 }; +enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; + +static struct severity { + u64 mask; + u64 result; + unsigned char sev; + unsigned char mcgmask; + unsigned char mcgres; + unsigned char ser; + unsigned char context; + unsigned char excp; + unsigned char covered; + char *msg; +} severities[] = { +#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } +#define KERNEL .context = IN_KERNEL +#define USER .context = IN_USER +#define KERNEL_RECOV .context = IN_KERNEL_RECOV +#define SER .ser = SER_REQUIRED +#define NOSER .ser = NO_SER +#define EXCP .excp = EXCP_CONTEXT +#define NOEXCP .excp = NO_EXCP +#define BITCLR(x) .mask = x, .result = 0 +#define BITSET(x) .mask = x, .result = x +#define MCGMASK(x, y) .mcgmask = x, .mcgres = y +#define MASK(x, y) .mask = x, .result = y +#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) +#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR) +#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) +#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) + + MCESEV( + NO, "Invalid", + BITCLR(MCI_STATUS_VAL) + ), + MCESEV( + NO, "Not enabled", + EXCP, BITCLR(MCI_STATUS_EN) + ), + MCESEV( + PANIC, "Processor context corrupt", + BITSET(MCI_STATUS_PCC) + ), + /* When MCIP is not set something is very confused */ + MCESEV( + PANIC, "MCIP not set in MCA handler", + EXCP, MCGMASK(MCG_STATUS_MCIP, 0) + ), + /* Neither return not error IP -- no chance to recover -> PANIC */ + MCESEV( + PANIC, "Neither restart nor error IP", + EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) + ), + MCESEV( + PANIC, "In kernel and no restart IP", + EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) + ), + MCESEV( + PANIC, "In kernel and no restart IP", + EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0) + ), + MCESEV( + DEFERRED, "Deferred error", + NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) + ), + MCESEV( + KEEP, "Corrected error", + NOSER, BITCLR(MCI_STATUS_UC) + ), + + /* + * known AO MCACODs reported via MCE or CMC: + * + * SRAO could be signaled either via a machine check exception or + * CMCI with the corresponding bit S 1 or 0. So we don't need to + * check bit S for SRAO. + */ + MCESEV( + AO, "Action optional: memory scrubbing error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) + ), + MCESEV( + AO, "Action optional: last level cache writeback error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) + ), + + /* ignore OVER for UCNA */ + MCESEV( + UCNA, "Uncorrected no action required", + SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) + ), + MCESEV( + PANIC, "Illegal combination (UCNA with AR=1)", + SER, + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR) + ), + MCESEV( + KEEP, "Non signalled machine check", + SER, BITCLR(MCI_STATUS_S) + ), + + MCESEV( + PANIC, "Action required with lost events", + SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR) + ), + + /* known AR MCACODs: */ +#ifdef CONFIG_MEMORY_FAILURE + MCESEV( + KEEP, "Action required but unaffected thread is continuable", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR), + MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV) + ), + MCESEV( + AR, "Action required: data load in error recoverable area of kernel", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + KERNEL_RECOV + ), + MCESEV( + AR, "Action required: data load error in a user process", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + USER + ), + MCESEV( + AR, "Action required: instruction fetch error in a user process", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + USER + ), + MCESEV( + PANIC, "Data load in unrecoverable area of kernel", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + KERNEL + ), +#endif + MCESEV( + PANIC, "Action required: unknown MCACOD", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) + ), + + MCESEV( + SOME, "Action optional: unknown MCACOD", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S) + ), + MCESEV( + SOME, "Action optional with lost events", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S) + ), + + MCESEV( + PANIC, "Overflowed uncorrected", + BITSET(MCI_STATUS_OVER|MCI_STATUS_UC) + ), + MCESEV( + UC, "Uncorrected", + BITSET(MCI_STATUS_UC) + ), + MCESEV( + SOME, "No match", + BITSET(0) + ) /* always matches. keep at end */ +}; + +#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \ + (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) + +/* + * If mcgstatus indicated that ip/cs on the stack were + * no good, then "m->cs" will be zero and we will have + * to assume the worst case (IN_KERNEL) as we actually + * have no idea what we were executing when the machine + * check hit. + * If we do have a good "m->cs" (or a faked one in the + * case we were executing in VM86 mode) we can use it to + * distinguish an exception taken in user from from one + * taken in the kernel. + */ +static int error_context(struct mce *m) +{ + if ((m->cs & 3) == 3) + return IN_USER; + if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) + return IN_KERNEL_RECOV; + return IN_KERNEL; +} + +static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) +{ + u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); + u32 low, high; + + /* + * We need to look at the following bits: + * - "succor" bit (data poisoning support), and + * - TCC bit (Task Context Corrupt) + * in MCi_STATUS to determine error severity. + */ + if (!mce_flags.succor) + return MCE_PANIC_SEVERITY; + + if (rdmsr_safe(addr, &low, &high)) + return MCE_PANIC_SEVERITY; + + /* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */ + if ((low & MCI_CONFIG_MCAX) && + (m->status & MCI_STATUS_TCC) && + (err_ctx == IN_KERNEL)) + return MCE_PANIC_SEVERITY; + + /* ...otherwise invoke hwpoison handler. */ + return MCE_AR_SEVERITY; +} + +/* + * See AMD Error Scope Hierarchy table in a newer BKDG. For example + * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" + */ +static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp) +{ + enum context ctx = error_context(m); + + /* Processor Context Corrupt, no need to fumble too much, die! */ + if (m->status & MCI_STATUS_PCC) + return MCE_PANIC_SEVERITY; + + if (m->status & MCI_STATUS_UC) { + + if (ctx == IN_KERNEL) + return MCE_PANIC_SEVERITY; + + /* + * On older systems where overflow_recov flag is not present, we + * should simply panic if an error overflow occurs. If + * overflow_recov flag is present and set, then software can try + * to at least kill process to prolong system operation. + */ + if (mce_flags.overflow_recov) { + if (mce_flags.smca) + return mce_severity_amd_smca(m, ctx); + + /* kill current process */ + return MCE_AR_SEVERITY; + } else { + /* at least one error was not logged */ + if (m->status & MCI_STATUS_OVER) + return MCE_PANIC_SEVERITY; + } + + /* + * For any other case, return MCE_UC_SEVERITY so that we log the + * error and exit #MC handler. + */ + return MCE_UC_SEVERITY; + } + + /* + * deferred error: poll handler catches these and adds to mce_ring so + * memory-failure can take recovery actions. + */ + if (m->status & MCI_STATUS_DEFERRED) + return MCE_DEFERRED_SEVERITY; + + /* + * corrected error: poll handler catches these and passes responsibility + * of decoding the error to EDAC + */ + return MCE_KEEP_SEVERITY; +} + +static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp) +{ + enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); + enum context ctx = error_context(m); + struct severity *s; + + for (s = severities;; s++) { + if ((m->status & s->mask) != s->result) + continue; + if ((m->mcgstatus & s->mcgmask) != s->mcgres) + continue; + if (s->ser == SER_REQUIRED && !mca_cfg.ser) + continue; + if (s->ser == NO_SER && mca_cfg.ser) + continue; + if (s->context && ctx != s->context) + continue; + if (s->excp && excp != s->excp) + continue; + if (msg) + *msg = s->msg; + s->covered = 1; + if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { + if (tolerant < 1) + return MCE_PANIC_SEVERITY; + } + return s->sev; + } +} + +/* Default to mce_severity_intel */ +int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) = + mce_severity_intel; + +void __init mcheck_vendor_init_severity(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + mce_severity = mce_severity_amd; +} + +#ifdef CONFIG_DEBUG_FS +static void *s_start(struct seq_file *f, loff_t *pos) +{ + if (*pos >= ARRAY_SIZE(severities)) + return NULL; + return &severities[*pos]; +} + +static void *s_next(struct seq_file *f, void *data, loff_t *pos) +{ + if (++(*pos) >= ARRAY_SIZE(severities)) + return NULL; + return &severities[*pos]; +} + +static void s_stop(struct seq_file *f, void *data) +{ +} + +static int s_show(struct seq_file *f, void *data) +{ + struct severity *ser = data; + seq_printf(f, "%d\t%s\n", ser->covered, ser->msg); + return 0; +} + +static const struct seq_operations severities_seq_ops = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static int severities_coverage_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &severities_seq_ops); +} + +static ssize_t severities_coverage_write(struct file *file, + const char __user *ubuf, + size_t count, loff_t *ppos) +{ + int i; + for (i = 0; i < ARRAY_SIZE(severities); i++) + severities[i].covered = 0; + return count; +} + +static const struct file_operations severities_coverage_fops = { + .open = severities_coverage_open, + .release = seq_release, + .read = seq_read, + .write = severities_coverage_write, + .llseek = seq_lseek, +}; + +static int __init severities_debugfs_init(void) +{ + struct dentry *dmce, *fsev; + + dmce = mce_get_debugfs_dir(); + if (!dmce) + goto err_out; + + fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL, + &severities_coverage_fops); + if (!fsev) + goto err_out; + + return 0; + +err_out: + return -ENOMEM; +} +late_initcall(severities_debugfs_init); +#endif /* CONFIG_DEBUG_FS */ diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c @@ -0,0 +1,522 @@ +/* + * Thermal throttle event support code (such as syslog messaging and rate + * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). + * + * This allows consistent reporting of CPU thermal throttle events. + * + * Maintains a counter in /sys that keeps track of the number of thermal + * events, such that the user knows how bad the thermal problem might be + * (since the logging to syslog is rate limited). + * + * Author: Dmitriy Zavin (dmitriyz@google.com) + * + * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. + * Inspired by Ross Biro's and Al Borchers' counter code. + */ +#include <linux/interrupt.h> +#include <linux/notifier.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <linux/export.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/cpu.h> + +#include <asm/processor.h> +#include <asm/apic.h> +#include <asm/mce.h> +#include <asm/msr.h> +#include <asm/trace/irq_vectors.h> + +#include "internal.h" + +/* How long to wait between reporting thermal events */ +#define CHECK_INTERVAL (300 * HZ) + +#define THERMAL_THROTTLING_EVENT 0 +#define POWER_LIMIT_EVENT 1 + +/* + * Current thermal event state: + */ +struct _thermal_state { + bool new_event; + int event; + u64 next_check; + unsigned long count; + unsigned long last_count; +}; + +struct thermal_state { + struct _thermal_state core_throttle; + struct _thermal_state core_power_limit; + struct _thermal_state package_throttle; + struct _thermal_state package_power_limit; + struct _thermal_state core_thresh0; + struct _thermal_state core_thresh1; + struct _thermal_state pkg_thresh0; + struct _thermal_state pkg_thresh1; +}; + +/* Callback to handle core threshold interrupts */ +int (*platform_thermal_notify)(__u64 msr_val); +EXPORT_SYMBOL(platform_thermal_notify); + +/* Callback to handle core package threshold_interrupts */ +int (*platform_thermal_package_notify)(__u64 msr_val); +EXPORT_SYMBOL_GPL(platform_thermal_package_notify); + +/* Callback support of rate control, return true, if + * callback has rate control */ +bool (*platform_thermal_package_rate_control)(void); +EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control); + + +static DEFINE_PER_CPU(struct thermal_state, thermal_state); + +static atomic_t therm_throt_en = ATOMIC_INIT(0); + +static u32 lvtthmr_init __read_mostly; + +#ifdef CONFIG_SYSFS +#define define_therm_throt_device_one_ro(_name) \ + static DEVICE_ATTR(_name, 0444, \ + therm_throt_device_show_##_name, \ + NULL) \ + +#define define_therm_throt_device_show_func(event, name) \ + \ +static ssize_t therm_throt_device_show_##event##_##name( \ + struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + unsigned int cpu = dev->id; \ + ssize_t ret; \ + \ + preempt_disable(); /* CPU hotplug */ \ + if (cpu_online(cpu)) { \ + ret = sprintf(buf, "%lu\n", \ + per_cpu(thermal_state, cpu).event.name); \ + } else \ + ret = 0; \ + preempt_enable(); \ + \ + return ret; \ +} + +define_therm_throt_device_show_func(core_throttle, count); +define_therm_throt_device_one_ro(core_throttle_count); + +define_therm_throt_device_show_func(core_power_limit, count); +define_therm_throt_device_one_ro(core_power_limit_count); + +define_therm_throt_device_show_func(package_throttle, count); +define_therm_throt_device_one_ro(package_throttle_count); + +define_therm_throt_device_show_func(package_power_limit, count); +define_therm_throt_device_one_ro(package_power_limit_count); + +static struct attribute *thermal_throttle_attrs[] = { + &dev_attr_core_throttle_count.attr, + NULL +}; + +static const struct attribute_group thermal_attr_group = { + .attrs = thermal_throttle_attrs, + .name = "thermal_throttle" +}; +#endif /* CONFIG_SYSFS */ + +#define CORE_LEVEL 0 +#define PACKAGE_LEVEL 1 + +/*** + * therm_throt_process - Process thermal throttling event from interrupt + * @curr: Whether the condition is current or not (boolean), since the + * thermal interrupt normally gets called both when the thermal + * event begins and once the event has ended. + * + * This function is called by the thermal interrupt after the + * IRQ has been acknowledged. + * + * It will take care of rate limiting and printing messages to the syslog. + */ +static void therm_throt_process(bool new_event, int event, int level) +{ + struct _thermal_state *state; + unsigned int this_cpu = smp_processor_id(); + bool old_event; + u64 now; + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); + + now = get_jiffies_64(); + if (level == CORE_LEVEL) { + if (event == THERMAL_THROTTLING_EVENT) + state = &pstate->core_throttle; + else if (event == POWER_LIMIT_EVENT) + state = &pstate->core_power_limit; + else + return; + } else if (level == PACKAGE_LEVEL) { + if (event == THERMAL_THROTTLING_EVENT) + state = &pstate->package_throttle; + else if (event == POWER_LIMIT_EVENT) + state = &pstate->package_power_limit; + else + return; + } else + return; + + old_event = state->new_event; + state->new_event = new_event; + + if (new_event) + state->count++; + + if (time_before64(now, state->next_check) && + state->count != state->last_count) + return; + + state->next_check = now + CHECK_INTERVAL; + state->last_count = state->count; + + /* if we just entered the thermal event */ + if (new_event) { + if (event == THERMAL_THROTTLING_EVENT) + pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package", + state->count); + return; + } + if (old_event) { + if (event == THERMAL_THROTTLING_EVENT) + pr_info("CPU%d: %s temperature/speed normal\n", this_cpu, + level == CORE_LEVEL ? "Core" : "Package"); + return; + } +} + +static int thresh_event_valid(int level, int event) +{ + struct _thermal_state *state; + unsigned int this_cpu = smp_processor_id(); + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); + u64 now = get_jiffies_64(); + + if (level == PACKAGE_LEVEL) + state = (event == 0) ? &pstate->pkg_thresh0 : + &pstate->pkg_thresh1; + else + state = (event == 0) ? &pstate->core_thresh0 : + &pstate->core_thresh1; + + if (time_before64(now, state->next_check)) + return 0; + + state->next_check = now + CHECK_INTERVAL; + + return 1; +} + +static bool int_pln_enable; +static int __init int_pln_enable_setup(char *s) +{ + int_pln_enable = true; + + return 1; +} +__setup("int_pln_enable", int_pln_enable_setup); + +#ifdef CONFIG_SYSFS +/* Add/Remove thermal_throttle interface for CPU device: */ +static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) +{ + int err; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + err = sysfs_create_group(&dev->kobj, &thermal_attr_group); + if (err) + return err; + + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_core_power_limit_count.attr, + thermal_attr_group.name); + if (cpu_has(c, X86_FEATURE_PTS)) { + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_throttle_count.attr, + thermal_attr_group.name); + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_power_limit_count.attr, + thermal_attr_group.name); + } + + return err; +} + +static void thermal_throttle_remove_dev(struct device *dev) +{ + sysfs_remove_group(&dev->kobj, &thermal_attr_group); +} + +/* Get notified when a cpu comes on/off. Be hotplug friendly. */ +static int thermal_throttle_online(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + + return thermal_throttle_add_dev(dev, cpu); +} + +static int thermal_throttle_offline(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + + thermal_throttle_remove_dev(dev); + return 0; +} + +static __init int thermal_throttle_init_device(void) +{ + int ret; + + if (!atomic_read(&therm_throt_en)) + return 0; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online", + thermal_throttle_online, + thermal_throttle_offline); + return ret < 0 ? ret : 0; +} +device_initcall(thermal_throttle_init_device); + +#endif /* CONFIG_SYSFS */ + +static void notify_package_thresholds(__u64 msr_val) +{ + bool notify_thres_0 = false; + bool notify_thres_1 = false; + + if (!platform_thermal_package_notify) + return; + + /* lower threshold check */ + if (msr_val & THERM_LOG_THRESHOLD0) + notify_thres_0 = true; + /* higher threshold check */ + if (msr_val & THERM_LOG_THRESHOLD1) + notify_thres_1 = true; + + if (!notify_thres_0 && !notify_thres_1) + return; + + if (platform_thermal_package_rate_control && + platform_thermal_package_rate_control()) { + /* Rate control is implemented in callback */ + platform_thermal_package_notify(msr_val); + return; + } + + /* lower threshold reached */ + if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0)) + platform_thermal_package_notify(msr_val); + /* higher threshold reached */ + if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1)) + platform_thermal_package_notify(msr_val); +} + +static void notify_thresholds(__u64 msr_val) +{ + /* check whether the interrupt handler is defined; + * otherwise simply return + */ + if (!platform_thermal_notify) + return; + + /* lower threshold reached */ + if ((msr_val & THERM_LOG_THRESHOLD0) && + thresh_event_valid(CORE_LEVEL, 0)) + platform_thermal_notify(msr_val); + /* higher threshold reached */ + if ((msr_val & THERM_LOG_THRESHOLD1) && + thresh_event_valid(CORE_LEVEL, 1)) + platform_thermal_notify(msr_val); +} + +/* Thermal transition interrupt handler */ +static void intel_thermal_interrupt(void) +{ + __u64 msr_val; + + if (static_cpu_has(X86_FEATURE_HWP)) + wrmsrl_safe(MSR_HWP_STATUS, 0); + + rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + + /* Check for violation of core thermal thresholds*/ + notify_thresholds(msr_val); + + therm_throt_process(msr_val & THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, + CORE_LEVEL); + + if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) + therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, + POWER_LIMIT_EVENT, + CORE_LEVEL); + + if (this_cpu_has(X86_FEATURE_PTS)) { + rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); + /* check violations of package thermal thresholds */ + notify_package_thresholds(msr_val); + therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, + PACKAGE_LEVEL); + if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) + therm_throt_process(msr_val & + PACKAGE_THERM_STATUS_POWER_LIMIT, + POWER_LIMIT_EVENT, + PACKAGE_LEVEL); + } +} + +static void unexpected_thermal_interrupt(void) +{ + pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", + smp_processor_id()); +} + +static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; + +asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *r) +{ + entering_irq(); + trace_thermal_apic_entry(THERMAL_APIC_VECTOR); + inc_irq_stat(irq_thermal_count); + smp_thermal_vector(); + trace_thermal_apic_exit(THERMAL_APIC_VECTOR); + exiting_ack_irq(); +} + +/* Thermal monitoring depends on APIC, ACPI and clock modulation */ +static int intel_thermal_supported(struct cpuinfo_x86 *c) +{ + if (!boot_cpu_has(X86_FEATURE_APIC)) + return 0; + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + return 0; + return 1; +} + +void __init mcheck_intel_therm_init(void) +{ + /* + * This function is only called on boot CPU. Save the init thermal + * LVT value on BSP and use that value to restore APs' thermal LVT + * entry BIOS programmed later + */ + if (intel_thermal_supported(&boot_cpu_data)) + lvtthmr_init = apic_read(APIC_LVTTHMR); +} + +void intel_init_thermal(struct cpuinfo_x86 *c) +{ + unsigned int cpu = smp_processor_id(); + int tm2 = 0; + u32 l, h; + + if (!intel_thermal_supported(c)) + return; + + /* + * First check if its enabled already, in which case there might + * be some SMM goo which handles it, so we can't even put a handler + * since it might be delivered via SMI already: + */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + + h = lvtthmr_init; + /* + * The initial value of thermal LVT entries on all APs always reads + * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI + * sequence to them and LVT registers are reset to 0s except for + * the mask bits which are set to 1s when APs receive INIT IPI. + * If BIOS takes over the thermal interrupt and sets its interrupt + * delivery mode to SMI (not fixed), it restores the value that the + * BIOS has programmed on AP based on BSP's info we saved since BIOS + * is always setting the same value for all threads/cores. + */ + if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED) + apic_write(APIC_LVTTHMR, lvtthmr_init); + + + if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { + if (system_state == SYSTEM_BOOTING) + pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu); + return; + } + + /* early Pentium M models use different method for enabling TM2 */ + if (cpu_has(c, X86_FEATURE_TM2)) { + if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { + rdmsr(MSR_THERM2_CTL, l, h); + if (l & MSR_THERM2_CTL_TM_SELECT) + tm2 = 1; + } else if (l & MSR_IA32_MISC_ENABLE_TM2) + tm2 = 1; + } + + /* We'll mask the thermal vector in the lapic till we're ready: */ + h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; + apic_write(APIC_LVTTHMR, h); + + rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); + if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) + wrmsr(MSR_IA32_THERM_INTERRUPT, + (l | (THERM_INT_LOW_ENABLE + | THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h); + else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE + | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); + else + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + + if (cpu_has(c, X86_FEATURE_PTS)) { + rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); + if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + (l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE)) + & ~PACKAGE_THERM_INT_PLN_ENABLE, h); + else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE + | PACKAGE_THERM_INT_PLN_ENABLE), h); + else + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE), h); + } + + smp_thermal_vector = intel_thermal_interrupt; + + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); + + /* Unmask the thermal vector: */ + l = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + + pr_info_once("CPU0: Thermal monitoring enabled (%s)\n", + tm2 ? "TM2" : "TM1"); + + /* enable thermal throttle processing */ + atomic_set(&therm_throt_en, 1); +} diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common corrected MCE threshold handler code: + */ +#include <linux/interrupt.h> +#include <linux/kernel.h> + +#include <asm/irq_vectors.h> +#include <asm/apic.h> +#include <asm/mce.h> +#include <asm/trace/irq_vectors.h> + +#include "internal.h" + +static void default_threshold_interrupt(void) +{ + pr_err("Unexpected threshold interrupt at vector %x\n", + THRESHOLD_APIC_VECTOR); +} + +void (*mce_threshold_vector)(void) = default_threshold_interrupt; + +asmlinkage __visible void __irq_entry smp_threshold_interrupt(void) +{ + entering_irq(); + trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); + inc_irq_stat(irq_threshold_count); + mce_threshold_vector(); + trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); + exiting_ack_irq(); +} diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * IDT Winchip specific Machine Check Exception Reporting + * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> + */ +#include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/types.h> + +#include <asm/processor.h> +#include <asm/traps.h> +#include <asm/tlbflush.h> +#include <asm/mce.h> +#include <asm/msr.h> + +#include "internal.h" + +/* Machine check handler for WinChip C6: */ +static void winchip_machine_check(struct pt_regs *regs, long error_code) +{ + ist_enter(regs); + + pr_emerg("CPU0: Machine Check Exception.\n"); + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + + ist_exit(regs); +} + +/* Set up machine check reporting on the Winchip C6 series */ +void winchip_mcheck_init(struct cpuinfo_x86 *c) +{ + u32 lo, hi; + + machine_check_vector = winchip_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ + wmb(); + + rdmsr(MSR_IDT_FCR1, lo, hi); + lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ + lo &= ~(1<<4); /* Enable MCE */ + wrmsr(MSR_IDT_FCR1, lo, hi); + + cr4_set_bits(X86_CR4_MCE); + + pr_info("Winchip machine check reporting enabled on CPU#0.\n"); +} diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,14 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -obj-y = mce.o mce-severity.o mce-genpool.o - -obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o -obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o -obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o -obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o -obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o - -obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o - -obj-$(CONFIG_ACPI_APEI) += mce-apei.o - -obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c @@ -1,360 +0,0 @@ -/* - * /dev/mcelog driver - * - * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. - * Rest from unknown author(s). - * 2004 Andi Kleen. Rewrote most of it. - * Copyright 2008 Intel Corporation - * Author: Andi Kleen - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/miscdevice.h> -#include <linux/slab.h> -#include <linux/kmod.h> -#include <linux/poll.h> - -#include "mce-internal.h" - -static BLOCKING_NOTIFIER_HEAD(mce_injector_chain); - -static DEFINE_MUTEX(mce_chrdev_read_mutex); - -static char mce_helper[128]; -static char *mce_helper_argv[2] = { mce_helper, NULL }; - -/* - * Lockless MCE logging infrastructure. - * This avoids deadlocks on printk locks without having to break locks. Also - * separate MCEs from kernel messages to avoid bogus bug reports. - */ - -static struct mce_log_buffer mcelog = { - .signature = MCE_LOG_SIGNATURE, - .len = MCE_LOG_LEN, - .recordlen = sizeof(struct mce), -}; - -static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); - -static int dev_mce_log(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct mce *mce = (struct mce *)data; - unsigned int entry; - - mutex_lock(&mce_chrdev_read_mutex); - - entry = mcelog.next; - - /* - * When the buffer fills up discard new entries. Assume that the - * earlier errors are the more interesting ones: - */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); - goto unlock; - } - - mcelog.next = entry + 1; - - memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); - mcelog.entry[entry].finished = 1; - - /* wake processes polling /dev/mcelog */ - wake_up_interruptible(&mce_chrdev_wait); - -unlock: - mutex_unlock(&mce_chrdev_read_mutex); - - return NOTIFY_OK; -} - -static struct notifier_block dev_mcelog_nb = { - .notifier_call = dev_mce_log, - .priority = MCE_PRIO_MCELOG, -}; - -static void mce_do_trigger(struct work_struct *work) -{ - call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); -} - -static DECLARE_WORK(mce_trigger_work, mce_do_trigger); - - -void mce_work_trigger(void) -{ - if (mce_helper[0]) - schedule_work(&mce_trigger_work); -} - -static ssize_t -show_trigger(struct device *s, struct device_attribute *attr, char *buf) -{ - strcpy(buf, mce_helper); - strcat(buf, "\n"); - return strlen(mce_helper) + 1; -} - -static ssize_t set_trigger(struct device *s, struct device_attribute *attr, - const char *buf, size_t siz) -{ - char *p; - - strncpy(mce_helper, buf, sizeof(mce_helper)); - mce_helper[sizeof(mce_helper)-1] = 0; - p = strchr(mce_helper, '\n'); - - if (p) - *p = 0; - - return strlen(mce_helper) + !!p; -} - -DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); - -/* - * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. - */ - -static DEFINE_SPINLOCK(mce_chrdev_state_lock); -static int mce_chrdev_open_count; /* #times opened */ -static int mce_chrdev_open_exclu; /* already open exclusive? */ - -static int mce_chrdev_open(struct inode *inode, struct file *file) -{ - spin_lock(&mce_chrdev_state_lock); - - if (mce_chrdev_open_exclu || - (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { - spin_unlock(&mce_chrdev_state_lock); - - return -EBUSY; - } - - if (file->f_flags & O_EXCL) - mce_chrdev_open_exclu = 1; - mce_chrdev_open_count++; - - spin_unlock(&mce_chrdev_state_lock); - - return nonseekable_open(inode, file); -} - -static int mce_chrdev_release(struct inode *inode, struct file *file) -{ - spin_lock(&mce_chrdev_state_lock); - - mce_chrdev_open_count--; - mce_chrdev_open_exclu = 0; - - spin_unlock(&mce_chrdev_state_lock); - - return 0; -} - -static int mce_apei_read_done; - -/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ -static int __mce_read_apei(char __user **ubuf, size_t usize) -{ - int rc; - u64 record_id; - struct mce m; - - if (usize < sizeof(struct mce)) - return -EINVAL; - - rc = apei_read_mce(&m, &record_id); - /* Error or no more MCE record */ - if (rc <= 0) { - mce_apei_read_done = 1; - /* - * When ERST is disabled, mce_chrdev_read() should return - * "no record" instead of "no device." - */ - if (rc == -ENODEV) - return 0; - return rc; - } - rc = -EFAULT; - if (copy_to_user(*ubuf, &m, sizeof(struct mce))) - return rc; - /* - * In fact, we should have cleared the record after that has - * been flushed to the disk or sent to network in - * /sbin/mcelog, but we have no interface to support that now, - * so just clear it to avoid duplication. - */ - rc = apei_clear_mce(record_id); - if (rc) { - mce_apei_read_done = 1; - return rc; - } - *ubuf += sizeof(struct mce); - - return 0; -} - -static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, - size_t usize, loff_t *off) -{ - char __user *buf = ubuf; - unsigned next; - int i, err; - - mutex_lock(&mce_chrdev_read_mutex); - - if (!mce_apei_read_done) { - err = __mce_read_apei(&buf, usize); - if (err || buf != ubuf) - goto out; - } - - /* Only supports full reads right now */ - err = -EINVAL; - if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) - goto out; - - next = mcelog.next; - err = 0; - - for (i = 0; i < next; i++) { - struct mce *m = &mcelog.entry[i]; - - err |= copy_to_user(buf, m, sizeof(*m)); - buf += sizeof(*m); - } - - memset(mcelog.entry, 0, next * sizeof(struct mce)); - mcelog.next = 0; - - if (err) - err = -EFAULT; - -out: - mutex_unlock(&mce_chrdev_read_mutex); - - return err ? err : buf - ubuf; -} - -static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait) -{ - poll_wait(file, &mce_chrdev_wait, wait); - if (READ_ONCE(mcelog.next)) - return EPOLLIN | EPOLLRDNORM; - if (!mce_apei_read_done && apei_check_mce()) - return EPOLLIN | EPOLLRDNORM; - return 0; -} - -static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, - unsigned long arg) -{ - int __user *p = (int __user *)arg; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (cmd) { - case MCE_GET_RECORD_LEN: - return put_user(sizeof(struct mce), p); - case MCE_GET_LOG_LEN: - return put_user(MCE_LOG_LEN, p); - case MCE_GETCLEAR_FLAGS: { - unsigned flags; - - do { - flags = mcelog.flags; - } while (cmpxchg(&mcelog.flags, flags, 0) != flags); - - return put_user(flags, p); - } - default: - return -ENOTTY; - } -} - -void mce_register_injector_chain(struct notifier_block *nb) -{ - blocking_notifier_chain_register(&mce_injector_chain, nb); -} -EXPORT_SYMBOL_GPL(mce_register_injector_chain); - -void mce_unregister_injector_chain(struct notifier_block *nb) -{ - blocking_notifier_chain_unregister(&mce_injector_chain, nb); -} -EXPORT_SYMBOL_GPL(mce_unregister_injector_chain); - -static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off) -{ - struct mce m; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - /* - * There are some cases where real MSR reads could slip - * through. - */ - if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA)) - return -EIO; - - if ((unsigned long)usize > sizeof(struct mce)) - usize = sizeof(struct mce); - if (copy_from_user(&m, ubuf, usize)) - return -EFAULT; - - if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) - return -EINVAL; - - /* - * Need to give user space some time to set everything up, - * so do it a jiffie or two later everywhere. - */ - schedule_timeout(2); - - blocking_notifier_call_chain(&mce_injector_chain, 0, &m); - - return usize; -} - -static const struct file_operations mce_chrdev_ops = { - .open = mce_chrdev_open, - .release = mce_chrdev_release, - .read = mce_chrdev_read, - .write = mce_chrdev_write, - .poll = mce_chrdev_poll, - .unlocked_ioctl = mce_chrdev_ioctl, - .llseek = no_llseek, -}; - -static struct miscdevice mce_chrdev_device = { - MISC_MCELOG_MINOR, - "mcelog", - &mce_chrdev_ops, -}; - -static __init int dev_mcelog_init_device(void) -{ - int err; - - /* register character device /dev/mcelog */ - err = misc_register(&mce_chrdev_device); - if (err) { - if (err == -EBUSY) - /* Xen dom0 might have registered the device already. */ - pr_info("Unable to init device /dev/mcelog, already registered"); - else - pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); - - return err; - } - - mce_register_decode_chain(&dev_mcelog_nb); - return 0; -} -device_initcall_sync(dev_mcelog_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -1,157 +0,0 @@ -/* - * Bridge between MCE and APEI - * - * On some machine, corrected memory errors are reported via APEI - * generic hardware error source (GHES) instead of corrected Machine - * Check. These corrected memory errors can be reported to user space - * through /dev/mcelog via faking a corrected Machine Check, so that - * the error memory page can be offlined by /sbin/mcelog if the error - * count for one page is beyond the threshold. - * - * For fatal MCE, save MCE record into persistent storage via ERST, so - * that the MCE record can be logged after reboot via ERST. - * - * Copyright 2010 Intel Corp. - * Author: Huang Ying <ying.huang@intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include <linux/export.h> -#include <linux/kernel.h> -#include <linux/acpi.h> -#include <linux/cper.h> -#include <acpi/apei.h> -#include <acpi/ghes.h> -#include <asm/mce.h> - -#include "mce-internal.h" - -void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) -{ - struct mce m; - - if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) - return; - - mce_setup(&m); - m.bank = -1; - /* Fake a memory read error with unknown channel */ - m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; - - if (severity >= GHES_SEV_RECOVERABLE) - m.status |= MCI_STATUS_UC; - - if (severity >= GHES_SEV_PANIC) { - m.status |= MCI_STATUS_PCC; - m.tsc = rdtsc(); - } - - m.addr = mem_err->physical_addr; - mce_log(&m); -} -EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); - -#define CPER_CREATOR_MCE \ - UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \ - 0x64, 0x90, 0xb8, 0x9d) -#define CPER_SECTION_TYPE_MCE \ - UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \ - 0x04, 0x4a, 0x38, 0xfc) - -/* - * CPER specification (in UEFI specification 2.3 appendix N) requires - * byte-packed. - */ -struct cper_mce_record { - struct cper_record_header hdr; - struct cper_section_descriptor sec_hdr; - struct mce mce; -} __packed; - -int apei_write_mce(struct mce *m) -{ - struct cper_mce_record rcd; - - memset(&rcd, 0, sizeof(rcd)); - memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE); - rcd.hdr.revision = CPER_RECORD_REV; - rcd.hdr.signature_end = CPER_SIG_END; - rcd.hdr.section_count = 1; - rcd.hdr.error_severity = CPER_SEV_FATAL; - /* timestamp, platform_id, partition_id are all invalid */ - rcd.hdr.validation_bits = 0; - rcd.hdr.record_length = sizeof(rcd); - rcd.hdr.creator_id = CPER_CREATOR_MCE; - rcd.hdr.notification_type = CPER_NOTIFY_MCE; - rcd.hdr.record_id = cper_next_record_id(); - rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR; - - rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd; - rcd.sec_hdr.section_length = sizeof(rcd.mce); - rcd.sec_hdr.revision = CPER_SEC_REV; - /* fru_id and fru_text is invalid */ - rcd.sec_hdr.validation_bits = 0; - rcd.sec_hdr.flags = CPER_SEC_PRIMARY; - rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE; - rcd.sec_hdr.section_severity = CPER_SEV_FATAL; - - memcpy(&rcd.mce, m, sizeof(*m)); - - return erst_write(&rcd.hdr); -} - -ssize_t apei_read_mce(struct mce *m, u64 *record_id) -{ - struct cper_mce_record rcd; - int rc, pos; - - rc = erst_get_record_id_begin(&pos); - if (rc) - return rc; -retry: - rc = erst_get_record_id_next(&pos, record_id); - if (rc) - goto out; - /* no more record */ - if (*record_id == APEI_ERST_INVALID_RECORD_ID) - goto out; - rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd)); - /* someone else has cleared the record, try next one */ - if (rc == -ENOENT) - goto retry; - else if (rc < 0) - goto out; - /* try to skip other type records in storage */ - else if (rc != sizeof(rcd) || - uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) - goto retry; - memcpy(m, &rcd.mce, sizeof(*m)); - rc = sizeof(*m); -out: - erst_get_record_id_end(); - - return rc; -} - -/* Check whether there is record in ERST */ -int apei_check_mce(void) -{ - return erst_get_record_count(); -} - -int apei_clear_mce(u64 record_id) -{ - return erst_clear(record_id); -} diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -1,145 +0,0 @@ -/* - * MCE event pool management in MCE context - * - * Copyright (C) 2015 Intel Corp. - * Author: Chen, Gong <gong.chen@linux.intel.com> - * - * This file is licensed under GPLv2. - */ -#include <linux/smp.h> -#include <linux/mm.h> -#include <linux/genalloc.h> -#include <linux/llist.h> -#include "mce-internal.h" - -/* - * printk() is not safe in MCE context. This is a lock-less memory allocator - * used to save error information organized in a lock-less list. - * - * This memory pool is only to be used to save MCE records in MCE context. - * MCE events are rare, so a fixed size memory pool should be enough. Use - * 2 pages to save MCE events for now (~80 MCE records at most). - */ -#define MCE_POOLSZ (2 * PAGE_SIZE) - -static struct gen_pool *mce_evt_pool; -static LLIST_HEAD(mce_event_llist); -static char gen_pool_buf[MCE_POOLSZ]; - -/* - * Compare the record "t" with each of the records on list "l" to see if - * an equivalent one is present in the list. - */ -static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l) -{ - struct mce_evt_llist *node; - struct mce *m1, *m2; - - m1 = &t->mce; - - llist_for_each_entry(node, &l->llnode, llnode) { - m2 = &node->mce; - - if (!mce_cmp(m1, m2)) - return true; - } - return false; -} - -/* - * The system has panicked - we'd like to peruse the list of MCE records - * that have been queued, but not seen by anyone yet. The list is in - * reverse time order, so we need to reverse it. While doing that we can - * also drop duplicate records (these were logged because some banks are - * shared between cores or by all threads on a socket). - */ -struct llist_node *mce_gen_pool_prepare_records(void) -{ - struct llist_node *head; - LLIST_HEAD(new_head); - struct mce_evt_llist *node, *t; - - head = llist_del_all(&mce_event_llist); - if (!head) - return NULL; - - /* squeeze out duplicates while reversing order */ - llist_for_each_entry_safe(node, t, head, llnode) { - if (!is_duplicate_mce_record(node, t)) - llist_add(&node->llnode, &new_head); - } - - return new_head.first; -} - -void mce_gen_pool_process(struct work_struct *__unused) -{ - struct llist_node *head; - struct mce_evt_llist *node, *tmp; - struct mce *mce; - - head = llist_del_all(&mce_event_llist); - if (!head) - return; - - head = llist_reverse_order(head); - llist_for_each_entry_safe(node, tmp, head, llnode) { - mce = &node->mce; - blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); - gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); - } -} - -bool mce_gen_pool_empty(void) -{ - return llist_empty(&mce_event_llist); -} - -int mce_gen_pool_add(struct mce *mce) -{ - struct mce_evt_llist *node; - - if (!mce_evt_pool) - return -EINVAL; - - node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node)); - if (!node) { - pr_warn_ratelimited("MCE records pool full!\n"); - return -ENOMEM; - } - - memcpy(&node->mce, mce, sizeof(*mce)); - llist_add(&node->llnode, &mce_event_llist); - - return 0; -} - -static int mce_gen_pool_create(void) -{ - struct gen_pool *tmpp; - int ret = -ENOMEM; - - tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1); - if (!tmpp) - goto out; - - ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1); - if (ret) { - gen_pool_destroy(tmpp); - goto out; - } - - mce_evt_pool = tmpp; - -out: - return ret; -} - -int mce_gen_pool_init(void) -{ - /* Just init mce_gen_pool once. */ - if (mce_evt_pool) - return 0; - - return mce_gen_pool_create(); -} diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -1,739 +0,0 @@ -/* - * Machine check injection support. - * Copyright 2008 Intel Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - * - * Authors: - * Andi Kleen - * Ying Huang - * - * The AMD part (from mce_amd_inj.c): a simple MCE injection facility - * for testing different aspects of the RAS code. This driver should be - * built as module so that it can be loaded on production kernels for - * testing purposes. - * - * This file may be distributed under the terms of the GNU General Public - * License version 2. - * - * Copyright (c) 2010-17: Borislav Petkov <bp@alien8.de> - * Advanced Micro Devices Inc. - */ - -#include <linux/cpu.h> -#include <linux/debugfs.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/notifier.h> -#include <linux/pci.h> -#include <linux/uaccess.h> - -#include <asm/amd_nb.h> -#include <asm/apic.h> -#include <asm/irq_vectors.h> -#include <asm/mce.h> -#include <asm/nmi.h> -#include <asm/smp.h> - -#include "mce-internal.h" - -/* - * Collect all the MCi_XXX settings - */ -static struct mce i_mce; -static struct dentry *dfs_inj; - -static u8 n_banks; - -#define MAX_FLAG_OPT_SIZE 4 -#define NBCFG 0x44 - -enum injection_type { - SW_INJ = 0, /* SW injection, simply decode the error */ - HW_INJ, /* Trigger a #MC */ - DFR_INT_INJ, /* Trigger Deferred error interrupt */ - THR_INT_INJ, /* Trigger threshold interrupt */ - N_INJ_TYPES, -}; - -static const char * const flags_options[] = { - [SW_INJ] = "sw", - [HW_INJ] = "hw", - [DFR_INT_INJ] = "df", - [THR_INT_INJ] = "th", - NULL -}; - -/* Set default injection to SW_INJ */ -static enum injection_type inj_type = SW_INJ; - -#define MCE_INJECT_SET(reg) \ -static int inj_##reg##_set(void *data, u64 val) \ -{ \ - struct mce *m = (struct mce *)data; \ - \ - m->reg = val; \ - return 0; \ -} - -MCE_INJECT_SET(status); -MCE_INJECT_SET(misc); -MCE_INJECT_SET(addr); -MCE_INJECT_SET(synd); - -#define MCE_INJECT_GET(reg) \ -static int inj_##reg##_get(void *data, u64 *val) \ -{ \ - struct mce *m = (struct mce *)data; \ - \ - *val = m->reg; \ - return 0; \ -} - -MCE_INJECT_GET(status); -MCE_INJECT_GET(misc); -MCE_INJECT_GET(addr); -MCE_INJECT_GET(synd); - -DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); - -static void setup_inj_struct(struct mce *m) -{ - memset(m, 0, sizeof(struct mce)); - - m->cpuvendor = boot_cpu_data.x86_vendor; - m->time = ktime_get_real_seconds(); - m->cpuid = cpuid_eax(1); - m->microcode = boot_cpu_data.microcode; -} - -/* Update fake mce registers on current CPU. */ -static void inject_mce(struct mce *m) -{ - struct mce *i = &per_cpu(injectm, m->extcpu); - - /* Make sure no one reads partially written injectm */ - i->finished = 0; - mb(); - m->finished = 0; - /* First set the fields after finished */ - i->extcpu = m->extcpu; - mb(); - /* Now write record in order, finished last (except above) */ - memcpy(i, m, sizeof(struct mce)); - /* Finally activate it */ - mb(); - i->finished = 1; -} - -static void raise_poll(struct mce *m) -{ - unsigned long flags; - mce_banks_t b; - - memset(&b, 0xff, sizeof(mce_banks_t)); - local_irq_save(flags); - machine_check_poll(0, &b); - local_irq_restore(flags); - m->finished = 0; -} - -static void raise_exception(struct mce *m, struct pt_regs *pregs) -{ - struct pt_regs regs; - unsigned long flags; - - if (!pregs) { - memset(&regs, 0, sizeof(struct pt_regs)); - regs.ip = m->ip; - regs.cs = m->cs; - pregs = &regs; - } - /* in mcheck exeception handler, irq will be disabled */ - local_irq_save(flags); - do_machine_check(pregs, 0); - local_irq_restore(flags); - m->finished = 0; -} - -static cpumask_var_t mce_inject_cpumask; -static DEFINE_MUTEX(mce_inject_mutex); - -static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) -{ - int cpu = smp_processor_id(); - struct mce *m = this_cpu_ptr(&injectm); - if (!cpumask_test_cpu(cpu, mce_inject_cpumask)) - return NMI_DONE; - cpumask_clear_cpu(cpu, mce_inject_cpumask); - if (m->inject_flags & MCJ_EXCEPTION) - raise_exception(m, regs); - else if (m->status) - raise_poll(m); - return NMI_HANDLED; -} - -static void mce_irq_ipi(void *info) -{ - int cpu = smp_processor_id(); - struct mce *m = this_cpu_ptr(&injectm); - - if (cpumask_test_cpu(cpu, mce_inject_cpumask) && - m->inject_flags & MCJ_EXCEPTION) { - cpumask_clear_cpu(cpu, mce_inject_cpumask); - raise_exception(m, NULL); - } -} - -/* Inject mce on current CPU */ -static int raise_local(void) -{ - struct mce *m = this_cpu_ptr(&injectm); - int context = MCJ_CTX(m->inject_flags); - int ret = 0; - int cpu = m->extcpu; - - if (m->inject_flags & MCJ_EXCEPTION) { - pr_info("Triggering MCE exception on CPU %d\n", cpu); - switch (context) { - case MCJ_CTX_IRQ: - /* - * Could do more to fake interrupts like - * calling irq_enter, but the necessary - * machinery isn't exported currently. - */ - /*FALL THROUGH*/ - case MCJ_CTX_PROCESS: - raise_exception(m, NULL); - break; - default: - pr_info("Invalid MCE context\n"); - ret = -EINVAL; - } - pr_info("MCE exception done on CPU %d\n", cpu); - } else if (m->status) { - pr_info("Starting machine check poll CPU %d\n", cpu); - raise_poll(m); - mce_notify_irq(); - pr_info("Machine check poll done on CPU %d\n", cpu); - } else - m->finished = 0; - - return ret; -} - -static void __maybe_unused raise_mce(struct mce *m) -{ - int context = MCJ_CTX(m->inject_flags); - - inject_mce(m); - - if (context == MCJ_CTX_RANDOM) - return; - - if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) { - unsigned long start; - int cpu; - - get_online_cpus(); - cpumask_copy(mce_inject_cpumask, cpu_online_mask); - cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); - for_each_online_cpu(cpu) { - struct mce *mcpu = &per_cpu(injectm, cpu); - if (!mcpu->finished || - MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) - cpumask_clear_cpu(cpu, mce_inject_cpumask); - } - if (!cpumask_empty(mce_inject_cpumask)) { - if (m->inject_flags & MCJ_IRQ_BROADCAST) { - /* - * don't wait because mce_irq_ipi is necessary - * to be sync with following raise_local - */ - preempt_disable(); - smp_call_function_many(mce_inject_cpumask, - mce_irq_ipi, NULL, 0); - preempt_enable(); - } else if (m->inject_flags & MCJ_NMI_BROADCAST) - apic->send_IPI_mask(mce_inject_cpumask, - NMI_VECTOR); - } - start = jiffies; - while (!cpumask_empty(mce_inject_cpumask)) { - if (!time_before(jiffies, start + 2*HZ)) { - pr_err("Timeout waiting for mce inject %lx\n", - *cpumask_bits(mce_inject_cpumask)); - break; - } - cpu_relax(); - } - raise_local(); - put_cpu(); - put_online_cpus(); - } else { - preempt_disable(); - raise_local(); - preempt_enable(); - } -} - -static int mce_inject_raise(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct mce *m = (struct mce *)data; - - if (!m) - return NOTIFY_DONE; - - mutex_lock(&mce_inject_mutex); - raise_mce(m); - mutex_unlock(&mce_inject_mutex); - - return NOTIFY_DONE; -} - -static struct notifier_block inject_nb = { - .notifier_call = mce_inject_raise, -}; - -/* - * Caller needs to be make sure this cpu doesn't disappear - * from under us, i.e.: get_cpu/put_cpu. - */ -static int toggle_hw_mce_inject(unsigned int cpu, bool enable) -{ - u32 l, h; - int err; - - err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h); - if (err) { - pr_err("%s: error reading HWCR\n", __func__); - return err; - } - - enable ? (l |= BIT(18)) : (l &= ~BIT(18)); - - err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h); - if (err) - pr_err("%s: error writing HWCR\n", __func__); - - return err; -} - -static int __set_inj(const char *buf) -{ - int i; - - for (i = 0; i < N_INJ_TYPES; i++) { - if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) { - inj_type = i; - return 0; - } - } - return -EINVAL; -} - -static ssize_t flags_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_FLAG_OPT_SIZE]; - int n; - - n = sprintf(buf, "%s\n", flags_options[inj_type]); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); -} - -static ssize_t flags_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_FLAG_OPT_SIZE], *__buf; - int err; - - if (cnt > MAX_FLAG_OPT_SIZE) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt - 1] = 0; - - /* strip whitespace */ - __buf = strstrip(buf); - - err = __set_inj(__buf); - if (err) { - pr_err("%s: Invalid flags value: %s\n", __func__, __buf); - return err; - } - - *ppos += cnt; - - return cnt; -} - -static const struct file_operations flags_fops = { - .read = flags_read, - .write = flags_write, - .llseek = generic_file_llseek, -}; - -/* - * On which CPU to inject? - */ -MCE_INJECT_GET(extcpu); - -static int inj_extcpu_set(void *data, u64 val) -{ - struct mce *m = (struct mce *)data; - - if (val >= nr_cpu_ids || !cpu_online(val)) { - pr_err("%s: Invalid CPU: %llu\n", __func__, val); - return -EINVAL; - } - m->extcpu = val; - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n"); - -static void trigger_mce(void *info) -{ - asm volatile("int $18"); -} - -static void trigger_dfr_int(void *info) -{ - asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR)); -} - -static void trigger_thr_int(void *info) -{ - asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR)); -} - -static u32 get_nbc_for_node(int node_id) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - u32 cores_per_node; - - cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket(); - - return cores_per_node * node_id; -} - -static void toggle_nb_mca_mst_cpu(u16 nid) -{ - struct amd_northbridge *nb; - struct pci_dev *F3; - u32 val; - int err; - - nb = node_to_amd_nb(nid); - if (!nb) - return; - - F3 = nb->misc; - if (!F3) - return; - - err = pci_read_config_dword(F3, NBCFG, &val); - if (err) { - pr_err("%s: Error reading F%dx%03x.\n", - __func__, PCI_FUNC(F3->devfn), NBCFG); - return; - } - - if (val & BIT(27)) - return; - - pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n", - __func__); - - val |= BIT(27); - err = pci_write_config_dword(F3, NBCFG, val); - if (err) - pr_err("%s: Error writing F%dx%03x.\n", - __func__, PCI_FUNC(F3->devfn), NBCFG); -} - -static void prepare_msrs(void *info) -{ - struct mce m = *(struct mce *)info; - u8 b = m.bank; - - wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); - - if (boot_cpu_has(X86_FEATURE_SMCA)) { - if (m.inject_flags == DFR_INT_INJ) { - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); - wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); - } else { - wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); - wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); - } - - wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); - wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); - } else { - wrmsrl(MSR_IA32_MCx_STATUS(b), m.status); - wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr); - wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); - } -} - -static void do_inject(void) -{ - u64 mcg_status = 0; - unsigned int cpu = i_mce.extcpu; - u8 b = i_mce.bank; - - i_mce.tsc = rdtsc_ordered(); - - if (i_mce.misc) - i_mce.status |= MCI_STATUS_MISCV; - - if (i_mce.synd) - i_mce.status |= MCI_STATUS_SYNDV; - - if (inj_type == SW_INJ) { - mce_inject_log(&i_mce); - return; - } - - /* prep MCE global settings for the injection */ - mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV; - - if (!(i_mce.status & MCI_STATUS_PCC)) - mcg_status |= MCG_STATUS_RIPV; - - /* - * Ensure necessary status bits for deferred errors: - * - MCx_STATUS[Deferred]: make sure it is a deferred error - * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC - */ - if (inj_type == DFR_INT_INJ) { - i_mce.status |= MCI_STATUS_DEFERRED; - i_mce.status |= (i_mce.status & ~MCI_STATUS_UC); - } - - /* - * For multi node CPUs, logging and reporting of bank 4 errors happens - * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for - * Fam10h and later BKDGs. - */ - if (static_cpu_has(X86_FEATURE_AMD_DCM) && - b == 4 && - boot_cpu_data.x86 < 0x17) { - toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); - cpu = get_nbc_for_node(amd_get_nb_id(cpu)); - } - - get_online_cpus(); - if (!cpu_online(cpu)) - goto err; - - toggle_hw_mce_inject(cpu, true); - - i_mce.mcgstatus = mcg_status; - i_mce.inject_flags = inj_type; - smp_call_function_single(cpu, prepare_msrs, &i_mce, 0); - - toggle_hw_mce_inject(cpu, false); - - switch (inj_type) { - case DFR_INT_INJ: - smp_call_function_single(cpu, trigger_dfr_int, NULL, 0); - break; - case THR_INT_INJ: - smp_call_function_single(cpu, trigger_thr_int, NULL, 0); - break; - default: - smp_call_function_single(cpu, trigger_mce, NULL, 0); - } - -err: - put_online_cpus(); - -} - -/* - * This denotes into which bank we're injecting and triggers - * the injection, at the same time. - */ -static int inj_bank_set(void *data, u64 val) -{ - struct mce *m = (struct mce *)data; - - if (val >= n_banks) { - pr_err("Non-existent MCE bank: %llu\n", val); - return -EINVAL; - } - - m->bank = val; - do_inject(); - - /* Reset injection struct */ - setup_inj_struct(&i_mce); - - return 0; -} - -MCE_INJECT_GET(bank); - -DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n"); - -static const char readme_msg[] = -"Description of the files and their usages:\n" -"\n" -"Note1: i refers to the bank number below.\n" -"Note2: See respective BKDGs for the exact bit definitions of the files below\n" -"as they mirror the hardware registers.\n" -"\n" -"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n" -"\t attributes of the error which caused the MCE.\n" -"\n" -"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n" -"\t used for error thresholding purposes and its validity is indicated by\n" -"\t MCi_STATUS[MiscV].\n" -"\n" -"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n" -"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n" -"\n" -"addr:\t Error address value to be written to MCi_ADDR. Log address information\n" -"\t associated with the error.\n" -"\n" -"cpu:\t The CPU to inject the error on.\n" -"\n" -"bank:\t Specify the bank you want to inject the error into: the number of\n" -"\t banks in a processor varies and is family/model-specific, therefore, the\n" -"\t supplied value is sanity-checked. Setting the bank value also triggers the\n" -"\t injection.\n" -"\n" -"flags:\t Injection type to be performed. Writing to this file will trigger a\n" -"\t real machine check, an APIC interrupt or invoke the error decoder routines\n" -"\t for AMD processors.\n" -"\n" -"\t Allowed error injection types:\n" -"\t - \"sw\": Software error injection. Decode error to a human-readable \n" -"\t format only. Safe to use.\n" -"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n" -"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n" -"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n" -"\t before injecting.\n" -"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n" -"\t error APIC interrupt handler to handle the error if the feature is \n" -"\t is present in hardware. \n" -"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n" -"\t APIC interrupt handler to handle the error. \n" -"\n"; - -static ssize_t -inj_readme_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - return simple_read_from_buffer(ubuf, cnt, ppos, - readme_msg, strlen(readme_msg)); -} - -static const struct file_operations readme_fops = { - .read = inj_readme_read, -}; - -static struct dfs_node { - char *name; - struct dentry *d; - const struct file_operations *fops; - umode_t perm; -} dfs_fls[] = { - { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH }, -}; - -static int __init debugfs_init(void) -{ - unsigned int i; - u64 cap; - - rdmsrl(MSR_IA32_MCG_CAP, cap); - n_banks = cap & MCG_BANKCNT_MASK; - - dfs_inj = debugfs_create_dir("mce-inject", NULL); - if (!dfs_inj) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) { - dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name, - dfs_fls[i].perm, - dfs_inj, - &i_mce, - dfs_fls[i].fops); - - if (!dfs_fls[i].d) - goto err_dfs_add; - } - - return 0; - -err_dfs_add: - while (i-- > 0) - debugfs_remove(dfs_fls[i].d); - - debugfs_remove(dfs_inj); - dfs_inj = NULL; - - return -ENODEV; -} - -static int __init inject_init(void) -{ - int err; - - if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) - return -ENOMEM; - - err = debugfs_init(); - if (err) { - free_cpumask_var(mce_inject_cpumask); - return err; - } - - register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); - mce_register_injector_chain(&inject_nb); - - setup_inj_struct(&i_mce); - - pr_info("Machine check injector initialized\n"); - - return 0; -} - -static void __exit inject_exit(void) -{ - - mce_unregister_injector_chain(&inject_nb); - unregister_nmi_handler(NMI_LOCAL, "mce_notify"); - - debugfs_remove_recursive(dfs_inj); - dfs_inj = NULL; - - memset(&dfs_fls, 0, sizeof(dfs_fls)); - - free_cpumask_var(mce_inject_cpumask); -} - -module_init(inject_init); -module_exit(inject_exit); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,173 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __X86_MCE_INTERNAL_H__ -#define __X86_MCE_INTERNAL_H__ - -#include <linux/device.h> -#include <asm/mce.h> - -enum severity_level { - MCE_NO_SEVERITY, - MCE_DEFERRED_SEVERITY, - MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY, - MCE_KEEP_SEVERITY, - MCE_SOME_SEVERITY, - MCE_AO_SEVERITY, - MCE_UC_SEVERITY, - MCE_AR_SEVERITY, - MCE_PANIC_SEVERITY, -}; - -extern struct blocking_notifier_head x86_mce_decoder_chain; - -#define ATTR_LEN 16 -#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ - -/* One object for each MCE bank, shared by all CPUs */ -struct mce_bank { - u64 ctl; /* subevents to enable */ - unsigned char init; /* initialise bank? */ - struct device_attribute attr; /* device attribute */ - char attrname[ATTR_LEN]; /* attribute name */ -}; - -struct mce_evt_llist { - struct llist_node llnode; - struct mce mce; -}; - -void mce_gen_pool_process(struct work_struct *__unused); -bool mce_gen_pool_empty(void); -int mce_gen_pool_add(struct mce *mce); -int mce_gen_pool_init(void); -struct llist_node *mce_gen_pool_prepare_records(void); - -extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); -struct dentry *mce_get_debugfs_dir(void); - -extern struct mce_bank *mce_banks; -extern mce_banks_t mce_banks_ce_disabled; - -#ifdef CONFIG_X86_MCE_INTEL -unsigned long cmci_intel_adjust_timer(unsigned long interval); -bool mce_intel_cmci_poll(void); -void mce_intel_hcpu_update(unsigned long cpu); -void cmci_disable_bank(int bank); -#else -# define cmci_intel_adjust_timer mce_adjust_timer_default -static inline bool mce_intel_cmci_poll(void) { return false; } -static inline void mce_intel_hcpu_update(unsigned long cpu) { } -static inline void cmci_disable_bank(int bank) { } -#endif - -void mce_timer_kick(unsigned long interval); - -#ifdef CONFIG_ACPI_APEI -int apei_write_mce(struct mce *m); -ssize_t apei_read_mce(struct mce *m, u64 *record_id); -int apei_check_mce(void); -int apei_clear_mce(u64 record_id); -#else -static inline int apei_write_mce(struct mce *m) -{ - return -EINVAL; -} -static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id) -{ - return 0; -} -static inline int apei_check_mce(void) -{ - return 0; -} -static inline int apei_clear_mce(u64 record_id) -{ - return -EINVAL; -} -#endif - -void mce_inject_log(struct mce *m); - -/* - * We consider records to be equivalent if bank+status+addr+misc all match. - * This is only used when the system is going down because of a fatal error - * to avoid cluttering the console log with essentially repeated information. - * In normal processing all errors seen are logged. - */ -static inline bool mce_cmp(struct mce *m1, struct mce *m2) -{ - return m1->bank != m2->bank || - m1->status != m2->status || - m1->addr != m2->addr || - m1->misc != m2->misc; -} - -extern struct device_attribute dev_attr_trigger; - -#ifdef CONFIG_X86_MCELOG_LEGACY -void mce_work_trigger(void); -void mce_register_injector_chain(struct notifier_block *nb); -void mce_unregister_injector_chain(struct notifier_block *nb); -#else -static inline void mce_work_trigger(void) { } -static inline void mce_register_injector_chain(struct notifier_block *nb) { } -static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } -#endif - -struct mca_config { - bool dont_log_ce; - bool cmci_disabled; - bool ignore_ce; - - __u64 lmce_disabled : 1, - disabled : 1, - ser : 1, - recovery : 1, - bios_cmci_threshold : 1, - __reserved : 59; - - u8 banks; - s8 bootlog; - int tolerant; - int monarch_timeout; - int panic_timeout; - u32 rip_msr; -}; - -extern struct mca_config mca_cfg; - -struct mce_vendor_flags { - /* - * Indicates that overflow conditions are not fatal, when set. - */ - __u64 overflow_recov : 1, - - /* - * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and - * Recovery. It indicates support for data poisoning in HW and deferred - * error interrupts. - */ - succor : 1, - - /* - * (AMD) SMCA: This bit indicates support for Scalable MCA which expands - * the register space for each MCA bank and also increases number of - * banks. Also, to accommodate the new banks and registers, the MCA - * register space is moved to a new MSR range. - */ - smca : 1, - - __reserved_0 : 61; -}; - -extern struct mce_vendor_flags mce_flags; - -struct mca_msr_regs { - u32 (*ctl) (int bank); - u32 (*status) (int bank); - u32 (*addr) (int bank); - u32 (*misc) (int bank); -}; - -extern struct mca_msr_regs msr_ops; - -#endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -1,419 +0,0 @@ -/* - * MCE grading rules. - * Copyright 2008, 2009 Intel Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - * - * Author: Andi Kleen - */ -#include <linux/kernel.h> -#include <linux/seq_file.h> -#include <linux/init.h> -#include <linux/debugfs.h> -#include <asm/mce.h> -#include <linux/uaccess.h> - -#include "mce-internal.h" - -/* - * Grade an mce by severity. In general the most severe ones are processed - * first. Since there are quite a lot of combinations test the bits in a - * table-driven way. The rules are simply processed in order, first - * match wins. - * - * Note this is only used for machine check exceptions, the corrected - * errors use much simpler rules. The exceptions still check for the corrected - * errors, but only to leave them alone for the CMCI handler (except for - * panic situations) - */ - -enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 }; -enum ser { SER_REQUIRED = 1, NO_SER = 2 }; -enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; - -static struct severity { - u64 mask; - u64 result; - unsigned char sev; - unsigned char mcgmask; - unsigned char mcgres; - unsigned char ser; - unsigned char context; - unsigned char excp; - unsigned char covered; - char *msg; -} severities[] = { -#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } -#define KERNEL .context = IN_KERNEL -#define USER .context = IN_USER -#define KERNEL_RECOV .context = IN_KERNEL_RECOV -#define SER .ser = SER_REQUIRED -#define NOSER .ser = NO_SER -#define EXCP .excp = EXCP_CONTEXT -#define NOEXCP .excp = NO_EXCP -#define BITCLR(x) .mask = x, .result = 0 -#define BITSET(x) .mask = x, .result = x -#define MCGMASK(x, y) .mcgmask = x, .mcgres = y -#define MASK(x, y) .mask = x, .result = y -#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) -#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR) -#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) -#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) - - MCESEV( - NO, "Invalid", - BITCLR(MCI_STATUS_VAL) - ), - MCESEV( - NO, "Not enabled", - EXCP, BITCLR(MCI_STATUS_EN) - ), - MCESEV( - PANIC, "Processor context corrupt", - BITSET(MCI_STATUS_PCC) - ), - /* When MCIP is not set something is very confused */ - MCESEV( - PANIC, "MCIP not set in MCA handler", - EXCP, MCGMASK(MCG_STATUS_MCIP, 0) - ), - /* Neither return not error IP -- no chance to recover -> PANIC */ - MCESEV( - PANIC, "Neither restart nor error IP", - EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) - ), - MCESEV( - PANIC, "In kernel and no restart IP", - EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) - ), - MCESEV( - PANIC, "In kernel and no restart IP", - EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0) - ), - MCESEV( - DEFERRED, "Deferred error", - NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) - ), - MCESEV( - KEEP, "Corrected error", - NOSER, BITCLR(MCI_STATUS_UC) - ), - - /* - * known AO MCACODs reported via MCE or CMC: - * - * SRAO could be signaled either via a machine check exception or - * CMCI with the corresponding bit S 1 or 0. So we don't need to - * check bit S for SRAO. - */ - MCESEV( - AO, "Action optional: memory scrubbing error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) - ), - MCESEV( - AO, "Action optional: last level cache writeback error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) - ), - - /* ignore OVER for UCNA */ - MCESEV( - UCNA, "Uncorrected no action required", - SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) - ), - MCESEV( - PANIC, "Illegal combination (UCNA with AR=1)", - SER, - MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR) - ), - MCESEV( - KEEP, "Non signalled machine check", - SER, BITCLR(MCI_STATUS_S) - ), - - MCESEV( - PANIC, "Action required with lost events", - SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR) - ), - - /* known AR MCACODs: */ -#ifdef CONFIG_MEMORY_FAILURE - MCESEV( - KEEP, "Action required but unaffected thread is continuable", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR), - MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV) - ), - MCESEV( - AR, "Action required: data load in error recoverable area of kernel", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), - KERNEL_RECOV - ), - MCESEV( - AR, "Action required: data load error in a user process", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), - USER - ), - MCESEV( - AR, "Action required: instruction fetch error in a user process", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), - USER - ), - MCESEV( - PANIC, "Data load in unrecoverable area of kernel", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), - KERNEL - ), -#endif - MCESEV( - PANIC, "Action required: unknown MCACOD", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) - ), - - MCESEV( - SOME, "Action optional: unknown MCACOD", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S) - ), - MCESEV( - SOME, "Action optional with lost events", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S) - ), - - MCESEV( - PANIC, "Overflowed uncorrected", - BITSET(MCI_STATUS_OVER|MCI_STATUS_UC) - ), - MCESEV( - UC, "Uncorrected", - BITSET(MCI_STATUS_UC) - ), - MCESEV( - SOME, "No match", - BITSET(0) - ) /* always matches. keep at end */ -}; - -#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \ - (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) - -/* - * If mcgstatus indicated that ip/cs on the stack were - * no good, then "m->cs" will be zero and we will have - * to assume the worst case (IN_KERNEL) as we actually - * have no idea what we were executing when the machine - * check hit. - * If we do have a good "m->cs" (or a faked one in the - * case we were executing in VM86 mode) we can use it to - * distinguish an exception taken in user from from one - * taken in the kernel. - */ -static int error_context(struct mce *m) -{ - if ((m->cs & 3) == 3) - return IN_USER; - if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) - return IN_KERNEL_RECOV; - return IN_KERNEL; -} - -static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) -{ - u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); - u32 low, high; - - /* - * We need to look at the following bits: - * - "succor" bit (data poisoning support), and - * - TCC bit (Task Context Corrupt) - * in MCi_STATUS to determine error severity. - */ - if (!mce_flags.succor) - return MCE_PANIC_SEVERITY; - - if (rdmsr_safe(addr, &low, &high)) - return MCE_PANIC_SEVERITY; - - /* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */ - if ((low & MCI_CONFIG_MCAX) && - (m->status & MCI_STATUS_TCC) && - (err_ctx == IN_KERNEL)) - return MCE_PANIC_SEVERITY; - - /* ...otherwise invoke hwpoison handler. */ - return MCE_AR_SEVERITY; -} - -/* - * See AMD Error Scope Hierarchy table in a newer BKDG. For example - * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" - */ -static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp) -{ - enum context ctx = error_context(m); - - /* Processor Context Corrupt, no need to fumble too much, die! */ - if (m->status & MCI_STATUS_PCC) - return MCE_PANIC_SEVERITY; - - if (m->status & MCI_STATUS_UC) { - - if (ctx == IN_KERNEL) - return MCE_PANIC_SEVERITY; - - /* - * On older systems where overflow_recov flag is not present, we - * should simply panic if an error overflow occurs. If - * overflow_recov flag is present and set, then software can try - * to at least kill process to prolong system operation. - */ - if (mce_flags.overflow_recov) { - if (mce_flags.smca) - return mce_severity_amd_smca(m, ctx); - - /* kill current process */ - return MCE_AR_SEVERITY; - } else { - /* at least one error was not logged */ - if (m->status & MCI_STATUS_OVER) - return MCE_PANIC_SEVERITY; - } - - /* - * For any other case, return MCE_UC_SEVERITY so that we log the - * error and exit #MC handler. - */ - return MCE_UC_SEVERITY; - } - - /* - * deferred error: poll handler catches these and adds to mce_ring so - * memory-failure can take recovery actions. - */ - if (m->status & MCI_STATUS_DEFERRED) - return MCE_DEFERRED_SEVERITY; - - /* - * corrected error: poll handler catches these and passes responsibility - * of decoding the error to EDAC - */ - return MCE_KEEP_SEVERITY; -} - -static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp) -{ - enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); - enum context ctx = error_context(m); - struct severity *s; - - for (s = severities;; s++) { - if ((m->status & s->mask) != s->result) - continue; - if ((m->mcgstatus & s->mcgmask) != s->mcgres) - continue; - if (s->ser == SER_REQUIRED && !mca_cfg.ser) - continue; - if (s->ser == NO_SER && mca_cfg.ser) - continue; - if (s->context && ctx != s->context) - continue; - if (s->excp && excp != s->excp) - continue; - if (msg) - *msg = s->msg; - s->covered = 1; - if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { - if (tolerant < 1) - return MCE_PANIC_SEVERITY; - } - return s->sev; - } -} - -/* Default to mce_severity_intel */ -int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) = - mce_severity_intel; - -void __init mcheck_vendor_init_severity(void) -{ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) - mce_severity = mce_severity_amd; -} - -#ifdef CONFIG_DEBUG_FS -static void *s_start(struct seq_file *f, loff_t *pos) -{ - if (*pos >= ARRAY_SIZE(severities)) - return NULL; - return &severities[*pos]; -} - -static void *s_next(struct seq_file *f, void *data, loff_t *pos) -{ - if (++(*pos) >= ARRAY_SIZE(severities)) - return NULL; - return &severities[*pos]; -} - -static void s_stop(struct seq_file *f, void *data) -{ -} - -static int s_show(struct seq_file *f, void *data) -{ - struct severity *ser = data; - seq_printf(f, "%d\t%s\n", ser->covered, ser->msg); - return 0; -} - -static const struct seq_operations severities_seq_ops = { - .start = s_start, - .next = s_next, - .stop = s_stop, - .show = s_show, -}; - -static int severities_coverage_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &severities_seq_ops); -} - -static ssize_t severities_coverage_write(struct file *file, - const char __user *ubuf, - size_t count, loff_t *ppos) -{ - int i; - for (i = 0; i < ARRAY_SIZE(severities); i++) - severities[i].covered = 0; - return count; -} - -static const struct file_operations severities_coverage_fops = { - .open = severities_coverage_open, - .release = seq_release, - .read = seq_read, - .write = severities_coverage_write, - .llseek = seq_lseek, -}; - -static int __init severities_debugfs_init(void) -{ - struct dentry *dmce, *fsev; - - dmce = mce_get_debugfs_dir(); - if (!dmce) - goto err_out; - - fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL, - &severities_coverage_fops); - if (!fsev) - goto err_out; - - return 0; - -err_out: - return -ENOMEM; -} -late_initcall(severities_debugfs_init); -#endif /* CONFIG_DEBUG_FS */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1,2499 +0,0 @@ -/* - * Machine check handler. - * - * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. - * Rest from unknown author(s). - * 2004 Andi Kleen. Rewrote most of it. - * Copyright 2008 Intel Corporation - * Author: Andi Kleen - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/thread_info.h> -#include <linux/capability.h> -#include <linux/miscdevice.h> -#include <linux/ratelimit.h> -#include <linux/rcupdate.h> -#include <linux/kobject.h> -#include <linux/uaccess.h> -#include <linux/kdebug.h> -#include <linux/kernel.h> -#include <linux/percpu.h> -#include <linux/string.h> -#include <linux/device.h> -#include <linux/syscore_ops.h> -#include <linux/delay.h> -#include <linux/ctype.h> -#include <linux/sched.h> -#include <linux/sysfs.h> -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/kmod.h> -#include <linux/poll.h> -#include <linux/nmi.h> -#include <linux/cpu.h> -#include <linux/ras.h> -#include <linux/smp.h> -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/debugfs.h> -#include <linux/irq_work.h> -#include <linux/export.h> -#include <linux/jump_label.h> -#include <linux/set_memory.h> - -#include <asm/intel-family.h> -#include <asm/processor.h> -#include <asm/traps.h> -#include <asm/tlbflush.h> -#include <asm/mce.h> -#include <asm/msr.h> -#include <asm/reboot.h> - -#include "mce-internal.h" - -static DEFINE_MUTEX(mce_log_mutex); - -/* sysfs synchronization */ -static DEFINE_MUTEX(mce_sysfs_mutex); - -#define CREATE_TRACE_POINTS -#include <trace/events/mce.h> - -#define SPINUNIT 100 /* 100ns */ - -DEFINE_PER_CPU(unsigned, mce_exception_count); - -struct mce_bank *mce_banks __read_mostly; -struct mce_vendor_flags mce_flags __read_mostly; - -struct mca_config mca_cfg __read_mostly = { - .bootlog = -1, - /* - * Tolerant levels: - * 0: always panic on uncorrected errors, log corrected errors - * 1: panic or SIGBUS on uncorrected errors, log corrected errors - * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors - * 3: never panic or SIGBUS, log all errors (for testing only) - */ - .tolerant = 1, - .monarch_timeout = -1 -}; - -static DEFINE_PER_CPU(struct mce, mces_seen); -static unsigned long mce_need_notify; -static int cpu_missing; - -/* - * MCA banks polled by the period polling timer for corrected events. - * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). - */ -DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { - [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL -}; - -/* - * MCA banks controlled through firmware first for corrected errors. - * This is a global list of banks for which we won't enable CMCI and we - * won't poll. Firmware controls these banks and is responsible for - * reporting corrected errors through GHES. Uncorrected/recoverable - * errors are still notified through a machine check. - */ -mce_banks_t mce_banks_ce_disabled; - -static struct work_struct mce_work; -static struct irq_work mce_irq_work; - -static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); - -/* - * CPU/chipset specific EDAC code can register a notifier call here to print - * MCE errors in a human-readable form. - */ -BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); - -/* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) -{ - memset(m, 0, sizeof(struct mce)); - m->cpu = m->extcpu = smp_processor_id(); - /* need the internal __ version to avoid deadlocks */ - m->time = __ktime_get_real_seconds(); - m->cpuvendor = boot_cpu_data.x86_vendor; - m->cpuid = cpuid_eax(1); - m->socketid = cpu_data(m->extcpu).phys_proc_id; - m->apicid = cpu_data(m->extcpu).initial_apicid; - rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); - - if (this_cpu_has(X86_FEATURE_INTEL_PPIN)) - rdmsrl(MSR_PPIN, m->ppin); - - m->microcode = boot_cpu_data.microcode; -} - -DEFINE_PER_CPU(struct mce, injectm); -EXPORT_PER_CPU_SYMBOL_GPL(injectm); - -void mce_log(struct mce *m) -{ - if (!mce_gen_pool_add(m)) - irq_work_queue(&mce_irq_work); -} - -void mce_inject_log(struct mce *m) -{ - mutex_lock(&mce_log_mutex); - mce_log(m); - mutex_unlock(&mce_log_mutex); -} -EXPORT_SYMBOL_GPL(mce_inject_log); - -static struct notifier_block mce_srao_nb; - -/* - * We run the default notifier if we have only the SRAO, the first and the - * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS - * notifiers registered on the chain. - */ -#define NUM_DEFAULT_NOTIFIERS 3 -static atomic_t num_notifiers; - -void mce_register_decode_chain(struct notifier_block *nb) -{ - if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC)) - return; - - atomic_inc(&num_notifiers); - - blocking_notifier_chain_register(&x86_mce_decoder_chain, nb); -} -EXPORT_SYMBOL_GPL(mce_register_decode_chain); - -void mce_unregister_decode_chain(struct notifier_block *nb) -{ - atomic_dec(&num_notifiers); - - blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb); -} -EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); - -static inline u32 ctl_reg(int bank) -{ - return MSR_IA32_MCx_CTL(bank); -} - -static inline u32 status_reg(int bank) -{ - return MSR_IA32_MCx_STATUS(bank); -} - -static inline u32 addr_reg(int bank) -{ - return MSR_IA32_MCx_ADDR(bank); -} - -static inline u32 misc_reg(int bank) -{ - return MSR_IA32_MCx_MISC(bank); -} - -static inline u32 smca_ctl_reg(int bank) -{ - return MSR_AMD64_SMCA_MCx_CTL(bank); -} - -static inline u32 smca_status_reg(int bank) -{ - return MSR_AMD64_SMCA_MCx_STATUS(bank); -} - -static inline u32 smca_addr_reg(int bank) -{ - return MSR_AMD64_SMCA_MCx_ADDR(bank); -} - -static inline u32 smca_misc_reg(int bank) -{ - return MSR_AMD64_SMCA_MCx_MISC(bank); -} - -struct mca_msr_regs msr_ops = { - .ctl = ctl_reg, - .status = status_reg, - .addr = addr_reg, - .misc = misc_reg -}; - -static void __print_mce(struct mce *m) -{ - pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", - m->extcpu, - (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""), - m->mcgstatus, m->bank, m->status); - - if (m->ip) { - pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", - !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", - m->cs, m->ip); - - if (m->cs == __KERNEL_CS) - pr_cont("{%pS}", (void *)(unsigned long)m->ip); - pr_cont("\n"); - } - - pr_emerg(HW_ERR "TSC %llx ", m->tsc); - if (m->addr) - pr_cont("ADDR %llx ", m->addr); - if (m->misc) - pr_cont("MISC %llx ", m->misc); - - if (mce_flags.smca) { - if (m->synd) - pr_cont("SYND %llx ", m->synd); - if (m->ipid) - pr_cont("IPID %llx ", m->ipid); - } - - pr_cont("\n"); - /* - * Note this output is parsed by external tools and old fields - * should not be changed. - */ - pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", - m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, - m->microcode); -} - -static void print_mce(struct mce *m) -{ - __print_mce(m); - - if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON) - pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); -} - -#define PANIC_TIMEOUT 5 /* 5 seconds */ - -static atomic_t mce_panicked; - -static int fake_panic; -static atomic_t mce_fake_panicked; - -/* Panic in progress. Enable interrupts and wait for final IPI */ -static void wait_for_panic(void) -{ - long timeout = PANIC_TIMEOUT*USEC_PER_SEC; - - preempt_disable(); - local_irq_enable(); - while (timeout-- > 0) - udelay(1); - if (panic_timeout == 0) - panic_timeout = mca_cfg.panic_timeout; - panic("Panicing machine check CPU died"); -} - -static void mce_panic(const char *msg, struct mce *final, char *exp) -{ - int apei_err = 0; - struct llist_node *pending; - struct mce_evt_llist *l; - - if (!fake_panic) { - /* - * Make sure only one CPU runs in machine check panic - */ - if (atomic_inc_return(&mce_panicked) > 1) - wait_for_panic(); - barrier(); - - bust_spinlocks(1); - console_verbose(); - } else { - /* Don't log too much for fake panic */ - if (atomic_inc_return(&mce_fake_panicked) > 1) - return; - } - pending = mce_gen_pool_prepare_records(); - /* First print corrected ones that are still unlogged */ - llist_for_each_entry(l, pending, llnode) { - struct mce *m = &l->mce; - if (!(m->status & MCI_STATUS_UC)) { - print_mce(m); - if (!apei_err) - apei_err = apei_write_mce(m); - } - } - /* Now print uncorrected but with the final one last */ - llist_for_each_entry(l, pending, llnode) { - struct mce *m = &l->mce; - if (!(m->status & MCI_STATUS_UC)) - continue; - if (!final || mce_cmp(m, final)) { - print_mce(m); - if (!apei_err) - apei_err = apei_write_mce(m); - } - } - if (final) { - print_mce(final); - if (!apei_err) - apei_err = apei_write_mce(final); - } - if (cpu_missing) - pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); - if (exp) - pr_emerg(HW_ERR "Machine check: %s\n", exp); - if (!fake_panic) { - if (panic_timeout == 0) - panic_timeout = mca_cfg.panic_timeout; - panic(msg); - } else - pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); -} - -/* Support code for software error injection */ - -static int msr_to_offset(u32 msr) -{ - unsigned bank = __this_cpu_read(injectm.bank); - - if (msr == mca_cfg.rip_msr) - return offsetof(struct mce, ip); - if (msr == msr_ops.status(bank)) - return offsetof(struct mce, status); - if (msr == msr_ops.addr(bank)) - return offsetof(struct mce, addr); - if (msr == msr_ops.misc(bank)) - return offsetof(struct mce, misc); - if (msr == MSR_IA32_MCG_STATUS) - return offsetof(struct mce, mcgstatus); - return -1; -} - -/* MSR access wrappers used for error injection */ -static u64 mce_rdmsrl(u32 msr) -{ - u64 v; - - if (__this_cpu_read(injectm.finished)) { - int offset = msr_to_offset(msr); - - if (offset < 0) - return 0; - return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); - } - - if (rdmsrl_safe(msr, &v)) { - WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr); - /* - * Return zero in case the access faulted. This should - * not happen normally but can happen if the CPU does - * something weird, or if the code is buggy. - */ - v = 0; - } - - return v; -} - -static void mce_wrmsrl(u32 msr, u64 v) -{ - if (__this_cpu_read(injectm.finished)) { - int offset = msr_to_offset(msr); - - if (offset >= 0) - *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v; - return; - } - wrmsrl(msr, v); -} - -/* - * Collect all global (w.r.t. this processor) status about this machine - * check into our "mce" struct so that we can use it later to assess - * the severity of the problem as we read per-bank specific details. - */ -static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) -{ - mce_setup(m); - - m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - if (regs) { - /* - * Get the address of the instruction at the time of - * the machine check error. - */ - if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { - m->ip = regs->ip; - m->cs = regs->cs; - - /* - * When in VM86 mode make the cs look like ring 3 - * always. This is a lie, but it's better than passing - * the additional vm86 bit around everywhere. - */ - if (v8086_mode(regs)) - m->cs |= 3; - } - /* Use accurate RIP reporting if available. */ - if (mca_cfg.rip_msr) - m->ip = mce_rdmsrl(mca_cfg.rip_msr); - } -} - -int mce_available(struct cpuinfo_x86 *c) -{ - if (mca_cfg.disabled) - return 0; - return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); -} - -static void mce_schedule_work(void) -{ - if (!mce_gen_pool_empty()) - schedule_work(&mce_work); -} - -static void mce_irq_work_cb(struct irq_work *entry) -{ - mce_schedule_work(); -} - -static void mce_report_event(struct pt_regs *regs) -{ - if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { - mce_notify_irq(); - /* - * Triggering the work queue here is just an insurance - * policy in case the syscall exit notify handler - * doesn't run soon enough or ends up running on the - * wrong CPU (can happen when audit sleeps) - */ - mce_schedule_work(); - return; - } - - irq_work_queue(&mce_irq_work); -} - -/* - * Check if the address reported by the CPU is in a format we can parse. - * It would be possible to add code for most other cases, but all would - * be somewhat complicated (e.g. segment offset would require an instruction - * parser). So only support physical addresses up to page granuality for now. - */ -int mce_usable_address(struct mce *m) -{ - if (!(m->status & MCI_STATUS_ADDRV)) - return 0; - - /* Checks after this one are Intel-specific: */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) - return 1; - - if (!(m->status & MCI_STATUS_MISCV)) - return 0; - - if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) - return 0; - - if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) - return 0; - - return 1; -} -EXPORT_SYMBOL_GPL(mce_usable_address); - -bool mce_is_memory_error(struct mce *m) -{ - if (m->cpuvendor == X86_VENDOR_AMD || - m->cpuvendor == X86_VENDOR_HYGON) { - return amd_mce_is_memory_error(m); - } else if (m->cpuvendor == X86_VENDOR_INTEL) { - /* - * Intel SDM Volume 3B - 15.9.2 Compound Error Codes - * - * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for - * indicating a memory error. Bit 8 is used for indicating a - * cache hierarchy error. The combination of bit 2 and bit 3 - * is used for indicating a `generic' cache hierarchy error - * But we can't just blindly check the above bits, because if - * bit 11 is set, then it is a bus/interconnect error - and - * either way the above bits just gives more detail on what - * bus/interconnect error happened. Note that bit 12 can be - * ignored, as it's the "filter" bit. - */ - return (m->status & 0xef80) == BIT(7) || - (m->status & 0xef00) == BIT(8) || - (m->status & 0xeffc) == 0xc; - } - - return false; -} -EXPORT_SYMBOL_GPL(mce_is_memory_error); - -bool mce_is_correctable(struct mce *m) -{ - if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED) - return false; - - if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED) - return false; - - if (m->status & MCI_STATUS_UC) - return false; - - return true; -} -EXPORT_SYMBOL_GPL(mce_is_correctable); - -static bool cec_add_mce(struct mce *m) -{ - if (!m) - return false; - - /* We eat only correctable DRAM errors with usable addresses. */ - if (mce_is_memory_error(m) && - mce_is_correctable(m) && - mce_usable_address(m)) - if (!cec_add_elem(m->addr >> PAGE_SHIFT)) - return true; - - return false; -} - -static int mce_first_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct mce *m = (struct mce *)data; - - if (!m) - return NOTIFY_DONE; - - if (cec_add_mce(m)) - return NOTIFY_STOP; - - /* Emit the trace record: */ - trace_mce_record(m); - - set_bit(0, &mce_need_notify); - - mce_notify_irq(); - - return NOTIFY_DONE; -} - -static struct notifier_block first_nb = { - .notifier_call = mce_first_notifier, - .priority = MCE_PRIO_FIRST, -}; - -static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct mce *mce = (struct mce *)data; - unsigned long pfn; - - if (!mce) - return NOTIFY_DONE; - - if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) { - pfn = mce->addr >> PAGE_SHIFT; - if (!memory_failure(pfn, 0)) - set_mce_nospec(pfn); - } - - return NOTIFY_OK; -} -static struct notifier_block mce_srao_nb = { - .notifier_call = srao_decode_notifier, - .priority = MCE_PRIO_SRAO, -}; - -static int mce_default_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct mce *m = (struct mce *)data; - - if (!m) - return NOTIFY_DONE; - - if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS) - return NOTIFY_DONE; - - __print_mce(m); - - return NOTIFY_DONE; -} - -static struct notifier_block mce_default_nb = { - .notifier_call = mce_default_notifier, - /* lowest prio, we want it to run last. */ - .priority = MCE_PRIO_LOWEST, -}; - -/* - * Read ADDR and MISC registers. - */ -static void mce_read_aux(struct mce *m, int i) -{ - if (m->status & MCI_STATUS_MISCV) - m->misc = mce_rdmsrl(msr_ops.misc(i)); - - if (m->status & MCI_STATUS_ADDRV) { - m->addr = mce_rdmsrl(msr_ops.addr(i)); - - /* - * Mask the reported address by the reported granularity. - */ - if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) { - u8 shift = MCI_MISC_ADDR_LSB(m->misc); - m->addr >>= shift; - m->addr <<= shift; - } - - /* - * Extract [55:<lsb>] where lsb is the least significant - * *valid* bit of the address bits. - */ - if (mce_flags.smca) { - u8 lsb = (m->addr >> 56) & 0x3f; - - m->addr &= GENMASK_ULL(55, lsb); - } - } - - if (mce_flags.smca) { - m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); - - if (m->status & MCI_STATUS_SYNDV) - m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); - } -} - -DEFINE_PER_CPU(unsigned, mce_poll_count); - -/* - * Poll for corrected events or events that happened before reset. - * Those are just logged through /dev/mcelog. - * - * This is executed in standard interrupt context. - * - * Note: spec recommends to panic for fatal unsignalled - * errors here. However this would be quite problematic -- - * we would need to reimplement the Monarch handling and - * it would mess up the exclusion between exception handler - * and poll hander -- * so we skip this for now. - * These cases should not happen anyways, or only when the CPU - * is already totally * confused. In this case it's likely it will - * not fully execute the machine check handler either. - */ -bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) -{ - bool error_seen = false; - struct mce m; - int i; - - this_cpu_inc(mce_poll_count); - - mce_gather_info(&m, NULL); - - if (flags & MCP_TIMESTAMP) - m.tsc = rdtsc(); - - for (i = 0; i < mca_cfg.banks; i++) { - if (!mce_banks[i].ctl || !test_bit(i, *b)) - continue; - - m.misc = 0; - m.addr = 0; - m.bank = i; - - barrier(); - m.status = mce_rdmsrl(msr_ops.status(i)); - if (!(m.status & MCI_STATUS_VAL)) - continue; - - /* - * Uncorrected or signalled events are handled by the exception - * handler when it is enabled, so don't process those here. - * - * TBD do the same check for MCI_STATUS_EN here? - */ - if (!(flags & MCP_UC) && - (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC))) - continue; - - error_seen = true; - - mce_read_aux(&m, i); - - m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); - - /* - * Don't get the IP here because it's unlikely to - * have anything to do with the actual error location. - */ - if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) - mce_log(&m); - else if (mce_usable_address(&m)) { - /* - * Although we skipped logging this, we still want - * to take action. Add to the pool so the registered - * notifiers will see it. - */ - if (!mce_gen_pool_add(&m)) - mce_schedule_work(); - } - - /* - * Clear state for this bank. - */ - mce_wrmsrl(msr_ops.status(i), 0); - } - - /* - * Don't clear MCG_STATUS here because it's only defined for - * exceptions. - */ - - sync_core(); - - return error_seen; -} -EXPORT_SYMBOL_GPL(machine_check_poll); - -/* - * Do a quick check if any of the events requires a panic. - * This decides if we keep the events around or clear them. - */ -static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, - struct pt_regs *regs) -{ - char *tmp; - int i; - - for (i = 0; i < mca_cfg.banks; i++) { - m->status = mce_rdmsrl(msr_ops.status(i)); - if (!(m->status & MCI_STATUS_VAL)) - continue; - - __set_bit(i, validp); - if (quirk_no_way_out) - quirk_no_way_out(i, m, regs); - - if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { - mce_read_aux(m, i); - *msg = tmp; - return 1; - } - } - return 0; -} - -/* - * Variable to establish order between CPUs while scanning. - * Each CPU spins initially until executing is equal its number. - */ -static atomic_t mce_executing; - -/* - * Defines order of CPUs on entry. First CPU becomes Monarch. - */ -static atomic_t mce_callin; - -/* - * Check if a timeout waiting for other CPUs happened. - */ -static int mce_timed_out(u64 *t, const char *msg) -{ - /* - * The others already did panic for some reason. - * Bail out like in a timeout. - * rmb() to tell the compiler that system_state - * might have been modified by someone else. - */ - rmb(); - if (atomic_read(&mce_panicked)) - wait_for_panic(); - if (!mca_cfg.monarch_timeout) - goto out; - if ((s64)*t < SPINUNIT) { - if (mca_cfg.tolerant <= 1) - mce_panic(msg, NULL, NULL); - cpu_missing = 1; - return 1; - } - *t -= SPINUNIT; -out: - touch_nmi_watchdog(); - return 0; -} - -/* - * The Monarch's reign. The Monarch is the CPU who entered - * the machine check handler first. It waits for the others to - * raise the exception too and then grades them. When any - * error is fatal panic. Only then let the others continue. - * - * The other CPUs entering the MCE handler will be controlled by the - * Monarch. They are called Subjects. - * - * This way we prevent any potential data corruption in a unrecoverable case - * and also makes sure always all CPU's errors are examined. - * - * Also this detects the case of a machine check event coming from outer - * space (not detected by any CPUs) In this case some external agent wants - * us to shut down, so panic too. - * - * The other CPUs might still decide to panic if the handler happens - * in a unrecoverable place, but in this case the system is in a semi-stable - * state and won't corrupt anything by itself. It's ok to let the others - * continue for a bit first. - * - * All the spin loops have timeouts; when a timeout happens a CPU - * typically elects itself to be Monarch. - */ -static void mce_reign(void) -{ - int cpu; - struct mce *m = NULL; - int global_worst = 0; - char *msg = NULL; - char *nmsg = NULL; - - /* - * This CPU is the Monarch and the other CPUs have run - * through their handlers. - * Grade the severity of the errors of all the CPUs. - */ - for_each_possible_cpu(cpu) { - int severity = mce_severity(&per_cpu(mces_seen, cpu), - mca_cfg.tolerant, - &nmsg, true); - if (severity > global_worst) { - msg = nmsg; - global_worst = severity; - m = &per_cpu(mces_seen, cpu); - } - } - - /* - * Cannot recover? Panic here then. - * This dumps all the mces in the log buffer and stops the - * other CPUs. - */ - if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) - mce_panic("Fatal machine check", m, msg); - - /* - * For UC somewhere we let the CPU who detects it handle it. - * Also must let continue the others, otherwise the handling - * CPU could deadlock on a lock. - */ - - /* - * No machine check event found. Must be some external - * source or one CPU is hung. Panic. - */ - if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3) - mce_panic("Fatal machine check from unknown source", NULL, NULL); - - /* - * Now clear all the mces_seen so that they don't reappear on - * the next mce. - */ - for_each_possible_cpu(cpu) - memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); -} - -static atomic_t global_nwo; - -/* - * Start of Monarch synchronization. This waits until all CPUs have - * entered the exception handler and then determines if any of them - * saw a fatal event that requires panic. Then it executes them - * in the entry order. - * TBD double check parallel CPU hotunplug - */ -static int mce_start(int *no_way_out) -{ - int order; - int cpus = num_online_cpus(); - u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; - - if (!timeout) - return -1; - - atomic_add(*no_way_out, &global_nwo); - /* - * Rely on the implied barrier below, such that global_nwo - * is updated before mce_callin. - */ - order = atomic_inc_return(&mce_callin); - - /* - * Wait for everyone. - */ - while (atomic_read(&mce_callin) != cpus) { - if (mce_timed_out(&timeout, - "Timeout: Not all CPUs entered broadcast exception handler")) { - atomic_set(&global_nwo, 0); - return -1; - } - ndelay(SPINUNIT); - } - - /* - * mce_callin should be read before global_nwo - */ - smp_rmb(); - - if (order == 1) { - /* - * Monarch: Starts executing now, the others wait. - */ - atomic_set(&mce_executing, 1); - } else { - /* - * Subject: Now start the scanning loop one by one in - * the original callin order. - * This way when there are any shared banks it will be - * only seen by one CPU before cleared, avoiding duplicates. - */ - while (atomic_read(&mce_executing) < order) { - if (mce_timed_out(&timeout, - "Timeout: Subject CPUs unable to finish machine check processing")) { - atomic_set(&global_nwo, 0); - return -1; - } - ndelay(SPINUNIT); - } - } - - /* - * Cache the global no_way_out state. - */ - *no_way_out = atomic_read(&global_nwo); - - return order; -} - -/* - * Synchronize between CPUs after main scanning loop. - * This invokes the bulk of the Monarch processing. - */ -static int mce_end(int order) -{ - int ret = -1; - u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; - - if (!timeout) - goto reset; - if (order < 0) - goto reset; - - /* - * Allow others to run. - */ - atomic_inc(&mce_executing); - - if (order == 1) { - /* CHECKME: Can this race with a parallel hotplug? */ - int cpus = num_online_cpus(); - - /* - * Monarch: Wait for everyone to go through their scanning - * loops. - */ - while (atomic_read(&mce_executing) <= cpus) { - if (mce_timed_out(&timeout, - "Timeout: Monarch CPU unable to finish machine check processing")) - goto reset; - ndelay(SPINUNIT); - } - - mce_reign(); - barrier(); - ret = 0; - } else { - /* - * Subject: Wait for Monarch to finish. - */ - while (atomic_read(&mce_executing) != 0) { - if (mce_timed_out(&timeout, - "Timeout: Monarch CPU did not finish machine check processing")) - goto reset; - ndelay(SPINUNIT); - } - - /* - * Don't reset anything. That's done by the Monarch. - */ - return 0; - } - - /* - * Reset all global state. - */ -reset: - atomic_set(&global_nwo, 0); - atomic_set(&mce_callin, 0); - barrier(); - - /* - * Let others run again. - */ - atomic_set(&mce_executing, 0); - return ret; -} - -static void mce_clear_state(unsigned long *toclear) -{ - int i; - - for (i = 0; i < mca_cfg.banks; i++) { - if (test_bit(i, toclear)) - mce_wrmsrl(msr_ops.status(i), 0); - } -} - -static int do_memory_failure(struct mce *m) -{ - int flags = MF_ACTION_REQUIRED; - int ret; - - pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); - if (!(m->mcgstatus & MCG_STATUS_RIPV)) - flags |= MF_MUST_KILL; - ret = memory_failure(m->addr >> PAGE_SHIFT, flags); - if (ret) - pr_err("Memory error not recovered"); - else - set_mce_nospec(m->addr >> PAGE_SHIFT); - return ret; -} - - -/* - * Cases where we avoid rendezvous handler timeout: - * 1) If this CPU is offline. - * - * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to - * skip those CPUs which remain looping in the 1st kernel - see - * crash_nmi_callback(). - * - * Note: there still is a small window between kexec-ing and the new, - * kdump kernel establishing a new #MC handler where a broadcasted MCE - * might not get handled properly. - */ -static bool __mc_check_crashing_cpu(int cpu) -{ - if (cpu_is_offline(cpu) || - (crashing_cpu != -1 && crashing_cpu != cpu)) { - u64 mcgstatus; - - mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - if (mcgstatus & MCG_STATUS_RIPV) { - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); - return true; - } - } - return false; -} - -static void __mc_scan_banks(struct mce *m, struct mce *final, - unsigned long *toclear, unsigned long *valid_banks, - int no_way_out, int *worst) -{ - struct mca_config *cfg = &mca_cfg; - int severity, i; - - for (i = 0; i < cfg->banks; i++) { - __clear_bit(i, toclear); - if (!test_bit(i, valid_banks)) - continue; - - if (!mce_banks[i].ctl) - continue; - - m->misc = 0; - m->addr = 0; - m->bank = i; - - m->status = mce_rdmsrl(msr_ops.status(i)); - if (!(m->status & MCI_STATUS_VAL)) - continue; - - /* - * Corrected or non-signaled errors are handled by - * machine_check_poll(). Leave them alone, unless this panics. - */ - if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && - !no_way_out) - continue; - - /* Set taint even when machine check was not enabled. */ - add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - - severity = mce_severity(m, cfg->tolerant, NULL, true); - - /* - * When machine check was for corrected/deferred handler don't - * touch, unless we're panicking. - */ - if ((severity == MCE_KEEP_SEVERITY || - severity == MCE_UCNA_SEVERITY) && !no_way_out) - continue; - - __set_bit(i, toclear); - - /* Machine check event was not enabled. Clear, but ignore. */ - if (severity == MCE_NO_SEVERITY) - continue; - - mce_read_aux(m, i); - - /* assuming valid severity level != 0 */ - m->severity = severity; - - mce_log(m); - - if (severity > *worst) { - *final = *m; - *worst = severity; - } - } - - /* mce_clear_state will clear *final, save locally for use later */ - *m = *final; -} - -/* - * The actual machine check handler. This only handles real - * exceptions when something got corrupted coming in through int 18. - * - * This is executed in NMI context not subject to normal locking rules. This - * implies that most kernel services cannot be safely used. Don't even - * think about putting a printk in there! - * - * On Intel systems this is entered on all CPUs in parallel through - * MCE broadcast. However some CPUs might be broken beyond repair, - * so be always careful when synchronizing with others. - */ -void do_machine_check(struct pt_regs *regs, long error_code) -{ - DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); - DECLARE_BITMAP(toclear, MAX_NR_BANKS); - struct mca_config *cfg = &mca_cfg; - int cpu = smp_processor_id(); - char *msg = "Unknown"; - struct mce m, *final; - int worst = 0; - - /* - * Establish sequential order between the CPUs entering the machine - * check handler. - */ - int order = -1; - - /* - * If no_way_out gets set, there is no safe way to recover from this - * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. - */ - int no_way_out = 0; - - /* - * If kill_it gets set, there might be a way to recover from this - * error. - */ - int kill_it = 0; - - /* - * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES - * on Intel. - */ - int lmce = 1; - - if (__mc_check_crashing_cpu(cpu)) - return; - - ist_enter(regs); - - this_cpu_inc(mce_exception_count); - - mce_gather_info(&m, regs); - m.tsc = rdtsc(); - - final = this_cpu_ptr(&mces_seen); - *final = m; - - memset(valid_banks, 0, sizeof(valid_banks)); - no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); - - barrier(); - - /* - * When no restart IP might need to kill or panic. - * Assume the worst for now, but if we find the - * severity is MCE_AR_SEVERITY we have other options. - */ - if (!(m.mcgstatus & MCG_STATUS_RIPV)) - kill_it = 1; - - /* - * Check if this MCE is signaled to only this logical processor, - * on Intel only. - */ - if (m.cpuvendor == X86_VENDOR_INTEL) - lmce = m.mcgstatus & MCG_STATUS_LMCES; - - /* - * Local machine check may already know that we have to panic. - * Broadcast machine check begins rendezvous in mce_start() - * Go through all banks in exclusion of the other CPUs. This way we - * don't report duplicated events on shared banks because the first one - * to see it will clear it. - */ - if (lmce) { - if (no_way_out) - mce_panic("Fatal local machine check", &m, msg); - } else { - order = mce_start(&no_way_out); - } - - __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst); - - if (!no_way_out) - mce_clear_state(toclear); - - /* - * Do most of the synchronization with other CPUs. - * When there's any problem use only local no_way_out state. - */ - if (!lmce) { - if (mce_end(order) < 0) - no_way_out = worst >= MCE_PANIC_SEVERITY; - } else { - /* - * If there was a fatal machine check we should have - * already called mce_panic earlier in this function. - * Since we re-read the banks, we might have found - * something new. Check again to see if we found a - * fatal error. We call "mce_severity()" again to - * make sure we have the right "msg". - */ - if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { - mce_severity(&m, cfg->tolerant, &msg, true); - mce_panic("Local fatal machine check!", &m, msg); - } - } - - /* - * If tolerant is at an insane level we drop requests to kill - * processes and continue even when there is no way out. - */ - if (cfg->tolerant == 3) - kill_it = 0; - else if (no_way_out) - mce_panic("Fatal machine check on current CPU", &m, msg); - - if (worst > 0) - mce_report_event(regs); - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); - - sync_core(); - - if (worst != MCE_AR_SEVERITY && !kill_it) - goto out_ist; - - /* Fault was in user mode and we need to take some action */ - if ((m.cs & 3) == 3) { - ist_begin_non_atomic(regs); - local_irq_enable(); - - if (kill_it || do_memory_failure(&m)) - force_sig(SIGBUS, current); - local_irq_disable(); - ist_end_non_atomic(); - } else { - if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) - mce_panic("Failed kernel mode recovery", &m, NULL); - } - -out_ist: - ist_exit(regs); -} -EXPORT_SYMBOL_GPL(do_machine_check); - -#ifndef CONFIG_MEMORY_FAILURE -int memory_failure(unsigned long pfn, int flags) -{ - /* mce_severity() should not hand us an ACTION_REQUIRED error */ - BUG_ON(flags & MF_ACTION_REQUIRED); - pr_err("Uncorrected memory error in page 0x%lx ignored\n" - "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", - pfn); - - return 0; -} -#endif - -/* - * Periodic polling timer for "silent" machine check errors. If the - * poller finds an MCE, poll 2x faster. When the poller finds no more - * errors, poll 2x slower (up to check_interval seconds). - */ -static unsigned long check_interval = INITIAL_CHECK_INTERVAL; - -static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ -static DEFINE_PER_CPU(struct timer_list, mce_timer); - -static unsigned long mce_adjust_timer_default(unsigned long interval) -{ - return interval; -} - -static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; - -static void __start_timer(struct timer_list *t, unsigned long interval) -{ - unsigned long when = jiffies + interval; - unsigned long flags; - - local_irq_save(flags); - - if (!timer_pending(t) || time_before(when, t->expires)) - mod_timer(t, round_jiffies(when)); - - local_irq_restore(flags); -} - -static void mce_timer_fn(struct timer_list *t) -{ - struct timer_list *cpu_t = this_cpu_ptr(&mce_timer); - unsigned long iv; - - WARN_ON(cpu_t != t); - - iv = __this_cpu_read(mce_next_interval); - - if (mce_available(this_cpu_ptr(&cpu_info))) { - machine_check_poll(0, this_cpu_ptr(&mce_poll_banks)); - - if (mce_intel_cmci_poll()) { - iv = mce_adjust_timer(iv); - goto done; - } - } - - /* - * Alert userspace if needed. If we logged an MCE, reduce the polling - * interval, otherwise increase the polling interval. - */ - if (mce_notify_irq()) - iv = max(iv / 2, (unsigned long) HZ/100); - else - iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); - -done: - __this_cpu_write(mce_next_interval, iv); - __start_timer(t, iv); -} - -/* - * Ensure that the timer is firing in @interval from now. - */ -void mce_timer_kick(unsigned long interval) -{ - struct timer_list *t = this_cpu_ptr(&mce_timer); - unsigned long iv = __this_cpu_read(mce_next_interval); - - __start_timer(t, interval); - - if (interval < iv) - __this_cpu_write(mce_next_interval, interval); -} - -/* Must not be called in IRQ context where del_timer_sync() can deadlock */ -static void mce_timer_delete_all(void) -{ - int cpu; - - for_each_online_cpu(cpu) - del_timer_sync(&per_cpu(mce_timer, cpu)); -} - -/* - * Notify the user(s) about new machine check events. - * Can be called from interrupt context, but not from machine check/NMI - * context. - */ -int mce_notify_irq(void) -{ - /* Not more than two messages every minute */ - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - - if (test_and_clear_bit(0, &mce_need_notify)) { - mce_work_trigger(); - - if (__ratelimit(&ratelimit)) - pr_info(HW_ERR "Machine check events logged\n"); - - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(mce_notify_irq); - -static int __mcheck_cpu_mce_banks_init(void) -{ - int i; - u8 num_banks = mca_cfg.banks; - - mce_banks = kcalloc(num_banks, sizeof(struct mce_bank), GFP_KERNEL); - if (!mce_banks) - return -ENOMEM; - - for (i = 0; i < num_banks; i++) { - struct mce_bank *b = &mce_banks[i]; - - b->ctl = -1ULL; - b->init = 1; - } - return 0; -} - -/* - * Initialize Machine Checks for a CPU. - */ -static int __mcheck_cpu_cap_init(void) -{ - unsigned b; - u64 cap; - - rdmsrl(MSR_IA32_MCG_CAP, cap); - - b = cap & MCG_BANKCNT_MASK; - if (!mca_cfg.banks) - pr_info("CPU supports %d MCE banks\n", b); - - if (b > MAX_NR_BANKS) { - pr_warn("Using only %u machine check banks out of %u\n", - MAX_NR_BANKS, b); - b = MAX_NR_BANKS; - } - - /* Don't support asymmetric configurations today */ - WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks); - mca_cfg.banks = b; - - if (!mce_banks) { - int err = __mcheck_cpu_mce_banks_init(); - - if (err) - return err; - } - - /* Use accurate RIP reporting if available. */ - if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) - mca_cfg.rip_msr = MSR_IA32_MCG_EIP; - - if (cap & MCG_SER_P) - mca_cfg.ser = 1; - - return 0; -} - -static void __mcheck_cpu_init_generic(void) -{ - enum mcp_flags m_fl = 0; - mce_banks_t all_banks; - u64 cap; - - if (!mca_cfg.bootlog) - m_fl = MCP_DONTLOG; - - /* - * Log the machine checks left over from the previous reset. - */ - bitmap_fill(all_banks, MAX_NR_BANKS); - machine_check_poll(MCP_UC | m_fl, &all_banks); - - cr4_set_bits(X86_CR4_MCE); - - rdmsrl(MSR_IA32_MCG_CAP, cap); - if (cap & MCG_CTL_P) - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); -} - -static void __mcheck_cpu_init_clear_banks(void) -{ - int i; - - for (i = 0; i < mca_cfg.banks; i++) { - struct mce_bank *b = &mce_banks[i]; - - if (!b->init) - continue; - wrmsrl(msr_ops.ctl(i), b->ctl); - wrmsrl(msr_ops.status(i), 0); - } -} - -/* - * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and - * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM - * Vol 3B Table 15-20). But this confuses both the code that determines - * whether the machine check occurred in kernel or user mode, and also - * the severity assessment code. Pretend that EIPV was set, and take the - * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. - */ -static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) -{ - if (bank != 0) - return; - if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) - return; - if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| - MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| - MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| - MCACOD)) != - (MCI_STATUS_UC|MCI_STATUS_EN| - MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| - MCI_STATUS_AR|MCACOD_INSTR)) - return; - - m->mcgstatus |= MCG_STATUS_EIPV; - m->ip = regs->ip; - m->cs = regs->cs; -} - -/* Add per CPU specific workarounds here */ -static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) -{ - struct mca_config *cfg = &mca_cfg; - - if (c->x86_vendor == X86_VENDOR_UNKNOWN) { - pr_info("unknown CPU type - not enabling MCE support\n"); - return -EOPNOTSUPP; - } - - /* This should be disabled by the BIOS, but isn't always */ - if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && cfg->banks > 4) { - /* - * disable GART TBL walk error reporting, which - * trips off incorrectly with the IOMMU & 3ware - * & Cerberus: - */ - clear_bit(10, (unsigned long *)&mce_banks[4].ctl); - } - if (c->x86 < 0x11 && cfg->bootlog < 0) { - /* - * Lots of broken BIOS around that don't clear them - * by default and leave crap in there. Don't log: - */ - cfg->bootlog = 0; - } - /* - * Various K7s with broken bank 0 around. Always disable - * by default. - */ - if (c->x86 == 6 && cfg->banks > 0) - mce_banks[0].ctl = 0; - - /* - * overflow_recov is supported for F15h Models 00h-0fh - * even though we don't have a CPUID bit for it. - */ - if (c->x86 == 0x15 && c->x86_model <= 0xf) - mce_flags.overflow_recov = 1; - - /* - * Turn off MC4_MISC thresholding banks on those models since - * they're not supported there. - */ - if (c->x86 == 0x15 && - (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { - int i; - u64 hwcr; - bool need_toggle; - u32 msrs[] = { - 0x00000413, /* MC4_MISC0 */ - 0xc0000408, /* MC4_MISC1 */ - }; - - rdmsrl(MSR_K7_HWCR, hwcr); - - /* McStatusWrEn has to be set */ - need_toggle = !(hwcr & BIT(18)); - - if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); - - /* Clear CntP bit safely */ - for (i = 0; i < ARRAY_SIZE(msrs); i++) - msr_clear_bit(msrs[i], 62); - - /* restore old settings */ - if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr); - } - } - - if (c->x86_vendor == X86_VENDOR_INTEL) { - /* - * SDM documents that on family 6 bank 0 should not be written - * because it aliases to another special BIOS controlled - * register. - * But it's not aliased anymore on model 0x1a+ - * Don't ignore bank 0 completely because there could be a - * valid event later, merely don't write CTL0. - */ - - if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0) - mce_banks[0].init = 0; - - /* - * All newer Intel systems support MCE broadcasting. Enable - * synchronization with a one second timeout. - */ - if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && - cfg->monarch_timeout < 0) - cfg->monarch_timeout = USEC_PER_SEC; - - /* - * There are also broken BIOSes on some Pentium M and - * earlier systems: - */ - if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) - cfg->bootlog = 0; - - if (c->x86 == 6 && c->x86_model == 45) - quirk_no_way_out = quirk_sandybridge_ifu; - } - if (cfg->monarch_timeout < 0) - cfg->monarch_timeout = 0; - if (cfg->bootlog != 0) - cfg->panic_timeout = 30; - - return 0; -} - -static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) -{ - if (c->x86 != 5) - return 0; - - switch (c->x86_vendor) { - case X86_VENDOR_INTEL: - intel_p5_mcheck_init(c); - return 1; - break; - case X86_VENDOR_CENTAUR: - winchip_mcheck_init(c); - return 1; - break; - default: - return 0; - } - - return 0; -} - -/* - * Init basic CPU features needed for early decoding of MCEs. - */ -static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) -{ - if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { - mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); - mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); - mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); - - if (mce_flags.smca) { - msr_ops.ctl = smca_ctl_reg; - msr_ops.status = smca_status_reg; - msr_ops.addr = smca_addr_reg; - msr_ops.misc = smca_misc_reg; - } - } -} - -static void mce_centaur_feature_init(struct cpuinfo_x86 *c) -{ - struct mca_config *cfg = &mca_cfg; - - /* - * All newer Centaur CPUs support MCE broadcasting. Enable - * synchronization with a one second timeout. - */ - if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) || - c->x86 > 6) { - if (cfg->monarch_timeout < 0) - cfg->monarch_timeout = USEC_PER_SEC; - } -} - -static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) -{ - switch (c->x86_vendor) { - case X86_VENDOR_INTEL: - mce_intel_feature_init(c); - mce_adjust_timer = cmci_intel_adjust_timer; - break; - - case X86_VENDOR_AMD: { - mce_amd_feature_init(c); - break; - } - - case X86_VENDOR_HYGON: - mce_hygon_feature_init(c); - break; - - case X86_VENDOR_CENTAUR: - mce_centaur_feature_init(c); - break; - - default: - break; - } -} - -static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c) -{ - switch (c->x86_vendor) { - case X86_VENDOR_INTEL: - mce_intel_feature_clear(c); - break; - default: - break; - } -} - -static void mce_start_timer(struct timer_list *t) -{ - unsigned long iv = check_interval * HZ; - - if (mca_cfg.ignore_ce || !iv) - return; - - this_cpu_write(mce_next_interval, iv); - __start_timer(t, iv); -} - -static void __mcheck_cpu_setup_timer(void) -{ - struct timer_list *t = this_cpu_ptr(&mce_timer); - - timer_setup(t, mce_timer_fn, TIMER_PINNED); -} - -static void __mcheck_cpu_init_timer(void) -{ - struct timer_list *t = this_cpu_ptr(&mce_timer); - - timer_setup(t, mce_timer_fn, TIMER_PINNED); - mce_start_timer(t); -} - -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ - pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", - smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; - -dotraplinkage void do_mce(struct pt_regs *regs, long error_code) -{ - machine_check_vector(regs, error_code); -} - -/* - * Called for each booted CPU to set up machine checks. - * Must be called with preempt off: - */ -void mcheck_cpu_init(struct cpuinfo_x86 *c) -{ - if (mca_cfg.disabled) - return; - - if (__mcheck_cpu_ancient_init(c)) - return; - - if (!mce_available(c)) - return; - - if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { - mca_cfg.disabled = 1; - return; - } - - if (mce_gen_pool_init()) { - mca_cfg.disabled = 1; - pr_emerg("Couldn't allocate MCE records pool!\n"); - return; - } - - machine_check_vector = do_machine_check; - - __mcheck_cpu_init_early(c); - __mcheck_cpu_init_generic(); - __mcheck_cpu_init_vendor(c); - __mcheck_cpu_init_clear_banks(); - __mcheck_cpu_setup_timer(); -} - -/* - * Called for each booted CPU to clear some machine checks opt-ins - */ -void mcheck_cpu_clear(struct cpuinfo_x86 *c) -{ - if (mca_cfg.disabled) - return; - - if (!mce_available(c)) - return; - - /* - * Possibly to clear general settings generic to x86 - * __mcheck_cpu_clear_generic(c); - */ - __mcheck_cpu_clear_vendor(c); - -} - -static void __mce_disable_bank(void *arg) -{ - int bank = *((int *)arg); - __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); - cmci_disable_bank(bank); -} - -void mce_disable_bank(int bank) -{ - if (bank >= mca_cfg.banks) { - pr_warn(FW_BUG - "Ignoring request to disable invalid MCA bank %d.\n", - bank); - return; - } - set_bit(bank, mce_banks_ce_disabled); - on_each_cpu(__mce_disable_bank, &bank, 1); -} - -/* - * mce=off Disables machine check - * mce=no_cmci Disables CMCI - * mce=no_lmce Disables LMCE - * mce=dont_log_ce Clears corrected events silently, no log created for CEs. - * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. - * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) - * monarchtimeout is how long to wait for other CPUs on machine - * check, or 0 to not wait - * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h - and older. - * mce=nobootlog Don't log MCEs from before booting. - * mce=bios_cmci_threshold Don't program the CMCI threshold - * mce=recovery force enable memcpy_mcsafe() - */ -static int __init mcheck_enable(char *str) -{ - struct mca_config *cfg = &mca_cfg; - - if (*str == 0) { - enable_p5_mce(); - return 1; - } - if (*str == '=') - str++; - if (!strcmp(str, "off")) - cfg->disabled = 1; - else if (!strcmp(str, "no_cmci")) - cfg->cmci_disabled = true; - else if (!strcmp(str, "no_lmce")) - cfg->lmce_disabled = 1; - else if (!strcmp(str, "dont_log_ce")) - cfg->dont_log_ce = true; - else if (!strcmp(str, "ignore_ce")) - cfg->ignore_ce = true; - else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) - cfg->bootlog = (str[0] == 'b'); - else if (!strcmp(str, "bios_cmci_threshold")) - cfg->bios_cmci_threshold = 1; - else if (!strcmp(str, "recovery")) - cfg->recovery = 1; - else if (isdigit(str[0])) { - if (get_option(&str, &cfg->tolerant) == 2) - get_option(&str, &(cfg->monarch_timeout)); - } else { - pr_info("mce argument %s ignored. Please use /sys\n", str); - return 0; - } - return 1; -} -__setup("mce", mcheck_enable); - -int __init mcheck_init(void) -{ - mcheck_intel_therm_init(); - mce_register_decode_chain(&first_nb); - mce_register_decode_chain(&mce_srao_nb); - mce_register_decode_chain(&mce_default_nb); - mcheck_vendor_init_severity(); - - INIT_WORK(&mce_work, mce_gen_pool_process); - init_irq_work(&mce_irq_work, mce_irq_work_cb); - - return 0; -} - -/* - * mce_syscore: PM support - */ - -/* - * Disable machine checks on suspend and shutdown. We can't really handle - * them later. - */ -static void mce_disable_error_reporting(void) -{ - int i; - - for (i = 0; i < mca_cfg.banks; i++) { - struct mce_bank *b = &mce_banks[i]; - - if (b->init) - wrmsrl(msr_ops.ctl(i), 0); - } - return; -} - -static void vendor_disable_error_reporting(void) -{ - /* - * Don't clear on Intel or AMD or Hygon CPUs. Some of these MSRs - * are socket-wide. - * Disabling them for just a single offlined CPU is bad, since it will - * inhibit reporting for all shared resources on the socket like the - * last level cache (LLC), the integrated memory controller (iMC), etc. - */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON || - boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return; - - mce_disable_error_reporting(); -} - -static int mce_syscore_suspend(void) -{ - vendor_disable_error_reporting(); - return 0; -} - -static void mce_syscore_shutdown(void) -{ - vendor_disable_error_reporting(); -} - -/* - * On resume clear all MCE state. Don't want to see leftovers from the BIOS. - * Only one CPU is active at this time, the others get re-added later using - * CPU hotplug: - */ -static void mce_syscore_resume(void) -{ - __mcheck_cpu_init_generic(); - __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); - __mcheck_cpu_init_clear_banks(); -} - -static struct syscore_ops mce_syscore_ops = { - .suspend = mce_syscore_suspend, - .shutdown = mce_syscore_shutdown, - .resume = mce_syscore_resume, -}; - -/* - * mce_device: Sysfs support - */ - -static void mce_cpu_restart(void *data) -{ - if (!mce_available(raw_cpu_ptr(&cpu_info))) - return; - __mcheck_cpu_init_generic(); - __mcheck_cpu_init_clear_banks(); - __mcheck_cpu_init_timer(); -} - -/* Reinit MCEs after user configuration changes */ -static void mce_restart(void) -{ - mce_timer_delete_all(); - on_each_cpu(mce_cpu_restart, NULL, 1); -} - -/* Toggle features for corrected errors */ -static void mce_disable_cmci(void *data) -{ - if (!mce_available(raw_cpu_ptr(&cpu_info))) - return; - cmci_clear(); -} - -static void mce_enable_ce(void *all) -{ - if (!mce_available(raw_cpu_ptr(&cpu_info))) - return; - cmci_reenable(); - cmci_recheck(); - if (all) - __mcheck_cpu_init_timer(); -} - -static struct bus_type mce_subsys = { - .name = "machinecheck", - .dev_name = "machinecheck", -}; - -DEFINE_PER_CPU(struct device *, mce_device); - -static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) -{ - return container_of(attr, struct mce_bank, attr); -} - -static ssize_t show_bank(struct device *s, struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); -} - -static ssize_t set_bank(struct device *s, struct device_attribute *attr, - const char *buf, size_t size) -{ - u64 new; - - if (kstrtou64(buf, 0, &new) < 0) - return -EINVAL; - - attr_to_bank(attr)->ctl = new; - mce_restart(); - - return size; -} - -static ssize_t set_ignore_ce(struct device *s, - struct device_attribute *attr, - const char *buf, size_t size) -{ - u64 new; - - if (kstrtou64(buf, 0, &new) < 0) - return -EINVAL; - - mutex_lock(&mce_sysfs_mutex); - if (mca_cfg.ignore_ce ^ !!new) { - if (new) { - /* disable ce features */ - mce_timer_delete_all(); - on_each_cpu(mce_disable_cmci, NULL, 1); - mca_cfg.ignore_ce = true; - } else { - /* enable ce features */ - mca_cfg.ignore_ce = false; - on_each_cpu(mce_enable_ce, (void *)1, 1); - } - } - mutex_unlock(&mce_sysfs_mutex); - - return size; -} - -static ssize_t set_cmci_disabled(struct device *s, - struct device_attribute *attr, - const char *buf, size_t size) -{ - u64 new; - - if (kstrtou64(buf, 0, &new) < 0) - return -EINVAL; - - mutex_lock(&mce_sysfs_mutex); - if (mca_cfg.cmci_disabled ^ !!new) { - if (new) { - /* disable cmci */ - on_each_cpu(mce_disable_cmci, NULL, 1); - mca_cfg.cmci_disabled = true; - } else { - /* enable cmci */ - mca_cfg.cmci_disabled = false; - on_each_cpu(mce_enable_ce, NULL, 1); - } - } - mutex_unlock(&mce_sysfs_mutex); - - return size; -} - -static ssize_t store_int_with_restart(struct device *s, - struct device_attribute *attr, - const char *buf, size_t size) -{ - unsigned long old_check_interval = check_interval; - ssize_t ret = device_store_ulong(s, attr, buf, size); - - if (check_interval == old_check_interval) - return ret; - - mutex_lock(&mce_sysfs_mutex); - mce_restart(); - mutex_unlock(&mce_sysfs_mutex); - - return ret; -} - -static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); -static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); -static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); - -static struct dev_ext_attribute dev_attr_check_interval = { - __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), - &check_interval -}; - -static struct dev_ext_attribute dev_attr_ignore_ce = { - __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce), - &mca_cfg.ignore_ce -}; - -static struct dev_ext_attribute dev_attr_cmci_disabled = { - __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled), - &mca_cfg.cmci_disabled -}; - -static struct device_attribute *mce_device_attrs[] = { - &dev_attr_tolerant.attr, - &dev_attr_check_interval.attr, -#ifdef CONFIG_X86_MCELOG_LEGACY - &dev_attr_trigger, -#endif - &dev_attr_monarch_timeout.attr, - &dev_attr_dont_log_ce.attr, - &dev_attr_ignore_ce.attr, - &dev_attr_cmci_disabled.attr, - NULL -}; - -static cpumask_var_t mce_device_initialized; - -static void mce_device_release(struct device *dev) -{ - kfree(dev); -} - -/* Per cpu device init. All of the cpus still share the same ctrl bank: */ -static int mce_device_create(unsigned int cpu) -{ - struct device *dev; - int err; - int i, j; - - if (!mce_available(&boot_cpu_data)) - return -EIO; - - dev = per_cpu(mce_device, cpu); - if (dev) - return 0; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return -ENOMEM; - dev->id = cpu; - dev->bus = &mce_subsys; - dev->release = &mce_device_release; - - err = device_register(dev); - if (err) { - put_device(dev); - return err; - } - - for (i = 0; mce_device_attrs[i]; i++) { - err = device_create_file(dev, mce_device_attrs[i]); - if (err) - goto error; - } - for (j = 0; j < mca_cfg.banks; j++) { - err = device_create_file(dev, &mce_banks[j].attr); - if (err) - goto error2; - } - cpumask_set_cpu(cpu, mce_device_initialized); - per_cpu(mce_device, cpu) = dev; - - return 0; -error2: - while (--j >= 0) - device_remove_file(dev, &mce_banks[j].attr); -error: - while (--i >= 0) - device_remove_file(dev, mce_device_attrs[i]); - - device_unregister(dev); - - return err; -} - -static void mce_device_remove(unsigned int cpu) -{ - struct device *dev = per_cpu(mce_device, cpu); - int i; - - if (!cpumask_test_cpu(cpu, mce_device_initialized)) - return; - - for (i = 0; mce_device_attrs[i]; i++) - device_remove_file(dev, mce_device_attrs[i]); - - for (i = 0; i < mca_cfg.banks; i++) - device_remove_file(dev, &mce_banks[i].attr); - - device_unregister(dev); - cpumask_clear_cpu(cpu, mce_device_initialized); - per_cpu(mce_device, cpu) = NULL; -} - -/* Make sure there are no machine checks on offlined CPUs. */ -static void mce_disable_cpu(void) -{ - if (!mce_available(raw_cpu_ptr(&cpu_info))) - return; - - if (!cpuhp_tasks_frozen) - cmci_clear(); - - vendor_disable_error_reporting(); -} - -static void mce_reenable_cpu(void) -{ - int i; - - if (!mce_available(raw_cpu_ptr(&cpu_info))) - return; - - if (!cpuhp_tasks_frozen) - cmci_reenable(); - for (i = 0; i < mca_cfg.banks; i++) { - struct mce_bank *b = &mce_banks[i]; - - if (b->init) - wrmsrl(msr_ops.ctl(i), b->ctl); - } -} - -static int mce_cpu_dead(unsigned int cpu) -{ - mce_intel_hcpu_update(cpu); - - /* intentionally ignoring frozen here */ - if (!cpuhp_tasks_frozen) - cmci_rediscover(); - return 0; -} - -static int mce_cpu_online(unsigned int cpu) -{ - struct timer_list *t = this_cpu_ptr(&mce_timer); - int ret; - - mce_device_create(cpu); - - ret = mce_threshold_create_device(cpu); - if (ret) { - mce_device_remove(cpu); - return ret; - } - mce_reenable_cpu(); - mce_start_timer(t); - return 0; -} - -static int mce_cpu_pre_down(unsigned int cpu) -{ - struct timer_list *t = this_cpu_ptr(&mce_timer); - - mce_disable_cpu(); - del_timer_sync(t); - mce_threshold_remove_device(cpu); - mce_device_remove(cpu); - return 0; -} - -static __init void mce_init_banks(void) -{ - int i; - - for (i = 0; i < mca_cfg.banks; i++) { - struct mce_bank *b = &mce_banks[i]; - struct device_attribute *a = &b->attr; - - sysfs_attr_init(&a->attr); - a->attr.name = b->attrname; - snprintf(b->attrname, ATTR_LEN, "bank%d", i); - - a->attr.mode = 0644; - a->show = show_bank; - a->store = set_bank; - } -} - -static __init int mcheck_init_device(void) -{ - int err; - - /* - * Check if we have a spare virtual bit. This will only become - * a problem if/when we move beyond 5-level page tables. - */ - MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63); - - if (!mce_available(&boot_cpu_data)) { - err = -EIO; - goto err_out; - } - - if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) { - err = -ENOMEM; - goto err_out; - } - - mce_init_banks(); - - err = subsys_system_register(&mce_subsys, NULL); - if (err) - goto err_out_mem; - - err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL, - mce_cpu_dead); - if (err) - goto err_out_mem; - - err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online", - mce_cpu_online, mce_cpu_pre_down); - if (err < 0) - goto err_out_online; - - register_syscore_ops(&mce_syscore_ops); - - return 0; - -err_out_online: - cpuhp_remove_state(CPUHP_X86_MCE_DEAD); - -err_out_mem: - free_cpumask_var(mce_device_initialized); - -err_out: - pr_err("Unable to init MCE device (rc: %d)\n", err); - - return err; -} -device_initcall_sync(mcheck_init_device); - -/* - * Old style boot options parsing. Only for compatibility. - */ -static int __init mcheck_disable(char *str) -{ - mca_cfg.disabled = 1; - return 1; -} -__setup("nomce", mcheck_disable); - -#ifdef CONFIG_DEBUG_FS -struct dentry *mce_get_debugfs_dir(void) -{ - static struct dentry *dmce; - - if (!dmce) - dmce = debugfs_create_dir("mce", NULL); - - return dmce; -} - -static void mce_reset(void) -{ - cpu_missing = 0; - atomic_set(&mce_fake_panicked, 0); - atomic_set(&mce_executing, 0); - atomic_set(&mce_callin, 0); - atomic_set(&global_nwo, 0); -} - -static int fake_panic_get(void *data, u64 *val) -{ - *val = fake_panic; - return 0; -} - -static int fake_panic_set(void *data, u64 val) -{ - mce_reset(); - fake_panic = val; - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, - fake_panic_set, "%llu\n"); - -static int __init mcheck_debugfs_init(void) -{ - struct dentry *dmce, *ffake_panic; - - dmce = mce_get_debugfs_dir(); - if (!dmce) - return -ENOMEM; - ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, - &fake_panic_fops); - if (!ffake_panic) - return -ENOMEM; - - return 0; -} -#else -static int __init mcheck_debugfs_init(void) { return -EINVAL; } -#endif - -DEFINE_STATIC_KEY_FALSE(mcsafe_key); -EXPORT_SYMBOL_GPL(mcsafe_key); - -static int __init mcheck_late_init(void) -{ - if (mca_cfg.recovery) - static_branch_inc(&mcsafe_key); - - mcheck_debugfs_init(); - cec_init(); - - /* - * Flush out everything that has been logged during early boot, now that - * everything has been initialized (workqueues, decoders, ...). - */ - mce_schedule_work(); - - return 0; -} -late_initcall(mcheck_late_init); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1,1437 +0,0 @@ -/* - * (c) 2005-2016 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - * - * Written by Jacob Shin - AMD, Inc. - * Maintained by: Borislav Petkov <bp@alien8.de> - * - * All MC4_MISCi registers are shared between cores on a node. - */ -#include <linux/interrupt.h> -#include <linux/notifier.h> -#include <linux/kobject.h> -#include <linux/percpu.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/sysfs.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/cpu.h> -#include <linux/smp.h> -#include <linux/string.h> - -#include <asm/amd_nb.h> -#include <asm/apic.h> -#include <asm/mce.h> -#include <asm/msr.h> -#include <asm/trace/irq_vectors.h> - -#include "mce-internal.h" - -#define NR_BLOCKS 5 -#define THRESHOLD_MAX 0xFFF -#define INT_TYPE_APIC 0x00020000 -#define MASK_VALID_HI 0x80000000 -#define MASK_CNTP_HI 0x40000000 -#define MASK_LOCKED_HI 0x20000000 -#define MASK_LVTOFF_HI 0x00F00000 -#define MASK_COUNT_EN_HI 0x00080000 -#define MASK_INT_TYPE_HI 0x00060000 -#define MASK_OVERFLOW_HI 0x00010000 -#define MASK_ERR_COUNT_HI 0x00000FFF -#define MASK_BLKPTR_LO 0xFF000000 -#define MCG_XBLK_ADDR 0xC0000400 - -/* Deferred error settings */ -#define MSR_CU_DEF_ERR 0xC0000410 -#define MASK_DEF_LVTOFF 0x000000F0 -#define MASK_DEF_INT_TYPE 0x00000006 -#define DEF_LVT_OFF 0x2 -#define DEF_INT_TYPE_APIC 0x2 - -/* Scalable MCA: */ - -/* Threshold LVT offset is at MSR0xC0000410[15:12] */ -#define SMCA_THR_LVT_OFF 0xF000 - -static bool thresholding_irq_en; - -static const char * const th_names[] = { - "load_store", - "insn_fetch", - "combined_unit", - "decode_unit", - "northbridge", - "execution_unit", -}; - -static const char * const smca_umc_block_names[] = { - "dram_ecc", - "misc_umc" -}; - -struct smca_bank_name { - const char *name; /* Short name for sysfs */ - const char *long_name; /* Long name for pretty-printing */ -}; - -static struct smca_bank_name smca_names[] = { - [SMCA_LS] = { "load_store", "Load Store Unit" }, - [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, - [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, - [SMCA_DE] = { "decode_unit", "Decode Unit" }, - [SMCA_RESERVED] = { "reserved", "Reserved" }, - [SMCA_EX] = { "execution_unit", "Execution Unit" }, - [SMCA_FP] = { "floating_point", "Floating Point Unit" }, - [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, - [SMCA_CS] = { "coherent_slave", "Coherent Slave" }, - [SMCA_PIE] = { "pie", "Power, Interrupts, etc." }, - [SMCA_UMC] = { "umc", "Unified Memory Controller" }, - [SMCA_PB] = { "param_block", "Parameter Block" }, - [SMCA_PSP] = { "psp", "Platform Security Processor" }, - [SMCA_SMU] = { "smu", "System Management Unit" }, -}; - -static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init = -{ - [0 ... MAX_NR_BANKS - 1] = { [0 ... NR_BLOCKS - 1] = -1 } -}; - -const char *smca_get_name(enum smca_bank_types t) -{ - if (t >= N_SMCA_BANK_TYPES) - return NULL; - - return smca_names[t].name; -} - -const char *smca_get_long_name(enum smca_bank_types t) -{ - if (t >= N_SMCA_BANK_TYPES) - return NULL; - - return smca_names[t].long_name; -} -EXPORT_SYMBOL_GPL(smca_get_long_name); - -static enum smca_bank_types smca_get_bank_type(unsigned int bank) -{ - struct smca_bank *b; - - if (bank >= MAX_NR_BANKS) - return N_SMCA_BANK_TYPES; - - b = &smca_banks[bank]; - if (!b->hwid) - return N_SMCA_BANK_TYPES; - - return b->hwid->bank_type; -} - -static struct smca_hwid smca_hwid_mcatypes[] = { - /* { bank_type, hwid_mcatype, xec_bitmap } */ - - /* Reserved type */ - { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0), 0x0 }, - - /* ZN Core (HWID=0xB0) MCA types */ - { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF }, - { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF }, - { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF }, - { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF }, - /* HWID 0xB0 MCATYPE 0x4 is Reserved */ - { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0x7FF }, - { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F }, - { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF }, - - /* Data Fabric MCA types */ - { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF }, - { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0xF }, - - /* Unified Memory Controller MCA type */ - { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0x3F }, - - /* Parameter Block MCA type */ - { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 }, - - /* Platform Security Processor MCA type */ - { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 }, - - /* System Management Unit MCA type */ - { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 }, -}; - -struct smca_bank smca_banks[MAX_NR_BANKS]; -EXPORT_SYMBOL_GPL(smca_banks); - -/* - * In SMCA enabled processors, we can have multiple banks for a given IP type. - * So to define a unique name for each bank, we use a temp c-string to append - * the MCA_IPID[InstanceId] to type's name in get_name(). - * - * InstanceId is 32 bits which is 8 characters. Make sure MAX_MCATYPE_NAME_LEN - * is greater than 8 plus 1 (for underscore) plus length of longest type name. - */ -#define MAX_MCATYPE_NAME_LEN 30 -static char buf_mcatype[MAX_MCATYPE_NAME_LEN]; - -static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); -static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */ - -static void amd_threshold_interrupt(void); -static void amd_deferred_error_interrupt(void); - -static void default_deferred_error_interrupt(void) -{ - pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR); -} -void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; - -static void smca_configure(unsigned int bank, unsigned int cpu) -{ - unsigned int i, hwid_mcatype; - struct smca_hwid *s_hwid; - u32 high, low; - u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank); - - /* Set appropriate bits in MCA_CONFIG */ - if (!rdmsr_safe(smca_config, &low, &high)) { - /* - * OS is required to set the MCAX bit to acknowledge that it is - * now using the new MSR ranges and new registers under each - * bank. It also means that the OS will configure deferred - * errors in the new MCx_CONFIG register. If the bit is not set, - * uncorrectable errors will cause a system panic. - * - * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.) - */ - high |= BIT(0); - - /* - * SMCA sets the Deferred Error Interrupt type per bank. - * - * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us - * if the DeferredIntType bit field is available. - * - * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the - * high portion of the MSR). OS should set this to 0x1 to enable - * APIC based interrupt. First, check that no interrupt has been - * set. - */ - if ((low & BIT(5)) && !((high >> 5) & 0x3)) - high |= BIT(5); - - wrmsr(smca_config, low, high); - } - - /* Return early if this bank was already initialized. */ - if (smca_banks[bank].hwid) - return; - - if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { - pr_warn("Failed to read MCA_IPID for bank %d\n", bank); - return; - } - - hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID, - (high & MCI_IPID_MCATYPE) >> 16); - - for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { - s_hwid = &smca_hwid_mcatypes[i]; - if (hwid_mcatype == s_hwid->hwid_mcatype) { - smca_banks[bank].hwid = s_hwid; - smca_banks[bank].id = low; - smca_banks[bank].sysfs_id = s_hwid->count++; - break; - } - } -} - -struct thresh_restart { - struct threshold_block *b; - int reset; - int set_lvt_off; - int lvt_off; - u16 old_limit; -}; - -static inline bool is_shared_bank(int bank) -{ - /* - * Scalable MCA provides for only one core to have access to the MSRs of - * a shared bank. - */ - if (mce_flags.smca) - return false; - - /* Bank 4 is for northbridge reporting and is thus shared */ - return (bank == 4); -} - -static const char *bank4_names(const struct threshold_block *b) -{ - switch (b->address) { - /* MSR4_MISC0 */ - case 0x00000413: - return "dram"; - - case 0xc0000408: - return "ht_links"; - - case 0xc0000409: - return "l3_cache"; - - default: - WARN(1, "Funny MSR: 0x%08x\n", b->address); - return ""; - } -}; - - -static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) -{ - /* - * bank 4 supports APIC LVT interrupts implicitly since forever. - */ - if (bank == 4) - return true; - - /* - * IntP: interrupt present; if this bit is set, the thresholding - * bank can generate APIC LVT interrupts - */ - return msr_high_bits & BIT(28); -} - -static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) -{ - int msr = (hi & MASK_LVTOFF_HI) >> 20; - - if (apic < 0) { - pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt " - "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, - b->bank, b->block, b->address, hi, lo); - return 0; - } - - if (apic != msr) { - /* - * On SMCA CPUs, LVT offset is programmed at a different MSR, and - * the BIOS provides the value. The original field where LVT offset - * was set is reserved. Return early here: - */ - if (mce_flags.smca) - return 0; - - pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " - "for bank %d, block %d (MSR%08X=0x%x%08x)\n", - b->cpu, apic, b->bank, b->block, b->address, hi, lo); - return 0; - } - - return 1; -}; - -/* Reprogram MCx_MISC MSR behind this threshold bank. */ -static void threshold_restart_bank(void *_tr) -{ - struct thresh_restart *tr = _tr; - u32 hi, lo; - - rdmsr(tr->b->address, lo, hi); - - if (tr->b->threshold_limit < (hi & THRESHOLD_MAX)) - tr->reset = 1; /* limit cannot be lower than err count */ - - if (tr->reset) { /* reset err count and overflow bit */ - hi = - (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | - (THRESHOLD_MAX - tr->b->threshold_limit); - } else if (tr->old_limit) { /* change limit w/o reset */ - int new_count = (hi & THRESHOLD_MAX) + - (tr->old_limit - tr->b->threshold_limit); - - hi = (hi & ~MASK_ERR_COUNT_HI) | - (new_count & THRESHOLD_MAX); - } - - /* clear IntType */ - hi &= ~MASK_INT_TYPE_HI; - - if (!tr->b->interrupt_capable) - goto done; - - if (tr->set_lvt_off) { - if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) { - /* set new lvt offset */ - hi &= ~MASK_LVTOFF_HI; - hi |= tr->lvt_off << 20; - } - } - - if (tr->b->interrupt_enable) - hi |= INT_TYPE_APIC; - - done: - - hi |= MASK_COUNT_EN_HI; - wrmsr(tr->b->address, lo, hi); -} - -static void mce_threshold_block_init(struct threshold_block *b, int offset) -{ - struct thresh_restart tr = { - .b = b, - .set_lvt_off = 1, - .lvt_off = offset, - }; - - b->threshold_limit = THRESHOLD_MAX; - threshold_restart_bank(&tr); -}; - -static int setup_APIC_mce_threshold(int reserved, int new) -{ - if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR, - APIC_EILVT_MSG_FIX, 0)) - return new; - - return reserved; -} - -static int setup_APIC_deferred_error(int reserved, int new) -{ - if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR, - APIC_EILVT_MSG_FIX, 0)) - return new; - - return reserved; -} - -static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) -{ - u32 low = 0, high = 0; - int def_offset = -1, def_new; - - if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high)) - return; - - def_new = (low & MASK_DEF_LVTOFF) >> 4; - if (!(low & MASK_DEF_LVTOFF)) { - pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n"); - def_new = DEF_LVT_OFF; - low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4); - } - - def_offset = setup_APIC_deferred_error(def_offset, def_new); - if ((def_offset == def_new) && - (deferred_error_int_vector != amd_deferred_error_interrupt)) - deferred_error_int_vector = amd_deferred_error_interrupt; - - if (!mce_flags.smca) - low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; - - wrmsr(MSR_CU_DEF_ERR, low, high); -} - -static u32 smca_get_block_address(unsigned int bank, unsigned int block) -{ - u32 low, high; - u32 addr = 0; - - if (smca_get_bank_type(bank) == SMCA_RESERVED) - return addr; - - if (!block) - return MSR_AMD64_SMCA_MCx_MISC(bank); - - /* Check our cache first: */ - if (smca_bank_addrs[bank][block] != -1) - return smca_bank_addrs[bank][block]; - - /* - * For SMCA enabled processors, BLKPTR field of the first MISC register - * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). - */ - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) - goto out; - - if (!(low & MCI_CONFIG_MCAX)) - goto out; - - if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && - (low & MASK_BLKPTR_LO)) - addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); - -out: - smca_bank_addrs[bank][block] = addr; - return addr; -} - -static u32 get_block_address(u32 current_addr, u32 low, u32 high, - unsigned int bank, unsigned int block) -{ - u32 addr = 0, offset = 0; - - if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS)) - return addr; - - if (mce_flags.smca) - return smca_get_block_address(bank, block); - - /* Fall back to method we used for older processors: */ - switch (block) { - case 0: - addr = msr_ops.misc(bank); - break; - case 1: - offset = ((low & MASK_BLKPTR_LO) >> 21); - if (offset) - addr = MCG_XBLK_ADDR + offset; - break; - default: - addr = ++current_addr; - } - return addr; -} - -static int -prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, - int offset, u32 misc_high) -{ - unsigned int cpu = smp_processor_id(); - u32 smca_low, smca_high; - struct threshold_block b; - int new; - - if (!block) - per_cpu(bank_map, cpu) |= (1 << bank); - - memset(&b, 0, sizeof(b)); - b.cpu = cpu; - b.bank = bank; - b.block = block; - b.address = addr; - b.interrupt_capable = lvt_interrupt_supported(bank, misc_high); - - if (!b.interrupt_capable) - goto done; - - b.interrupt_enable = 1; - - if (!mce_flags.smca) { - new = (misc_high & MASK_LVTOFF_HI) >> 20; - goto set_offset; - } - - /* Gather LVT offset for thresholding: */ - if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) - goto out; - - new = (smca_low & SMCA_THR_LVT_OFF) >> 12; - -set_offset: - offset = setup_APIC_mce_threshold(offset, new); - if (offset == new) - thresholding_irq_en = true; - -done: - mce_threshold_block_init(&b, offset); - -out: - return offset; -} - -/* cpu init entry point, called from mce.c with preempt off */ -void mce_amd_feature_init(struct cpuinfo_x86 *c) -{ - u32 low = 0, high = 0, address = 0; - unsigned int bank, block, cpu = smp_processor_id(); - int offset = -1; - - for (bank = 0; bank < mca_cfg.banks; ++bank) { - if (mce_flags.smca) - smca_configure(bank, cpu); - - for (block = 0; block < NR_BLOCKS; ++block) { - address = get_block_address(address, low, high, bank, block); - if (!address) - break; - - if (rdmsr_safe(address, &low, &high)) - break; - - if (!(high & MASK_VALID_HI)) - continue; - - if (!(high & MASK_CNTP_HI) || - (high & MASK_LOCKED_HI)) - continue; - - offset = prepare_threshold_block(bank, block, address, offset, high); - } - } - - if (mce_flags.succor) - deferred_error_interrupt_enable(c); -} - -int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) -{ - u64 dram_base_addr, dram_limit_addr, dram_hole_base; - /* We start from the normalized address */ - u64 ret_addr = norm_addr; - - u32 tmp; - - u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask; - u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets; - u8 intlv_addr_sel, intlv_addr_bit; - u8 num_intlv_bits, hashed_bit; - u8 lgcy_mmio_hole_en, base = 0; - u8 cs_mask, cs_id = 0; - bool hash_enabled = false; - - /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */ - if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp)) - goto out_err; - - /* Remove HiAddrOffset from normalized address, if enabled: */ - if (tmp & BIT(0)) { - u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8; - - if (norm_addr >= hi_addr_offset) { - ret_addr -= hi_addr_offset; - base = 1; - } - } - - /* Read D18F0x110 (DramBaseAddress). */ - if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp)) - goto out_err; - - /* Check if address range is valid. */ - if (!(tmp & BIT(0))) { - pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n", - __func__, tmp); - goto out_err; - } - - lgcy_mmio_hole_en = tmp & BIT(1); - intlv_num_chan = (tmp >> 4) & 0xF; - intlv_addr_sel = (tmp >> 8) & 0x7; - dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16; - - /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */ - if (intlv_addr_sel > 3) { - pr_err("%s: Invalid interleave address select %d.\n", - __func__, intlv_addr_sel); - goto out_err; - } - - /* Read D18F0x114 (DramLimitAddress). */ - if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp)) - goto out_err; - - intlv_num_sockets = (tmp >> 8) & 0x1; - intlv_num_dies = (tmp >> 10) & 0x3; - dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0); - - intlv_addr_bit = intlv_addr_sel + 8; - - /* Re-use intlv_num_chan by setting it equal to log2(#channels) */ - switch (intlv_num_chan) { - case 0: intlv_num_chan = 0; break; - case 1: intlv_num_chan = 1; break; - case 3: intlv_num_chan = 2; break; - case 5: intlv_num_chan = 3; break; - case 7: intlv_num_chan = 4; break; - - case 8: intlv_num_chan = 1; - hash_enabled = true; - break; - default: - pr_err("%s: Invalid number of interleaved channels %d.\n", - __func__, intlv_num_chan); - goto out_err; - } - - num_intlv_bits = intlv_num_chan; - - if (intlv_num_dies > 2) { - pr_err("%s: Invalid number of interleaved nodes/dies %d.\n", - __func__, intlv_num_dies); - goto out_err; - } - - num_intlv_bits += intlv_num_dies; - - /* Add a bit if sockets are interleaved. */ - num_intlv_bits += intlv_num_sockets; - - /* Assert num_intlv_bits <= 4 */ - if (num_intlv_bits > 4) { - pr_err("%s: Invalid interleave bits %d.\n", - __func__, num_intlv_bits); - goto out_err; - } - - if (num_intlv_bits > 0) { - u64 temp_addr_x, temp_addr_i, temp_addr_y; - u8 die_id_bit, sock_id_bit, cs_fabric_id; - - /* - * Read FabricBlockInstanceInformation3_CS[BlockFabricID]. - * This is the fabric id for this coherent slave. Use - * umc/channel# as instance id of the coherent slave - * for FICAA. - */ - if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp)) - goto out_err; - - cs_fabric_id = (tmp >> 8) & 0xFF; - die_id_bit = 0; - - /* If interleaved over more than 1 channel: */ - if (intlv_num_chan) { - die_id_bit = intlv_num_chan; - cs_mask = (1 << die_id_bit) - 1; - cs_id = cs_fabric_id & cs_mask; - } - - sock_id_bit = die_id_bit; - - /* Read D18F1x208 (SystemFabricIdMask). */ - if (intlv_num_dies || intlv_num_sockets) - if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp)) - goto out_err; - - /* If interleaved over more than 1 die. */ - if (intlv_num_dies) { - sock_id_bit = die_id_bit + intlv_num_dies; - die_id_shift = (tmp >> 24) & 0xF; - die_id_mask = (tmp >> 8) & 0xFF; - - cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit; - } - - /* If interleaved over more than 1 socket. */ - if (intlv_num_sockets) { - socket_id_shift = (tmp >> 28) & 0xF; - socket_id_mask = (tmp >> 16) & 0xFF; - - cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit; - } - - /* - * The pre-interleaved address consists of XXXXXXIIIYYYYY - * where III is the ID for this CS, and XXXXXXYYYYY are the - * address bits from the post-interleaved address. - * "num_intlv_bits" has been calculated to tell us how many "I" - * bits there are. "intlv_addr_bit" tells us how many "Y" bits - * there are (where "I" starts). - */ - temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0); - temp_addr_i = (cs_id << intlv_addr_bit); - temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits; - ret_addr = temp_addr_x | temp_addr_i | temp_addr_y; - } - - /* Add dram base address */ - ret_addr += dram_base_addr; - - /* If legacy MMIO hole enabled */ - if (lgcy_mmio_hole_en) { - if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp)) - goto out_err; - - dram_hole_base = tmp & GENMASK(31, 24); - if (ret_addr >= dram_hole_base) - ret_addr += (BIT_ULL(32) - dram_hole_base); - } - - if (hash_enabled) { - /* Save some parentheses and grab ls-bit at the end. */ - hashed_bit = (ret_addr >> 12) ^ - (ret_addr >> 18) ^ - (ret_addr >> 21) ^ - (ret_addr >> 30) ^ - cs_id; - - hashed_bit &= BIT(0); - - if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0))) - ret_addr ^= BIT(intlv_addr_bit); - } - - /* Is calculated system address is above DRAM limit address? */ - if (ret_addr > dram_limit_addr) - goto out_err; - - *sys_addr = ret_addr; - return 0; - -out_err: - return -EINVAL; -} -EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); - -bool amd_mce_is_memory_error(struct mce *m) -{ - /* ErrCodeExt[20:16] */ - u8 xec = (m->status >> 16) & 0x1f; - - if (mce_flags.smca) - return smca_get_bank_type(m->bank) == SMCA_UMC && xec == 0x0; - - return m->bank == 4 && xec == 0x8; -} - -static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) -{ - struct mce m; - - mce_setup(&m); - - m.status = status; - m.misc = misc; - m.bank = bank; - m.tsc = rdtsc(); - - if (m.status & MCI_STATUS_ADDRV) { - m.addr = addr; - - /* - * Extract [55:<lsb>] where lsb is the least significant - * *valid* bit of the address bits. - */ - if (mce_flags.smca) { - u8 lsb = (m.addr >> 56) & 0x3f; - - m.addr &= GENMASK_ULL(55, lsb); - } - } - - if (mce_flags.smca) { - rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid); - - if (m.status & MCI_STATUS_SYNDV) - rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd); - } - - mce_log(&m); -} - -asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void) -{ - entering_irq(); - trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); - inc_irq_stat(irq_deferred_error_count); - deferred_error_int_vector(); - trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); - exiting_ack_irq(); -} - -/* - * Returns true if the logged error is deferred. False, otherwise. - */ -static inline bool -_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) -{ - u64 status, addr = 0; - - rdmsrl(msr_stat, status); - if (!(status & MCI_STATUS_VAL)) - return false; - - if (status & MCI_STATUS_ADDRV) - rdmsrl(msr_addr, addr); - - __log_error(bank, status, addr, misc); - - wrmsrl(msr_stat, 0); - - return status & MCI_STATUS_DEFERRED; -} - -/* - * We have three scenarios for checking for Deferred errors: - * - * 1) Non-SMCA systems check MCA_STATUS and log error if found. - * 2) SMCA systems check MCA_STATUS. If error is found then log it and also - * clear MCA_DESTAT. - * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and - * log it. - */ -static void log_error_deferred(unsigned int bank) -{ - bool defrd; - - defrd = _log_error_bank(bank, msr_ops.status(bank), - msr_ops.addr(bank), 0); - - if (!mce_flags.smca) - return; - - /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */ - if (defrd) { - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); - return; - } - - /* - * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check - * for a valid error. - */ - _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank), - MSR_AMD64_SMCA_MCx_DEADDR(bank), 0); -} - -/* APIC interrupt handler for deferred errors */ -static void amd_deferred_error_interrupt(void) -{ - unsigned int bank; - - for (bank = 0; bank < mca_cfg.banks; ++bank) - log_error_deferred(bank); -} - -static void log_error_thresholding(unsigned int bank, u64 misc) -{ - _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc); -} - -static void log_and_reset_block(struct threshold_block *block) -{ - struct thresh_restart tr; - u32 low = 0, high = 0; - - if (!block) - return; - - if (rdmsr_safe(block->address, &low, &high)) - return; - - if (!(high & MASK_OVERFLOW_HI)) - return; - - /* Log the MCE which caused the threshold event. */ - log_error_thresholding(block->bank, ((u64)high << 32) | low); - - /* Reset threshold block after logging error. */ - memset(&tr, 0, sizeof(tr)); - tr.b = block; - threshold_restart_bank(&tr); -} - -/* - * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt - * goes off when error_count reaches threshold_limit. - */ -static void amd_threshold_interrupt(void) -{ - struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; - unsigned int bank, cpu = smp_processor_id(); - - for (bank = 0; bank < mca_cfg.banks; ++bank) { - if (!(per_cpu(bank_map, cpu) & (1 << bank))) - continue; - - first_block = per_cpu(threshold_banks, cpu)[bank]->blocks; - if (!first_block) - continue; - - /* - * The first block is also the head of the list. Check it first - * before iterating over the rest. - */ - log_and_reset_block(first_block); - list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj) - log_and_reset_block(block); - } -} - -/* - * Sysfs Interface - */ - -struct threshold_attr { - struct attribute attr; - ssize_t (*show) (struct threshold_block *, char *); - ssize_t (*store) (struct threshold_block *, const char *, size_t count); -}; - -#define SHOW_FIELDS(name) \ -static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ -{ \ - return sprintf(buf, "%lu\n", (unsigned long) b->name); \ -} -SHOW_FIELDS(interrupt_enable) -SHOW_FIELDS(threshold_limit) - -static ssize_t -store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) -{ - struct thresh_restart tr; - unsigned long new; - - if (!b->interrupt_capable) - return -EINVAL; - - if (kstrtoul(buf, 0, &new) < 0) - return -EINVAL; - - b->interrupt_enable = !!new; - - memset(&tr, 0, sizeof(tr)); - tr.b = b; - - smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - - return size; -} - -static ssize_t -store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) -{ - struct thresh_restart tr; - unsigned long new; - - if (kstrtoul(buf, 0, &new) < 0) - return -EINVAL; - - if (new > THRESHOLD_MAX) - new = THRESHOLD_MAX; - if (new < 1) - new = 1; - - memset(&tr, 0, sizeof(tr)); - tr.old_limit = b->threshold_limit; - b->threshold_limit = new; - tr.b = b; - - smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - - return size; -} - -static ssize_t show_error_count(struct threshold_block *b, char *buf) -{ - u32 lo, hi; - - rdmsr_on_cpu(b->cpu, b->address, &lo, &hi); - - return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) - - (THRESHOLD_MAX - b->threshold_limit))); -} - -static struct threshold_attr error_count = { - .attr = {.name = __stringify(error_count), .mode = 0444 }, - .show = show_error_count, -}; - -#define RW_ATTR(val) \ -static struct threshold_attr val = { \ - .attr = {.name = __stringify(val), .mode = 0644 }, \ - .show = show_## val, \ - .store = store_## val, \ -}; - -RW_ATTR(interrupt_enable); -RW_ATTR(threshold_limit); - -static struct attribute *default_attrs[] = { - &threshold_limit.attr, - &error_count.attr, - NULL, /* possibly interrupt_enable if supported, see below */ - NULL, -}; - -#define to_block(k) container_of(k, struct threshold_block, kobj) -#define to_attr(a) container_of(a, struct threshold_attr, attr) - -static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) -{ - struct threshold_block *b = to_block(kobj); - struct threshold_attr *a = to_attr(attr); - ssize_t ret; - - ret = a->show ? a->show(b, buf) : -EIO; - - return ret; -} - -static ssize_t store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t count) -{ - struct threshold_block *b = to_block(kobj); - struct threshold_attr *a = to_attr(attr); - ssize_t ret; - - ret = a->store ? a->store(b, buf, count) : -EIO; - - return ret; -} - -static const struct sysfs_ops threshold_ops = { - .show = show, - .store = store, -}; - -static struct kobj_type threshold_ktype = { - .sysfs_ops = &threshold_ops, - .default_attrs = default_attrs, -}; - -static const char *get_name(unsigned int bank, struct threshold_block *b) -{ - enum smca_bank_types bank_type; - - if (!mce_flags.smca) { - if (b && bank == 4) - return bank4_names(b); - - return th_names[bank]; - } - - bank_type = smca_get_bank_type(bank); - if (bank_type >= N_SMCA_BANK_TYPES) - return NULL; - - if (b && bank_type == SMCA_UMC) { - if (b->block < ARRAY_SIZE(smca_umc_block_names)) - return smca_umc_block_names[b->block]; - return NULL; - } - - if (smca_banks[bank].hwid->count == 1) - return smca_get_name(bank_type); - - snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, - "%s_%x", smca_get_name(bank_type), - smca_banks[bank].sysfs_id); - return buf_mcatype; -} - -static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, - unsigned int block, u32 address) -{ - struct threshold_block *b = NULL; - u32 low, high; - int err; - - if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS)) - return 0; - - if (rdmsr_safe_on_cpu(cpu, address, &low, &high)) - return 0; - - if (!(high & MASK_VALID_HI)) { - if (block) - goto recurse; - else - return 0; - } - - if (!(high & MASK_CNTP_HI) || - (high & MASK_LOCKED_HI)) - goto recurse; - - b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); - if (!b) - return -ENOMEM; - - b->block = block; - b->bank = bank; - b->cpu = cpu; - b->address = address; - b->interrupt_enable = 0; - b->interrupt_capable = lvt_interrupt_supported(bank, high); - b->threshold_limit = THRESHOLD_MAX; - - if (b->interrupt_capable) { - threshold_ktype.default_attrs[2] = &interrupt_enable.attr; - b->interrupt_enable = 1; - } else { - threshold_ktype.default_attrs[2] = NULL; - } - - INIT_LIST_HEAD(&b->miscj); - - if (per_cpu(threshold_banks, cpu)[bank]->blocks) { - list_add(&b->miscj, - &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); - } else { - per_cpu(threshold_banks, cpu)[bank]->blocks = b; - } - - err = kobject_init_and_add(&b->kobj, &threshold_ktype, - per_cpu(threshold_banks, cpu)[bank]->kobj, - get_name(bank, b)); - if (err) - goto out_free; -recurse: - address = get_block_address(address, low, high, bank, ++block); - if (!address) - return 0; - - err = allocate_threshold_blocks(cpu, bank, block, address); - if (err) - goto out_free; - - if (b) - kobject_uevent(&b->kobj, KOBJ_ADD); - - return err; - -out_free: - if (b) { - kobject_put(&b->kobj); - list_del(&b->miscj); - kfree(b); - } - return err; -} - -static int __threshold_add_blocks(struct threshold_bank *b) -{ - struct list_head *head = &b->blocks->miscj; - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - int err = 0; - - err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name); - if (err) - return err; - - list_for_each_entry_safe(pos, tmp, head, miscj) { - - err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name); - if (err) { - list_for_each_entry_safe_reverse(pos, tmp, head, miscj) - kobject_del(&pos->kobj); - - return err; - } - } - return err; -} - -static int threshold_create_bank(unsigned int cpu, unsigned int bank) -{ - struct device *dev = per_cpu(mce_device, cpu); - struct amd_northbridge *nb = NULL; - struct threshold_bank *b = NULL; - const char *name = get_name(bank, NULL); - int err = 0; - - if (!dev) - return -ENODEV; - - if (is_shared_bank(bank)) { - nb = node_to_amd_nb(amd_get_nb_id(cpu)); - - /* threshold descriptor already initialized on this node? */ - if (nb && nb->bank4) { - /* yes, use it */ - b = nb->bank4; - err = kobject_add(b->kobj, &dev->kobj, name); - if (err) - goto out; - - per_cpu(threshold_banks, cpu)[bank] = b; - refcount_inc(&b->cpus); - - err = __threshold_add_blocks(b); - - goto out; - } - } - - b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); - if (!b) { - err = -ENOMEM; - goto out; - } - - b->kobj = kobject_create_and_add(name, &dev->kobj); - if (!b->kobj) { - err = -EINVAL; - goto out_free; - } - - per_cpu(threshold_banks, cpu)[bank] = b; - - if (is_shared_bank(bank)) { - refcount_set(&b->cpus, 1); - - /* nb is already initialized, see above */ - if (nb) { - WARN_ON(nb->bank4); - nb->bank4 = b; - } - } - - err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank)); - if (!err) - goto out; - - out_free: - kfree(b); - - out: - return err; -} - -static void deallocate_threshold_block(unsigned int cpu, - unsigned int bank) -{ - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank]; - - if (!head) - return; - - list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { - kobject_put(&pos->kobj); - list_del(&pos->miscj); - kfree(pos); - } - - kfree(per_cpu(threshold_banks, cpu)[bank]->blocks); - per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; -} - -static void __threshold_remove_blocks(struct threshold_bank *b) -{ - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - - kobject_del(b->kobj); - - list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj) - kobject_del(&pos->kobj); -} - -static void threshold_remove_bank(unsigned int cpu, int bank) -{ - struct amd_northbridge *nb; - struct threshold_bank *b; - - b = per_cpu(threshold_banks, cpu)[bank]; - if (!b) - return; - - if (!b->blocks) - goto free_out; - - if (is_shared_bank(bank)) { - if (!refcount_dec_and_test(&b->cpus)) { - __threshold_remove_blocks(b); - per_cpu(threshold_banks, cpu)[bank] = NULL; - return; - } else { - /* - * the last CPU on this node using the shared bank is - * going away, remove that bank now. - */ - nb = node_to_amd_nb(amd_get_nb_id(cpu)); - nb->bank4 = NULL; - } - } - - deallocate_threshold_block(cpu, bank); - -free_out: - kobject_del(b->kobj); - kobject_put(b->kobj); - kfree(b); - per_cpu(threshold_banks, cpu)[bank] = NULL; -} - -int mce_threshold_remove_device(unsigned int cpu) -{ - unsigned int bank; - - for (bank = 0; bank < mca_cfg.banks; ++bank) { - if (!(per_cpu(bank_map, cpu) & (1 << bank))) - continue; - threshold_remove_bank(cpu, bank); - } - kfree(per_cpu(threshold_banks, cpu)); - per_cpu(threshold_banks, cpu) = NULL; - return 0; -} - -/* create dir/files for all valid threshold banks */ -int mce_threshold_create_device(unsigned int cpu) -{ - unsigned int bank; - struct threshold_bank **bp; - int err = 0; - - bp = per_cpu(threshold_banks, cpu); - if (bp) - return 0; - - bp = kcalloc(mca_cfg.banks, sizeof(struct threshold_bank *), - GFP_KERNEL); - if (!bp) - return -ENOMEM; - - per_cpu(threshold_banks, cpu) = bp; - - for (bank = 0; bank < mca_cfg.banks; ++bank) { - if (!(per_cpu(bank_map, cpu) & (1 << bank))) - continue; - err = threshold_create_bank(cpu, bank); - if (err) - goto err; - } - return err; -err: - mce_threshold_remove_device(cpu); - return err; -} - -static __init int threshold_init_device(void) -{ - unsigned lcpu = 0; - - /* to hit CPUs online before the notifier is up */ - for_each_online_cpu(lcpu) { - int err = mce_threshold_create_device(lcpu); - - if (err) - return err; - } - - if (thresholding_irq_en) - mce_threshold_vector = amd_threshold_interrupt; - - return 0; -} -/* - * there are 3 funcs which need to be _initcalled in a logic sequence: - * 1. xen_late_init_mcelog - * 2. mcheck_init_device - * 3. threshold_init_device - * - * xen_late_init_mcelog must register xen_mce_chrdev_device before - * native mce_chrdev_device registration if running under xen platform; - * - * mcheck_init_device should be inited before threshold_init_device to - * initialize mce_device, otherwise a NULL ptr dereference will cause panic. - * - * so we use following _initcalls - * 1. device_initcall(xen_late_init_mcelog); - * 2. device_initcall_sync(mcheck_init_device); - * 3. late_initcall(threshold_init_device); - * - * when running under xen, the initcall order is 1,2,3; - * on baremetal, we skip 1 and we do only 2 and 3. - */ -late_initcall(threshold_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -1,518 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Intel specific MCE features. - * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> - * Copyright (C) 2008, 2009 Intel Corporation - * Author: Andi Kleen - */ - -#include <linux/gfp.h> -#include <linux/interrupt.h> -#include <linux/percpu.h> -#include <linux/sched.h> -#include <linux/cpumask.h> -#include <asm/apic.h> -#include <asm/cpufeature.h> -#include <asm/intel-family.h> -#include <asm/processor.h> -#include <asm/msr.h> -#include <asm/mce.h> - -#include "mce-internal.h" - -/* - * Support for Intel Correct Machine Check Interrupts. This allows - * the CPU to raise an interrupt when a corrected machine check happened. - * Normally we pick those up using a regular polling timer. - * Also supports reliable discovery of shared banks. - */ - -/* - * CMCI can be delivered to multiple cpus that share a machine check bank - * so we need to designate a single cpu to process errors logged in each bank - * in the interrupt handler (otherwise we would have many races and potential - * double reporting of the same error). - * Note that this can change when a cpu is offlined or brought online since - * some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear() - * disables CMCI on all banks owned by the cpu and clears this bitfield. At - * this point, cmci_rediscover() kicks in and a different cpu may end up - * taking ownership of some of the shared MCA banks that were previously - * owned by the offlined cpu. - */ -static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); - -/* - * CMCI storm detection backoff counter - * - * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've - * encountered an error. If not, we decrement it by one. We signal the end of - * the CMCI storm when it reaches 0. - */ -static DEFINE_PER_CPU(int, cmci_backoff_cnt); - -/* - * cmci_discover_lock protects against parallel discovery attempts - * which could race against each other. - */ -static DEFINE_RAW_SPINLOCK(cmci_discover_lock); - -#define CMCI_THRESHOLD 1 -#define CMCI_POLL_INTERVAL (30 * HZ) -#define CMCI_STORM_INTERVAL (HZ) -#define CMCI_STORM_THRESHOLD 15 - -static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); -static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt); -static DEFINE_PER_CPU(unsigned int, cmci_storm_state); - -enum { - CMCI_STORM_NONE, - CMCI_STORM_ACTIVE, - CMCI_STORM_SUBSIDED, -}; - -static atomic_t cmci_storm_on_cpus; - -static int cmci_supported(int *banks) -{ - u64 cap; - - if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce) - return 0; - - /* - * Vendor check is not strictly needed, but the initial - * initialization is vendor keyed and this - * makes sure none of the backdoors are entered otherwise. - */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) - return 0; - if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) - return 0; - rdmsrl(MSR_IA32_MCG_CAP, cap); - *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); - return !!(cap & MCG_CMCI_P); -} - -static bool lmce_supported(void) -{ - u64 tmp; - - if (mca_cfg.lmce_disabled) - return false; - - rdmsrl(MSR_IA32_MCG_CAP, tmp); - - /* - * LMCE depends on recovery support in the processor. Hence both - * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP. - */ - if ((tmp & (MCG_SER_P | MCG_LMCE_P)) != - (MCG_SER_P | MCG_LMCE_P)) - return false; - - /* - * BIOS should indicate support for LMCE by setting bit 20 in - * IA32_FEATURE_CONTROL without which touching MCG_EXT_CTL will - * generate a #GP fault. - */ - rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp); - if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) == - (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) - return true; - - return false; -} - -bool mce_intel_cmci_poll(void) -{ - if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) - return false; - - /* - * Reset the counter if we've logged an error in the last poll - * during the storm. - */ - if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned))) - this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); - else - this_cpu_dec(cmci_backoff_cnt); - - return true; -} - -void mce_intel_hcpu_update(unsigned long cpu) -{ - if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE) - atomic_dec(&cmci_storm_on_cpus); - - per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; -} - -static void cmci_toggle_interrupt_mode(bool on) -{ - unsigned long flags, *owned; - int bank; - u64 val; - - raw_spin_lock_irqsave(&cmci_discover_lock, flags); - owned = this_cpu_ptr(mce_banks_owned); - for_each_set_bit(bank, owned, MAX_NR_BANKS) { - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); - - if (on) - val |= MCI_CTL2_CMCI_EN; - else - val &= ~MCI_CTL2_CMCI_EN; - - wrmsrl(MSR_IA32_MCx_CTL2(bank), val); - } - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); -} - -unsigned long cmci_intel_adjust_timer(unsigned long interval) -{ - if ((this_cpu_read(cmci_backoff_cnt) > 0) && - (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) { - mce_notify_irq(); - return CMCI_STORM_INTERVAL; - } - - switch (__this_cpu_read(cmci_storm_state)) { - case CMCI_STORM_ACTIVE: - - /* - * We switch back to interrupt mode once the poll timer has - * silenced itself. That means no events recorded and the timer - * interval is back to our poll interval. - */ - __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); - if (!atomic_sub_return(1, &cmci_storm_on_cpus)) - pr_notice("CMCI storm subsided: switching to interrupt mode\n"); - - /* FALLTHROUGH */ - - case CMCI_STORM_SUBSIDED: - /* - * We wait for all CPUs to go back to SUBSIDED state. When that - * happens we switch back to interrupt mode. - */ - if (!atomic_read(&cmci_storm_on_cpus)) { - __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); - cmci_toggle_interrupt_mode(true); - cmci_recheck(); - } - return CMCI_POLL_INTERVAL; - default: - - /* We have shiny weather. Let the poll do whatever it thinks. */ - return interval; - } -} - -static bool cmci_storm_detect(void) -{ - unsigned int cnt = __this_cpu_read(cmci_storm_cnt); - unsigned long ts = __this_cpu_read(cmci_time_stamp); - unsigned long now = jiffies; - int r; - - if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE) - return true; - - if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) { - cnt++; - } else { - cnt = 1; - __this_cpu_write(cmci_time_stamp, now); - } - __this_cpu_write(cmci_storm_cnt, cnt); - - if (cnt <= CMCI_STORM_THRESHOLD) - return false; - - cmci_toggle_interrupt_mode(false); - __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); - r = atomic_add_return(1, &cmci_storm_on_cpus); - mce_timer_kick(CMCI_STORM_INTERVAL); - this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); - - if (r == 1) - pr_notice("CMCI storm detected: switching to poll mode\n"); - return true; -} - -/* - * The interrupt handler. This is called on every event. - * Just call the poller directly to log any events. - * This could in theory increase the threshold under high load, - * but doesn't for now. - */ -static void intel_threshold_interrupt(void) -{ - if (cmci_storm_detect()) - return; - - machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); -} - -/* - * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks - * on this CPU. Use the algorithm recommended in the SDM to discover shared - * banks. - */ -static void cmci_discover(int banks) -{ - unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned); - unsigned long flags; - int i; - int bios_wrong_thresh = 0; - - raw_spin_lock_irqsave(&cmci_discover_lock, flags); - for (i = 0; i < banks; i++) { - u64 val; - int bios_zero_thresh = 0; - - if (test_bit(i, owned)) - continue; - - /* Skip banks in firmware first mode */ - if (test_bit(i, mce_banks_ce_disabled)) - continue; - - rdmsrl(MSR_IA32_MCx_CTL2(i), val); - - /* Already owned by someone else? */ - if (val & MCI_CTL2_CMCI_EN) { - clear_bit(i, owned); - __clear_bit(i, this_cpu_ptr(mce_poll_banks)); - continue; - } - - if (!mca_cfg.bios_cmci_threshold) { - val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; - val |= CMCI_THRESHOLD; - } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { - /* - * If bios_cmci_threshold boot option was specified - * but the threshold is zero, we'll try to initialize - * it to 1. - */ - bios_zero_thresh = 1; - val |= CMCI_THRESHOLD; - } - - val |= MCI_CTL2_CMCI_EN; - wrmsrl(MSR_IA32_MCx_CTL2(i), val); - rdmsrl(MSR_IA32_MCx_CTL2(i), val); - - /* Did the enable bit stick? -- the bank supports CMCI */ - if (val & MCI_CTL2_CMCI_EN) { - set_bit(i, owned); - __clear_bit(i, this_cpu_ptr(mce_poll_banks)); - /* - * We are able to set thresholds for some banks that - * had a threshold of 0. This means the BIOS has not - * set the thresholds properly or does not work with - * this boot option. Note down now and report later. - */ - if (mca_cfg.bios_cmci_threshold && bios_zero_thresh && - (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) - bios_wrong_thresh = 1; - } else { - WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks))); - } - } - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); - if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { - pr_info_once( - "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); - pr_info_once( - "bios_cmci_threshold: Make sure your BIOS supports this boot option\n"); - } -} - -/* - * Just in case we missed an event during initialization check - * all the CMCI owned banks. - */ -void cmci_recheck(void) -{ - unsigned long flags; - int banks; - - if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks)) - return; - - local_irq_save(flags); - machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)); - local_irq_restore(flags); -} - -/* Caller must hold the lock on cmci_discover_lock */ -static void __cmci_disable_bank(int bank) -{ - u64 val; - - if (!test_bit(bank, this_cpu_ptr(mce_banks_owned))) - return; - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); - val &= ~MCI_CTL2_CMCI_EN; - wrmsrl(MSR_IA32_MCx_CTL2(bank), val); - __clear_bit(bank, this_cpu_ptr(mce_banks_owned)); -} - -/* - * Disable CMCI on this CPU for all banks it owns when it goes down. - * This allows other CPUs to claim the banks on rediscovery. - */ -void cmci_clear(void) -{ - unsigned long flags; - int i; - int banks; - - if (!cmci_supported(&banks)) - return; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); - for (i = 0; i < banks; i++) - __cmci_disable_bank(i); - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); -} - -static void cmci_rediscover_work_func(void *arg) -{ - int banks; - - /* Recheck banks in case CPUs don't all have the same */ - if (cmci_supported(&banks)) - cmci_discover(banks); -} - -/* After a CPU went down cycle through all the others and rediscover */ -void cmci_rediscover(void) -{ - int banks; - - if (!cmci_supported(&banks)) - return; - - on_each_cpu(cmci_rediscover_work_func, NULL, 1); -} - -/* - * Reenable CMCI on this CPU in case a CPU down failed. - */ -void cmci_reenable(void) -{ - int banks; - if (cmci_supported(&banks)) - cmci_discover(banks); -} - -void cmci_disable_bank(int bank) -{ - int banks; - unsigned long flags; - - if (!cmci_supported(&banks)) - return; - - raw_spin_lock_irqsave(&cmci_discover_lock, flags); - __cmci_disable_bank(bank); - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); -} - -static void intel_init_cmci(void) -{ - int banks; - - if (!cmci_supported(&banks)) - return; - - mce_threshold_vector = intel_threshold_interrupt; - cmci_discover(banks); - /* - * For CPU #0 this runs with still disabled APIC, but that's - * ok because only the vector is set up. We still do another - * check for the banks later for CPU #0 just to make sure - * to not miss any events. - */ - apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); - cmci_recheck(); -} - -static void intel_init_lmce(void) -{ - u64 val; - - if (!lmce_supported()) - return; - - rdmsrl(MSR_IA32_MCG_EXT_CTL, val); - - if (!(val & MCG_EXT_CTL_LMCE_EN)) - wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); -} - -static void intel_clear_lmce(void) -{ - u64 val; - - if (!lmce_supported()) - return; - - rdmsrl(MSR_IA32_MCG_EXT_CTL, val); - val &= ~MCG_EXT_CTL_LMCE_EN; - wrmsrl(MSR_IA32_MCG_EXT_CTL, val); -} - -static void intel_ppin_init(struct cpuinfo_x86 *c) -{ - unsigned long long val; - - /* - * Even if testing the presence of the MSR would be enough, we don't - * want to risk the situation where other models reuse this MSR for - * other purposes. - */ - switch (c->x86_model) { - case INTEL_FAM6_IVYBRIDGE_X: - case INTEL_FAM6_HASWELL_X: - case INTEL_FAM6_BROADWELL_XEON_D: - case INTEL_FAM6_BROADWELL_X: - case INTEL_FAM6_SKYLAKE_X: - case INTEL_FAM6_XEON_PHI_KNL: - case INTEL_FAM6_XEON_PHI_KNM: - - if (rdmsrl_safe(MSR_PPIN_CTL, &val)) - return; - - if ((val & 3UL) == 1UL) { - /* PPIN available but disabled: */ - return; - } - - /* If PPIN is disabled, but not locked, try to enable: */ - if (!(val & 3UL)) { - wrmsrl_safe(MSR_PPIN_CTL, val | 2UL); - rdmsrl_safe(MSR_PPIN_CTL, &val); - } - - if ((val & 3UL) == 2UL) - set_cpu_cap(c, X86_FEATURE_INTEL_PPIN); - } -} - -void mce_intel_feature_init(struct cpuinfo_x86 *c) -{ - intel_init_thermal(c); - intel_init_cmci(); - intel_init_lmce(); - intel_ppin_init(c); -} - -void mce_intel_feature_clear(struct cpuinfo_x86 *c) -{ - intel_clear_lmce(); -} diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c @@ -1,69 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * P5 specific Machine Check Exception Reporting - * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> - */ -#include <linux/interrupt.h>