arch/x86/kernel/cpu/mcheck/mce.c

   1 /*
   2  * Machine check handler.
   3  *
   4  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   5  * Rest from unknown author(s).
   6  * 2004 Andi Kleen. Rewrote most of it.
   7  * Copyright 2008 Intel Corporation
   8  * Author: Andi Kleen
   9  */
  10
  11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  12
  13 #include <linux/thread_info.h>
  14 #include <linux/capability.h>
  15 #include <linux/miscdevice.h>
  16 #include <linux/ratelimit.h>
  17 #include <linux/kallsyms.h>
  18 #include <linux/rcupdate.h>
  19 #include <linux/kobject.h>
  20 #include <linux/uaccess.h>
  21 #include <linux/kdebug.h>
  22 #include <linux/kernel.h>
  23 #include <linux/percpu.h>
  24 #include <linux/string.h>
  25 #include <linux/device.h>
  26 #include <linux/syscore_ops.h>
  27 #include <linux/delay.h>
  28 #include <linux/ctype.h>
  29 #include <linux/sched.h>
  30 #include <linux/sysfs.h>
  31 #include <linux/types.h>
  32 #include <linux/slab.h>
  33 #include <linux/init.h>
  34 #include <linux/kmod.h>
  35 #include <linux/poll.h>
  36 #include <linux/nmi.h>
  37 #include <linux/cpu.h>
  38 #include <linux/ras.h>
  39 #include <linux/smp.h>
  40 #include <linux/fs.h>
  41 #include <linux/mm.h>
  42 #include <linux/debugfs.h>
  43 #include <linux/irq_work.h>
  44 #include <linux/export.h>
  45 #include <linux/jump_label.h>
  46
  47 #include <asm/intel-family.h>
  48 #include <asm/processor.h>
  49 #include <asm/traps.h>
  50 #include <asm/tlbflush.h>
  51 #include <asm/mce.h>
  52 #include <asm/msr.h>
  53 #include <asm/reboot.h>
  54 #include <asm/set_memory.h>
  55
  56 #include "mce-internal.h"
  57
  58 static DEFINE_MUTEX(mce_log_mutex);
  59
  60 #define CREATE_TRACE_POINTS
  61 #include <trace/events/mce.h>
  62
  63 #define SPINUNIT                100     /* 100ns */
  64
  65 DEFINE_PER_CPU(unsigned, mce_exception_count);
  66
  67 struct mce_bank *mce_banks __read_mostly;
  68 struct mce_vendor_flags mce_flags __read_mostly;
  69
  70 struct mca_config mca_cfg __read_mostly = {
  71         .bootlog  = -1,
  72         /*
  73          * Tolerant levels:
  74          * 0: always panic on uncorrected errors, log corrected errors
  75          * 1: panic or SIGBUS on uncorrected errors, log corrected errors
  76          * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
  77          * 3: never panic or SIGBUS, log all errors (for testing only)
  78          */
  79         .tolerant = 1,
  80         .monarch_timeout = -1
  81 };
  82
  83 static DEFINE_PER_CPU(struct mce, mces_seen);
  84 static unsigned long mce_need_notify;
  85 static int cpu_missing;
  86
  87 /*
  88  * MCA banks polled by the period polling timer for corrected events.
  89  * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
  90  */
  91 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
  92         [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
  93 };
  94
  95 /*
  96  * MCA banks controlled through firmware first for corrected errors.
  97  * This is a global list of banks for which we won't enable CMCI and we
  98  * won't poll. Firmware controls these banks and is responsible for
  99  * reporting corrected errors through GHES. Uncorrected/recoverable
 100  * errors are still notified through a machine check.
 101  */
 102 mce_banks_t mce_banks_ce_disabled;
 103
 104 static struct work_struct mce_work;
 105 static struct irq_work mce_irq_work;
 106
 107 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
 108
 109 /*
 110  * CPU/chipset specific EDAC code can register a notifier call here to print
 111  * MCE errors in a human-readable form.
 112  */
 113 BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
 114
 115 /* Do initial initialization of a struct mce */
 116 void mce_setup(struct mce *m)
 117 {
 118         memset(m, 0, sizeof(struct mce));
 119         m->cpu = m->extcpu = smp_processor_id();
 120         /* We hope get_seconds stays lockless */
 121         m->time = get_seconds();
 122         m->cpuvendor = boot_cpu_data.x86_vendor;
 123         m->cpuid = cpuid_eax(1);
 124         m->socketid = cpu_data(m->extcpu).phys_proc_id;
 125         m->apicid = cpu_data(m->extcpu).initial_apicid;
 126         rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
 127
 128         if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
 129                 rdmsrl(MSR_PPIN, m->ppin);
 130 }
 131
 132 DEFINE_PER_CPU(struct mce, injectm);
 133 EXPORT_PER_CPU_SYMBOL_GPL(injectm);
 134
 135 void mce_log(struct mce *m)
 136 {
 137         if (!mce_gen_pool_add(m))
 138                 irq_work_queue(&mce_irq_work);
 139 }
 140
 141 void mce_inject_log(struct mce *m)
 142 {
 143         mutex_lock(&mce_log_mutex);
 144         mce_log(m);
 145         mutex_unlock(&mce_log_mutex);
 146 }
 147 EXPORT_SYMBOL_GPL(mce_inject_log);
 148
 149 static struct notifier_block mce_srao_nb;
 150
 151 /*
 152  * We run the default notifier if we have only the SRAO, the first and the
 153  * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
 154  * notifiers registered on the chain.
 155  */
 156 #define NUM_DEFAULT_NOTIFIERS   3
 157 static atomic_t num_notifiers;
 158
 159 void mce_register_decode_chain(struct notifier_block *nb)
 160 {
 161         if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
 162                 return;
 163
 164         atomic_inc(&num_notifiers);
 165
 166         blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
 167 }
 168 EXPORT_SYMBOL_GPL(mce_register_decode_chain);
 169
 170 void mce_unregister_decode_chain(struct notifier_block *nb)
 171 {
 172         atomic_dec(&num_notifiers);
 173
 174         blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
 175 }
 176 EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
 177
 178 static inline u32 ctl_reg(int bank)
 179 {
 180         return MSR_IA32_MCx_CTL(bank);
 181 }
 182
 183 static inline u32 status_reg(int bank)
 184 {
 185         return MSR_IA32_MCx_STATUS(bank);
 186 }
 187
 188 static inline u32 addr_reg(int bank)
 189 {
 190         return MSR_IA32_MCx_ADDR(bank);
 191 }
 192
 193 static inline u32 misc_reg(int bank)
 194 {
 195         return MSR_IA32_MCx_MISC(bank);
 196 }
 197
 198 static inline u32 smca_ctl_reg(int bank)
 199 {
 200         return MSR_AMD64_SMCA_MCx_CTL(bank);
 201 }
 202
 203 static inline u32 smca_status_reg(int bank)
 204 {
 205         return MSR_AMD64_SMCA_MCx_STATUS(bank);
 206 }
 207
 208 static inline u32 smca_addr_reg(int bank)
 209 {
 210         return MSR_AMD64_SMCA_MCx_ADDR(bank);
 211 }
 212
 213 static inline u32 smca_misc_reg(int bank)
 214 {
 215         return MSR_AMD64_SMCA_MCx_MISC(bank);
 216 }
 217
 218 struct mca_msr_regs msr_ops = {
 219         .ctl    = ctl_reg,
 220         .status = status_reg,
 221         .addr   = addr_reg,
 222         .misc   = misc_reg
 223 };
 224
 225 static void __print_mce(struct mce *m)
 226 {
 227         pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
 228                  m->extcpu,
 229                  (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
 230                  m->mcgstatus, m->bank, m->status);
 231
 232         if (m->ip) {
 233                 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
 234                         !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 235                         m->cs, m->ip);
 236
 237                 if (m->cs == __KERNEL_CS)
 238                         print_symbol("{%s}", m->ip);
 239                 pr_cont("\n");
 240         }
 241
 242         pr_emerg(HW_ERR "TSC %llx ", m->tsc);
 243         if (m->addr)
 244                 pr_cont("ADDR %llx ", m->addr);
 245         if (m->misc)
 246                 pr_cont("MISC %llx ", m->misc);
 247
 248         if (mce_flags.smca) {
 249                 if (m->synd)
 250                         pr_cont("SYND %llx ", m->synd);
 251                 if (m->ipid)
 252                         pr_cont("IPID %llx ", m->ipid);
 253         }
 254
 255         pr_cont("\n");
 256         /*
 257          * Note this output is parsed by external tools and old fields
 258          * should not be changed.
 259          */
 260         pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
 261                 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
 262                 cpu_data(m->extcpu).microcode);
 263 }
 264
 265 static void print_mce(struct mce *m)
 266 {
 267         __print_mce(m);
 268         pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
 269 }
 270
 271 #define PANIC_TIMEOUT 5 /* 5 seconds */
 272
 273 static atomic_t mce_panicked;
 274
 275 static int fake_panic;
 276 static atomic_t mce_fake_panicked;
 277
 278 /* Panic in progress. Enable interrupts and wait for final IPI */
 279 static void wait_for_panic(void)
 280 {
 281         long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
 282
 283         preempt_disable();
 284         local_irq_enable();
 285         while (timeout-- > 0)
 286                 udelay(1);
 287         if (panic_timeout == 0)
 288                 panic_timeout = mca_cfg.panic_timeout;
 289         panic("Panicing machine check CPU died");
 290 }
 291
 292 static void mce_panic(const char *msg, struct mce *final, char *exp)
 293 {
 294         int apei_err = 0;
 295         struct llist_node *pending;
 296         struct mce_evt_llist *l;
 297
 298         if (!fake_panic) {
 299                 /*
 300                  * Make sure only one CPU runs in machine check panic
 301                  */
 302                 if (atomic_inc_return(&mce_panicked) > 1)
 303                         wait_for_panic();
 304                 barrier();
 305
 306                 bust_spinlocks(1);
 307                 console_verbose();
 308         } else {
 309                 /* Don't log too much for fake panic */
 310                 if (atomic_inc_return(&mce_fake_panicked) > 1)
 311                         return;
 312         }
 313         pending = mce_gen_pool_prepare_records();
 314         /* First print corrected ones that are still unlogged */
 315         llist_for_each_entry(l, pending, llnode) {
 316                 struct mce *m = &l->mce;
 317                 if (!(m->status & MCI_STATUS_UC)) {
 318                         print_mce(m);
 319                         if (!apei_err)
 320                                 apei_err = apei_write_mce(m);
 321                 }
 322         }
 323         /* Now print uncorrected but with the final one last */
 324         llist_for_each_entry(l, pending, llnode) {
 325                 struct mce *m = &l->mce;
 326                 if (!(m->status & MCI_STATUS_UC))
 327                         continue;
 328                 if (!final || mce_cmp(m, final)) {
 329                         print_mce(m);
 330                         if (!apei_err)
 331                                 apei_err = apei_write_mce(m);
 332                 }
 333         }
 334         if (final) {
 335                 print_mce(final);
 336                 if (!apei_err)
 337                         apei_err = apei_write_mce(final);
 338         }
 339         if (cpu_missing)
 340                 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
 341         if (exp)
 342                 pr_emerg(HW_ERR "Machine check: %s\n", exp);
 343         if (!fake_panic) {
 344                 if (panic_timeout == 0)
 345                         panic_timeout = mca_cfg.panic_timeout;
 346                 panic(msg);
 347         } else
 348                 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
 349 }
 350
 351 /* Support code for software error injection */
 352
 353 static int msr_to_offset(u32 msr)
 354 {
 355         unsigned bank = __this_cpu_read(injectm.bank);
 356
 357         if (msr == mca_cfg.rip_msr)
 358                 return offsetof(struct mce, ip);
 359         if (msr == msr_ops.status(bank))
 360                 return offsetof(struct mce, status);
 361         if (msr == msr_ops.addr(bank))
 362                 return offsetof(struct mce, addr);
 363         if (msr == msr_ops.misc(bank))
 364                 return offsetof(struct mce, misc);
 365         if (msr == MSR_IA32_MCG_STATUS)
 366                 return offsetof(struct mce, mcgstatus);
 367         return -1;
 368 }
 369
 370 /* MSR access wrappers used for error injection */
 371 static u64 mce_rdmsrl(u32 msr)
 372 {
 373         u64 v;
 374
 375         if (__this_cpu_read(injectm.finished)) {
 376                 int offset = msr_to_offset(msr);
 377
 378                 if (offset < 0)
 379                         return 0;
 380                 return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
 381         }
 382
 383         if (rdmsrl_safe(msr, &v)) {
 384                 WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
 385                 /*
 386                  * Return zero in case the access faulted. This should
 387                  * not happen normally but can happen if the CPU does
 388                  * something weird, or if the code is buggy.
 389                  */
 390                 v = 0;
 391         }
 392
 393         return v;
 394 }
 395
 396 static void mce_wrmsrl(u32 msr, u64 v)
 397 {
 398         if (__this_cpu_read(injectm.finished)) {
 399                 int offset = msr_to_offset(msr);
 400
 401                 if (offset >= 0)
 402                         *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
 403                 return;
 404         }
 405         wrmsrl(msr, v);
 406 }
 407
 408 /*
 409  * Collect all global (w.r.t. this processor) status about this machine
 410  * check into our "mce" struct so that we can use it later to assess
 411  * the severity of the problem as we read per-bank specific details.
 412  */
 413 static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
 414 {
 415         mce_setup(m);
 416
 417         m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 418         if (regs) {
 419                 /*
 420                  * Get the address of the instruction at the time of
 421                  * the machine check error.
 422                  */
 423                 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
 424                         m->ip = regs->ip;
 425                         m->cs = regs->cs;
 426
 427                         /*
 428                          * When in VM86 mode make the cs look like ring 3
 429                          * always. This is a lie, but it's better than passing
 430                          * the additional vm86 bit around everywhere.
 431                          */
 432                         if (v8086_mode(regs))
 433                                 m->cs |= 3;
 434                 }
 435                 /* Use accurate RIP reporting if available. */
 436                 if (mca_cfg.rip_msr)
 437                         m->ip = mce_rdmsrl(mca_cfg.rip_msr);
 438         }
 439 }
 440
 441 int mce_available(struct cpuinfo_x86 *c)
 442 {
 443         if (mca_cfg.disabled)
 444                 return 0;
 445         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 446 }
 447
 448 static void mce_schedule_work(void)
 449 {
 450         if (!mce_gen_pool_empty())
 451                 schedule_work(&mce_work);
 452 }
 453
 454 static void mce_irq_work_cb(struct irq_work *entry)
 455 {
 456         mce_schedule_work();
 457 }
 458
 459 static void mce_report_event(struct pt_regs *regs)
 460 {
 461         if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
 462                 mce_notify_irq();
 463                 /*
 464                  * Triggering the work queue here is just an insurance
 465                  * policy in case the syscall exit notify handler
 466                  * doesn't run soon enough or ends up running on the
 467                  * wrong CPU (can happen when audit sleeps)
 468                  */
 469                 mce_schedule_work();
 470                 return;
 471         }
 472
 473         irq_work_queue(&mce_irq_work);
 474 }
 475
 476 /*
 477  * Check if the address reported by the CPU is in a format we can parse.
 478  * It would be possible to add code for most other cases, but all would
 479  * be somewhat complicated (e.g. segment offset would require an instruction
 480  * parser). So only support physical addresses up to page granuality for now.
 481  */
 482 static int mce_usable_address(struct mce *m)
 483 {
 484         if (!(m->status & MCI_STATUS_ADDRV))
 485                 return 0;
 486
 487         /* Checks after this one are Intel-specific: */
 488         if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
 489                 return 1;
 490
 491         if (!(m->status & MCI_STATUS_MISCV))
 492                 return 0;
 493
 494         if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
 495                 return 0;
 496
 497         if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
 498                 return 0;
 499
 500         return 1;
 501 }
 502
 503 bool mce_is_memory_error(struct mce *m)
 504 {
 505         if (m->cpuvendor == X86_VENDOR_AMD) {
 506                 return amd_mce_is_memory_error(m);
 507
 508         } else if (m->cpuvendor == X86_VENDOR_INTEL) {
 509                 /*
 510                  * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
 511                  *
 512                  * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
 513                  * indicating a memory error. Bit 8 is used for indicating a
 514                  * cache hierarchy error. The combination of bit 2 and bit 3
 515                  * is used for indicating a `generic' cache hierarchy error
 516                  * But we can't just blindly check the above bits, because if
 517                  * bit 11 is set, then it is a bus/interconnect error - and
 518                  * either way the above bits just gives more detail on what
 519                  * bus/interconnect error happened. Note that bit 12 can be
 520                  * ignored, as it's the "filter" bit.
 521                  */
 522                 return (m->status & 0xef80) == BIT(7) ||
 523                        (m->status & 0xef00) == BIT(8) ||
 524                        (m->status & 0xeffc) == 0xc;
 525         }
 526
 527         return false;
 528 }
 529 EXPORT_SYMBOL_GPL(mce_is_memory_error);
 530
 531 static bool mce_is_correctable(struct mce *m)
 532 {
 533         if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
 534                 return false;
 535
 536         if (m->status & MCI_STATUS_UC)
 537                 return false;
 538
 539         return true;
 540 }
 541
 542 static bool cec_add_mce(struct mce *m)
 543 {
 544         if (!m)
 545                 return false;
 546
 547         /* We eat only correctable DRAM errors with usable addresses. */
 548         if (mce_is_memory_error(m) &&
 549             mce_is_correctable(m)  &&
 550             mce_usable_address(m))
 551                 if (!cec_add_elem(m->addr >> PAGE_SHIFT))
 552                         return true;
 553
 554         return false;
 555 }
 556
 557 static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
 558                               void *data)
 559 {
 560         struct mce *m = (struct mce *)data;
 561
 562         if (!m)
 563                 return NOTIFY_DONE;
 564
 565         if (cec_add_mce(m))
 566                 return NOTIFY_STOP;
 567
 568         /* Emit the trace record: */
 569         trace_mce_record(m);
 570
 571         set_bit(0, &mce_need_notify);
 572
 573         mce_notify_irq();
 574
 575         return NOTIFY_DONE;
 576 }
 577
 578 static struct notifier_block first_nb = {
 579         .notifier_call  = mce_first_notifier,
 580         .priority       = MCE_PRIO_FIRST,
 581 };
 582
 583 static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
 584                                 void *data)
 585 {
 586         struct mce *mce = (struct mce *)data;
 587         unsigned long pfn;
 588
 589         if (!mce)
 590                 return NOTIFY_DONE;
 591
 592         if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
 593                 pfn = mce->addr >> PAGE_SHIFT;
 594                 memory_failure(pfn, 0);
 595         }
 596
 597         return NOTIFY_OK;
 598 }
 599 static struct notifier_block mce_srao_nb = {
 600         .notifier_call  = srao_decode_notifier,
 601         .priority       = MCE_PRIO_SRAO,
 602 };
 603
 604 static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
 605                                 void *data)
 606 {
 607         struct mce *m = (struct mce *)data;
 608
 609         if (!m)
 610                 return NOTIFY_DONE;
 611
 612         if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
 613                 return NOTIFY_DONE;
 614
 615         __print_mce(m);
 616
 617         return NOTIFY_DONE;
 618 }
 619
 620 static struct notifier_block mce_default_nb = {
 621         .notifier_call  = mce_default_notifier,
 622         /* lowest prio, we want it to run last. */
 623         .priority       = MCE_PRIO_LOWEST,
 624 };
 625
 626 /*
 627  * Read ADDR and MISC registers.
 628  */
 629 static void mce_read_aux(struct mce *m, int i)
 630 {
 631         if (m->status & MCI_STATUS_MISCV)
 632                 m->misc = mce_rdmsrl(msr_ops.misc(i));
 633
 634         if (m->status & MCI_STATUS_ADDRV) {
 635                 m->addr = mce_rdmsrl(msr_ops.addr(i));
 636
 637                 /*
 638                  * Mask the reported address by the reported granularity.
 639                  */
 640                 if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
 641                         u8 shift = MCI_MISC_ADDR_LSB(m->misc);
 642                         m->addr >>= shift;
 643                         m->addr <<= shift;
 644                 }
 645
 646                 /*
 647                  * Extract [55:<lsb>] where lsb is the least significant
 648                  * *valid* bit of the address bits.
 649                  */
 650                 if (mce_flags.smca) {
 651                         u8 lsb = (m->addr >> 56) & 0x3f;
 652
 653                         m->addr &= GENMASK_ULL(55, lsb);
 654                 }
 655         }
 656
 657         if (mce_flags.smca) {
 658                 m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
 659
 660                 if (m->status & MCI_STATUS_SYNDV)
 661                         m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
 662         }
 663 }
 664
 665 DEFINE_PER_CPU(unsigned, mce_poll_count);
 666
 667 /*
 668  * Poll for corrected events or events that happened before reset.
 669  * Those are just logged through /dev/mcelog.
 670  *
 671  * This is executed in standard interrupt context.
 672  *
 673  * Note: spec recommends to panic for fatal unsignalled
 674  * errors here. However this would be quite problematic --
 675  * we would need to reimplement the Monarch handling and
 676  * it would mess up the exclusion between exception handler
 677  * and poll hander -- * so we skip this for now.
 678  * These cases should not happen anyways, or only when the CPU
 679  * is already totally * confused. In this case it's likely it will
 680  * not fully execute the machine check handler either.
 681  */
 682 bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 683 {
 684         bool error_seen = false;
 685         struct mce m;
 686         int i;
 687
 688         this_cpu_inc(mce_poll_count);
 689
 690         mce_gather_info(&m, NULL);
 691
 692         if (flags & MCP_TIMESTAMP)
 693                 m.tsc = rdtsc();
 694
 695         for (i = 0; i < mca_cfg.banks; i++) {
 696                 if (!mce_banks[i].ctl || !test_bit(i, *b))
 697                         continue;
 698
 699                 m.misc = 0;
 700                 m.addr = 0;
 701                 m.bank = i;
 702
 703                 barrier();
 704                 m.status = mce_rdmsrl(msr_ops.status(i));
 705                 if (!(m.status & MCI_STATUS_VAL))
 706                         continue;
 707
 708                 /*
 709                  * Uncorrected or signalled events are handled by the exception
 710                  * handler when it is enabled, so don't process those here.
 711                  *
 712                  * TBD do the same check for MCI_STATUS_EN here?
 713                  */
 714                 if (!(flags & MCP_UC) &&
 715                     (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 716                         continue;
 717
 718                 error_seen = true;
 719
 720                 mce_read_aux(&m, i);
 721
 722                 m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
 723
 724                 /*
 725                  * Don't get the IP here because it's unlikely to
 726                  * have anything to do with the actual error location.
 727                  */
 728                 if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
 729                         mce_log(&m);
 730                 else if (mce_usable_address(&m)) {
 731                         /*
 732                          * Although we skipped logging this, we still want
 733                          * to take action. Add to the pool so the registered
 734                          * notifiers will see it.
 735                          */
 736                         if (!mce_gen_pool_add(&m))
 737                                 mce_schedule_work();
 738                 }
 739
 740                 /*
 741                  * Clear state for this bank.
 742                  */
 743                 mce_wrmsrl(msr_ops.status(i), 0);
 744         }
 745
 746         /*
 747          * Don't clear MCG_STATUS here because it's only defined for
 748          * exceptions.
 749          */
 750
 751         sync_core();
 752
 753         return error_seen;
 754 }
 755 EXPORT_SYMBOL_GPL(machine_check_poll);
 756
 757 /*
 758  * Do a quick check if any of the events requires a panic.
 759  * This decides if we keep the events around or clear them.
 760  */
 761 static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
 762                           struct pt_regs *regs)
 763 {
 764         int i, ret = 0;
 765         char *tmp;
 766
 767         for (i = 0; i < mca_cfg.banks; i++) {
 768                 m->status = mce_rdmsrl(msr_ops.status(i));
 769                 if (m->status & MCI_STATUS_VAL) {
 770                         __set_bit(i, validp);
 771                         if (quirk_no_way_out)
 772                                 quirk_no_way_out(i, m, regs);
 773                 }
 774
 775                 if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
 776                         *msg = tmp;
 777                         ret = 1;
 778                 }
 779         }
 780         return ret;
 781 }
 782
 783 /*
 784  * Variable to establish order between CPUs while scanning.
 785  * Each CPU spins initially until executing is equal its number.
 786  */
 787 static atomic_t mce_executing;
 788
 789 /*
 790  * Defines order of CPUs on entry. First CPU becomes Monarch.
 791  */
 792 static atomic_t mce_callin;
 793
 794 /*
 795  * Check if a timeout waiting for other CPUs happened.
 796  */
 797 static int mce_timed_out(u64 *t, const char *msg)
 798 {
 799         /*
 800          * The others already did panic for some reason.
 801          * Bail out like in a timeout.
 802          * rmb() to tell the compiler that system_state
 803          * might have been modified by someone else.
 804          */
 805         rmb();
 806         if (atomic_read(&mce_panicked))
 807                 wait_for_panic();
 808         if (!mca_cfg.monarch_timeout)
 809                 goto out;
 810         if ((s64)*t < SPINUNIT) {
 811                 if (mca_cfg.tolerant <= 1)
 812                         mce_panic(msg, NULL, NULL);
 813                 cpu_missing = 1;
 814                 return 1;
 815         }
 816         *t -= SPINUNIT;
 817 out:
 818         touch_nmi_watchdog();
 819         return 0;
 820 }
 821
 822 /*
 823  * The Monarch's reign.  The Monarch is the CPU who entered
 824  * the machine check handler first. It waits for the others to
 825  * raise the exception too and then grades them. When any
 826  * error is fatal panic. Only then let the others continue.
 827  *
 828  * The other CPUs entering the MCE handler will be controlled by the
 829  * Monarch. They are called Subjects.
 830  *
 831  * This way we prevent any potential data corruption in a unrecoverable case
 832  * and also makes sure always all CPU's errors are examined.
 833  *
 834  * Also this detects the case of a machine check event coming from outer
 835  * space (not detected by any CPUs) In this case some external agent wants
 836  * us to shut down, so panic too.
 837  *
 838  * The other CPUs might still decide to panic if the handler happens
 839  * in a unrecoverable place, but in this case the system is in a semi-stable
 840  * state and won't corrupt anything by itself. It's ok to let the others
 841  * continue for a bit first.
 842  *
 843  * All the spin loops have timeouts; when a timeout happens a CPU
 844  * typically elects itself to be Monarch.
 845  */
 846 static void mce_reign(void)
 847 {
 848         int cpu;
 849         struct mce *m = NULL;
 850         int global_worst = 0;
 851         char *msg = NULL;
 852         char *nmsg = NULL;
 853
 854         /*
 855          * This CPU is the Monarch and the other CPUs have run
 856          * through their handlers.
 857          * Grade the severity of the errors of all the CPUs.
 858          */
 859         for_each_possible_cpu(cpu) {
 860                 int severity = mce_severity(&per_cpu(mces_seen, cpu),
 861                                             mca_cfg.tolerant,
 862                                             &nmsg, true);
 863                 if (severity > global_worst) {
 864                         msg = nmsg;
 865                         global_worst = severity;
 866                         m = &per_cpu(mces_seen, cpu);
 867                 }
 868         }
 869
 870         /*
 871          * Cannot recover? Panic here then.
 872          * This dumps all the mces in the log buffer and stops the
 873          * other CPUs.
 874          */
 875         if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
 876                 mce_panic("Fatal machine check", m, msg);
 877
 878         /*
 879          * For UC somewhere we let the CPU who detects it handle it.
 880          * Also must let continue the others, otherwise the handling
 881          * CPU could deadlock on a lock.
 882          */
 883
 884         /*
 885          * No machine check event found. Must be some external
 886          * source or one CPU is hung. Panic.
 887          */
 888         if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
 889                 mce_panic("Fatal machine check from unknown source", NULL, NULL);
 890
 891         /*
 892          * Now clear all the mces_seen so that they don't reappear on
 893          * the next mce.
 894          */
 895         for_each_possible_cpu(cpu)
 896                 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
 897 }
 898
/*
 * Running sum of the no_way_out values that the CPUs add in mce_start().
 * It is set back to 0 when a rendezvous times out and on the reset path
 * of mce_end().
 */
static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 *
 * @no_way_out: in/out. On entry the caller's local no_way_out value, which
 *              is added to global_nwo. On successful return it is
 *              overwritten with the accumulated global_nwo value.
 *
 * Returns the call-in order of this CPU (1 is the Monarch), or -1 when
 * mca_cfg.monarch_timeout is 0 or when one of the waits timed out. On
 * the -1 paths *no_way_out is left untouched.
 */
static int mce_start(int *no_way_out)
{
        int order;
        int cpus = num_online_cpus();
        /* monarch_timeout is scaled by NSEC_PER_USEC; the waits below spin in ndelay(SPINUNIT) steps */
        u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

        /* A zero timeout means no rendezvous is attempted at all. */
        if (!timeout)
                return -1;

        atomic_add(*no_way_out, &global_nwo);
        /*
         * Rely on the implied barrier below, such that global_nwo
         * is updated before mce_callin.
         */
        order = atomic_inc_return(&mce_callin);

        /*
         * Wait for everyone.
         */
        while (atomic_read(&mce_callin) != cpus) {
                if (mce_timed_out(&timeout,
                                  "Timeout: Not all CPUs entered broadcast exception handler")) {
                        /* Give up on the rendezvous and drop the shared no_way_out sum. */
                        atomic_set(&global_nwo, 0);
                        return -1;
                }
                ndelay(SPINUNIT);
        }

        /*
         * mce_callin should be read before global_nwo
         */
        smp_rmb();

        if (order == 1) {
                /*
                 * Monarch: Starts executing now, the others wait.
                 */
                atomic_set(&mce_executing, 1);
        } else {
                /*
                 * Subject: Now start the scanning loop one by one in
                 * the original callin order.
                 * This way when there are any shared banks it will be
                 * only seen by one CPU before cleared, avoiding duplicates.
                 */
                while (atomic_read(&mce_executing) < order) {
                        if (mce_timed_out(&timeout,
                                          "Timeout: Subject CPUs unable to finish machine check processing")) {
                                atomic_set(&global_nwo, 0);
                                return -1;
                        }
                        ndelay(SPINUNIT);
                }
        }

        /*
         * Cache the global no_way_out state.
         */
        *no_way_out = atomic_read(&global_nwo);

        return order;
}
 970
/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 *
 * @order: call-in order as returned by mce_start(); a negative value means
 *         the rendezvous did not take place.
 *
 * Returns 0 when this CPU completed the rendezvous (the Monarch after
 * running mce_reign(), a Subject after the Monarch released it). Returns
 * -1 when mca_cfg.monarch_timeout is 0, when @order is negative, or when
 * a wait timed out; on those paths the global state is reset here.
 */
static int mce_end(int order)
{
        int ret = -1;
        u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

        if (!timeout)
                goto reset;
        if (order < 0)
                goto reset;

        /*
         * Allow others to run.
         */
        atomic_inc(&mce_executing);

        if (order == 1) {
                /* CHECKME: Can this race with a parallel hotplug? */
                int cpus = num_online_cpus();

                /*
                 * Monarch: Wait for everyone to go through their scanning
                 * loops.
                 */
                while (atomic_read(&mce_executing) <= cpus) {
                        if (mce_timed_out(&timeout,
                                          "Timeout: Monarch CPU unable to finish machine check processing"))
                                goto reset;
                        ndelay(SPINUNIT);
                }

                mce_reign();
                barrier();
                ret = 0;
        } else {
                /*
                 * Subject: Wait for Monarch to finish.
                 */
                while (atomic_read(&mce_executing) != 0) {
                        if (mce_timed_out(&timeout,
                                          "Timeout: Monarch CPU did not finish machine check processing"))
                                goto reset;
                        ndelay(SPINUNIT);
                }

                /*
                 * Don't reset anything. That's done by the Monarch.
                 */
                return 0;
        }

        /*
         * Reset all global state.
         */
reset:
        atomic_set(&global_nwo, 0);
        atomic_set(&mce_callin, 0);
        barrier();

        /*
         * Let others run again.
         */
        /* Subjects spin until mce_executing reads 0, so this store releases them. */
        atomic_set(&mce_executing, 0);
        return ret;
}
1039
1040 static void mce_clear_state(unsigned long *toclear)
1041 {
1042         int i;
1043
1044         for (i = 0; i < mca_cfg.banks; i++) {
1045                 if (test_bit(i, toclear))
1046                         mce_wrmsrl(msr_ops.status(i), 0);
1047         }
1048 }
1049
1050 static int do_memory_failure(struct mce *m)
1051 {
1052         int flags = MF_ACTION_REQUIRED;
1053         int ret;
1054
1055         pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
1056         if (!(m->mcgstatus & MCG_STATUS_RIPV))
1057                 flags |= MF_MUST_KILL;
1058         ret = memory_failure(m->addr >> PAGE_SHIFT, flags);
1059         if (ret)
1060                 pr_err("Memory error not recovered");
1061         return ret;
1062 }
1063
#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE)

/*
 * Mark the kernel 1:1 mapping of page frame @pfn not-present via
 * set_memory_np(), using a decoy address (see below). A failure is only
 * reported with pr_warn(); nothing is returned to the caller.
 */
void arch_unmap_kpfn(unsigned long pfn)
{
        unsigned long decoy_addr;

        /*
         * Unmap this page from the kernel 1:1 mappings to make sure
         * we don't log more errors because of speculative access to
         * the page.
         * We would like to just call:
         *      set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
         * but doing that would radically increase the odds of a
         * speculative access to the poison page because we'd have
         * the virtual address of the kernel 1:1 mapping sitting
         * around in registers.
         * Instead we get tricky.  We create a non-canonical address
         * that looks just like the one we want, but has bit 63 flipped.
         * This relies on set_memory_np() not checking whether we passed
         * a legal address.
         */

/*
 * Build time check to see if we have a spare virtual bit. Don't want
 * to leave this until run time because most developers don't have a
 * system that can exercise this code path. This will only become a
 * problem if/when we move beyond 5-level page tables.
 *
 * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD)
 */
#if PGDIR_SHIFT + 9 < 63
        decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
#else
#error "no unused virtual bit available"
#endif

        if (set_memory_np(decoy_addr, 1))
                pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);

}
#endif
1105
1106 /*
1107  * The actual machine check handler. This only handles real
1108  * exceptions when something got corrupted coming in through int 18.
1109  *
1110  * This is executed in NMI context not subject to normal locking rules. This
1111  * implies that most kernel services cannot be safely used. Don't even
1112  * think about putting a printk in there!
1113  *
1114  * On Intel systems this is entered on all CPUs in parallel through
1115  * MCE broadcast. However some CPUs might be broken beyond repair,
1116  * so be always careful when synchronizing with others.
1117  */
1118 void do_machine_check(struct pt_regs *regs, long error_code)
1119 {
1120         struct mca_config *cfg = &mca_cfg;
1121         struct mce m, *final;
1122         int i;
1123         int worst = 0;
1124         int severity;
1125
1126         /*
1127          * Establish sequential order between the CPUs entering the machine
1128          * check handler.
1129          */
1130         int order = -1;
1131         /*
1132          * If no_way_out gets set, there is no safe way to recover from this
1133          * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
1134          */
1135         int no_way_out = 0;
1136         /*
1137          * If kill_it gets set, there might be a way to recover from this
1138          * error.
1139          */
1140         int kill_it = 0;
1141         DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1142         DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1143         char *msg = "Unknown";
1144
1145         /*
1146          * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
1147          * on Intel.
1148          */
1149         int lmce = 1;
1150         int cpu = smp_processor_id();
1151
1152         /*
1153          * Cases where we avoid rendezvous handler timeout:
1154          * 1) If this CPU is offline.
1155          *
1156          * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
1157          *  skip those CPUs which remain looping in the 1st kernel - see
1158          *  crash_nmi_callback().
1159          *
1160          * Note: there still is a small window between kexec-ing and the new,
1161          * kdump kernel establishing a new #MC handler where a broadcasted MCE
1162          * might not get handled properly.
1163          */
1164         if (cpu_is_offline(cpu) ||
1165             (crashing_cpu != -1 && crashing_cpu != cpu)) {
1166                 u64 mcgstatus;
1167
1168                 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
1169                 if (mcgstatus & MCG_STATUS_RIPV) {
1170                         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1171                         return;
1172                 }
1173         }
1174
1175         ist_enter(regs);
1176
1177         this_cpu_inc(mce_exception_count);
1178
1179         if (!cfg->banks)
1180                 goto out;
1181
1182         mce_gather_info(&m, regs);
1183         m.tsc = rdtsc();
1184
1185         final = this_cpu_ptr(&mces_seen);
1186         *final = m;
1187
1188         memset(valid_banks, 0, sizeof(valid_banks));
1189         no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1190
1191         barrier();
1192
1193         /*
1194          * When no restart IP might need to kill or panic.
1195          * Assume the worst for now, but if we find the
1196          * severity is MCE_AR_SEVERITY we have other options.
1197          */
1198         if (!(m.mcgstatus & MCG_STATUS_RIPV))
1199                 kill_it = 1;
1200
1201         /*
1202          * Check if this MCE is signaled to only this logical processor,
1203          * on Intel only.
1204          */
1205         if (m.cpuvendor == X86_VENDOR_INTEL)
1206                 lmce = m.mcgstatus & MCG_STATUS_LMCES;
1207
1208         /*
1209          * Go through all banks in exclusion of the other CPUs. This way we
1210          * don't report duplicated events on shared banks because the first one
1211          * to see it will clear it. If this is a Local MCE, then no need to
1212          * perform rendezvous.
1213          */
1214         if (!lmce)
1215                 order = mce_start(&no_way_out);
1216
1217         for (i = 0; i < cfg->banks; i++) {
1218                 __clear_bit(i, toclear);
1219                 if (!test_bit(i, valid_banks))
1220                         continue;
1221                 if (!mce_banks[i].ctl)
1222                         continue;
1223
1224                 m.misc = 0;
1225                 m.addr = 0;
1226                 m.bank = i;
1227
1228                 m.status = mce_rdmsrl(msr_ops.status(i));
1229                 if ((m.status & MCI_STATUS_VAL) == 0)
1230                         continue;
1231
1232                 /*
1233                  * Non uncorrected or non signaled errors are handled by
1234                  * machine_check_poll. Leave them alone, unless this panics.
1235                  */
1236                 if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1237                         !no_way_out)
1238                         continue;
1239
1240                 /*
1241                  * Set taint even when machine check was not enabled.
1242                  */
1243                 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1244
1245                 severity = mce_severity(&m, cfg->tolerant, NULL, true);
1246
1247                 /*
1248                  * When machine check was for corrected/deferred handler don't
1249                  * touch, unless we're panicing.
1250                  */
1251                 if ((severity == MCE_KEEP_SEVERITY ||
1252                      severity == MCE_UCNA_SEVERITY) && !no_way_out)
1253                         continue;
1254                 __set_bit(i, toclear);
1255                 if (severity == MCE_NO_SEVERITY) {
1256                         /*
1257                          * Machine check event was not enabled. Clear, but
1258                          * ignore.
1259                          */
1260                         continue;
1261                 }
1262
1263                 mce_read_aux(&m, i);
1264
1265                 /* assuming valid severity level != 0 */
1266                 m.severity = severity;
1267
1268                 mce_log(&m);
1269
1270                 if (severity > worst) {
1271                         *final = m;
1272                         worst = severity;
1273                 }
1274         }
1275
1276         /* mce_clear_state will clear *final, save locally for use later */
1277         m = *final;
1278
1279         if (!no_way_out)
1280                 mce_clear_state(toclear);
1281
1282         /*
1283          * Do most of the synchronization with other CPUs.
1284          * When there's any problem use only local no_way_out state.
1285          */
1286         if (!lmce) {
1287                 if (mce_end(order) < 0)
1288                         no_way_out = worst >= MCE_PANIC_SEVERITY;
1289         } else {
1290                 /*
1291                  * Local MCE skipped calling mce_reign()
1292                  * If we found a fatal error, we need to panic here.
1293                  */
1294                  if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
1295                         mce_panic("Machine check from unknown source",
1296                                 NULL, NULL);
1297         }
1298
1299         /*
1300          * If tolerant is at an insane level we drop requests to kill
1301          * processes and continue even when there is no way out.
1302          */
1303         if (cfg->tolerant == 3)
1304                 kill_it = 0;
1305         else if (no_way_out)
1306                 mce_panic("Fatal machine check on current CPU", &m, msg);
1307
1308         if (worst > 0)
1309                 mce_report_event(regs);
1310         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1311 out:
1312         sync_core();
1313
1314         if (worst != MCE_AR_SEVERITY && !kill_it)
1315                 goto out_ist;
1316
1317         /* Fault was in user mode and we need to take some action */
1318         if ((m.cs & 3) == 3) {
1319                 ist_begin_non_atomic(regs);
1320                 local_irq_enable();
1321
1322                 if (kill_it || do_memory_failure(&m))
1323                         force_sig(SIGBUS, current);
1324                 local_irq_disable();
1325                 ist_end_non_atomic();
1326         } else {
1327                 if (!fixup_exception(regs, X86_TRAP_MC))
1328                         mce_panic("Failed kernel mode recovery", &m, NULL);
1329         }
1330
1331 out_ist:
1332         ist_exit(regs);
1333 }
1334 EXPORT_SYMBOL_GPL(do_machine_check);
1335
1336 #ifndef CONFIG_MEMORY_FAILURE
1337 int memory_failure(unsigned long pfn, int flags)
1338 {
1339         /* mce_severity() should not hand us an ACTION_REQUIRED error */
1340         BUG_ON(flags & MF_ACTION_REQUIRED);
1341         pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1342                "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1343                pfn);
1344
1345         return 0;
1346 }
1347 #endif
1348
/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static unsigned long check_interval = INITIAL_CHECK_INTERVAL;

/* Per-CPU polling state: the interval to use next and the timer itself. */
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

/* Default interval adjustment: hand @interval back unchanged. */
static unsigned long mce_adjust_timer_default(unsigned long interval)
{
        return interval;
}

/*
 * Hook used by the polling timer to adjust its interval. It starts out as
 * the identity function above and may be replaced at init time.
 */
static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
1365
1366 static void __start_timer(struct timer_list *t, unsigned long interval)
1367 {
1368         unsigned long when = jiffies + interval;
1369         unsigned long flags;
1370
1371         local_irq_save(flags);
1372
1373         if (!timer_pending(t) || time_before(when, t->expires))
1374                 mod_timer(t, round_jiffies(when));
1375
1376         local_irq_restore(flags);
1377 }
1378
1379 static void mce_timer_fn(struct timer_list *t)
1380 {
1381         struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
1382         unsigned long iv;
1383
1384         WARN_ON(cpu_t != t);
1385
1386         iv = __this_cpu_read(mce_next_interval);
1387
1388         if (mce_available(this_cpu_ptr(&cpu_info))) {
1389                 machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
1390
1391                 if (mce_intel_cmci_poll()) {
1392                         iv = mce_adjust_timer(iv);
1393                         goto done;
1394                 }
1395         }
1396
1397         /*
1398          * Alert userspace if needed. If we logged an MCE, reduce the polling
1399          * interval, otherwise increase the polling interval.
1400          */
1401         if (mce_notify_irq())
1402                 iv = max(iv / 2, (unsigned long) HZ/100);
1403         else
1404                 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1405
1406 done:
1407         __this_cpu_write(mce_next_interval, iv);
1408         __start_timer(t, iv);
1409 }
1410
1411 /*
1412  * Ensure that the timer is firing in @interval from now.
1413  */
1414 void mce_timer_kick(unsigned long interval)
1415 {
1416         struct timer_list *t = this_cpu_ptr(&mce_timer);
1417         unsigned long iv = __this_cpu_read(mce_next_interval);
1418
1419         __start_timer(t, interval);
1420
1421         if (interval < iv)
1422                 __this_cpu_write(mce_next_interval, interval);
1423 }
1424
1425 /* Must not be called in IRQ context where del_timer_sync() can deadlock */
1426 static void mce_timer_delete_all(void)
1427 {
1428         int cpu;
1429
1430         for_each_online_cpu(cpu)
1431                 del_timer_sync(&per_cpu(mce_timer, cpu));
1432 }
1433
1434 /*
1435  * Notify the user(s) about new machine check events.
1436  * Can be called from interrupt context, but not from machine check/NMI
1437  * context.
1438  */
1439 int mce_notify_irq(void)
1440 {
1441         /* Not more than two messages every minute */
1442         static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1443
1444         if (test_and_clear_bit(0, &mce_need_notify)) {
1445                 mce_work_trigger();
1446
1447                 if (__ratelimit(&ratelimit))
1448                         pr_info(HW_ERR "Machine check events logged\n");
1449
1450                 return 1;
1451         }
1452         return 0;
1453 }
1454 EXPORT_SYMBOL_GPL(mce_notify_irq);
1455
1456 static int __mcheck_cpu_mce_banks_init(void)
1457 {
1458         int i;
1459         u8 num_banks = mca_cfg.banks;
1460
1461         mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
1462         if (!mce_banks)
1463                 return -ENOMEM;
1464
1465         for (i = 0; i < num_banks; i++) {
1466                 struct mce_bank *b = &mce_banks[i];
1467
1468                 b->ctl = -1ULL;
1469                 b->init = 1;
1470         }
1471         return 0;
1472 }
1473
1474 /*
1475  * Initialize Machine Checks for a CPU.
1476  */
1477 static int __mcheck_cpu_cap_init(void)
1478 {
1479         unsigned b;
1480         u64 cap;
1481
1482         rdmsrl(MSR_IA32_MCG_CAP, cap);
1483
1484         b = cap & MCG_BANKCNT_MASK;
1485         if (!mca_cfg.banks)
1486                 pr_info("CPU supports %d MCE banks\n", b);
1487
1488         if (b > MAX_NR_BANKS) {
1489                 pr_warn("Using only %u machine check banks out of %u\n",
1490                         MAX_NR_BANKS, b);
1491                 b = MAX_NR_BANKS;
1492         }
1493
1494         /* Don't support asymmetric configurations today */
1495         WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
1496         mca_cfg.banks = b;
1497
1498         if (!mce_banks) {
1499                 int err = __mcheck_cpu_mce_banks_init();
1500
1501                 if (err)
1502                         return err;
1503         }
1504
1505         /* Use accurate RIP reporting if available. */
1506         if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1507                 mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1508
1509         if (cap & MCG_SER_P)
1510                 mca_cfg.ser = true;
1511
1512         return 0;
1513 }
1514
1515 static void __mcheck_cpu_init_generic(void)
1516 {
1517         enum mcp_flags m_fl = 0;
1518         mce_banks_t all_banks;
1519         u64 cap;
1520
1521         if (!mca_cfg.bootlog)
1522                 m_fl = MCP_DONTLOG;
1523
1524         /*
1525          * Log the machine checks left over from the previous reset.
1526          */
1527         bitmap_fill(all_banks, MAX_NR_BANKS);
1528         machine_check_poll(MCP_UC | m_fl, &all_banks);
1529
1530         cr4_set_bits(X86_CR4_MCE);
1531
1532         rdmsrl(MSR_IA32_MCG_CAP, cap);
1533         if (cap & MCG_CTL_P)
1534                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1535 }
1536
1537 static void __mcheck_cpu_init_clear_banks(void)
1538 {
1539         int i;
1540
1541         for (i = 0; i < mca_cfg.banks; i++) {
1542                 struct mce_bank *b = &mce_banks[i];
1543
1544                 if (!b->init)
1545                         continue;
1546                 wrmsrl(msr_ops.ctl(i), b->ctl);
1547                 wrmsrl(msr_ops.status(i), 0);
1548         }
1549 }
1550
1551 /*
1552  * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1553  * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1554  * Vol 3B Table 15-20). But this confuses both the code that determines
1555  * whether the machine check occurred in kernel or user mode, and also
1556  * the severity assessment code. Pretend that EIPV was set, and take the
1557  * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1558  */
1559 static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1560 {
1561         if (bank != 0)
1562                 return;
1563         if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1564                 return;
1565         if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1566                           MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1567                           MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1568                           MCACOD)) !=
1569                          (MCI_STATUS_UC|MCI_STATUS_EN|
1570                           MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1571                           MCI_STATUS_AR|MCACOD_INSTR))
1572                 return;
1573
1574         m->mcgstatus |= MCG_STATUS_EIPV;
1575         m->ip = regs->ip;
1576         m->cs = regs->cs;
1577 }
1578
1579 /* Add per CPU specific workarounds here */
1580 static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1581 {
1582         struct mca_config *cfg = &mca_cfg;
1583
1584         if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1585                 pr_info("unknown CPU type - not enabling MCE support\n");
1586                 return -EOPNOTSUPP;
1587         }
1588
1589         /* This should be disabled by the BIOS, but isn't always */
1590         if (c->x86_vendor == X86_VENDOR_AMD) {
1591                 if (c->x86 == 15 && cfg->banks > 4) {
1592                         /*
1593                          * disable GART TBL walk error reporting, which
1594                          * trips off incorrectly with the IOMMU & 3ware
1595                          * & Cerberus:
1596                          */
1597                         clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1598                 }
1599                 if (c->x86 < 0x11 && cfg->bootlog < 0) {
1600                         /*
1601                          * Lots of broken BIOS around that don't clear them
1602                          * by default and leave crap in there. Don't log:
1603                          */
1604                         cfg->bootlog = 0;
1605                 }
1606                 /*
1607                  * Various K7s with broken bank 0 around. Always disable
1608                  * by default.
1609                  */
1610                 if (c->x86 == 6 && cfg->banks > 0)
1611                         mce_banks[0].ctl = 0;
1612
1613                 /*
1614                  * overflow_recov is supported for F15h Models 00h-0fh
1615                  * even though we don't have a CPUID bit for it.
1616                  */
1617                 if (c->x86 == 0x15 && c->x86_model <= 0xf)
1618                         mce_flags.overflow_recov = 1;
1619
1620                 /*
1621                  * Turn off MC4_MISC thresholding banks on those models since
1622                  * they're not supported there.
1623                  */
1624                 if (c->x86 == 0x15 &&
1625                     (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1626                         int i;
1627                         u64 hwcr;
1628                         bool need_toggle;
1629                         u32 msrs[] = {
1630                                 0x00000413, /* MC4_MISC0 */
1631                                 0xc0000408, /* MC4_MISC1 */
1632                         };
1633
1634                         rdmsrl(MSR_K7_HWCR, hwcr);
1635
1636                         /* McStatusWrEn has to be set */
1637                         need_toggle = !(hwcr & BIT(18));
1638
1639                         if (need_toggle)
1640                                 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1641
1642                         /* Clear CntP bit safely */
1643                         for (i = 0; i < ARRAY_SIZE(msrs); i++)
1644                                 msr_clear_bit(msrs[i], 62);
1645
1646                         /* restore old settings */
1647                         if (need_toggle)
1648                                 wrmsrl(MSR_K7_HWCR, hwcr);
1649                 }
1650         }
1651
1652         if (c->x86_vendor == X86_VENDOR_INTEL) {
1653                 /*
1654                  * SDM documents that on family 6 bank 0 should not be written
1655                  * because it aliases to another special BIOS controlled
1656                  * register.
1657                  * But it's not aliased anymore on model 0x1a+
1658                  * Don't ignore bank 0 completely because there could be a
1659                  * valid event later, merely don't write CTL0.
1660                  */
1661
1662                 if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
1663                         mce_banks[0].init = 0;
1664
1665                 /*
1666                  * All newer Intel systems support MCE broadcasting. Enable
1667                  * synchronization with a one second timeout.
1668                  */
1669                 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1670                         cfg->monarch_timeout < 0)
1671                         cfg->monarch_timeout = USEC_PER_SEC;
1672
1673                 /*
1674                  * There are also broken BIOSes on some Pentium M and
1675                  * earlier systems:
1676                  */
1677                 if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1678                         cfg->bootlog = 0;
1679
1680                 if (c->x86 == 6 && c->x86_model == 45)
1681                         quirk_no_way_out = quirk_sandybridge_ifu;
1682         }
1683         if (cfg->monarch_timeout < 0)
1684                 cfg->monarch_timeout = 0;
1685         if (cfg->bootlog != 0)
1686                 cfg->panic_timeout = 30;
1687
1688         return 0;
1689 }
1690
1691 static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1692 {
1693         if (c->x86 != 5)
1694                 return 0;
1695
1696         switch (c->x86_vendor) {
1697         case X86_VENDOR_INTEL:
1698                 intel_p5_mcheck_init(c);
1699                 return 1;
1700                 break;
1701         case X86_VENDOR_CENTAUR:
1702                 winchip_mcheck_init(c);
1703                 return 1;
1704                 break;
1705         default:
1706                 return 0;
1707         }
1708
1709         return 0;
1710 }
1711
1712 /*
1713  * Init basic CPU features needed for early decoding of MCEs.
1714  */
1715 static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
1716 {
1717         if (c->x86_vendor == X86_VENDOR_AMD) {
1718                 mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
1719                 mce_flags.succor         = !!cpu_has(c, X86_FEATURE_SUCCOR);
1720                 mce_flags.smca           = !!cpu_has(c, X86_FEATURE_SMCA);
1721
1722                 if (mce_flags.smca) {
1723                         msr_ops.ctl     = smca_ctl_reg;
1724                         msr_ops.status  = smca_status_reg;
1725                         msr_ops.addr    = smca_addr_reg;
1726                         msr_ops.misc    = smca_misc_reg;
1727                 }
1728         }
1729 }
1730
1731 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1732 {
1733         switch (c->x86_vendor) {
1734         case X86_VENDOR_INTEL:
1735                 mce_intel_feature_init(c);
1736                 mce_adjust_timer = cmci_intel_adjust_timer;
1737                 break;
1738
1739         case X86_VENDOR_AMD: {
1740                 mce_amd_feature_init(c);
1741                 break;
1742                 }
1743
1744         default:
1745                 break;
1746         }
1747 }
1748
1749 static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
1750 {
1751         switch (c->x86_vendor) {
1752         case X86_VENDOR_INTEL:
1753                 mce_intel_feature_clear(c);
1754                 break;
1755         default:
1756                 break;
1757         }
1758 }
1759
1760 static void mce_start_timer(struct timer_list *t)
1761 {
1762         unsigned long iv = check_interval * HZ;
1763
1764         if (mca_cfg.ignore_ce || !iv)
1765                 return;
1766
1767         this_cpu_write(mce_next_interval, iv);
1768         __start_timer(t, iv);
1769 }
1770
1771 static void __mcheck_cpu_setup_timer(void)
1772 {
1773         struct timer_list *t = this_cpu_ptr(&mce_timer);
1774
1775         timer_setup(t, mce_timer_fn, TIMER_PINNED);
1776 }
1777
1778 static void __mcheck_cpu_init_timer(void)
1779 {
1780         struct timer_list *t = this_cpu_ptr(&mce_timer);
1781
1782         timer_setup(t, mce_timer_fn, TIMER_PINNED);
1783         mce_start_timer(t);
1784 }
1785
/*
 * Default int18 handler used until a real one is installed (should never
 * happen): just report which CPU took the exception.
 */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
        int this_cpu = smp_processor_id();

        pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", this_cpu);
}
1792
/*
 * Handler that do_mce() calls for a machine check. It starts out as
 * unexpected_machine_check and is switched to do_machine_check by
 * mcheck_cpu_init() once setup has succeeded.
 */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
                                                unexpected_machine_check;
1796
/*
 * Dispatch a machine check to the handler currently stored in
 * machine_check_vector, passing @regs and @error_code through unchanged.
 */
dotraplinkage void do_mce(struct pt_regs *regs, long error_code)
{
        machine_check_vector(regs, error_code);
}
1801
1802 /*
1803  * Called for each booted CPU to set up machine checks.
1804  * Must be called with preempt off:
1805  */
1806 void mcheck_cpu_init(struct cpuinfo_x86 *c)
1807 {
1808         if (mca_cfg.disabled)
1809                 return;
1810
1811         if (__mcheck_cpu_ancient_init(c))
1812                 return;
1813
1814         if (!mce_available(c))
1815                 return;
1816
1817         if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1818                 mca_cfg.disabled = true;
1819                 return;
1820         }
1821
1822         if (mce_gen_pool_init()) {
1823                 mca_cfg.disabled = true;
1824                 pr_emerg("Couldn't allocate MCE records pool!\n");
1825                 return;
1826         }
1827
1828         machine_check_vector = do_machine_check;
1829
1830         __mcheck_cpu_init_early(c);
1831         __mcheck_cpu_init_generic();
1832         __mcheck_cpu_init_vendor(c);
1833         __mcheck_cpu_init_clear_banks();
1834         __mcheck_cpu_setup_timer();
1835 }
1836
1837 /*
1838  * Called for each booted CPU to clear some machine checks opt-ins
1839  */
1840 void mcheck_cpu_clear(struct cpuinfo_x86 *c)
1841 {
1842         if (mca_cfg.disabled)
1843                 return;
1844
1845         if (!mce_available(c))
1846                 return;
1847
1848         /*
1849          * Possibly to clear general settings generic to x86
1850          * __mcheck_cpu_clear_generic(c);
1851          */
1852         __mcheck_cpu_clear_vendor(c);
1853
1854 }
1855
1856 static void __mce_disable_bank(void *arg)
1857 {
1858         int bank = *((int *)arg);
1859         __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
1860         cmci_disable_bank(bank);
1861 }
1862
1863 void mce_disable_bank(int bank)
1864 {
1865         if (bank >= mca_cfg.banks) {
1866                 pr_warn(FW_BUG
1867                         "Ignoring request to disable invalid MCA bank %d.\n",
1868                         bank);
1869                 return;
1870         }
1871         set_bit(bank, mce_banks_ce_disabled);
1872         on_each_cpu(__mce_disable_bank, &bank, 1);
1873 }
1874
1875 /*
1876  * mce=off Disables machine check
1877  * mce=no_cmci Disables CMCI
1878  * mce=no_lmce Disables LMCE
1879  * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1880  * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1881  * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1882  *      monarchtimeout is how long to wait for other CPUs on machine
1883  *      check, or 0 to not wait
1884  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
1885         and older.
1886  * mce=nobootlog Don't log MCEs from before booting.
1887  * mce=bios_cmci_threshold Don't program the CMCI threshold
1888  * mce=recovery force enable memcpy_mcsafe()
1889  */
1890 static int __init mcheck_enable(char *str)
1891 {
1892         struct mca_config *cfg = &mca_cfg;
1893
1894         if (*str == 0) {
1895                 enable_p5_mce();
1896                 return 1;
1897         }
1898         if (*str == '=')
1899                 str++;
1900         if (!strcmp(str, "off"))
1901                 cfg->disabled = true;
1902         else if (!strcmp(str, "no_cmci"))
1903                 cfg->cmci_disabled = true;
1904         else if (!strcmp(str, "no_lmce"))
1905                 cfg->lmce_disabled = true;
1906         else if (!strcmp(str, "dont_log_ce"))
1907                 cfg->dont_log_ce = true;
1908         else if (!strcmp(str, "ignore_ce"))
1909                 cfg->ignore_ce = true;
1910         else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1911                 cfg->bootlog = (str[0] == 'b');
1912         else if (!strcmp(str, "bios_cmci_threshold"))
1913                 cfg->bios_cmci_threshold = true;
1914         else if (!strcmp(str, "recovery"))
1915                 cfg->recovery = true;
1916         else if (isdigit(str[0])) {
1917                 if (get_option(&str, &cfg->tolerant) == 2)
1918                         get_option(&str, &(cfg->monarch_timeout));
1919         } else {
1920                 pr_info("mce argument %s ignored. Please use /sys\n", str);
1921                 return 0;
1922         }
1923         return 1;
1924 }
1925 __setup("mce", mcheck_enable);
1926
/*
 * One-time boot initialization of the machine check core.
 *
 * Calls mcheck_intel_therm_init(), registers first_nb, mce_srao_nb and
 * mce_default_nb on the decode chain (in that order), calls
 * mcheck_vendor_init_severity(), and initializes mce_work and mce_irq_work
 * with their callbacks. Always returns 0.
 */
int __init mcheck_init(void)
{
        mcheck_intel_therm_init();
        mce_register_decode_chain(&first_nb);
        mce_register_decode_chain(&mce_srao_nb);
        mce_register_decode_chain(&mce_default_nb);
        mcheck_vendor_init_severity();

        /* Work item runs mce_gen_pool_process; irq_work runs mce_irq_work_cb. */
        INIT_WORK(&mce_work, mce_gen_pool_process);
        init_irq_work(&mce_irq_work, mce_irq_work_cb);

        return 0;
}
1940
1941 /*
1942  * mce_syscore: PM support
1943  */
1944
1945 /*
1946  * Disable machine checks on suspend and shutdown. We can't really handle
1947  * them later.
1948  */
1949 static void mce_disable_error_reporting(void)
1950 {
1951         int i;
1952
1953         for (i = 0; i < mca_cfg.banks; i++) {
1954                 struct mce_bank *b = &mce_banks[i];
1955
1956                 if (b->init)
1957                         wrmsrl(msr_ops.ctl(i), 0);
1958         }
1959         return;
1960 }
1961
1962 static void vendor_disable_error_reporting(void)
1963 {
1964         /*
1965          * Don't clear on Intel or AMD CPUs. Some of these MSRs are socket-wide.
1966          * Disabling them for just a single offlined CPU is bad, since it will
1967          * inhibit reporting for all shared resources on the socket like the
1968          * last level cache (LLC), the integrated memory controller (iMC), etc.
1969          */
1970         if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
1971             boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
1972                 return;
1973
1974         mce_disable_error_reporting();
1975 }
1976
/*
 * Syscore suspend hook: disable error reporting where
 * vendor_disable_error_reporting() permits it. Always returns 0.
 */
static int mce_syscore_suspend(void)
{
        vendor_disable_error_reporting();
        return 0;
}
1982
/*
 * Syscore shutdown hook: same action as suspend, disable error reporting
 * where vendor_disable_error_reporting() permits it.
 */
static void mce_syscore_shutdown(void)
{
        vendor_disable_error_reporting();
}
1987
/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static void mce_syscore_resume(void)
{
        __mcheck_cpu_init_generic();
        /* Vendor init is given the cpu_info entry of the CPU we run on. */
        __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
        __mcheck_cpu_init_clear_banks();
}
1999
/*
 * Suspend/shutdown/resume callbacks for the machine check core; registered
 * by mcheck_init_device() via register_syscore_ops().
 */
static struct syscore_ops mce_syscore_ops = {
        .suspend        = mce_syscore_suspend,
        .shutdown       = mce_syscore_shutdown,
        .resume         = mce_syscore_resume,
};
2005
2006 /*
2007  * mce_device: Sysfs support
2008  */
2009
2010 static void mce_cpu_restart(void *data)
2011 {
2012         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2013                 return;
2014         __mcheck_cpu_init_generic();
2015         __mcheck_cpu_init_clear_banks();
2016         __mcheck_cpu_init_timer();
2017 }
2018
/*
 * Reinit MCEs after user configuration changes: call
 * mce_timer_delete_all() first, then run mce_cpu_restart() on each CPU.
 */
static void mce_restart(void)
{
        mce_timer_delete_all();
        on_each_cpu(mce_cpu_restart, NULL, 1);
}
2025
2026 /* Toggle features for corrected errors */
2027 static void mce_disable_cmci(void *data)
2028 {
2029         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2030                 return;
2031         cmci_clear();
2032 }
2033
2034 static void mce_enable_ce(void *all)
2035 {
2036         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2037                 return;
2038         cmci_reenable();
2039         cmci_recheck();
2040         if (all)
2041                 __mcheck_cpu_init_timer();
2042 }
2043
/*
 * Bus type the per-CPU MCE devices are attached to; registered by
 * mcheck_init_device() with subsys_system_register().
 */
static struct bus_type mce_subsys = {
        .name           = "machinecheck",
        .dev_name       = "machinecheck",
};
2048
/*
 * Per-CPU pointer to that CPU's MCE device; set by mce_device_create() and
 * reset to NULL by mce_device_remove().
 */
DEFINE_PER_CPU(struct device *, mce_device);
2050
/* Map a bank's embedded device attribute back to its struct mce_bank. */
static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
{
        return container_of(attr, struct mce_bank, attr);
}
2055
2056 static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2057                          char *buf)
2058 {
2059         return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2060 }
2061
2062 static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2063                         const char *buf, size_t size)
2064 {
2065         u64 new;
2066
2067         if (kstrtou64(buf, 0, &new) < 0)
2068                 return -EINVAL;
2069
2070         attr_to_bank(attr)->ctl = new;
2071         mce_restart();
2072
2073         return size;
2074 }
2075
2076 static ssize_t set_ignore_ce(struct device *s,
2077                              struct device_attribute *attr,
2078                              const char *buf, size_t size)
2079 {
2080         u64 new;
2081
2082         if (kstrtou64(buf, 0, &new) < 0)
2083                 return -EINVAL;
2084
2085         if (mca_cfg.ignore_ce ^ !!new) {
2086                 if (new) {
2087                         /* disable ce features */
2088                         mce_timer_delete_all();
2089                         on_each_cpu(mce_disable_cmci, NULL, 1);
2090                         mca_cfg.ignore_ce = true;
2091                 } else {
2092                         /* enable ce features */
2093                         mca_cfg.ignore_ce = false;
2094                         on_each_cpu(mce_enable_ce, (void *)1, 1);
2095                 }
2096         }
2097         return size;
2098 }
2099
2100 static ssize_t set_cmci_disabled(struct device *s,
2101                                  struct device_attribute *attr,
2102                                  const char *buf, size_t size)
2103 {
2104         u64 new;
2105
2106         if (kstrtou64(buf, 0, &new) < 0)
2107                 return -EINVAL;
2108
2109         if (mca_cfg.cmci_disabled ^ !!new) {
2110                 if (new) {
2111                         /* disable cmci */
2112                         on_each_cpu(mce_disable_cmci, NULL, 1);
2113                         mca_cfg.cmci_disabled = true;
2114                 } else {
2115                         /* enable cmci */
2116                         mca_cfg.cmci_disabled = false;
2117                         on_each_cpu(mce_enable_ce, NULL, 1);
2118                 }
2119         }
2120         return size;
2121 }
2122
2123 static ssize_t store_int_with_restart(struct device *s,
2124                                       struct device_attribute *attr,
2125                                       const char *buf, size_t size)
2126 {
2127         ssize_t ret = device_store_int(s, attr, buf, size);
2128         mce_restart();
2129         return ret;
2130 }
2131
/* Attributes backed directly by fields of mca_cfg, mode 0644. */
static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);

/* check_interval: shown as an int; a store goes through store_int_with_restart(). */
static struct dev_ext_attribute dev_attr_check_interval = {
        __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
        &check_interval
};

/* ignore_ce: shown as a bool; a store goes through set_ignore_ce(). */
static struct dev_ext_attribute dev_attr_ignore_ce = {
        __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
        &mca_cfg.ignore_ce
};

/* cmci_disabled: shown as a bool; a store goes through set_cmci_disabled(). */
static struct dev_ext_attribute dev_attr_cmci_disabled = {
        __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
        &mca_cfg.cmci_disabled
};
2150
/*
 * NULL-terminated list of attribute files that mce_device_create() adds to
 * every per-CPU MCE device (the per-bank files are handled separately).
 */
static struct device_attribute *mce_device_attrs[] = {
        &dev_attr_tolerant.attr,
        &dev_attr_check_interval.attr,
        /* The trigger attribute is only listed with CONFIG_X86_MCELOG_LEGACY. */
#ifdef CONFIG_X86_MCELOG_LEGACY
        &dev_attr_trigger,
#endif
        &dev_attr_monarch_timeout.attr,
        &dev_attr_dont_log_ce.attr,
        &dev_attr_ignore_ce.attr,
        &dev_attr_cmci_disabled.attr,
        NULL
};
2163
/*
 * CPUs whose MCE device has been fully created: a CPU's bit is set at the
 * end of mce_device_create() and cleared by mce_device_remove(). Allocated
 * in mcheck_init_device().
 */
static cpumask_var_t mce_device_initialized;
2165
/*
 * Release callback installed on every MCE device by mce_device_create();
 * frees the struct device that was allocated there with kzalloc().
 */
static void mce_device_release(struct device *dev)
{
        kfree(dev);
}
2170
2171 /* Per cpu device init. All of the cpus still share the same ctrl bank: */
2172 static int mce_device_create(unsigned int cpu)
2173 {
2174         struct device *dev;
2175         int err;
2176         int i, j;
2177
2178         if (!mce_available(&boot_cpu_data))
2179                 return -EIO;
2180
2181         dev = per_cpu(mce_device, cpu);
2182         if (dev)
2183                 return 0;
2184
2185         dev = kzalloc(sizeof *dev, GFP_KERNEL);
2186         if (!dev)
2187                 return -ENOMEM;
2188         dev->id  = cpu;
2189         dev->bus = &mce_subsys;
2190         dev->release = &mce_device_release;
2191
2192         err = device_register(dev);
2193         if (err) {
2194                 put_device(dev);
2195                 return err;
2196         }
2197
2198         for (i = 0; mce_device_attrs[i]; i++) {
2199                 err = device_create_file(dev, mce_device_attrs[i]);
2200                 if (err)
2201                         goto error;
2202         }
2203         for (j = 0; j < mca_cfg.banks; j++) {
2204                 err = device_create_file(dev, &mce_banks[j].attr);
2205                 if (err)
2206                         goto error2;
2207         }
2208         cpumask_set_cpu(cpu, mce_device_initialized);
2209         per_cpu(mce_device, cpu) = dev;
2210
2211         return 0;
2212 error2:
2213         while (--j >= 0)
2214                 device_remove_file(dev, &mce_banks[j].attr);
2215 error:
2216         while (--i >= 0)
2217                 device_remove_file(dev, mce_device_attrs[i]);
2218
2219         device_unregister(dev);
2220
2221         return err;
2222 }
2223
2224 static void mce_device_remove(unsigned int cpu)
2225 {
2226         struct device *dev = per_cpu(mce_device, cpu);
2227         int i;
2228
2229         if (!cpumask_test_cpu(cpu, mce_device_initialized))
2230                 return;
2231
2232         for (i = 0; mce_device_attrs[i]; i++)
2233                 device_remove_file(dev, mce_device_attrs[i]);
2234
2235         for (i = 0; i < mca_cfg.banks; i++)
2236                 device_remove_file(dev, &mce_banks[i].attr);
2237
2238         device_unregister(dev);
2239         cpumask_clear_cpu(cpu, mce_device_initialized);
2240         per_cpu(mce_device, cpu) = NULL;
2241 }
2242
2243 /* Make sure there are no machine checks on offlined CPUs. */
2244 static void mce_disable_cpu(void)
2245 {
2246         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2247                 return;
2248
2249         if (!cpuhp_tasks_frozen)
2250                 cmci_clear();
2251
2252         vendor_disable_error_reporting();
2253 }
2254
2255 static void mce_reenable_cpu(void)
2256 {
2257         int i;
2258
2259         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2260                 return;
2261
2262         if (!cpuhp_tasks_frozen)
2263                 cmci_reenable();
2264         for (i = 0; i < mca_cfg.banks; i++) {
2265                 struct mce_bank *b = &mce_banks[i];
2266
2267                 if (b->init)
2268                         wrmsrl(msr_ops.ctl(i), b->ctl);
2269         }
2270 }
2271
2272 static int mce_cpu_dead(unsigned int cpu)
2273 {
2274         mce_intel_hcpu_update(cpu);
2275
2276         /* intentionally ignoring frozen here */
2277         if (!cpuhp_tasks_frozen)
2278                 cmci_rediscover();
2279         return 0;
2280 }
2281
2282 static int mce_cpu_online(unsigned int cpu)
2283 {
2284         struct timer_list *t = this_cpu_ptr(&mce_timer);
2285         int ret;
2286
2287         mce_device_create(cpu);
2288
2289         ret = mce_threshold_create_device(cpu);
2290         if (ret) {
2291                 mce_device_remove(cpu);
2292                 return ret;
2293         }
2294         mce_reenable_cpu();
2295         mce_start_timer(t);
2296         return 0;
2297 }
2298
/*
 * CPU hotplug teardown callback for the "x86/mce:online" state (see
 * mcheck_init_device()): disable machine checks on this CPU, delete its
 * polling timer synchronously, then remove the threshold device and the MCE
 * device of @cpu. Always returns 0.
 */
static int mce_cpu_pre_down(unsigned int cpu)
{
        struct timer_list *t = this_cpu_ptr(&mce_timer);

        mce_disable_cpu();
        del_timer_sync(t);
        mce_threshold_remove_device(cpu);
        mce_device_remove(cpu);
        return 0;
}
2309
2310 static __init void mce_init_banks(void)
2311 {
2312         int i;
2313
2314         for (i = 0; i < mca_cfg.banks; i++) {
2315                 struct mce_bank *b = &mce_banks[i];
2316                 struct device_attribute *a = &b->attr;
2317
2318                 sysfs_attr_init(&a->attr);
2319                 a->attr.name    = b->attrname;
2320                 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2321
2322                 a->attr.mode    = 0644;
2323                 a->show         = show_bank;
2324                 a->store        = set_bank;
2325         }
2326 }
2327
2328 static __init int mcheck_init_device(void)
2329 {
2330         int err;
2331
2332         if (!mce_available(&boot_cpu_data)) {
2333                 err = -EIO;
2334                 goto err_out;
2335         }
2336
2337         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2338                 err = -ENOMEM;
2339                 goto err_out;
2340         }
2341
2342         mce_init_banks();
2343
2344         err = subsys_system_register(&mce_subsys, NULL);
2345         if (err)
2346                 goto err_out_mem;
2347
2348         err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
2349                                 mce_cpu_dead);
2350         if (err)
2351                 goto err_out_mem;
2352
2353         err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
2354                                 mce_cpu_online, mce_cpu_pre_down);
2355         if (err < 0)
2356                 goto err_out_online;
2357
2358         register_syscore_ops(&mce_syscore_ops);
2359
2360         return 0;
2361
2362 err_out_online:
2363         cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
2364
2365 err_out_mem:
2366         free_cpumask_var(mce_device_initialized);
2367
2368 err_out:
2369         pr_err("Unable to init MCE device (rc: %d)\n", err);
2370
2371         return err;
2372 }
2373 device_initcall_sync(mcheck_init_device);
2374
/*
 * Old style boot options parsing. Only for compatibility.
 *
 * "nomce" sets mca_cfg.disabled, the same flag "mce=off" sets. @str is
 * unused. Always returns 1.
 */
static int __init mcheck_disable(char *str)
{
        mca_cfg.disabled = true;
        return 1;
}
__setup("nomce", mcheck_disable);
2384
2385 #ifdef CONFIG_DEBUG_FS
2386 struct dentry *mce_get_debugfs_dir(void)
2387 {
2388         static struct dentry *dmce;
2389
2390         if (!dmce)
2391                 dmce = debugfs_create_dir("mce", NULL);
2392
2393         return dmce;
2394 }
2395
/*
 * Reset the bookkeeping state listed below to zero. Called from
 * fake_panic_set() before fake_panic is updated.
 */
static void mce_reset(void)
{
        cpu_missing = 0;
        atomic_set(&mce_fake_panicked, 0);
        atomic_set(&mce_executing, 0);
        atomic_set(&mce_callin, 0);
        atomic_set(&global_nwo, 0);
}
2404
/*
 * Getter for the fake_panic debugfs attribute: stores the current
 * fake_panic value in @val. @data is unused. Always returns 0.
 */
static int fake_panic_get(void *data, u64 *val)
{
        *val = fake_panic;
        return 0;
}
2410
/*
 * Setter for the fake_panic debugfs attribute: resets the MCE bookkeeping
 * via mce_reset() and then stores @val in fake_panic. @data is unused.
 * Always returns 0.
 */
static int fake_panic_set(void *data, u64 val)
{
        mce_reset();
        fake_panic = val;
        return 0;
}
2417
/*
 * File operations for the "fake_panic" debugfs file, built from the getter
 * and setter above; the value is formatted as an unsigned decimal.
 */
DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
                        fake_panic_set, "%llu\n");
2420
2421 static int __init mcheck_debugfs_init(void)
2422 {
2423         struct dentry *dmce, *ffake_panic;
2424
2425         dmce = mce_get_debugfs_dir();
2426         if (!dmce)
2427                 return -ENOMEM;
2428         ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2429                                           &fake_panic_fops);
2430         if (!ffake_panic)
2431                 return -ENOMEM;
2432
2433         return 0;
2434 }
2435 #else
/* Stub for builds without CONFIG_DEBUG_FS: nothing to create, returns -EINVAL. */
static int __init mcheck_debugfs_init(void) { return -EINVAL; }
2437 #endif
2438
/*
 * Static key that starts out false; mcheck_late_init() increments it when
 * mca_cfg.recovery is set. Exported to GPL modules.
 */
DEFINE_STATIC_KEY_FALSE(mcsafe_key);
EXPORT_SYMBOL_GPL(mcsafe_key);
2441
/*
 * Late initcall: enable mcsafe_key if recovery was requested, set up the
 * debugfs file and call cec_init(), then schedule the MCE work. The return
 * value of mcheck_debugfs_init() is not checked. Always returns 0.
 */
static int __init mcheck_late_init(void)
{
        if (mca_cfg.recovery)
                static_branch_inc(&mcsafe_key);

        mcheck_debugfs_init();
        cec_init();

        /*
         * Flush out everything that has been logged during early boot, now that
         * everything has been initialized (workqueues, decoders, ...).
         */
        mce_schedule_work();

        return 0;
}
late_initcall(mcheck_late_init);