arch/x86_64/kernel/mce.c

   1 /*
   2  * Machine check handler.
   3  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   4  * Rest from unknown author(s).
   5  * 2004 Andi Kleen. Rewrote most of it.
   6  */
   7
   8 #include <linux/init.h>
   9 #include <linux/types.h>
  10 #include <linux/kernel.h>
  11 #include <linux/sched.h>
  12 #include <linux/string.h>
  13 #include <linux/rcupdate.h>
  14 #include <linux/kallsyms.h>
  15 #include <linux/sysdev.h>
  16 #include <linux/miscdevice.h>
  17 #include <linux/fs.h>
  18 #include <linux/capability.h>
  19 #include <linux/cpu.h>
  20 #include <linux/percpu.h>
  21 #include <linux/ctype.h>
  22 #include <linux/kmod.h>
  23 #include <asm/processor.h>
  24 #include <asm/msr.h>
  25 #include <asm/mce.h>
  26 #include <asm/kdebug.h>
  27 #include <asm/uaccess.h>
  28 #include <asm/smp.h>
  29
  30 #define MISC_MCELOG_MINOR 227 /* minor number of the "mcelog" misc device */
  31 #define NR_BANKS 6 /* most banks handled here; mce_init() clamps to it */
  32
  33 atomic_t mce_entry; /* raised while do_machine_check() runs, dropped on exit */
  34
  35 static int mce_dont_init; /* set by "nomce"/"mce=off"; mcheck_init() then skips setup */
  36
  37 /* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
  38    3: never panic or exit (for testing only) */
  39 static int tolerant = 1;
  40 static int banks; /* bank count read from MCG_CAP, at most NR_BANKS */
  41 static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; /* value written to each bank's CTL MSR */
  42 static unsigned long console_logged; /* bit 0: set by mce_log(), cleared by mcheck_timer() */
  43 static int notify_user; /* asks mcheck_timer() to print "events logged" */
  44 static int rip_msr; /* MSR to read the RIP from, or 0 if none is used */
  45 static int mce_bootlog = 1; /* log errors left over from before boot? */
  46 static atomic_t mce_events; /* bumped on every mce_log() call */
  47
  48 static char trigger[128]; /* user helper run on new events; empty = none */
  49 static char *trigger_argv[2] = { trigger, NULL };
  50
  51 /*
  52  * Lockless MCE logging infrastructure.
  53  * This avoids deadlocks on printk locks without having to break locks. Also
  54  * separate MCEs from kernel messages to avoid bogus bug reports.
  55  */
  56
  57 struct mce_log mcelog = { /* the one global MCE log */
  58         MCE_LOG_SIGNATURE, /* positional: first member of struct mce_log */
  59         MCE_LOG_LEN,       /* positional: second member (layout is in <asm/mce.h>) */
  60 };
  61
  62 /* Append *mce to the lockless log. The record is dropped, and MCE_OVERFLOW
  63    set, when no free slot is left. */
  64 void mce_log(struct mce *mce)
  65 {
  66         unsigned slot;
  67         atomic_inc(&mce_events);
  68         mce->finished = 0;
  69         wmb();
  70         do {
  71                 slot = rcu_dereference(mcelog.next);
  72                 /* The rmb makes sure mcelog.next is reloaded on every retry. */
  73                 rmb();
  74                 /* Step over old left-over entries that are still marked
  75                    finished. */
  76                 while (slot < MCE_LOG_LEN && mcelog.entry[slot].finished)
  77                         slot++;
  78                 /* Buffer full: discard the new entry, assuming that the
  79                    earlier errors are the more interesting ones. */
  80                 if (slot >= MCE_LOG_LEN) {
  81                         set_bit(MCE_OVERFLOW, &mcelog.flags);
  82                         return;
  83                 }
  84                 smp_rmb();
  85                 /* Claim the slot by moving mcelog.next past it. */
  86         } while (cmpxchg(&mcelog.next, slot, slot + 1) != slot);
  87
  88         /* Copy the record first, set its finished flag only afterwards. */
  89         memcpy(&mcelog.entry[slot], mce, sizeof(struct mce));
  90         wmb();
  91         mcelog.entry[slot].finished = 1;
  92         wmb();
  93
  94         /* Only the caller that flips bit 0 of console_logged asks for the
  95            console notice. */
  96         if (test_and_set_bit(0, &console_logged))
  97                 return;
  98         notify_user = 1;
  99 }
 100
 101 /* Print one MCE record to the console: the status line, the RIP if one was
 102    recorded, then TSC/ADDR/MISC and a fixed notice for the user. */
 103 static void print_mce(struct mce *m)
 104 {
 105         const char *inexact = "";
 106         printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n" KERN_EMERG
 107                "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
 108                m->cpu, m->mcgstatus, m->bank, m->status);
 109         if (m->rip) {
 110                 /* EIPV clear: flag the printed RIP as inexact. */
 111                 if (!(m->mcgstatus & MCG_STATUS_EIPV))
 112                         inexact = " !INEXACT!";
 113                 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", inexact, m->cs, m->rip);
 114                 if (m->cs == __KERNEL_CS)
 115                         print_symbol("{%s}", m->rip);
 116                 printk("\n");
 117         }
 118         printk(KERN_EMERG "TSC %Lx ", m->tsc);
 119         if (m->addr != 0)
 120                 printk("ADDR %Lx ", m->addr);
 121         if (m->misc != 0)
 122                 printk("MISC %Lx ", m->misc);
 123         printk("\n");
 124         printk(KERN_EMERG "This is not a software problem!\n");
 125         printk(KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
 126 }
 127
 128 /* Dump the MCEs logged since @start (plus @backup), then panic with @msg. */
 129 static void mce_panic(char *msg, struct mce *backup, unsigned long start)
 130 {
 131         int i;
 132         oops_begin();
 133         for (i = 0; i < MCE_LOG_LEN; i++) {
 134                 if (time_before((unsigned long)mcelog.entry[i].tsc, start))
 135                         continue;       /* logged before this exception */
 136                 print_mce(&mcelog.entry[i]);
 137                 if (backup && mcelog.entry[i].tsc == backup->tsc)
 138                         backup = NULL;  /* already printed from the log */
 139         }
 140         if (backup)
 141                 print_mce(backup);
 142         if (tolerant >= 3)
 143                 printk("Fake panic: %s\n", msg);
 144         else
 145                 panic("%s", msg);       /* msg is data, never the format */
 146 }
 147
 148 static int mce_available(struct cpuinfo_x86 *c)
 149 { /* nonzero only when CPU @c reports both the MCE and the MCA feature */
 150         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 151 }
 152
 153 /* Fill in m->rip and m->cs: from @regs when RIPV is set, zero otherwise,
 154    and from the RIP MSR instead whenever rip_msr is configured. */
 155 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 156 {
 157         int regs_valid = regs && (m->mcgstatus & MCG_STATUS_RIPV);
 158
 159         m->rip = regs_valid ? regs->rip : 0;
 160         m->cs = regs_valid ? regs->cs : 0;
 161
 162         if (!rip_msr)
 163                 return;
 164         /* Assume the RIP in the MSR is exact. Is this true? */
 165         m->mcgstatus |= MCG_STATUS_EIPV;
 166         rdmsrl(rip_msr, m->rip);
 167         m->cs = 0;
 168 }
 169
 170 /* Run the user trigger if events were logged since it last ran. */
 171 static void do_mce_trigger(void)
 172 {
 173         static atomic_t mce_logged;
 174         int seen = atomic_read(&mce_events);
 175         if (seen == atomic_read(&mce_logged) || !trigger[0])
 176                 return;
 177         atomic_set(&mce_logged, seen);  /* small race window, but harmless */
 178         call_usermodehelper(trigger, trigger_argv, NULL, -1);
 179 }
 180
 181 /*
 182  * The actual machine check handler. Besides the exception itself it is
 183  * called with regs == NULL by the poll timer and by mce_init(). */
 184 /* error_code: >= 0 stamps records with the TSC; -2 clears banks without logging. */
 185 void do_machine_check(struct pt_regs * regs, long error_code)
 186 {
 187         struct mce m, panicm;
 188         int nowayout = (tolerant < 1);  /* nonzero: mce_panic() is called below */
 189         int kill_it = 0;                /* nonzero: panic or kill the current task */
 190         u64 mcestart = 0;
 191         int i;
 192         int panicm_found = 0;
 193         /* Count the handlers in flight; undone again at out2. */
 194         atomic_inc(&mce_entry);
 195
 196         if (regs)
 197                 notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
 198         if (!banks)
 199                 goto out2;
 200         /* m is reused for every bank; the per-bank fields are reset in the loop. */
 201         memset(&m, 0, sizeof(struct mce));
 202         m.cpu = smp_processor_id();
 203         rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
 204         if (!(m.mcgstatus & MCG_STATUS_RIPV))
 205                 kill_it = 1;
 206         /* TSC at handler start: mce_panic() skips log entries older than this. */
 207         rdtscll(mcestart);
 208         barrier();
 209
 210         for (i = 0; i < banks; i++) {
 211                 if (!bank[i])   /* control value is 0: skip this bank */
 212                         continue;
 213
 214                 m.misc = 0;
 215                 m.addr = 0;
 216                 m.bank = i;
 217                 m.tsc = 0;
 218                 /* Banks whose status lacks the VAL bit have nothing to report. */
 219                 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
 220                 if ((m.status & MCI_STATUS_VAL) == 0)
 221                         continue;
 222
 223                 if (m.status & MCI_STATUS_EN) {
 224                         /* In theory _OVER could be a nowayout too, but
 225                            assume any overflowed errors were not fatal. */
 226                         nowayout |= !!(m.status & MCI_STATUS_PCC);
 227                         kill_it |= !!(m.status & MCI_STATUS_UC);
 228                 }
 229                 /* MISC and ADDR are read only when the status marks them valid. */
 230                 if (m.status & MCI_STATUS_MISCV)
 231                         rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
 232                 if (m.status & MCI_STATUS_ADDRV)
 233                         rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
 234
 235                 mce_get_rip(&m, regs);
 236                 if (error_code >= 0)    /* negative: boot-time call from mce_init() */
 237                         rdtscll(m.tsc);
 238                 wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);   /* clear the bank's status */
 239                 if (error_code != -2)   /* -2 is passed when mce_bootlog is off */
 240                         mce_log(&m);
 241
 242                 /* Did this bank cause the exception? */
 243                 /* Assume that the bank with uncorrectable errors did it,
 244                    and that there is only a single one. */
 245                 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
 246                         panicm = m;
 247                         panicm_found = 1;
 248                 }
 249
 250                 add_taint(TAINT_MACHINE_CHECK);
 251         }
 252
 253         /* Never do anything final in the polling timer */
 254         if (!regs) {
 255                 /* Normal interrupt context here. Call trigger for any new
 256                    events. */
 257                 do_mce_trigger();
 258                 goto out;
 259         }
 260
 261         /* If we didn't find an uncorrectable error, pick
 262            the last one (shouldn't happen, just being safe). */
 263         if (!panicm_found)
 264                 panicm = m;
 265         if (nowayout)
 266                 mce_panic("Machine check", &panicm, mcestart);
 267         if (kill_it) {
 268                 int user_space = 0;
 269
 270                 if (m.mcgstatus & MCG_STATUS_RIPV)
 271                         user_space = panicm.rip && (panicm.cs & 3);
 272
 273                 /* When the machine was in user space and the CPU didn't get
 274                    confused it's normally not necessary to panic, unless you
 275                    are paranoid (tolerant == 0)
 276
 277                    RED-PEN could be more tolerant for MCEs in idle,
 278                    but most likely they occur at boot anyways, where
 279                    it is best to just halt the machine. */
 280                 if ((!user_space && (panic_on_oops || tolerant < 2)) ||
 281                     (unsigned)current->pid <= 1)        /* pid 0 or 1 */
 282                         mce_panic("Uncorrected machine check", &panicm, mcestart);
 283
 284                 /* do_exit takes an awful lot of locks and has a
 285                    slight risk of deadlocking. If you don't want that
 286                    don't set tolerant >= 2 */
 287                 if (tolerant < 3)
 288                         do_exit(SIGBUS);
 289         }
 290
 291  out:
 292         /* Last thing done in the machine check exception to clear state. */
 293         wrmsrl(MSR_IA32_MCG_STATUS, 0);
 294  out2:
 295         atomic_dec(&mce_entry);
 296 }
 297
 298 #ifdef CONFIG_X86_MCE_INTEL
 299 /***
 300  * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 301  * @cpu: The CPU on which the event occurred.
 302  * @status: Event status information
 303  *
 304  * This function should be called by the thermal interrupt after the
 305  * event has been processed and the decision was made to log the event
 306  * further.
 307  *
 308  * The status parameter will be saved to the 'status' field of 'struct mce'
 309  * and historically has been the register value of the
 310  * MSR_IA32_THERMAL_STATUS (Intel) msr.
 311  */
 312 void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
 313 {
 314         struct mce throt;
 315         /* Zero the whole record first; only four fields get real values. */
 316         memset(&throt, 0, sizeof(throt));
 317         throt.status = status;
 318         throt.bank = MCE_THERMAL_BANK;
 319         throt.cpu = cpu;
 320         rdtscll(throt.tsc);
 321         mce_log(&throt);
 322 }
 323 #endif /* CONFIG_X86_MCE_INTEL */
 324
 325 /*
 326  * Periodic polling timer for "silent" machine check errors.
 327  */
 328
 329 static int check_interval = 5 * 60; /* poll period in seconds (5 minutes); multiplied by HZ when scheduled */
 330 static void mcheck_timer(struct work_struct *work);
 331 static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); /* the delayed work that runs mcheck_timer() */
 332
 333 static void mcheck_check_cpu(void *info) /* on_each_cpu() callback; @info is unused */
 334 {
 335         if (mce_available(&current_cpu_data))
 336                 do_machine_check(NULL, 0); /* NULL regs: poll only, no panic or kill */
 337 }
 338
 339 /* Periodic work: poll every CPU for silent errors, then re-arm itself. */
 340 static void mcheck_timer(struct work_struct *work)
 341 {
 342         on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
 343         /* check_interval == 0 means polling is off; re-arming with a zero
 344            delay would make this work requeue itself back to back. */
 345         if (check_interval)
 346                 schedule_delayed_work(&mcheck_work, check_interval * HZ);
 347
 348         /* Stale reads of notify_user/console_logged are ok: the next run
 349            sees the update, and the atomic operations on console_logged act
 350            as synchronization for the notify_user writes. */
 351         if (notify_user && console_logged) {
 352                 notify_user = 0;
 353                 clear_bit(0, &console_logged);
 354                 printk(KERN_INFO "Machine check events logged\n");
 355         }
 356 }
 357
 358
 359 static __init int periodic_mcheck_init(void)
 360 {
 361         if (check_interval) /* an interval of 0 leaves polling off */
 362                 schedule_delayed_work(&mcheck_work, check_interval*HZ);
 363         return 0;
 364 }
 365 __initcall(periodic_mcheck_init); /* arms the first poll during boot */
 366
 367
 368 /*
 369  * Initialize Machine Checks for a CPU.
 370  */
 371 static void mce_init(void *dummy)       /* @dummy is unused */
 372 {
 373         u64 cap;
 374         int i;
 375         rdmsrl(MSR_IA32_MCG_CAP, cap);
 376         banks = cap & 0xff;     /* low byte of MCG_CAP: number of banks */
 377         if (banks > NR_BANKS) {
 378                 /* Report the number of banks used, not the number found. */
 379                 printk(KERN_INFO "MCE: warning: using only %d banks\n", NR_BANKS);
 380                 banks = NR_BANKS;
 381         }
 382         /* Use accurate RIP reporting if available. */
 383         if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
 384                 rip_msr = MSR_IA32_MCG_EIP;
 385
 386         /* Log the machine checks left over from the previous reset.
 387            This also clears all registers */
 388         do_machine_check(NULL, mce_bootlog ? -1 : -2);
 389
 390         set_in_cr4(X86_CR4_MCE);
 391
 392         if (cap & MCG_CTL_P)
 393                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 394         /* Program each bank's control value and start from a clear status. */
 395         for (i = 0; i < banks; i++) {
 396                 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
 397                 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 398         }
 399 }
 400
 401 /* Add per CPU specific workarounds here */
 402 static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
 403 {
 404         /* Only AMD CPUs with c->x86 == 15 need a workaround so far. */
 405         if (c->x86_vendor != X86_VENDOR_AMD || c->x86 != 15)
 406                 return;
 407
 408         /* The BIOS should disable GART TBL walk error reporting, but does
 409            not always do so; it trips off incorrectly with the IOMMU &
 410            3ware & Cerberus. */
 411         clear_bit(10, &bank[4]);
 412         /* Lots of broken BIOSes leave stale data in the banks. Don't log. */
 413         mce_bootlog = 0;
 414 }
 415
 416 /*
 417  * Run the vendor-specific feature setup for CPU @c, if its vendor has one.
 418  */
 419 static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
 420 {
 421         if (c->x86_vendor == X86_VENDOR_INTEL) {
 422                 mce_intel_feature_init(c);
 423                 return;
 424         }
 425         if (c->x86_vendor == X86_VENDOR_AMD)
 426                 mce_amd_feature_init(c);
 427         /* Any other vendor gets no extra setup. */
 428 }
 429
 430 /*
 431  * Called for each booted CPU to set up machine checks.
 432  * Must be called with preempt off.
 433  */
 434 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 435 {
 436         static cpumask_t mce_cpus = CPU_MASK_NONE;
 437         mce_cpu_quirks(c);
 438         if (mce_dont_init)
 439                 return;
 440         /* Each CPU is set up only once; mce_cpus records the ones done. */
 441         if (cpu_test_and_set(smp_processor_id(), mce_cpus))
 442                 return;
 443         if (!mce_available(c))
 444                 return;
 445         mce_init(NULL);
 446         mce_cpu_features(c);
 447 }
 448
 449 /*
 450  * Character device to read and clear the MCE log.
 451  */
 452
 453 /* on_each_cpu() callback: store this CPU's TSC in its slot of @data. */
 454 static void collect_tscs(void *data)
 455 {
 456         rdtscll(((unsigned long *)data)[smp_processor_id()]);
 457 }
 458
 459 /* read() on the mcelog device: copy the finished records to user space and
 460    clear them. Returns the bytes copied, or -ENOMEM, -EINVAL or -EFAULT. */
 461 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
 462 {
 463         unsigned long *cpu_tsc;
 464         static DECLARE_MUTEX(mce_read_sem);
 465         unsigned next;
 466         char __user *buf = ubuf;
 467         int i, err;
 468
 469         cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
 470         if (!cpu_tsc)
 471                 return -ENOMEM;
 472
 473         down(&mce_read_sem);
 474         next = rcu_dereference(mcelog.next);
 475         /* Only supports full reads right now */
 476         if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
 477                 up(&mce_read_sem);
 478                 kfree(cpu_tsc);
 479                 return -EINVAL;
 480         }
 481         err = 0;
 482         for (i = 0; i < next; i++) {
 483                 unsigned long start = jiffies;
 484                 /* Give a writer still filling this slot two jiffies to finish. */
 485                 while (!mcelog.entry[i].finished &&
 486                        time_before(jiffies, start + 2))
 487                         cpu_relax();
 488                 if (!mcelog.entry[i].finished) {
 489                         /* Timed out: drop the slot instead of waiting forever. */
 490                         memset(mcelog.entry + i,0, sizeof(struct mce));
 491                         continue;
 492                 }
 493                 smp_rmb();
 494                 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
 495                 buf += sizeof(struct mce);
 496         }
 497
 498         memset(mcelog.entry, 0, next * sizeof(struct mce));
 499         mcelog.next = 0;
 500         synchronize_sched();
 501
 502         /* Collect entries that were still getting written before the synchronize. */
 503         on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
 504         for (i = next; i < MCE_LOG_LEN; i++) {
 505                 if (mcelog.entry[i].finished &&
 506                     mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
 507                         err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
 508                         smp_rmb();
 509                         buf += sizeof(struct mce);
 510                         memset(&mcelog.entry[i], 0, sizeof(struct mce));
 511                 }
 512         }
 513         up(&mce_read_sem);
 514         kfree(cpu_tsc);
 515         return err ? -EFAULT : buf - ubuf;
 516 }
 517
 518 /* ioctl on the mcelog device (CAP_SYS_ADMIN only): report the record size
 519    or the log length, or fetch and clear the log flags. */
 520 static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
 521 {
 522         int __user *uptr = (int __user *)arg;
 523         unsigned old;
 524
 525         if (!capable(CAP_SYS_ADMIN))
 526                 return -EPERM;
 527         if (cmd == MCE_GET_RECORD_LEN)
 528                 return put_user(sizeof(struct mce), uptr);
 529         if (cmd == MCE_GET_LOG_LEN)
 530                 return put_user(MCE_LOG_LEN, uptr);
 531         if (cmd != MCE_GETCLEAR_FLAGS)
 532                 return -ENOTTY;
 533         /* Swap the flags word for 0; retry if it changed in between. */
 534         do {
 535                 old = mcelog.flags;
 536         } while (cmpxchg(&mcelog.flags, old, 0) != old);
 537         return put_user(old, uptr);
 538 }
 539
 540 static const struct file_operations mce_chrdev_ops = { /* only read and ioctl are provided */
 541         .read = mce_read,
 542         .ioctl = mce_ioctl,
 543 };
 544
 545 static struct miscdevice mce_log_device = { /* registered by mce_init_device() */
 546         MISC_MCELOG_MINOR, /* positional initializers; presumably minor, name, fops */
 547         "mcelog",
 548         &mce_chrdev_ops,
 549 };
 550
 551 /*
 552  * Old style boot options parsing. Only for compatibility.
 553  */
 554
 555 static int __init mcheck_disable(char *str) /* handler for the "nomce" boot option */
 556 {
 557         mce_dont_init = 1; /* mcheck_init() returns early once this is set */
 558         return 1;
 559 }
 560
 561 /* mce=off disables machine check. Note you can reenable it later
 562    using sysfs.
 563    mce=TOLERANCELEVEL (number, see above)
 564    mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 565    mce=nobootlog Don't log MCEs from before booting. */
 566 static int __init mcheck_enable(char *str)
 567 {
 568         if (*str == '=')
 569                 str++;
 570         if (!strcmp(str, "off"))
 571                 mce_dont_init = 1;
 572         else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
 573                 mce_bootlog = str[0] == 'b';    /* "bootlog" -> 1, "nobootlog" -> 0 */
 574         else if (isdigit(str[0]))
 575                 get_option(&str, &tolerant);    /* mce=TOLERANCELEVEL */
 576         else    /* end the line so later messages don't run into this one */
 577                 printk("mce= argument %s ignored. Please use /sys\n", str);
 578         return 1;
 579 }
 580
 581 __setup("nomce", mcheck_disable);
 582 __setup("mce", mcheck_enable);
 583
 584 /*
 585  * Sysfs support
 586  */
 587
 588 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 589    Only one CPU is active at this time, the others get readded later using
 590    CPU hotplug. */
 591 static int mce_resume(struct sys_device *dev) /* .resume hook of mce_sysclass */
 592 {
 593         mce_init(NULL); /* redo the setup on the CPU that runs the resume */
 594         return 0;
 595 }
 596
 597 /* Reinit MCEs after user configuration changes */
 598 static void mce_restart(void)
 599 {
 600         /* check_interval already holds the new value here, so cancel
 601            unconditionally: a new value of 0 must still stop the old work. */
 602         cancel_delayed_work(&mcheck_work);
 603         on_each_cpu(mce_init, NULL, 1, 1);      /* timer race is harmless here */
 604         if (check_interval)
 605                 schedule_delayed_work(&mcheck_work, check_interval*HZ);
 606 }
 607
 608 static struct sysdev_class mce_sysclass = { /* class of the per-CPU machinecheck devices */
 609         .resume = mce_resume,
 610         set_kset_name("machinecheck"),
 611 };
 612
 613 DEFINE_PER_CPU(struct sys_device, device_mce); /* one sysdev per CPU, see mce_create_device() */
 614
 615 /* ACCESSOR(name, var, start): define show_<name>, set_<name> and attr_<name> for @var; @start runs after each store. Why are there no generic functions for this? */
 616 #define ACCESSOR(name, var, start) \
 617         static ssize_t show_ ## name(struct sys_device *s, char *buf) {                    \
 618                 return sprintf(buf, "%lx\n", (unsigned long)var);                  \
 619         }                                                                          \
 620         static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
 621                 char *end;                                                         \
 622                 unsigned long new = simple_strtoul(buf, &end, 0);                  \
 623                 if (end == buf) return -EINVAL;                                    \
 624                 var = new;                                                         \
 625                 start;                                                             \
 626                 return end-buf;                                                    \
 627         }                                                                          \
 628         static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 629
 630 /* TBD should generate these dynamically based on number of available banks */
 631 ACCESSOR(bank0ctl,bank[0],mce_restart()) /* one attribute per bank[] entry; a store re-runs mce_init() on all CPUs */
 632 ACCESSOR(bank1ctl,bank[1],mce_restart())
 633 ACCESSOR(bank2ctl,bank[2],mce_restart())
 634 ACCESSOR(bank3ctl,bank[3],mce_restart())
 635 ACCESSOR(bank4ctl,bank[4],mce_restart())
 636 ACCESSOR(bank5ctl,bank[5],mce_restart())
 637
 638 /* sysfs show: write the trigger path plus a newline into @buf and return
 639    the number of characters written. */
 640 static ssize_t show_trigger(struct sys_device *s, char *buf)
 641 {
 642         return sprintf(buf, "%s\n", trigger);
 643 }
 644
 645 static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
 646 {       /* sysfs store: take the new trigger path from @buf */
 647         char *p;
 648         int len;
 649         strncpy(trigger, buf, sizeof(trigger)); /* truncated to fit if too long */
 650         trigger[sizeof(trigger)-1] = 0;
 651         len = strlen(trigger);
 652         p = strchr(trigger, '\n');
 653         if (p) *p = 0;  /* p is NULL when the input has no newline to strip */
 654         return len;     /* length counted before the newline was cut off */
 655 }
 656
 657 static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 658 ACCESSOR(tolerant,tolerant,) /* empty third argument: nothing is re-run after a store */
 659 ACCESSOR(check_interval,check_interval,mce_restart())
 660 static struct sysdev_attribute *mce_attributes[] = { /* NULL-terminated; walked on device create and remove */
 661         &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
 662         &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
 663         &attr_tolerant, &attr_check_interval, &attr_trigger,
 664         NULL
 665 };
 666
 667 /* Per cpu sysdev init.  All of the cpus still share the same ctl bank */
 668 static __cpuinit int mce_create_device(unsigned int cpu)
 669 {
 670         struct sys_device *dev = &per_cpu(device_mce, cpu);
 671         struct sysdev_attribute **attr;
 672         int err;
 673         if (!mce_available(&cpu_data[cpu]))
 674                 return -EIO;
 675
 676         dev->id = cpu;
 677         dev->cls = &mce_sysclass;
 678         err = sysdev_register(dev);
 679         if (err)
 680                 return err;
 681
 682         /* One sysfs file per entry of the NULL-terminated attribute list. */
 683         for (attr = mce_attributes; *attr; attr++)
 684                 sysdev_create_file(dev, *attr);
 685         return 0;
 686 }
 687
 688 static void mce_remove_device(unsigned int cpu)
 689 {
 690         struct sys_device *dev = &per_cpu(device_mce, cpu);
 691         struct sysdev_attribute **attr;
 692         /* Undo mce_create_device(): the files first, then the device. */
 693         for (attr = mce_attributes; *attr; attr++)
 694                 sysdev_remove_file(dev, *attr);
 695         sysdev_unregister(dev);
 696         memset(&dev->kobj, 0, sizeof(struct kobject));  /* presumably so it can be registered again -- TODO confirm */
 697 }
 698
 699 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
 700 /* Hotplug notifier: create the sysdev of a CPU that came online, remove
 701    the sysdev of a CPU that is dead. Always returns NOTIFY_OK. */
 702 static int
 703 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 704 {
 705         unsigned int cpu = (unsigned long)hcpu;
 706
 707         if (action == CPU_ONLINE)
 708                 mce_create_device(cpu);
 709         else if (action == CPU_DEAD)
 710                 mce_remove_device(cpu);
 711         /* Every other action is ignored. */
 712
 713         return NOTIFY_OK;
 714 }
 715
 716 static struct notifier_block mce_cpu_notifier = { /* registered by mce_init_device() */
 717         .notifier_call = mce_cpu_callback,
 718 };
 719
 720 /* Register the sysdev class, per-CPU devices, notifier and mcelog device. */
 721 static __init int mce_init_device(void)
 722 {
 723         int i = 0, err;
 724         if (!mce_available(&boot_cpu_data))
 725                 return -EIO;
 726         err = sysdev_class_register(&mce_sysclass);
 727         if (err)        /* without the class none of the devices can register */
 728                 return err;
 729         for_each_online_cpu(i)
 730                 mce_create_device(i);
 731         register_hotcpu_notifier(&mce_cpu_notifier);
 732         misc_register(&mce_log_device);
 733         return err;
 734 }
 735
 736 device_initcall(mce_init_device);
 737
 738 device_initcall(mce_init_device);