[PATCH] x86_64: Log machine checks from boot on Intel systems
arch/x86_64/kernel/mce.c
/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/ctype.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/kdebug.h>
#include <asm/uaccess.h>

#define MISC_MCELOG_MINOR 227
#define NR_BANKS 5

static int mce_dont_init;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
static int banks;
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long console_logged;
static int notify_user;
static int rip_msr;
static int mce_bootlog = 1;

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

struct mce_log mcelog = {
        MCE_LOG_SIGNATURE,
        MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
        unsigned next, entry;
        mce->finished = 0;
        wmb();
        for (;;) {
                entry = rcu_dereference(mcelog.next);
                /* The rmb forces the compiler to reload next in each
                   iteration */
                rmb();
                for (;;) {
                        /* When the buffer fills up discard new entries. Assume
                           that the earlier errors are the more interesting. */
                        if (entry >= MCE_LOG_LEN) {
                                set_bit(MCE_OVERFLOW, &mcelog.flags);
                                return;
                        }
                        /* Old left over entry. Skip. */
                        if (mcelog.entry[entry].finished) {
                                entry++;
                                continue;
                        }
                        break;
                }
                smp_rmb();
                next = entry + 1;
                if (cmpxchg(&mcelog.next, entry, next) == entry)
                        break;
        }
        memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
        wmb();
        mcelog.entry[entry].finished = 1;
        wmb();

        if (!test_and_set_bit(0, &console_logged))
                notify_user = 1;
}
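
/*
 * The same reserve-then-publish idea as mce_log() above, restated as a
 * minimal standalone sketch using C11 atomics.  Illustrative only (kept
 * under #if 0); the demo_* names are not kernel symbols.
 */
#if 0
#include <stdatomic.h>
#include <string.h>

#define DEMO_LEN 32

struct demo_rec {
        atomic_int finished;            /* set last, so readers only see whole records */
        char payload[56];
};

static struct demo_rec demo_buf[DEMO_LEN];
static atomic_uint demo_next;

static int demo_log(const char *payload)
{
        unsigned int entry, next;

        /* Reserve a slot with compare-and-swap, like the cmpxchg loop above. */
        do {
                entry = atomic_load(&demo_next);
                if (entry >= DEMO_LEN)
                        return -1;      /* buffer full: drop, keep the older entries */
                next = entry + 1;
        } while (!atomic_compare_exchange_weak(&demo_next, &entry, next));

        /* Fill the record, then publish it by flipping "finished" last. */
        strncpy(demo_buf[entry].payload, payload,
                sizeof(demo_buf[entry].payload) - 1);
        atomic_store_explicit(&demo_buf[entry].finished, 1,
                              memory_order_release);
        return 0;
}
#endif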

static void print_mce(struct mce *m)
{
        printk(KERN_EMERG "\n"
               KERN_EMERG
               "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
               m->cpu, m->mcgstatus, m->bank, m->status);
        if (m->rip) {
                printk(KERN_EMERG
                       "RIP%s %02x:<%016Lx> ",
                       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
                       m->cs, m->rip);
                if (m->cs == __KERNEL_CS)
                        print_symbol("{%s}", m->rip);
                printk("\n");
        }
        printk(KERN_EMERG "TSC %Lx ", m->tsc);
        if (m->addr)
                printk("ADDR %Lx ", m->addr);
        if (m->misc)
                printk("MISC %Lx ", m->misc);
        printk("\n");
}

static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
        int i;
        oops_begin();
        for (i = 0; i < MCE_LOG_LEN; i++) {
                unsigned long tsc = mcelog.entry[i].tsc;
                if (time_before(tsc, start))
                        continue;
                print_mce(&mcelog.entry[i]);
                if (backup && mcelog.entry[i].tsc == backup->tsc)
                        backup = NULL;
        }
        if (backup)
                print_mce(backup);
        if (tolerant >= 3)
                printk("Fake panic: %s\n", msg);
        else
                panic(msg);
}

static int mce_available(struct cpuinfo_x86 *c)
{
        return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
               test_bit(X86_FEATURE_MCA, &c->x86_capability);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
        if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
                m->rip = regs->rip;
                m->cs = regs->cs;
        } else {
                m->rip = 0;
                m->cs = 0;
        }
        if (rip_msr) {
                /* Assume the RIP in the MSR is exact. Is this true? */
                m->mcgstatus |= MCG_STATUS_EIPV;
                rdmsrl(rip_msr, m->rip);
                m->cs = 0;
        }
}

/*
 * The actual machine check handler
 */

void do_machine_check(struct pt_regs * regs, long error_code)
{
        struct mce m, panicm;
        int nowayout = (tolerant < 1);
        int kill_it = 0;
        u64 mcestart = 0;
        int i;
        int panicm_found = 0;

        if (regs)
                notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
        if (!banks)
                return;

        memset(&m, 0, sizeof(struct mce));
        m.cpu = hard_smp_processor_id();
        rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
        if (!(m.mcgstatus & MCG_STATUS_RIPV))
                kill_it = 1;

        rdtscll(mcestart);
        barrier();

        for (i = 0; i < banks; i++) {
                if (!bank[i])
                        continue;

                m.misc = 0;
                m.addr = 0;
                m.bank = i;
                m.tsc = 0;

                rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
                if ((m.status & MCI_STATUS_VAL) == 0)
                        continue;

                if (m.status & MCI_STATUS_EN) {
                        /* In theory _OVER could be a nowayout too, but
                           assume any overflowed errors were not fatal. */
                        nowayout |= !!(m.status & MCI_STATUS_PCC);
                        kill_it |= !!(m.status & MCI_STATUS_UC);
                }

                if (m.status & MCI_STATUS_MISCV)
                        rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
                if (m.status & MCI_STATUS_ADDRV)
                        rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

                mce_get_rip(&m, regs);
                if (error_code >= 0)
                        rdtscll(m.tsc);
                wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
                if (error_code != -2)
                        mce_log(&m);

                /* Did this bank cause the exception? */
                /* Assume that the bank with uncorrectable errors did it,
                   and that there is only a single one. */
                if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
                        panicm = m;
                        panicm_found = 1;
                }

                add_taint(TAINT_MACHINE_CHECK);
        }

        /* Never do anything final in the polling timer */
        if (!regs)
                goto out;

        /* If we didn't find an uncorrectable error, pick
           the last one (shouldn't happen, just being safe). */
        if (!panicm_found)
                panicm = m;
        if (nowayout)
                mce_panic("Machine check", &panicm, mcestart);
        if (kill_it) {
                int user_space = 0;

                if (m.mcgstatus & MCG_STATUS_RIPV)
                        user_space = panicm.rip && (panicm.cs & 3);

                /* When the machine was in user space and the CPU didn't get
                   confused it's normally not necessary to panic, unless you
                   are paranoid (tolerant == 0).

                   RED-PEN could be more tolerant for MCEs in idle,
                   but most likely they occur at boot anyway, where
                   it is best to just halt the machine. */
                if ((!user_space && (panic_on_oops || tolerant < 2)) ||
                    (unsigned)current->pid <= 1)
                        mce_panic("Uncorrected machine check", &panicm, mcestart);

                /* do_exit takes an awful lot of locks and has a
                   slight risk of deadlocking. If you don't want that,
                   don't set tolerant >= 2. */
                if (tolerant < 3)
                        do_exit(SIGBUS);
        }

 out:
        /* Last thing done in the machine check exception to clear state. */
        wrmsrl(MSR_IA32_MCG_STATUS, 0);
}
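
/*
 * For reference, the MCi_STATUS bits tested above sit at the architecturally
 * defined positions (Intel SDM / AMD documentation).  A small standalone
 * decoder, illustrative only and kept under #if 0; the DEMO_* names are not
 * kernel symbols.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define DEMO_VAL        (1ULL << 63)    /* register holds a valid error        */
#define DEMO_OVER       (1ULL << 62)    /* an earlier error was overwritten    */
#define DEMO_UC         (1ULL << 61)    /* error was not corrected             */
#define DEMO_EN         (1ULL << 60)    /* reporting was enabled in MCi_CTL    */
#define DEMO_MISCV      (1ULL << 59)    /* MCi_MISC contains extra information */
#define DEMO_ADDRV      (1ULL << 58)    /* MCi_ADDR contains a valid address   */
#define DEMO_PCC        (1ULL << 57)    /* processor context may be corrupt    */

static void demo_decode_status(uint64_t status)
{
        if (!(status & DEMO_VAL))
                return;
        printf("MCA error code %#llx%s%s%s%s\n",
               (unsigned long long)(status & 0xffff),
               (status & DEMO_UC)   ? " UC"   : "",
               (status & DEMO_PCC)  ? " PCC"  : "",
               (status & DEMO_OVER) ? " OVER" : "",
               (status & DEMO_EN)   ? " EN"   : "");
}
#endif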

/*
 * Periodic polling timer for "silent" machine check errors.
 */

static int check_interval = 5 * 60; /* 5 minutes */
static void mcheck_timer(void *data);
static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);

static void mcheck_check_cpu(void *info)
{
        if (mce_available(&current_cpu_data))
                do_machine_check(NULL, 0);
}

static void mcheck_timer(void *data)
{
        on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
        schedule_delayed_work(&mcheck_work, check_interval * HZ);

        /*
         * It's ok to read stale data here for notify_user and
         * console_logged as we'll simply get the updated versions
         * on the next mcheck_timer execution and atomic operations
         * on console_logged act as synchronization for notify_user
         * writes.
         */
        if (notify_user && console_logged) {
                notify_user = 0;
                clear_bit(0, &console_logged);
                printk(KERN_INFO "Machine check events logged\n");
        }
}


static __init int periodic_mcheck_init(void)
{
        if (check_interval)
                schedule_delayed_work(&mcheck_work, check_interval*HZ);
        return 0;
}
__initcall(periodic_mcheck_init);


/*
 * Initialize Machine Checks for a CPU.
 */
static void mce_init(void *dummy)
{
        u64 cap;
        int i;

        rdmsrl(MSR_IA32_MCG_CAP, cap);
        banks = cap & 0xff;
        if (banks > NR_BANKS) {
                printk(KERN_INFO "MCE: warning: using only %d banks\n", NR_BANKS);
                banks = NR_BANKS;
        }
        /* Use accurate RIP reporting if available. */
        if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
                rip_msr = MSR_IA32_MCG_EIP;

        /* Log the machine checks left over from the previous reset.
           This also clears all registers */
        do_machine_check(NULL, mce_bootlog ? -1 : -2);

        set_in_cr4(X86_CR4_MCE);

        if (cap & MCG_CTL_P)
                wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

        for (i = 0; i < banks; i++) {
                wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }
}
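
/*
 * How the MCG_CAP test in mce_init() unpacks, shown as a standalone sketch
 * (illustrative only, under #if 0; the demo_* names are not kernel symbols).
 * mce_init() only trusts MCG_EIP when the extended-state bit is set and the
 * extended register block is at least 9 registers long.
 */
#if 0
#include <stdint.h>

struct demo_mcg_cap {
        unsigned int banks;     /* bits  7:0  - number of reporting banks       */
        unsigned int ctl_p;     /* bit   8    - MCG_CTL register is present     */
        unsigned int ext_p;     /* bit   9    - extended state MSRs are present */
        unsigned int ext_cnt;   /* bits 23:16 - number of extended MSRs         */
};

static struct demo_mcg_cap demo_parse_mcg_cap(uint64_t cap)
{
        struct demo_mcg_cap c = {
                .banks   = cap & 0xff,
                .ctl_p   = (cap >> 8) & 1,
                .ext_p   = (cap >> 9) & 1,
                .ext_cnt = (cap >> 16) & 0xff,
        };
        return c;
}
#endif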

/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
        /* This should be disabled by the BIOS, but isn't always */
        if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
                /* disable GART TBL walk error reporting, which trips off
                   incorrectly with the IOMMU & 3ware & Cerberus. */
                clear_bit(10, &bank[4]);
                /* Lots of broken BIOSes around that don't clear them
                   by default and leave crap in there. Don't log. */
                mce_bootlog = 0;
        }
}

static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
        switch (c->x86_vendor) {
        case X86_VENDOR_INTEL:
                mce_intel_feature_init(c);
                break;
        case X86_VENDOR_AMD:
                mce_amd_feature_init(c);
                break;
        default:
                break;
        }
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
        static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;

        mce_cpu_quirks(c);

        if (mce_dont_init ||
            cpu_test_and_set(smp_processor_id(), mce_cpus) ||
            !mce_available(c))
                return;

        mce_init(NULL);
        mce_cpu_features(c);
}

/*
 * Character device to read and clear the MCE log.
 */

static void collect_tscs(void *data)
{
        unsigned long *cpu_tsc = (unsigned long *)data;
        rdtscll(cpu_tsc[smp_processor_id()]);
}

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
        unsigned long *cpu_tsc;
        static DECLARE_MUTEX(mce_read_sem);
        unsigned next;
        char __user *buf = ubuf;
        int i, err;

        cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
        if (!cpu_tsc)
                return -ENOMEM;

        down(&mce_read_sem);
        next = rcu_dereference(mcelog.next);

        /* Only supports full reads right now */
        if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
                up(&mce_read_sem);
                kfree(cpu_tsc);
                return -EINVAL;
        }

        err = 0;
        for (i = 0; i < next; i++) {
                unsigned long start = jiffies;
                while (!mcelog.entry[i].finished) {
                        if (!time_before(jiffies, start + 2)) {
                                /* Entry never completed: drop it and move on
                                   instead of spinning here forever. */
                                memset(mcelog.entry + i, 0, sizeof(struct mce));
                                goto timeout;
                        }
                        cpu_relax();
                }
                smp_rmb();
                err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
                buf += sizeof(struct mce);
timeout:
                ;
        }

        memset(mcelog.entry, 0, next * sizeof(struct mce));
        mcelog.next = 0;

        synchronize_sched();

        /* Collect entries that were still getting written before the synchronize. */

        on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
        for (i = next; i < MCE_LOG_LEN; i++) {
                if (mcelog.entry[i].finished &&
                    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
                        err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
                        smp_rmb();
                        buf += sizeof(struct mce);
                        memset(&mcelog.entry[i], 0, sizeof(struct mce));
                }
        }
        up(&mce_read_sem);
        kfree(cpu_tsc);
        return err ? -EFAULT : buf - ubuf;
}

static int mce_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg)
{
        int __user *p = (int __user *)arg;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        switch (cmd) {
        case MCE_GET_RECORD_LEN:
                return put_user(sizeof(struct mce), p);
        case MCE_GET_LOG_LEN:
                return put_user(MCE_LOG_LEN, p);
        case MCE_GETCLEAR_FLAGS: {
                unsigned flags;
                do {
                        flags = mcelog.flags;
                } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
                return put_user(flags, p);
        }
        default:
                return -ENOTTY;
        }
}

static struct file_operations mce_chrdev_ops = {
        .read = mce_read,
        .ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
        MISC_MCELOG_MINOR,
        "mcelog",
        &mce_chrdev_ops,
};
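
/*
 * A userspace consumer (e.g. the mcelog(8) utility) is expected to read the
 * whole log in one go, since mce_read() rejects partial reads.  A minimal
 * sketch of such a reader, assuming <asm/mce.h> exports struct mce and the
 * MCE_GET_* ioctls to userspace (illustrative only, under #if 0).
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <asm/mce.h>

int main(void)
{
        int fd, recl, loglen;
        char *buf;
        ssize_t n;

        fd = open("/dev/mcelog", O_RDONLY);
        if (fd < 0)
                return 1;
        if (ioctl(fd, MCE_GET_RECORD_LEN, &recl) < 0 ||
            ioctl(fd, MCE_GET_LOG_LEN, &loglen) < 0)
                return 1;

        /* mce_read() only accepts reads covering the full log. */
        buf = malloc((size_t)recl * loglen);
        if (!buf)
                return 1;
        n = read(fd, buf, (size_t)recl * loglen);
        if (n > 0)
                printf("read %zd bytes of machine check records\n", n);
        free(buf);
        close(fd);
        return 0;
}
#endif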

/*
 * Old style boot options parsing. Only for compatibility.
 */

static int __init mcheck_disable(char *str)
{
        mce_dont_init = 1;
        return 0;
}

/* mce=off disables machine check. Note you can reenable it later
   using sysfs.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
        if (*str == '=')
                str++;
        if (!strcmp(str, "off"))
                mce_dont_init = 1;
        else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
                mce_bootlog = str[0] == 'b';
        else if (isdigit(str[0]))
                get_option(&str, &tolerant);
        else
                printk("mce= argument %s ignored. Please use /sys\n", str);
        return 0;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
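
/*
 * Typical command lines, given the parsing above: "mce=off" disables the
 * machine check handler, "mce=2" sets the tolerance level, "mce=bootlog" or
 * "mce=nobootlog" override the boot-time logging default, and the old-style
 * "nomce" is equivalent to "mce=off".
 */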

/*
 * Sysfs support
 */

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
        mce_init(NULL);
        return 0;
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
        if (check_interval)
                cancel_delayed_work(&mcheck_work);
        /* Timer race is harmless here */
        on_each_cpu(mce_init, NULL, 1, 1);
        if (check_interval)
                schedule_delayed_work(&mcheck_work, check_interval*HZ);
}

static struct sysdev_class mce_sysclass = {
        .resume = mce_resume,
        set_kset_name("machinecheck"),
};

static DEFINE_PER_CPU(struct sys_device, device_mce);

/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
        return sprintf(buf, "%lx\n", (unsigned long)var); \
} \
static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
        char *end; \
        unsigned long new = simple_strtoul(buf, &end, 0); \
        if (end == buf) return -EINVAL; \
        var = new; \
        start; \
        return end-buf; \
} \
static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())

/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
        int err;
        if (!mce_available(&cpu_data[cpu]))
                return -EIO;

        per_cpu(device_mce,cpu).id = cpu;
        per_cpu(device_mce,cpu).cls = &mce_sysclass;

        err = sysdev_register(&per_cpu(device_mce,cpu));

        if (!err) {
                sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
                sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
                sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
                sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
                sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
                sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
                sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
        }
        return err;
}
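
/*
 * With the sysdev class registered as "machinecheck", the attributes created
 * above are expected to appear under per-CPU directories along the lines of
 * /sys/devices/system/machinecheck/machinecheck0/{bank0ctl,...,bank4ctl,
 * tolerant,check_interval} (exact path assumed from the sysdev conventions of
 * this kernel generation).  Writing a bankNctl or check_interval file runs
 * mce_restart(); writing tolerant only updates the variable.
 */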

#ifdef CONFIG_HOTPLUG_CPU
static __cpuinit void mce_remove_device(unsigned int cpu)
{
        sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
        sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
        sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
        sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
        sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
        sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
        sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
        sysdev_unregister(&per_cpu(device_mce,cpu));
}
#endif

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static __cpuinit int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
        unsigned int cpu = (unsigned long)hcpu;

        switch (action) {
        case CPU_ONLINE:
                mce_create_device(cpu);
                break;
#ifdef CONFIG_HOTPLUG_CPU
        case CPU_DEAD:
                mce_remove_device(cpu);
                break;
#endif
        }
        return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
        .notifier_call = mce_cpu_callback,
};

static __init int mce_init_device(void)
{
        int err;
        int i = 0;

        if (!mce_available(&boot_cpu_data))
                return -EIO;
        err = sysdev_class_register(&mce_sysclass);

        for_each_online_cpu(i) {
                mce_create_device(i);
        }

        register_cpu_notifier(&mce_cpu_notifier);
        misc_register(&mce_log_device);
        return err;
}

device_initcall(mce_init_device);