[PATCH] x86_64: another mb() for smpboot.c
[deliverable/linux.git] / arch / x86_64 / kernel / mce.c
CommitLineData
1da177e4
LT
1/*
2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
6 */
7
8#include <linux/init.h>
9#include <linux/types.h>
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/string.h>
13#include <linux/rcupdate.h>
14#include <linux/kallsyms.h>
15#include <linux/sysdev.h>
16#include <linux/miscdevice.h>
17#include <linux/fs.h>
a9415644 18#include <linux/capability.h>
91c6d400
AK
19#include <linux/cpu.h>
20#include <linux/percpu.h>
8c566ef5 21#include <linux/ctype.h>
1da177e4
LT
22#include <asm/processor.h>
23#include <asm/msr.h>
24#include <asm/mce.h>
25#include <asm/kdebug.h>
26#include <asm/uaccess.h>
27
#define MISC_MCELOG_MINOR 227
#define NR_BANKS 5

/* Set by "mce=off"/"nomce"; skips all MCE setup in mcheck_init(). */
static int mce_dont_init;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
static int banks;			/* number of MC banks in use (capped at NR_BANKS) */
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };	/* per-bank MCi_CTL values */
static unsigned long console_logged;	/* bit 0: new events pending for mcheck_timer() */
static int notify_user;			/* print "events logged" on next timer run */
static int rip_msr;			/* MSR with an accurate RIP, 0 if unavailable */
static int mce_bootlog = 1;		/* log events left over from before boot */
1da177e4
LT
42
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

/* Global ring of logged machine checks, written from exception context. */
struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};
53
/*
 * Append one machine-check record to the global mcelog buffer.
 *
 * Lockless: may run in MCE exception context on any CPU.  A writer
 * claims a slot with cmpxchg on mcelog.next; ->finished is the publish
 * flag readers (mce_read) wait on.  When the buffer is full, new
 * entries are dropped and MCE_OVERFLOW is flagged.
 */
void mce_log(struct mce *mce)
{
	unsigned next, entry;
	mce->finished = 0;	/* record is unpublished until copied in */
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		/* The rmb forces the compiler to reload next in each
		   iteration */
		rmb();
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, &mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		/* Claim the slot; retry from the top if another CPU raced us. */
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	/* Publish: readers only trust an entry once ->finished is set. */
	mcelog.entry[entry].finished = 1;
	wmb();

	/* Tell the polling timer there is something to report. */
	if (!test_and_set_bit(0, &console_logged))
		notify_user = 1;
}
91
/* Dump one machine-check record to the console at KERN_EMERG level. */
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->rip) {
		/* Flag the RIP as inexact unless the CPU vouched for it (EIPV). */
		printk(KERN_EMERG
		       "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->rip);
		/* Only kernel text can be resolved to a symbol here. */
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->rip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		printk("MISC %Lx ", m->misc);
	printk("\n");
}
114
/*
 * Panic (or fake-panic when tolerant >= 3) after dumping every logged
 * entry recorded at or after @start, plus @backup -- the record that
 * triggered us -- unless it already appeared in the log.
 */
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;
	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;
		/* Skip entries older than the start of this event. */
		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		/* backup is in the log already; don't print it twice. */
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	if (tolerant >= 3)
		printk("Fake panic: %s\n", msg);
	else
		panic(msg);
}
134
135static int mce_available(struct cpuinfo_x86 *c)
136{
137 return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
138 test_bit(X86_FEATURE_MCA, &c->x86_capability);
139}
140
94ad8474
AK
141static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
142{
143 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
144 m->rip = regs->rip;
145 m->cs = regs->cs;
146 } else {
147 m->rip = 0;
148 m->cs = 0;
149 }
150 if (rip_msr) {
151 /* Assume the RIP in the MSR is exact. Is this true? */
152 m->mcgstatus |= MCG_STATUS_EIPV;
153 rdmsrl(rip_msr, m->rip);
154 m->cs = 0;
155 }
156}
157
1da177e4
LT
/*
 * The actual machine check handler
 */

/*
 * Entry point for the #MC exception and for polling.
 *
 * regs == NULL means we were called from the polling timer or from
 * mce_init(); nothing final (panic / process kill) is done then.
 * error_code < 0 encodes special polling modes: -1 scans the banks and
 * logs what it finds (boot log), -2 scans and clears without logging.
 */
void do_machine_check(struct pt_regs * regs, long error_code)
{
	struct mce m, panicm;
	int nowayout = (tolerant < 1);	/* set: must panic regardless */
	int kill_it = 0;		/* set: kill the interrupted process */
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;

	if (regs)
		notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
	if (!banks)
		return;

	memset(&m, 0, sizeof(struct mce));
	m.cpu = hard_smp_processor_id();
	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* Without a valid restart IP we cannot safely return to the task. */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	rdtscll(mcestart);
	barrier();

	/* Scan every enabled bank for a valid error record. */
	for (i = 0; i < banks; i++) {
		if (!bank[i])	/* bank disabled via sysfs */
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		if (m.status & MCI_STATUS_EN) {
			/* In theory _OVER could be a nowayout too, but
			   assume any overflowed errors were no fatal. */
			nowayout |= !!(m.status & MCI_STATUS_PCC);
			kill_it |= !!(m.status & MCI_STATUS_UC);
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		if (error_code >= 0)
			rdtscll(m.tsc);
		/* Acknowledge the bank so the same error isn't seen again. */
		wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
		if (error_code != -2)	/* -2: silent clear, don't log */
			mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}

		add_taint(TAINT_MACHINE_CHECK);
	}

	/* Never do anything final in the polling timer */
	if (!regs)
		goto out;

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;
	if (nowayout)
		mce_panic("Machine check", &panicm, mcestart);
	if (kill_it) {
		int user_space = 0;

		if (m.mcgstatus & MCG_STATUS_RIPV)
			user_space = panicm.rip && (panicm.cs & 3);

		/* When the machine was in user space and the CPU didn't get
		   confused it's normally not necessary to panic, unless you
		   are paranoid (tolerant == 0)

		   RED-PEN could be more tolerant for MCEs in idle,
		   but most likely they occur at boot anyways, where
		   it is best to just halt the machine. */
		if ((!user_space && (panic_on_oops || tolerant < 2)) ||
		    (unsigned)current->pid <= 1)
			mce_panic("Uncorrected machine check", &panicm, mcestart);

		/* do_exit takes an awful lot of locks and has as
		   slight risk of deadlocking. If you don't want that
		   don't set tolerant >= 2 */
		if (tolerant < 3)
			do_exit(SIGBUS);
	}

 out:
	/* Last thing done in the machine check exception to clear state. */
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
}
266
/*
 * Periodic polling timer for "silent" machine check errors.
 */

static int check_interval = 5 * 60; /* 5 minutes */
static void mcheck_timer(void *data);	/* forward: needed by DECLARE_WORK */
static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);
274
275static void mcheck_check_cpu(void *info)
276{
277 if (mce_available(&current_cpu_data))
278 do_machine_check(NULL, 0);
279}
280
/* Work handler: poll all CPUs for silent machine checks, then rearm. */
static void mcheck_timer(void *data)
{
	on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
	schedule_delayed_work(&mcheck_work, check_interval * HZ);

	/*
	 * It's ok to read stale data here for notify_user and
	 * console_logged as we'll simply get the updated versions
	 * on the next mcheck_timer execution and atomic operations
	 * on console_logged act as synchronization for notify_user
	 * writes.
	 */
	if (notify_user && console_logged) {
		notify_user = 0;
		clear_bit(0, &console_logged);
		printk(KERN_INFO "Machine check events logged\n");
	}
}
299
300
301static __init int periodic_mcheck_init(void)
302{
303 if (check_interval)
304 schedule_delayed_work(&mcheck_work, check_interval*HZ);
305 return 0;
306}
307__initcall(periodic_mcheck_init);
308
309
310/*
311 * Initialize Machine Checks for a CPU.
312 */
313static void mce_init(void *dummy)
314{
315 u64 cap;
316 int i;
317
318 rdmsrl(MSR_IA32_MCG_CAP, cap);
319 banks = cap & 0xff;
320 if (banks > NR_BANKS) {
321 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
322 banks = NR_BANKS;
323 }
94ad8474
AK
324 /* Use accurate RIP reporting if available. */
325 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
326 rip_msr = MSR_IA32_MCG_EIP;
1da177e4
LT
327
328 /* Log the machine checks left over from the previous reset.
329 This also clears all registers */
d5172f26 330 do_machine_check(NULL, mce_bootlog ? -1 : -2);
1da177e4
LT
331
332 set_in_cr4(X86_CR4_MCE);
333
334 if (cap & MCG_CTL_P)
335 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
336
337 for (i = 0; i < banks; i++) {
338 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
339 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
340 }
341}
342
343/* Add per CPU specific workarounds here */
e6982c67 344static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
1da177e4
LT
345{
346 /* This should be disabled by the BIOS, but isn't always */
347 if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
348 /* disable GART TBL walk error reporting, which trips off
349 incorrectly with the IOMMU & 3ware & Cerberus. */
350 clear_bit(10, &bank[4]);
e583538f
AK
351 /* Lots of broken BIOS around that don't clear them
352 by default and leave crap in there. Don't log. */
353 mce_bootlog = 0;
1da177e4 354 }
e583538f 355
1da177e4
LT
356}
357
e6982c67 358static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
1da177e4
LT
359{
360 switch (c->x86_vendor) {
361 case X86_VENDOR_INTEL:
362 mce_intel_feature_init(c);
363 break;
89b831ef
JS
364 case X86_VENDOR_AMD:
365 mce_amd_feature_init(c);
366 break;
1da177e4
LT
367 default:
368 break;
369 }
370}
371
372/*
373 * Called for each booted CPU to set up machine checks.
374 * Must be called with preempt off.
375 */
e6982c67 376void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
1da177e4
LT
377{
378 static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;
379
380 mce_cpu_quirks(c);
381
382 if (mce_dont_init ||
383 cpu_test_and_set(smp_processor_id(), mce_cpus) ||
384 !mce_available(c))
385 return;
386
387 mce_init(NULL);
388 mce_cpu_features(c);
389}
390
/*
 * Character device to read and clear the MCE log.
 */

/* IPI helper: store the running CPU's TSC into the per-CPU slot. */
static void collect_tscs(void *data)
{
	unsigned long *tsc_array = data;

	rdtscll(tsc_array[smp_processor_id()]);
}
400
401static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
402{
f0de53bb 403 unsigned long *cpu_tsc;
1da177e4
LT
404 static DECLARE_MUTEX(mce_read_sem);
405 unsigned next;
406 char __user *buf = ubuf;
407 int i, err;
408
f0de53bb
AK
409 cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
410 if (!cpu_tsc)
411 return -ENOMEM;
412
1da177e4
LT
413 down(&mce_read_sem);
414 next = rcu_dereference(mcelog.next);
415
416 /* Only supports full reads right now */
417 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
418 up(&mce_read_sem);
f0de53bb 419 kfree(cpu_tsc);
1da177e4
LT
420 return -EINVAL;
421 }
422
423 err = 0;
673242c1
AK
424 for (i = 0; i < next; i++) {
425 unsigned long start = jiffies;
426 while (!mcelog.entry[i].finished) {
427 if (!time_before(jiffies, start + 2)) {
428 memset(mcelog.entry + i,0, sizeof(struct mce));
429 continue;
430 }
431 cpu_relax();
432 }
1da177e4
LT
433 smp_rmb();
434 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
435 buf += sizeof(struct mce);
436 }
437
438 memset(mcelog.entry, 0, next * sizeof(struct mce));
439 mcelog.next = 0;
440
b2b18660 441 synchronize_sched();
1da177e4
LT
442
443 /* Collect entries that were still getting written before the synchronize. */
444
445 on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
446 for (i = next; i < MCE_LOG_LEN; i++) {
447 if (mcelog.entry[i].finished &&
448 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
449 err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
450 smp_rmb();
451 buf += sizeof(struct mce);
452 memset(&mcelog.entry[i], 0, sizeof(struct mce));
453 }
454 }
455 up(&mce_read_sem);
f0de53bb 456 kfree(cpu_tsc);
1da177e4
LT
457 return err ? -EFAULT : buf - ubuf;
458}
459
/*
 * ioctl on /dev/mcelog: report record and log sizes, and atomically
 * fetch-and-clear the overflow flags.  Requires CAP_SYS_ADMIN.
 */
static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;
		/* cmpxchg loop: clear flags without losing bits set
		   concurrently by mce_log(). */
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}
481
static struct file_operations mce_chrdev_ops = {
	.read = mce_read,
	.ioctl = mce_ioctl,
};

/* /dev/mcelog, misc device on fixed minor MISC_MCELOG_MINOR (227). */
static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
492
493/*
494 * Old style boot options parsing. Only for compatibility.
495 */
496
497static int __init mcheck_disable(char *str)
498{
499 mce_dont_init = 1;
500 return 0;
501}
502
503/* mce=off disables machine check. Note you can reenable it later
d5172f26 504 using sysfs.
8c566ef5 505 mce=TOLERANCELEVEL (number, see above)
e583538f
AK
506 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
507 mce=nobootlog Don't log MCEs from before booting. */
1da177e4
LT
508static int __init mcheck_enable(char *str)
509{
d5172f26
AK
510 if (*str == '=')
511 str++;
1da177e4
LT
512 if (!strcmp(str, "off"))
513 mce_dont_init = 1;
e583538f
AK
514 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
515 mce_bootlog = str[0] == 'b';
8c566ef5
AK
516 else if (isdigit(str[0]))
517 get_option(&str, &tolerant);
1da177e4
LT
518 else
519 printk("mce= argument %s ignored. Please use /sys", str);
520 return 0;
521}
522
523__setup("nomce", mcheck_disable);
524__setup("mce", mcheck_enable);
525
/*
 * Sysfs support
 */

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	return 0;
}
538
/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	if (check_interval)
		cancel_delayed_work(&mcheck_work);
	/* Timer race is harmless here */
	/* Reprogram every CPU's banks with the updated bank[] masks. */
	on_each_cpu(mce_init, NULL, 1, 1);
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
}
549
/* /sys/devices/system/machinecheck; reprogram MSRs on resume. */
static struct sysdev_class mce_sysclass = {
	.resume = mce_resume,
	set_kset_name("machinecheck"),
};

/* One sysdev per CPU, registered by mce_create_device(). */
static DEFINE_PER_CPU(struct sys_device, device_mce);
1da177e4
LT
556
/* Why are there no generic functions for this? */
/* Generate show_<name>/set_<name> sysfs handlers for one global plus the
   SYSDEV_ATTR; `start' runs after a successful store (e.g. mce_restart()). */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
		return sprintf(buf, "%lx\n", (unsigned long)var); \
	} \
	static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
		char *end; \
		unsigned long new = simple_strtoul(buf, &end, 0); \
		if (end == buf) return -EINVAL; \
		var = new; \
		start; \
		return end-buf; \
	} \
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

/* Per-bank control words plus tuning knobs, writable from sysfs. */
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
579
91c6d400
AK
/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	if (!mce_available(&cpu_data[cpu]))
		return -EIO;

	per_cpu(device_mce,cpu).id = cpu;
	per_cpu(device_mce,cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce,cpu));

	if (!err) {
		/* NOTE(review): sysdev_create_file() return values are
		   ignored; on failure attributes are silently missing. */
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
	}
	return err;
}
603
#ifdef CONFIG_HOTPLUG_CPU
/* Tear down the sysfs attributes and device of a departing CPU. */
static __cpuinit void mce_remove_device(unsigned int cpu)
{
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
	sysdev_unregister(&per_cpu(device_mce,cpu));
}
#endif
617
618/* Get notified when a cpu comes on/off. Be hotplug friendly. */
619static __cpuinit int
620mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
621{
622 unsigned int cpu = (unsigned long)hcpu;
623
624 switch (action) {
625 case CPU_ONLINE:
626 mce_create_device(cpu);
627 break;
628#ifdef CONFIG_HOTPLUG_CPU
629 case CPU_DEAD:
630 mce_remove_device(cpu);
631 break;
632#endif
633 }
634 return NOTIFY_OK;
635}
636
/* Hotplug registration record for mce_cpu_callback(). */
static struct notifier_block mce_cpu_notifier = {
	.notifier_call = mce_cpu_callback,
};
640
641static __init int mce_init_device(void)
642{
643 int err;
644 int i = 0;
645
1da177e4
LT
646 if (!mce_available(&boot_cpu_data))
647 return -EIO;
648 err = sysdev_class_register(&mce_sysclass);
91c6d400
AK
649
650 for_each_online_cpu(i) {
651 mce_create_device(i);
652 }
653
654 register_cpu_notifier(&mce_cpu_notifier);
1da177e4
LT
655 misc_register(&mce_log_device);
656 return err;
1da177e4 657}
91c6d400 658
1da177e4 659device_initcall(mce_init_device);
This page took 0.112653 seconds and 5 git commands to generate.