[PATCH] x86_64: extra NODES_SHIFT definition
[deliverable/linux.git] / arch / x86_64 / kernel / mce.c
CommitLineData
1da177e4
LT
1/*
2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
6 */
7
8#include <linux/init.h>
9#include <linux/types.h>
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/string.h>
13#include <linux/rcupdate.h>
14#include <linux/kallsyms.h>
15#include <linux/sysdev.h>
16#include <linux/miscdevice.h>
17#include <linux/fs.h>
a9415644 18#include <linux/capability.h>
91c6d400
AK
19#include <linux/cpu.h>
20#include <linux/percpu.h>
8c566ef5 21#include <linux/ctype.h>
1da177e4
LT
22#include <asm/processor.h>
23#include <asm/msr.h>
24#include <asm/mce.h>
25#include <asm/kdebug.h>
26#include <asm/uaccess.h>
0a9c3ee7 27#include <asm/smp.h>
1da177e4
LT
#define MISC_MCELOG_MINOR 227
#define NR_BANKS 6

/* Set to 1 (via "nomce"/"mce=off") to skip all MCE initialization. */
static int mce_dont_init;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
static int banks;			/* number of MC banks from MCG_CAP */
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long console_logged;	/* bit 0 set: unreported events logged */
static int notify_user;			/* ask poll timer to print a notice */
static int rip_msr;			/* MSR with accurate RIP, if available */
static int mce_bootlog = 1;		/* log MCEs left over from before boot */

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};
54
/*
 * Append one machine check record to the global mcelog buffer.
 *
 * Runs locklessly (may be called from MCE context): a free slot is
 * claimed with cmpxchg on mcelog.next and only then filled in;
 * ->finished is written last so readers can distinguish complete
 * entries from ones still being written.
 */
void mce_log(struct mce *mce)
{
	unsigned next, entry;
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		/* The rmb forces the compiler to reload next in each
		   iteration */
		rmb();
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, &mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		/* Claim the slot; retry if another writer raced us. */
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	/* Mark the entry valid only after the payload is globally visible. */
	mcelog.entry[entry].finished = 1;
	wmb();

	/* Tell the polling timer there are new events to report. */
	if (!test_and_set_bit(0, &console_logged))
		notify_user = 1;
}
92
/* Dump one machine check record to the console at emergency priority. */
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->rip) {
		printk(KERN_EMERG
		       "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->rip);
		/* Only kernel-text addresses can be resolved to symbols. */
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->rip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		printk("MISC %Lx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG
	       "Run through mcelog --ascii to decode and contact your hardware vendor\n");
}
119
/*
 * Panic (or fake-panic when tolerant >= 3) after dumping all logged MCE
 * records with a TSC not before @start, plus @backup if it was not
 * already among them.
 */
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;
	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;
		/* Skip entries recorded before this machine check. */
		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		/* Avoid printing the backup twice if it is already logged. */
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	if (tolerant >= 3)
		printk("Fake panic: %s\n", msg);
	else
		panic(msg);
}
139
140static int mce_available(struct cpuinfo_x86 *c)
141{
3d1712c9 142 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
1da177e4
LT
143}
144
94ad8474
AK
145static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
146{
147 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
148 m->rip = regs->rip;
149 m->cs = regs->cs;
150 } else {
151 m->rip = 0;
152 m->cs = 0;
153 }
154 if (rip_msr) {
155 /* Assume the RIP in the MSR is exact. Is this true? */
156 m->mcgstatus |= MCG_STATUS_EIPV;
157 rdmsrl(rip_msr, m->rip);
158 m->cs = 0;
159 }
160}
161
1da177e4
LT
/*
 * The actual machine check handler
 */

/*
 * Entry point for the #MC exception and for polling.
 * @regs is NULL when called from the polling timer or boot-time scan;
 * @error_code of -1/-2 marks the boot-time scan (see mce_init):
 * -1 logs leftover errors, -2 only clears them.
 */
void do_machine_check(struct pt_regs * regs, long error_code)
{
	struct mce m, panicm;
	int nowayout = (tolerant < 1);	/* must we panic? */
	int kill_it = 0;		/* must the current task die? */
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;

	if (regs)
		notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
	if (!banks)
		return;

	memset(&m, 0, sizeof(struct mce));
	m.cpu = safe_smp_processor_id();
	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* Without a valid restart IP we cannot safely resume the task. */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	rdtscll(mcestart);
	barrier();

	/* Scan every enabled bank for a valid error record. */
	for (i = 0; i < banks; i++) {
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		if (m.status & MCI_STATUS_EN) {
			/* In theory _OVER could be a nowayout too, but
			   assume any overflowed errors were no fatal. */
			nowayout |= !!(m.status & MCI_STATUS_PCC);
			kill_it |= !!(m.status & MCI_STATUS_UC);
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		/* Don't timestamp records from the boot-time scan. */
		if (error_code >= 0)
			rdtscll(m.tsc);
		wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
		/* -2 means clear silently without logging. */
		if (error_code != -2)
			mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}

		add_taint(TAINT_MACHINE_CHECK);
	}

	/* Never do anything final in the polling timer */
	if (!regs)
		goto out;

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;
	if (nowayout)
		mce_panic("Machine check", &panicm, mcestart);
	if (kill_it) {
		int user_space = 0;

		if (m.mcgstatus & MCG_STATUS_RIPV)
			user_space = panicm.rip && (panicm.cs & 3);

		/* When the machine was in user space and the CPU didn't get
		   confused it's normally not necessary to panic, unless you
		   are paranoid (tolerant == 0)

		   RED-PEN could be more tolerant for MCEs in idle,
		   but most likely they occur at boot anyways, where
		   it is best to just halt the machine. */
		if ((!user_space && (panic_on_oops || tolerant < 2)) ||
		    (unsigned)current->pid <= 1)
			mce_panic("Uncorrected machine check", &panicm, mcestart);

		/* do_exit takes an awful lot of locks and has as
		   slight risk of deadlocking. If you don't want that
		   don't set tolerant >= 2 */
		if (tolerant < 3)
			do_exit(SIGBUS);
	}

 out:
	/* Last thing done in the machine check exception to clear state. */
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
}
270
/*
 * Periodic polling timer for "silent" machine check errors.
 */

static int check_interval = 5 * 60; /* 5 minutes */
static void mcheck_timer(void *data);
static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);

/* Per-CPU poll: run the handler in polling mode (regs == NULL). */
static void mcheck_check_cpu(void *info)
{
	if (mce_available(&current_cpu_data))
		do_machine_check(NULL, 0);
}
284
/* Workqueue callback: poll every CPU for silent errors, re-arm the
   timer, then print a notice if new events were logged. */
static void mcheck_timer(void *data)
{
	on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
	schedule_delayed_work(&mcheck_work, check_interval * HZ);

	/*
	 * It's ok to read stale data here for notify_user and
	 * console_logged as we'll simply get the updated versions
	 * on the next mcheck_timer execution and atomic operations
	 * on console_logged act as synchronization for notify_user
	 * writes.
	 */
	if (notify_user && console_logged) {
		notify_user = 0;
		clear_bit(0, &console_logged);
		printk(KERN_INFO "Machine check events logged\n");
	}
}
303
304
305static __init int periodic_mcheck_init(void)
306{
307 if (check_interval)
308 schedule_delayed_work(&mcheck_work, check_interval*HZ);
309 return 0;
310}
311__initcall(periodic_mcheck_init);
312
313
/*
 * Initialize Machine Checks for a CPU.
 */
static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	/* Low byte of MCG_CAP is the bank count; clamp to what we track. */
	banks = cap & 0xff;
	if (banks > NR_BANKS) {
		printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
		banks = NR_BANKS;
	}
	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	/* Log the machine checks left over from the previous reset.
	   This also clears all registers */
	do_machine_check(NULL, mce_bootlog ? -1 : -2);

	set_in_cr4(X86_CR4_MCE);

	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	/* Enable the configured error types and clear stale status. */
	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
346
/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
		/* disable GART TBL walk error reporting, which trips off
		   incorrectly with the IOMMU & 3ware & Cerberus. */
		clear_bit(10, &bank[4]);
		/* Lots of broken BIOS around that don't clear them
		   by default and leave crap in there. Don't log. */
		mce_bootlog = 0;
	}

}
361
e6982c67 362static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
1da177e4
LT
363{
364 switch (c->x86_vendor) {
365 case X86_VENDOR_INTEL:
366 mce_intel_feature_init(c);
367 break;
89b831ef
JS
368 case X86_VENDOR_AMD:
369 mce_amd_feature_init(c);
370 break;
1da177e4
LT
371 default:
372 break;
373 }
374}
375
/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	/* CPUs that have already been through MCE setup. */
	static cpumask_t mce_cpus = CPU_MASK_NONE;

	mce_cpu_quirks(c);

	/* Bail out if disabled, already initialized, or unsupported. */
	if (mce_dont_init ||
	    cpu_test_and_set(smp_processor_id(), mce_cpus) ||
	    !mce_available(c))
		return;

	mce_init(NULL);
	mce_cpu_features(c);
}
394
/*
 * Character device to read and clear the MCE log.
 */

/* IPI helper: store the current CPU's TSC into the array passed
   through @data, indexed by CPU number. */
static void collect_tscs(void *data)
{
	unsigned long *tscs = data;

	rdtscll(tscs[smp_processor_id()]);
}
404
/*
 * Read and clear the MCE log: copy all finished records to user space,
 * reset the buffer, then pick up records that were still in flight.
 * Only full-buffer-sized reads are supported.
 */
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
	unsigned long *cpu_tsc;
	static DECLARE_MUTEX(mce_read_sem);	/* serializes readers */
	unsigned next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	down(&mce_read_sem);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		up(&mce_read_sem);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	for (i = 0; i < next; i++) {
		unsigned long start = jiffies;
		/* Give an in-flight writer a couple of jiffies to finish. */
		while (!mcelog.entry[i].finished) {
			if (!time_before(jiffies, start + 2)) {
				/* NOTE(review): the memset leaves ->finished
				   at 0, so this 'continue' re-enters the
				   timeout branch and can spin here forever;
				   looks like it should break out of the
				   while loop instead — confirm intent. */
				memset(mcelog.entry + i,0, sizeof(struct mce));
				continue;
			}
			cpu_relax();
		}
		smp_rmb();
		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
		buf += sizeof(struct mce);
	}

	memset(mcelog.entry, 0, next * sizeof(struct mce));
	mcelog.next = 0;

	/* Wait out writers that raced with the reset above. */
	synchronize_sched();

	/* Collect entries that were still getting written before the synchronize. */

	on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	up(&mce_read_sem);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}
463
464static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
465{
466 int __user *p = (int __user *)arg;
467 if (!capable(CAP_SYS_ADMIN))
468 return -EPERM;
469 switch (cmd) {
470 case MCE_GET_RECORD_LEN:
471 return put_user(sizeof(struct mce), p);
472 case MCE_GET_LOG_LEN:
473 return put_user(MCE_LOG_LEN, p);
474 case MCE_GETCLEAR_FLAGS: {
475 unsigned flags;
476 do {
477 flags = mcelog.flags;
478 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
479 return put_user(flags, p);
480 }
481 default:
482 return -ENOTTY;
483 }
484}
485
/* /dev/mcelog file operations: read drains the log, ioctl queries it. */
static struct file_operations mce_chrdev_ops = {
	.read = mce_read,
	.ioctl = mce_ioctl,
};

/* Misc character device for /dev/mcelog (minor 227). */
static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
496
/*
 * Old style boot options parsing. Only for compatibility.
 */

/* "nomce": disable machine check handling entirely. */
static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 1;
}
506
507/* mce=off disables machine check. Note you can reenable it later
d5172f26 508 using sysfs.
8c566ef5 509 mce=TOLERANCELEVEL (number, see above)
e583538f
AK
510 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
511 mce=nobootlog Don't log MCEs from before booting. */
1da177e4
LT
512static int __init mcheck_enable(char *str)
513{
d5172f26
AK
514 if (*str == '=')
515 str++;
1da177e4
LT
516 if (!strcmp(str, "off"))
517 mce_dont_init = 1;
e583538f
AK
518 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
519 mce_bootlog = str[0] == 'b';
8c566ef5
AK
520 else if (isdigit(str[0]))
521 get_option(&str, &tolerant);
1da177e4
LT
522 else
523 printk("mce= argument %s ignored. Please use /sys", str);
9b41046c 524 return 1;
1da177e4
LT
525}
526
527__setup("nomce", mcheck_disable);
528__setup("mce", mcheck_enable);
529
/*
 * Sysfs support
 */

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	return 0;
}
542
/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	/* Stop the poll while we reprogram the banks. */
	if (check_interval)
		cancel_delayed_work(&mcheck_work);
	/* Timer race is harmless here */
	on_each_cpu(mce_init, NULL, 1, 1);
	/* Re-arm with the (possibly updated) interval. */
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
}
553
/* Sysdev class backing /sys/devices/system/machinecheck. */
static struct sysdev_class mce_sysclass = {
	.resume = mce_resume,
	set_kset_name("machinecheck"),
};

/* One sysdev node per CPU, registered in mce_create_device(). */
static DEFINE_PER_CPU(struct sys_device, device_mce);
1da177e4
LT
560
/* Why are there no generic functions for this? */
/* Generate show/store sysfs handlers for an unsigned long variable;
   'start' is run after a successful store (typically mce_restart()). */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
		return sprintf(buf, "%lx\n", (unsigned long)var); \
	} \
	static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
		char *end; \
		unsigned long new = simple_strtoul(buf, &end, 0); \
		if (end == buf) return -EINVAL; \
		var = new; \
		start; \
		return end-buf; \
	} \
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

/* Per-bank MCi_CTL masks, plus tolerance level and poll interval. */
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(bank5ctl,bank[5],mce_restart())
static struct sysdev_attribute * bank_attributes[NR_BANKS] = {
	&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
	&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl};
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
587
91c6d400
AK
/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;
	if (!mce_available(&cpu_data[cpu]))
		return -EIO;

	per_cpu(device_mce,cpu).id = cpu;
	per_cpu(device_mce,cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce,cpu));

	if (!err) {
		/* NOTE(review): sysdev_create_file() return values are
		   ignored; attribute creation failures go unnoticed. */
		for (i = 0; i < banks; i++)
			sysdev_create_file(&per_cpu(device_mce,cpu),
				bank_attributes[i]);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
	}
	return err;
}
610
#ifdef CONFIG_HOTPLUG_CPU
/* Tear down the per-CPU sysdev created by mce_create_device(). */
static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(device_mce,cpu),
			bank_attributes[i]);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
	sysdev_unregister(&per_cpu(device_mce,cpu));
}
#endif
624
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static __cpuinit int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		mce_create_device(cpu);
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
		mce_remove_device(cpu);
		break;
#endif
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
	.notifier_call = mce_cpu_callback,
};
647
/* Driver init: register the sysfs class, one sysdev per online CPU,
   the hotplug notifier and the /dev/mcelog device. */
static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;
	err = sysdev_class_register(&mce_sysclass);

	for_each_online_cpu(i) {
		mce_create_device(i);
	}

	register_cpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}
device_initcall(mce_init_device);
This page took 0.138383 seconds and 5 git commands to generate.