/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/ctype.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/kdebug.h>
#include <asm/uaccess.h>

#define MISC_MCELOG_MINOR 227
#define NR_BANKS 5

static int mce_dont_init;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
static int banks;
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long console_logged;
static int notify_user;
static int rip_msr;
static int mce_bootlog;

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;
	mce->finished = 0;
	smp_wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, &mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			/* Found a free slot; leave the scan and try to claim it. */
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	smp_wmb();
	mcelog.entry[entry].finished = 1;
	smp_wmb();

	if (!test_and_set_bit(0, &console_logged))
		notify_user = 1;
}
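
/*
 * Illustrative sketch (not part of the original source): the essence of
 * the slot-claim protocol above. A writer scans for a slot whose
 * "finished" flag is clear, claims it by advancing mcelog.next with
 * cmpxchg, fills it in, and only then publishes it by setting
 * "finished" behind a write barrier. A reader must therefore check
 * "finished" before trusting the rest of the entry:
 *
 *	struct mce *m = &mcelog.entry[i];
 *	if (m->finished) {	// entry fully written and published
 *		smp_rmb();	// pairs with the smp_wmb() in mce_log()
 *		consume(m);	// hypothetical consumer, not a real function
 *	}
 */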

static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->rip) {
		printk(KERN_EMERG
		       "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->rip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->rip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		printk("MISC %Lx ", m->misc);
	printk("\n");
}

static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;
	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;
		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	if (tolerant >= 3)
		printk("Fake panic: %s\n", msg);
	else
		panic(msg);
}

static int mce_available(struct cpuinfo_x86 *c)
{
	return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
	       test_bit(X86_FEATURE_MCA, &c->x86_capability);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->rip = regs->rip;
		m->cs = regs->cs;
	} else {
		m->rip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->rip);
		m->cs = 0;
	}
}

/*
 * The actual machine check handler
 */

void do_machine_check(struct pt_regs * regs, long error_code)
{
	struct mce m, panicm;
	int nowayout = (tolerant < 1);
	int kill_it = 0;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;

	if (regs)
		notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
	if (!banks)
		return;

	memset(&m, 0, sizeof(struct mce));
	m.cpu = hard_smp_processor_id();
	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		if (m.status & MCI_STATUS_EN) {
			/* In theory _OVER could be a nowayout too, but
			   assume any overflowed errors were not fatal. */
			nowayout |= !!(m.status & MCI_STATUS_PCC);
			kill_it |= !!(m.status & MCI_STATUS_UC);
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		if (error_code >= 0)
			rdtscll(m.tsc);
		wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
		if (error_code != -2)
			mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}

		tainted |= TAINT_MACHINE_CHECK;
	}

	/* Never do anything final in the polling timer */
	if (!regs)
		goto out;

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;
	if (nowayout)
		mce_panic("Machine check", &panicm, mcestart);
	if (kill_it) {
		int user_space = 0;

		if (m.mcgstatus & MCG_STATUS_RIPV)
			user_space = panicm.rip && (panicm.cs & 3);

		/* When the machine was in user space and the CPU didn't get
		   confused it's normally not necessary to panic, unless you
		   are paranoid (tolerant == 0).

		   RED-PEN could be more tolerant for MCEs in idle,
		   but most likely they occur at boot anyway, where
		   it is best to just halt the machine. */
		if ((!user_space && (panic_on_oops || tolerant < 2)) ||
		    (unsigned)current->pid <= 1)
			mce_panic("Uncorrected machine check", &panicm, mcestart);

		/* do_exit takes an awful lot of locks and has a
		   slight risk of deadlocking. If you don't want that,
		   don't set tolerant >= 2. */
		if (tolerant < 3)
			do_exit(SIGBUS);
	}

 out:
	/* Last thing done in the machine check exception to clear state. */
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
}
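
/*
 * Summary (added for illustration): how "tolerant" shapes the decisions
 * above when an enabled, uncorrectable error is found:
 *
 *	tolerant 0: always panic
 *	tolerant 1: panic if a deadlock is possible, e.g. for
 *		    kernel-context hits (the default)
 *	tolerant 2: prefer killing the affected process over panicking
 *	tolerant 3: never panic or exit; log only (testing)
 */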

/*
 * Periodic polling timer for "silent" machine check errors.
 */

static int check_interval = 5 * 60; /* 5 minutes */
static void mcheck_timer(void *data);
static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);

static void mcheck_check_cpu(void *info)
{
	if (mce_available(&current_cpu_data))
		do_machine_check(NULL, 0);
}

static void mcheck_timer(void *data)
{
	on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
	schedule_delayed_work(&mcheck_work, check_interval * HZ);

	/*
	 * It's ok to read stale data here for notify_user and
	 * console_logged as we'll simply get the updated versions
	 * on the next mcheck_timer execution and atomic operations
	 * on console_logged act as synchronization for notify_user
	 * writes.
	 */
	if (notify_user && console_logged) {
		notify_user = 0;
		clear_bit(0, &console_logged);
		printk(KERN_INFO "Machine check events logged\n");
	}
}

static __init int periodic_mcheck_init(void)
{
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
	return 0;
}
__initcall(periodic_mcheck_init);

/*
 * Initialize Machine Checks for a CPU.
 */
static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	banks = cap & 0xff;
	if (banks > NR_BANKS) {
		printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
		banks = NR_BANKS;
	}
	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	/* Log the machine checks left over from the previous reset.
	   This also clears all registers */
	do_machine_check(NULL, mce_bootlog ? -1 : -2);

	set_in_cr4(X86_CR4_MCE);

	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
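
/*
 * Worked example (illustrative, not from the original source): decoding
 * MCG_CAP the way mce_init() does. For a made-up cap value of 0x305:
 *
 *	cap & 0xff  = 0x05	-> five MC banks reported
 *	cap & 0x100 (MCG_CTL_P)	-> set, so MCG_CTL exists and everything
 *				   in it is enabled with all-ones
 *	cap & 0x200 (bit 9)	-> set, extended state present, but the
 *				   extended register count in bits 16-23
 *				   is 0, so the exact-RIP MSR is not used
 *
 * A cap of 0x90305 carries 9 in bits 16-23, so rip_msr would be set to
 * MSR_IA32_MCG_EIP and RIPs would be reported exactly.
 */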

/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
		/* disable GART TBL walk error reporting, which trips off
		   incorrectly with the IOMMU & 3ware & Cerberus. */
		clear_bit(10, &bank[4]);
	}
}

static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	default:
		break;
	}
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;

	mce_cpu_quirks(c);

	if (mce_dont_init ||
	    cpu_test_and_set(smp_processor_id(), mce_cpus) ||
	    !mce_available(c))
		return;

	mce_init(NULL);
	mce_cpu_features(c);
}

/*
 * Character device to read and clear the MCE log.
 */

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;
	rdtscll(cpu_tsc[smp_processor_id()]);
}

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
	unsigned long *cpu_tsc;
	static DECLARE_MUTEX(mce_read_sem);
	unsigned next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	down(&mce_read_sem);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		up(&mce_read_sem);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	for (i = 0; i < next; i++) {
		unsigned long start = jiffies;
		/* Wait briefly for a racing writer to publish the entry;
		   on timeout discard it and move on instead of spinning
		   forever on an entry that will never be completed. */
		while (!mcelog.entry[i].finished) {
			if (!time_before(jiffies, start + 2)) {
				memset(mcelog.entry + i, 0, sizeof(struct mce));
				goto timeout;
			}
			cpu_relax();
		}
		smp_rmb();
		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
		buf += sizeof(struct mce);
 timeout:
		;
	}

	memset(mcelog.entry, 0, next * sizeof(struct mce));
	mcelog.next = 0;

	synchronize_sched();

	/* Collect entries that were still getting written before the synchronize. */
	on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	up(&mce_read_sem);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}
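
/*
 * Userspace sketch (illustrative, not part of this file): reading the
 * log through the "mcelog" misc device registered below. mce_read()
 * only accepts full-log reads, so the buffer must hold MCE_LOG_LEN
 * records; the /dev path is an assumption based on the device name.
 *
 *	struct mce records[MCE_LOG_LEN];
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	ssize_t n = read(fd, records, sizeof records);
 *	// n / sizeof(struct mce) complete records were returned, and
 *	// the kernel-side buffer has been cleared
 *	close(fd);
 */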

static int mce_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}
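
/*
 * Illustrative userspace use of the ioctls above (same assumed device
 * as in the read sketch): query the record size before parsing, then
 * atomically fetch-and-clear the status flags.
 *
 *	int recsize;
 *	unsigned flags;
 *	ioctl(fd, MCE_GET_RECORD_LEN, &recsize);
 *	ioctl(fd, MCE_GETCLEAR_FLAGS, &flags);
 *	if (flags & (1 << MCE_OVERFLOW))
 *		fprintf(stderr, "mcelog overflowed, events were lost\n");
 */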

static struct file_operations mce_chrdev_ops = {
	.read = mce_read,
	.ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * Old style boot options parsing. Only for compatibility.
 */

static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 0;
}

/* mce=off disables machine check. Note you can reenable it later
   using sysfs.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default to work
   around buggy BIOSes that leave bogus MCEs. */
static int __init mcheck_enable(char *str)
{
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog"))
		mce_bootlog = 1;
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk("mce= argument %s ignored. Please use /sys\n", str);
	return 0;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
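
/*
 * Example command lines accepted by the parsers above (illustrative):
 *
 *	nomce		- disable machine checks completely
 *	mce=off		- same effect, via the mce= option
 *	mce=bootlog	- also log MCEs left over from before booting
 *	mce=2		- set the tolerance level to 2 (see above)
 */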

/*
 * Sysfs support
 */

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. */
static int mce_resume(struct sys_device *dev)
{
	on_each_cpu(mce_init, NULL, 1, 1);
	return 0;
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	if (check_interval)
		cancel_delayed_work(&mcheck_work);
	/* Timer race is harmless here */
	on_each_cpu(mce_init, NULL, 1, 1);
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
}

static struct sysdev_class mce_sysclass = {
	.resume = mce_resume,
	set_kset_name("machinecheck"),
};

static DEFINE_PER_CPU(struct sys_device, device_mce);

/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
		return sprintf(buf, "%lx\n", (unsigned long)var); \
	} \
	static ssize_t set_ ## name(struct sys_device *s, const char *buf, size_t siz) { \
		char *end; \
		unsigned long new = simple_strtoul(buf, &end, 0); \
		if (end == buf) return -EINVAL; \
		var = new; \
		start; \
		return end-buf; \
	} \
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

ACCESSOR(bank0ctl, bank[0], mce_restart())
ACCESSOR(bank1ctl, bank[1], mce_restart())
ACCESSOR(bank2ctl, bank[2], mce_restart())
ACCESSOR(bank3ctl, bank[3], mce_restart())
ACCESSOR(bank4ctl, bank[4], mce_restart())
ACCESSOR(tolerant, tolerant, )
ACCESSOR(check_interval, check_interval, mce_restart())
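
/*
 * For illustration (not original source): ACCESSOR(tolerant, tolerant, )
 * expands to roughly the following show/store pair plus the attribute:
 *
 *	static ssize_t show_tolerant(struct sys_device *s, char *buf) {
 *		return sprintf(buf, "%lx\n", (unsigned long)tolerant);
 *	}
 *	static ssize_t set_tolerant(struct sys_device *s, const char *buf,
 *				    size_t siz) {
 *		char *end;
 *		unsigned long new = simple_strtoul(buf, &end, 0);
 *		if (end == buf) return -EINVAL;
 *		tolerant = new;
 *		return end-buf;
 *	}
 *	static SYSDEV_ATTR(tolerant, 0644, show_tolerant, set_tolerant);
 *
 * The files then appear per CPU, e.g. (path assumed from the sysdev
 * class name) /sys/devices/system/machinecheck/machinecheck0/tolerant.
 */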

/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	if (!mce_available(&cpu_data[cpu]))
		return -EIO;

	per_cpu(device_mce,cpu).id = cpu;
	per_cpu(device_mce,cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce,cpu));

	if (!err) {
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
		sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
	}
	return err;
}

#ifdef CONFIG_HOTPLUG_CPU
static __cpuinit void mce_remove_device(unsigned int cpu)
{
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank0ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank1ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank2ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank3ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_bank4ctl);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
	sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
	sysdev_unregister(&per_cpu(device_mce,cpu));
}
#endif

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static __cpuinit int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		mce_create_device(cpu);
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
		mce_remove_device(cpu);
		break;
#endif
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
	.notifier_call = mce_cpu_callback,
};

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;
	err = sysdev_class_register(&mce_sysclass);

	for_each_online_cpu(i) {
		mce_create_device(i);
	}

	register_cpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);