x86, mce: Move debugfs mce dir creating to mce.c
arch/x86/kernel/cpu/mcheck/mce.c
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>

#include <asm/processor.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"

int mce_disabled __read_mostly;

#define MISC_MCELOG_MINOR	227

#define SPINUNIT 100	/* 100ns */

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 * 0: always panic on uncorrected errors, log corrected errors
 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 * 3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant __read_mostly = 1;
static int banks __read_mostly;
static int rip_msr __read_mostly;
static int mce_bootlog __read_mostly = -1;
static int monarch_timeout __read_mostly = -1;
static int mce_panic_timeout __read_mostly;
static int mce_dont_log_ce __read_mostly;
int mce_cmci_disabled __read_mostly;
int mce_ignore_ce __read_mostly;
int mce_ser __read_mostly;

struct mce_bank *mce_banks __read_mostly;

/* User mode helper program triggered by machine check event */
static unsigned long mce_need_notify;
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static DEFINE_PER_CPU(struct work_struct, mce_work);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
#ifdef CONFIG_SMP
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
#endif
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &mce_need_notify);
}

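/*
 * Illustrative sketch of the protocol above (added for exposition, not part
 * of the original file): a slot is claimed by advancing mcelog.next with
 * cmpxchg(), so concurrent loggers -- including NMI context -- end up with
 * distinct entries. The publish side then pairs with the reader like this:
 *
 *	writer				reader (mce_read)
 *	------				-----------------
 *	copy record into slot
 *	wmb()
 *	slot->finished = 1		while (!slot->finished)
 *						spin, with a timeout
 *
 * A reader that keeps seeing finished == 0 must give up eventually, which
 * is what the jiffies-based timeout in mce_read() below implements.
 */
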
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->extcpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
	       m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
}

static void print_mce_head(void)
{
	printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
}

static void print_mce_tail(void)
{
	printk(KERN_EMERG "This is not a software problem!\n"
	       KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_paniced;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic("Panicing machine check CPU died");
}

static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i;

	/*
	 * Make sure only one CPU runs in machine check panic
	 */
	if (atomic_inc_return(&mce_paniced) > 1)
		wait_for_panic();
	barrier();

	bust_spinlocks(1);
	console_verbose();
	print_mce_head();
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];

		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			print_mce(m);
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];

		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		if (!final || memcmp(m, final, sizeof(struct mce)))
			print_mce(m);
	}
	if (final)
		print_mce(final);
	if (cpu_missing)
		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
	print_mce_tail();
	if (exp)
		printk(KERN_EMERG "Machine check: %s\n", exp);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic(msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __get_cpu_var(injectm.bank);

	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MCx_STATUS(bank))
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MCx_ADDR(bank))
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MCx_MISC(bank))
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}
	rdmsrl(msr, v);
	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

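/*
 * Illustrative note (added for exposition): while injectm.finished is set
 * on a CPU, mce_rdmsrl()/mce_wrmsrl() on that CPU are served from the
 * per-cpu struct mce instead of real MSRs. An injector such as mce-inject.c
 * can therefore stage a fake error roughly like (sketch, details assumed):
 *
 *	struct mce m = { .bank = 1, .status = MCI_STATUS_VAL, ... };
 *	per_cpu(injectm, cpu) = m;
 *	per_cpu(injectm, cpu).finished = 1;	(arms the redirection)
 *	... then run machine_check_poll()/do_machine_check() on that CPU.
 */
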
/*
 * Simple lockless ring to communicate PFNs from the exception handler with the
 * process context work function. This is vastly simplified because there's
 * only a single reader and a single writer.
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;
	unsigned short end;
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);

	return r->start == r->end;
}

static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = &__get_cpu_var(mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	if (next == r->start)
		return -1;
	r->ring[r->end] = pfn;
	wmb();
	r->end = next;
	return 0;
}

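/*
 * Illustrative note (added for exposition): with a single producer (the #MC
 * handler) and a single consumer (the work function), the ring needs no
 * locks. It counts as full when (end + 1) % MCE_RING_SIZE == start, hence
 * "one entry less": at most MCE_RING_SIZE - 1 = 15 PFNs are buffered. The
 * wmb() in mce_ring_add() publishes the PFN store before the index update,
 * mirroring the mce_log() protocol above.
 */
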
int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_ring_empty()) {
		struct work_struct *work = &__get_cpu_var(mce_work);

		if (!work_pending(work))
			schedule_work(work);
	}
}

/*
 * Get the address of the instruction at the time of the machine check
 * error.
 */
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr)
		m->ip = mce_rdmsrl(rip_msr);
}

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Called after interrupts have been reenabled again
 * when an MCE happened during an interrupts off region
 * in the kernel.
 */
asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
	exit_idle();
	irq_enter();
	mce_notify_irq();
	mce_schedule_work();
	irq_exit();
}
#endif

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Without APIC do not notify. The event will be picked
	 * up eventually.
	 */
	if (!cpu_has_apic)
		return;

	/*
	 * When interrupts are disabled we cannot use
	 * kernel services safely. Trigger a self interrupt
	 * through the APIC to instead do the notification
	 * after interrupts are reenabled again.
	 */
	apic->send_IPI_self(MCE_SELF_VECTOR);

	/*
	 * Wait for idle afterwards again so that we don't leave the
	 * APIC in a non idle state because the normal APIC writes
	 * cannot exclude us.
	 */
	apic_wait_icr_idle();
#endif
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: the spec recommends panicking for fatal unsignalled
 * errors here. However, this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between the exception handler
 * and the poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	__get_cpu_var(mce_poll_count)++;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	for (i = 0; i < banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

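/*
 * Illustrative note (added for exposition): the two in-tree callers show
 * the intended flag usage --
 *
 *	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks));
 *		from mcheck_timer(), periodic corrected-error polling;
 *
 *	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
 *		from mce_init(), to sweep up errors left over from before
 *		the last reset, optionally without logging them.
 */
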
/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
	int i;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			return 1;
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing equals its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

/*
 * The Monarch's reign. The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable
 * case and also make sure all CPUs' errors are always examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also must let continue the others, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (!m && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * only seen by one CPU before cleared, avoiding duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout)) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}

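/*
 * Illustrative note (added for exposition): the Monarch is elected purely
 * by arrival order -- atomic_inc_return(&mce_callin) hands out tickets
 * 1, 2, 3, ... and ticket 1 becomes the Monarch (order == 1 above). The
 * Subjects then scan their banks one at a time, gated by mce_executing,
 * so an error in a bank shared between CPUs is seen and cleared by exactly
 * one of them.
 */
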
/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. a segment offset would require an
 * instruction parser). So only support physical addresses up to page
 * granularity for now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;
	if ((m->misc & 0x3f) > PAGE_SHIFT)
		return 0;
	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
		return 0;
	return 1;
}

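/*
 * Illustrative note (assumed bit layout, inferred from the checks above):
 * bits 5:0 of MCi_MISC give the least significant valid address bit (the
 * "granularity"), and bits 8:6 give the address mode, where MCM_ADDR_PHYS
 * denotes a physical address. The test therefore accepts only physical
 * addresses with at most page (PAGE_SHIFT) granularity, which is what
 * memory_failure() can act on.
 */
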
static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so be always careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	__get_cpu_var(mce_exception_count)++;

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
		       18, SIGKILL) == NOTIFY_STOP)
		goto out;
	if (!banks)
		goto out;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	no_way_out = mce_no_way_out(&m, &msg);

	final = &__get_cpu_var(mces_seen);
	*final = m;

	barrier();

	/*
	 * If there is no restart IP, we must always kill or panic.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	order = mce_start(&no_way_out);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!mce_banks[i].ctl)
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non-uncorrected or non-signaled errors are handled by
		 * machine_check_poll(). Leave them alone, unless this panics.
		 */
		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
		    !no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		severity = mce_severity(&m, tolerant, NULL);

		/*
		 * When the machine check is for the corrected-error handler,
		 * don't touch it, unless we're panicking.
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		/*
		 * Kill on action required.
		 */
		if (severity == MCE_AR_SEVERITY)
			kill_it = 1;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		/*
		 * Action optional error. Queue address for later processing.
		 * When the ring overflows we just ignore the AO error.
		 * RED-PEN add some logging mechanism when
		 * mce_usable_address or mce_ring_add fails.
		 * RED-PEN don't ignore overflow for tolerant == 0
		 */
		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
			mce_ring_add(m.addr >> PAGE_SHIFT);

		mce_get_rip(&m, regs);
		mce_log(&m);

		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 *
	 * This is mainly used in the case when the system doesn't
	 * support MCE broadcasting or it has been disabled.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Fatal machine check on current CPU", final, msg);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done. Try to kill as little as possible. If we can kill just
	 * one task, do that. If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */

	if (kill_it && tolerant < 3)
		force_sig(SIGBUS, current);

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);

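/*
 * Illustrative summary of the handler's phases (added for exposition):
 * 1. mce_no_way_out() -- quick scan, decide whether a panic is unavoidable;
 * 2. mce_start()      -- rendezvous with the other CPUs, elect a Monarch;
 * 3. bank loop        -- grade each valid bank with mce_severity(), log it,
 *                        queue AO page-offline work, track the worst case;
 * 4. mce_end()        -- the Monarch grades all CPUs (mce_reign()) and
 *                        panics on fatal errors, then releases everyone;
 * 5. local policy     -- panic, SIGBUS or plain notification, depending on
 *                        no_way_out/kill_it and the tolerant level.
 */
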
/* dummy to break dependency. actual code is in mm/memory-failure.c */
void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
{
	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
}

/*
 * Called after mce notification in process context. This code
 * is allowed to sleep. Call the high level VM handler to process
 * any corrupted pages.
 * Assume that the work queue code only calls this one at a time
 * per CPU.
 * Note we don't disable preemption, so this code might run on the wrong
 * CPU. In this case the event is picked up by the scheduled work queue.
 * This is merely a fast path to expedite processing in some common
 * cases.
 */
void mce_notify_process(void)
{
	unsigned long pfn;

	mce_notify_irq();
	while (mce_ring_get(&pfn))
		memory_failure(pfn, MCE_VECTOR);
}

static void mce_process_work(struct work_struct *dummy)
{
	mce_notify_process();
}

#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data)) {
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(next_interval);
	if (mce_notify_irq())
		*n = max(*n/2, HZ/100);
	else
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

	t->expires = jiffies + *n;
	add_timer(t);
}

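/*
 * Worked example (added for exposition, assuming the default check_interval
 * of 5 minutes and HZ = 1000): next_interval starts at 300000 jiffies.
 * Each poll that logs an event halves it, with a floor of HZ/100 = 10ms;
 * each quiet poll doubles it back toward the 5 minute ceiling:
 *
 *	busy:	300000 -> 150000 -> ... -> 10
 *	quiet:	10 -> 20 -> 40 -> ... -> 300000
 */
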
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_irq(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &mce_need_notify)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (mce_helper[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);

static int mce_banks_init(void)
{
	int i;

	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
	if (!mce_banks)
		return -ENOMEM;
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		b->ctl = -1ULL;
		b->init = 1;
	}
	return 0;
}

/*
 * Initialize Machine Checks for a CPU.
 */
static int __cpuinit mce_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!mce_banks) {
		int err = mce_banks_init();

		if (err)
			return err;
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	if (cap & MCG_SER_P)
		mce_ser = 1;

	return 0;
}

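/*
 * Illustrative note (added for exposition; layout as used above): in
 * MCG_CAP, bits 7:0 (MCG_BANKCNT_MASK) hold the bank count, MCG_CTL_P says
 * the MCG_CTL register exists, MCG_EXT_P together with MCG_EXT_CNT(cap)
 * >= 9 implies the extended register block including MCG_EIP is present
 * (hence the accurate RIP reporting), and MCG_SER_P announces software
 * error recovery support, which turns on mce_ser.
 */
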
static void mce_init(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (!b->init)
			continue;
		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOS around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6 && banks > 0)
			mce_banks[0].ctl = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */
		if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
			mce_banks[0].init = 0;

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
		    monarch_timeout < 0)
			monarch_timeout = USEC_PER_SEC;

		/* There are also broken BIOSes on some Pentium M systems. */
		if (c->x86 == 6 && c->x86_model == 13 && mce_bootlog < 0)
			mce_bootlog = 0;
	}
	if (monarch_timeout < 0)
		monarch_timeout = 0;
	if (mce_bootlog != 0)
		mce_panic_timeout = 30;
}

static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(next_interval);

	if (mce_ignore_ce)
		return;

	*n = check_interval * HZ;
	if (!*n)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer(t);
}

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	mce_ancient_init(c);

	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_disabled = 1;
		return;
	}
	mce_cpu_quirks(c);

	machine_check_vector = do_machine_check;

	mce_init();
	mce_cpu_features(c);
	mce_init_timer();
	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static DEFINE_MUTEX(mce_read_mutex);

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);

		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? -EFAULT : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open			= mce_open,
	.release		= mce_release,
	.read			= mce_read,
	.poll			= mce_poll,
	.unlocked_ioctl		= mce_ioctl,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

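/*
 * Illustrative userspace sketch (an assumption added for exposition, not
 * part of this file) -- roughly what mcelog(8) does. mce_read() rejects
 * partial reads, so the buffer must cover the whole log:
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	struct mce records[MCE_LOG_LEN];
 *	ssize_t n = read(fd, records, sizeof(records));
 *	for (int i = 0; i < n / (ssize_t)sizeof(struct mce); i++)
 *		decode(&records[i]);	(hypothetical decoder)
 */
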
/*
 * mce=off		Disables machine check
 * mce=no_cmci		Disables CMCI
 * mce=dont_log_ce	Clears corrected events silently, no log created for CEs.
 * mce=ignore_ce	Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog		Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog	Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	if (*str == 0) {
		enable_p5_mce();
		return 1;
	}
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "no_cmci"))
		mce_cmci_disabled = 1;
	else if (!strcmp(str, "dont_log_ce"))
		mce_dont_log_ce = 1;
	else if (!strcmp(str, "ignore_ce"))
		mce_ignore_ce = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0])) {
		get_option(&str, &tolerant);
		if (*str == ',') {
			++str;
			get_option(&str, &monarch_timeout);
		}
	} else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);

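/*
 * Examples matching the parser above (added for exposition): "mce=off"
 * disables machine checks entirely; "mce=2,500" sets tolerant=2 and
 * monarch_timeout=500 (microseconds, since mce_start() scales it by
 * NSEC_PER_USEC); "mce=nobootlog" suppresses logging of pre-boot errors.
 */
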
/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
	mce_init();
	mce_cpu_features(&current_cpu_data);

	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (!mce_available(&current_cpu_data))
		return;
	mce_init();
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
static void mce_disable_ce(void *all)
{
	if (!mce_available(&current_cpu_data))
		return;
	if (all)
		del_timer_sync(&__get_cpu_var(mce_timer));
	cmci_clear();
}

static void mce_enable_ce(void *all)
{
	if (!mce_available(&current_cpu_data))
		return;
	cmci_reenable();
	cmci_recheck();
	if (all)
		mce_init_timer();
}

static struct sysdev_class mce_sysclass = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
{
	return container_of(attr, struct mce_bank, attr);
}

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	attr_to_bank(attr)->ctl = new;
	mce_restart();

	return size;
}

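/*
 * Usage note (paths assumed from the sysdev class name above, added for
 * exposition): each bank control surfaces per CPU, e.g.
 *
 *	echo 0 > /sys/devices/system/machinecheck/machinecheck0/bank4
 *
 * writes a new MCi_CTL mask for bank 4 and triggers mce_restart(), which
 * reprograms every online CPU.
 */
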
static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper)-1] = 0;
	len = strlen(mce_helper);
	p = strchr(mce_helper, '\n');

	if (p)
		*p = 0;

	return len;
}

static ssize_t set_ignore_ce(struct sys_device *s,
			     struct sysdev_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			on_each_cpu(mce_disable_ce, (void *)1, 1);
			mce_ignore_ce = 1;
		} else {
			/* enable ce features */
			mce_ignore_ce = 0;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}

static ssize_t set_cmci_disabled(struct sys_device *s,
				 struct sysdev_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_ce, NULL, 1);
			mce_cmci_disabled = 1;
		} else {
			/* enable cmci */
			mce_cmci_disabled = 0;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}

static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);

	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);

static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

static struct sysdev_ext_attribute attr_ignore_ce = {
	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
	&mce_ignore_ce
};

static struct sysdev_ext_attribute attr_cmci_disabled = {
	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
	&mce_cmci_disabled
};

static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr,
	&attr_check_interval.attr,
	&attr_trigger,
	&attr_monarch_timeout.attr,
	&attr_dont_log_ce.attr,
	&attr_ignore_ce.attr,
	&attr_cmci_disabled.attr,
	NULL
};

static cpumask_var_t mce_dev_initialized;

/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id = cpu;
	per_cpu(mce_dev, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(mce_dev, cpu));
	if (err)
		return err;

	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < banks; j++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
					 &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_dev_initialized);

	return 0;
error2:
	while (--j >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
error:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	sysdev_unregister(&per_cpu(mce_dev, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
		return;

	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
}

static void mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
	}
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies(jiffies +
					   per_cpu(next_interval, cpu));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct sysdev_attribute *a = &b->attr;

		a->attr.name	= b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
}

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

	mce_init_banks();

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mce_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}
#endif
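
/*
 * Usage note (added for exposition; the path assumes the default debugfs
 * mount point): once debugfs is mounted, the shared directory created above
 * appears as /sys/kernel/debug/mce, and other mcheck code can attach its
 * files to the dentry returned by mce_get_debugfs_dir().
 */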