x86, mce: Support specifying raise mode for software MCE injection
arch/x86/kernel/cpu/mcheck/mce.c
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>

#include <asm/processor.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>

#include "mce-internal.h"

int mce_disabled __read_mostly;

#define MISC_MCELOG_MINOR	227

#define SPINUNIT 100	/* 100ns */

atomic_t mce_entry;

DEFINE_PER_CPU(unsigned, mce_exception_count);

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int		tolerant		__read_mostly = 1;
static int		banks			__read_mostly;
static int		rip_msr			__read_mostly;
static int		mce_bootlog		__read_mostly = -1;
static int		monarch_timeout		__read_mostly = -1;
static int		mce_panic_timeout	__read_mostly;
static int		mce_dont_log_ce		__read_mostly;
int			mce_cmci_disabled	__read_mostly;
int			mce_ignore_ce		__read_mostly;
int			mce_ser			__read_mostly;

struct mce_bank		*mce_banks		__read_mostly;

/* User mode helper program triggered by machine check event */
static unsigned long	mce_need_notify;
static char		mce_helper[128];
static char		*mce_helper_argv[2] = { mce_helper, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int		cpu_missing;

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

static DEFINE_PER_CPU(struct work_struct, mce_work);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	rdtscll(m->tsc);
	/* We hope get_seconds stays lockless */
	m->time = get_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
#ifdef CONFIG_SMP
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
#endif
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	.signature	= MCE_LOG_SIGNATURE,
	.len		= MCE_LOG_LEN,
	.recordlen	= sizeof(struct mce),
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	mce->finished = 1;
	set_bit(0, &mce_need_notify);
}

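/*
 * Illustrative sketch, not part of this file: a caller in MCE/NMI
 * context only needs a filled-in struct mce; mce_log() does the
 * lockless slot reservation via cmpxchg on mcelog.next. Assuming a
 * valid bank/status pair, usage looks like:
 *
 *	struct mce m;
 *
 *	mce_setup(&m);
 *	m.bank = bank;
 *	m.status = status;
 *	mce_log(&m);
 *
 * Concurrent writers race on the cmpxchg; the loser rereads 'next'
 * and retries, so no locks are taken in NMI context.
 */
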
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->extcpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
	       m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
}

static void print_mce_head(void)
{
	printk(KERN_EMERG "\n" KERN_EMERG "HARDWARE ERROR\n");
}

static void print_mce_tail(void)
{
	printk(KERN_EMERG "This is not a software problem!\n"
	       KERN_EMERG "Run through mcelog --ascii to decode and contact your hardware vendor\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_paniced;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic("Panicing machine check CPU died");
}

static void mce_panic(char *msg, struct mce *final, char *exp)
{
	int i;

	/*
	 * Make sure only one CPU runs in machine check panic
	 */
	if (atomic_inc_return(&mce_paniced) > 1)
		wait_for_panic();
	barrier();

	bust_spinlocks(1);
	console_verbose();
	print_mce_head();
	/* First print corrected ones that are still unlogged */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];

		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			print_mce(m);
	}
	/* Now print uncorrected but with the final one last */
	for (i = 0; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];

		if (!(m->status & MCI_STATUS_VAL))
			continue;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		if (!final || memcmp(m, final, sizeof(struct mce)))
			print_mce(m);
	}
	if (final)
		print_mce(final);
	if (cpu_missing)
		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
	print_mce_tail();
	if (exp)
		printk(KERN_EMERG "Machine check: %s\n", exp);
	if (panic_timeout == 0)
		panic_timeout = mce_panic_timeout;
	panic(msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __get_cpu_var(injectm.bank);

	if (msr == rip_msr)
		return offsetof(struct mce, ip);
	if (msr == MSR_IA32_MCx_STATUS(bank))
		return offsetof(struct mce, status);
	if (msr == MSR_IA32_MCx_ADDR(bank))
		return offsetof(struct mce, addr);
	if (msr == MSR_IA32_MCx_MISC(bank))
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
	}
	rdmsrl(msr, v);
	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__get_cpu_var(injectm).finished) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

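/*
 * Illustrative sketch, not part of this file: mce-inject.c stages a
 * fake register state in the per-CPU 'injectm' record and then sets
 * injectm.finished, at which point the wrappers above read and write
 * the struct fields instead of the real MSRs. Roughly (see
 * mce-inject.c for the exact write ordering and barriers):
 *
 *	struct mce *i = &per_cpu(injectm, m->extcpu);
 *
 *	*i = *m;		// staged fake MSR contents
 *	i->finished = 1;	// redirect mce_rdmsrl/mce_wrmsrl
 *	// ... raise the machine check ...
 *	i->finished = 0;	// back to real MSR access
 *
 * Only the MSRs known to msr_to_offset() are redirected; anything
 * else reads as 0 and writes to it are dropped.
 */
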
/*
 * Simple lockless ring to communicate PFNs from the exception handler with the
 * process context work function. This is vastly simplified because there's
 * only a single reader and a single writer.
 */
#define MCE_RING_SIZE 16	/* we use one entry less */

struct mce_ring {
	unsigned short start;
	unsigned short end;
	unsigned long ring[MCE_RING_SIZE];
};
static DEFINE_PER_CPU(struct mce_ring, mce_ring);

/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);

	return r->start == r->end;
}

static int mce_ring_get(unsigned long *pfn)
{
	struct mce_ring *r;
	int ret = 0;

	*pfn = 0;
	get_cpu();
	r = &__get_cpu_var(mce_ring);
	if (r->start == r->end)
		goto out;
	*pfn = r->ring[r->start];
	r->start = (r->start + 1) % MCE_RING_SIZE;
	ret = 1;
out:
	put_cpu();
	return ret;
}

/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
	struct mce_ring *r = &__get_cpu_var(mce_ring);
	unsigned next;

	next = (r->end + 1) % MCE_RING_SIZE;
	if (next == r->start)
		return -1;
	r->ring[r->end] = pfn;
	wmb();
	r->end = next;
	return 0;
}

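/*
 * Illustrative sketch, not part of this file: the single-producer/
 * single-consumer pairing the ring relies on, taken from how this
 * file itself uses it. The MCE handler queues a pfn, the workqueue
 * drains it in process context:
 *
 *	// NMI/MCE context (producer), see do_machine_check()
 *	if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
 *		mce_ring_add(m.addr >> PAGE_SHIFT);
 *
 *	// process context (consumer), see mce_notify_process()
 *	unsigned long pfn;
 *	while (mce_ring_get(&pfn))
 *		memory_failure(pfn, MCE_VECTOR);
 *
 * One slot stays unused so that start == end unambiguously means
 * empty and (end + 1) % MCE_RING_SIZE == start means full.
 */
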
int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_ring_empty()) {
		struct work_struct *work = &__get_cpu_var(mce_work);

		if (!work_pending(work))
			schedule_work(work);
	}
}

/*
 * Get the address of the instruction at the time of the machine check
 * error.
 */
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr)
		m->ip = mce_rdmsrl(rip_msr);
}

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Called after interrupts have been reenabled again
 * when a MCE happened during an interrupts off region
 * in the kernel.
 */
asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
{
	ack_APIC_irq();
	exit_idle();
	irq_enter();
	mce_notify_irq();
	mce_schedule_work();
	irq_exit();
}
#endif

static void mce_report_event(struct pt_regs *regs)
{
	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
		mce_notify_irq();
		/*
		 * Triggering the work queue here is just an insurance
		 * policy in case the syscall exit notify handler
		 * doesn't run soon enough or ends up running on the
		 * wrong CPU (can happen when audit sleeps)
		 */
		mce_schedule_work();
		return;
	}

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * Without APIC do not notify. The event will be picked
	 * up eventually.
	 */
	if (!cpu_has_apic)
		return;

	/*
	 * When interrupts are disabled we cannot use
	 * kernel services safely. Trigger a self interrupt
	 * through the APIC to instead do the notification
	 * after interrupts are reenabled again.
	 */
	apic->send_IPI_self(MCE_SELF_VECTOR);

	/*
	 * Wait for idle afterwards again so that we don't leave the
	 * APIC in a non idle state because the normal APIC writes
	 * cannot exclude us.
	 */
	apic_wait_icr_idle();
#endif
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: spec recommends to panic for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	__get_cpu_var(mce_poll_count)++;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	for (i = 0; i < banks; i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected or signalled events are handled by the exception
		 * handler when it is enabled, so don't process those here.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if (!(flags & MCP_UC) &&
		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();
}
EXPORT_SYMBOL_GPL(machine_check_poll);

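/*
 * Illustrative sketch, not part of this file's new code: callers pick
 * the flags to control what gets polled and logged. The periodic
 * timer below uses
 *
 *	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks));
 *
 * while mce_init() scans leftover errors from before reset with
 * MCP_UC, adding MCP_DONTLOG when bootlogging is disabled.
 */
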
/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg)
{
	int i;

	for (i = 0; i < banks; i++) {
		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
			return 1;
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until executing is equal its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_paniced))
		wait_for_panic();
	if (!monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		/* CHECKME: Make panic default for 1 too? */
		if (tolerant < 1)
			mce_panic("Timeout synchronizing machine check over CPUs",
				  NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

/*
 * The Monarch's reign. The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. When any
 * error is fatal panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable
 * case and also make sure all CPUs' errors are always examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
					    &nmsg);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
		mce_panic("Fatal Machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * Also must let continue the others, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (!m && tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * global_nwo should be updated before mce_callin
	 */
	smp_wmb();
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout)) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * only seen by one CPU before cleared, avoiding duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout)) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
static int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
		return 0;
	if ((m->misc & 0x3f) > PAGE_SHIFT)
		return 0;
	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
		return 0;
	return 1;
}

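/*
 * Illustrative example, not from this file: how the MCi_MISC layout
 * checked above decodes (taking MCM_ADDR_PHYS == 2, per asm/mce.h).
 * For misc = 0x86:
 *
 *	misc & 0x3f      = 0x06  lowest valid address bit, <= PAGE_SHIFT
 *	(misc >> 6) & 7  = 0x02  address mode, here MCM_ADDR_PHYS
 *
 * so m->addr can safely be shifted down to a pfn for the page
 * recovery code.
 */
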
static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so be always careful when synchronizing with others.
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, *final;
	int i;
	int worst = 0;
	int severity;
	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	char *msg = "Unknown";

	atomic_inc(&mce_entry);

	__get_cpu_var(mce_exception_count)++;

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
		       18, SIGKILL) == NOTIFY_STOP)
		goto out;
	if (!banks)
		goto out;

	mce_setup(&m);

	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	no_way_out = mce_no_way_out(&m, &msg);

	final = &__get_cpu_var(mces_seen);
	*final = m;

	barrier();

	/*
	 * When there is no restart IP we must always kill or panic.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Go through all the banks in exclusion of the other CPUs.
	 * This way we don't report duplicated events on shared banks
	 * because the first one to see it will clear it.
	 */
	order = mce_start(&no_way_out);
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!mce_banks[i].ctl)
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Errors that are neither uncorrected nor signalled are
		 * handled by machine_check_poll. Leave them alone, unless
		 * this panics.
		 */
		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
		    !no_way_out)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		severity = mce_severity(&m, tolerant, NULL);

		/*
		 * When machine check was for corrected handler don't touch,
		 * unless we're panicking.
		 */
		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
			continue;
		__set_bit(i, toclear);
		if (severity == MCE_NO_SEVERITY) {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		/*
		 * Kill on action required.
		 */
		if (severity == MCE_AR_SEVERITY)
			kill_it = 1;

		if (m.status & MCI_STATUS_MISCV)
			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
		if (m.status & MCI_STATUS_ADDRV)
			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));

		/*
		 * Action optional error. Queue address for later processing.
		 * When the ring overflows we just ignore the AO error.
		 * RED-PEN add some logging mechanism when
		 * usable_address or mce_add_ring fails.
		 * RED-PEN don't ignore overflow for tolerant == 0
		 */
		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
			mce_ring_add(m.addr >> PAGE_SHIFT);

		mce_get_rip(&m, regs);
		mce_log(&m);

		if (severity > worst) {
			*final = m;
			worst = severity;
		}
	}

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (mce_end(order) < 0)
		no_way_out = worst >= MCE_PANIC_SEVERITY;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 *
	 * This is mainly used in the case when the system doesn't
	 * support MCE broadcasting or it has been disabled.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Fatal machine check on current CPU", final, msg);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done.  Try to kill as little as possible.  If we can kill just
	 * one task, do that.  If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */

	if (kill_it && tolerant < 3)
		force_sig(SIGBUS, current);

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	if (worst > 0)
		mce_report_event(regs);
	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
	atomic_dec(&mce_entry);
	sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);

/* dummy to break dependency. actual code is in mm/memory-failure.c */
void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
{
	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
}

/*
 * Called after mce notification in process context. This code
 * is allowed to sleep. Call the high level VM handler to process
 * any corrupted pages.
 * Assume that the work queue code only calls this one at a time
 * per CPU.
 * Note we don't disable preemption, so this code might run on the wrong
 * CPU. In this case the event is picked up by the scheduled work queue.
 * This is merely a fast path to expedite processing in some common
 * cases.
 */
void mce_notify_process(void)
{
	unsigned long pfn;

	mce_notify_irq();
	while (mce_ring_get(&pfn))
		memory_failure(pfn, MCE_VECTOR);
}

static void mce_process_work(struct work_struct *dummy)
{
	mce_notify_process();
}

#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
static int check_interval = 5 * 60; /* 5 minutes */

static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data)) {
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));
	}

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(next_interval);
	if (mce_notify_irq())
		*n = max(*n/2, HZ/100);
	else
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));

	t->expires = jiffies + *n;
	add_timer(t);
}

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_irq(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);

	if (test_and_clear_bit(0, &mce_need_notify)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (mce_helper[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);

static int mce_banks_init(void)
{
	int i;

	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
	if (!mce_banks)
		return -ENOMEM;
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		b->ctl = -1ULL;
		b->init = 1;
	}
	return 0;
}

/*
 * Initialize Machine Checks for a CPU.
 */
static int __cpuinit mce_cap_init(void)
{
	unsigned b;
	u64 cap;

	rdmsrl(MSR_IA32_MCG_CAP, cap);

	b = cap & MCG_BANKCNT_MASK;
	printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);

	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!mce_banks) {
		int err = mce_banks_init();

		if (err)
			return err;
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	if (cap & MCG_SER_P)
		mce_ser = 1;

	return 0;
}

static void mce_init(void)
{
	mce_banks_t all_banks;
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (!b->init)
			continue;
		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
	}
}

/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4) {
			/*
			 * disable GART TBL walk error reporting, which
			 * trips off incorrectly with the IOMMU & 3ware
			 * & Cerberus:
			 */
			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
		}
		if (c->x86 <= 17 && mce_bootlog < 0) {
			/*
			 * Lots of broken BIOSes around that don't clear them
			 * by default and leave crap in there. Don't log:
			 */
			mce_bootlog = 0;
		}
		/*
		 * Various K7s with broken bank 0 around. Always disable
		 * by default.
		 */
		if (c->x86 == 6 && banks > 0)
			mce_banks[0].ctl = 0;
	}

	if (c->x86_vendor == X86_VENDOR_INTEL) {
		/*
		 * SDM documents that on family 6 bank 0 should not be written
		 * because it aliases to another special BIOS controlled
		 * register.
		 * But it's not aliased anymore on model 0x1a+
		 * Don't ignore bank 0 completely because there could be a
		 * valid event later, merely don't write CTL0.
		 */
		if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
			mce_banks[0].init = 0;

		/*
		 * All newer Intel systems support MCE broadcasting. Enable
		 * synchronization with a one second timeout.
		 */
		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
		    monarch_timeout < 0)
			monarch_timeout = USEC_PER_SEC;

		/* There are also broken BIOSes on some Pentium M systems. */
		if (c->x86 == 6 && c->x86_model == 13 && mce_bootlog < 0)
			mce_bootlog = 0;
	}
	if (monarch_timeout < 0)
		monarch_timeout = 0;
	if (mce_bootlog != 0)
		mce_panic_timeout = 30;
}

static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
{
	if (c->x86 != 5)
		return;
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		intel_p5_mcheck_init(c);
		break;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		break;
	}
}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(next_interval);

	if (mce_ignore_ce)
		return;

	*n = check_interval * HZ;
	if (!*n)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer(t);
}

/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
	       smp_processor_id());
}

/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off:
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (mce_disabled)
		return;

	mce_ancient_init(c);

	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_disabled = 1;
		return;
	}
	mce_cpu_quirks(c);

	machine_check_vector = do_machine_check;

	mce_init();
	mce_cpu_features(c);
	mce_init_timer();
	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int	open_count;	/* #times opened */
static int	open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);

		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static DEFINE_MUTEX(mce_read_mutex);

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);

		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);

	return err ? -EFAULT : buf - ubuf;
}

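/*
 * Illustrative sketch, not from this file: the mcelog(8) daemon is
 * the usual consumer of this interface. A minimal userspace reader,
 * given the record and log lengths from the ioctls below, would do:
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	struct mce records[MCE_LOG_LEN];
 *	int n = read(fd, records, sizeof(records));
 *	// n / sizeof(struct mce) finished records; log is now cleared
 *
 * Partial reads are rejected with -EINVAL: usize must cover the whole
 * MCE_LOG_LEN buffer, see the check in mce_read() above.
 */
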
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);

		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

/* Modified in mce-inject.c, so not static or const */
struct file_operations mce_chrdev_ops = {
	.open			= mce_open,
	.release		= mce_release,
	.read			= mce_read,
	.poll			= mce_poll,
	.unlocked_ioctl		= mce_ioctl,
};
EXPORT_SYMBOL_GPL(mce_chrdev_ops);

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * mce=off		Disables machine check
 * mce=no_cmci		Disables CMCI
 * mce=dont_log_ce	Clears corrected events silently, no log created for CEs.
 * mce=ignore_ce	Disables polling and CMCI, corrected events are not cleared.
 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
 *	monarchtimeout is how long to wait for other CPUs on machine
 *	check, or 0 to not wait
 * mce=bootlog		Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog	Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	if (*str == 0) {
		enable_p5_mce();
		return 1;
	}
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_disabled = 1;
	else if (!strcmp(str, "no_cmci"))
		mce_cmci_disabled = 1;
	else if (!strcmp(str, "dont_log_ce"))
		mce_dont_log_ce = 1;
	else if (!strcmp(str, "ignore_ce"))
		mce_ignore_ce = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0])) {
		get_option(&str, &tolerant);
		if (*str == ',') {
			++str;
			get_option(&str, &monarch_timeout);
		}
	} else {
		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);

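/*
 * Illustrative examples, not from this file, of kernel command line
 * fragments the parser above accepts:
 *
 *	mce=off			disable machine checks completely
 *	mce=no_cmci		keep MCE but poll instead of using CMCI
 *	mce=2,500		tolerant=2, monarch_timeout=500 usecs
 *	mce=nobootlog		don't log leftover MCEs from before boot
 */
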
/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/*
 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 * Only one CPU is active at this time, the others get re-added later using
 * CPU hotplug:
 */
static int mce_resume(struct sys_device *dev)
{
	mce_init();
	mce_cpu_features(&current_cpu_data);

	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (!mce_available(&current_cpu_data))
		return;
	mce_init();
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

/* Toggle features for corrected errors */
static void mce_disable_ce(void *all)
{
	if (!mce_available(&current_cpu_data))
		return;
	if (all)
		del_timer_sync(&__get_cpu_var(mce_timer));
	cmci_clear();
}

static void mce_enable_ce(void *all)
{
	if (!mce_available(&current_cpu_data))
		return;
	cmci_reenable();
	cmci_recheck();
	if (all)
		mce_init_timer();
}

static struct sysdev_class mce_sysclass = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, mce_dev);

__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
{
	return container_of(attr, struct mce_bank, attr);
}

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	attr_to_bank(attr)->ctl = new;
	mce_restart();

	return size;
}

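/*
 * Illustrative sketch, not from this file: the bank files backed by
 * these handlers appear under the "machinecheck" sysdev class, so a
 * shell session (path assumed from the class name) looks like:
 *
 *	cat /sys/devices/system/machinecheck/machinecheck0/bank4
 *	echo 0xffffffffffffffff > \
 *		/sys/devices/system/machinecheck/machinecheck0/bank4
 *
 * A write stores the new MCi_CTL mask via set_bank() and re-runs
 * mce_restart() so every CPU reprograms its banks.
 */
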
static ssize_t
show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
{
	strcpy(buf, mce_helper);
	strcat(buf, "\n");
	return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(mce_helper, buf, sizeof(mce_helper));
	mce_helper[sizeof(mce_helper)-1] = 0;
	len = strlen(mce_helper);
	p = strchr(mce_helper, '\n');

	/* strchr may find no newline: test the pointer before dereferencing */
	if (p)
		*p = 0;

	return len;
}

static ssize_t set_ignore_ce(struct sys_device *s,
			     struct sysdev_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			on_each_cpu(mce_disable_ce, (void *)1, 1);
			mce_ignore_ce = 1;
		} else {
			/* enable ce features */
			mce_ignore_ce = 0;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}

static ssize_t set_cmci_disabled(struct sys_device *s,
				 struct sysdev_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_ce, NULL, 1);
			mce_cmci_disabled = 1;
		} else {
			/* enable cmci */
			mce_cmci_disabled = 0;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}

static ssize_t store_int_with_restart(struct sys_device *s,
				      struct sysdev_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = sysdev_store_int(s, attr, buf, size);

	mce_restart();
	return ret;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);

static struct sysdev_ext_attribute attr_check_interval = {
	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
		     store_int_with_restart),
	&check_interval
};

static struct sysdev_ext_attribute attr_ignore_ce = {
	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
	&mce_ignore_ce
};

static struct sysdev_ext_attribute attr_cmci_disabled = {
	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
	&mce_cmci_disabled
};

static struct sysdev_attribute *mce_attrs[] = {
	&attr_tolerant.attr,
	&attr_check_interval.attr,
	&attr_trigger,
	&attr_monarch_timeout.attr,
	&attr_dont_log_ce.attr,
	&attr_ignore_ce.attr,
	&attr_cmci_disabled.attr,
	NULL
};

static cpumask_var_t mce_dev_initialized;

/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(mce_dev, cpu).id	= cpu;
	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;

	err = sysdev_register(&per_cpu(mce_dev, cpu));
	if (err)
		return err;

	for (i = 0; mce_attrs[i]; i++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < banks; j++) {
		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
					 &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_dev_initialized);

	return 0;
error2:
	while (--j >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
error:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);

	sysdev_unregister(&per_cpu(mce_dev, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
		return;

	for (i = 0; mce_attrs[i]; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);

	sysdev_unregister(&per_cpu(mce_dev, cpu));
	cpumask_clear_cpu(cpu, mce_dev_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
}

static void mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
	}
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies(jiffies +
					   __get_cpu_var(next_interval));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct sysdev_attribute *a = &b->attr;

		a->attr.name	= b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
}

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);

	mce_init_banks();

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mce_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);