[PATCH] Notify page fault call chain for x86_64
arch/x86_64/mm/fault.c
/*
 *  linux/arch/x86-64/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/kprobes.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm/kdebug.h>
#include <asm-generic/sections.h>

/* Page fault error code bits */
#define PF_PROT		(1<<0)	/* or no page found */
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)

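/*
 * Decoding sketch (illustrative only, excluded from the build): how the
 * hardware page fault error code breaks down into the PF_* bits defined
 * above.  The helper name is an assumption made for this example; nothing
 * in this file calls it.
 */
#if 0
static void example_decode_error_code(unsigned long error_code)
{
        printk("fault was a %s access, %s, in %s mode%s%s\n",
               (error_code & PF_WRITE) ? "write" : "read",
               (error_code & PF_PROT) ? "protection violation" : "page not present",
               (error_code & PF_USER) ? "user" : "kernel",
               (error_code & PF_RSVD) ? ", reserved bit set" : "",
               (error_code & PF_INSTR) ? ", instruction fetch" : "");
}
#endif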
#ifdef CONFIG_KPROBES
ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);

/* Hook to register for page fault notifications */
int register_page_fault_notifier(struct notifier_block *nb)
{
        vmalloc_sync_all();
        return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
}

int unregister_page_fault_notifier(struct notifier_block *nb)
{
        return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
}

static inline int notify_page_fault(enum die_val val, const char *str,
                        struct pt_regs *regs, long err, int trap, int sig)
{
        struct die_args args = {
                .regs = regs,
                .str = str,
                .err = err,
                .trapnr = trap,
                .signr = sig
        };
        return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
}
#else
static inline int notify_page_fault(enum die_val val, const char *str,
                        struct pt_regs *regs, long err, int trap, int sig)
{
        return NOTIFY_DONE;
}
#endif

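/*
 * Illustrative sketch (excluded from the build): a minimal consumer of the
 * notifier chain above, e.g. a kprobes-like client.  The handler and
 * notifier_block names are assumptions made for this example; only
 * register_page_fault_notifier()/unregister_page_fault_notifier() and the
 * DIE_PAGE_FAULT/struct die_args conventions come from this file and
 * <asm/kdebug.h>.
 */
#if 0
static int example_pf_handler(struct notifier_block *self, unsigned long val,
                              void *data)
{
        struct die_args *args = data;

        if (val != DIE_PAGE_FAULT)
                return NOTIFY_DONE;
        /* args->regs, args->err, args->trapnr and args->signr describe the fault */
        return NOTIFY_DONE;             /* NOTIFY_STOP would claim the fault */
}

static struct notifier_block example_pf_nb = {
        .notifier_call = example_pf_handler,
};

/* register_page_fault_notifier(&example_pf_nb) at init time,
   unregister_page_fault_notifier(&example_pf_nb) at exit time. */
#endif
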
void bust_spinlocks(int yes)
{
        int loglevel_save = console_loglevel;
        if (yes) {
                oops_in_progress = 1;
        } else {
#ifdef CONFIG_VT
                unblank_screen();
#endif
                oops_in_progress = 0;
                /*
                 * OK, the message is on the console.  Now we call printk()
                 * without oops_in_progress set so that printk will give klogd
                 * a poke.  Hold onto your hats...
                 */
                console_loglevel = 15;	/* NMI oopser may have shut the console up */
                printk(" ");
                console_loglevel = loglevel_save;
        }
}

/* Sometimes the CPU reports invalid exceptions on prefetch.
   Check for that here and ignore it.
   Opcode checker based on code by Richard Brunner. */
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
                                unsigned long error_code)
{
        unsigned char *instr;
        int scan_more = 1;
        int prefetch = 0;
        unsigned char *max_instr;

        /* If it was an exec fault, ignore it */
        if (error_code & PF_INSTR)
                return 0;

        instr = (unsigned char *)convert_rip_to_linear(current, regs);
        max_instr = instr + 15;

        if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
                return 0;

        while (scan_more && instr < max_instr) {
                unsigned char opcode;
                unsigned char instr_hi;
                unsigned char instr_lo;

                if (__get_user(opcode, instr))
                        break;

                instr_hi = opcode & 0xf0;
                instr_lo = opcode & 0x0f;
                instr++;

                switch (instr_hi) {
                case 0x20:
                case 0x30:
                        /* Values 0x26, 0x2E, 0x36 and 0x3E are valid x86
                           prefixes.  In long mode, the CPU will signal an
                           invalid opcode if some of these prefixes are
                           present, so we will never get here anyway. */
                        scan_more = ((instr_lo & 7) == 0x6);
                        break;

                case 0x40:
                        /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes.
                           We would need to figure out under which instruction mode
                           the instruction was issued ... */
                        /* Could check the LDT for lm, but for now it's good
                           enough to assume that long mode only uses well-known
                           segments or the kernel. */
                        scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
                        break;

                case 0x60:
                        /* 0x64 thru 0x67 are valid prefixes in all modes. */
                        scan_more = (instr_lo & 0xC) == 0x4;
                        break;
                case 0xF0:
                        /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
                        scan_more = !instr_lo || (instr_lo>>1) == 1;
                        break;
                case 0x00:
                        /* Prefetch instruction is 0x0F0D or 0x0F18 */
                        scan_more = 0;
                        if (__get_user(opcode, instr))
                                break;
                        prefetch = (instr_lo == 0xF) &&
                                (opcode == 0x0D || opcode == 0x18);
                        break;
                default:
                        scan_more = 0;
                        break;
                }
        }
        return prefetch;
}

static int bad_address(void *p)
{
        unsigned long dummy;
        return __get_user(dummy, (unsigned long *)p);
}

void dump_pagetable(unsigned long address)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        asm("movq %%cr3,%0" : "=r" (pgd));

        pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
        pgd += pgd_index(address);
        if (bad_address(pgd)) goto bad;
        printk("PGD %lx ", pgd_val(*pgd));
        if (!pgd_present(*pgd)) goto ret;

        pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
        if (bad_address(pud)) goto bad;
        printk("PUD %lx ", pud_val(*pud));
        if (!pud_present(*pud)) goto ret;

        pmd = pmd_offset(pud, address);
        if (bad_address(pmd)) goto bad;
        printk("PMD %lx ", pmd_val(*pmd));
        if (!pmd_present(*pmd)) goto ret;

        pte = pte_offset_kernel(pmd, address);
        if (bad_address(pte)) goto bad;
        printk("PTE %lx", pte_val(*pte));
ret:
        printk("\n");
        return;
bad:
        printk("BAD\n");
}

static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Workaround for K8 erratum #93 and buggy BIOSes.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64-bit RIP register on C-stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in the kernel here. */

static int is_errata93(struct pt_regs *regs, unsigned long address)
{
        static int warned;
        if (address != regs->rip)
                return 0;
        if ((address >> 32) != 0)
                return 0;
        address |= 0xffffffffUL << 32;
        if ((address >= (u64)_stext && address <= (u64)_etext) ||
            (address >= MODULES_VADDR && address <= MODULES_END)) {
                if (!warned) {
                        printk(errata93_warning);
                        warned = 1;
                }
                regs->rip = address;
                return 1;
        }
        return 0;
}

int unhandled_signal(struct task_struct *tsk, int sig)
{
        if (tsk->pid == 1)
                return 1;
        if (tsk->ptrace & PT_PTRACED)
                return 0;
        return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
                (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
}

static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
                                 unsigned long error_code)
{
        unsigned long flags = oops_begin();
        struct task_struct *tsk;

        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
               current->comm, address);
        dump_pagetable(address);
        tsk = current;
        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;
        __die("Bad pagetable", regs, error_code);
        oops_end(flags);
        do_exit(SIGKILL);
}

/*
 * Handle a fault on the vmalloc area.
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
        pgd_t *pgd, *pgd_ref;
        pud_t *pud, *pud_ref;
        pmd_t *pmd, *pmd_ref;
        pte_t *pte, *pte_ref;

        /* Copy kernel mappings over when needed.  This can also
           happen within a race in page table update.  In the latter
           case just flush. */

        pgd = pgd_offset(current->mm ?: &init_mm, address);
        pgd_ref = pgd_offset_k(address);
        if (pgd_none(*pgd_ref))
                return -1;
        if (pgd_none(*pgd))
                set_pgd(pgd, *pgd_ref);
        else
                BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));

        /* Below here mismatches are bugs because these lower tables
           are shared */

        pud = pud_offset(pgd, address);
        pud_ref = pud_offset(pgd_ref, address);
        if (pud_none(*pud_ref))
                return -1;
        if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
                BUG();
        pmd = pmd_offset(pud, address);
        pmd_ref = pmd_offset(pud_ref, address);
        if (pmd_none(*pmd_ref))
                return -1;
        if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
                BUG();
        pte_ref = pte_offset_kernel(pmd_ref, address);
        if (!pte_present(*pte_ref))
                return -1;
        pte = pte_offset_kernel(pmd, address);
        /* Don't use pte_page here, because the mappings can point
           outside mem_map, and the NUMA hash lookup cannot handle
           that. */
        if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
                BUG();
        return 0;
}

int page_fault_trace = 0;
int exception_trace = 1;

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
                                        unsigned long error_code)
{
        struct task_struct *tsk;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        unsigned long address;
        const struct exception_table_entry *fixup;
        int write;
        unsigned long flags;
        siginfo_t info;

        tsk = current;
        mm = tsk->mm;
        prefetchw(&mm->mmap_sem);

        /* get the address */
        __asm__("movq %%cr2,%0":"=r" (address));

        info.si_code = SEGV_MAPERR;

        /*
         * We fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
         * be in an interrupt or a critical region, and should
         * only copy the information from the master page table,
         * nothing more.
         *
         * This verifies that the fault happens in kernel space
         * (error_code & 4) == 0, and that the fault was not a
         * protection error (error_code & 9) == 0.
         */
        if (unlikely(address >= TASK_SIZE64)) {
                /*
                 * Don't check for the module range here: its PML4
                 * is always initialized because it's shared with the main
                 * kernel text.  Only vmalloc may need PML4 syncups.
                 */
                if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
                      ((address >= VMALLOC_START && address < VMALLOC_END))) {
                        if (vmalloc_fault(address) >= 0)
                                return;
                }
                if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
                                        SIGSEGV) == NOTIFY_STOP)
                        return;
                /*
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock.
                 */
                goto bad_area_nosemaphore;
        }

        if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
                                SIGSEGV) == NOTIFY_STOP)
                return;

        if (likely(regs->eflags & X86_EFLAGS_IF))
                local_irq_enable();

        if (unlikely(page_fault_trace))
                printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
                       regs->rip, regs->rsp, regs->cs, regs->ss, address, error_code);

        if (unlikely(error_code & PF_RSVD))
                pgtable_bad(address, regs, error_code);

        /*
         * If we're in an interrupt or have no user
         * context, we must not take the fault..
         */
        if (unlikely(in_atomic() || !mm))
                goto bad_area_nosemaphore;

 again:
        /* When running in the kernel we expect faults to occur only to
         * addresses in user space.  All other faults represent errors in the
         * kernel and should generate an OOPS.  Unfortunately, in the case of an
         * erroneous fault occurring in a code path which already holds mmap_sem
         * we will deadlock attempting to validate the fault against the
         * address space.  Luckily the kernel only validly references user
         * space from well defined areas of code, which are listed in the
         * exceptions table.
         *
         * As the vast majority of faults will be valid we will only perform
         * the source reference check when there is a possibility of a deadlock.
         * Attempt to lock the address space, if we cannot we then validate the
         * source.  If this is invalid we can skip the address space check,
         * thus avoiding the deadlock.
         */
        if (!down_read_trylock(&mm->mmap_sem)) {
                if ((error_code & PF_USER) == 0 &&
                    !search_exception_tables(regs->rip))
                        goto bad_area_nosemaphore;
                down_read(&mm->mmap_sem);
        }

        vma = find_vma(mm, address);
        if (!vma)
                goto bad_area;
        if (likely(vma->vm_start <= address))
                goto good_area;
        if (!(vma->vm_flags & VM_GROWSDOWN))
                goto bad_area;
        if (error_code & 4) {
                /* XXX: align red zone size with ABI */
                if (address + 128 < regs->rsp)
                        goto bad_area;
        }
        if (expand_stack(vma, address))
                goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
        info.si_code = SEGV_ACCERR;
        write = 0;
        switch (error_code & (PF_PROT|PF_WRITE)) {
        default:	/* 3: write, present */
                /* fall through */
        case PF_WRITE:		/* write, not present */
                if (!(vma->vm_flags & VM_WRITE))
                        goto bad_area;
                write++;
                break;
        case PF_PROT:		/* read, present */
                goto bad_area;
        case 0:			/* read, not present */
                if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
                        goto bad_area;
        }

        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.
         */
        switch (handle_mm_fault(mm, vma, address, write)) {
        case VM_FAULT_MINOR:
                tsk->min_flt++;
                break;
        case VM_FAULT_MAJOR:
                tsk->maj_flt++;
                break;
        case VM_FAULT_SIGBUS:
                goto do_sigbus;
        default:
                goto out_of_memory;
        }

        up_read(&mm->mmap_sem);
        return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
        up_read(&mm->mmap_sem);

bad_area_nosemaphore:
        /* User mode accesses just cause a SIGSEGV */
        if (error_code & PF_USER) {
                if (is_prefetch(regs, address, error_code))
                        return;

                /* Work around K8 erratum #100: a K8 in compat mode
                   occasionally jumps to illegal addresses >4GB.  We
                   catch this here in the page fault handler because
                   these addresses are not reachable.  Just detect this
                   case and return.  Any code segment in the LDT is
                   compatibility mode. */
                if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
                    (address >> 32))
                        return;

                if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
                        printk(
                       "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
                                        tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
                                        tsk->comm, tsk->pid, address, regs->rip,
                                        regs->rsp, error_code);
                }

                tsk->thread.cr2 = address;
                /* Kernel addresses are always protection faults */
                tsk->thread.error_code = error_code | (address >= TASK_SIZE);
                tsk->thread.trap_no = 14;
                info.si_signo = SIGSEGV;
                info.si_errno = 0;
                /* info.si_code has been set above */
                info.si_addr = (void __user *)address;
                force_sig_info(SIGSEGV, &info, tsk);
                return;
        }

no_context:

        /* Are we prepared to handle this kernel fault? */
        fixup = search_exception_tables(regs->rip);
        if (fixup) {
                regs->rip = fixup->fixup;
                return;
        }

        /*
         * Hall of shame of CPU/BIOS bugs.
         */

        if (is_prefetch(regs, address, error_code))
                return;

        if (is_errata93(regs, address))
                return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */

        flags = oops_begin();

        if (address < PAGE_SIZE)
                printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
        else
                printk(KERN_ALERT "Unable to handle kernel paging request");
        printk(" at %016lx RIP: \n" KERN_ALERT, address);
        printk_address(regs->rip);
        printk("\n");
        dump_pagetable(address);
        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;
        __die("Oops", regs, error_code);
        /* Executive summary in case the body of the oops scrolled away */
        printk(KERN_EMERG "CR2: %016lx\n", address);
        oops_end(flags);
        do_exit(SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
        up_read(&mm->mmap_sem);
        if (current->pid == 1) {
                yield();
                goto again;
        }
        printk("VM: killing process %s\n", tsk->comm);
        if (error_code & 4)
                do_exit(SIGKILL);
        goto no_context;

do_sigbus:
        up_read(&mm->mmap_sem);

        /* Kernel mode? Handle exceptions or die */
        if (!(error_code & PF_USER))
                goto no_context;

        tsk->thread.cr2 = address;
        tsk->thread.error_code = error_code;
        tsk->thread.trap_no = 14;
        info.si_signo = SIGBUS;
        info.si_errno = 0;
        info.si_code = BUS_ADRERR;
        info.si_addr = (void __user *)address;
        force_sig_info(SIGBUS, &info, tsk);
        return;
}

DEFINE_SPINLOCK(pgd_lock);
struct page *pgd_list;

void vmalloc_sync_all(void)
{
        /* Note that races in the updates of insync and start aren't
           problematic:
           insync can only get set bits added, and updates to start are only
           improving performance (without affecting correctness if undone). */
        static DECLARE_BITMAP(insync, PTRS_PER_PGD);
        static unsigned long start = VMALLOC_START & PGDIR_MASK;
        unsigned long address;

        for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
                if (!test_bit(pgd_index(address), insync)) {
                        const pgd_t *pgd_ref = pgd_offset_k(address);
                        struct page *page;

                        if (pgd_none(*pgd_ref))
                                continue;
                        spin_lock(&pgd_lock);
                        for (page = pgd_list; page;
                             page = (struct page *)page->index) {
                                pgd_t *pgd;
                                pgd = (pgd_t *)page_address(page) + pgd_index(address);
                                if (pgd_none(*pgd))
                                        set_pgd(pgd, *pgd_ref);
                                else
                                        BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
                        }
                        spin_unlock(&pgd_lock);
                        set_bit(pgd_index(address), insync);
                }
                if (address == start)
                        start = address + PGDIR_SIZE;
        }
        /* Check that there is no need to do the same for the modules area. */
        BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
        BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
                        (__START_KERNEL & PGDIR_MASK)));
}

static int __init enable_pagefaulttrace(char *str)
{
        page_fault_trace = 1;
        return 1;
}
__setup("pagefaulttrace", enable_pagefaulttrace);
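
/*
 * Usage note (a sketch, not part of this file): with the __setup() hook
 * above, booting with "pagefaulttrace" on the kernel command line sets
 * page_fault_trace, and do_page_fault() then printk()s one line per fault.
 * Hypothetical bootloader entry, shown only as an example:
 *
 *	kernel /vmlinuz root=/dev/sda1 pagefaulttrace
 */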