x86: optimise x86's do_page_fault (C entry point for the page fault path)
arch/x86/mm/fault.c
/*
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mmiotrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>      /* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/bootmem.h>      /* for max_low_pfn */
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>
#include <asm/traps.h>

/*
 * Page fault error code bits
 *      bit 0 == 0 means no page found, 1 means protection fault
 *      bit 1 == 0 means read, 1 means write
 *      bit 2 == 0 means kernel, 1 means user-mode
 *      bit 3 == 1 means use of reserved bit detected
 *      bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT         (1<<0)
#define PF_WRITE        (1<<1)
#define PF_USER         (1<<2)
#define PF_RSVD         (1<<3)
#define PF_INSTR        (1<<4)

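/*
 * Illustrative example (editor's addition, not in the original source):
 * a user-space store to an unmapped address arrives here with
 * error_code == (PF_USER | PF_WRITE) == 0x6, i.e. "write, user mode,
 * no page found"; a user write to a present but read-only page would
 * additionally set PF_PROT, giving 0x7.
 */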
static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
#ifdef CONFIG_MMIOTRACE
        if (unlikely(is_kmmio_active()))
                if (kmmio_handler(regs, addr) == 1)
                        return -1;
#endif
        return 0;
}

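/*
 * Give kprobes a chance to claim the fault first.  Preemption is disabled
 * around the check because kprobe_running() relies on smp_processor_id(),
 * and only faults raised while running kernel code are passed to the
 * kprobe fault handler.
 */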
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
        int ret = 0;

        /* kprobe_running() needs smp_processor_id() */
        if (!user_mode_vm(regs)) {
                preempt_disable();
                if (kprobe_running() && kprobe_fault_handler(regs, 14))
                        ret = 1;
                preempt_enable();
        }

        return ret;
#else
        return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long error_code,
                       unsigned long addr)
{
        unsigned char *instr;
        int scan_more = 1;
        int prefetch = 0;
        unsigned char *max_instr;

        /*
         * If it was an exec (instruction fetch) fault on an NX page, then
         * do not ignore the fault:
         */
        if (error_code & PF_INSTR)
                return 0;

        instr = (unsigned char *)convert_ip_to_linear(current, regs);
        max_instr = instr + 15;

        if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
                return 0;

        while (scan_more && instr < max_instr) {
                unsigned char opcode;
                unsigned char instr_hi;
                unsigned char instr_lo;

                if (probe_kernel_address(instr, opcode))
                        break;

                instr_hi = opcode & 0xf0;
                instr_lo = opcode & 0x0f;
                instr++;

                switch (instr_hi) {
                case 0x20:
                case 0x30:
                        /*
                         * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
                         * In X86_64 long mode, the CPU will signal invalid
                         * opcode if some of these prefixes are present so
                         * X86_64 will never get here anyway
                         */
                        scan_more = ((instr_lo & 7) == 0x6);
                        break;
#ifdef CONFIG_X86_64
                case 0x40:
                        /*
                         * In AMD64 long mode 0x40..0x4F are valid REX prefixes
                         * Need to figure out under what instruction mode the
                         * instruction was issued. Could check the LDT for lm,
                         * but for now it's good enough to assume that long
                         * mode only uses well known segments or kernel.
                         */
                        scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
                        break;
#endif
                case 0x60:
                        /* 0x64 thru 0x67 are valid prefixes in all modes. */
                        scan_more = (instr_lo & 0xC) == 0x4;
                        break;
                case 0xF0:
                        /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
                        scan_more = !instr_lo || (instr_lo>>1) == 1;
                        break;
                case 0x00:
                        /* Prefetch instruction is 0x0F0D or 0x0F18 */
                        scan_more = 0;

                        if (probe_kernel_address(instr, opcode))
                                break;
                        prefetch = (instr_lo == 0xF) &&
                                (opcode == 0x0D || opcode == 0x18);
                        break;
                default:
                        scan_more = 0;
                        break;
                }
        }
        return prefetch;
}
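/*
 * Illustrative example (editor's addition): "prefetchnta (%rax)" encodes as
 * 0F 18 00, and the AMD 3DNow! PREFETCH/PREFETCHW forms use opcode 0F 0D;
 * either may be preceded by segment-override or REX prefix bytes, which is
 * why the loop above skips over known prefixes before checking for 0x0F.
 */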

static void force_sig_info_fault(int si_signo, int si_code,
                                 unsigned long address, struct task_struct *tsk)
{
        siginfo_t info;

        info.si_signo = si_signo;
        info.si_errno = 0;
        info.si_code = si_code;
        info.si_addr = (void __user *)address;
        force_sig_info(si_signo, &info, tsk);
}

#ifdef CONFIG_X86_64
static int bad_address(void *p)
{
        unsigned long dummy;
        return probe_kernel_address((unsigned long *)p, dummy);
}
#endif

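/*
 * Walk the page tables for the faulting address and print each level that is
 * present: the 32-bit variant walks the current cr3 by hand (with special
 * care for PAE and highmem page tables), the 64-bit variant uses the
 * pgd/pud/pmd/pte helpers and stops early on large pages or a bad entry.
 */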
static void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
        __typeof__(pte_val(__pte(0))) page;

        page = read_cr3();
        page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
        printk("*pdpt = %016Lx ", page);
        if ((page >> PAGE_SHIFT) < max_low_pfn
            && page & _PAGE_PRESENT) {
                page &= PAGE_MASK;
                page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
                                                         & (PTRS_PER_PMD - 1)];
                printk(KERN_CONT "*pde = %016Lx ", page);
                page &= ~_PAGE_NX;
        }
#else
        printk("*pde = %08lx ", page);
#endif

        /*
         * We must not directly access the pte in the highpte
         * case if the page table is located in highmem.
         * And let's rather not kmap-atomic the pte, just in case
         * it's allocated already.
         */
        if ((page >> PAGE_SHIFT) < max_low_pfn
            && (page & _PAGE_PRESENT)
            && !(page & _PAGE_PSE)) {
                page &= PAGE_MASK;
                page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
                                                         & (PTRS_PER_PTE - 1)];
                printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
        }

        printk("\n");
#else /* CONFIG_X86_64 */
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = (pgd_t *)read_cr3();

        pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
        pgd += pgd_index(address);
        if (bad_address(pgd)) goto bad;
        printk("PGD %lx ", pgd_val(*pgd));
        if (!pgd_present(*pgd)) goto ret;

        pud = pud_offset(pgd, address);
        if (bad_address(pud)) goto bad;
        printk("PUD %lx ", pud_val(*pud));
        if (!pud_present(*pud) || pud_large(*pud))
                goto ret;

        pmd = pmd_offset(pud, address);
        if (bad_address(pmd)) goto bad;
        printk("PMD %lx ", pmd_val(*pmd));
        if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

        pte = pte_offset_kernel(pmd, address);
        if (bad_address(pte)) goto bad;
        printk("PTE %lx", pte_val(*pte));
ret:
        printk("\n");
        return;
bad:
        printk("BAD\n");
#endif
}

#ifdef CONFIG_X86_32
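/*
 * 32-bit only: copy the kernel (vmalloc-area) pmd entry for @address from the
 * init_mm reference page table into the page table rooted at @pgd.  Returns
 * the reference pmd, or NULL if the mapping does not exist in init_mm either.
 */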
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
        unsigned index = pgd_index(address);
        pgd_t *pgd_k;
        pud_t *pud, *pud_k;
        pmd_t *pmd, *pmd_k;

        pgd += index;
        pgd_k = init_mm.pgd + index;

        if (!pgd_present(*pgd_k))
                return NULL;

        /*
         * set_pgd(pgd, *pgd_k); here would be useless on PAE
         * and redundant with the set_pmd() on non-PAE. As would
         * set_pud.
         */

        pud = pud_offset(pgd, address);
        pud_k = pud_offset(pgd_k, address);
        if (!pud_present(*pud_k))
                return NULL;

        pmd = pmd_offset(pud, address);
        pmd_k = pmd_offset(pud_k, address);
        if (!pmd_present(*pmd_k))
                return NULL;
        if (!pmd_present(*pmd)) {
                set_pmd(pmd, *pmd_k);
                arch_flush_lazy_mmu_mode();
        } else
                BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
        return pmd_k;
}
#endif

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64-bit RIP register on C stepping K8.
 * A lot of BIOSes that didn't get tested properly miss this.
 * The OS sees this as a page fault with the upper 32 bits of RIP cleared.
 * Try to work around it here.
 * Note we only handle faults in kernel here.
 * Does nothing for X86_32.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
        static int warned;
        if (address != regs->ip)
                return 0;
        if ((address >> 32) != 0)
                return 0;
        address |= 0xffffffffUL << 32;
        if ((address >= (u64)_stext && address <= (u64)_etext) ||
            (address >= MODULES_VADDR && address <= MODULES_END)) {
                if (!warned) {
                        printk(errata93_warning);
                        warned = 1;
                }
                regs->ip = address;
                return 1;
        }
#endif
        return 0;
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps to illegal
 * addresses >4GB.  We catch this in the page fault handler because these
 * addresses are not reachable.  Just detect this case and return.  Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
        if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
            (address >> 32))
                return 1;
#endif
        return 0;
}

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
        unsigned long nr;
        /*
         * Pentium F0 0F C7 C8 bug workaround.
         */
        if (boot_cpu_data.f00f_bug) {
                nr = (address - idt_descr.address) >> 3;

                if (nr == 6) {
                        do_invalid_op(regs, 0);
                        return 1;
                }
        }
#endif
        return 0;
}

static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
                            unsigned long address)
{
#ifdef CONFIG_X86_32
        if (!oops_may_print())
                return;
#endif

#ifdef CONFIG_X86_PAE
        if (error_code & PF_INSTR) {
                unsigned int level;
                pte_t *pte = lookup_address(address, &level);

                if (pte && pte_present(*pte) && !pte_exec(*pte))
                        printk(KERN_CRIT "kernel tried to execute "
                                "NX-protected page - exploit attempt? "
                                "(uid: %d)\n", current_uid());
        }
#endif

        printk(KERN_ALERT "BUG: unable to handle kernel ");
        if (address < PAGE_SIZE)
                printk(KERN_CONT "NULL pointer dereference");
        else
                printk(KERN_CONT "paging request");
        printk(KERN_CONT " at %p\n", (void *) address);
        printk(KERN_ALERT "IP:");
        printk_address(regs->ip, 1);
        dump_pagetable(address);
}

#ifdef CONFIG_X86_64
static noinline void pgtable_bad(struct pt_regs *regs,
                        unsigned long error_code, unsigned long address)
{
        unsigned long flags = oops_begin();
        int sig = SIGKILL;
        struct task_struct *tsk = current;

        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
               tsk->comm, address);
        dump_pagetable(address);
        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;
        if (__die("Bad pagetable", regs, error_code))
                sig = 0;
        oops_end(flags, regs, sig);
}
#endif

static noinline void no_context(struct pt_regs *regs,
                        unsigned long error_code, unsigned long address)
{
        struct task_struct *tsk = current;
#ifdef CONFIG_X86_64
        unsigned long flags;
        int sig;
#endif

        /* Are we prepared to handle this kernel fault? */
        if (fixup_exception(regs))
                return;

        /*
         * X86_32
         * Valid to do another page fault here, because if this fault
         * had been triggered by is_prefetch fixup_exception would have
         * handled it.
         *
         * X86_64
         * Hall of shame of CPU/BIOS bugs.
         */
        if (is_prefetch(regs, error_code, address))
                return;

        if (is_errata93(regs, address))
                return;

        /*
         * Oops. The kernel tried to access some bad page. We'll have to
         * terminate things with extreme prejudice.
         */
#ifdef CONFIG_X86_32
        bust_spinlocks(1);
#else
        flags = oops_begin();
#endif

        show_fault_oops(regs, error_code, address);

        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;

#ifdef CONFIG_X86_32
        die("Oops", regs, error_code);
        bust_spinlocks(0);
        do_exit(SIGKILL);
#else
        sig = SIGKILL;
        if (__die("Oops", regs, error_code))
                sig = 0;
        /* Executive summary in case the body of the oops scrolled away */
        printk(KERN_EMERG "CR2: %016lx\n", address);
        oops_end(flags, regs, sig);
#endif
}

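/*
 * The helpers below report a bad access.  __bad_area_nosemaphore() sends
 * SIGSEGV for user-mode faults (or falls back to no_context() for kernel
 * faults); the *_nosemaphore variants are for callers that never took
 * mmap_sem, while __bad_area() drops it first.  si_code distinguishes
 * SEGV_MAPERR (no mapping at all) from SEGV_ACCERR (a mapping exists but
 * the attempted access is not permitted).
 */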
static void __bad_area_nosemaphore(struct pt_regs *regs,
                        unsigned long error_code, unsigned long address,
                        int si_code)
{
        struct task_struct *tsk = current;

        /* User mode accesses just cause a SIGSEGV */
        if (error_code & PF_USER) {
                /*
                 * It's possible to have interrupts off here.
                 */
                local_irq_enable();

                /*
                 * Valid to do another page fault here because this one came
                 * from user space.
                 */
                if (is_prefetch(regs, error_code, address))
                        return;

                if (is_errata100(regs, address))
                        return;

                if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
                    printk_ratelimit()) {
                        printk(
                        "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
                        task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
                        tsk->comm, task_pid_nr(tsk), address,
                        (void *) regs->ip, (void *) regs->sp, error_code);
                        print_vma_addr(" in ", regs->ip);
                        printk("\n");
                }

                tsk->thread.cr2 = address;
                /* Kernel addresses are always protection faults */
                tsk->thread.error_code = error_code | (address >= TASK_SIZE);
                tsk->thread.trap_no = 14;
                force_sig_info_fault(SIGSEGV, si_code, address, tsk);
                return;
        }

        if (is_f00f_bug(regs, address))
                return;

        no_context(regs, error_code, address);
}

static noinline void bad_area_nosemaphore(struct pt_regs *regs,
                        unsigned long error_code, unsigned long address)
{
        __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
}

static void __bad_area(struct pt_regs *regs,
                        unsigned long error_code, unsigned long address,
                        int si_code)
{
        struct mm_struct *mm = current->mm;

        /*
         * Something tried to access memory that isn't in our memory map..
         * Fix it, but check if it's kernel or user first..
         */
        up_read(&mm->mmap_sem);

        __bad_area_nosemaphore(regs, error_code, address, si_code);
}

static noinline void bad_area(struct pt_regs *regs,
                        unsigned long error_code, unsigned long address)
{
        __bad_area(regs, error_code, address, SEGV_MAPERR);
}

static noinline void bad_area_access_error(struct pt_regs *regs,
                        unsigned long error_code, unsigned long address)
{
        __bad_area(regs, error_code, address, SEGV_ACCERR);
}

/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
static void out_of_memory(struct pt_regs *regs,
                        unsigned long error_code, unsigned long address)
{
        /*
         * We ran out of memory, call the OOM killer, and return to userspace
         * (which will retry the fault, or kill us if we got oom-killed).
         */
        up_read(&current->mm->mmap_sem);
        pagefault_out_of_memory();
}

static void do_sigbus(struct pt_regs *regs,
                        unsigned long error_code, unsigned long address)
{
        struct task_struct *tsk = current;
        struct mm_struct *mm = tsk->mm;

        up_read(&mm->mmap_sem);

        /* Kernel mode? Handle exceptions or die */
        if (!(error_code & PF_USER))
                no_context(regs, error_code, address);
#ifdef CONFIG_X86_32
        /* User space => ok to do another page fault */
        if (is_prefetch(regs, error_code, address))
                return;
#endif
        tsk->thread.cr2 = address;
        tsk->thread.error_code = error_code;
        tsk->thread.trap_no = 14;
        force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}

static noinline void mm_fault_error(struct pt_regs *regs,
                unsigned long error_code, unsigned long address, unsigned int fault)
{
        if (fault & VM_FAULT_OOM)
                out_of_memory(regs, error_code, address);
        else if (fault & VM_FAULT_SIGBUS)
                do_sigbus(regs, error_code, address);
        else
                BUG();
}

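/*
 * Return 0 if the current pte still forbids the attempted access (the fault
 * is genuine), 1 if the pte would allow it, in which case the fault most
 * likely came from a stale TLB entry and can be treated as spurious.
 */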
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
        if ((error_code & PF_WRITE) && !pte_write(*pte))
                return 0;
        if ((error_code & PF_INSTR) && !pte_exec(*pte))
                return 0;

        return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry.  This allows
 * us to lazily refresh the TLB when increasing the permissions of a
 * kernel page (RO -> RW or NX -> X).  Doing it eagerly is very
 * expensive since that implies doing a full cross-processor TLB
 * flush, even if no stale TLB entries exist on other processors.
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static noinline int spurious_fault(unsigned long error_code,
                                unsigned long address)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        /* Reserved-bit violation or user access to kernel space? */
        if (error_code & (PF_USER | PF_RSVD))
                return 0;

        pgd = init_mm.pgd + pgd_index(address);
        if (!pgd_present(*pgd))
                return 0;

        pud = pud_offset(pgd, address);
        if (!pud_present(*pud))
                return 0;

        if (pud_large(*pud))
                return spurious_fault_check(error_code, (pte_t *) pud);

        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
                return 0;

        if (pmd_large(*pmd))
                return spurious_fault_check(error_code, (pte_t *) pmd);

        pte = pte_offset_kernel(pmd, address);
        if (!pte_present(*pte))
                return 0;

        return spurious_fault_check(error_code, pte);
}
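/*
 * Example scenario (editor's addition): kernel code raises the permissions
 * of a page (say RO -> RW) and, as described above, skips the cross-CPU TLB
 * flush.  A CPU still holding the stale read-only entry then takes a write
 * fault; spurious_fault() sees that the current pte already allows the
 * write, so the fault is dismissed without ever taking mmap_sem.
 */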

/*
 * X86_32
 * Handle a fault on the vmalloc or module mapping area
 *
 * X86_64
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static noinline int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
        unsigned long pgd_paddr;
        pmd_t *pmd_k;
        pte_t *pte_k;

        /* Make sure we are in vmalloc area */
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;

        /*
         * Synchronize this task's top level page-table
         * with the 'reference' page table.
         *
         * Do _not_ use "current" here. We might be inside
         * an interrupt in the middle of a task switch..
         */
        pgd_paddr = read_cr3();
        pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
        if (!pmd_k)
                return -1;
        pte_k = pte_offset_kernel(pmd_k, address);
        if (!pte_present(*pte_k))
                return -1;
        return 0;
#else
        pgd_t *pgd, *pgd_ref;
        pud_t *pud, *pud_ref;
        pmd_t *pmd, *pmd_ref;
        pte_t *pte, *pte_ref;

        /* Make sure we are in vmalloc area */
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;

        /*
         * Copy kernel mappings over when needed.  This can also
         * happen within a race in page table update.  In the latter
         * case just flush.
         */
        pgd = pgd_offset(current->active_mm, address);
        pgd_ref = pgd_offset_k(address);
        if (pgd_none(*pgd_ref))
                return -1;
        if (pgd_none(*pgd))
                set_pgd(pgd, *pgd_ref);
        else
                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

        /*
         * Below here mismatches are bugs because these lower tables
         * are shared.
         */
        pud = pud_offset(pgd, address);
        pud_ref = pud_offset(pgd_ref, address);
        if (pud_none(*pud_ref))
                return -1;
        if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
                BUG();
        pmd = pmd_offset(pud, address);
        pmd_ref = pmd_offset(pud_ref, address);
        if (pmd_none(*pmd_ref))
                return -1;
        if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
                BUG();
        pte_ref = pte_offset_kernel(pmd_ref, address);
        if (!pte_present(*pte_ref))
                return -1;
        pte = pte_offset_kernel(pmd, address);
        /*
         * Don't use pte_page here, because the mappings can point
         * outside mem_map, and the NUMA hash lookup cannot handle
         * that.
         */
        if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
                BUG();
        return 0;
#endif
}
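/*
 * Note (editor's addition): vmalloc faults are expected when a mapping was
 * created in init_mm (for instance by vmalloc or a module load) after a
 * process's page-table root was set up; the first touch from that context
 * faults here and the missing top-level entry is copied from init_mm, with
 * no locks taken.
 */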

int show_unhandled_signals = 1;

static inline int access_error(unsigned long error_code, int write,
                               struct vm_area_struct *vma)
{
        if (write) {
                /* write, present and write, not present */
                if (unlikely(!(vma->vm_flags & VM_WRITE)))
                        return 1;
        } else if (unlikely(error_code & PF_PROT)) {
                /* read, present */
                return 1;
        } else {
                /* read, not present */
                if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
                        return 1;
        }

        return 0;
}
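/*
 * Illustrative example (editor's addition): a user write to a mapping that
 * was mprotect()ed read-only arrives with PF_WRITE (and usually PF_PROT)
 * set; the vma lacks VM_WRITE, so access_error() returns 1 and the fault is
 * reported as SEGV_ACCERR.  A write to a private, writable COW page also has
 * PF_WRITE set, but VM_WRITE is present, so it proceeds to handle_mm_fault().
 */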

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
#ifdef CONFIG_X86_64
asmlinkage
#endif
void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
        unsigned long address;
        struct task_struct *tsk;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        int write;
        int fault;

        tsk = current;
        mm = tsk->mm;
        prefetchw(&mm->mmap_sem);

        /* get the address */
        address = read_cr2();

        if (unlikely(notify_page_fault(regs)))
                return;
        if (unlikely(kmmio_fault(regs, address)))
                return;

        /*
         * We fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
         * be in an interrupt or a critical region, and should
         * only copy the information from the master page table,
         * nothing more.
         *
         * This verifies that the fault happens in kernel space
         * (error_code & PF_USER) == 0, and that the fault was not a
         * protection error (error_code & (PF_PROT|PF_RSVD)) == 0.
         */
#ifdef CONFIG_X86_32
        if (unlikely(address >= TASK_SIZE)) {
#else
        if (unlikely(address >= TASK_SIZE64)) {
#endif
                if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
                    vmalloc_fault(address) >= 0)
                        return;

                /* Can handle a stale RO->RW TLB */
                if (spurious_fault(error_code, address))
                        return;

                /*
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock.
                 */
                bad_area_nosemaphore(regs, error_code, address);
                return;
        }

        /*
         * It's safe to allow irq's after cr2 has been saved and the
         * vmalloc fault has been handled.
         *
         * User-mode registers count as a user access even for any
         * potential system fault or CPU buglet.
         */
        if (user_mode_vm(regs)) {
                local_irq_enable();
                error_code |= PF_USER;
        } else if (regs->flags & X86_EFLAGS_IF)
                local_irq_enable();

#ifdef CONFIG_X86_64
        if (unlikely(error_code & PF_RSVD))
                pgtable_bad(regs, error_code, address);
#endif

        /*
         * If we're in an interrupt, have no user context or are running in an
         * atomic region then we must not take the fault.
         */
        if (unlikely(in_atomic() || !mm)) {
                bad_area_nosemaphore(regs, error_code, address);
                return;
        }

        /*
         * When running in the kernel we expect faults to occur only to
         * addresses in user space.  All other faults represent errors in the
         * kernel and should generate an OOPS.  Unfortunately, in the case of an
         * erroneous fault occurring in a code path which already holds mmap_sem
         * we will deadlock attempting to validate the fault against the
         * address space.  Luckily the kernel only validly references user
         * space from well defined areas of code, which are listed in the
         * exceptions table.
         *
         * As the vast majority of faults will be valid we will only perform
         * the source reference check when there is a possibility of a deadlock.
         * Attempt to lock the address space, if we cannot we then validate the
         * source.  If this is invalid we can skip the address space check,
         * thus avoiding the deadlock.
         */
        if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
                if ((error_code & PF_USER) == 0 &&
                    !search_exception_tables(regs->ip)) {
                        bad_area_nosemaphore(regs, error_code, address);
                        return;
                }
                down_read(&mm->mmap_sem);
        }

        vma = find_vma(mm, address);
        if (unlikely(!vma)) {
                bad_area(regs, error_code, address);
                return;
        }
        if (likely(vma->vm_start <= address))
                goto good_area;
        if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
                bad_area(regs, error_code, address);
                return;
        }
        if (error_code & PF_USER) {
                /*
                 * Accessing the stack below %sp is always a bug.
                 * The large cushion allows instructions like enter
                 * and pusha to work. ("enter $65535,$31" pushes
                 * 32 pointers and then decrements %sp by 65535.)
                 */
                if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
                        bad_area(regs, error_code, address);
                        return;
                }
        }
        if (unlikely(expand_stack(vma, address))) {
                bad_area(regs, error_code, address);
                return;
        }

        /*
         * Ok, we have a good vm_area for this memory access, so
         * we can handle it..
         */
good_area:
        write = error_code & PF_WRITE;
        if (unlikely(access_error(error_code, write, vma))) {
                bad_area_access_error(regs, error_code, address);
                return;
        }

        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.
         */
        fault = handle_mm_fault(mm, vma, address, write);
        if (unlikely(fault & VM_FAULT_ERROR)) {
                mm_fault_error(regs, error_code, address, fault);
                return;
        }
        if (fault & VM_FAULT_MAJOR)
                tsk->maj_flt++;
        else
                tsk->min_flt++;

#ifdef CONFIG_X86_32
        /*
         * Did it hit the DOS screen memory VA from vm86 mode?
         */
        if (v8086_mode(regs)) {
                unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
                if (bit < 32)
                        tsk->thread.screen_bitmap |= 1 << bit;
        }
#endif
        up_read(&mm->mmap_sem);
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

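/*
 * Bring every page table on pgd_list up to date with init_mm for the whole
 * vmalloc range: on 32-bit this propagates pmd entries via vmalloc_sync_one()
 * (unless the kernel pmd is shared), on 64-bit it copies missing pgd entries
 * from the reference page table.
 */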
void vmalloc_sync_all(void)
{
        unsigned long address;

#ifdef CONFIG_X86_32
        if (SHARED_KERNEL_PMD)
                return;

        for (address = VMALLOC_START & PMD_MASK;
             address >= TASK_SIZE && address < FIXADDR_TOP;
             address += PMD_SIZE) {
                unsigned long flags;
                struct page *page;

                spin_lock_irqsave(&pgd_lock, flags);
                list_for_each_entry(page, &pgd_list, lru) {
                        if (!vmalloc_sync_one(page_address(page),
                                              address))
                                break;
                }
                spin_unlock_irqrestore(&pgd_lock, flags);
        }
#else /* CONFIG_X86_64 */
        for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
             address += PGDIR_SIZE) {
                const pgd_t *pgd_ref = pgd_offset_k(address);
                unsigned long flags;
                struct page *page;

                if (pgd_none(*pgd_ref))
                        continue;
                spin_lock_irqsave(&pgd_lock, flags);
                list_for_each_entry(page, &pgd_list, lru) {
                        pgd_t *pgd;
                        pgd = (pgd_t *)page_address(page) + pgd_index(address);
                        if (pgd_none(*pgd))
                                set_pgd(pgd, *pgd_ref);
                        else
                                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
                }
                spin_unlock_irqrestore(&pgd_lock, flags);
        }
#endif
}