/*
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mmiotrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/bootmem.h>		/* for max_low_pfn */
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>
#include <asm/traps.h>

/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)

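/*
 * Give the mmiotrace MMIO-access tracer a chance to handle the fault:
 * returns -1 when kmmio claimed the access, 0 when normal page fault
 * handling should continue.
 */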
static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
#ifdef CONFIG_MMIOTRACE_HOOKS
	if (unlikely(is_kmmio_active()))
		if (kmmio_handler(regs, addr) == 1)
			return -1;
#endif
	return 0;
}

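/*
 * Notify kprobes of the fault: returns 1 when a registered kprobe
 * fault handler claimed the trap-14 fault, 0 when normal page fault
 * handling should continue.  Only kernel-mode faults are passed on,
 * since kprobes only probes kernel code.
 */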
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode_vm(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/*
	 * If it was an exec (instruction fetch) fault on NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

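/*
 * Send @tsk the given signal (SIGSEGV or SIGBUS) with siginfo filled in,
 * so that user space can read the faulting address from si_addr.
 */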
static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

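/*
 * Safely probe a page-table entry: returns non-zero when reading the
 * entry at @p would itself fault.  Used by the 64-bit dump_pagetable()
 * walk below.
 */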
#ifdef CONFIG_X86_64
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}
#endif

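/*
 * Print the page-table entries that map @address, one level at a time,
 * as part of the fault/oops diagnostics.
 */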
static void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
	__typeof__(pte_val(__pte(0))) page;

	page = read_cr3();
	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", page);
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && page & _PAGE_PRESENT) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
							 & (PTRS_PER_PMD - 1)];
		printk(KERN_CONT "*pde = %016Lx ", page);
		page &= ~_PAGE_NX;
	}
#else
	printk("*pde = %08lx ", page);
#endif

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already.
	 */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && (page & _PAGE_PRESENT)
	    && !(page & _PAGE_PSE)) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
							 & (PTRS_PER_PTE - 1)];
		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
	}

	printk("\n");
#else /* CONFIG_X86_64 */
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_large(*pud))
		goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
#endif
}

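/*
 * Copy the kernel-space pmd entry for @address from the reference page
 * table (init_mm.pgd) into @pgd.  Returns the kernel pmd on success, or
 * NULL when the reference entry itself is not present (a real fault).
 */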
#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd)) {
		set_pmd(pmd, *pmd_k);
		arch_flush_lazy_mmu_mode();
	} else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}
#endif

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64bit RIP register on C stepping K8.
 * A lot of BIOS that didn't get tested properly miss this.
 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
 * Try to work around it here.
 * Note we only handle faults in kernel here.
 * Does nothing for X86_32
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps to illegal
 * addresses >4GB. We catch this in the page fault handler because these
 * addresses are not reachable. Just detect this case and return. Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
	    (address >> 32))
		return 1;
#endif
	return 0;
}

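/*
 * Pentium F00F bug workaround: if the faulting address lands on IDT
 * entry 6 (invalid opcode), deliver the #UD trap via do_invalid_op()
 * instead of treating it as a page fault.
 */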
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;
	/*
	 * Pentium F0 0F C7 C8 bug workaround.
	 */
	if (boot_cpu_data.f00f_bug) {
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}

static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
			    unsigned long address)
{
#ifdef CONFIG_X86_32
	if (!oops_may_print())
		return;
#endif

#ifdef CONFIG_X86_PAE
	if (error_code & PF_INSTR) {
		unsigned int level;
		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(KERN_CRIT "kernel tried to execute "
				"NX-protected page - exploit attempt? "
				"(uid: %d)\n", current->uid);
	}
#endif

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");
	printk(KERN_CONT " at %p\n", (void *) address);
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);
	dump_pagetable(address);
}

#ifdef CONFIG_X86_64
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}
#endif

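/*
 * Returns 1 when the page-table entry actually permits the access that
 * faulted, meaning the fault came from a stale TLB entry and can be
 * ignored.
 */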
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
	if ((error_code & PF_WRITE) && !pte_write(*pte))
		return 0;
	if ((error_code & PF_INSTR) && !pte_exec(*pte))
		return 0;

	return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry.  This allows
 * us to lazily refresh the TLB when increasing the permissions of a
 * kernel page (RO -> RW or NX -> X).  Doing it eagerly is very
 * expensive since that implies doing a full cross-processor TLB
 * flush, even if no stale TLB entries exist on other processors.
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static int spurious_fault(unsigned long address,
			  unsigned long error_code)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_USER | PF_RSVD))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	if (pud_large(*pud))
		return spurious_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return spurious_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	return spurious_fault_check(error_code, pte);
}

/*
 * X86_32
 * Handle a fault on the vmalloc or module mapping area
 *
 * X86_64
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;

	/* Make sure we are in vmalloc area */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	return 0;
#else
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Make sure we are in vmalloc area */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/*
	 * Copy kernel mappings over when needed. This can also
	 * happen within a race in page table update. In the latter
	 * case just flush.
	 */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/*
	 * Below here mismatches are bugs because these lower tables
	 * are shared
	 */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/*
	 * Don't use pte_page here, because the mappings can point
	 * outside mem_map, and the NUMA hash lookup cannot handle
	 * that.
	 */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
#endif
}

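/*
 * When non-zero, otherwise-unhandled user-space segfaults are logged
 * (rate-limited) via printk before the signal is delivered.
 */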
int show_unhandled_signals = 1;

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
#ifdef CONFIG_X86_64
asmlinkage
#endif
void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, si_code;
	int fault;
#ifdef CONFIG_X86_64
	unsigned long flags;
#endif

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	si_code = SEGV_MAPERR;

	if (notify_page_fault(regs))
		return;
	if (unlikely(kmmio_fault(regs, address)))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
#ifdef CONFIG_X86_32
	if (unlikely(address >= TASK_SIZE)) {
#else
	if (unlikely(address >= TASK_SIZE64)) {
#endif
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    vmalloc_fault(address) >= 0)
			return;

		/* Can handle a stale RO->RW TLB */
		if (spurious_fault(address, error_code))
			return;

		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

#ifdef CONFIG_X86_32
	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
	   fault has been handled. */
	if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
		local_irq_enable();

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (in_atomic() || !mm)
		goto bad_area_nosemaphore;
#else /* CONFIG_X86_64 */
	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;
again:
#endif
	/*
	 * When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work. ("enter $65535,$31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
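	/*
	 * Check the access against the vma permissions: writes need
	 * VM_WRITE, a protection fault on a present page is always an
	 * error for reads, and plain reads need VM_READ, VM_EXEC or
	 * VM_WRITE.
	 */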
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:	/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:	/* read, present */
		goto bad_area;
	case 0:		/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

#ifdef CONFIG_X86_32
survive:
#endif
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;

#ifdef CONFIG_X86_32
	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (v8086_mode(regs)) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
#endif
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space.
		 */
		if (is_prefetch(regs, address, error_code))
			return;

		if (is_errata100(regs, address))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
			"%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
			tsk->comm, task_pid_nr(tsk), address,
			(void *) regs->ip, (void *) regs->sp, error_code);
			print_vma_addr(" in ", regs->ip);
			printk("\n");
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

	if (is_f00f_bug(regs, address))
		return;

no_context:
	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * X86_32
	 * Valid to do another page fault here, because if this fault
	 * had been triggered by is_prefetch fixup_exception would have
	 * handled it.
	 *
	 * X86_64
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */
#ifdef CONFIG_X86_32
	bust_spinlocks(1);
#else
	flags = oops_begin();
#endif

	show_fault_oops(regs, error_code, address);

	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;

#ifdef CONFIG_X86_32
	die("Oops", regs, error_code);
	bust_spinlocks(0);
	do_exit(SIGKILL);
#else
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);
#endif

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(tsk)) {
		yield();
#ifdef CONFIG_X86_32
		down_read(&mm->mmap_sem);
		goto survive;
#else
		goto again;
#endif
	}

	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;
#ifdef CONFIG_X86_32
	/* User space => ok to do another page fault */
	if (is_prefetch(regs, address, error_code))
		return;
#endif
	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

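/*
 * Propagate kernel (vmalloc-area) mappings from the reference page
 * table into every pgd on pgd_list, so that all processes see the new
 * mappings without having to fault them in lazily.
 */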
void vmalloc_sync_all(void)
{
#ifdef CONFIG_X86_32
	unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	if (SHARED_KERNEL_PMD)
		return;

	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
		unsigned long flags;
		struct page *page;

		spin_lock_irqsave(&pgd_lock, flags);
		list_for_each_entry(page, &pgd_list, lru) {
			if (!vmalloc_sync_one(page_address(page),
					      address))
				break;
		}
		spin_unlock_irqrestore(&pgd_lock, flags);
	}
#else /* CONFIG_X86_64 */
	unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		const pgd_t *pgd_ref = pgd_offset_k(address);
		unsigned long flags;
		struct page *page;

		if (pgd_none(*pgd_ref))
			continue;
		spin_lock_irqsave(&pgd_lock, flags);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			pgd = (pgd_t *)page_address(page) + pgd_index(address);
			if (pgd_none(*pgd))
				set_pgd(pgd, *pgd_ref);
			else
				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
		}
		spin_unlock_irqrestore(&pgd_lock, flags);
	}
#endif
}