x86: unify fault_32|64.c by ifdef'd function bodies
[deliverable/linux.git] / arch/x86/mm/fault_64.c
/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>

/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)

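/*
 * Example decoding with the bits above: error_code == 0x7
 * (PF_PROT|PF_WRITE|PF_USER) is a user-mode write to a present page
 * that forbids the access, while error_code == 0x4 (PF_USER) is a
 * user-mode read of a not-present page.
 */
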
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

#ifdef CONFIG_X86_32
	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
		     boot_cpu_data.x86 >= 6)) {
		/* Catch an obscure case of prefetch inside an NX page. */
		if (nx_enabled && (error_code & PF_INSTR))
			return 0;
	} else {
		return 0;
	}
#else
	/* If it was a exec fault ignore */
	if (error_code & PF_INSTR)
		return 0;
#endif

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
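
/*
 * Note on the opcode bytes accepted above: 0x0F 0x18 is the SSE
 * prefetch group (prefetchnta/t0/t1/t2) and 0x0F 0x0D is the 3DNow!
 * prefetch, so the scan keeps skipping prefix bytes until it either
 * finds one of those two sequences or hits a non-prefix opcode.
 */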

static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

#ifdef CONFIG_X86_64
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}
#endif

void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
	__typeof__(pte_val(__pte(0))) page;

	page = read_cr3();
	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", page);
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && page & _PAGE_PRESENT) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
							 & (PTRS_PER_PMD - 1)];
		printk(KERN_CONT "*pde = %016Lx ", page);
		page &= ~_PAGE_NX;
	}
#else
	printk("*pde = %08lx ", page);
#endif

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already.
	 */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && (page & _PAGE_PRESENT)
	    && !(page & _PAGE_PSE)) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
							 & (PTRS_PER_PTE - 1)];
		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
	}

	printk("\n");
#else /* CONFIG_X86_64 */
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
#endif
}

#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd)) {
		set_pmd(pmd, *pmd_k);
		arch_flush_lazy_mmu_mode();
	} else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}
#endif

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOS that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here.
   Does nothing for X86_32
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal
 * addresses >4GB.  We catch this in the page fault handler because these
 * addresses are not reachable. Just detect this case and return.  Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
	    (address >> 32))
		return 1;
#endif
	return 0;
}

void do_invalid_op(struct pt_regs *, unsigned long);

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;
	/*
	 * Pentium F0 0F C7 C8 bug workaround.
	 */
	if (boot_cpu_data.f00f_bug) {
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}
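
/*
 * Background for the check above: the F00F workaround maps the IDT
 * read-only, so the lockup triggered by the buggy "f0 0f c7 c8"
 * sequence becomes a page fault inside the IDT instead.  Each IDT
 * entry is 8 bytes, and entry 6 is the invalid-opcode vector, which
 * is why that fault is forwarded to do_invalid_op().
 */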

static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
			    unsigned long address)
{
#ifdef CONFIG_X86_32
	if (!oops_may_print())
		return;

#ifdef CONFIG_X86_PAE
	if (error_code & PF_INSTR) {
		int level;
		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(KERN_CRIT "kernel tried to execute "
				"NX-protected page - exploit attempt? "
				"(uid: %d)\n", current->uid);
	}
#endif
	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");
	printk(KERN_CONT " at %08lx\n", address);

	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);
	dump_pagetable(address);
#else /* CONFIG_X86_64 */
	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");
	printk(KERN_CONT " at %016lx\n", address);

	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);
	dump_pagetable(address);
#endif
}

#ifdef CONFIG_X86_64
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}
#endif

/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;
	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	return 0;
#else
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the later
	   case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
#endif
}
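
/*
 * Summary of the two vmalloc_fault() paths above: on 32-bit the missing
 * pmd entry is copied from init_mm's reference page table via
 * vmalloc_sync_one(); on 64-bit only the top-level pgd entry is copied
 * from the reference table, and the lower levels are expected to be
 * shared already, hence the BUG_ON() checks on any mismatch.
 */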

int show_unhandled_signals = 1;

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, fault;
	unsigned long flags;
	int si_code;

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	si_code = SEGV_MAPERR;

	if (notify_page_fault(regs))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;

 again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work. ("enter $65535,$31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;

#ifdef CONFIG_X86_32
	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (v8086_mode(regs)) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
#endif
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space.
		 */
		if (is_prefetch(regs, address, error_code))
			return;

		if (is_errata100(regs, address))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
#ifdef CONFIG_X86_32
			"%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
#else
			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
#endif
			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
			tsk->comm, task_pid_nr(tsk), address, regs->ip,
			regs->sp, error_code);
			print_vma_addr(" in ", regs->ip);
			printk("\n");
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;

		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

	if (is_f00f_bug(regs, address))
		return;

no_context:
	/* Are we prepared to handle this kernel fault?  */
	if (fixup_exception(regs))
		return;

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */

	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */

	flags = oops_begin();

	show_fault_oops(regs, error_code, address);

	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(current)) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
	return;
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

void vmalloc_sync_all(void)
{
#ifdef CONFIG_X86_32
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = TASK_SIZE;
	unsigned long address;

	if (SHARED_KERNEL_PMD)
		return;

	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			unsigned long flags;
			struct page *page;

			spin_lock_irqsave(&pgd_lock, flags);
			for (page = pgd_list; page; page =
					(struct page *)page->index)
				if (!vmalloc_sync_one(page_address(page),
						      address)) {
					BUG_ON(page != pgd_list);
					break;
				}
			spin_unlock_irqrestore(&pgd_lock, flags);
			if (!page)
				set_bit(pgd_index(address), insync);
		}
		if (address == start && test_bit(pgd_index(address), insync))
			start = address + PGDIR_SIZE;
	}
#else /* CONFIG_X86_64 */
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
#endif
}