x86 mmiotrace: Use percpu instead of arrays.
arch/x86/mm/fault.c
/*
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>	/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/bootmem.h>	/* for max_low_pfn */
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>

/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)

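/*
 * Editor's illustrative sketch (not part of fault.c): a minimal stand-alone
 * user-space program showing how the hardware page fault error code maps
 * onto the PF_* bits defined above. The decode function and the demo input
 * value are made up for illustration; the bit values mirror the #defines.
 */
#if 0	/* example only, never compiled as part of the kernel */
#include <stdio.h>

/* repeated here so the sketch stands alone */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)

static void decode_pf_error_code(unsigned long error_code)
{
	printf("%s page, %s access, %s mode%s%s\n",
	       (error_code & PF_PROT)  ? "protection fault on present" : "not-present",
	       (error_code & PF_WRITE) ? "write" : "read",
	       (error_code & PF_USER)  ? "user" : "kernel",
	       (error_code & PF_RSVD)  ? ", reserved bit set" : "",
	       (error_code & PF_INSTR) ? ", instruction fetch" : "");
}

int main(void)
{
	decode_pf_error_code(0x6);	/* PF_WRITE|PF_USER: user write to a not-present page */
	return 0;
}
#endif
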
#ifdef CONFIG_MMIOTRACE_HOOKS
static pf_handler_func mmiotrace_pf_handler; /* protected by RCU */
static DEFINE_SPINLOCK(mmiotrace_handler_lock);

int mmiotrace_register_pf(pf_handler_func new_pfh)
{
	int ret = 0;
	unsigned long flags;
	spin_lock_irqsave(&mmiotrace_handler_lock, flags);
	if (mmiotrace_pf_handler)
		ret = -EBUSY;
	else
		mmiotrace_pf_handler = new_pfh;
	spin_unlock_irqrestore(&mmiotrace_handler_lock, flags);
	return ret;
}
EXPORT_SYMBOL_GPL(mmiotrace_register_pf);

/**
 * mmiotrace_unregister_pf:
 * The caller must ensure @old_pfh is not in use anymore before freeing it.
 * This function does not guarantee it. The handler function pointer is
 * protected by RCU, so you can do this by e.g. calling synchronize_rcu().
 */
int mmiotrace_unregister_pf(pf_handler_func old_pfh)
{
	int ret = 0;
	unsigned long flags;
	spin_lock_irqsave(&mmiotrace_handler_lock, flags);
	if (mmiotrace_pf_handler != old_pfh)
		ret = -EPERM;
	else
		mmiotrace_pf_handler = NULL;
	spin_unlock_irqrestore(&mmiotrace_handler_lock, flags);
	return ret;
}
EXPORT_SYMBOL_GPL(mmiotrace_unregister_pf);
#endif /* CONFIG_MMIOTRACE_HOOKS */
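
/*
 * Editor's illustrative sketch (not part of fault.c): how a hypothetical
 * mmiotrace client might use the registration API above. The handler and
 * init/exit names are assumptions for illustration only; the calls to
 * mmiotrace_register_pf()/mmiotrace_unregister_pf() and the
 * synchronize_rcu() step follow the comment on mmiotrace_unregister_pf().
 */
#if 0	/* example only */
static int my_pf_handler(struct pt_regs *regs, unsigned long error_code,
			 unsigned long address)
{
	/* Return non-zero when the fault was consumed by the tracer. */
	return 0;
}

static int __init my_tracer_init(void)
{
	return mmiotrace_register_pf(my_pf_handler);	/* -EBUSY if a handler is already set */
}

static void __exit my_tracer_exit(void)
{
	mmiotrace_unregister_pf(my_pf_handler);
	/* Readers run under rcu_read_lock(); wait before the handler goes away. */
	synchronize_rcu();
}

module_init(my_tracer_init);
module_exit(my_tracer_exit);
#endif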

/* returns non-zero if do_page_fault() should return */
static inline int call_mmiotrace(struct pt_regs *regs,
					unsigned long error_code,
					unsigned long address)
{
#ifdef CONFIG_MMIOTRACE_HOOKS
	int ret = 0;
	rcu_read_lock();
	if (mmiotrace_pf_handler)
		ret = mmiotrace_pf_handler(regs, error_code, address);
	rcu_read_unlock();
	return ret;
#else
	return 0;
#endif
}

static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
#ifdef CONFIG_X86_32
	if (!user_mode_vm(regs)) {
#else
	if (!user_mode(regs)) {
#endif
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/*
	 * If it was an exec (instruction fetch) fault on an NX page, then
	 * do not ignore the fault:
	 */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

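/*
 * Editor's illustrative sketch (not part of fault.c): the prefix-byte
 * classification used by is_prefetch() above, pulled out into a tiny
 * stand-alone program. Only the 32-bit prefix cases are reproduced; the
 * sample opcode bytes in main() are made up for demonstration.
 */
#if 0	/* example only */
#include <stdio.h>

/* Return 1 if 'opcode' is a prefix byte that keeps the scan going. */
static int scan_continues(unsigned char opcode)
{
	unsigned char instr_hi = opcode & 0xf0;
	unsigned char instr_lo = opcode & 0x0f;

	switch (instr_hi) {
	case 0x20:
	case 0x30:
		/* 0x26, 0x2E, 0x36, 0x3E: segment override prefixes */
		return (instr_lo & 7) == 0x6;
	case 0x60:
		/* 0x64 thru 0x67: segment and operand/address size prefixes */
		return (instr_lo & 0xC) == 0x4;
	case 0xF0:
		/* 0xF0, 0xF2, 0xF3: lock/rep prefixes */
		return !instr_lo || (instr_lo >> 1) == 1;
	default:
		return 0;
	}
}

int main(void)
{
	printf("0x66 -> %d\n", scan_continues(0x66));	/* 1: operand-size prefix */
	printf("0x90 -> %d\n", scan_continues(0x90));	/* 0: nop, stop scanning */
	return 0;
}
#endif
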
static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

#ifdef CONFIG_X86_64
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}
#endif

static void dump_pagetable(unsigned long address)
{
#ifdef CONFIG_X86_32
	__typeof__(pte_val(__pte(0))) page;

	page = read_cr3();
	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
#ifdef CONFIG_X86_PAE
	printk("*pdpt = %016Lx ", page);
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && page & _PAGE_PRESENT) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
							 & (PTRS_PER_PMD - 1)];
		printk(KERN_CONT "*pde = %016Lx ", page);
		page &= ~_PAGE_NX;
	}
#else
	printk("*pde = %08lx ", page);
#endif

	/*
	 * We must not directly access the pte in the highpte
	 * case if the page table is located in highmem.
	 * And let's rather not kmap-atomic the pte, just in case
	 * it's allocated already.
	 */
	if ((page >> PAGE_SHIFT) < max_low_pfn
	    && (page & _PAGE_PRESENT)
	    && !(page & _PAGE_PSE)) {
		page &= PAGE_MASK;
		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
							 & (PTRS_PER_PTE - 1)];
		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
	}

	printk("\n");
#else /* CONFIG_X86_64 */
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud) || pud_large(*pud))
		goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
#endif
}

#ifdef CONFIG_X86_32
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
	unsigned index = pgd_index(address);
	pgd_t *pgd_k;
	pud_t *pud, *pud_k;
	pmd_t *pmd, *pmd_k;

	pgd += index;
	pgd_k = init_mm.pgd + index;

	if (!pgd_present(*pgd_k))
		return NULL;

	/*
	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
	 * and redundant with the set_pmd() on non-PAE. As would
	 * set_pud.
	 */

	pud = pud_offset(pgd, address);
	pud_k = pud_offset(pgd_k, address);
	if (!pud_present(*pud_k))
		return NULL;

	pmd = pmd_offset(pud, address);
	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_present(*pmd_k))
		return NULL;
	if (!pmd_present(*pmd)) {
		set_pmd(pmd, *pmd_k);
		arch_flush_lazy_mmu_mode();
	} else
		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
	return pmd_k;
}
#endif

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here.
   Does nothing for X86_32
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

/*
 * Work around K8 erratum #100: K8 in compat mode occasionally jumps to
 * illegal addresses >4GB. We catch this in the page fault handler because
 * these addresses are not reachable. Just detect this case and return.
 * Any code segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
	    (address >> 32))
		return 1;
#endif
	return 0;
}

void do_invalid_op(struct pt_regs *, unsigned long);

static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long nr;
	/*
	 * Pentium F0 0F C7 C8 bug workaround.
	 */
	if (boot_cpu_data.f00f_bug) {
		nr = (address - idt_descr.address) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}

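/*
 * Editor's illustrative note (not part of fault.c): is_f00f_bug() above maps
 * the faulting address back to an IDT vector by assuming 8-byte descriptors,
 * i.e. nr = (address - idt_descr.address) >> 3. A fault at idt_base + 6*8
 * therefore yields vector 6 (#UD, invalid opcode), which is why the
 * workaround reports it through do_invalid_op().
 */
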
static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
			    unsigned long address)
{
#ifdef CONFIG_X86_32
	if (!oops_may_print())
		return;
#endif

#ifdef CONFIG_X86_PAE
	if (error_code & PF_INSTR) {
		unsigned int level;
		pte_t *pte = lookup_address(address, &level);

		if (pte && pte_present(*pte) && !pte_exec(*pte))
			printk(KERN_CRIT "kernel tried to execute "
				"NX-protected page - exploit attempt? "
				"(uid: %d)\n", current->uid);
	}
#endif

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");
#ifdef CONFIG_X86_32
	printk(KERN_CONT " at %08lx\n", address);
#else
	printk(KERN_CONT " at %016lx\n", address);
#endif
	printk(KERN_ALERT "IP:");
	printk_address(regs->ip, 1);
	dump_pagetable(address);
}

#ifdef CONFIG_X86_64
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}
#endif

static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
	if ((error_code & PF_WRITE) && !pte_write(*pte))
		return 0;
	if ((error_code & PF_INSTR) && !pte_exec(*pte))
		return 0;

	return 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry. This allows
 * us to lazily refresh the TLB when increasing the permissions of a
 * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
 * expensive since that implies doing a full cross-processor TLB
 * flush, even if no stale TLB entries exist on other processors.
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 */
static int spurious_fault(unsigned long address,
			  unsigned long error_code)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Reserved-bit violation or user access to kernel space? */
	if (error_code & (PF_USER | PF_RSVD))
		return 0;

	pgd = init_mm.pgd + pgd_index(address);
	if (!pgd_present(*pgd))
		return 0;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		return 0;

	if (pud_large(*pud))
		return spurious_fault_check(error_code, (pte_t *) pud);

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return spurious_fault_check(error_code, (pte_t *) pmd);

	pte = pte_offset_kernel(pmd, address);
	if (!pte_present(*pte))
		return 0;

	return spurious_fault_check(error_code, pte);
}

/*
 * X86_32
 * Handle a fault on the vmalloc or module mapping area
 *
 * X86_64
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;
	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	return 0;
#else
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Make sure we are in vmalloc area */
	if (!(address >= VMALLOC_START && address < VMALLOC_END))
		return -1;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
#endif
}

int show_unhandled_signals = 1;

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
#ifdef CONFIG_X86_64
asmlinkage
#endif
void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, si_code;
	int fault;
#ifdef CONFIG_X86_64
	unsigned long flags;
#endif

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	si_code = SEGV_MAPERR;

	if (notify_page_fault(regs))
		return;
	if (call_mmiotrace(regs, error_code, address))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
#ifdef CONFIG_X86_32
	if (unlikely(address >= TASK_SIZE)) {
#else
	if (unlikely(address >= TASK_SIZE64)) {
#endif
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    vmalloc_fault(address) >= 0)
			return;

		/* Can handle a stale RO->RW TLB */
		if (spurious_fault(address, error_code))
			return;

		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}


#ifdef CONFIG_X86_32
	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
	   fault has been handled. */
	if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
		local_irq_enable();

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (in_atomic() || !mm)
		goto bad_area_nosemaphore;
#else /* CONFIG_X86_64 */
	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;
again:
#endif
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space. All other faults represent errors in the
	 * kernel and should generate an OOPS. Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space. Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source. If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work. ("enter $65535,$31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

#ifdef CONFIG_X86_32
survive:
#endif
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;

#ifdef CONFIG_X86_32
	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (v8086_mode(regs)) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
#endif
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		/*
		 * Valid to do another page fault here because this one came
		 * from user space.
		 */
		if (is_prefetch(regs, address, error_code))
			return;

		if (is_errata100(regs, address))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
#ifdef CONFIG_X86_32
			"%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
#else
			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
#endif
			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
			tsk->comm, task_pid_nr(tsk), address, regs->ip,
			regs->sp, error_code);
			print_vma_addr(" in ", regs->ip);
			printk("\n");
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

	if (is_f00f_bug(regs, address))
		return;

no_context:
	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * X86_32
	 * Valid to do another page fault here, because if this fault
	 * had been triggered by is_prefetch fixup_exception would have
	 * handled it.
	 *
	 * X86_64
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */
#ifdef CONFIG_X86_32
	bust_spinlocks(1);
#else
	flags = oops_begin();
#endif

	show_fault_oops(regs, error_code, address);

	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;

#ifdef CONFIG_X86_32
	die("Oops", regs, error_code);
	bust_spinlocks(0);
	do_exit(SIGKILL);
#else
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);
#endif

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(tsk)) {
		yield();
#ifdef CONFIG_X86_32
		down_read(&mm->mmap_sem);
		goto survive;
#else
		goto again;
#endif
	}

	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;
#ifdef CONFIG_X86_32
	/* User space => ok to do another page fault */
	if (is_prefetch(regs, address, error_code))
		return;
#endif
	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

void vmalloc_sync_all(void)
{
#ifdef CONFIG_X86_32
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = TASK_SIZE;
	unsigned long address;

	if (SHARED_KERNEL_PMD)
		return;

	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			unsigned long flags;
			struct page *page;

			spin_lock_irqsave(&pgd_lock, flags);
			list_for_each_entry(page, &pgd_list, lru) {
				if (!vmalloc_sync_one(page_address(page),
						      address))
					break;
			}
			spin_unlock_irqrestore(&pgd_lock, flags);
			if (!page)
				set_bit(pgd_index(address), insync);
		}
		if (address == start && test_bit(pgd_index(address), insync))
			start = address + PGDIR_SIZE;
	}
#else /* CONFIG_X86_64 */
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			unsigned long flags;
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock_irqsave(&pgd_lock, flags);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock_irqrestore(&pgd_lock, flags);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
#endif
}