x86: check for and defend against BIOS memory corruption
arch/x86/mm/fault.c
1da177e4 1/*
1da177e4
LT
2 * Copyright (C) 1995 Linus Torvalds
3 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
4 */
5
1da177e4
LT
6#include <linux/signal.h>
7#include <linux/sched.h>
8#include <linux/kernel.h>
9#include <linux/errno.h>
10#include <linux/string.h>
11#include <linux/types.h>
12#include <linux/ptrace.h>
0fd0e3da 13#include <linux/mmiotrace.h>
1da177e4
LT
14#include <linux/mman.h>
15#include <linux/mm.h>
16#include <linux/smp.h>
1da177e4
LT
17#include <linux/interrupt.h>
18#include <linux/init.h>
19#include <linux/tty.h>
20#include <linux/vt_kern.h> /* For unblank_screen() */
21#include <linux/compiler.h>
c61e211d
HH
22#include <linux/highmem.h>
23#include <linux/bootmem.h> /* for max_low_pfn */
1eeb66a1 24#include <linux/vmalloc.h>
1da177e4 25#include <linux/module.h>
0f2fbdcb 26#include <linux/kprobes.h>
ab2bf0c1 27#include <linux/uaccess.h>
1eeb66a1 28#include <linux/kdebug.h>
1da177e4
LT
29
30#include <asm/system.h>
c61e211d
HH
31#include <asm/desc.h>
32#include <asm/segment.h>
1da177e4
LT
33#include <asm/pgalloc.h>
34#include <asm/smp.h>
35#include <asm/tlbflush.h>
36#include <asm/proto.h>
1da177e4 37#include <asm-generic/sections.h>
1da177e4 38
33cb5243
HH
39/*
40 * Page fault error code bits
41 * bit 0 == 0 means no page found, 1 means protection fault
42 * bit 1 == 0 means read, 1 means write
43 * bit 2 == 0 means kernel, 1 means user-mode
44 * bit 3 == 1 means use of reserved bit detected
45 * bit 4 == 1 means fault was an instruction fetch
46 */
8a19da7b 47#define PF_PROT (1<<0)
66c58156 48#define PF_WRITE (1<<1)
8a19da7b
IM
49#define PF_USER (1<<2)
50#define PF_RSVD (1<<3)
66c58156
AK
51#define PF_INSTR (1<<4)
52
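/*
 * Illustrative only -- not part of the original file.  A minimal,
 * self-contained sketch of how the PF_* bits above decode the hardware
 * error code pushed by the CPU; the struct and function names here are
 * made up for the example.
 */
struct pf_decoded {
	int protection_fault;	/* PF_PROT:  0 = page not present */
	int write;		/* PF_WRITE: 0 = read, 1 = write  */
	int user;		/* PF_USER:  0 = kernel, 1 = user */
	int reserved_bit;	/* PF_RSVD */
	int instruction_fetch;	/* PF_INSTR */
};

static inline struct pf_decoded decode_pf_error_code(unsigned long ec)
{
	struct pf_decoded d = {
		.protection_fault  = !!(ec & (1 << 0)),
		.write             = !!(ec & (1 << 1)),
		.user              = !!(ec & (1 << 2)),
		.reserved_bit      = !!(ec & (1 << 3)),
		.instruction_fetch = !!(ec & (1 << 4)),
	};
	return d;
}

/* e.g. error_code 0x6 (PF_USER|PF_WRITE): a user-mode write to a not-present page. */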
0fd0e3da 53static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
86069782 54{
10c43d2e 55#ifdef CONFIG_MMIOTRACE_HOOKS
0fd0e3da
PP
56 if (unlikely(is_kmmio_active()))
57 if (kmmio_handler(regs, addr) == 1)
58 return -1;
86069782 59#endif
0fd0e3da 60 return 0;
86069782
PP
61}
62
74a0b576 63static inline int notify_page_fault(struct pt_regs *regs)
1bd858a5 64{
33cb5243 65#ifdef CONFIG_KPROBES
74a0b576
CH
66 int ret = 0;
67
68 /* kprobe_running() needs smp_processor_id() */
f8c2ee22 69 if (!user_mode_vm(regs)) {
74a0b576
CH
70 preempt_disable();
71 if (kprobe_running() && kprobe_fault_handler(regs, 14))
72 ret = 1;
73 preempt_enable();
74 }
1bd858a5 75
74a0b576 76 return ret;
74a0b576 77#else
74a0b576 78 return 0;
74a0b576 79#endif
33cb5243 80}
1bd858a5 81
1dc85be0
HH
82/*
83 * X86_32
84 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
85 * Check that here and ignore it.
86 *
87 * X86_64
88 * Sometimes the CPU reports invalid exceptions on prefetch.
89 * Check that here and ignore it.
90 *
91 * Opcode checker based on code by Richard Brunner
92 */
93static int is_prefetch(struct pt_regs *regs, unsigned long addr,
94 unsigned long error_code)
33cb5243 95{
ab2bf0c1 96 unsigned char *instr;
1da177e4 97 int scan_more = 1;
33cb5243 98 int prefetch = 0;
f1290ec9 99 unsigned char *max_instr;
1da177e4 100
3085354d
IM
101 /*
 102 * If it was an exec (instruction fetch) fault on an NX page, then
103 * do not ignore the fault:
104 */
66c58156 105 if (error_code & PF_INSTR)
1da177e4 106 return 0;
1dc85be0 107
f2857ce9 108 instr = (unsigned char *)convert_ip_to_linear(current, regs);
f1290ec9 109 max_instr = instr + 15;
1da177e4 110
76381fee 111 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
1da177e4
LT
112 return 0;
113
33cb5243 114 while (scan_more && instr < max_instr) {
1da177e4
LT
115 unsigned char opcode;
116 unsigned char instr_hi;
117 unsigned char instr_lo;
118
ab2bf0c1 119 if (probe_kernel_address(instr, opcode))
33cb5243 120 break;
1da177e4 121
33cb5243
HH
122 instr_hi = opcode & 0xf0;
123 instr_lo = opcode & 0x0f;
1da177e4
LT
124 instr++;
125
33cb5243 126 switch (instr_hi) {
1da177e4
LT
127 case 0x20:
128 case 0x30:
33cb5243
HH
129 /*
130 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
131 * In X86_64 long mode, the CPU will signal invalid
 132 * opcode if some of these prefixes are present, so
 133 * X86_64 will never get here anyway.
134 */
1da177e4
LT
135 scan_more = ((instr_lo & 7) == 0x6);
136 break;
33cb5243 137#ifdef CONFIG_X86_64
1da177e4 138 case 0x40:
33cb5243
HH
139 /*
 140 * In AMD64 long mode 0x40..0x4F are valid REX prefixes.
 141 * We need to figure out under what instruction mode the
 142 * instruction was issued. We could check the LDT for lm,
143 * but for now it's good enough to assume that long
144 * mode only uses well known segments or kernel.
145 */
76381fee 146 scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
1da177e4 147 break;
33cb5243 148#endif
1da177e4
LT
149 case 0x60:
150 /* 0x64 thru 0x67 are valid prefixes in all modes. */
151 scan_more = (instr_lo & 0xC) == 0x4;
33cb5243 152 break;
1da177e4 153 case 0xF0:
1dc85be0 154 /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
1da177e4 155 scan_more = !instr_lo || (instr_lo>>1) == 1;
33cb5243 156 break;
1da177e4
LT
157 case 0x00:
158 /* Prefetch instruction is 0x0F0D or 0x0F18 */
159 scan_more = 0;
f2857ce9 160
ab2bf0c1 161 if (probe_kernel_address(instr, opcode))
1da177e4
LT
162 break;
163 prefetch = (instr_lo == 0xF) &&
164 (opcode == 0x0D || opcode == 0x18);
33cb5243 165 break;
1da177e4
LT
166 default:
167 scan_more = 0;
168 break;
33cb5243 169 }
1da177e4
LT
170 }
171 return prefetch;
172}
173
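/*
 * Illustrative only -- not part of the original file.  A stripped-down,
 * user-space rendition of the prefix scan above, run over a plain byte
 * buffer instead of the faulting context.  It mirrors only the opcode
 * cases handled above (the 64-bit REX case is omitted) and is not a
 * general x86 decoder.
 */
static int looks_like_prefetch(const unsigned char *instr, int len)
{
	const unsigned char *max_instr = instr + (len < 15 ? len : 15);
	int scan_more = 1;

	while (scan_more && instr < max_instr) {
		unsigned char opcode = *instr++;
		unsigned char instr_hi = opcode & 0xf0;
		unsigned char instr_lo = opcode & 0x0f;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/* 0x26, 0x2E, 0x36, 0x3E: segment override prefixes */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
		case 0x60:
			/* 0x64..0x67: valid prefixes in all modes */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3: lock/rep prefixes */
			scan_more = !instr_lo || (instr_lo >> 1) == 1;
			break;
		case 0x00:
			/* Prefetch is the two-byte opcode 0x0F 0x0D or 0x0F 0x18 */
			if (instr_lo == 0xF && instr < max_instr)
				return *instr == 0x0D || *instr == 0x18;
			return 0;
		default:
			return 0;
		}
	}
	return 0;
}

/* e.g. bytes { 0x66, 0x0F, 0x18, ... } scan as a prefetch; { 0x0F, 0xAF, ... } (imul) do not. */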
c4aba4a8
HH
174static void force_sig_info_fault(int si_signo, int si_code,
175 unsigned long address, struct task_struct *tsk)
176{
177 siginfo_t info;
178
179 info.si_signo = si_signo;
180 info.si_errno = 0;
181 info.si_code = si_code;
182 info.si_addr = (void __user *)address;
183 force_sig_info(si_signo, &info, tsk);
184}
185
1156e098 186#ifdef CONFIG_X86_64
33cb5243
HH
187static int bad_address(void *p)
188{
1da177e4 189 unsigned long dummy;
ab2bf0c1 190 return probe_kernel_address((unsigned long *)p, dummy);
33cb5243 191}
1156e098 192#endif
1da177e4 193
cae30f82 194static void dump_pagetable(unsigned long address)
1da177e4 195{
1156e098
HH
196#ifdef CONFIG_X86_32
197 __typeof__(pte_val(__pte(0))) page;
198
199 page = read_cr3();
200 page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
201#ifdef CONFIG_X86_PAE
202 printk("*pdpt = %016Lx ", page);
203 if ((page >> PAGE_SHIFT) < max_low_pfn
204 && page & _PAGE_PRESENT) {
205 page &= PAGE_MASK;
206 page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
207 & (PTRS_PER_PMD - 1)];
208 printk(KERN_CONT "*pde = %016Lx ", page);
209 page &= ~_PAGE_NX;
210 }
211#else
212 printk("*pde = %08lx ", page);
213#endif
214
215 /*
216 * We must not directly access the pte in the highpte
217 * case if the page table is located in highmem.
218 * And let's rather not kmap-atomic the pte, just in case
219 * it's allocated already.
220 */
221 if ((page >> PAGE_SHIFT) < max_low_pfn
222 && (page & _PAGE_PRESENT)
223 && !(page & _PAGE_PSE)) {
224 page &= PAGE_MASK;
225 page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
226 & (PTRS_PER_PTE - 1)];
227 printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
228 }
229
230 printk("\n");
231#else /* CONFIG_X86_64 */
1da177e4
LT
232 pgd_t *pgd;
233 pud_t *pud;
234 pmd_t *pmd;
235 pte_t *pte;
236
f51c9452 237 pgd = (pgd_t *)read_cr3();
1da177e4 238
33cb5243 239 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
1da177e4 240 pgd += pgd_index(address);
1da177e4 241 if (bad_address(pgd)) goto bad;
d646bce4 242 printk("PGD %lx ", pgd_val(*pgd));
33cb5243 243 if (!pgd_present(*pgd)) goto ret;
1da177e4 244
d2ae5b5f 245 pud = pud_offset(pgd, address);
1da177e4
LT
246 if (bad_address(pud)) goto bad;
247 printk("PUD %lx ", pud_val(*pud));
b5360222
AK
248 if (!pud_present(*pud) || pud_large(*pud))
249 goto ret;
1da177e4
LT
250
251 pmd = pmd_offset(pud, address);
252 if (bad_address(pmd)) goto bad;
253 printk("PMD %lx ", pmd_val(*pmd));
b1992df3 254 if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
1da177e4
LT
255
256 pte = pte_offset_kernel(pmd, address);
257 if (bad_address(pte)) goto bad;
33cb5243 258 printk("PTE %lx", pte_val(*pte));
1da177e4
LT
259ret:
260 printk("\n");
261 return;
262bad:
263 printk("BAD\n");
1156e098
HH
264#endif
265}
266
267#ifdef CONFIG_X86_32
268static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
269{
270 unsigned index = pgd_index(address);
271 pgd_t *pgd_k;
272 pud_t *pud, *pud_k;
273 pmd_t *pmd, *pmd_k;
274
275 pgd += index;
276 pgd_k = init_mm.pgd + index;
277
278 if (!pgd_present(*pgd_k))
279 return NULL;
280
281 /*
282 * set_pgd(pgd, *pgd_k); here would be useless on PAE
283 * and redundant with the set_pmd() on non-PAE. As would
284 * set_pud.
285 */
286
287 pud = pud_offset(pgd, address);
288 pud_k = pud_offset(pgd_k, address);
289 if (!pud_present(*pud_k))
290 return NULL;
291
292 pmd = pmd_offset(pud, address);
293 pmd_k = pmd_offset(pud_k, address);
294 if (!pmd_present(*pmd_k))
295 return NULL;
296 if (!pmd_present(*pmd)) {
297 set_pmd(pmd, *pmd_k);
298 arch_flush_lazy_mmu_mode();
299 } else
300 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
301 return pmd_k;
1da177e4 302}
1156e098 303#endif
1da177e4 304
1dc85be0 305#ifdef CONFIG_X86_64
33cb5243 306static const char errata93_warning[] =
1da177e4
LT
307KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
308KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
309KERN_ERR "******* Please consider a BIOS update.\n"
310KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
fdfe8aa8 311#endif
1da177e4
LT
312
313/* Workaround for K8 erratum #93 & buggy BIOS.
314 BIOS SMM functions are required to use a specific workaround
33cb5243
HH
 315 to avoid corruption of the 64-bit RIP register on C-stepping K8.
 316 Many BIOSes that were never tested properly miss this.
 317 The OS sees this as a page fault with the upper 32 bits of RIP cleared.
 318 Try to work around it here.
 319 Note we only handle faults in the kernel here.
 320 Does nothing on X86_32.
321 */
33cb5243 322static int is_errata93(struct pt_regs *regs, unsigned long address)
1da177e4 323{
fdfe8aa8 324#ifdef CONFIG_X86_64
1da177e4 325 static int warned;
65ea5b03 326 if (address != regs->ip)
1da177e4 327 return 0;
33cb5243 328 if ((address >> 32) != 0)
1da177e4
LT
329 return 0;
330 address |= 0xffffffffUL << 32;
33cb5243
HH
331 if ((address >= (u64)_stext && address <= (u64)_etext) ||
332 (address >= MODULES_VADDR && address <= MODULES_END)) {
1da177e4 333 if (!warned) {
33cb5243 334 printk(errata93_warning);
1da177e4
LT
335 warned = 1;
336 }
65ea5b03 337 regs->ip = address;
1da177e4
LT
338 return 1;
339 }
fdfe8aa8 340#endif
1da177e4 341 return 0;
33cb5243 342}
1da177e4 343
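/*
 * Illustrative only -- a worked example of the fix-up above, using a
 * made-up address.  Suppose the kernel was executing at
 * 0xffffffff80212345 and the erratum clears the upper half of RIP: the
 * reported fault address is then 0x0000000080212345.  OR-ing the high
 * 32 bits back in recovers an address inside kernel text (or the module
 * area), so is_errata93() rewrites regs->ip and lets execution resume.
 */
static inline unsigned long long errata93_restore_rip(unsigned long long truncated_rip)
{
	/* 0x0000000080212345 -> 0xffffffff80212345 */
	return truncated_rip | (0xffffffffULL << 32);
}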
35f3266f
HH
344/*
 345 * Work around K8 erratum #100: K8 in compat mode occasionally jumps to illegal
346 * addresses >4GB. We catch this in the page fault handler because these
347 * addresses are not reachable. Just detect this case and return. Any code
348 * segment in LDT is compatibility mode.
349 */
350static int is_errata100(struct pt_regs *regs, unsigned long address)
351{
352#ifdef CONFIG_X86_64
353 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
354 (address >> 32))
355 return 1;
356#endif
357 return 0;
358}
359
29caf2f9
HH
360void do_invalid_op(struct pt_regs *, unsigned long);
361
362static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
363{
364#ifdef CONFIG_X86_F00F_BUG
365 unsigned long nr;
366 /*
367 * Pentium F0 0F C7 C8 bug workaround.
368 */
369 if (boot_cpu_data.f00f_bug) {
370 nr = (address - idt_descr.address) >> 3;
371
372 if (nr == 6) {
373 do_invalid_op(regs, 0);
374 return 1;
375 }
376 }
377#endif
378 return 0;
379}
380
b3279c7f
HH
381static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
382 unsigned long address)
383{
1156e098
HH
384#ifdef CONFIG_X86_32
385 if (!oops_may_print())
386 return;
fd40d6e3 387#endif
1156e098
HH
388
389#ifdef CONFIG_X86_PAE
390 if (error_code & PF_INSTR) {
93809be8 391 unsigned int level;
1156e098
HH
392 pte_t *pte = lookup_address(address, &level);
393
394 if (pte && pte_present(*pte) && !pte_exec(*pte))
395 printk(KERN_CRIT "kernel tried to execute "
396 "NX-protected page - exploit attempt? "
397 "(uid: %d)\n", current->uid);
398 }
399#endif
1156e098 400
19f0dda9 401 printk(KERN_ALERT "BUG: unable to handle kernel ");
b3279c7f 402 if (address < PAGE_SIZE)
19f0dda9 403 printk(KERN_CONT "NULL pointer dereference");
b3279c7f 404 else
19f0dda9 405 printk(KERN_CONT "paging request");
f294a8ce 406 printk(KERN_CONT " at %p\n", (void *) address);
19f0dda9 407 printk(KERN_ALERT "IP:");
b3279c7f
HH
408 printk_address(regs->ip, 1);
409 dump_pagetable(address);
410}
411
1156e098 412#ifdef CONFIG_X86_64
1da177e4
LT
413static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
414 unsigned long error_code)
415{
1209140c 416 unsigned long flags = oops_begin();
6e3f3617 417 struct task_struct *tsk;
1209140c 418
1da177e4
LT
419 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
420 current->comm, address);
421 dump_pagetable(address);
6e3f3617
JB
422 tsk = current;
423 tsk->thread.cr2 = address;
424 tsk->thread.trap_no = 14;
425 tsk->thread.error_code = error_code;
22f5991c
JB
426 if (__die("Bad pagetable", regs, error_code))
427 regs = NULL;
428 oops_end(flags, regs, SIGKILL);
1da177e4 429}
1156e098 430#endif
1da177e4 431
d8b57bb7
TG
432static int spurious_fault_check(unsigned long error_code, pte_t *pte)
433{
434 if ((error_code & PF_WRITE) && !pte_write(*pte))
435 return 0;
436 if ((error_code & PF_INSTR) && !pte_exec(*pte))
437 return 0;
438
439 return 1;
440}
441
5b727a3b
JF
442/*
443 * Handle a spurious fault caused by a stale TLB entry. This allows
444 * us to lazily refresh the TLB when increasing the permissions of a
445 * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
446 * expensive since that implies doing a full cross-processor TLB
447 * flush, even if no stale TLB entries exist on other processors.
448 * There are no security implications to leaving a stale TLB when
449 * increasing the permissions on a page.
450 */
451static int spurious_fault(unsigned long address,
452 unsigned long error_code)
453{
454 pgd_t *pgd;
455 pud_t *pud;
456 pmd_t *pmd;
457 pte_t *pte;
458
459 /* Reserved-bit violation or user access to kernel space? */
460 if (error_code & (PF_USER | PF_RSVD))
461 return 0;
462
463 pgd = init_mm.pgd + pgd_index(address);
464 if (!pgd_present(*pgd))
465 return 0;
466
467 pud = pud_offset(pgd, address);
468 if (!pud_present(*pud))
469 return 0;
470
d8b57bb7
TG
471 if (pud_large(*pud))
472 return spurious_fault_check(error_code, (pte_t *) pud);
473
5b727a3b
JF
474 pmd = pmd_offset(pud, address);
475 if (!pmd_present(*pmd))
476 return 0;
477
d8b57bb7
TG
478 if (pmd_large(*pmd))
479 return spurious_fault_check(error_code, (pte_t *) pmd);
480
5b727a3b
JF
481 pte = pte_offset_kernel(pmd, address);
482 if (!pte_present(*pte))
483 return 0;
484
d8b57bb7 485 return spurious_fault_check(error_code, pte);
5b727a3b
JF
486}
487
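/*
 * Illustrative only -- not part of the original file.  A hypothetical
 * helper showing the scenario the comment above describes: CPU A upgrades
 * a kernel page from RO to RW and flushes only its own TLB.  If CPU B
 * still holds the stale read-only entry and then writes to the page, it
 * takes a fault, spurious_fault() observes that the current pte already
 * permits the write, and the access is simply retried -- no cross-CPU
 * TLB shootdown was ever required.
 */
static void make_kernel_page_writable(unsigned long address)
{
	unsigned int level;
	pte_t *pte = lookup_address(address, &level);

	if (pte && level == PG_LEVEL_4K)
		set_pte(pte, pte_mkwrite(*pte));

	/* Local flush only; other CPUs lazily recover via spurious_fault(). */
	__flush_tlb_one(address);
}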
1da177e4 488/*
f8c2ee22
HH
489 * X86_32
490 * Handle a fault on the vmalloc or module mapping area
491 *
492 * X86_64
f95190b2 493 * Handle a fault on the vmalloc area
3b9ba4d5
AK
494 *
495 * This assumes no large pages in there.
1da177e4
LT
496 */
497static int vmalloc_fault(unsigned long address)
498{
fdfe8aa8
HH
499#ifdef CONFIG_X86_32
500 unsigned long pgd_paddr;
501 pmd_t *pmd_k;
502 pte_t *pte_k;
b29c701d
HN
503
504 /* Make sure we are in vmalloc area */
505 if (!(address >= VMALLOC_START && address < VMALLOC_END))
506 return -1;
507
fdfe8aa8
HH
508 /*
509 * Synchronize this task's top level page-table
510 * with the 'reference' page table.
511 *
512 * Do _not_ use "current" here. We might be inside
513 * an interrupt in the middle of a task switch..
514 */
515 pgd_paddr = read_cr3();
516 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
517 if (!pmd_k)
518 return -1;
519 pte_k = pte_offset_kernel(pmd_k, address);
520 if (!pte_present(*pte_k))
521 return -1;
522 return 0;
523#else
1da177e4
LT
524 pgd_t *pgd, *pgd_ref;
525 pud_t *pud, *pud_ref;
526 pmd_t *pmd, *pmd_ref;
527 pte_t *pte, *pte_ref;
528
cf89ec92
HH
529 /* Make sure we are in vmalloc area */
530 if (!(address >= VMALLOC_START && address < VMALLOC_END))
531 return -1;
532
1da177e4
LT
533 /* Copy kernel mappings over when needed. This can also
 534 happen within a race in page table update. In the latter
535 case just flush. */
536
537 pgd = pgd_offset(current->mm ?: &init_mm, address);
538 pgd_ref = pgd_offset_k(address);
539 if (pgd_none(*pgd_ref))
540 return -1;
541 if (pgd_none(*pgd))
542 set_pgd(pgd, *pgd_ref);
8c914cb7 543 else
46a82b2d 544 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
1da177e4
LT
545
546 /* Below here mismatches are bugs because these lower tables
547 are shared */
548
549 pud = pud_offset(pgd, address);
550 pud_ref = pud_offset(pgd_ref, address);
551 if (pud_none(*pud_ref))
552 return -1;
46a82b2d 553 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
1da177e4
LT
554 BUG();
555 pmd = pmd_offset(pud, address);
556 pmd_ref = pmd_offset(pud_ref, address);
557 if (pmd_none(*pmd_ref))
558 return -1;
559 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
560 BUG();
561 pte_ref = pte_offset_kernel(pmd_ref, address);
562 if (!pte_present(*pte_ref))
563 return -1;
564 pte = pte_offset_kernel(pmd, address);
3b9ba4d5
AK
565 /* Don't use pte_page here, because the mappings can point
566 outside mem_map, and the NUMA hash lookup cannot handle
567 that. */
568 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
1da177e4 569 BUG();
1da177e4 570 return 0;
fdfe8aa8 571#endif
1da177e4
LT
572}
573
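/*
 * Illustrative only -- not part of the original file.  Why this path
 * exists: each process has its own pgd, and the top-level entries
 * covering the vmalloc area are filled in lazily.  A hypothetical
 * sequence like the one below can therefore fault on first touch even
 * though the mapping already exists in init_mm; vmalloc_fault() copies
 * the missing entry from the reference page table and the access is
 * retried.
 */
static int touch_fresh_vmalloc_mapping(void)
{
	void *p = vmalloc(PAGE_SIZE);	/* mapping is created in init_mm */

	if (!p)
		return -ENOMEM;

	/* May fault right here; resolved transparently by vmalloc_fault(). */
	((volatile char *)p)[0] = 1;

	vfree(p);
	return 0;
}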
abd4f750 574int show_unhandled_signals = 1;
1da177e4
LT
575
576/*
577 * This routine handles page faults. It determines the address,
578 * and the problem, and then passes it off to one of the appropriate
579 * routines.
1da177e4 580 */
f8c2ee22
HH
581#ifdef CONFIG_X86_64
582asmlinkage
583#endif
584void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
1da177e4
LT
585{
586 struct task_struct *tsk;
587 struct mm_struct *mm;
33cb5243 588 struct vm_area_struct *vma;
1da177e4 589 unsigned long address;
f8c2ee22
HH
590 int write, si_code;
591 int fault;
592#ifdef CONFIG_X86_64
1209140c 593 unsigned long flags;
f8c2ee22 594#endif
1da177e4 595
143a5d32
PZ
596 /*
597 * We can fault from pretty much anywhere, with unknown IRQ state.
598 */
599 trace_hardirqs_fixup();
600
a9ba9a3b
AV
601 tsk = current;
602 mm = tsk->mm;
603 prefetchw(&mm->mmap_sem);
604
1da177e4 605 /* get the address */
f51c9452 606 address = read_cr2();
1da177e4 607
c4aba4a8 608 si_code = SEGV_MAPERR;
1da177e4 609
608566b4
HH
610 if (notify_page_fault(regs))
611 return;
0fd0e3da 612 if (unlikely(kmmio_fault(regs, address)))
86069782 613 return;
1da177e4
LT
614
615 /*
616 * We fault-in kernel-space virtual memory on-demand. The
617 * 'reference' page table is init_mm.pgd.
618 *
619 * NOTE! We MUST NOT take any locks for this case. We may
620 * be in an interrupt or a critical region, and should
621 * only copy the information from the master page table,
622 * nothing more.
623 *
624 * This verifies that the fault happens in kernel space
625 * (error_code & 4) == 0, and that the fault was not a
8b1bde93 626 * protection error (error_code & 9) == 0.
1da177e4 627 */
f8c2ee22
HH
628#ifdef CONFIG_X86_32
629 if (unlikely(address >= TASK_SIZE)) {
cf89ec92
HH
630#else
631 if (unlikely(address >= TASK_SIZE64)) {
632#endif
f8c2ee22
HH
633 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
634 vmalloc_fault(address) >= 0)
635 return;
5b727a3b
JF
636
637 /* Can handle a stale RO->RW TLB */
638 if (spurious_fault(address, error_code))
639 return;
640
f8c2ee22
HH
641 /*
642 * Don't take the mm semaphore here. If we fixup a prefetch
643 * fault we could otherwise deadlock.
644 */
645 goto bad_area_nosemaphore;
646 }
647
cf89ec92
HH
648
649#ifdef CONFIG_X86_32
f8c2ee22
HH
 650 /* It's safe to allow IRQs after cr2 has been saved and the vmalloc
651 fault has been handled. */
6b6891f9 652 if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
f8c2ee22
HH
653 local_irq_enable();
654
655 /*
656 * If we're in an interrupt, have no user context or are running in an
657 * atomic region then we must not take the fault.
658 */
659 if (in_atomic() || !mm)
660 goto bad_area_nosemaphore;
661#else /* CONFIG_X86_64 */
65ea5b03 662 if (likely(regs->flags & X86_EFLAGS_IF))
8c914cb7
JB
663 local_irq_enable();
664
66c58156 665 if (unlikely(error_code & PF_RSVD))
1da177e4
LT
666 pgtable_bad(address, regs, error_code);
667
668 /*
33cb5243
HH
669 * If we're in an interrupt, have no user context or are running in an
670 * atomic region then we must not take the fault.
1da177e4
LT
671 */
672 if (unlikely(in_atomic() || !mm))
673 goto bad_area_nosemaphore;
674
dbe3ed1c
LT
675 /*
676 * User-mode registers count as a user access even for any
677 * potential system fault or CPU buglet.
678 */
679 if (user_mode_vm(regs))
680 error_code |= PF_USER;
f8c2ee22
HH
681again:
682#endif
1da177e4
LT
683 /* When running in the kernel we expect faults to occur only to
684 * addresses in user space. All other faults represent errors in the
676b1855 685 * kernel and should generate an OOPS. Unfortunately, in the case of an
80f7228b 686 * erroneous fault occurring in a code path which already holds mmap_sem
1da177e4
LT
687 * we will deadlock attempting to validate the fault against the
688 * address space. Luckily the kernel only validly references user
689 * space from well defined areas of code, which are listed in the
690 * exceptions table.
691 *
692 * As the vast majority of faults will be valid we will only perform
676b1855 693 * the source reference check when there is a possibility of a deadlock.
1da177e4
LT
694 * Attempt to lock the address space, if we cannot we then validate the
695 * source. If this is invalid we can skip the address space check,
696 * thus avoiding the deadlock.
697 */
698 if (!down_read_trylock(&mm->mmap_sem)) {
66c58156 699 if ((error_code & PF_USER) == 0 &&
65ea5b03 700 !search_exception_tables(regs->ip))
1da177e4
LT
701 goto bad_area_nosemaphore;
702 down_read(&mm->mmap_sem);
703 }
704
705 vma = find_vma(mm, address);
706 if (!vma)
707 goto bad_area;
f8c2ee22 708 if (vma->vm_start <= address)
1da177e4
LT
709 goto good_area;
710 if (!(vma->vm_flags & VM_GROWSDOWN))
711 goto bad_area;
33cb5243 712 if (error_code & PF_USER) {
6f4d368e
HH
713 /*
714 * Accessing the stack below %sp is always a bug.
715 * The large cushion allows instructions like enter
716 * and pusha to work. ("enter $65535,$31" pushes
717 * 32 pointers and then decrements %sp by 65535.)
03fdc2c2 718 */
65ea5b03 719 if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
1da177e4
LT
720 goto bad_area;
721 }
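	/*
	 * Illustrative arithmetic for the cushion above (not in the original
	 * source): on x86_64, "enter $65535, $31" pushes 32 eight-byte frame
	 * pointers (256 bytes) and then subtracts 65535 from %sp, so a
	 * legitimate store may land up to roughly 65535 + 256 bytes below the
	 * saved %sp.  The test uses 65536 + 32 * sizeof(unsigned long) as a
	 * slightly generous bound; anything further below %sp is treated as a
	 * bug rather than stack growth.
	 */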
722 if (expand_stack(vma, address))
723 goto bad_area;
724/*
725 * Ok, we have a good vm_area for this memory access, so
726 * we can handle it..
727 */
728good_area:
c4aba4a8 729 si_code = SEGV_ACCERR;
1da177e4 730 write = 0;
66c58156 731 switch (error_code & (PF_PROT|PF_WRITE)) {
33cb5243
HH
732 default: /* 3: write, present */
733 /* fall through */
734 case PF_WRITE: /* write, not present */
735 if (!(vma->vm_flags & VM_WRITE))
736 goto bad_area;
737 write++;
738 break;
739 case PF_PROT: /* read, present */
740 goto bad_area;
741 case 0: /* read, not present */
742 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
1da177e4 743 goto bad_area;
1da177e4
LT
744 }
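	/*
	 * Illustrative summary, not in the original source: the four
	 * (PF_PROT, PF_WRITE) combinations handled by the switch above.
	 *
	 *   error_code          meaning              requirement
	 *   PF_PROT|PF_WRITE    write, present       vma must have VM_WRITE
	 *   PF_WRITE            write, not present   vma must have VM_WRITE
	 *   PF_PROT             read, present        always bad_area (protection fault)
	 *   0                   read, not present    vma must have VM_READ, VM_EXEC or VM_WRITE
	 */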
745
f8c2ee22
HH
746#ifdef CONFIG_X86_32
747survive:
748#endif
1da177e4
LT
749 /*
750 * If for any reason at all we couldn't handle the fault,
751 * make sure we exit gracefully rather than endlessly redo
752 * the fault.
753 */
83c54070
NP
754 fault = handle_mm_fault(mm, vma, address, write);
755 if (unlikely(fault & VM_FAULT_ERROR)) {
756 if (fault & VM_FAULT_OOM)
757 goto out_of_memory;
758 else if (fault & VM_FAULT_SIGBUS)
759 goto do_sigbus;
760 BUG();
1da177e4 761 }
83c54070
NP
762 if (fault & VM_FAULT_MAJOR)
763 tsk->maj_flt++;
764 else
765 tsk->min_flt++;
d729ab35
HH
766
767#ifdef CONFIG_X86_32
768 /*
769 * Did it hit the DOS screen memory VA from vm86 mode?
770 */
771 if (v8086_mode(regs)) {
772 unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
773 if (bit < 32)
774 tsk->thread.screen_bitmap |= 1 << bit;
775 }
776#endif
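	/*
	 * Illustrative arithmetic, not in the original source: a vm86-mode
	 * fault at 0xA3000 gives bit = (0xA3000 - 0xA0000) >> PAGE_SHIFT = 3,
	 * marking the fourth 4 KB page of the legacy VGA window in the
	 * task's screen_bitmap.
	 */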
1da177e4
LT
777 up_read(&mm->mmap_sem);
778 return;
779
780/*
781 * Something tried to access memory that isn't in our memory map..
782 * Fix it, but check if it's kernel or user first..
783 */
784bad_area:
785 up_read(&mm->mmap_sem);
786
787bad_area_nosemaphore:
1da177e4 788 /* User mode accesses just cause a SIGSEGV */
66c58156 789 if (error_code & PF_USER) {
e5e3c84b
SR
790 /*
791 * It's possible to have interrupts off here.
792 */
793 local_irq_enable();
794
1156e098
HH
795 /*
796 * Valid to do another page fault here because this one came
797 * from user space.
798 */
1da177e4
LT
799 if (is_prefetch(regs, address, error_code))
800 return;
801
35f3266f 802 if (is_errata100(regs, address))
1da177e4
LT
803 return;
804
abd4f750
MAS
805 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
806 printk_ratelimit()) {
1da177e4 807 printk(
f294a8ce 808 "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
6f4d368e 809 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
f294a8ce
VN
810 tsk->comm, task_pid_nr(tsk), address,
811 (void *) regs->ip, (void *) regs->sp, error_code);
03252919
AK
812 print_vma_addr(" in ", regs->ip);
813 printk("\n");
1da177e4 814 }
33cb5243 815
1da177e4
LT
816 tsk->thread.cr2 = address;
817 /* Kernel addresses are always protection faults */
818 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
819 tsk->thread.trap_no = 14;
c4aba4a8 820 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
1da177e4
LT
821 return;
822 }
823
29caf2f9
HH
824 if (is_f00f_bug(regs, address))
825 return;
826
1da177e4 827no_context:
1da177e4 828 /* Are we prepared to handle this kernel fault? */
33cb5243 829 if (fixup_exception(regs))
1da177e4 830 return;
1da177e4 831
33cb5243 832 /*
f8c2ee22
HH
833 * X86_32
834 * Valid to do another page fault here, because if this fault
 835 * had been triggered by is_prefetch, fixup_exception would have
836 * handled it.
837 *
838 * X86_64
1da177e4
LT
839 * Hall of shame of CPU/BIOS bugs.
840 */
33cb5243
HH
841 if (is_prefetch(regs, address, error_code))
842 return;
1da177e4
LT
843
844 if (is_errata93(regs, address))
33cb5243 845 return;
1da177e4
LT
846
847/*
848 * Oops. The kernel tried to access some bad page. We'll have to
849 * terminate things with extreme prejudice.
850 */
5394f80f
JF
851 check_for_bios_corruption();
852
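	/*
	 * Illustrative note, not original to this file: check_for_bios_corruption()
	 * is implemented outside fault.c.  The idea, per the commit this page
	 * documents, is roughly that suspect low-memory ranges are reserved and
	 * zeroed at boot and re-scanned here, so an oops caused by BIOS-corrupted
	 * memory gets flagged in the log before the kernel dies.
	 */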
f8c2ee22
HH
853#ifdef CONFIG_X86_32
854 bust_spinlocks(1);
fd40d6e3
HH
855#else
856 flags = oops_begin();
857#endif
f8c2ee22
HH
858
859 show_fault_oops(regs, error_code, address);
1da177e4 860
f8c2ee22
HH
861 tsk->thread.cr2 = address;
862 tsk->thread.trap_no = 14;
863 tsk->thread.error_code = error_code;
fd40d6e3
HH
864
865#ifdef CONFIG_X86_32
f8c2ee22
HH
866 die("Oops", regs, error_code);
867 bust_spinlocks(0);
868 do_exit(SIGKILL);
fd40d6e3 869#else
22f5991c
JB
870 if (__die("Oops", regs, error_code))
871 regs = NULL;
1da177e4
LT
872 /* Executive summary in case the body of the oops scrolled away */
873 printk(KERN_EMERG "CR2: %016lx\n", address);
22f5991c 874 oops_end(flags, regs, SIGKILL);
f8c2ee22 875#endif
1da177e4
LT
876
877/*
878 * We ran out of memory, or some other thing happened to us that made
879 * us unable to handle the page fault gracefully.
880 */
881out_of_memory:
882 up_read(&mm->mmap_sem);
f8c2ee22
HH
883 if (is_global_init(tsk)) {
884 yield();
fd40d6e3 885#ifdef CONFIG_X86_32
f8c2ee22
HH
886 down_read(&mm->mmap_sem);
887 goto survive;
f8c2ee22 888#else
1da177e4 889 goto again;
f8c2ee22 890#endif
fd40d6e3
HH
891 }
892
1da177e4 893 printk("VM: killing process %s\n", tsk->comm);
318aa296 894 if (error_code & PF_USER)
021daae2 895 do_group_exit(SIGKILL);
1da177e4
LT
896 goto no_context;
897
898do_sigbus:
899 up_read(&mm->mmap_sem);
900
901 /* Kernel mode? Handle exceptions or die */
66c58156 902 if (!(error_code & PF_USER))
1da177e4 903 goto no_context;
f8c2ee22
HH
904#ifdef CONFIG_X86_32
905 /* User space => ok to do another page fault */
906 if (is_prefetch(regs, address, error_code))
907 return;
908#endif
1da177e4
LT
909 tsk->thread.cr2 = address;
910 tsk->thread.error_code = error_code;
911 tsk->thread.trap_no = 14;
c4aba4a8 912 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
1da177e4 913}
9e43e1b7 914
8c914cb7 915DEFINE_SPINLOCK(pgd_lock);
2bff7383 916LIST_HEAD(pgd_list);
8c914cb7
JB
917
918void vmalloc_sync_all(void)
919{
1156e098 920#ifdef CONFIG_X86_32
67350a5c 921 unsigned long start = VMALLOC_START & PGDIR_MASK;
1156e098
HH
922 unsigned long address;
923
924 if (SHARED_KERNEL_PMD)
925 return;
926
927 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
928 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
67350a5c
JF
929 unsigned long flags;
930 struct page *page;
931
932 spin_lock_irqsave(&pgd_lock, flags);
933 list_for_each_entry(page, &pgd_list, lru) {
934 if (!vmalloc_sync_one(page_address(page),
935 address))
936 break;
1156e098 937 }
67350a5c 938 spin_unlock_irqrestore(&pgd_lock, flags);
1156e098
HH
939 }
940#else /* CONFIG_X86_64 */
67350a5c 941 unsigned long start = VMALLOC_START & PGDIR_MASK;
8c914cb7
JB
942 unsigned long address;
943
944 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
67350a5c
JF
945 const pgd_t *pgd_ref = pgd_offset_k(address);
946 unsigned long flags;
947 struct page *page;
948
949 if (pgd_none(*pgd_ref))
950 continue;
951 spin_lock_irqsave(&pgd_lock, flags);
952 list_for_each_entry(page, &pgd_list, lru) {
953 pgd_t *pgd;
954 pgd = (pgd_t *)page_address(page) + pgd_index(address);
955 if (pgd_none(*pgd))
956 set_pgd(pgd, *pgd_ref);
957 else
958 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
8c914cb7 959 }
67350a5c 960 spin_unlock_irqrestore(&pgd_lock, flags);
8c914cb7 961 }
1156e098 962#endif
8c914cb7 963}
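
/*
 * Illustrative only -- not part of the original file.  A hypothetical
 * caller of the sync above: code that may later run where a vmalloc
 * fault cannot be taken (an NMI or die-notifier path, for instance) can
 * call vmalloc_sync_all() right after creating a mapping, so that every
 * process pgd already carries the new entry instead of relying on
 * vmalloc_fault() at access time.
 */
static void *alloc_buffer_mapped_everywhere(unsigned long size)
{
	void *buf = vmalloc(size);

	if (buf)
		vmalloc_sync_all();	/* propagate the mapping to all pgds now */
	return buf;
}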