[deliverable/linux.git] / arch / x86_64 / mm / fault.c

/*
 *  linux/arch/x86-64/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/kprobes.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm/kdebug.h>
#include <asm-generic/sections.h>
#include <asm/kdebug.h>

void bust_spinlocks(int yes)
{
	int loglevel_save = console_loglevel;
	if (yes) {
		oops_in_progress = 1;
	} else {
#ifdef CONFIG_VT
		unblank_screen();
#endif
		oops_in_progress = 0;
		/*
		 * OK, the message is on the console.  Now we call printk()
		 * without oops_in_progress set so that printk will give klogd
		 * a poke.  Hold onto your hats...
		 */
		console_loglevel = 15;		/* NMI oopser may have shut the console up */
		printk(" ");
		console_loglevel = loglevel_save;
	}
}

/*
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Decode the instruction at the faulting RIP and tell the caller whether
 * it is a prefetch, so that the fault can be ignored.
 * Opcode checker based on code by Richard Brunner.
 *
 * regs:       register state of the faulting context
 * addr:       faulting address (not used by the decoder)
 * error_code: page-fault error code
 *
 * Returns 1 if the instruction is 0x0F 0x0D or 0x0F 0x18 (optionally
 * preceded by prefix bytes), 0 otherwise.
 */
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
				unsigned long error_code)
{
	unsigned char *ip;
	unsigned char *ip_limit;
	int found = 0;
	int more = 1;

	/* An instruction-fetch fault is never caused by a prefetch */
	if (error_code & (1<<4))
		return 0;

	ip = (unsigned char *)convert_rip_to_linear(current, regs);
	/* Look at no more than 15 bytes starting at the instruction */
	ip_limit = ip + 15;

	if (user_mode(regs) && ip >= (unsigned char *)TASK_SIZE)
		return 0;

	while (more && ip < ip_limit) {
		unsigned char byte;
		unsigned char hi_nibble;
		unsigned char lo_nibble;

		/* Stop quietly if the instruction bytes cannot be read */
		if (__get_user(byte, ip))
			break;

		ip++;
		hi_nibble = byte & 0xf0;
		lo_nibble = byte & 0x0f;

		switch (hi_nibble) {
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			more = 0;
			if (__get_user(byte, ip))
				break;
			found = (lo_nibble == 0xF) &&
				(byte == 0x0D || byte == 0x18);
			break;

		case 0x20:
		case 0x30:
			/* Values 0x26,0x2E,0x36,0x3E are valid x86
			   prefixes.  In long mode, the CPU will signal
			   invalid opcode if some of these prefixes are
			   present so we will never get here anyway */
			more = ((lo_nibble & 7) == 0x6);
			break;

		case 0x40:
			/* In AMD64 long mode, 0x40 to 0x4F are valid REX
			   prefixes, so the instruction mode matters here.
			   Could check the LDT for lm, but for now it's good
			   enough to assume that long mode only uses well
			   known segments or kernel. */
			more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;

		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			more = (lo_nibble & 0xC) == 0x4;
			break;

		case 0xF0:
			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
			more = !lo_nibble || (lo_nibble>>1) == 1;
			break;

		default:
			more = 0;
			break;
		}
	}
	return found;
}

/*
 * Probe the unsigned long that p points at with __get_user().
 * The value read is discarded; only the status of __get_user() is
 * returned.  Callers treat a non-zero result as "address is bad".
 */
static int bad_address(void *p)
{
	unsigned long probe;

	return __get_user(probe, (unsigned long *)p);
}

/*
 * Print the page-table entries that translate 'address', walking down
 * from the table currently loaded in CR3: PGD, PUD, PMD and PTE.
 *
 * Each entry pointer is probed with bad_address() before it is
 * dereferenced.  If a probe fails the line is finished with "BAD".
 * The walk stops early (and the line is finished with a newline) at the
 * first level whose entry is not present.
 */
void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Start from the page table the CPU is using right now */
	asm("movq %%cr3,%0" : "=r" (pgd));

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	/*
	 * Probe before printing: this runs while reporting a fault, so the
	 * table itself may be unreadable and *pgd must not be touched
	 * until bad_address() has accepted the pointer.
	 */
	if (bad_address(pgd))
		goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd))
		goto ret;

	pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
	if (bad_address(pud))
		goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud))
		goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd))
		goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd))
		goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte))
		goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}

/*
 * Message printed by is_errata93() the first time it applies its
 * workaround.  Each line carries its own KERN_ERR prefix.
 */
static const char errata93_warning[] = 
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8. 
   A lot of BIOS that didn't get tested properly miss this. 
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here. */

static int is_errata93(struct pt_regs *regs, unsigned long address) 
{
	static int warned;
	if (address != regs->rip)
		return 0;
	if ((address >> 32) != 0) 
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) || 
	    (address >= MODULES_VADDR && address <= MODULES_END)) { 
		if (!warned) {
			printk(errata93_warning); 		
			warned = 1;
		}
		regs->rip = address;
		return 1;
	}
	return 0;
} 

/*
 * Decide whether signal 'sig' would go unhandled by task 'tsk'.
 *
 * Returns 1 for pid 1 unconditionally, 0 for a task that is being
 * ptraced, and otherwise 1 exactly when the task's handler for the
 * signal is SIG_IGN or SIG_DFL.
 */
int unhandled_signal(struct task_struct *tsk, int sig)
{
	if (tsk->pid == 1)
		return 1;

	if (tsk->ptrace & PT_PTRACED)
		return 0;

	/* action[] is indexed from zero, signal numbers start at one */
	if (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN)
		return 1;

	return tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL;
}

/*
 * Report a corrupted page table for 'address' and kill the current task.
 *
 * Between oops_begin() and oops_end() this prints an alert naming the
 * task, dumps the page-table entries for the address, records the fault
 * details (cr2, trap number 14, error code) in the task's thread
 * structure and calls __die().  It finishes with do_exit(SIGKILL).
 */
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	struct task_struct *tsk = current;
	unsigned long flags;

	flags = oops_begin();

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       tsk->comm, address);
	dump_pagetable(address);

	/* Record the fault in the thread structure before dying */
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	tsk->thread.cr2 = address;

	__die("Bad pagetable", regs, error_code);
	oops_end(flags);
	do_exit(SIGKILL);
}

/*
 * Handle a fault on the vmalloc area.
 *
 * Walks the page table of the current mm (init_mm when the task has
 * none) side by side with the kernel reference table.  A missing
 * top-level entry is copied over from the reference table; every lower
 * level must already agree with the reference, otherwise BUG() is hit.
 *
 * This assumes no large pages in there.
 *
 * Returns 0 when the reference table maps the address, -1 when it does
 * not.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd_cur, *pgd_master;
	pud_t *pud_cur, *pud_master;
	pmd_t *pmd_cur, *pmd_master;
	pte_t *pte_cur, *pte_master;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. */

	pgd_cur = pgd_offset(current->mm ?: &init_mm, address);
	pgd_master = pgd_offset_k(address);
	if (pgd_none(*pgd_master))
		return -1;
	if (pgd_none(*pgd_cur))
		set_pgd(pgd_cur, *pgd_master);

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud_cur = pud_offset(pgd_cur, address);
	pud_master = pud_offset(pgd_master, address);
	if (pud_none(*pud_master))
		return -1;
	if (pud_none(*pud_cur))
		BUG();
	if (pud_page(*pud_cur) != pud_page(*pud_master))
		BUG();

	pmd_cur = pmd_offset(pud_cur, address);
	pmd_master = pmd_offset(pud_master, address);
	if (pmd_none(*pmd_master))
		return -1;
	if (pmd_none(*pmd_cur))
		BUG();
	if (pmd_page(*pmd_cur) != pmd_page(*pmd_master))
		BUG();

	pte_master = pte_offset_kernel(pmd_master, address);
	if (!pte_present(*pte_master))
		return -1;
	pte_cur = pte_offset_kernel(pmd_cur, address);
	/* Compare page frame numbers rather than using pte_page here,
	   because the mappings can point outside mem_map, and the NUMA
	   hash lookup cannot handle that. */
	if (!pte_present(*pte_cur))
		BUG();
	if (pte_pfn(*pte_cur) != pte_pfn(*pte_master))
		BUG();

	return 0;
}

/* When non-zero, do_page_fault() printk()s one trace line per fault. */
int page_fault_trace = 0;
/*
 * When non-zero, do_page_fault() may log a "segfault at ..." line before
 * sending SIGSEGV for a bad user-mode access (only if unhandled_signal()
 * also says so).
 */
int exception_trace = 1;

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * regs:       register state saved for the faulting context
 * error_code: page-fault error code:
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 *
 * Outcomes, as implemented below:
 *  - a kernel-mode, not-present fault inside the vmalloc range is
 *    resolved through vmalloc_fault();
 *  - a fault covered by a VMA with suitable permissions is handed to
 *    handle_mm_fault();
 *  - a bad user-mode access gets SIGSEGV (SIGBUS when handle_mm_fault()
 *    reports VM_FAULT_SIGBUS), unless it is recognised as a spurious
 *    prefetch fault or K8 erratum #100;
 *  - a bad kernel-mode access is fixed up via the exception table or
 *    recognised as a prefetch/erratum #93 case; failing that the kernel
 *    oopses and the task is killed.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct * vma;
	unsigned long address;
	const struct exception_table_entry *fixup;
	int write;
	unsigned long flags;
	siginfo_t info;

	/* get the address */
	__asm__("movq %%cr2,%0":"=r" (address));
	/* Give registered die notifiers the chance to claim the fault */
	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
					SIGSEGV) == NOTIFY_STOP)
		return;

	/* Re-enable interrupts only if the faulting context had them on */
	if (likely(regs->eflags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(page_fault_trace))
		printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
		       regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); 

	tsk = current;
	mm = tsk->mm;
	/* Default si_code; switched to SEGV_ACCERR once a VMA is found */
	info.si_code = SEGV_MAPERR;


	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & 0xd) &&
		      ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) < 0)
				goto bad_area_nosemaphore;
			return;
		}
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	/* Reserved bit set in a paging entry: treat the table as corrupt */
	if (unlikely(error_code & (1 << 3)))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

 again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & 4) == 0 &&
		    !search_exception_tables(regs->rip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	/* Address lies below the VMA: only a grow-down area may be extended */
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & 4) {
		/*
		 * User-mode access: refuse to grow the area when the address
		 * is more than 128 bytes below the saved rsp.
		 * XXX: align red zone size with ABI
		 */
		if (address + 128 < regs->rsp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	/* Check the access type (error_code bits 0-1) against the VMA flags */
	switch (error_code & 3) {
		default:	/* 3: write, present */
			/* fall through */
		case 2:		/* write, not present */
			if (!(vma->vm_flags & VM_WRITE))
				goto bad_area;
			write++;
			break;
		case 1:		/* read, present */
			goto bad_area;
		case 0:		/* read, not present */
			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
				goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
	case VM_FAULT_MINOR:
		tsk->min_flt++;
		break;
	case VM_FAULT_MAJOR:
		tsk->maj_flt++;
		break;
	case VM_FAULT_SIGBUS:
		goto do_sigbus;
	default:
		goto out_of_memory;
	}

	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & 4) {
		/* Spurious fault reported for a prefetch: just resume */
		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100 K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable. Just detect this
		   case and return.  Any code segment in LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;

		if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
			printk(
		       "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
					tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
					tsk->comm, tsk->pid, address, regs->rip,
					regs->rsp, error_code);
		}
       
		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void __user *)address;
		force_sig_info(SIGSEGV, &info, tsk);
		return;
	}

no_context:
	
	/* Are we prepared to handle this kernel fault?  */
	fixup = search_exception_tables(regs->rip);
	if (fixup) {
		/* Resume at the fixup address listed for the faulting rip */
		regs->rip = fixup->fixup;
		return;
	}

	/* 
	 * Hall of shame of CPU/BIOS bugs.
	 */

 	if (is_prefetch(regs, address, error_code))
 		return;

	if (is_errata93(regs, address))
		return; 

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */

	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT,address);
	printk_address(regs->rip);
	printk("\n");
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	__die("Oops", regs, error_code);
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags);
	do_exit(SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	/* pid 1 is not killed here: yield and retry the fault instead */
	if (current->pid == 1) { 
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & 4)
		do_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & 4))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void __user *)address;
	force_sig_info(SIGBUS, &info, tsk);
	return;
}

/*
 * Handler for the "pagefaulttrace" boot option: switches on the
 * per-fault trace line by setting page_fault_trace.
 *
 * str is the text following the option name; it is not examined.
 *
 * Returns 1 to tell the boot-option parser that the option was
 * consumed.  A __setup() handler that returns 0 reports the option as
 * unhandled, which lets it be passed on to init as well.
 */
static int __init enable_pagefaulttrace(char *str)
{
	page_fault_trace = 1;
	return 1;
}
__setup("pagefaulttrace", enable_pagefaulttrace);
Commit	Line	Data
1da177e4 LT	1	/*
	2	* linux/arch/x86-64/mm/fault.c
	3	*
	4	* Copyright (C) 1995 Linus Torvalds
	5	* Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
	6	*/
	7
	8	#include <linux/config.h>
	9	#include <linux/signal.h>
	10	#include <linux/sched.h>
	11	#include <linux/kernel.h>
	12	#include <linux/errno.h>
	13	#include <linux/string.h>
	14	#include <linux/types.h>
	15	#include <linux/ptrace.h>
	16	#include <linux/mman.h>
	17	#include <linux/mm.h>
	18	#include <linux/smp.h>
	19	#include <linux/smp_lock.h>
	20	#include <linux/interrupt.h>
	21	#include <linux/init.h>
	22	#include <linux/tty.h>
	23	#include <linux/vt_kern.h> /* For unblank_screen() */
	24	#include <linux/compiler.h>
	25	#include <linux/module.h>
0f2fbdcb	26	#include <linux/kprobes.h>
1da177e4 LT	27
	28	#include <asm/system.h>
	29	#include <asm/uaccess.h>
	30	#include <asm/pgalloc.h>
	31	#include <asm/smp.h>
	32	#include <asm/tlbflush.h>
	33	#include <asm/proto.h>
	34	#include <asm/kdebug.h>
	35	#include <asm-generic/sections.h>
	36	#include <asm/kdebug.h>
	37
	38	void bust_spinlocks(int yes)
	39	{
	40	int loglevel_save = console_loglevel;
	41	if (yes) {
	42	oops_in_progress = 1;
	43	} else {
	44	#ifdef CONFIG_VT
	45	unblank_screen();
	46	#endif
	47	oops_in_progress = 0;
	48	/*
	49	* OK, the message is on the console. Now we call printk()
	50	* without oops_in_progress set so that printk will give klogd
	51	* a poke. Hold onto your hats...
	52	*/
	53	console_loglevel = 15; /* NMI oopser may have shut the console up */
	54	printk(" ");
	55	console_loglevel = loglevel_save;
	56	}
	57	}
	58
	59	/* Sometimes the CPU reports invalid exceptions on prefetch.
	60	Check that here and ignore.
	61	Opcode checker based on code by Richard Brunner */
	62	static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
	63	unsigned long error_code)
	64	{
f1290ec9	65	unsigned char *instr;
1da177e4 LT	66	int scan_more = 1;
1da177e4 LT	67	int prefetch = 0;
f1290ec9	68	unsigned char *max_instr;
1da177e4 LT	69
	70	/* If it was a exec fault ignore */
	71	if (error_code & (1<<4))
	72	return 0;
	73
f1290ec9 AK	74	instr = (unsigned char *)convert_rip_to_linear(current, regs);
f1290ec9 AK	75	max_instr = instr + 15;
1da177e4	76
76381fee	77	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
1da177e4 LT	78	return 0;
	79
	80	while (scan_more && instr < max_instr) {
	81	unsigned char opcode;
	82	unsigned char instr_hi;
	83	unsigned char instr_lo;
	84
	85	if (__get_user(opcode, instr))
	86	break;
	87
	88	instr_hi = opcode & 0xf0;
	89	instr_lo = opcode & 0x0f;
	90	instr++;
	91
	92	switch (instr_hi) {
	93	case 0x20:
	94	case 0x30:
	95	/* Values 0x26,0x2E,0x36,0x3E are valid x86
	96	prefixes. In long mode, the CPU will signal
	97	invalid opcode if some of these prefixes are
	98	present so we will never get here anyway */
	99	scan_more = ((instr_lo & 7) == 0x6);
	100	break;
	101
	102	case 0x40:
	103	/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
	104	Need to figure out under what instruction mode the
	105	instruction was issued ... */
	106	/* Could check the LDT for lm, but for now it's good
	107	enough to assume that long mode only uses well known
	108	segments or kernel. */
76381fee	109	scan_more = (!user_mode(regs)) \|\| (regs->cs == __USER_CS);
1da177e4 LT	110	break;
	111
	112	case 0x60:
	113	/* 0x64 thru 0x67 are valid prefixes in all modes. */
	114	scan_more = (instr_lo & 0xC) == 0x4;
	115	break;
	116	case 0xF0:
	117	/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
	118	scan_more = !instr_lo \|\| (instr_lo>>1) == 1;
	119	break;
	120	case 0x00:
	121	/* Prefetch instruction is 0x0F0D or 0x0F18 */
	122	scan_more = 0;
	123	if (__get_user(opcode, instr))
	124	break;
	125	prefetch = (instr_lo == 0xF) &&
	126	(opcode == 0x0D \|\| opcode == 0x18);
	127	break;
	128	default:
	129	scan_more = 0;
	130	break;
	131	}
	132	}
	133	return prefetch;
	134	}
	135
	136	static int bad_address(void *p)
	137	{
	138	unsigned long dummy;
	139	return __get_user(dummy, (unsigned long *)p);
	140	}
	141
	142	void dump_pagetable(unsigned long address)
	143	{
	144	pgd_t *pgd;
	145	pud_t *pud;
	146	pmd_t *pmd;
	147	pte_t *pte;
	148
	149	asm("movq %%cr3,%0" : "=r" (pgd));
	150
	151	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	152	pgd += pgd_index(address);
	153	printk("PGD %lx ", pgd_val(*pgd));
	154	if (bad_address(pgd)) goto bad;
	155	if (!pgd_present(*pgd)) goto ret;
	156
	157	pud = __pud_offset_k((pud_t )pgd_page(pgd), address);
	158	if (bad_address(pud)) goto bad;
	159	printk("PUD %lx ", pud_val(*pud));
	160	if (!pud_present(*pud)) goto ret;
	161
	162	pmd = pmd_offset(pud, address);
	163	if (bad_address(pmd)) goto bad;
	164	printk("PMD %lx ", pmd_val(*pmd));
	165	if (!pmd_present(*pmd)) goto ret;
	166
	167	pte = pte_offset_kernel(pmd, address);
	168	if (bad_address(pte)) goto bad;
	169	printk("PTE %lx", pte_val(*pte));
	170	ret:
	171	printk("\n");
	172	return;
	173	bad:
174	printk("BAD\n");
175	}
176
177	static const char errata93_warning[] =
178	KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
179	KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
180	KERN_ERR "******* Please consider a BIOS update.\n"
181	KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
182
183	/* Workaround for K8 erratum #93 & buggy BIOS.
184	BIOS SMM functions are required to use a specific workaround
185	to avoid corruption of the 64bit RIP register on C stepping K8.
186	A lot of BIOS that didn't get tested properly miss this.
187	The OS sees this as a page fault with the upper 32bits of RIP cleared.
188	Try to work around it here.
189	Note we only handle faults in kernel here. */
190
191	static int is_errata93(struct pt_regs *regs, unsigned long address)
192	{
193	static int warned;
194	if (address != regs->rip)
195	return 0;
196	if ((address >> 32) != 0)
197	return 0;
198	address \|= 0xffffffffUL << 32;
199	if ((address >= (u64)_stext && address <= (u64)_etext) \|\|
200	(address >= MODULES_VADDR && address <= MODULES_END)) {
201	if (!warned) {
202	printk(errata93_warning);
203	warned = 1;
204	}
205	regs->rip = address;
206	return 1;
207	}
208	return 0;
209	}
210
211	int unhandled_signal(struct task_struct *tsk, int sig)
212	{
213	if (tsk->pid == 1)
214	return 1;
5e5ec104	215	if (tsk->ptrace & PT_PTRACED)
1da177e4 LT	216	return 0;
	217	return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) \|\|
	218	(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
	219	}
	220
	221	static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
	222	unsigned long error_code)
	223	{
1209140c	224	unsigned long flags = oops_begin();
6e3f3617	225	struct task_struct *tsk;
1209140c	226
1da177e4 LT	227	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	228	current->comm, address);
	229	dump_pagetable(address);
6e3f3617 JB	230	tsk = current;
	231	tsk->thread.cr2 = address;
	232	tsk->thread.trap_no = 14;
	233	tsk->thread.error_code = error_code;
1da177e4	234	__die("Bad pagetable", regs, error_code);
1209140c	235	oops_end(flags);
1da177e4 LT	236	do_exit(SIGKILL);
	237	}
	238
	239	/*
f95190b2	240	* Handle a fault on the vmalloc area
3b9ba4d5 AK	241	*
3b9ba4d5 AK	242	* This assumes no large pages in there.
1da177e4 LT	243	*/
	244	static int vmalloc_fault(unsigned long address)
	245	{
	246	pgd_t pgd, pgd_ref;
	247	pud_t pud, pud_ref;
	248	pmd_t pmd, pmd_ref;
	249	pte_t pte, pte_ref;
	250
	251	/* Copy kernel mappings over when needed. This can also
	252	happen within a race in page table update. In the later
	253	case just flush. */
	254
	255	pgd = pgd_offset(current->mm ?: &init_mm, address);
	256	pgd_ref = pgd_offset_k(address);
	257	if (pgd_none(*pgd_ref))
	258	return -1;
	259	if (pgd_none(*pgd))
	260	set_pgd(pgd, *pgd_ref);
	261
	262	/* Below here mismatches are bugs because these lower tables
	263	are shared */
	264
	265	pud = pud_offset(pgd, address);
	266	pud_ref = pud_offset(pgd_ref, address);
	267	if (pud_none(*pud_ref))
	268	return -1;
	269	if (pud_none(pud) \|\| pud_page(pud) != pud_page(*pud_ref))
	270	BUG();
	271	pmd = pmd_offset(pud, address);
	272	pmd_ref = pmd_offset(pud_ref, address);
	273	if (pmd_none(*pmd_ref))
	274	return -1;
	275	if (pmd_none(pmd) \|\| pmd_page(pmd) != pmd_page(*pmd_ref))
	276	BUG();
	277	pte_ref = pte_offset_kernel(pmd_ref, address);
	278	if (!pte_present(*pte_ref))
	279	return -1;
	280	pte = pte_offset_kernel(pmd, address);
3b9ba4d5 AK	281	/* Don't use pte_page here, because the mappings can point
	282	outside mem_map, and the NUMA hash lookup cannot handle
	283	that. */
	284	if (!pte_present(pte) \|\| pte_pfn(pte) != pte_pfn(*pte_ref))
1da177e4	285	BUG();
1da177e4 LT	286	return 0;
	287	}
	288
	289	int page_fault_trace = 0;
	290	int exception_trace = 1;
	291
	292	/*
	293	* This routine handles page faults. It determines the address,
	294	* and the problem, and then passes it off to one of the appropriate
	295	* routines.
	296	*
	297	* error_code:
	298	* bit 0 == 0 means no page found, 1 means protection fault
	299	* bit 1 == 0 means read, 1 means write
	300	* bit 2 == 0 means kernel, 1 means user-mode
8b1bde93 JB	301	* bit 3 == 1 means use of reserved bit detected
8b1bde93 JB	302	* bit 4 == 1 means fault was an instruction fetch
1da177e4	303	*/
0f2fbdcb PP	304	asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
0f2fbdcb PP	305	unsigned long error_code)
1da177e4 LT	306	{
	307	struct task_struct *tsk;
	308	struct mm_struct *mm;
	309	struct vm_area_struct * vma;
	310	unsigned long address;
	311	const struct exception_table_entry *fixup;
	312	int write;
1209140c	313	unsigned long flags;
1da177e4 LT	314	siginfo_t info;
1da177e4 LT	315
1da177e4 LT	316	/* get the address */
	317	__asm__("movq %%cr2,%0":"=r" (address));
	318	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
	319	SIGSEGV) == NOTIFY_STOP)
	320	return;
	321
	322	if (likely(regs->eflags & X86_EFLAGS_IF))
	323	local_irq_enable();
	324
	325	if (unlikely(page_fault_trace))
	326	printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
	327	regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
	328
	329	tsk = current;
	330	mm = tsk->mm;
	331	info.si_code = SEGV_MAPERR;
	332
	333
	334	/*
	335	* We fault-in kernel-space virtual memory on-demand. The
	336	* 'reference' page table is init_mm.pgd.
	337	*
	338	* NOTE! We MUST NOT take any locks for this case. We may
	339	* be in an interrupt or a critical region, and should
	340	* only copy the information from the master page table,
	341	* nothing more.
	342	*
	343	* This verifies that the fault happens in kernel space
	344	* (error_code & 4) == 0, and that the fault was not a
8b1bde93	345	* protection error (error_code & 9) == 0.
1da177e4	346	*/
84929801	347	if (unlikely(address >= TASK_SIZE64)) {
f95190b2 AK	348	/*
	349	* Don't check for the module range here: its PML4
	350	* is always initialized because it's shared with the main
	351	* kernel text. Only vmalloc may need PML4 syncups.
	352	*/
8b1bde93	353	if (!(error_code & 0xd) &&
f95190b2	354	((address >= VMALLOC_START && address < VMALLOC_END))) {
1da177e4 LT	355	if (vmalloc_fault(address) < 0)
	356	goto bad_area_nosemaphore;
	357	return;
	358	}
	359	/*
	360	* Don't take the mm semaphore here. If we fixup a prefetch
	361	* fault we could otherwise deadlock.
	362	*/
	363	goto bad_area_nosemaphore;
	364	}
	365
	366	if (unlikely(error_code & (1 << 3)))
	367	pgtable_bad(address, regs, error_code);
	368
	369	/*
	370	* If we're in an interrupt or have no user
	371	* context, we must not take the fault..
	372	*/
	373	if (unlikely(in_atomic() \|\| !mm))
	374	goto bad_area_nosemaphore;
	375
	376	again:
	377	/* When running in the kernel we expect faults to occur only to
	378	* addresses in user space. All other faults represent errors in the
	379	* kernel and should generate an OOPS. Unfortunatly, in the case of an
	380	* erroneous fault occuring in a code path which already holds mmap_sem
	381	* we will deadlock attempting to validate the fault against the
	382	* address space. Luckily the kernel only validly references user
	383	* space from well defined areas of code, which are listed in the
	384	* exceptions table.
	385	*
	386	* As the vast majority of faults will be valid we will only perform
	387	* the source reference check when there is a possibilty of a deadlock.
	388	* Attempt to lock the address space, if we cannot we then validate the
	389	* source. If this is invalid we can skip the address space check,
	390	* thus avoiding the deadlock.
	391	*/
	392	if (!down_read_trylock(&mm->mmap_sem)) {
	393	if ((error_code & 4) == 0 &&
	394	!search_exception_tables(regs->rip))
	395	goto bad_area_nosemaphore;
	396	down_read(&mm->mmap_sem);
	397	}
	398
	399	vma = find_vma(mm, address);
	400	if (!vma)
	401	goto bad_area;
	402	if (likely(vma->vm_start <= address))
	403	goto good_area;
	404	if (!(vma->vm_flags & VM_GROWSDOWN))
	405	goto bad_area;
	406	if (error_code & 4) {
	407	// XXX: align red zone size with ABI
	408	if (address + 128 < regs->rsp)
	409	goto bad_area;
	410	}
	411	if (expand_stack(vma, address))
	412	goto bad_area;
	413	/*
	414	* Ok, we have a good vm_area for this memory access, so
	415	* we can handle it..
	416	*/
	417	good_area:
	418	info.si_code = SEGV_ACCERR;
419	write = 0;
420	switch (error_code & 3) {
421	default: /* 3: write, present */
422	/* fall through */
423	case 2: /* write, not present */
424	if (!(vma->vm_flags & VM_WRITE))
425	goto bad_area;
426	write++;
427	break;
428	case 1: /* read, present */
429	goto bad_area;
430	case 0: /* read, not present */
431	if (!(vma->vm_flags & (VM_READ \| VM_EXEC)))
432	goto bad_area;
433	}
434
435	/*
436	* If for any reason at all we couldn't handle the fault,
437	* make sure we exit gracefully rather than endlessly redo
438	* the fault.
439	*/
440	switch (handle_mm_fault(mm, vma, address, write)) {
96800216	441	case VM_FAULT_MINOR:
1da177e4 LT	442	tsk->min_flt++;
1da177e4 LT	443	break;
96800216	444	case VM_FAULT_MAJOR:
1da177e4 LT	445	tsk->maj_flt++;
1da177e4 LT	446	break;
96800216	447	case VM_FAULT_SIGBUS:
1da177e4 LT	448	goto do_sigbus;
	449	default:
	450	goto out_of_memory;
	451	}
	452
	453	up_read(&mm->mmap_sem);
	454	return;
	455
	456	/*
	457	* Something tried to access memory that isn't in our memory map..
	458	* Fix it, but check if it's kernel or user first..
	459	*/
	460	bad_area:
	461	up_read(&mm->mmap_sem);
	462
	463	bad_area_nosemaphore:
1da177e4 LT	464	/* User mode accesses just cause a SIGSEGV */
	465	if (error_code & 4) {
	466	if (is_prefetch(regs, address, error_code))
	467	return;
	468
	469	/* Work around K8 erratum #100 K8 in compat mode
	470	occasionally jumps to illegal addresses >4GB. We
	471	catch this here in the page fault handler because
	472	these addresses are not reachable. Just detect this
	473	case and return. Any code segment in LDT is
	474	compatibility mode. */
	475	if ((regs->cs == __USER32_CS \|\| (regs->cs & (1<<2))) &&
	476	(address >> 32))
	477	return;
	478
	479	if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
	480	printk(
	481	"%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
	482	tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
	483	tsk->comm, tsk->pid, address, regs->rip,
	484	regs->rsp, error_code);
	485	}
	486
	487	tsk->thread.cr2 = address;
	488	/* Kernel addresses are always protection faults */
	489	tsk->thread.error_code = error_code \| (address >= TASK_SIZE);
	490	tsk->thread.trap_no = 14;
	491	info.si_signo = SIGSEGV;
	492	info.si_errno = 0;
	493	/* info.si_code has been set above */
	494	info.si_addr = (void __user *)address;
	495	force_sig_info(SIGSEGV, &info, tsk);
	496	return;
	497	}
	498
	499	no_context:
	500
	501	/* Are we prepared to handle this kernel fault? */
	502	fixup = search_exception_tables(regs->rip);
	503	if (fixup) {
	504	regs->rip = fixup->fixup;
	505	return;
	506	}
	507
	508	/*
	509	* Hall of shame of CPU/BIOS bugs.
	510	*/
	511
	512	if (is_prefetch(regs, address, error_code))
	513	return;
	514
	515	if (is_errata93(regs, address))
	516	return;
	517
	518	/*
	519	* Oops. The kernel tried to access some bad page. We'll have to
	520	* terminate things with extreme prejudice.
	521	*/
	522
1209140c	523	flags = oops_begin();
1da177e4 LT	524
	525	if (address < PAGE_SIZE)
	526	printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	527	else
	528	printk(KERN_ALERT "Unable to handle kernel paging request");
	529	printk(" at %016lx RIP: \n" KERN_ALERT,address);
	530	printk_address(regs->rip);
	531	printk("\n");
	532	dump_pagetable(address);
6e3f3617 JB	533	tsk->thread.cr2 = address;
	534	tsk->thread.trap_no = 14;
	535	tsk->thread.error_code = error_code;
1da177e4 LT	536	__die("Oops", regs, error_code);
	537	/* Executive summary in case the body of the oops scrolled away */
	538	printk(KERN_EMERG "CR2: %016lx\n", address);
1209140c	539	oops_end(flags);
1da177e4 LT	540	do_exit(SIGKILL);
	541
	542	/*
	543	* We ran out of memory, or some other thing happened to us that made
	544	* us unable to handle the page fault gracefully.
	545	*/
	546	out_of_memory:
	547	up_read(&mm->mmap_sem);
1da177e4 LT	548	if (current->pid == 1) {
	549	yield();
	550	goto again;
	551	}
	552	printk("VM: killing process %s\n", tsk->comm);
	553	if (error_code & 4)
	554	do_exit(SIGKILL);
	555	goto no_context;
	556
	557	do_sigbus:
	558	up_read(&mm->mmap_sem);
	559
	560	/* Kernel mode? Handle exceptions or die */
	561	if (!(error_code & 4))
	562	goto no_context;
	563
	564	tsk->thread.cr2 = address;
	565	tsk->thread.error_code = error_code;
	566	tsk->thread.trap_no = 14;
	567	info.si_signo = SIGBUS;
	568	info.si_errno = 0;
	569	info.si_code = BUS_ADRERR;
	570	info.si_addr = (void __user *)address;
	571	force_sig_info(SIGBUS, &info, tsk);
	572	return;
	573	}
9e43e1b7 AK	574
	575	static int __init enable_pagefaulttrace(char *str)
	576	{
	577	page_fault_trace = 1;
	578	return 0;
	579	}
	580	__setup("pagefaulttrace", enable_pagefaulttrace);