[PATCH] Notify page fault call chain for x86_64
arch/x86_64/mm/fault.c
/*
 *  linux/arch/x86-64/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/kprobes.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm/kdebug.h>
#include <asm-generic/sections.h>

/* Page fault error code bits */
#define PF_PROT		(1<<0)	/* or no page found */
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)

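/*
 * Decoding sketch (illustrative only, excluded from the build): how the
 * hardware page fault error code breaks down into the PF_* bits defined
 * above.  The helper name is an assumption made for this example; nothing
 * in this file calls it.
 */
#if 0
static void example_decode_error_code(unsigned long error_code)
{
        printk("fault was a %s access, %s, in %s mode%s%s\n",
               (error_code & PF_WRITE) ? "write" : "read",
               (error_code & PF_PROT) ? "protection violation" : "page not present",
               (error_code & PF_USER) ? "user" : "kernel",
               (error_code & PF_RSVD) ? ", reserved bit set" : "",
               (error_code & PF_INSTR) ? ", instruction fetch" : "");
}
#endif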
#ifdef CONFIG_KPROBES
ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);

/* Hook to register for page fault notifications */
int register_page_fault_notifier(struct notifier_block *nb)
{
        vmalloc_sync_all();
        return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
}

int unregister_page_fault_notifier(struct notifier_block *nb)
{
        return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
}

static inline int notify_page_fault(enum die_val val, const char *str,
                        struct pt_regs *regs, long err, int trap, int sig)
{
        struct die_args args = {
                .regs = regs,
                .str = str,
                .err = err,
                .trapnr = trap,
                .signr = sig
        };
        return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
}
#else
static inline int notify_page_fault(enum die_val val, const char *str,
                        struct pt_regs *regs, long err, int trap, int sig)
{
        return NOTIFY_DONE;
}
#endif

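/*
 * Illustrative sketch (excluded from the build): a minimal consumer of the
 * notifier chain above, e.g. a kprobes-like client.  The handler and
 * notifier_block names are assumptions made for this example; only
 * register_page_fault_notifier()/unregister_page_fault_notifier() and the
 * DIE_PAGE_FAULT/struct die_args conventions come from this file and
 * <asm/kdebug.h>.
 */
#if 0
static int example_pf_handler(struct notifier_block *self, unsigned long val,
                              void *data)
{
        struct die_args *args = data;

        if (val != DIE_PAGE_FAULT)
                return NOTIFY_DONE;
        /* args->regs, args->err, args->trapnr and args->signr describe the fault */
        return NOTIFY_DONE;             /* NOTIFY_STOP would claim the fault */
}

static struct notifier_block example_pf_nb = {
        .notifier_call = example_pf_handler,
};

/* register_page_fault_notifier(&example_pf_nb) at init time,
   unregister_page_fault_notifier(&example_pf_nb) at exit time. */
#endif
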
void bust_spinlocks(int yes)
{
        int loglevel_save = console_loglevel;
        if (yes) {
                oops_in_progress = 1;
        } else {
#ifdef CONFIG_VT
                unblank_screen();
#endif
                oops_in_progress = 0;
                /*
                 * OK, the message is on the console.  Now we call printk()
                 * without oops_in_progress set so that printk will give klogd
                 * a poke.  Hold onto your hats...
                 */
                console_loglevel = 15;	/* NMI oopser may have shut the console up */
                printk(" ");
                console_loglevel = loglevel_save;
        }
}

/* Sometimes the CPU reports invalid exceptions on prefetch.
   Check for that here and ignore it.
   Opcode checker based on code by Richard Brunner. */
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
                                unsigned long error_code)
{
        unsigned char *instr;
        int scan_more = 1;
        int prefetch = 0;
        unsigned char *max_instr;

        /* If it was an exec fault, ignore it */
        if (error_code & PF_INSTR)
                return 0;

        instr = (unsigned char *)convert_rip_to_linear(current, regs);
        max_instr = instr + 15;

        if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
                return 0;

        while (scan_more && instr < max_instr) {
                unsigned char opcode;
                unsigned char instr_hi;
                unsigned char instr_lo;

                if (__get_user(opcode, instr))
                        break;

                instr_hi = opcode & 0xf0;
                instr_lo = opcode & 0x0f;
                instr++;

                switch (instr_hi) {
                case 0x20:
                case 0x30:
                        /* Values 0x26, 0x2E, 0x36 and 0x3E are valid x86
                           prefixes.  In long mode, the CPU will signal an
                           invalid opcode if some of these prefixes are
                           present, so we will never get here anyway. */
                        scan_more = ((instr_lo & 7) == 0x6);
                        break;

                case 0x40:
                        /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes.
                           We would need to figure out under which instruction mode
                           the instruction was issued ... */
                        /* Could check the LDT for lm, but for now it's good
                           enough to assume that long mode only uses well-known
                           segments or the kernel. */
                        scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
                        break;

                case 0x60:
                        /* 0x64 thru 0x67 are valid prefixes in all modes. */
                        scan_more = (instr_lo & 0xC) == 0x4;
                        break;
                case 0xF0:
                        /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
                        scan_more = !instr_lo || (instr_lo>>1) == 1;
                        break;
                case 0x00:
                        /* Prefetch instruction is 0x0F0D or 0x0F18 */
                        scan_more = 0;
                        if (__get_user(opcode, instr))
                                break;
                        prefetch = (instr_lo == 0xF) &&
                                (opcode == 0x0D || opcode == 0x18);
                        break;
                default:
                        scan_more = 0;
                        break;
                }
        }
        return prefetch;
}

static int bad_address(void *p)
{
        unsigned long dummy;
        return __get_user(dummy, (unsigned long *)p);
}

void dump_pagetable(unsigned long address)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        asm("movq %%cr3,%0" : "=r" (pgd));

        pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
        pgd += pgd_index(address);
        if (bad_address(pgd)) goto bad;
        printk("PGD %lx ", pgd_val(*pgd));
        if (!pgd_present(*pgd)) goto ret;

        pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
        if (bad_address(pud)) goto bad;
        printk("PUD %lx ", pud_val(*pud));
        if (!pud_present(*pud)) goto ret;

        pmd = pmd_offset(pud, address);
        if (bad_address(pmd)) goto bad;
        printk("PMD %lx ", pmd_val(*pmd));
        if (!pmd_present(*pmd)) goto ret;

        pte = pte_offset_kernel(pmd, address);
        if (bad_address(pte)) goto bad;
        printk("PTE %lx", pte_val(*pte));
ret:
        printk("\n");
        return;
bad:
        printk("BAD\n");
}

static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Workaround for K8 erratum #93 and buggy BIOSes.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64-bit RIP register on C-stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in the kernel here. */

static int is_errata93(struct pt_regs *regs, unsigned long address)
{
        static int warned;
        if (address != regs->rip)
                return 0;
        if ((address >> 32) != 0)
                return 0;
        address |= 0xffffffffUL << 32;
        if ((address >= (u64)_stext && address <= (u64)_etext) ||
            (address >= MODULES_VADDR && address <= MODULES_END)) {
                if (!warned) {
                        printk(errata93_warning);
                        warned = 1;
                }
                regs->rip = address;
                return 1;
        }
        return 0;
}

int unhandled_signal(struct task_struct *tsk, int sig)
{
        if (tsk->pid == 1)
                return 1;
        if (tsk->ptrace & PT_PTRACED)
                return 0;
        return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
                (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
}

static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
                                 unsigned long error_code)
{
        unsigned long flags = oops_begin();
        struct task_struct *tsk;

        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
               current->comm, address);
        dump_pagetable(address);
        tsk = current;
        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;
        __die("Bad pagetable", regs, error_code);
        oops_end(flags);
        do_exit(SIGKILL);
}

/*
 * Handle a fault on the vmalloc area.
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
        pgd_t *pgd, *pgd_ref;
        pud_t *pud, *pud_ref;
        pmd_t *pmd, *pmd_ref;
        pte_t *pte, *pte_ref;

        /* Copy kernel mappings over when needed.  This can also
           happen within a race in page table update.  In the latter
           case just flush. */

        pgd = pgd_offset(current->mm ?: &init_mm, address);
        pgd_ref = pgd_offset_k(address);
        if (pgd_none(*pgd_ref))
                return -1;
        if (pgd_none(*pgd))
                set_pgd(pgd, *pgd_ref);
        else
                BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));

        /* Below here mismatches are bugs because these lower tables
           are shared */

        pud = pud_offset(pgd, address);
        pud_ref = pud_offset(pgd_ref, address);
        if (pud_none(*pud_ref))
                return -1;
        if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
                BUG();
        pmd = pmd_offset(pud, address);
        pmd_ref = pmd_offset(pud_ref, address);
        if (pmd_none(*pmd_ref))
                return -1;
        if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
                BUG();
        pte_ref = pte_offset_kernel(pmd_ref, address);
        if (!pte_present(*pte_ref))
                return -1;
        pte = pte_offset_kernel(pmd, address);
        /* Don't use pte_page here, because the mappings can point
           outside mem_map, and the NUMA hash lookup cannot handle
           that. */
        if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
                BUG();
        return 0;
}

int page_fault_trace = 0;
int exception_trace = 1;

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
                                        unsigned long error_code)
{
        struct task_struct *tsk;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        unsigned long address;
        const struct exception_table_entry *fixup;
        int write;
        unsigned long flags;
        siginfo_t info;

        tsk = current;
        mm = tsk->mm;
        prefetchw(&mm->mmap_sem);

        /* get the address */
        __asm__("movq %%cr2,%0":"=r" (address));

        info.si_code = SEGV_MAPERR;

        /*
         * We fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
         * be in an interrupt or a critical region, and should
         * only copy the information from the master page table,
         * nothing more.
         *
         * This verifies that the fault happens in kernel space
         * (error_code & 4) == 0, and that the fault was not a
         * protection error (error_code & 9) == 0.
         */
        if (unlikely(address >= TASK_SIZE64)) {
                /*
                 * Don't check for the module range here: its PML4
                 * is always initialized because it's shared with the main
                 * kernel text.  Only vmalloc may need PML4 syncups.
                 */
                if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
                      ((address >= VMALLOC_START && address < VMALLOC_END))) {
                        if (vmalloc_fault(address) >= 0)
                                return;
                }
                if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
                                        SIGSEGV) == NOTIFY_STOP)
                        return;
                /*
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock.
                 */
                goto bad_area_nosemaphore;
        }

        if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
                                SIGSEGV) == NOTIFY_STOP)
                return;

        if (likely(regs->eflags & X86_EFLAGS_IF))
                local_irq_enable();

        if (unlikely(page_fault_trace))
                printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
                       regs->rip, regs->rsp, regs->cs, regs->ss, address, error_code);

        if (unlikely(error_code & PF_RSVD))
                pgtable_bad(address, regs, error_code);

        /*
         * If we're in an interrupt or have no user
         * context, we must not take the fault..
         */
        if (unlikely(in_atomic() || !mm))
                goto bad_area_nosemaphore;

 again:
        /* When running in the kernel we expect faults to occur only to
         * addresses in user space.  All other faults represent errors in the
         * kernel and should generate an OOPS.  Unfortunately, in the case of an
         * erroneous fault occurring in a code path which already holds mmap_sem
         * we will deadlock attempting to validate the fault against the
         * address space.  Luckily the kernel only validly references user
         * space from well defined areas of code, which are listed in the
         * exceptions table.
         *
         * As the vast majority of faults will be valid we will only perform
         * the source reference check when there is a possibility of a deadlock.
         * Attempt to lock the address space, if we cannot we then validate the
         * source.  If this is invalid we can skip the address space check,
         * thus avoiding the deadlock.
         */
        if (!down_read_trylock(&mm->mmap_sem)) {
                if ((error_code & PF_USER) == 0 &&
                    !search_exception_tables(regs->rip))
                        goto bad_area_nosemaphore;
                down_read(&mm->mmap_sem);
        }

        vma = find_vma(mm, address);
        if (!vma)
                goto bad_area;
        if (likely(vma->vm_start <= address))
                goto good_area;
        if (!(vma->vm_flags & VM_GROWSDOWN))
                goto bad_area;
        if (error_code & 4) {
                /* XXX: align red zone size with ABI */
                if (address + 128 < regs->rsp)
                        goto bad_area;
        }
        if (expand_stack(vma, address))
                goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
        info.si_code = SEGV_ACCERR;
        write = 0;
        switch (error_code & (PF_PROT|PF_WRITE)) {
        default:	/* 3: write, present */
                /* fall through */
        case PF_WRITE:		/* write, not present */
                if (!(vma->vm_flags & VM_WRITE))
                        goto bad_area;
                write++;
                break;
        case PF_PROT:		/* read, present */
                goto bad_area;
        case 0:			/* read, not present */
                if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
                        goto bad_area;
        }

        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.
         */
        switch (handle_mm_fault(mm, vma, address, write)) {
        case VM_FAULT_MINOR:
                tsk->min_flt++;
                break;
        case VM_FAULT_MAJOR:
                tsk->maj_flt++;
                break;
        case VM_FAULT_SIGBUS:
                goto do_sigbus;
        default:
                goto out_of_memory;
        }

        up_read(&mm->mmap_sem);
        return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
        up_read(&mm->mmap_sem);

bad_area_nosemaphore:
        /* User mode accesses just cause a SIGSEGV */
        if (error_code & PF_USER) {
                if (is_prefetch(regs, address, error_code))
                        return;

                /* Work around K8 erratum #100: a K8 in compat mode
                   occasionally jumps to illegal addresses >4GB.  We
                   catch this here in the page fault handler because
                   these addresses are not reachable.  Just detect this
                   case and return.  Any code segment in the LDT is
                   compatibility mode. */
                if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
                    (address >> 32))
                        return;

                if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
                        printk(
                       "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
                                        tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
                                        tsk->comm, tsk->pid, address, regs->rip,
                                        regs->rsp, error_code);
                }

                tsk->thread.cr2 = address;
                /* Kernel addresses are always protection faults */
                tsk->thread.error_code = error_code | (address >= TASK_SIZE);
                tsk->thread.trap_no = 14;
                info.si_signo = SIGSEGV;
                info.si_errno = 0;
                /* info.si_code has been set above */
                info.si_addr = (void __user *)address;
                force_sig_info(SIGSEGV, &info, tsk);
                return;
        }

no_context:

        /* Are we prepared to handle this kernel fault? */
        fixup = search_exception_tables(regs->rip);
        if (fixup) {
                regs->rip = fixup->fixup;
                return;
        }

        /*
         * Hall of shame of CPU/BIOS bugs.
         */

        if (is_prefetch(regs, address, error_code))
                return;

        if (is_errata93(regs, address))
                return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */

        flags = oops_begin();

        if (address < PAGE_SIZE)
                printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
        else
                printk(KERN_ALERT "Unable to handle kernel paging request");
        printk(" at %016lx RIP: \n" KERN_ALERT, address);
        printk_address(regs->rip);
        printk("\n");
        dump_pagetable(address);
        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;
        __die("Oops", regs, error_code);
        /* Executive summary in case the body of the oops scrolled away */
        printk(KERN_EMERG "CR2: %016lx\n", address);
        oops_end(flags);
        do_exit(SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
        up_read(&mm->mmap_sem);
        if (current->pid == 1) {
                yield();
                goto again;
        }
        printk("VM: killing process %s\n", tsk->comm);
        if (error_code & 4)
                do_exit(SIGKILL);
        goto no_context;

do_sigbus:
        up_read(&mm->mmap_sem);

        /* Kernel mode? Handle exceptions or die */
        if (!(error_code & PF_USER))
                goto no_context;

        tsk->thread.cr2 = address;
        tsk->thread.error_code = error_code;
        tsk->thread.trap_no = 14;
        info.si_signo = SIGBUS;
        info.si_errno = 0;
        info.si_code = BUS_ADRERR;
        info.si_addr = (void __user *)address;
        force_sig_info(SIGBUS, &info, tsk);
        return;
}

DEFINE_SPINLOCK(pgd_lock);
struct page *pgd_list;

void vmalloc_sync_all(void)
{
        /* Note that races in the updates of insync and start aren't
           problematic:
           insync can only get set bits added, and updates to start are only
           improving performance (without affecting correctness if undone). */
        static DECLARE_BITMAP(insync, PTRS_PER_PGD);
        static unsigned long start = VMALLOC_START & PGDIR_MASK;
        unsigned long address;

        for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
                if (!test_bit(pgd_index(address), insync)) {
                        const pgd_t *pgd_ref = pgd_offset_k(address);
                        struct page *page;

                        if (pgd_none(*pgd_ref))
                                continue;
                        spin_lock(&pgd_lock);
                        for (page = pgd_list; page;
                             page = (struct page *)page->index) {
                                pgd_t *pgd;
                                pgd = (pgd_t *)page_address(page) + pgd_index(address);
                                if (pgd_none(*pgd))
                                        set_pgd(pgd, *pgd_ref);
                                else
                                        BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
                        }
                        spin_unlock(&pgd_lock);
                        set_bit(pgd_index(address), insync);
                }
                if (address == start)
                        start = address + PGDIR_SIZE;
        }
        /* Check that there is no need to do the same for the modules area. */
        BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
        BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
                        (__START_KERNEL & PGDIR_MASK)));
}

static int __init enable_pagefaulttrace(char *str)
{
        page_fault_trace = 1;
        return 1;
}
__setup("pagefaulttrace", enable_pagefaulttrace);
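
/*
 * Usage note (a sketch, not part of this file): with the __setup() hook
 * above, booting with "pagefaulttrace" on the kernel command line sets
 * page_fault_trace, and do_page_fault() then printk()s one line per fault.
 * Hypothetical bootloader entry, shown only as an example:
 *
 *	kernel /vmlinuz root=/dev/sda1 pagefaulttrace
 */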