[deliverable/linux.git] / mm / oom_kill.c

/*
 *  linux/mm/oom_kill.c
 * 
 *  Copyright (C)  1998,2000  Rik van Riel
 *	Thanks go out to Claus Fischer for some serious inspiration and
 *	for goading me into coding this file...
 *
 *  The routines in this file are used to kill a process when
 *  we're seriously out of memory. This gets called from __alloc_pages()
 *  in mm/page_alloc.c when we really run out of memory.
 *
 *  Since we won't call these routines often (on a well-configured
 *  machine) this file will double as a 'coding guide' and a signpost
 *  for newbie kernel hackers. It features several pointers to major
 *  kernel subsystems and hints as to where to find out what things do.
 */

#include <linux/oom.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
#include <linux/security.h>

/*
 * Panic policy.  out_of_memory() panics unconditionally when this is 2 and,
 * for allocations with no cpuset/mempolicy constraint, whenever it is
 * non-zero; pagefault_out_of_memory() panics whenever it is non-zero (unless
 * a notifier freed memory or the memcg OOM killer already ran).
 */
int sysctl_panic_on_oom;
/* Non-zero: __out_of_memory() first tries to kill the allocating task. */
int sysctl_oom_kill_allocating_task;
/* Non-zero: oom_kill_process() also prints the task table via dump_tasks(). */
int sysctl_oom_dump_tasks;
/* Serializes testing, setting and clearing ZONE_OOM_LOCKED on zones. */
static DEFINE_SPINLOCK(zone_scan_lock);
/* Enable to make badness() printk the score it computes for each task. */
/* #define DEBUG */

/**
 * badness - calculate a numeric value for how bad this task has been
 * @p: task struct of which task we should calculate
 * @uptime: current uptime in seconds
 *
 * The formula used is relatively simple and documented inline in the
 * function. The main rationale is that we want to select a good task
 * to kill when we run out of memory.
 *
 * Good in this context means that:
 * 1) we lose the minimum amount of work done
 * 2) we recover a large amount of memory
 * 3) we don't kill anything innocent of eating tons of memory
 * 4) we want to kill the minimum amount of processes (one)
 * 5) we try to kill the process the user expects us to kill, this
 *    algorithm has been meticulously tuned to meet the principle
 *    of least surprise ... (be careful when you change it)
 *
 * Returns 0 for a task without an mm or whose mm has oom_adj set to
 * OOM_DISABLE, ULONG_MAX for a task with PF_SWAPOFF set, and the heuristic
 * score otherwise.  A positive oom_adj saturates the score at ULONG_MAX
 * rather than letting the left shift wrap around.
 */

unsigned long badness(struct task_struct *p, unsigned long uptime)
{
	unsigned long points, cpu_time, run_time;
	struct mm_struct *mm;
	struct task_struct *child;
	int oom_adj;

	task_lock(p);
	mm = p->mm;
	if (!mm) {
		task_unlock(p);
		return 0;
	}
	oom_adj = mm->oom_adj;
	if (oom_adj == OOM_DISABLE) {
		task_unlock(p);
		return 0;
	}

	/*
	 * The memory size of the process is the basis for the badness.
	 */
	points = mm->total_vm;

	/*
	 * After this unlock we can no longer dereference local variable `mm'
	 * (it is still compared by value against the children's mm below).
	 */
	task_unlock(p);

	/*
	 * swapoff can easily use up all memory, so kill those first.
	 */
	if (p->flags & PF_SWAPOFF)
		return ULONG_MAX;

	/*
	 * Processes which fork a lot of child processes are likely
	 * a good choice. We add half the vmsize of the children if they
	 * have an own mm. This prevents forking servers to flood the
	 * machine with an endless amount of children. In case a single
	 * child is eating the vast majority of memory, adding only half
	 * to the parents will make the child our kill candidate of choice.
	 */
	list_for_each_entry(child, &p->children, sibling) {
		task_lock(child);
		if (child->mm != mm && child->mm)
			points += child->mm->total_vm/2 + 1;
		task_unlock(child);
	}

	/*
	 * CPU time is in tens of seconds and run time is in thousands
	 * of seconds. There is no particular reason for this other than
	 * that it turned out to work very well in practice.
	 */
	cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))
		>> (SHIFT_HZ + 3);

	if (uptime >= p->start_time.tv_sec)
		run_time = (uptime - p->start_time.tv_sec) >> 10;
	else
		run_time = 0;

	if (cpu_time)
		points /= int_sqrt(cpu_time);
	if (run_time)
		points /= int_sqrt(int_sqrt(run_time));

	/*
	 * Niced processes are most likely less important, so double
	 * their badness points.
	 */
	if (task_nice(p) > 0)
		points *= 2;

	/*
	 * Superuser processes are usually more important, so we make it
	 * less likely that we kill those.
	 */
	if (has_capability_noaudit(p, CAP_SYS_ADMIN) ||
	    has_capability_noaudit(p, CAP_SYS_RESOURCE))
		points /= 4;

	/*
	 * We don't want to kill a process with direct hardware access.
	 * Not only could that mess up the hardware, but usually users
	 * tend to only have this flag set on applications they think
	 * of as important.
	 */
	if (has_capability_noaudit(p, CAP_SYS_RAWIO))
		points /= 4;

	/*
	 * If p's nodes don't overlap ours, it may still help to kill p
	 * because p may have allocated or otherwise mapped memory on
	 * this node before. However it will be less likely.
	 */
	if (!cpuset_mems_allowed_intersects(current, p))
		points /= 8;

	/*
	 * Adjust the score by oom_adj: a positive value scales it up by
	 * 2^oom_adj, a negative value scales it down by 2^-oom_adj.
	 */
	if (oom_adj > 0) {
		if (!points)
			points = 1;
		/*
		 * Saturate instead of wrapping: a large score shifted past
		 * the width of unsigned long would otherwise come out small
		 * and make a task that was explicitly marked as a preferred
		 * victim look like a poor candidate.
		 */
		if (points > (ULONG_MAX >> oom_adj))
			points = ULONG_MAX;
		else
			points <<= oom_adj;
	} else if (oom_adj < 0) {
		points >>= -(oom_adj);
	}

#ifdef DEBUG
	printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n",
	       p->pid, p->comm, points);
#endif
	return points;
}

/*
 * Classify what restricts the failing allocation.
 *
 * On NUMA builds: CONSTRAINT_CPUSET as soon as a zone of the zonelist is not
 * allowed by cpuset_zone_allowed_softwall(); CONSTRAINT_MEMORY_POLICY when
 * the walked zones leave some N_HIGH_MEMORY node uncovered; CONSTRAINT_NONE
 * otherwise.  Without CONFIG_NUMA the answer is always CONSTRAINT_NONE.
 */
static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
						    gfp_t gfp_mask)
{
#ifdef CONFIG_NUMA
	struct zoneref *zref;
	struct zone *zn;
	nodemask_t uncovered = node_states[N_HIGH_MEMORY];
	enum zone_type max_idx = gfp_zone(gfp_mask);

	for_each_zone_zonelist(zn, zref, zonelist, max_idx) {
		/* A zone the cpuset forbids settles the question at once. */
		if (!cpuset_zone_allowed_softwall(zn, gfp_mask))
			return CONSTRAINT_CPUSET;

		node_clear(zone_to_nid(zn), uncovered);
	}

	/* Nodes never visited mean the zonelist itself was restricted. */
	if (!nodes_empty(uncovered))
		return CONSTRAINT_MEMORY_POLICY;
#endif

	return CONSTRAINT_NONE;
}

/*
 * Simple selection loop. We chose the process with the highest
 * number of 'points'. We expect the caller will lock the tasklist.
 *
 * (not docbooked, we don't want this one cluttering up the manual)
 */
static struct task_struct *select_bad_process(unsigned long *ppoints,
						struct mem_cgroup *mem)
{
	struct task_struct *g, *p;
	struct task_struct *chosen = NULL;
	struct timespec uptime;
	*ppoints = 0;

	do_posix_clock_monotonic_gettime(&uptime);
	do_each_thread(g, p) {
		unsigned long points;

		/*
		 * skip kernel threads and tasks which have already released
		 * their mm.
		 */
		if (!p->mm)
			continue;
		/* skip the init task */
		if (is_global_init(p))
			continue;
		if (mem && !task_in_mem_cgroup(p, mem))
			continue;

		/*
		 * This task already has access to memory reserves and is
		 * being killed. Don't allow any other task access to the
		 * memory reserve.
		 *
		 * Note: this may have a chance of deadlock if it gets
		 * blocked waiting for another task which itself is waiting
		 * for memory. Is there a better alternative?
		 */
		if (test_tsk_thread_flag(p, TIF_MEMDIE))
			return ERR_PTR(-1UL);

		/*
		 * This is in the process of releasing memory so wait for it
		 * to finish before killing some other task by mistake.
		 *
		 * However, if p is the current task, we allow the 'kill' to
		 * go ahead if it is exiting: this will simply set TIF_MEMDIE,
		 * which will allow it to gain access to memory reserves in
		 * the process of exiting and releasing its resources.
		 * Otherwise we could get an easy OOM deadlock.
		 */
		if (p->flags & PF_EXITING) {
			if (p != current)
				return ERR_PTR(-1UL);

			chosen = p;
			*ppoints = ULONG_MAX;
		}

		points = badness(p, uptime.tv_sec);
		if (points > *ppoints) {
			chosen = p;
			*ppoints = points;
		}
	} while_each_thread(g, p);

	return chosen;
}

/**
 * dump_tasks - dump current memory state of all system tasks
 * @mem: target memory controller
 *
 * Prints one line per thread group leader that still has an mm, i.e. kernel
 * threads are left out.  Each line carries the task's pid, uid, tgid, vm
 * size, rss, cpu, oom_adj value, and name.
 *
 * If @mem is non-NULL, only tasks that are a member of that mem_cgroup are
 * shown.
 *
 * Call with tasklist_lock read-locked.
 */
static void dump_tasks(const struct mem_cgroup *mem)
{
	struct task_struct *leader, *tsk;

	printk(KERN_INFO "[ pid ]   uid  tgid total_vm      rss cpu oom_adj "
	       "name\n");
	do_each_thread(leader, tsk) {
		struct mm_struct *mm;

		/* Outside the requested cgroup, or not a group leader. */
		if ((mem && !task_in_mem_cgroup(tsk, mem)) ||
		    !thread_group_leader(tsk))
			continue;

		task_lock(tsk);
		mm = tsk->mm;
		/*
		 * total_vm and rss sizes do not exist for tasks with no mm,
		 * so there is nothing to report for them; they can't be oom
		 * killed anyway.
		 */
		if (mm) {
			printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d     %3d %s\n",
			       tsk->pid, __task_cred(tsk)->uid, tsk->tgid,
			       mm->total_vm, get_mm_rss(mm), (int)task_cpu(tsk),
			       mm->oom_adj, tsk->comm);
		}
		task_unlock(tsk);
	} while_each_thread(leader, tsk);
}

/*
 * Send SIGKILL to the selected process irrespective of its CAP_SYS_RAWIO
 * capability, though it's unlikely that we select a process with
 * CAP_SYS_RAWIO set.
 *
 * @p:       the task to kill
 * @verbose: when non-zero, log a "Killed process" line for @p
 *
 * Refuses (with a warning) to act on the global init task or on a task
 * that has no mm.  Otherwise @p is marked TIF_MEMDIE, given a time slice
 * of HZ, and sent SIGKILL.
 */
static void __oom_kill_task(struct task_struct *p, int verbose)
{
	/* Callers should never hand us init; complain loudly if they do. */
	if (is_global_init(p)) {
		WARN_ON(1);
		printk(KERN_WARNING "tried to kill init!\n");
		return;
	}

	/* Same for a task without an mm. */
	if (!p->mm) {
		WARN_ON(1);
		printk(KERN_WARNING "tried to kill an mm-less task!\n");
		return;
	}

	if (verbose)
		printk(KERN_ERR "Killed process %d (%s)\n",
				task_pid_nr(p), p->comm);

	/*
	 * We give our sacrificial lamb high priority and access to
	 * all the memory it needs. That way it should be able to
	 * exit() and clear out its resources quickly...
	 */
	p->rt.time_slice = HZ;
	set_tsk_thread_flag(p, TIF_MEMDIE);

	force_sig(SIGKILL, p);
}

/*
 * Kill @p and every task in another thread group that shares its mm.
 *
 * Returns 1 without killing anything when @p has no mm or its mm has
 * oom_adj set to OOM_DISABLE; returns 0 once the signals have been sent.
 */
static int oom_kill_task(struct task_struct *p)
{
	struct task_struct *leader, *tsk;
	struct mm_struct *mm;
	int killable;

	/* Sample the mm and its oom_adj under the task lock. */
	task_lock(p);
	mm = p->mm;
	killable = mm && mm->oom_adj != OOM_DISABLE;
	task_unlock(p);

	if (!killable)
		return 1;

	__oom_kill_task(p, 1);

	/*
	 * Tasks in a different thread group that share the ->mm get a
	 * plain SIGKILL as well.  They are not given access to the memory
	 * reserves, otherwise we might deplete all memory.
	 */
	do_each_thread(leader, tsk) {
		if (tsk->mm == mm && !same_thread_group(tsk, p))
			force_sig(SIGKILL, tsk);
	} while_each_thread(leader, tsk);

	return 0;
}

/*
 * Report the OOM condition and kill @p, preferring one of its children.
 *
 * @p:        the task selected for killing
 * @gfp_mask: allocation flags of the failing request (logged only)
 * @order:    order of the failing request (logged only)
 * @points:   badness score of @p (logged only)
 * @mem:      memory cgroup the OOM belongs to, or NULL; passed on to
 *            mem_cgroup_print_oom_info() and dump_tasks()
 * @message:  prefix for the "kill process" log line
 *
 * Returns 0 when a task was signalled (or @p was already exiting and only
 * got TIF_MEMDIE), and the non-zero result of oom_kill_task(@p) when
 * neither a child nor @p itself could be killed.
 */
static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
			    unsigned long points, struct mem_cgroup *mem,
			    const char *message)
{
	struct task_struct *c;

	if (printk_ratelimit()) {
		/* task_lock() keeps current->mm stable while we print it. */
		task_lock(current);
		printk(KERN_WARNING "%s invoked oom-killer: "
			"gfp_mask=0x%x, order=%d, oom_adj=%d\n",
			current->comm, gfp_mask, order,
			current->mm ? current->mm->oom_adj : OOM_DISABLE);
		cpuset_print_task_mems_allowed(current);
		task_unlock(current);
		dump_stack();
		mem_cgroup_print_oom_info(mem, current);
		show_mem();
		if (sysctl_oom_dump_tasks)
			dump_tasks(mem);
	}

	/*
	 * If the task is already exiting, don't alarm the sysadmin or kill
	 * its children or threads, just set TIF_MEMDIE so it can die quickly
	 */
	if (p->flags & PF_EXITING) {
		__oom_kill_task(p, 0);
		return 0;
	}

	/*
	 * points is unsigned long: print it with %lu so that very large
	 * scores (e.g. ULONG_MAX) do not show up as negative numbers.
	 */
	printk(KERN_ERR "%s: kill process %d (%s) score %lu or a child\n",
					message, task_pid_nr(p), p->comm, points);

	/* Try to kill a child first; children sharing p's mm don't count. */
	list_for_each_entry(c, &p->children, sibling) {
		if (c->mm == p->mm)
			continue;
		if (!oom_kill_task(c))
			return 0;
	}
	return oom_kill_task(p);
}

#ifdef CONFIG_CGROUP_MEM_RES_CTLR
/*
 * OOM handling for a memory cgroup: pick the worst task inside @mem and kill
 * it, retrying for as long as oom_kill_process() reports failure.  Falls
 * back to killing current when no candidate is found, and gives up when
 * select_bad_process() reports that a task is already dying.
 */
void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
{
	unsigned long points = 0;
	struct task_struct *victim;

	read_lock(&tasklist_lock);
	for (;;) {
		victim = select_bad_process(&points, mem);
		/* Somebody is already on the way out; let it finish. */
		if (PTR_ERR(victim) == -1UL)
			break;

		if (!victim)
			victim = current;

		if (!oom_kill_process(victim, gfp_mask, 0, points, mem,
				      "Memory cgroup out of memory"))
			break;
	}
	read_unlock(&tasklist_lock);
}
#endif

/*
 * Chain called by out_of_memory() and pagefault_out_of_memory() before any
 * task is killed.  Callbacks receive a pointer to an unsigned long counter;
 * if the counter is non-zero after the chain ran, no task is killed.
 */
static BLOCKING_NOTIFIER_HEAD(oom_notify_list);

/*
 * register_oom_notifier - add @nb to the OOM notifier chain
 *
 * Returns the result of blocking_notifier_chain_register().
 */
int register_oom_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_oom_notifier);

/*
 * unregister_oom_notifier - remove @nb from the OOM notifier chain
 *
 * Returns the result of blocking_notifier_chain_unregister().
 */
int unregister_oom_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_oom_notifier);

/*
 * Try to take the OOM killer lock on every zone of the zonelist.
 *
 * Returns 0, changing nothing, if any of those zones is already OOM-locked,
 * i.e. a parallel OOM kill covering one of them is in progress.  Otherwise
 * marks all of them ZONE_OOM_LOCKED and returns 1.
 */
int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
{
	enum zone_type max_idx = gfp_zone(gfp_mask);
	struct zone *zn;
	struct zoneref *zref;

	spin_lock(&zone_scan_lock);

	/* First pass: bail out if anyone else holds one of our zones. */
	for_each_zone_zonelist(zn, zref, zonelist, max_idx) {
		if (zone_is_oom_locked(zn)) {
			spin_unlock(&zone_scan_lock);
			return 0;
		}
	}

	/*
	 * Second pass: lock each zone while still under zone_scan_lock so
	 * a parallel invocation of try_set_zone_oom() doesn't succeed when
	 * it shouldn't.
	 */
	for_each_zone_zonelist(zn, zref, zonelist, max_idx)
		zone_set_flag(zn, ZONE_OOM_LOCKED);

	spin_unlock(&zone_scan_lock);
	return 1;
}

/*
 * Drop the ZONE_OOM_LOCKED flag from every zone of the zonelist, so that
 * later failed allocations whose zonelists contain these zones may call the
 * OOM killer again if necessary.
 */
void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
{
	enum zone_type max_idx = gfp_zone(gfp_mask);
	struct zone *zn;
	struct zoneref *zref;

	spin_lock(&zone_scan_lock);
	for_each_zone_zonelist(zn, zref, zonelist, max_idx)
		zone_clear_flag(zn, ZONE_OOM_LOCKED);
	spin_unlock(&zone_scan_lock);
}

/*
 * Select a victim system-wide and kill it.
 *
 * Must be called with tasklist_lock held for read.  The lock is dropped
 * only on the path that panics because nothing killable was found.
 */
static void __out_of_memory(gfp_t gfp_mask, int order)
{
	unsigned long points;
	struct task_struct *victim;

	/* Optionally go for the allocating task before scanning anyone. */
	if (sysctl_oom_kill_allocating_task &&
	    !oom_kill_process(current, gfp_mask, order, 0, NULL,
			      "Out of memory (oom_kill_allocating_task)"))
		return;

	/*
	 * Rambo mode: Shoot down a process and hope it solves whatever
	 * issues we may have.  Keep selecting for as long as the kill
	 * attempt reports failure.
	 */
	do {
		victim = select_bad_process(&points, NULL);

		/* A task is already dying; nothing more to do for now. */
		if (PTR_ERR(victim) == -1UL)
			return;

		/* Found nothing?!?! Either we hang forever, or we panic. */
		if (!victim) {
			read_unlock(&tasklist_lock);
			panic("Out of memory and no killable processes...\n");
		}
	} while (oom_kill_process(victim, gfp_mask, order, points, NULL,
				  "Out of memory"));
}

/*
 * Entry point for the pagefault handler, which knows it is out of memory
 * but not exactly how or why (so no gfp_mask or order is available).
 */
void pagefault_out_of_memory(void)
{
	unsigned long freed = 0;

	/* Give registered notifiers the chance to free something first. */
	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
	if (freed)
		/* Got some memory back in the last second. */
		return;

	/*
	 * If the memcg OOM killer has already been invoked for this task
	 * it is not worth going system-wide; just take the nap below.
	 */
	if (!mem_cgroup_oom_called(current)) {
		if (sysctl_panic_on_oom)
			panic("out of memory from page fault. panic_on_oom is selected.\n");

		read_lock(&tasklist_lock);
		__out_of_memory(0, 0); /* unknown gfp_mask and order */
		read_unlock(&tasklist_lock);
	}

	/*
	 * Give the victim a good chance of dying before we retry the
	 * allocation, unless we are the one marked TIF_MEMDIE.
	 */
	if (!test_thread_flag(TIF_MEMDIE))
		schedule_timeout_uninterruptible(1);
}

/**
 * out_of_memory - kill the "best" process when we run out of memory
 * @zonelist: zonelist pointer
 * @gfp_mask: memory allocation flags
 * @order: amount of memory being requested as a power of 2
 *
 * When memory runs out we can kill a random task (bad), let the system
 * crash (worse), or try to be smart about which process to kill.  We don't
 * have to be perfect here, we just have to be good.
 */
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
{
	enum oom_constraint constraint;
	unsigned long freed = 0;

	/* Give registered notifiers the chance to free something first. */
	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
	if (freed)
		/* Got some memory back in the last second. */
		return;

	if (sysctl_panic_on_oom == 2)
		panic("out of memory. Compulsory panic_on_oom is selected.\n");

	/*
	 * Check if there were limitations on the allocation (only relevant for
	 * NUMA) that may require different handling.
	 */
	constraint = constrained_alloc(zonelist, gfp_mask);
	read_lock(&tasklist_lock);

	if (constraint == CONSTRAINT_MEMORY_POLICY) {
		/* The mempolicy of the allocating task is at fault: kill it. */
		oom_kill_process(current, gfp_mask, order, 0, NULL,
				"No available memory (MPOL_BIND)");
	} else if (constraint == CONSTRAINT_NONE ||
		   constraint == CONSTRAINT_CPUSET) {
		/* Only an unconstrained OOM honours plain panic_on_oom. */
		if (constraint == CONSTRAINT_NONE && sysctl_panic_on_oom)
			panic("out of memory. panic_on_oom is selected\n");
		__out_of_memory(gfp_mask, order);
	}

	read_unlock(&tasklist_lock);

	/*
	 * Give "p" a good chance of killing itself before we
	 * retry to allocate memory unless "p" is current
	 */
	if (!test_thread_flag(TIF_MEMDIE))
		schedule_timeout_uninterruptible(1);
}
Commit	Line	Data
1da177e4 LT	1	/*
	2	* linux/mm/oom_kill.c
	3	*
	4	* Copyright (C) 1998,2000 Rik van Riel
	5	* Thanks go out to Claus Fischer for some serious inspiration and
	6	* for goading me into coding this file...
	7	*
	8	* The routines in this file are used to kill a process when
a49335cc PJ	9	* we're seriously out of memory. This gets called from __alloc_pages()
a49335cc PJ	10	* in mm/page_alloc.c when we really run out of memory.
1da177e4 LT	11	*
	12	* Since we won't call these routines often (on a well-configured
	13	* machine) this file will double as a 'coding guide' and a signpost
	14	* for newbie kernel hackers. It features several pointers to major
	15	* kernel subsystems and hints as to where to find out what things do.
	16	*/
	17
8ac773b4	18	#include <linux/oom.h>
1da177e4	19	#include <linux/mm.h>
4e950f6f	20	#include <linux/err.h>
1da177e4 LT	21	#include <linux/sched.h>
	22	#include <linux/swap.h>
	23	#include <linux/timex.h>
	24	#include <linux/jiffies.h>
ef08e3b4	25	#include <linux/cpuset.h>
8bc719d3 MS	26	#include <linux/module.h>
8bc719d3 MS	27	#include <linux/notifier.h>
c7ba5c9e	28	#include <linux/memcontrol.h>
5cd9c58f	29	#include <linux/security.h>
1da177e4	30
fadd8fbd	31	int sysctl_panic_on_oom;
fe071d7e	32	int sysctl_oom_kill_allocating_task;
fef1bdd6	33	int sysctl_oom_dump_tasks;
c7d4caeb	34	static DEFINE_SPINLOCK(zone_scan_lock);
1da177e4 LT	35	/* #define DEBUG */
	36
	37	/**
6937a25c	38	* badness - calculate a numeric value for how bad this task has been
1da177e4	39	* @p: task struct of which task we should calculate
a49335cc	40	* @uptime: current uptime in seconds
1da177e4 LT	41	*
	42	* The formula used is relatively simple and documented inline in the
	43	* function. The main rationale is that we want to select a good task
	44	* to kill when we run out of memory.
	45	*
	46	* Good in this context means that:
	47	* 1) we lose the minimum amount of work done
	48	* 2) we recover a large amount of memory
	49	* 3) we don't kill anything innocent of eating tons of memory
	50	* 4) we want to kill the minimum amount of processes (one)
	51	* 5) we try to kill the process the user expects us to kill, this
	52	* algorithm has been meticulously tuned to meet the principle
	53	* of least surprise ... (be careful when you change it)
	54	*/
	55
97d87c97	56	unsigned long badness(struct task_struct *p, unsigned long uptime)
1da177e4	57	{
a12888f7	58	unsigned long points, cpu_time, run_time;
97c2c9b8 AM	59	struct mm_struct *mm;
97c2c9b8 AM	60	struct task_struct *child;
2ff05b2b	61	int oom_adj;
1da177e4	62
97c2c9b8 AM	63	task_lock(p);
	64	mm = p->mm;
	65	if (!mm) {
	66	task_unlock(p);
1da177e4	67	return 0;
97c2c9b8	68	}
2ff05b2b	69	oom_adj = mm->oom_adj;
4d8b9135 DR	70	if (oom_adj == OOM_DISABLE) {
	71	task_unlock(p);
	72	return 0;
	73	}
1da177e4 LT	74
	75	/*
	76	* The memory size of the process is the basis for the badness.
	77	*/
97c2c9b8 AM	78	points = mm->total_vm;
	79
	80	/*
	81	* After this unlock we can no longer dereference local variable `mm'
	82	*/
	83	task_unlock(p);
1da177e4	84
7ba34859 HD	85	/*
	86	* swapoff can easily use up all memory, so kill those first.
	87	*/
	88	if (p->flags & PF_SWAPOFF)
	89	return ULONG_MAX;
	90
1da177e4 LT	91	/*
1da177e4 LT	92	* Processes which fork a lot of child processes are likely
9827b781	93	* a good choice. We add half the vmsize of the children if they
1da177e4	94	* have an own mm. This prevents forking servers to flood the
9827b781 KG	95	* machine with an endless amount of children. In case a single
	96	* child is eating the vast majority of memory, adding only half
	97	* to the parents will make the child our kill candidate of choice.
1da177e4	98	*/
97c2c9b8 AM	99	list_for_each_entry(child, &p->children, sibling) {
	100	task_lock(child);
	101	if (child->mm != mm && child->mm)
	102	points += child->mm->total_vm/2 + 1;
	103	task_unlock(child);
1da177e4 LT	104	}
	105
	106	/*
	107	* CPU time is in tens of seconds and run time is in thousands
	108	* of seconds. There is no particular reason for this other than
	109	* that it turned out to work very well in practice.
	110	*/
	111	cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))
	112	>> (SHIFT_HZ + 3);
	113
	114	if (uptime >= p->start_time.tv_sec)
	115	run_time = (uptime - p->start_time.tv_sec) >> 10;
	116	else
	117	run_time = 0;
	118
a12888f7 CG	119	if (cpu_time)
	120	points /= int_sqrt(cpu_time);
	121	if (run_time)
	122	points /= int_sqrt(int_sqrt(run_time));
1da177e4 LT	123
	124	/*
	125	* Niced processes are most likely less important, so double
	126	* their badness points.
	127	*/
	128	if (task_nice(p) > 0)
	129	points *= 2;
	130
	131	/*
	132	* Superuser processes are usually more important, so we make it
	133	* less likely that we kill those.
	134	*/
a2f2945a EP	135	if (has_capability_noaudit(p, CAP_SYS_ADMIN) \|\|
a2f2945a EP	136	has_capability_noaudit(p, CAP_SYS_RESOURCE))
1da177e4 LT	137	points /= 4;
	138
	139	/*
	140	* We don't want to kill a process with direct hardware access.
	141	* Not only could that mess up the hardware, but usually users
	142	* tend to only have this flag set on applications they think
	143	* of as important.
	144	*/
a2f2945a	145	if (has_capability_noaudit(p, CAP_SYS_RAWIO))
1da177e4 LT	146	points /= 4;
1da177e4 LT	147
7887a3da NP	148	/*
	149	* If p's nodes don't overlap ours, it may still help to kill p
	150	* because p may have allocated or otherwise mapped memory on
	151	* this node before. However it will be less likely.
	152	*/
bbe373f2	153	if (!cpuset_mems_allowed_intersects(current, p))
7887a3da NP	154	points /= 8;
7887a3da NP	155
1da177e4	156	/*
2ff05b2b	157	* Adjust the score by oom_adj.
1da177e4	158	*/
2ff05b2b DR	159	if (oom_adj) {
2ff05b2b DR	160	if (oom_adj > 0) {
9a82782f JP	161	if (!points)
9a82782f JP	162	points = 1;
2ff05b2b	163	points <<= oom_adj;
9a82782f	164	} else
2ff05b2b	165	points >>= -(oom_adj);
1da177e4 LT	166	}
	167
	168	#ifdef DEBUG
a5e58a61	169	printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n",
1da177e4 LT	170	p->pid, p->comm, points);
	171	#endif
	172	return points;
	173	}
	174
9b0f8b04 CL	175	/*
	176	* Determine the type of allocation constraint.
	177	*/
70e24bdf DR	178	static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
70e24bdf DR	179	gfp_t gfp_mask)
9b0f8b04 CL	180	{
9b0f8b04 CL	181	#ifdef CONFIG_NUMA
54a6eb5c	182	struct zone *zone;
dd1a239f	183	struct zoneref *z;
54a6eb5c	184	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
ee31af5d	185	nodemask_t nodes = node_states[N_HIGH_MEMORY];
9b0f8b04	186
54a6eb5c MG	187	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
	188	if (cpuset_zone_allowed_softwall(zone, gfp_mask))
	189	node_clear(zone_to_nid(zone), nodes);
9b0f8b04 CL	190	else
	191	return CONSTRAINT_CPUSET;
	192
	193	if (!nodes_empty(nodes))
	194	return CONSTRAINT_MEMORY_POLICY;
	195	#endif
	196
	197	return CONSTRAINT_NONE;
	198	}
	199
1da177e4 LT	200	/*
	201	* Simple selection loop. We chose the process with the highest
	202	* number of 'points'. We expect the caller will lock the tasklist.
	203	*
	204	* (not docbooked, we don't want this one cluttering up the manual)
	205	*/
c7ba5c9e PE	206	static struct task_struct select_bad_process(unsigned long ppoints,
c7ba5c9e PE	207	struct mem_cgroup *mem)
1da177e4	208	{
1da177e4 LT	209	struct task_struct g, p;
	210	struct task_struct *chosen = NULL;
	211	struct timespec uptime;
9827b781	212	*ppoints = 0;
1da177e4 LT	213
1da177e4 LT	214	do_posix_clock_monotonic_gettime(&uptime);
a49335cc PJ	215	do_each_thread(g, p) {
a49335cc PJ	216	unsigned long points;
a49335cc	217
28324d1d ON	218	/*
	219	* skip kernel threads and tasks which have already released
	220	* their mm.
	221	*/
5081dde3 NP	222	if (!p->mm)
5081dde3 NP	223	continue;
28324d1d	224	/* skip the init task */
b460cbc5	225	if (is_global_init(p))
a49335cc	226	continue;
4c4a2214 DR	227	if (mem && !task_in_mem_cgroup(p, mem))
4c4a2214 DR	228	continue;
ef08e3b4	229
b78483a4 NP	230	/*
	231	* This task already has access to memory reserves and is
	232	* being killed. Don't allow any other task access to the
	233	* memory reserve.
	234	*
	235	* Note: this may have a chance of deadlock if it gets
	236	* blocked waiting for another task which itself is waiting
	237	* for memory. Is there a better alternative?
	238	*/
	239	if (test_tsk_thread_flag(p, TIF_MEMDIE))
	240	return ERR_PTR(-1UL);
	241
a49335cc	242	/*
6937a25c	243	* This is in the process of releasing memory so wait for it
a49335cc	244	* to finish before killing some other task by mistake.
50ec3bbf NP	245	*
	246	* However, if p is the current task, we allow the 'kill' to
	247	* go ahead if it is exiting: this will simply set TIF_MEMDIE,
	248	* which will allow it to gain access to memory reserves in
	249	* the process of exiting and releasing its resources.
b78483a4	250	* Otherwise we could get an easy OOM deadlock.
a49335cc	251	*/
b78483a4 NP	252	if (p->flags & PF_EXITING) {
	253	if (p != current)
	254	return ERR_PTR(-1UL);
	255
972c4ea5 ON	256	chosen = p;
972c4ea5 ON	257	*ppoints = ULONG_MAX;
50ec3bbf	258	}
972c4ea5	259
97d87c97	260	points = badness(p, uptime.tv_sec);
4d8b9135	261	if (points > *ppoints) {
a49335cc	262	chosen = p;
9827b781	263	*ppoints = points;
1da177e4	264	}
a49335cc	265	} while_each_thread(g, p);
972c4ea5	266
1da177e4 LT	267	return chosen;
	268	}
	269
fef1bdd6	270	/**
1b578df0 RD	271	* dump_tasks - dump current memory state of all system tasks
	272	* @mem: target memory controller
	273	*
fef1bdd6 DR	274	* Dumps the current memory state of all system tasks, excluding kernel threads.
	275	* State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
	276	* score, and name.
	277	*
	278	* If the actual is non-NULL, only tasks that are a member of the mem_cgroup are
	279	* shown.
	280	*
	281	* Call with tasklist_lock read-locked.
	282	*/
	283	static void dump_tasks(const struct mem_cgroup *mem)
	284	{
	285	struct task_struct g, p;
	286
	287	printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj "
	288	"name\n");
	289	do_each_thread(g, p) {
6d2661ed DR	290	struct mm_struct *mm;
6d2661ed DR	291
fef1bdd6 DR	292	if (mem && !task_in_mem_cgroup(p, mem))
fef1bdd6 DR	293	continue;
b4416d2b DR	294	if (!thread_group_leader(p))
b4416d2b DR	295	continue;
fef1bdd6 DR	296
fef1bdd6 DR	297	task_lock(p);
6d2661ed DR	298	mm = p->mm;
	299	if (!mm) {
	300	/*
	301	* total_vm and rss sizes do not exist for tasks with no
	302	* mm so there's no need to report them; they can't be
	303	* oom killed anyway.
	304	*/
	305	task_unlock(p);
	306	continue;
	307	}
fef1bdd6	308	printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
6d2661ed	309	p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
2ff05b2b	310	get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm);
fef1bdd6 DR	311	task_unlock(p);
	312	} while_each_thread(g, p);
	313	}
	314
1b578df0	315	/*
5a291b98 RG	316	* Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
	317	* flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
	318	* set.
1da177e4	319	*/
f3af38d3	320	static void __oom_kill_task(struct task_struct *p, int verbose)
1da177e4	321	{
b460cbc5	322	if (is_global_init(p)) {
1da177e4 LT	323	WARN_ON(1);
	324	printk(KERN_WARNING "tried to kill init!\n");
	325	return;
	326	}
	327
01017a22	328	if (!p->mm) {
1da177e4 LT	329	WARN_ON(1);
1da177e4 LT	330	printk(KERN_WARNING "tried to kill an mm-less task!\n");
1da177e4 LT	331	return;
1da177e4 LT	332	}
50ec3bbf	333
f3af38d3	334	if (verbose)
ba25f9dc PE	335	printk(KERN_ERR "Killed process %d (%s)\n",
ba25f9dc PE	336	task_pid_nr(p), p->comm);
1da177e4 LT	337
	338	/*
	339	* We give our sacrificial lamb high priority and access to
	340	* all the memory it needs. That way it should be able to
	341	* exit() and clear out its resources quickly...
	342	*/
fa717060	343	p->rt.time_slice = HZ;
1da177e4 LT	344	set_tsk_thread_flag(p, TIF_MEMDIE);
	345
	346	force_sig(SIGKILL, p);
	347	}
	348
f3af38d3	349	static int oom_kill_task(struct task_struct *p)
1da177e4	350	{
01315922	351	struct mm_struct *mm;
36c8b586	352	struct task_struct g, q;
1da177e4	353
4d8b9135	354	task_lock(p);
01315922	355	mm = p->mm;
4d8b9135 DR	356	if (!mm \|\| mm->oom_adj == OOM_DISABLE) {
4d8b9135 DR	357	task_unlock(p);
01315922	358	return 1;
4d8b9135 DR	359	}
4d8b9135 DR	360	task_unlock(p);
f3af38d3	361	__oom_kill_task(p, 1);
c33e0fca	362
1da177e4 LT	363	/*
1da177e4 LT	364	* kill all processes that share the ->mm (i.e. all threads),
f2a2a710 NP	365	* but are in a different thread group. Don't let them have access
f2a2a710 NP	366	* to memory reserves though, otherwise we might deplete all memory.
1da177e4	367	*/
c33e0fca	368	do_each_thread(g, q) {
bac0abd6	369	if (q->mm == mm && !same_thread_group(q, p))
650a7c97	370	force_sig(SIGKILL, q);
c33e0fca	371	} while_each_thread(g, q);
1da177e4	372
01315922	373	return 0;
1da177e4 LT	374	}
1da177e4 LT	375
7213f506	376	static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
fef1bdd6 DR	377	unsigned long points, struct mem_cgroup *mem,
fef1bdd6 DR	378	const char *message)
1da177e4	379	{
1da177e4	380	struct task_struct *c;
1da177e4	381
7213f506	382	if (printk_ratelimit()) {
75aa1994	383	task_lock(current);
2ff05b2b DR	384	printk(KERN_WARNING "%s invoked oom-killer: "
	385	"gfp_mask=0x%x, order=%d, oom_adj=%d\n",
	386	current->comm, gfp_mask, order,
	387	current->mm ? current->mm->oom_adj : OOM_DISABLE);
75aa1994 DR	388	cpuset_print_task_mems_allowed(current);
75aa1994 DR	389	task_unlock(current);
7213f506	390	dump_stack();
e222432b	391	mem_cgroup_print_oom_info(mem, current);
7213f506	392	show_mem();
fef1bdd6 DR	393	if (sysctl_oom_dump_tasks)
fef1bdd6 DR	394	dump_tasks(mem);
7213f506 DR	395	}
7213f506 DR	396
50ec3bbf NP	397	/*
	398	* If the task is already exiting, don't alarm the sysadmin or kill
	399	* its children or threads, just set TIF_MEMDIE so it can die quickly
	400	*/
	401	if (p->flags & PF_EXITING) {
f3af38d3	402	__oom_kill_task(p, 0);
50ec3bbf NP	403	return 0;
	404	}
	405
f3af38d3	406	printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
ba25f9dc	407	message, task_pid_nr(p), p->comm, points);
f3af38d3	408
1da177e4	409	/* Try to kill a child first */
7b1915a9	410	list_for_each_entry(c, &p->children, sibling) {
1da177e4 LT	411	if (c->mm == p->mm)
1da177e4 LT	412	continue;
f3af38d3	413	if (!oom_kill_task(c))
01315922	414	return 0;
1da177e4	415	}
f3af38d3	416	return oom_kill_task(p);
1da177e4 LT	417	}
1da177e4 LT	418
00f0b825	419	#ifdef CONFIG_CGROUP_MEM_RES_CTLR
c7ba5c9e PE	420	void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
	421	{
	422	unsigned long points = 0;
	423	struct task_struct *p;
	424
e115f2d8	425	read_lock(&tasklist_lock);
c7ba5c9e PE	426	retry:
	427	p = select_bad_process(&points, mem);
	428	if (PTR_ERR(p) == -1UL)
	429	goto out;
	430
	431	if (!p)
	432	p = current;
	433
fef1bdd6	434	if (oom_kill_process(p, gfp_mask, 0, points, mem,
c7ba5c9e PE	435	"Memory cgroup out of memory"))
	436	goto retry;
	437	out:
e115f2d8	438	read_unlock(&tasklist_lock);
c7ba5c9e PE	439	}
	440	#endif
	441
8bc719d3 MS	442	static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
	443
	444	int register_oom_notifier(struct notifier_block *nb)
	445	{
	446	return blocking_notifier_chain_register(&oom_notify_list, nb);
	447	}
	448	EXPORT_SYMBOL_GPL(register_oom_notifier);
	449
	450	int unregister_oom_notifier(struct notifier_block *nb)
	451	{
	452	return blocking_notifier_chain_unregister(&oom_notify_list, nb);
	453	}
	454	EXPORT_SYMBOL_GPL(unregister_oom_notifier);
	455
098d7f12 DR	456	/*
	457	* Try to acquire the OOM killer lock for the zones in zonelist. Returns zero
	458	* if a parallel OOM killing is already taking place that includes a zone in
	459	* the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
	460	*/
dd1a239f	461	int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
098d7f12	462	{
dd1a239f MG	463	struct zoneref *z;
dd1a239f MG	464	struct zone *zone;
098d7f12 DR	465	int ret = 1;
098d7f12 DR	466
c7d4caeb	467	spin_lock(&zone_scan_lock);
dd1a239f MG	468	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
dd1a239f MG	469	if (zone_is_oom_locked(zone)) {
098d7f12 DR	470	ret = 0;
	471	goto out;
	472	}
dd1a239f MG	473	}
	474
	475	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
	476	/*
c7d4caeb	477	* Lock each zone in the zonelist under zone_scan_lock so a
dd1a239f MG	478	* parallel invocation of try_set_zone_oom() doesn't succeed
	479	* when it shouldn't.
	480	*/
	481	zone_set_flag(zone, ZONE_OOM_LOCKED);
	482	}
098d7f12	483
098d7f12	484	out:
c7d4caeb	485	spin_unlock(&zone_scan_lock);
098d7f12 DR	486	return ret;
	487	}
	488
	489	/*
	490	* Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
	491	* allocation attempts with zonelists containing them may now recall the OOM
	492	* killer, if necessary.
	493	*/
dd1a239f	494	void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
098d7f12	495	{
dd1a239f MG	496	struct zoneref *z;
dd1a239f MG	497	struct zone *zone;
098d7f12	498
c7d4caeb	499	spin_lock(&zone_scan_lock);
dd1a239f MG	500	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
	501	zone_clear_flag(zone, ZONE_OOM_LOCKED);
	502	}
c7d4caeb	503	spin_unlock(&zone_scan_lock);
098d7f12 DR	504	}
098d7f12 DR	505
1c0fe6e3 NP	506	/*
	507	* Must be called with tasklist_lock held for read.
	508	*/
	509	static void __out_of_memory(gfp_t gfp_mask, int order)
	510	{
184101bf DR	511	struct task_struct *p;
184101bf DR	512	unsigned long points;
1c0fe6e3	513
184101bf DR	514	if (sysctl_oom_kill_allocating_task)
	515	if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
	516	"Out of memory (oom_kill_allocating_task)"))
1c0fe6e3	517	return;
184101bf DR	518	retry:
	519	/*
	520	* Rambo mode: Shoot down a process and hope it solves whatever
	521	* issues we may have.
	522	*/
	523	p = select_bad_process(&points, NULL);
1c0fe6e3	524
184101bf DR	525	if (PTR_ERR(p) == -1UL)
184101bf DR	526	return;
1c0fe6e3	527
184101bf DR	528	/* Found nothing?!?! Either we hang forever, or we panic. */
	529	if (!p) {
	530	read_unlock(&tasklist_lock);
	531	panic("Out of memory and no killable processes...\n");
1c0fe6e3	532	}
184101bf DR	533
	534	if (oom_kill_process(p, gfp_mask, order, points, NULL,
	535	"Out of memory"))
	536	goto retry;
1c0fe6e3 NP	537	}
	538
	539	/*
	540	* pagefault handler calls into here because it is out of memory but
	541	* doesn't know exactly how or why.
	542	*/
	543	void pagefault_out_of_memory(void)
	544	{
	545	unsigned long freed = 0;
	546
	547	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
	548	if (freed > 0)
	549	/* Got some memory back in the last second. */
	550	return;
	551
a636b327 KH	552	/*
	553	* If this is from memcg, oom-killer is already invoked.
	554	* and not worth to go system-wide-oom.
	555	*/
	556	if (mem_cgroup_oom_called(current))
	557	goto rest_and_return;
	558
1c0fe6e3 NP	559	if (sysctl_panic_on_oom)
	560	panic("out of memory from page fault. panic_on_oom is selected.\n");
	561
	562	read_lock(&tasklist_lock);
	563	__out_of_memory(0, 0); /* unknown gfp_mask and order */
	564	read_unlock(&tasklist_lock);
	565
	566	/*
	567	* Give "p" a good chance of killing itself before we
	568	* retry to allocate memory.
	569	*/
a636b327	570	rest_and_return:
1c0fe6e3 NP	571	if (!test_thread_flag(TIF_MEMDIE))
	572	schedule_timeout_uninterruptible(1);
	573	}
	574
1da177e4	575	/**
6937a25c	576	* out_of_memory - kill the "best" process when we run out of memory
1b578df0 RD	577	* @zonelist: zonelist pointer
	578	* @gfp_mask: memory allocation flags
	579	* @order: amount of memory being requested as a power of 2
1da177e4 LT	580	*
	581	* If we run out of memory, we have the choice between either
	582	* killing a random task (bad), letting the system crash (worse)
	583	* OR try to be smart about which process to kill. Note that we
	584	* don't have to be perfect here, we just have to be good.
	585	*/
9b0f8b04	586	void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
1da177e4	587	{
8bc719d3	588	unsigned long freed = 0;
70e24bdf	589	enum oom_constraint constraint;
8bc719d3 MS	590
	591	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
	592	if (freed > 0)
	593	/* Got some memory back in the last second. */
	594	return;
1da177e4	595
2b744c01 YG	596	if (sysctl_panic_on_oom == 2)
	597	panic("out of memory. Compulsory panic_on_oom is selected.\n");
	598
9b0f8b04 CL	599	/*
	600	* Check if there were limitations on the allocation (only relevant for
	601	* NUMA) that may require different handling.
	602	*/
2b45ab33	603	constraint = constrained_alloc(zonelist, gfp_mask);
2b45ab33 DR	604	read_lock(&tasklist_lock);
	605
	606	switch (constraint) {
9b0f8b04	607	case CONSTRAINT_MEMORY_POLICY:
1c0fe6e3	608	oom_kill_process(current, gfp_mask, order, 0, NULL,
9b0f8b04 CL	609	"No available memory (MPOL_BIND)");
	610	break;
	611
9b0f8b04	612	case CONSTRAINT_NONE:
fadd8fbd KH	613	if (sysctl_panic_on_oom)
fadd8fbd KH	614	panic("out of memory. panic_on_oom is selected\n");
fe071d7e DR	615	/* Fall-through */
fe071d7e DR	616	case CONSTRAINT_CPUSET:
1c0fe6e3	617	__out_of_memory(gfp_mask, order);
9b0f8b04 CL	618	break;
9b0f8b04 CL	619	}
1da177e4	620
140ffcec	621	read_unlock(&tasklist_lock);
1da177e4 LT	622
	623	/*
	624	* Give "p" a good chance of killing itself before we
2f659f46	625	* retry to allocate memory unless "p" is current
1da177e4	626	*/
2f659f46	627	if (!test_thread_flag(TIF_MEMDIE))
140ffcec	628	schedule_timeout_uninterruptible(1);
1da177e4	629	}