mm/mempolicy.c
1 /*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
7 *
8 * NUMA policy allows the user to give hints about which node(s) memory
9 * should be allocated on.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
19 * offset into the mapping for anonymous memory. For process policy a per-process counter
20 * is used.
21 *
22 * bind Only allocate memory on a specific set of nodes,
23 * no fallback.
24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict
26 * the allocation to the given memory nodes instead.
27 *
28 * preferred Try a specific node first before normal fallback.
29 * As a special case node -1 here means do the allocation
30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non-default
32 * process policy.
33 *
34 * default Allocate on the local node first, or when on a VMA
35 * use the process policy. This is what Linux always did
36 * in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non-interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
49 * on systems with highmem, kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has the memory mapped.
54 */
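/*
 * Example (user space): the policies above are normally set through the
 * mbind() and set_mempolicy() system calls.  A rough sketch, assuming the
 * libnuma <numaif.h> wrappers are available, nodes 0 and 1 exist, and
 * buf/len describe an existing mapping:
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8 + 1);
 *	mbind(buf, len, MPOL_BIND, &mask, sizeof(mask) * 8 + 1,
 *	      MPOL_MF_STRICT);
 *
 * The first call interleaves all future allocations of the calling task
 * over nodes 0 and 1; the second restricts the pages backing buf to those
 * nodes and, because of MPOL_MF_STRICT, fails with EIO if existing pages
 * already violate the mask (see check_range() below).
 */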
55
56 /* Notebook:
57 fix mmap readahead to honour policy and enable policy for any page cache
58 object
59 statistics for bigpages
60 global policy for page cache? currently it uses process policy. Requires
61 first item above.
62 handle mremap for shared memory (currently ignored for the policy)
63 grows down?
64 make bind policy root only? It can trigger the OOM killer much faster and
65 the kernel does not always handle that gracefully.
66 could replace all the switch()es with a mempolicy_ops structure.
67 */
68
69 #include <linux/mempolicy.h>
70 #include <linux/mm.h>
71 #include <linux/highmem.h>
72 #include <linux/hugetlb.h>
73 #include <linux/kernel.h>
74 #include <linux/sched.h>
75 #include <linux/mm.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/gfp.h>
79 #include <linux/slab.h>
80 #include <linux/string.h>
81 #include <linux/module.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
85 #include <linux/mempolicy.h>
86 #include <asm/tlbflush.h>
87 #include <asm/uaccess.h>
88
89 static kmem_cache_t *policy_cache;
90 static kmem_cache_t *sn_cache;
91
92 #define PDprintk(fmt...)
93
94 /* Highest zone. A specific allocation for a zone below that is not
95 policied. */
96 static int policy_zone;
97
98 struct mempolicy default_policy = {
99 .refcnt = ATOMIC_INIT(1), /* never free it */
100 .policy = MPOL_DEFAULT,
101 };
102
103 /* Do sanity checking on a policy */
104 static int mpol_check_policy(int mode, nodemask_t *nodes)
105 {
106 int empty = nodes_empty(*nodes);
107
108 switch (mode) {
109 case MPOL_DEFAULT:
110 if (!empty)
111 return -EINVAL;
112 break;
113 case MPOL_BIND:
114 case MPOL_INTERLEAVE:
115 /* Preferred will only use the first bit, but allow
116 more for now. */
117 if (empty)
118 return -EINVAL;
119 break;
120 }
121 return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
122 }
123 /* Generate a custom zonelist for the BIND policy. */
124 static struct zonelist *bind_zonelist(nodemask_t *nodes)
125 {
126 struct zonelist *zl;
127 int num, max, nd;
128
129 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
130 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
131 if (!zl)
132 return NULL;
133 num = 0;
134 for_each_node_mask(nd, *nodes) {
135 int k;
136 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
137 struct zone *z = &NODE_DATA(nd)->node_zones[k];
138 if (!z->present_pages)
139 continue;
140 zl->zones[num++] = z;
141 if (k > policy_zone)
142 policy_zone = k;
143 }
144 }
145 zl->zones[num] = NULL;
146 return zl;
147 }
148
149 /* Create a new policy */
150 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
151 {
152 struct mempolicy *policy;
153
154 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
155 if (mode == MPOL_DEFAULT)
156 return NULL;
157 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
158 if (!policy)
159 return ERR_PTR(-ENOMEM);
160 atomic_set(&policy->refcnt, 1);
161 switch (mode) {
162 case MPOL_INTERLEAVE:
163 policy->v.nodes = *nodes;
164 break;
165 case MPOL_PREFERRED:
166 policy->v.preferred_node = first_node(*nodes);
167 if (policy->v.preferred_node >= MAX_NUMNODES)
168 policy->v.preferred_node = -1;
169 break;
170 case MPOL_BIND:
171 policy->v.zonelist = bind_zonelist(nodes);
172 if (policy->v.zonelist == NULL) {
173 kmem_cache_free(policy_cache, policy);
174 return ERR_PTR(-ENOMEM);
175 }
176 break;
177 }
178 policy->policy = mode;
179 return policy;
180 }
181
182 /* Ensure all existing pages follow the policy. */
183 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
184 unsigned long addr, unsigned long end, nodemask_t *nodes)
185 {
186 pte_t *orig_pte;
187 pte_t *pte;
188 spinlock_t *ptl;
189
190 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
191 do {
192 unsigned long pfn;
193 unsigned int nid;
194
195 if (!pte_present(*pte))
196 continue;
197 pfn = pte_pfn(*pte);
198 if (!pfn_valid(pfn)) {
199 print_bad_pte(vma, *pte, addr);
200 continue;
201 }
202 nid = pfn_to_nid(pfn);
203 if (!node_isset(nid, *nodes))
204 break;
205 } while (pte++, addr += PAGE_SIZE, addr != end);
206 pte_unmap_unlock(orig_pte, ptl);
207 return addr != end;
208 }
209
210 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
211 unsigned long addr, unsigned long end, nodemask_t *nodes)
212 {
213 pmd_t *pmd;
214 unsigned long next;
215
216 pmd = pmd_offset(pud, addr);
217 do {
218 next = pmd_addr_end(addr, end);
219 if (pmd_none_or_clear_bad(pmd))
220 continue;
221 if (check_pte_range(vma, pmd, addr, next, nodes))
222 return -EIO;
223 } while (pmd++, addr = next, addr != end);
224 return 0;
225 }
226
227 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
228 unsigned long addr, unsigned long end, nodemask_t *nodes)
229 {
230 pud_t *pud;
231 unsigned long next;
232
233 pud = pud_offset(pgd, addr);
234 do {
235 next = pud_addr_end(addr, end);
236 if (pud_none_or_clear_bad(pud))
237 continue;
238 if (check_pmd_range(vma, pud, addr, next, nodes))
239 return -EIO;
240 } while (pud++, addr = next, addr != end);
241 return 0;
242 }
243
244 static inline int check_pgd_range(struct vm_area_struct *vma,
245 unsigned long addr, unsigned long end, nodemask_t *nodes)
246 {
247 pgd_t *pgd;
248 unsigned long next;
249
250 pgd = pgd_offset(vma->vm_mm, addr);
251 do {
252 next = pgd_addr_end(addr, end);
253 if (pgd_none_or_clear_bad(pgd))
254 continue;
255 if (check_pud_range(vma, pgd, addr, next, nodes))
256 return -EIO;
257 } while (pgd++, addr = next, addr != end);
258 return 0;
259 }
260
261 /* Step 1: check the range */
262 static struct vm_area_struct *
263 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
264 nodemask_t *nodes, unsigned long flags)
265 {
266 int err;
267 struct vm_area_struct *first, *vma, *prev;
268
269 first = find_vma(mm, start);
270 if (!first)
271 return ERR_PTR(-EFAULT);
272 if (first->vm_flags & VM_RESERVED)
273 return ERR_PTR(-EACCES);
274 prev = NULL;
275 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
276 if (!vma->vm_next && vma->vm_end < end)
277 return ERR_PTR(-EFAULT);
278 if (prev && prev->vm_end < vma->vm_start)
279 return ERR_PTR(-EFAULT);
280 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
281 unsigned long endvma = vma->vm_end;
282 if (endvma > end)
283 endvma = end;
284 if (vma->vm_start > start)
285 start = vma->vm_start;
286 err = check_pgd_range(vma, start, endvma, nodes);
287 if (err) {
288 first = ERR_PTR(err);
289 break;
290 }
291 }
292 prev = vma;
293 }
294 return first;
295 }
296
297 /* Apply policy to a single VMA */
298 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
299 {
300 int err = 0;
301 struct mempolicy *old = vma->vm_policy;
302
303 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
304 vma->vm_start, vma->vm_end, vma->vm_pgoff,
305 vma->vm_ops, vma->vm_file,
306 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
307
308 if (vma->vm_ops && vma->vm_ops->set_policy)
309 err = vma->vm_ops->set_policy(vma, new);
310 if (!err) {
311 mpol_get(new);
312 vma->vm_policy = new;
313 mpol_free(old);
314 }
315 return err;
316 }
317
318 /* Step 2: apply policy to a range and do splits. */
319 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
320 unsigned long end, struct mempolicy *new)
321 {
322 struct vm_area_struct *next;
323 int err;
324
325 err = 0;
326 for (; vma && vma->vm_start < end; vma = next) {
327 next = vma->vm_next;
328 if (vma->vm_start < start)
329 err = split_vma(vma->vm_mm, vma, start, 1);
330 if (!err && vma->vm_end > end)
331 err = split_vma(vma->vm_mm, vma, end, 0);
332 if (!err)
333 err = policy_vma(vma, new);
334 if (err)
335 break;
336 }
337 return err;
338 }
339
340 static int contextualize_policy(int mode, nodemask_t *nodes)
341 {
342 if (!nodes)
343 return 0;
344
345 /* Update current mems_allowed */
346 cpuset_update_current_mems_allowed();
347 /* Ignore nodes not set in current->mems_allowed */
348 cpuset_restrict_to_mems_allowed(nodes->bits);
349 return mpol_check_policy(mode, nodes);
350 }
351
352 long do_mbind(unsigned long start, unsigned long len,
353 unsigned long mode, nodemask_t *nmask, unsigned long flags)
354 {
355 struct vm_area_struct *vma;
356 struct mm_struct *mm = current->mm;
357 struct mempolicy *new;
358 unsigned long end;
359 int err;
360
361 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
362 return -EINVAL;
363 if (start & ~PAGE_MASK)
364 return -EINVAL;
365 if (mode == MPOL_DEFAULT)
366 flags &= ~MPOL_MF_STRICT;
367 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
368 end = start + len;
369 if (end < start)
370 return -EINVAL;
371 if (end == start)
372 return 0;
373 if (mpol_check_policy(mode, nmask))
374 return -EINVAL;
375 new = mpol_new(mode, nmask);
376 if (IS_ERR(new))
377 return PTR_ERR(new);
378
379 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
380 mode,nodes_addr(*nmask)[0]);
381
382 down_write(&mm->mmap_sem);
383 vma = check_range(mm, start, end, nmask, flags);
384 err = PTR_ERR(vma);
385 if (!IS_ERR(vma))
386 err = mbind_range(vma, start, end, new);
387 up_write(&mm->mmap_sem);
388 mpol_free(new);
389 return err;
390 }
391
392 /* Set the process memory policy */
393 long do_set_mempolicy(int mode, nodemask_t *nodes)
394 {
395 struct mempolicy *new;
396
397 if (contextualize_policy(mode, nodes))
398 return -EINVAL;
399 new = mpol_new(mode, nodes);
400 if (IS_ERR(new))
401 return PTR_ERR(new);
402 mpol_free(current->mempolicy);
403 current->mempolicy = new;
404 if (new && new->policy == MPOL_INTERLEAVE)
405 current->il_next = first_node(new->v.nodes);
406 return 0;
407 }
408
409 /* Fill a node mask with the nodes used by a policy */
410 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
411 {
412 int i;
413
414 nodes_clear(*nodes);
415 switch (p->policy) {
416 case MPOL_BIND:
417 for (i = 0; p->v.zonelist->zones[i]; i++)
418 node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
419 *nodes);
420 break;
421 case MPOL_DEFAULT:
422 break;
423 case MPOL_INTERLEAVE:
424 *nodes = p->v.nodes;
425 break;
426 case MPOL_PREFERRED:
427 /* or use current node instead of online map? */
428 if (p->v.preferred_node < 0)
429 *nodes = node_online_map;
430 else
431 node_set(p->v.preferred_node, *nodes);
432 break;
433 default:
434 BUG();
435 }
436 }
437
438 static int lookup_node(struct mm_struct *mm, unsigned long addr)
439 {
440 struct page *p;
441 int err;
442
443 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
444 if (err >= 0) {
445 err = page_to_nid(p);
446 put_page(p);
447 }
448 return err;
449 }
450
451 /* Retrieve NUMA policy */
452 long do_get_mempolicy(int *policy, nodemask_t *nmask,
453 unsigned long addr, unsigned long flags)
454 {
455 int err;
456 struct mm_struct *mm = current->mm;
457 struct vm_area_struct *vma = NULL;
458 struct mempolicy *pol = current->mempolicy;
459
460 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
461 return -EINVAL;
462 if (flags & MPOL_F_ADDR) {
463 down_read(&mm->mmap_sem);
464 vma = find_vma_intersection(mm, addr, addr+1);
465 if (!vma) {
466 up_read(&mm->mmap_sem);
467 return -EFAULT;
468 }
469 if (vma->vm_ops && vma->vm_ops->get_policy)
470 pol = vma->vm_ops->get_policy(vma, addr);
471 else
472 pol = vma->vm_policy;
473 } else if (addr)
474 return -EINVAL;
475
476 if (!pol)
477 pol = &default_policy;
478
479 if (flags & MPOL_F_NODE) {
480 if (flags & MPOL_F_ADDR) {
481 err = lookup_node(mm, addr);
482 if (err < 0)
483 goto out;
484 *policy = err;
485 } else if (pol == current->mempolicy &&
486 pol->policy == MPOL_INTERLEAVE) {
487 *policy = current->il_next;
488 } else {
489 err = -EINVAL;
490 goto out;
491 }
492 } else
493 *policy = pol->policy;
494
495 if (vma) {
496 up_read(&current->mm->mmap_sem);
497 vma = NULL;
498 }
499
500 err = 0;
501 if (nmask)
502 get_zonemask(pol, nmask);
503
504 out:
505 if (vma)
506 up_read(&current->mm->mmap_sem);
507 return err;
508 }
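/*
 * Example (user space): with MPOL_F_NODE|MPOL_F_ADDR the call above does
 * not report a policy but the node that currently backs an address.  A
 * sketch, assuming the libnuma get_mempolicy() wrapper and addr pointing
 * into a mapped region:
 *
 *	int node;
 *
 *	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR) == 0)
 *		printf("addr lives on node %d\n", node);
 *
 * Note that this faults the page in through lookup_node() if it is not
 * present yet.
 */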
509
510 /*
511 * User space interface with variable sized bitmaps for nodelists.
512 */
513
514 /* Copy a node mask from user space. */
515 static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
516 unsigned long maxnode)
517 {
518 unsigned long k;
519 unsigned long nlongs;
520 unsigned long endmask;
521
522 --maxnode;
523 nodes_clear(*nodes);
524 if (maxnode == 0 || !nmask)
525 return 0;
526
527 nlongs = BITS_TO_LONGS(maxnode);
528 if ((maxnode % BITS_PER_LONG) == 0)
529 endmask = ~0UL;
530 else
531 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
532
533 /* When the user specified more nodes than supported just check
534 if the unsupported part is all zero. */
535 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
536 if (nlongs > PAGE_SIZE/sizeof(long))
537 return -EINVAL;
538 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
539 unsigned long t;
540 if (get_user(t, nmask + k))
541 return -EFAULT;
542 if (k == nlongs - 1) {
543 if (t & endmask)
544 return -EINVAL;
545 } else if (t)
546 return -EINVAL;
547 }
548 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
549 endmask = ~0UL;
550 }
551
552 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
553 return -EFAULT;
554 nodes_addr(*nodes)[nlongs-1] &= endmask;
555 return 0;
556 }
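/*
 * Worked example for the copy above (64-bit kernel assumed): a caller
 * passing maxnode == 65 covers bits 0-63, so after the decrement
 * nlongs = BITS_TO_LONGS(64) = 1, endmask = ~0UL and the single long is
 * used as-is.  For maxnode == 17 only bits 0-15 are meaningful: nlongs is
 * still 1, but endmask = (1UL << 16) - 1 trims the copied word down to
 * its low 16 bits.
 */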
557
558 /* Copy a kernel node mask to user space */
559 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
560 nodemask_t *nodes)
561 {
562 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
563 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
564
565 if (copy > nbytes) {
566 if (copy > PAGE_SIZE)
567 return -EINVAL;
568 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
569 return -EFAULT;
570 copy = nbytes;
571 }
572 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
573 }
574
575 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
576 unsigned long mode,
577 unsigned long __user *nmask, unsigned long maxnode,
578 unsigned flags)
579 {
580 nodemask_t nodes;
581 int err;
582
583 err = get_nodes(&nodes, nmask, maxnode);
584 if (err)
585 return err;
586 return do_mbind(start, len, mode, &nodes, flags);
587 }
588
589 /* Set the process memory policy */
590 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
591 unsigned long maxnode)
592 {
593 int err;
594 nodemask_t nodes;
595
596 if (mode < 0 || mode > MPOL_MAX)
597 return -EINVAL;
598 err = get_nodes(&nodes, nmask, maxnode);
599 if (err)
600 return err;
601 return do_set_mempolicy(mode, &nodes);
602 }
603
604 /* Retrieve NUMA policy */
605 asmlinkage long sys_get_mempolicy(int __user *policy,
606 unsigned long __user *nmask,
607 unsigned long maxnode,
608 unsigned long addr, unsigned long flags)
609 {
610 int err, pval;
611 nodemask_t nodes;
612
613 if (nmask != NULL && maxnode < MAX_NUMNODES)
614 return -EINVAL;
615
616 err = do_get_mempolicy(&pval, &nodes, addr, flags);
617
618 if (err)
619 return err;
620
621 if (policy && put_user(pval, policy))
622 return -EFAULT;
623
624 if (nmask)
625 err = copy_nodes_to_user(nmask, maxnode, &nodes);
626
627 return err;
628 }
629
630 #ifdef CONFIG_COMPAT
631
632 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
633 compat_ulong_t __user *nmask,
634 compat_ulong_t maxnode,
635 compat_ulong_t addr, compat_ulong_t flags)
636 {
637 long err;
638 unsigned long __user *nm = NULL;
639 unsigned long nr_bits, alloc_size;
640 DECLARE_BITMAP(bm, MAX_NUMNODES);
641
642 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
643 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
644
645 if (nmask)
646 nm = compat_alloc_user_space(alloc_size);
647
648 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
649
650 if (!err && nmask) {
651 err = copy_from_user(bm, nm, alloc_size);
652 /* ensure entire bitmap is zeroed */
653 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
654 err |= compat_put_bitmap(nmask, bm, nr_bits);
655 }
656
657 return err;
658 }
659
660 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
661 compat_ulong_t maxnode)
662 {
663 long err = 0;
664 unsigned long __user *nm = NULL;
665 unsigned long nr_bits, alloc_size;
666 DECLARE_BITMAP(bm, MAX_NUMNODES);
667
668 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
669 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
670
671 if (nmask) {
672 err = compat_get_bitmap(bm, nmask, nr_bits);
673 nm = compat_alloc_user_space(alloc_size);
674 err |= copy_to_user(nm, bm, alloc_size);
675 }
676
677 if (err)
678 return -EFAULT;
679
680 return sys_set_mempolicy(mode, nm, nr_bits+1);
681 }
682
683 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
684 compat_ulong_t mode, compat_ulong_t __user *nmask,
685 compat_ulong_t maxnode, compat_ulong_t flags)
686 {
687 long err = 0;
688 unsigned long __user *nm = NULL;
689 unsigned long nr_bits, alloc_size;
690 nodemask_t bm;
691
692 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
693 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
694
695 if (nmask) {
696 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
697 nm = compat_alloc_user_space(alloc_size);
698 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
699 }
700
701 if (err)
702 return -EFAULT;
703
704 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
705 }
706
707 #endif
708
709 /* Return effective policy for a VMA */
710 struct mempolicy *
711 get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
712 {
713 struct mempolicy *pol = task->mempolicy;
714
715 if (vma) {
716 if (vma->vm_ops && vma->vm_ops->get_policy)
717 pol = vma->vm_ops->get_policy(vma, addr);
718 else if (vma->vm_policy &&
719 vma->vm_policy->policy != MPOL_DEFAULT)
720 pol = vma->vm_policy;
721 }
722 if (!pol)
723 pol = &default_policy;
724 return pol;
725 }
726
727 /* Return a zonelist representing a mempolicy */
728 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
729 {
730 int nd;
731
732 switch (policy->policy) {
733 case MPOL_PREFERRED:
734 nd = policy->v.preferred_node;
735 if (nd < 0)
736 nd = numa_node_id();
737 break;
738 case MPOL_BIND:
739 /* Lower zones don't get a policy applied */
740 /* Careful: current->mems_allowed might have moved */
741 if (gfp_zone(gfp) >= policy_zone)
742 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
743 return policy->v.zonelist;
744 /*FALL THROUGH*/
745 case MPOL_INTERLEAVE: /* should not happen */
746 case MPOL_DEFAULT:
747 nd = numa_node_id();
748 break;
749 default:
750 nd = 0;
751 BUG();
752 }
753 return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
754 }
755
756 /* Do dynamic interleaving for a process */
757 static unsigned interleave_nodes(struct mempolicy *policy)
758 {
759 unsigned nid, next;
760 struct task_struct *me = current;
761
762 nid = me->il_next;
763 next = next_node(nid, policy->v.nodes);
764 if (next >= MAX_NUMNODES)
765 next = first_node(policy->v.nodes);
766 me->il_next = next;
767 return nid;
768 }
769
770 /* Do static interleaving for a VMA with known offset. */
771 static unsigned offset_il_node(struct mempolicy *pol,
772 struct vm_area_struct *vma, unsigned long off)
773 {
774 unsigned nnodes = nodes_weight(pol->v.nodes);
775 unsigned target = (unsigned)off % nnodes;
776 int c;
777 int nid = -1;
778
779 c = 0;
780 do {
781 nid = next_node(nid, pol->v.nodes);
782 c++;
783 } while (c <= target);
784 return nid;
785 }
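/*
 * Worked example: with pol->v.nodes = {1,3,5} and off = 7, nnodes = 3 and
 * target = 7 % 3 = 1, so the loop visits node 1 (c == 1), then node 3
 * (c == 2) and stops; the page at this offset goes to node 3.  Since only
 * the offset and the node mask enter the calculation, every task mapping
 * the same object places that page on the same node.
 */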
786
787 /* Allocate a page under the interleave policy.
788 Separate path because it needs to do special accounting. */
789 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
790 unsigned nid)
791 {
792 struct zonelist *zl;
793 struct page *page;
794
795 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
796 page = __alloc_pages(gfp, order, zl);
797 if (page && page_zone(page) == zl->zones[0]) {
798 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
799 put_cpu();
800 }
801 return page;
802 }
803
804 /**
805 * alloc_page_vma - Allocate a page for a VMA.
806 *
807 * @gfp:
808 * %GFP_USER user allocation,
809 * %GFP_KERNEL kernel allocations,
810 * %GFP_HIGHMEM highmem/user allocations,
811 * %GFP_FS allocation should not call back into a file system.
812 * %GFP_ATOMIC don't sleep.
813 *
814 * @vma: Pointer to VMA or NULL if not available.
815 * @addr: Virtual Address of the allocation. Must be inside the VMA.
816 *
817 * This function allocates a page from the kernel page pool and applies
818 * a NUMA policy associated with the VMA or the current process.
819 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
820 * mm_struct of the VMA to prevent it from going away. Should be used for
821 * all allocations for pages that will be mapped into
822 * user space. Returns NULL when no page can be allocated.
823 *
824 * Should be called with the mmap_sem of the vma held.
825 */
826 struct page *
827 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
828 {
829 struct mempolicy *pol = get_vma_policy(current, vma, addr);
830
831 cpuset_update_current_mems_allowed();
832
833 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
834 unsigned nid;
835 if (vma) {
836 unsigned long off;
837 off = vma->vm_pgoff;
838 off += (addr - vma->vm_start) >> PAGE_SHIFT;
839 nid = offset_il_node(pol, vma, off);
840 } else {
841 /* fall back to process interleaving */
842 nid = interleave_nodes(pol);
843 }
844 return alloc_page_interleave(gfp, 0, nid);
845 }
846 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
847 }
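/*
 * Example (kernel side): a typical caller is a fault handler that already
 * holds down_read(&mm->mmap_sem), as required above.  A rough sketch:
 *
 *	struct page *page;
 *
 *	page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 *	if (!page)
 *		return VM_FAULT_OOM;
 *
 * The interleave case is routed through alloc_page_interleave() so that
 * the per-cpu interleave_hit statistic stays accurate.
 */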
848
849 /**
850 * alloc_pages_current - Allocate pages.
851 *
852 * @gfp:
853 * %GFP_USER user allocation,
854 * %GFP_KERNEL kernel allocation,
855 * %GFP_HIGHMEM highmem allocation,
856 * %GFP_FS don't call back into a file system.
857 * %GFP_ATOMIC don't sleep.
858 * @order: Power of two of allocation size in pages. 0 is a single page.
859 *
860 * Allocate a page from the kernel page pool. When not in
861 * interrupt context, apply the current process' NUMA policy.
862 * Returns NULL when no page can be allocated.
863 *
864 * Don't call cpuset_update_current_mems_allowed() unless
865 * 1) it's ok to take cpuset_sem (can WAIT), and
866 * 2) allocating for current task (not interrupt).
867 */
868 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
869 {
870 struct mempolicy *pol = current->mempolicy;
871
872 if ((gfp & __GFP_WAIT) && !in_interrupt())
873 cpuset_update_current_mems_allowed();
874 if (!pol || in_interrupt())
875 pol = &default_policy;
876 if (pol->policy == MPOL_INTERLEAVE)
877 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
878 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
879 }
880 EXPORT_SYMBOL(alloc_pages_current);
881
882 /* Slow path of a mempolicy copy */
883 struct mempolicy *__mpol_copy(struct mempolicy *old)
884 {
885 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
886
887 if (!new)
888 return ERR_PTR(-ENOMEM);
889 *new = *old;
890 atomic_set(&new->refcnt, 1);
891 if (new->policy == MPOL_BIND) {
892 int sz = ksize(old->v.zonelist);
893 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
894 if (!new->v.zonelist) {
895 kmem_cache_free(policy_cache, new);
896 return ERR_PTR(-ENOMEM);
897 }
898 memcpy(new->v.zonelist, old->v.zonelist, sz);
899 }
900 return new;
901 }
902
903 /* Slow path of a mempolicy comparison */
904 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
905 {
906 if (!a || !b)
907 return 0;
908 if (a->policy != b->policy)
909 return 0;
910 switch (a->policy) {
911 case MPOL_DEFAULT:
912 return 1;
913 case MPOL_INTERLEAVE:
914 return nodes_equal(a->v.nodes, b->v.nodes);
915 case MPOL_PREFERRED:
916 return a->v.preferred_node == b->v.preferred_node;
917 case MPOL_BIND: {
918 int i;
919 for (i = 0; a->v.zonelist->zones[i]; i++)
920 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
921 return 0;
922 return b->v.zonelist->zones[i] == NULL;
923 }
924 default:
925 BUG();
926 return 0;
927 }
928 }
929
930 /* Slow path of a mpol destructor. */
931 void __mpol_free(struct mempolicy *p)
932 {
933 if (!atomic_dec_and_test(&p->refcnt))
934 return;
935 if (p->policy == MPOL_BIND)
936 kfree(p->v.zonelist);
937 p->policy = MPOL_DEFAULT;
938 kmem_cache_free(policy_cache, p);
939 }
940
941 /*
942 * Hugetlb policy. Same as above, just works with node numbers instead of
943 * zonelists.
944 */
945
946 /* Find first node suitable for an allocation */
947 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
948 {
949 struct mempolicy *pol = get_vma_policy(current, vma, addr);
950
951 switch (pol->policy) {
952 case MPOL_DEFAULT:
953 return numa_node_id();
954 case MPOL_BIND:
955 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
956 case MPOL_INTERLEAVE:
957 return interleave_nodes(pol);
958 case MPOL_PREFERRED:
959 return pol->v.preferred_node >= 0 ?
960 pol->v.preferred_node : numa_node_id();
961 }
962 BUG();
963 return 0;
964 }
965
966 /* Find secondary valid nodes for an allocation */
967 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
968 {
969 struct mempolicy *pol = get_vma_policy(current, vma, addr);
970
971 switch (pol->policy) {
972 case MPOL_PREFERRED:
973 case MPOL_DEFAULT:
974 case MPOL_INTERLEAVE:
975 return 1;
976 case MPOL_BIND: {
977 struct zone **z;
978 for (z = pol->v.zonelist->zones; *z; z++)
979 if ((*z)->zone_pgdat->node_id == nid)
980 return 1;
981 return 0;
982 }
983 default:
984 BUG();
985 return 0;
986 }
987 }
988
989 /*
990 * Shared memory backing store policy support.
991 *
992 * Remember policies even when nobody has shared memory mapped.
993 * The policies are kept in a red-black tree linked from the inode.
994 * They are protected by the sp->lock spinlock, which should be held
995 * for any accesses to the tree.
996 */
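/*
 * Example: a filesystem supporting per file policies is expected to route
 * its vm_ops callbacks through the helpers below, roughly (a sketch
 * modelled on tmpfs, see mm/shmem.c; "info" is the per inode data and
 * "idx" the page cache index of the faulting address):
 *
 *	->set_policy():	return mpol_set_shared_policy(&info->policy, vma, new);
 *	->get_policy():	return mpol_shared_policy_lookup(&info->policy, idx);
 */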
997
998 /* lookup first element intersecting start-end */
999 /* Caller holds sp->lock */
1000 static struct sp_node *
1001 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1002 {
1003 struct rb_node *n = sp->root.rb_node;
1004
1005 while (n) {
1006 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1007
1008 if (start >= p->end)
1009 n = n->rb_right;
1010 else if (end <= p->start)
1011 n = n->rb_left;
1012 else
1013 break;
1014 }
1015 if (!n)
1016 return NULL;
1017 for (;;) {
1018 struct sp_node *w = NULL;
1019 struct rb_node *prev = rb_prev(n);
1020 if (!prev)
1021 break;
1022 w = rb_entry(prev, struct sp_node, nd);
1023 if (w->end <= start)
1024 break;
1025 n = prev;
1026 }
1027 return rb_entry(n, struct sp_node, nd);
1028 }
1029
1030 /* Insert a new shared policy into the list. */
1031 /* Caller holds sp->lock */
1032 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1033 {
1034 struct rb_node **p = &sp->root.rb_node;
1035 struct rb_node *parent = NULL;
1036 struct sp_node *nd;
1037
1038 while (*p) {
1039 parent = *p;
1040 nd = rb_entry(parent, struct sp_node, nd);
1041 if (new->start < nd->start)
1042 p = &(*p)->rb_left;
1043 else if (new->end > nd->end)
1044 p = &(*p)->rb_right;
1045 else
1046 BUG();
1047 }
1048 rb_link_node(&new->nd, parent, p);
1049 rb_insert_color(&new->nd, &sp->root);
1050 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1051 new->policy ? new->policy->policy : 0);
1052 }
1053
1054 /* Find shared policy intersecting idx */
1055 struct mempolicy *
1056 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1057 {
1058 struct mempolicy *pol = NULL;
1059 struct sp_node *sn;
1060
1061 if (!sp->root.rb_node)
1062 return NULL;
1063 spin_lock(&sp->lock);
1064 sn = sp_lookup(sp, idx, idx+1);
1065 if (sn) {
1066 mpol_get(sn->policy);
1067 pol = sn->policy;
1068 }
1069 spin_unlock(&sp->lock);
1070 return pol;
1071 }
1072
1073 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1074 {
1075 PDprintk("deleting %lx-%lx\n", n->start, n->end);
1076 rb_erase(&n->nd, &sp->root);
1077 mpol_free(n->policy);
1078 kmem_cache_free(sn_cache, n);
1079 }
1080
1081 struct sp_node *
1082 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1083 {
1084 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1085
1086 if (!n)
1087 return NULL;
1088 n->start = start;
1089 n->end = end;
1090 mpol_get(pol);
1091 n->policy = pol;
1092 return n;
1093 }
1094
1095 /* Replace a policy range. */
1096 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1097 unsigned long end, struct sp_node *new)
1098 {
1099 struct sp_node *n, *new2 = NULL;
1100
1101 restart:
1102 spin_lock(&sp->lock);
1103 n = sp_lookup(sp, start, end);
1104 /* Take care of old policies in the same range. */
1105 while (n && n->start < end) {
1106 struct rb_node *next = rb_next(&n->nd);
1107 if (n->start >= start) {
1108 if (n->end <= end)
1109 sp_delete(sp, n);
1110 else
1111 n->start = end;
1112 } else {
1113 /* Old policy spanning whole new range. */
1114 if (n->end > end) {
1115 if (!new2) {
1116 spin_unlock(&sp->lock);
1117 new2 = sp_alloc(end, n->end, n->policy);
1118 if (!new2)
1119 return -ENOMEM;
1120 goto restart;
1121 }
1122 n->end = start;
1123 sp_insert(sp, new2);
1124 new2 = NULL;
1125 break;
1126 } else
1127 n->end = start;
1128 }
1129 if (!next)
1130 break;
1131 n = rb_entry(next, struct sp_node, nd);
1132 }
1133 if (new)
1134 sp_insert(sp, new);
1135 spin_unlock(&sp->lock);
1136 if (new2) {
1137 mpol_free(new2->policy);
1138 kmem_cache_free(sn_cache, new2);
1139 }
1140 return 0;
1141 }
1142
1143 int mpol_set_shared_policy(struct shared_policy *info,
1144 struct vm_area_struct *vma, struct mempolicy *npol)
1145 {
1146 int err;
1147 struct sp_node *new = NULL;
1148 unsigned long sz = vma_pages(vma);
1149
1150 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1151 vma->vm_pgoff,
1152 sz, npol? npol->policy : -1,
1153 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1154
1155 if (npol) {
1156 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1157 if (!new)
1158 return -ENOMEM;
1159 }
1160 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1161 if (err && new)
1162 kmem_cache_free(sn_cache, new);
1163 return err;
1164 }
1165
1166 /* Free a backing policy store on inode delete. */
1167 void mpol_free_shared_policy(struct shared_policy *p)
1168 {
1169 struct sp_node *n;
1170 struct rb_node *next;
1171
1172 if (!p->root.rb_node)
1173 return;
1174 spin_lock(&p->lock);
1175 next = rb_first(&p->root);
1176 while (next) {
1177 n = rb_entry(next, struct sp_node, nd);
1178 next = rb_next(&n->nd);
1179 rb_erase(&n->nd, &p->root);
1180 mpol_free(n->policy);
1181 kmem_cache_free(sn_cache, n);
1182 }
1183 spin_unlock(&p->lock);
1184 }
1185
1186 /* assumes fs == KERNEL_DS */
1187 void __init numa_policy_init(void)
1188 {
1189 policy_cache = kmem_cache_create("numa_policy",
1190 sizeof(struct mempolicy),
1191 0, SLAB_PANIC, NULL, NULL);
1192
1193 sn_cache = kmem_cache_create("shared_policy_node",
1194 sizeof(struct sp_node),
1195 0, SLAB_PANIC, NULL, NULL);
1196
1197 /* Set interleaving policy for system init. This way not all
1198 the data structures allocated at system boot end up in node zero. */
1199
1200 if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1201 printk("numa_policy_init: interleaving failed\n");
1202 }
1203
1204 /* Reset policy of current process to default */
1205 void numa_default_policy(void)
1206 {
1207 do_set_mempolicy(MPOL_DEFAULT, NULL);
1208 }