[PATCH] mm: pte_offset_map_lock loops
mm/mempolicy.c
1 /*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * Subject to the GNU Public License, version 2.
6 *
7 * NUMA policy allows the user to give hints in which node(s) memory should
8 * be allocated.
9 *
10 * Support four policies per VMA and per process:
11 *
12 * The VMA policy has priority over the process policy for a page fault.
13 *
14 * interleave Allocate memory interleaved over a set of nodes,
15 * with normal fallback if it fails.
16 * For VMA based allocations this interleaves based on the
17 * offset into the backing object or offset into the mapping
18 * for anonymous memory. For process policy a process counter
19 * is used.
20 * bind Only allocate memory on a specific set of nodes,
21 * no fallback.
22 * preferred Try a specific node first before normal fallback.
23 * As a special case node -1 here means do the allocation
24 * on the local CPU. This is normally identical to default,
25 * but useful to set in a VMA when you have a non default
26 * process policy.
27 * default Allocate on the local node first, or when on a VMA
28 * use the process policy. This is what Linux always did
29 * in a NUMA aware kernel and still does by, ahem, default.
30 *
31 * The process policy is applied for most non interrupt memory allocations
32 * in that process' context. Interrupts ignore the policies and always
33 * try to allocate on the local CPU. The VMA policy is only applied for memory
34 * allocations for a VMA in the VM.
35 *
36 * Currently there are a few corner cases in swapping where the policy
37 * is not applied, but the majority should be handled. When process policy
38 * is used it is not remembered over swap outs/swap ins.
39 *
40 * Only the highest zone in the zone hierarchy gets policied. Allocations
41 * requesting a lower zone just use default policy. This implies that
42 * on systems with highmem, kernel lowmem allocations don't get policied.
43 * Same with GFP_DMA allocations.
44 *
45 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
46 * all users and remembered even when nobody has memory mapped.
47 */
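/*
 * Illustrative userspace sketch (not part of this file): how the policies
 * described above are requested through the syscalls implemented below.
 * Assumes SYS_set_mempolicy/SYS_mbind are wired up for the architecture,
 * that nodes 0 and 1 are online, and that the MPOL_* values below match
 * the kernel's <linux/mempolicy.h>.
 */
#if 0	/* example only -- never built as part of the kernel */
#define _GNU_SOURCE
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define MPOL_DEFAULT	0
#define MPOL_PREFERRED	1
#define MPOL_BIND	2
#define MPOL_INTERLEAVE	3

static void mempolicy_example(void)
{
	unsigned long interleave_nodes = 0x3;	/* nodes 0 and 1 */
	unsigned long bind_node = 0x1;		/* node 0 only */
	size_t len = 1 << 20;
	void *p;

	/* Process policy: interleave future allocations over nodes 0-1. */
	syscall(SYS_set_mempolicy, MPOL_INTERLEAVE, &interleave_nodes,
		8 * sizeof(interleave_nodes));

	/* VMA policy: bind one anonymous mapping to node 0, no fallback. */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	syscall(SYS_mbind, p, len, MPOL_BIND, &bind_node,
		8 * sizeof(bind_node), 0);
}
#endif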
48
49 /* Notebook:
50 fix mmap readahead to honour policy and enable policy for any page cache
51 object
52 statistics for bigpages
53 global policy for page cache? currently it uses process policy. Requires
54 first item above.
55 handle mremap for shared memory (currently ignored for the policy)
56 grows down?
57 make bind policy root only? It can trigger oom much faster and the
58 kernel is not always grateful with that.
59 could replace all the switch()es with a mempolicy_ops structure.
60 */
61
62 #include <linux/mempolicy.h>
63 #include <linux/mm.h>
64 #include <linux/highmem.h>
65 #include <linux/hugetlb.h>
66 #include <linux/kernel.h>
67 #include <linux/sched.h>
69 #include <linux/nodemask.h>
70 #include <linux/cpuset.h>
71 #include <linux/gfp.h>
72 #include <linux/slab.h>
73 #include <linux/string.h>
74 #include <linux/module.h>
75 #include <linux/interrupt.h>
76 #include <linux/init.h>
77 #include <linux/compat.h>
79 #include <asm/tlbflush.h>
80 #include <asm/uaccess.h>
81
82 static kmem_cache_t *policy_cache;
83 static kmem_cache_t *sn_cache;
84
85 #define PDprintk(fmt...)
86
87 /* Highest zone. A specific allocation for a zone below that is not
88 policied. */
89 static int policy_zone;
90
91 struct mempolicy default_policy = {
92 .refcnt = ATOMIC_INIT(1), /* never free it */
93 .policy = MPOL_DEFAULT,
94 };
95
96 /* Do sanity checking on a policy */
97 static int mpol_check_policy(int mode, nodemask_t *nodes)
98 {
99 int empty = nodes_empty(*nodes);
100
101 switch (mode) {
102 case MPOL_DEFAULT:
103 if (!empty)
104 return -EINVAL;
105 break;
106 case MPOL_BIND:
107 case MPOL_INTERLEAVE:
108 /* Preferred will only use the first bit, but allow
109 more for now. */
110 if (empty)
111 return -EINVAL;
112 break;
113 }
114 return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
115 }
116
117 /* Copy a node mask from user space. */
118 static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
119 unsigned long maxnode, int mode)
120 {
121 unsigned long k;
122 unsigned long nlongs;
123 unsigned long endmask;
124
125 --maxnode;
126 nodes_clear(*nodes);
127 if (maxnode == 0 || !nmask)
128 return 0;
129
130 nlongs = BITS_TO_LONGS(maxnode);
131 if ((maxnode % BITS_PER_LONG) == 0)
132 endmask = ~0UL;
133 else
134 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
135
136 /* When the user specified more nodes than supported just check
137 if the unsupported part is all zero. */
138 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
139 if (nlongs > PAGE_SIZE/sizeof(long))
140 return -EINVAL;
141 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
142 unsigned long t;
143 if (get_user(t, nmask + k))
144 return -EFAULT;
145 if (k == nlongs - 1) {
146 if (t & endmask)
147 return -EINVAL;
148 } else if (t)
149 return -EINVAL;
150 }
151 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
152 endmask = ~0UL;
153 }
154
155 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
156 return -EFAULT;
157 nodes_addr(*nodes)[nlongs-1] &= endmask;
158 /* Update current mems_allowed */
159 cpuset_update_current_mems_allowed();
160 /* Ignore nodes not set in current->mems_allowed */
161 /* AK: shouldn't this error out instead? */
162 cpuset_restrict_to_mems_allowed(nodes_addr(*nodes));
163 return mpol_check_policy(mode, nodes);
164 }
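/*
 * Worked example for the mask copy above (assuming a 64-bit kernel):
 * a caller passing maxnode == 17 ends up with maxnode = 16 after the
 * decrement, nlongs = 1 and endmask = 0xffff, so only node bits 0-15
 * survive the final "&= endmask".
 */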
165
166 /* Generate a custom zonelist for the BIND policy. */
167 static struct zonelist *bind_zonelist(nodemask_t *nodes)
168 {
169 struct zonelist *zl;
170 int num, max, nd;
171
172 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
173 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
174 if (!zl)
175 return NULL;
176 num = 0;
177 for_each_node_mask(nd, *nodes) {
178 int k;
179 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
180 struct zone *z = &NODE_DATA(nd)->node_zones[k];
181 if (!z->present_pages)
182 continue;
183 zl->zones[num++] = z;
184 if (k > policy_zone)
185 policy_zone = k;
186 }
187 }
188 zl->zones[num] = NULL;
189 return zl;
190 }
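/*
 * Illustrative result (not from this file): for a bind mask of nodes {0,1}
 * on a machine where each node has only ZONE_NORMAL and ZONE_DMA populated,
 * the list built above is
 *	node0/Normal, node0/DMA, node1/Normal, node1/DMA, NULL
 * i.e. zones per node from highest to lowest, nodes in mask order.
 */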
191
192 /* Create a new policy */
193 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
194 {
195 struct mempolicy *policy;
196
197 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
198 if (mode == MPOL_DEFAULT)
199 return NULL;
200 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
201 if (!policy)
202 return ERR_PTR(-ENOMEM);
203 atomic_set(&policy->refcnt, 1);
204 switch (mode) {
205 case MPOL_INTERLEAVE:
206 policy->v.nodes = *nodes;
207 break;
208 case MPOL_PREFERRED:
209 policy->v.preferred_node = first_node(*nodes);
210 if (policy->v.preferred_node >= MAX_NUMNODES)
211 policy->v.preferred_node = -1;
212 break;
213 case MPOL_BIND:
214 policy->v.zonelist = bind_zonelist(nodes);
215 if (policy->v.zonelist == NULL) {
216 kmem_cache_free(policy_cache, policy);
217 return ERR_PTR(-ENOMEM);
218 }
219 break;
220 }
221 policy->policy = mode;
222 return policy;
223 }
224
225 /* Ensure all existing pages follow the policy. */
226 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
227 unsigned long addr, unsigned long end, nodemask_t *nodes)
228 {
229 pte_t *orig_pte;
230 pte_t *pte;
231 spinlock_t *ptl;
232
233 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
234 do {
235 unsigned long pfn;
236 unsigned int nid;
237
238 if (!pte_present(*pte))
239 continue;
240 pfn = pte_pfn(*pte);
241 if (!pfn_valid(pfn)) {
242 print_bad_pte(vma, *pte, addr);
243 continue;
244 }
245 nid = pfn_to_nid(pfn);
246 if (!node_isset(nid, *nodes))
247 break;
248 } while (pte++, addr += PAGE_SIZE, addr != end);
249 pte_unmap_unlock(orig_pte, ptl);
250 return addr != end;
251 }
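/*
 * Note on the loop above: pte_offset_map_lock() maps the pte page and takes
 * the corresponding page-table lock, the scan then advances one pte at a
 * time, and pte_unmap_unlock() drops both once the end of the range (or a
 * page on a non-matching node) is reached.  check_pmd_range(),
 * check_pud_range() and check_pgd_range() below repeat the same walk
 * pattern one page-table level up each.
 */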
252
253 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
254 unsigned long addr, unsigned long end, nodemask_t *nodes)
255 {
256 pmd_t *pmd;
257 unsigned long next;
258
259 pmd = pmd_offset(pud, addr);
260 do {
261 next = pmd_addr_end(addr, end);
262 if (pmd_none_or_clear_bad(pmd))
263 continue;
264 if (check_pte_range(vma, pmd, addr, next, nodes))
265 return -EIO;
266 } while (pmd++, addr = next, addr != end);
267 return 0;
268 }
269
270 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
271 unsigned long addr, unsigned long end, nodemask_t *nodes)
272 {
273 pud_t *pud;
274 unsigned long next;
275
276 pud = pud_offset(pgd, addr);
277 do {
278 next = pud_addr_end(addr, end);
279 if (pud_none_or_clear_bad(pud))
280 continue;
281 if (check_pmd_range(vma, pud, addr, next, nodes))
282 return -EIO;
283 } while (pud++, addr = next, addr != end);
284 return 0;
285 }
286
287 static inline int check_pgd_range(struct vm_area_struct *vma,
288 unsigned long addr, unsigned long end, nodemask_t *nodes)
289 {
290 pgd_t *pgd;
291 unsigned long next;
292
293 pgd = pgd_offset(vma->vm_mm, addr);
294 do {
295 next = pgd_addr_end(addr, end);
296 if (pgd_none_or_clear_bad(pgd))
297 continue;
298 if (check_pud_range(vma, pgd, addr, next, nodes))
299 return -EIO;
300 } while (pgd++, addr = next, addr != end);
301 return 0;
302 }
303
304 /* Step 1: check the range */
305 static struct vm_area_struct *
306 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
307 nodemask_t *nodes, unsigned long flags)
308 {
309 int err;
310 struct vm_area_struct *first, *vma, *prev;
311
312 first = find_vma(mm, start);
313 if (!first)
314 return ERR_PTR(-EFAULT);
315 if (first->vm_flags & VM_RESERVED)
316 return ERR_PTR(-EACCES);
317 prev = NULL;
318 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
319 if (!vma->vm_next && vma->vm_end < end)
320 return ERR_PTR(-EFAULT);
321 if (prev && prev->vm_end < vma->vm_start)
322 return ERR_PTR(-EFAULT);
323 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
324 unsigned long endvma = vma->vm_end;
325 if (endvma > end)
326 endvma = end;
327 if (vma->vm_start > start)
328 start = vma->vm_start;
329 err = check_pgd_range(vma, start, endvma, nodes);
330 if (err) {
331 first = ERR_PTR(err);
332 break;
333 }
334 }
335 prev = vma;
336 }
337 return first;
338 }
339
340 /* Apply policy to a single VMA */
341 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
342 {
343 int err = 0;
344 struct mempolicy *old = vma->vm_policy;
345
346 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
347 vma->vm_start, vma->vm_end, vma->vm_pgoff,
348 vma->vm_ops, vma->vm_file,
349 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
350
351 if (vma->vm_ops && vma->vm_ops->set_policy)
352 err = vma->vm_ops->set_policy(vma, new);
353 if (!err) {
354 mpol_get(new);
355 vma->vm_policy = new;
356 mpol_free(old);
357 }
358 return err;
359 }
360
361 /* Step 2: apply policy to a range and do splits. */
362 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
363 unsigned long end, struct mempolicy *new)
364 {
365 struct vm_area_struct *next;
366 int err;
367
368 err = 0;
369 for (; vma && vma->vm_start < end; vma = next) {
370 next = vma->vm_next;
371 if (vma->vm_start < start)
372 err = split_vma(vma->vm_mm, vma, start, 1);
373 if (!err && vma->vm_end > end)
374 err = split_vma(vma->vm_mm, vma, end, 0);
375 if (!err)
376 err = policy_vma(vma, new);
377 if (err)
378 break;
379 }
380 return err;
381 }
382
383 /* Change policy for a memory range */
384 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
385 unsigned long mode,
386 unsigned long __user *nmask, unsigned long maxnode,
387 unsigned flags)
388 {
389 struct vm_area_struct *vma;
390 struct mm_struct *mm = current->mm;
391 struct mempolicy *new;
392 unsigned long end;
393 nodemask_t nodes;
394 int err;
395
396 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
397 return -EINVAL;
398 if (start & ~PAGE_MASK)
399 return -EINVAL;
400 if (mode == MPOL_DEFAULT)
401 flags &= ~MPOL_MF_STRICT;
402 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
403 end = start + len;
404 if (end < start)
405 return -EINVAL;
406 if (end == start)
407 return 0;
408
409 err = get_nodes(&nodes, nmask, maxnode, mode);
410 if (err)
411 return err;
412
413 new = mpol_new(mode, &nodes);
414 if (IS_ERR(new))
415 return PTR_ERR(new);
416
417 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
418 mode,nodes_addr(nodes)[0]);
419
420 down_write(&mm->mmap_sem);
421 vma = check_range(mm, start, end, &nodes, flags);
422 err = PTR_ERR(vma);
423 if (!IS_ERR(vma))
424 err = mbind_range(vma, start, end, new);
425 up_write(&mm->mmap_sem);
426 mpol_free(new);
427 return err;
428 }
429
430 /* Set the process memory policy */
431 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
432 unsigned long maxnode)
433 {
434 int err;
435 struct mempolicy *new;
436 nodemask_t nodes;
437
438 if (mode < 0 || mode > MPOL_MAX)
439 return -EINVAL;
440 err = get_nodes(&nodes, nmask, maxnode, mode);
441 if (err)
442 return err;
443 new = mpol_new(mode, &nodes);
444 if (IS_ERR(new))
445 return PTR_ERR(new);
446 mpol_free(current->mempolicy);
447 current->mempolicy = new;
448 if (new && new->policy == MPOL_INTERLEAVE)
449 current->il_next = first_node(new->v.nodes);
450 return 0;
451 }
452
453 /* Fill a node mask for a policy */
454 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
455 {
456 int i;
457
458 nodes_clear(*nodes);
459 switch (p->policy) {
460 case MPOL_BIND:
461 for (i = 0; p->v.zonelist->zones[i]; i++)
462 node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes);
463 break;
464 case MPOL_DEFAULT:
465 break;
466 case MPOL_INTERLEAVE:
467 *nodes = p->v.nodes;
468 break;
469 case MPOL_PREFERRED:
470 /* or use current node instead of online map? */
471 if (p->v.preferred_node < 0)
472 *nodes = node_online_map;
473 else
474 node_set(p->v.preferred_node, *nodes);
475 break;
476 default:
477 BUG();
478 }
479 }
480
481 static int lookup_node(struct mm_struct *mm, unsigned long addr)
482 {
483 struct page *p;
484 int err;
485
486 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
487 if (err >= 0) {
488 err = page_to_nid(p);
489 put_page(p);
490 }
491 return err;
492 }
493
494 /* Copy a kernel node mask to user space */
495 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
496 nodemask_t *nodes)
497 {
498 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
499 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
500
501 if (copy > nbytes) {
502 if (copy > PAGE_SIZE)
503 return -EINVAL;
504 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
505 return -EFAULT;
506 copy = nbytes;
507 }
508 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
509 }
510
511 /* Retrieve NUMA policy */
512 asmlinkage long sys_get_mempolicy(int __user *policy,
513 unsigned long __user *nmask,
514 unsigned long maxnode,
515 unsigned long addr, unsigned long flags)
516 {
517 int err, pval;
518 struct mm_struct *mm = current->mm;
519 struct vm_area_struct *vma = NULL;
520 struct mempolicy *pol = current->mempolicy;
521
522 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
523 return -EINVAL;
524 if (nmask != NULL && maxnode < MAX_NUMNODES)
525 return -EINVAL;
526 if (flags & MPOL_F_ADDR) {
527 down_read(&mm->mmap_sem);
528 vma = find_vma_intersection(mm, addr, addr+1);
529 if (!vma) {
530 up_read(&mm->mmap_sem);
531 return -EFAULT;
532 }
533 if (vma->vm_ops && vma->vm_ops->get_policy)
534 pol = vma->vm_ops->get_policy(vma, addr);
535 else
536 pol = vma->vm_policy;
537 } else if (addr)
538 return -EINVAL;
539
540 if (!pol)
541 pol = &default_policy;
542
543 if (flags & MPOL_F_NODE) {
544 if (flags & MPOL_F_ADDR) {
545 err = lookup_node(mm, addr);
546 if (err < 0)
547 goto out;
548 pval = err;
549 } else if (pol == current->mempolicy &&
550 pol->policy == MPOL_INTERLEAVE) {
551 pval = current->il_next;
552 } else {
553 err = -EINVAL;
554 goto out;
555 }
556 } else
557 pval = pol->policy;
558
559 if (vma) {
560 up_read(&current->mm->mmap_sem);
561 vma = NULL;
562 }
563
564 if (policy && put_user(pval, policy))
565 return -EFAULT;
566
567 err = 0;
568 if (nmask) {
569 nodemask_t nodes;
570 get_zonemask(pol, &nodes);
571 err = copy_nodes_to_user(nmask, maxnode, &nodes);
572 }
573
574 out:
575 if (vma)
576 up_read(&current->mm->mmap_sem);
577 return err;
578 }
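/*
 * Illustrative userspace sketch (not part of this file): using the
 * MPOL_F_NODE|MPOL_F_ADDR path above to ask which node backs a mapped page.
 * Assumes SYS_get_mempolicy exists for the architecture and that the flag
 * values below match the kernel's <linux/mempolicy.h>.
 */
#if 0	/* example only */
#include <sys/syscall.h>
#include <unistd.h>

#define MPOL_F_NODE	(1 << 0)
#define MPOL_F_ADDR	(1 << 1)

static int node_of(void *addr)
{
	int node = -1;

	/* nmask == NULL and maxnode == 0: only the node number is wanted. */
	if (syscall(SYS_get_mempolicy, &node, NULL, 0UL, addr,
		    MPOL_F_NODE | MPOL_F_ADDR) < 0)
		return -1;
	return node;
}
#endif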
579
580 #ifdef CONFIG_COMPAT
581
582 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
583 compat_ulong_t __user *nmask,
584 compat_ulong_t maxnode,
585 compat_ulong_t addr, compat_ulong_t flags)
586 {
587 long err;
588 unsigned long __user *nm = NULL;
589 unsigned long nr_bits, alloc_size;
590 DECLARE_BITMAP(bm, MAX_NUMNODES);
591
592 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
593 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
594
595 if (nmask)
596 nm = compat_alloc_user_space(alloc_size);
597
598 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
599
600 if (!err && nmask) {
601 err = copy_from_user(bm, nm, alloc_size);
602 /* ensure entire bitmap is zeroed */
603 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
604 err |= compat_put_bitmap(nmask, bm, nr_bits);
605 }
606
607 return err;
608 }
609
610 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
611 compat_ulong_t maxnode)
612 {
613 long err = 0;
614 unsigned long __user *nm = NULL;
615 unsigned long nr_bits, alloc_size;
616 DECLARE_BITMAP(bm, MAX_NUMNODES);
617
618 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
619 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
620
621 if (nmask) {
622 err = compat_get_bitmap(bm, nmask, nr_bits);
623 nm = compat_alloc_user_space(alloc_size);
624 err |= copy_to_user(nm, bm, alloc_size);
625 }
626
627 if (err)
628 return -EFAULT;
629
630 return sys_set_mempolicy(mode, nm, nr_bits+1);
631 }
632
633 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
634 compat_ulong_t mode, compat_ulong_t __user *nmask,
635 compat_ulong_t maxnode, compat_ulong_t flags)
636 {
637 long err = 0;
638 unsigned long __user *nm = NULL;
639 unsigned long nr_bits, alloc_size;
640 nodemask_t bm;
641
642 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
643 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
644
645 if (nmask) {
646 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
647 nm = compat_alloc_user_space(alloc_size);
648 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
649 }
650
651 if (err)
652 return -EFAULT;
653
654 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
655 }
656
657 #endif
658
659 /* Return effective policy for a VMA */
660 struct mempolicy *
661 get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
662 {
663 struct mempolicy *pol = task->mempolicy;
664
665 if (vma) {
666 if (vma->vm_ops && vma->vm_ops->get_policy)
667 pol = vma->vm_ops->get_policy(vma, addr);
668 else if (vma->vm_policy &&
669 vma->vm_policy->policy != MPOL_DEFAULT)
670 pol = vma->vm_policy;
671 }
672 if (!pol)
673 pol = &default_policy;
674 return pol;
675 }
676
677 /* Return a zonelist representing a mempolicy */
678 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
679 {
680 int nd;
681
682 switch (policy->policy) {
683 case MPOL_PREFERRED:
684 nd = policy->v.preferred_node;
685 if (nd < 0)
686 nd = numa_node_id();
687 break;
688 case MPOL_BIND:
689 /* Lower zones don't get a policy applied */
690 /* Careful: current->mems_allowed might have moved */
691 if (gfp_zone(gfp) >= policy_zone)
692 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
693 return policy->v.zonelist;
694 /*FALL THROUGH*/
695 case MPOL_INTERLEAVE: /* should not happen */
696 case MPOL_DEFAULT:
697 nd = numa_node_id();
698 break;
699 default:
700 nd = 0;
701 BUG();
702 }
703 return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
704 }
705
706 /* Do dynamic interleaving for a process */
707 static unsigned interleave_nodes(struct mempolicy *policy)
708 {
709 unsigned nid, next;
710 struct task_struct *me = current;
711
712 nid = me->il_next;
713 next = next_node(nid, policy->v.nodes);
714 if (next >= MAX_NUMNODES)
715 next = first_node(policy->v.nodes);
716 me->il_next = next;
717 return nid;
718 }
719
720 /* Do static interleaving for a VMA with known offset. */
721 static unsigned offset_il_node(struct mempolicy *pol,
722 struct vm_area_struct *vma, unsigned long off)
723 {
724 unsigned nnodes = nodes_weight(pol->v.nodes);
725 unsigned target = (unsigned)off % nnodes;
726 int c;
727 int nid = -1;
728
729 c = 0;
730 do {
731 nid = next_node(nid, pol->v.nodes);
732 c++;
733 } while (c <= target);
734 return nid;
735 }
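/*
 * Worked example: with pol->v.nodes = {0,2,5} and off = 7, nnodes = 3 and
 * target = 7 % 3 = 1, so the loop above stops at the second node in the
 * mask and returns node 2.
 */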
736
737 /* Allocate a page in interleaved policy.
738 Own path because it needs to do special accounting. */
739 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
740 unsigned nid)
741 {
742 struct zonelist *zl;
743 struct page *page;
744
745 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
746 page = __alloc_pages(gfp, order, zl);
747 if (page && page_zone(page) == zl->zones[0]) {
748 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
749 put_cpu();
750 }
751 return page;
752 }
753
754 /**
755 * alloc_page_vma - Allocate a page for a VMA.
756 *
757 * @gfp:
758 * %GFP_USER user allocation.
759 * %GFP_KERNEL kernel allocations,
760 * %GFP_HIGHMEM highmem/user allocations,
761 * %GFP_FS allocation should not call back into a file system.
762 * %GFP_ATOMIC don't sleep.
763 *
764 * @vma: Pointer to VMA or NULL if not available.
765 * @addr: Virtual Address of the allocation. Must be inside the VMA.
766 *
767 * This function allocates a page from the kernel page pool and applies
768 * a NUMA policy associated with the VMA or the current process.
769 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
770 * mm_struct of the VMA to prevent it from going away. Should be used for
771 * all allocations for pages that will be mapped into
772 * user space. Returns NULL when no page can be allocated.
773 *
774 * Should be called with the mmap_sem of the vma's mm held.
775 */
776 struct page *
777 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
778 {
779 struct mempolicy *pol = get_vma_policy(current, vma, addr);
780
781 cpuset_update_current_mems_allowed();
782
783 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
784 unsigned nid;
785 if (vma) {
786 unsigned long off;
787 off = vma->vm_pgoff;
788 off += (addr - vma->vm_start) >> PAGE_SHIFT;
789 nid = offset_il_node(pol, vma, off);
790 } else {
791 /* fall back to process interleaving */
792 nid = interleave_nodes(pol);
793 }
794 return alloc_page_interleave(gfp, 0, nid);
795 }
796 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
797 }
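/*
 * Illustrative caller sketch (not from this file): an anonymous-fault path,
 * holding down_read(&mm->mmap_sem), allocating a user page through the VMA
 * policy -- similar in spirit to the anonymous fault path in mm/memory.c.
 */
#if 0	/* example only */
	struct page *page;

	page = alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, address);
	if (!page)
		return VM_FAULT_OOM;
#endif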
798
799 /**
800 * alloc_pages_current - Allocate pages.
801 *
802 * @gfp:
803 * %GFP_USER user allocation,
804 * %GFP_KERNEL kernel allocation,
805 * %GFP_HIGHMEM highmem allocation,
806 * %GFP_FS don't call back into a file system.
807 * %GFP_ATOMIC don't sleep.
808 * @order: Power of two of allocation size in pages. 0 is a single page.
809 *
810 * Allocate a page from the kernel page pool. When not in
811 * interrupt context, the current process' NUMA policy is applied.
812 * Returns NULL when no page can be allocated.
813 *
814 * Don't call cpuset_update_current_mems_allowed() unless
815 * 1) it's ok to take cpuset_sem (can WAIT), and
816 * 2) allocating for current task (not interrupt).
817 */
818 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
819 {
820 struct mempolicy *pol = current->mempolicy;
821
822 if ((gfp & __GFP_WAIT) && !in_interrupt())
823 cpuset_update_current_mems_allowed();
824 if (!pol || in_interrupt())
825 pol = &default_policy;
826 if (pol->policy == MPOL_INTERLEAVE)
827 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
828 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
829 }
830 EXPORT_SYMBOL(alloc_pages_current);
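/*
 * Note: on CONFIG_NUMA kernels of this era the generic alloc_pages(gfp, order)
 * in include/linux/gfp.h expands to alloc_pages_current(), so ordinary kernel
 * allocations outside interrupt context funnel through the process policy here.
 */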
831
832 /* Slow path of a mempolicy copy */
833 struct mempolicy *__mpol_copy(struct mempolicy *old)
834 {
835 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
836
837 if (!new)
838 return ERR_PTR(-ENOMEM);
839 *new = *old;
840 atomic_set(&new->refcnt, 1);
841 if (new->policy == MPOL_BIND) {
842 int sz = ksize(old->v.zonelist);
843 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
844 if (!new->v.zonelist) {
845 kmem_cache_free(policy_cache, new);
846 return ERR_PTR(-ENOMEM);
847 }
848 memcpy(new->v.zonelist, old->v.zonelist, sz);
849 }
850 return new;
851 }
852
853 /* Slow path of a mempolicy comparison */
854 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
855 {
856 if (!a || !b)
857 return 0;
858 if (a->policy != b->policy)
859 return 0;
860 switch (a->policy) {
861 case MPOL_DEFAULT:
862 return 1;
863 case MPOL_INTERLEAVE:
864 return nodes_equal(a->v.nodes, b->v.nodes);
865 case MPOL_PREFERRED:
866 return a->v.preferred_node == b->v.preferred_node;
867 case MPOL_BIND: {
868 int i;
869 for (i = 0; a->v.zonelist->zones[i]; i++)
870 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
871 return 0;
872 return b->v.zonelist->zones[i] == NULL;
873 }
874 default:
875 BUG();
876 return 0;
877 }
878 }
879
880 /* Slow path of a mpol destructor. */
881 void __mpol_free(struct mempolicy *p)
882 {
883 if (!atomic_dec_and_test(&p->refcnt))
884 return;
885 if (p->policy == MPOL_BIND)
886 kfree(p->v.zonelist);
887 p->policy = MPOL_DEFAULT;
888 kmem_cache_free(policy_cache, p);
889 }
890
891 /*
892 * Hugetlb policy. Same as above, just works with node numbers instead of
893 * zonelists.
894 */
895
896 /* Find first node suitable for an allocation */
897 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
898 {
899 struct mempolicy *pol = get_vma_policy(current, vma, addr);
900
901 switch (pol->policy) {
902 case MPOL_DEFAULT:
903 return numa_node_id();
904 case MPOL_BIND:
905 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
906 case MPOL_INTERLEAVE:
907 return interleave_nodes(pol);
908 case MPOL_PREFERRED:
909 return pol->v.preferred_node >= 0 ?
910 pol->v.preferred_node : numa_node_id();
911 }
912 BUG();
913 return 0;
914 }
915
916 /* Find secondary valid nodes for an allocation */
917 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
918 {
919 struct mempolicy *pol = get_vma_policy(current, vma, addr);
920
921 switch (pol->policy) {
922 case MPOL_PREFERRED:
923 case MPOL_DEFAULT:
924 case MPOL_INTERLEAVE:
925 return 1;
926 case MPOL_BIND: {
927 struct zone **z;
928 for (z = pol->v.zonelist->zones; *z; z++)
929 if ((*z)->zone_pgdat->node_id == nid)
930 return 1;
931 return 0;
932 }
933 default:
934 BUG();
935 return 0;
936 }
937 }
938
939 /*
940 * Shared memory backing store policy support.
941 *
942 * Remember policies even when nobody has shared memory mapped.
943 * The policies are kept in Red-Black tree linked from the inode.
944 * They are protected by the sp->lock spinlock, which should be held
945 * for any accesses to the tree.
946 */
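/*
 * Illustrative example (not from this file): after two mbind()s on a tmpfs
 * file the tree might hold the ranges [0,16) -> MPOL_INTERLEAVE and
 * [16,24) -> MPOL_BIND, keyed by page index into the object.  A fault at
 * index 8 then does sp_lookup(sp, 8, 9) under sp->lock and gets the
 * interleave policy back via mpol_shared_policy_lookup().
 */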
947
948 /* lookup first element intersecting start-end */
949 /* Caller holds sp->lock */
950 static struct sp_node *
951 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
952 {
953 struct rb_node *n = sp->root.rb_node;
954
955 while (n) {
956 struct sp_node *p = rb_entry(n, struct sp_node, nd);
957
958 if (start >= p->end)
959 n = n->rb_right;
960 else if (end <= p->start)
961 n = n->rb_left;
962 else
963 break;
964 }
965 if (!n)
966 return NULL;
967 for (;;) {
968 struct sp_node *w = NULL;
969 struct rb_node *prev = rb_prev(n);
970 if (!prev)
971 break;
972 w = rb_entry(prev, struct sp_node, nd);
973 if (w->end <= start)
974 break;
975 n = prev;
976 }
977 return rb_entry(n, struct sp_node, nd);
978 }
979
980 /* Insert a new shared policy into the list. */
981 /* Caller holds sp->lock */
982 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
983 {
984 struct rb_node **p = &sp->root.rb_node;
985 struct rb_node *parent = NULL;
986 struct sp_node *nd;
987
988 while (*p) {
989 parent = *p;
990 nd = rb_entry(parent, struct sp_node, nd);
991 if (new->start < nd->start)
992 p = &(*p)->rb_left;
993 else if (new->end > nd->end)
994 p = &(*p)->rb_right;
995 else
996 BUG();
997 }
998 rb_link_node(&new->nd, parent, p);
999 rb_insert_color(&new->nd, &sp->root);
1000 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1001 new->policy ? new->policy->policy : 0);
1002 }
1003
1004 /* Find shared policy intersecting idx */
1005 struct mempolicy *
1006 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1007 {
1008 struct mempolicy *pol = NULL;
1009 struct sp_node *sn;
1010
1011 if (!sp->root.rb_node)
1012 return NULL;
1013 spin_lock(&sp->lock);
1014 sn = sp_lookup(sp, idx, idx+1);
1015 if (sn) {
1016 mpol_get(sn->policy);
1017 pol = sn->policy;
1018 }
1019 spin_unlock(&sp->lock);
1020 return pol;
1021 }
1022
1023 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1024 {
1025 PDprintk("deleting %lx-l%x\n", n->start, n->end);
1026 rb_erase(&n->nd, &sp->root);
1027 mpol_free(n->policy);
1028 kmem_cache_free(sn_cache, n);
1029 }
1030
1031 struct sp_node *
1032 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1033 {
1034 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1035
1036 if (!n)
1037 return NULL;
1038 n->start = start;
1039 n->end = end;
1040 mpol_get(pol);
1041 n->policy = pol;
1042 return n;
1043 }
1044
1045 /* Replace a policy range. */
1046 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1047 unsigned long end, struct sp_node *new)
1048 {
1049 struct sp_node *n, *new2 = NULL;
1050
1051 restart:
1052 spin_lock(&sp->lock);
1053 n = sp_lookup(sp, start, end);
1054 /* Take care of old policies in the same range. */
1055 while (n && n->start < end) {
1056 struct rb_node *next = rb_next(&n->nd);
1057 if (n->start >= start) {
1058 if (n->end <= end)
1059 sp_delete(sp, n);
1060 else
1061 n->start = end;
1062 } else {
1063 /* Old policy spanning whole new range. */
1064 if (n->end > end) {
1065 if (!new2) {
1066 spin_unlock(&sp->lock);
1067 new2 = sp_alloc(end, n->end, n->policy);
1068 if (!new2)
1069 return -ENOMEM;
1070 goto restart;
1071 }
1072 n->end = start;
1073 sp_insert(sp, new2);
1074 new2 = NULL;
1075 break;
1076 } else
1077 n->end = start;
1078 }
1079 if (!next)
1080 break;
1081 n = rb_entry(next, struct sp_node, nd);
1082 }
1083 if (new)
1084 sp_insert(sp, new);
1085 spin_unlock(&sp->lock);
1086 if (new2) {
1087 mpol_free(new2->policy);
1088 kmem_cache_free(sn_cache, new2);
1089 }
1090 return 0;
1091 }
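/*
 * Note on the restart above: sp->lock is a spinlock, so when an old range
 * spanning the whole new range has to be split, the second half (new2) is
 * allocated with the lock dropped and the lookup is redone from scratch
 * before it is inserted.
 */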
1092
1093 int mpol_set_shared_policy(struct shared_policy *info,
1094 struct vm_area_struct *vma, struct mempolicy *npol)
1095 {
1096 int err;
1097 struct sp_node *new = NULL;
1098 unsigned long sz = vma_pages(vma);
1099
1100 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1101 vma->vm_pgoff,
1102 sz, npol? npol->policy : -1,
1103 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1104
1105 if (npol) {
1106 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1107 if (!new)
1108 return -ENOMEM;
1109 }
1110 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1111 if (err && new)
1112 kmem_cache_free(sn_cache, new);
1113 return err;
1114 }
1115
1116 /* Free a backing policy store on inode delete. */
1117 void mpol_free_shared_policy(struct shared_policy *p)
1118 {
1119 struct sp_node *n;
1120 struct rb_node *next;
1121
1122 if (!p->root.rb_node)
1123 return;
1124 spin_lock(&p->lock);
1125 next = rb_first(&p->root);
1126 while (next) {
1127 n = rb_entry(next, struct sp_node, nd);
1128 next = rb_next(&n->nd);
1129 rb_erase(&n->nd, &p->root);
1130 mpol_free(n->policy);
1131 kmem_cache_free(sn_cache, n);
1132 }
1133 spin_unlock(&p->lock);
1134 }
1135
1136 /* assumes fs == KERNEL_DS */
1137 void __init numa_policy_init(void)
1138 {
1139 policy_cache = kmem_cache_create("numa_policy",
1140 sizeof(struct mempolicy),
1141 0, SLAB_PANIC, NULL, NULL);
1142
1143 sn_cache = kmem_cache_create("shared_policy_node",
1144 sizeof(struct sp_node),
1145 0, SLAB_PANIC, NULL, NULL);
1146
1147 /* Set interleaving policy for system init. This way not all
1148 the data structures allocated at system boot end up in node zero. */
1149
1150 if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
1151 MAX_NUMNODES) < 0)
1152 printk(KERN_ERR "numa_policy_init: interleaving failed\n");
1153 }
1154
1155 /* Reset policy of current process to default.
1156 * Assumes fs == KERNEL_DS */
1157 void numa_default_policy(void)
1158 {
1159 sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
1160 }