mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
   19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
   49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
   65    kernel does not always handle that gracefully.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <asm/tlbflush.h>
  87 #include <asm/uaccess.h>
  88
  89 static kmem_cache_t *policy_cache;
  90 static kmem_cache_t *sn_cache;
  91
  92 #define PDprintk(fmt...)
  93
  94 /* Highest zone. An specific allocation for a zone below that is not
  95    policied. */
  96 static int policy_zone;
  97
  98 struct mempolicy default_policy = {
  99         .refcnt = ATOMIC_INIT(1), /* never free it */
 100         .policy = MPOL_DEFAULT,
 101 };
 102
 103 /* Do sanity checking on a policy */
 104 static int mpol_check_policy(int mode, nodemask_t *nodes)
 105 {
 106         int empty = nodes_empty(*nodes);
 107
 108         switch (mode) {
 109         case MPOL_DEFAULT:
 110                 if (!empty)
 111                         return -EINVAL;
 112                 break;
 113         case MPOL_BIND:
 114         case MPOL_INTERLEAVE:
 115                 /* Preferred will only use the first bit, but allow
 116                    more for now. */
 117                 if (empty)
 118                         return -EINVAL;
 119                 break;
 120         }
 121         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 122 }
 123 /* Generate a custom zonelist for the BIND policy. */
 124 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 125 {
 126         struct zonelist *zl;
 127         int num, max, nd;
 128
 129         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 130         zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 131         if (!zl)
 132                 return NULL;
 133         num = 0;
 134         for_each_node_mask(nd, *nodes) {
 135                 int k;
 136                 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
 137                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
 138                         if (!z->present_pages)
 139                                 continue;
 140                         zl->zones[num++] = z;
 141                         if (k > policy_zone)
 142                                 policy_zone = k;
 143                 }
 144         }
 145         zl->zones[num] = NULL;
 146         return zl;
 147 }
 148
 149 /* Create a new policy */
 150 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 151 {
 152         struct mempolicy *policy;
 153
 154         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 155         if (mode == MPOL_DEFAULT)
 156                 return NULL;
 157         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 158         if (!policy)
 159                 return ERR_PTR(-ENOMEM);
 160         atomic_set(&policy->refcnt, 1);
 161         switch (mode) {
 162         case MPOL_INTERLEAVE:
 163                 policy->v.nodes = *nodes;
 164                 break;
 165         case MPOL_PREFERRED:
 166                 policy->v.preferred_node = first_node(*nodes);
 167                 if (policy->v.preferred_node >= MAX_NUMNODES)
 168                         policy->v.preferred_node = -1;
 169                 break;
 170         case MPOL_BIND:
 171                 policy->v.zonelist = bind_zonelist(nodes);
 172                 if (policy->v.zonelist == NULL) {
 173                         kmem_cache_free(policy_cache, policy);
 174                         return ERR_PTR(-ENOMEM);
 175                 }
 176                 break;
 177         }
 178         policy->policy = mode;
 179         return policy;
 180 }
 181
 182 /* Ensure all existing pages follow the policy. */
 183 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 184                 unsigned long addr, unsigned long end, nodemask_t *nodes)
 185 {
 186         pte_t *orig_pte;
 187         pte_t *pte;
 188         spinlock_t *ptl;
 189
 190         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 191         do {
 192                 struct page *page;
 193                 unsigned int nid;
 194
 195                 if (!pte_present(*pte))
 196                         continue;
 197                 page = vm_normal_page(vma, addr, *pte);
 198                 if (!page)
 199                         continue;
 200                 nid = page_to_nid(page);
 201                 if (!node_isset(nid, *nodes))
 202                         break;
 203         } while (pte++, addr += PAGE_SIZE, addr != end);
 204         pte_unmap_unlock(orig_pte, ptl);
 205         return addr != end;
 206 }
 207
 208 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 209                 unsigned long addr, unsigned long end, nodemask_t *nodes)
 210 {
 211         pmd_t *pmd;
 212         unsigned long next;
 213
 214         pmd = pmd_offset(pud, addr);
 215         do {
 216                 next = pmd_addr_end(addr, end);
 217                 if (pmd_none_or_clear_bad(pmd))
 218                         continue;
 219                 if (check_pte_range(vma, pmd, addr, next, nodes))
 220                         return -EIO;
 221         } while (pmd++, addr = next, addr != end);
 222         return 0;
 223 }
 224
 225 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 226                 unsigned long addr, unsigned long end, nodemask_t *nodes)
 227 {
 228         pud_t *pud;
 229         unsigned long next;
 230
 231         pud = pud_offset(pgd, addr);
 232         do {
 233                 next = pud_addr_end(addr, end);
 234                 if (pud_none_or_clear_bad(pud))
 235                         continue;
 236                 if (check_pmd_range(vma, pud, addr, next, nodes))
 237                         return -EIO;
 238         } while (pud++, addr = next, addr != end);
 239         return 0;
 240 }
 241
 242 static inline int check_pgd_range(struct vm_area_struct *vma,
 243                 unsigned long addr, unsigned long end, nodemask_t *nodes)
 244 {
 245         pgd_t *pgd;
 246         unsigned long next;
 247
 248         pgd = pgd_offset(vma->vm_mm, addr);
 249         do {
 250                 next = pgd_addr_end(addr, end);
 251                 if (pgd_none_or_clear_bad(pgd))
 252                         continue;
 253                 if (check_pud_range(vma, pgd, addr, next, nodes))
 254                         return -EIO;
 255         } while (pgd++, addr = next, addr != end);
 256         return 0;
 257 }
 258
 259 /* Step 1: check the range */
 260 static struct vm_area_struct *
 261 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 262             nodemask_t *nodes, unsigned long flags)
 263 {
 264         int err;
 265         struct vm_area_struct *first, *vma, *prev;
 266
 267         first = find_vma(mm, start);
 268         if (!first)
 269                 return ERR_PTR(-EFAULT);
 270         prev = NULL;
 271         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 272                 if (!vma->vm_next && vma->vm_end < end)
 273                         return ERR_PTR(-EFAULT);
 274                 if (prev && prev->vm_end < vma->vm_start)
 275                         return ERR_PTR(-EFAULT);
 276                 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
 277                         unsigned long endvma = vma->vm_end;
 278                         if (endvma > end)
 279                                 endvma = end;
 280                         if (vma->vm_start > start)
 281                                 start = vma->vm_start;
 282                         err = check_pgd_range(vma, start, endvma, nodes);
 283                         if (err) {
 284                                 first = ERR_PTR(err);
 285                                 break;
 286                         }
 287                 }
 288                 prev = vma;
 289         }
 290         return first;
 291 }
 292
 293 /* Apply policy to a single VMA */
 294 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 295 {
 296         int err = 0;
 297         struct mempolicy *old = vma->vm_policy;
 298
 299         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 300                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 301                  vma->vm_ops, vma->vm_file,
 302                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 303
 304         if (vma->vm_ops && vma->vm_ops->set_policy)
 305                 err = vma->vm_ops->set_policy(vma, new);
 306         if (!err) {
 307                 mpol_get(new);
 308                 vma->vm_policy = new;
 309                 mpol_free(old);
 310         }
 311         return err;
 312 }
 313
 314 /* Step 2: apply policy to a range and do splits. */
 315 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 316                        unsigned long end, struct mempolicy *new)
 317 {
 318         struct vm_area_struct *next;
 319         int err;
 320
 321         err = 0;
 322         for (; vma && vma->vm_start < end; vma = next) {
 323                 next = vma->vm_next;
 324                 if (vma->vm_start < start)
 325                         err = split_vma(vma->vm_mm, vma, start, 1);
 326                 if (!err && vma->vm_end > end)
 327                         err = split_vma(vma->vm_mm, vma, end, 0);
 328                 if (!err)
 329                         err = policy_vma(vma, new);
 330                 if (err)
 331                         break;
 332         }
 333         return err;
 334 }
 335
 336 static int contextualize_policy(int mode, nodemask_t *nodes)
 337 {
 338         if (!nodes)
 339                 return 0;
 340
 341         /* Update current mems_allowed */
 342         cpuset_update_current_mems_allowed();
 343         /* Ignore nodes not set in current->mems_allowed */
 344         cpuset_restrict_to_mems_allowed(nodes->bits);
 345         return mpol_check_policy(mode, nodes);
 346 }
 347
 348 long do_mbind(unsigned long start, unsigned long len,
 349                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 350 {
 351         struct vm_area_struct *vma;
 352         struct mm_struct *mm = current->mm;
 353         struct mempolicy *new;
 354         unsigned long end;
 355         int err;
 356
 357         if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
 358                 return -EINVAL;
 359         if (start & ~PAGE_MASK)
 360                 return -EINVAL;
 361         if (mode == MPOL_DEFAULT)
 362                 flags &= ~MPOL_MF_STRICT;
 363         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 364         end = start + len;
 365         if (end < start)
 366                 return -EINVAL;
 367         if (end == start)
 368                 return 0;
 369         if (mpol_check_policy(mode, nmask))
 370                 return -EINVAL;
 371         new = mpol_new(mode, nmask);
 372         if (IS_ERR(new))
 373                 return PTR_ERR(new);
 374
 375         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 376                         mode,nodes_addr(nodes)[0]);
 377
 378         down_write(&mm->mmap_sem);
 379         vma = check_range(mm, start, end, nmask, flags);
 380         err = PTR_ERR(vma);
 381         if (!IS_ERR(vma))
 382                 err = mbind_range(vma, start, end, new);
 383         up_write(&mm->mmap_sem);
 384         mpol_free(new);
 385         return err;
 386 }
 387
 388 /* Set the process memory policy */
 389 long do_set_mempolicy(int mode, nodemask_t *nodes)
 390 {
 391         struct mempolicy *new;
 392
 393         if (contextualize_policy(mode, nodes))
 394                 return -EINVAL;
 395         new = mpol_new(mode, nodes);
 396         if (IS_ERR(new))
 397                 return PTR_ERR(new);
 398         mpol_free(current->mempolicy);
 399         current->mempolicy = new;
 400         if (new && new->policy == MPOL_INTERLEAVE)
 401                 current->il_next = first_node(new->v.nodes);
 402         return 0;
 403 }
 404
 405 /* Fill a zone bitmap for a policy */
 406 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 407 {
 408         int i;
 409
 410         nodes_clear(*nodes);
 411         switch (p->policy) {
 412         case MPOL_BIND:
 413                 for (i = 0; p->v.zonelist->zones[i]; i++)
 414                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 415                                 *nodes);
 416                 break;
 417         case MPOL_DEFAULT:
 418                 break;
 419         case MPOL_INTERLEAVE:
 420                 *nodes = p->v.nodes;
 421                 break;
 422         case MPOL_PREFERRED:
 423                 /* or use current node instead of online map? */
 424                 if (p->v.preferred_node < 0)
 425                         *nodes = node_online_map;
 426                 else
 427                         node_set(p->v.preferred_node, *nodes);
 428                 break;
 429         default:
 430                 BUG();
 431         }
 432 }
 433
 434 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 435 {
 436         struct page *p;
 437         int err;
 438
 439         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 440         if (err >= 0) {
 441                 err = page_to_nid(p);
 442                 put_page(p);
 443         }
 444         return err;
 445 }
 446
 447 /* Retrieve NUMA policy */
 448 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 449                         unsigned long addr, unsigned long flags)
 450 {
 451         int err;
 452         struct mm_struct *mm = current->mm;
 453         struct vm_area_struct *vma = NULL;
 454         struct mempolicy *pol = current->mempolicy;
 455
 456         cpuset_update_current_mems_allowed();
 457         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 458                 return -EINVAL;
 459         if (flags & MPOL_F_ADDR) {
 460                 down_read(&mm->mmap_sem);
 461                 vma = find_vma_intersection(mm, addr, addr+1);
 462                 if (!vma) {
 463                         up_read(&mm->mmap_sem);
 464                         return -EFAULT;
 465                 }
 466                 if (vma->vm_ops && vma->vm_ops->get_policy)
 467                         pol = vma->vm_ops->get_policy(vma, addr);
 468                 else
 469                         pol = vma->vm_policy;
 470         } else if (addr)
 471                 return -EINVAL;
 472
 473         if (!pol)
 474                 pol = &default_policy;
 475
 476         if (flags & MPOL_F_NODE) {
 477                 if (flags & MPOL_F_ADDR) {
 478                         err = lookup_node(mm, addr);
 479                         if (err < 0)
 480                                 goto out;
 481                         *policy = err;
 482                 } else if (pol == current->mempolicy &&
 483                                 pol->policy == MPOL_INTERLEAVE) {
 484                         *policy = current->il_next;
 485                 } else {
 486                         err = -EINVAL;
 487                         goto out;
 488                 }
 489         } else
 490                 *policy = pol->policy;
 491
 492         if (vma) {
 493                 up_read(&current->mm->mmap_sem);
 494                 vma = NULL;
 495         }
 496
 497         err = 0;
 498         if (nmask)
 499                 get_zonemask(pol, nmask);
 500
 501  out:
 502         if (vma)
 503                 up_read(&current->mm->mmap_sem);
 504         return err;
 505 }
 506
 507 /*
 508  * User space interface with variable sized bitmaps for nodelists.
 509  */
 510
 511 /* Copy a node mask from user space. */
 512 static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
 513                      unsigned long maxnode)
 514 {
 515         unsigned long k;
 516         unsigned long nlongs;
 517         unsigned long endmask;
 518
 519         --maxnode;
 520         nodes_clear(*nodes);
 521         if (maxnode == 0 || !nmask)
 522                 return 0;
 523
 524         nlongs = BITS_TO_LONGS(maxnode);
 525         if ((maxnode % BITS_PER_LONG) == 0)
 526                 endmask = ~0UL;
 527         else
 528                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 529
 530         /* When the user specified more nodes than supported just check
 531            if the non supported part is all zero. */
 532         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 533                 if (nlongs > PAGE_SIZE/sizeof(long))
 534                         return -EINVAL;
 535                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 536                         unsigned long t;
 537                         if (get_user(t, nmask + k))
 538                                 return -EFAULT;
 539                         if (k == nlongs - 1) {
 540                                 if (t & endmask)
 541                                         return -EINVAL;
 542                         } else if (t)
 543                                 return -EINVAL;
 544                 }
 545                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 546                 endmask = ~0UL;
 547         }
 548
 549         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 550                 return -EFAULT;
 551         nodes_addr(*nodes)[nlongs-1] &= endmask;
 552         return 0;
 553 }
 554
 555 /* Copy a kernel node mask to user space */
 556 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 557                               nodemask_t *nodes)
 558 {
 559         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 560         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 561
 562         if (copy > nbytes) {
 563                 if (copy > PAGE_SIZE)
 564                         return -EINVAL;
 565                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 566                         return -EFAULT;
 567                 copy = nbytes;
 568         }
 569         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 570 }
 571
 572 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 573                         unsigned long mode,
 574                         unsigned long __user *nmask, unsigned long maxnode,
 575                         unsigned flags)
 576 {
 577         nodemask_t nodes;
 578         int err;
 579
 580         err = get_nodes(&nodes, nmask, maxnode);
 581         if (err)
 582                 return err;
 583         return do_mbind(start, len, mode, &nodes, flags);
 584 }
 585
 586 /* Set the process memory policy */
 587 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 588                 unsigned long maxnode)
 589 {
 590         int err;
 591         nodemask_t nodes;
 592
 593         if (mode < 0 || mode > MPOL_MAX)
 594                 return -EINVAL;
 595         err = get_nodes(&nodes, nmask, maxnode);
 596         if (err)
 597                 return err;
 598         return do_set_mempolicy(mode, &nodes);
 599 }
 600
 601 /* Retrieve NUMA policy */
 602 asmlinkage long sys_get_mempolicy(int __user *policy,
 603                                 unsigned long __user *nmask,
 604                                 unsigned long maxnode,
 605                                 unsigned long addr, unsigned long flags)
 606 {
 607         int err, pval;
 608         nodemask_t nodes;
 609
 610         if (nmask != NULL && maxnode < MAX_NUMNODES)
 611                 return -EINVAL;
 612
 613         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 614
 615         if (err)
 616                 return err;
 617
 618         if (policy && put_user(pval, policy))
 619                 return -EFAULT;
 620
 621         if (nmask)
 622                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 623
 624         return err;
 625 }
 626
 627 #ifdef CONFIG_COMPAT
 628
 629 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 630                                      compat_ulong_t __user *nmask,
 631                                      compat_ulong_t maxnode,
 632                                      compat_ulong_t addr, compat_ulong_t flags)
 633 {
 634         long err;
 635         unsigned long __user *nm = NULL;
 636         unsigned long nr_bits, alloc_size;
 637         DECLARE_BITMAP(bm, MAX_NUMNODES);
 638
 639         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 640         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 641
 642         if (nmask)
 643                 nm = compat_alloc_user_space(alloc_size);
 644
 645         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
 646
 647         if (!err && nmask) {
 648                 err = copy_from_user(bm, nm, alloc_size);
 649                 /* ensure entire bitmap is zeroed */
 650                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
 651                 err |= compat_put_bitmap(nmask, bm, nr_bits);
 652         }
 653
 654         return err;
 655 }
 656
 657 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
 658                                      compat_ulong_t maxnode)
 659 {
 660         long err = 0;
 661         unsigned long __user *nm = NULL;
 662         unsigned long nr_bits, alloc_size;
 663         DECLARE_BITMAP(bm, MAX_NUMNODES);
 664
 665         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 666         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 667
 668         if (nmask) {
 669                 err = compat_get_bitmap(bm, nmask, nr_bits);
 670                 nm = compat_alloc_user_space(alloc_size);
 671                 err |= copy_to_user(nm, bm, alloc_size);
 672         }
 673
 674         if (err)
 675                 return -EFAULT;
 676
 677         return sys_set_mempolicy(mode, nm, nr_bits+1);
 678 }
 679
 680 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 681                              compat_ulong_t mode, compat_ulong_t __user *nmask,
 682                              compat_ulong_t maxnode, compat_ulong_t flags)
 683 {
 684         long err = 0;
 685         unsigned long __user *nm = NULL;
 686         unsigned long nr_bits, alloc_size;
 687         nodemask_t bm;
 688
 689         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 690         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 691
 692         if (nmask) {
 693                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
 694                 nm = compat_alloc_user_space(alloc_size);
 695                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
 696         }
 697
 698         if (err)
 699                 return -EFAULT;
 700
 701         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
 702 }
 703
 704 #endif
 705
 706 /* Return effective policy for a VMA */
 707 struct mempolicy *
 708 get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
 709 {
 710         struct mempolicy *pol = task->mempolicy;
 711
 712         if (vma) {
 713                 if (vma->vm_ops && vma->vm_ops->get_policy)
 714                         pol = vma->vm_ops->get_policy(vma, addr);
 715                 else if (vma->vm_policy &&
 716                                 vma->vm_policy->policy != MPOL_DEFAULT)
 717                         pol = vma->vm_policy;
 718         }
 719         if (!pol)
 720                 pol = &default_policy;
 721         return pol;
 722 }
 723
 724 /* Return a zonelist representing a mempolicy */
 725 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 726 {
 727         int nd;
 728
 729         switch (policy->policy) {
 730         case MPOL_PREFERRED:
 731                 nd = policy->v.preferred_node;
 732                 if (nd < 0)
 733                         nd = numa_node_id();
 734                 break;
 735         case MPOL_BIND:
 736                 /* Lower zones don't get a policy applied */
 737                 /* Careful: current->mems_allowed might have moved */
 738                 if (gfp_zone(gfp) >= policy_zone)
 739                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
 740                                 return policy->v.zonelist;
 741                 /*FALL THROUGH*/
 742         case MPOL_INTERLEAVE: /* should not happen */
 743         case MPOL_DEFAULT:
 744                 nd = numa_node_id();
 745                 break;
 746         default:
 747                 nd = 0;
 748                 BUG();
 749         }
 750         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
 751 }
 752
 753 /* Do dynamic interleaving for a process */
 754 static unsigned interleave_nodes(struct mempolicy *policy)
 755 {
 756         unsigned nid, next;
 757         struct task_struct *me = current;
 758
 759         nid = me->il_next;
 760         next = next_node(nid, policy->v.nodes);
 761         if (next >= MAX_NUMNODES)
 762                 next = first_node(policy->v.nodes);
 763         me->il_next = next;
 764         return nid;
 765 }
 766
 767 /* Do static interleaving for a VMA with known offset. */
 768 static unsigned offset_il_node(struct mempolicy *pol,
 769                 struct vm_area_struct *vma, unsigned long off)
 770 {
 771         unsigned nnodes = nodes_weight(pol->v.nodes);
 772         unsigned target = (unsigned)off % nnodes;
 773         int c;
 774         int nid = -1;
 775
 776         c = 0;
 777         do {
 778                 nid = next_node(nid, pol->v.nodes);
 779                 c++;
 780         } while (c <= target);
 781         return nid;
 782 }
 783
 784 /* Allocate a page in interleaved policy.
 785    Own path because it needs to do special accounting. */
 786 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 787                                         unsigned nid)
 788 {
 789         struct zonelist *zl;
 790         struct page *page;
 791
 792         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
 793         page = __alloc_pages(gfp, order, zl);
 794         if (page && page_zone(page) == zl->zones[0]) {
 795                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
 796                 put_cpu();
 797         }
 798         return page;
 799 }
 800
 801 /**
 802  *      alloc_page_vma  - Allocate a page for a VMA.
 803  *
 804  *      @gfp:
 805  *      %GFP_USER    user allocation.
 806  *      %GFP_KERNEL  kernel allocations,
 807  *      %GFP_HIGHMEM highmem/user allocations,
 808  *      %GFP_FS      allocation should not call back into a file system.
 809  *      %GFP_ATOMIC  don't sleep.
 810  *
 811  *      @vma:  Pointer to VMA or NULL if not available.
 812  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
 813  *
 814  *      This function allocates a page from the kernel page pool and applies
 815  *      a NUMA policy associated with the VMA or the current process.
 816  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
 817  *      mm_struct of the VMA to prevent it from going away. Should be used for
 818  *      all allocations for pages that will be mapped into
 819  *      user space. Returns NULL when no page can be allocated.
 820  *
 821  *      Should be called with the mm_sem of the vma hold.
 822  */
 823 struct page *
 824 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 825 {
 826         struct mempolicy *pol = get_vma_policy(current, vma, addr);
 827
 828         cpuset_update_current_mems_allowed();
 829
 830         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
 831                 unsigned nid;
 832                 if (vma) {
 833                         unsigned long off;
 834                         off = vma->vm_pgoff;
 835                         off += (addr - vma->vm_start) >> PAGE_SHIFT;
 836                         nid = offset_il_node(pol, vma, off);
 837                 } else {
 838                         /* fall back to process interleaving */
 839                         nid = interleave_nodes(pol);
 840                 }
 841                 return alloc_page_interleave(gfp, 0, nid);
 842         }
 843         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
 844 }
 845
 846 /**
 847  *      alloc_pages_current - Allocate pages.
 848  *
 849  *      @gfp:
 850  *              %GFP_USER   user allocation,
 851  *              %GFP_KERNEL kernel allocation,
 852  *              %GFP_HIGHMEM highmem allocation,
 853  *              %GFP_FS     don't call back into a file system.
 854  *              %GFP_ATOMIC don't sleep.
 855  *      @order: Power of two of allocation size in pages. 0 is a single page.
 856  *
 857  *      Allocate a page from the kernel page pool.  When not in
 858  *      interrupt context and apply the current process NUMA policy.
 859  *      Returns NULL when no page can be allocated.
 860  *
 861  *      Don't call cpuset_update_current_mems_allowed() unless
 862  *      1) it's ok to take cpuset_sem (can WAIT), and
 863  *      2) allocating for current task (not interrupt).
 864  */
 865 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 866 {
 867         struct mempolicy *pol = current->mempolicy;
 868
 869         if ((gfp & __GFP_WAIT) && !in_interrupt())
 870                 cpuset_update_current_mems_allowed();
 871         if (!pol || in_interrupt())
 872                 pol = &default_policy;
 873         if (pol->policy == MPOL_INTERLEAVE)
 874                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
 875         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
 876 }
 877 EXPORT_SYMBOL(alloc_pages_current);
 878
 879 /* Slow path of a mempolicy copy */
 880 struct mempolicy *__mpol_copy(struct mempolicy *old)
 881 {
 882         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 883
 884         if (!new)
 885                 return ERR_PTR(-ENOMEM);
 886         *new = *old;
 887         atomic_set(&new->refcnt, 1);
 888         if (new->policy == MPOL_BIND) {
 889                 int sz = ksize(old->v.zonelist);
 890                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
 891                 if (!new->v.zonelist) {
 892                         kmem_cache_free(policy_cache, new);
 893                         return ERR_PTR(-ENOMEM);
 894                 }
 895                 memcpy(new->v.zonelist, old->v.zonelist, sz);
 896         }
 897         return new;
 898 }
 899
 900 /* Slow path of a mempolicy comparison */
 901 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 902 {
 903         if (!a || !b)
 904                 return 0;
 905         if (a->policy != b->policy)
 906                 return 0;
 907         switch (a->policy) {
 908         case MPOL_DEFAULT:
 909                 return 1;
 910         case MPOL_INTERLEAVE:
 911                 return nodes_equal(a->v.nodes, b->v.nodes);
 912         case MPOL_PREFERRED:
 913                 return a->v.preferred_node == b->v.preferred_node;
 914         case MPOL_BIND: {
 915                 int i;
 916                 for (i = 0; a->v.zonelist->zones[i]; i++)
 917                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
 918                                 return 0;
 919                 return b->v.zonelist->zones[i] == NULL;
 920         }
 921         default:
 922                 BUG();
 923                 return 0;
 924         }
 925 }
 926
 927 /* Slow path of a mpol destructor. */
 928 void __mpol_free(struct mempolicy *p)
 929 {
 930         if (!atomic_dec_and_test(&p->refcnt))
 931                 return;
 932         if (p->policy == MPOL_BIND)
 933                 kfree(p->v.zonelist);
 934         p->policy = MPOL_DEFAULT;
 935         kmem_cache_free(policy_cache, p);
 936 }
 937
 938 /*
 939  * Hugetlb policy. Same as above, just works with node numbers instead of
 940  * zonelists.
 941  */
 942
 943 /* Find first node suitable for an allocation */
 944 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
 945 {
 946         struct mempolicy *pol = get_vma_policy(current, vma, addr);
 947
 948         switch (pol->policy) {
 949         case MPOL_DEFAULT:
 950                 return numa_node_id();
 951         case MPOL_BIND:
 952                 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
 953         case MPOL_INTERLEAVE:
 954                 return interleave_nodes(pol);
 955         case MPOL_PREFERRED:
 956                 return pol->v.preferred_node >= 0 ?
 957                                 pol->v.preferred_node : numa_node_id();
 958         }
 959         BUG();
 960         return 0;
 961 }
 962
 963 /* Find secondary valid nodes for an allocation */
 964 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
 965 {
 966         struct mempolicy *pol = get_vma_policy(current, vma, addr);
 967
 968         switch (pol->policy) {
 969         case MPOL_PREFERRED:
 970         case MPOL_DEFAULT:
 971         case MPOL_INTERLEAVE:
 972                 return 1;
 973         case MPOL_BIND: {
 974                 struct zone **z;
 975                 for (z = pol->v.zonelist->zones; *z; z++)
 976                         if ((*z)->zone_pgdat->node_id == nid)
 977                                 return 1;
 978                 return 0;
 979         }
 980         default:
 981                 BUG();
 982                 return 0;
 983         }
 984 }
 985
 986 /*
 987  * Shared memory backing store policy support.
 988  *
 989  * Remember policies even when nobody has shared memory mapped.
 990  * The policies are kept in Red-Black tree linked from the inode.
 991  * They are protected by the sp->lock spinlock, which should be held
 992  * for any accesses to the tree.
 993  */
 994
 995 /* lookup first element intersecting start-end */
 996 /* Caller holds sp->lock */
 997 static struct sp_node *
 998 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
 999 {
1000         struct rb_node *n = sp->root.rb_node;
1001
1002         while (n) {
1003                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1004
1005                 if (start >= p->end)
1006                         n = n->rb_right;
1007                 else if (end <= p->start)
1008                         n = n->rb_left;
1009                 else
1010                         break;
1011         }
1012         if (!n)
1013                 return NULL;
1014         for (;;) {
1015                 struct sp_node *w = NULL;
1016                 struct rb_node *prev = rb_prev(n);
1017                 if (!prev)
1018                         break;
1019                 w = rb_entry(prev, struct sp_node, nd);
1020                 if (w->end <= start)
1021                         break;
1022                 n = prev;
1023         }
1024         return rb_entry(n, struct sp_node, nd);
1025 }
1026
1027 /* Insert a new shared policy into the list. */
1028 /* Caller holds sp->lock */
1029 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1030 {
1031         struct rb_node **p = &sp->root.rb_node;
1032         struct rb_node *parent = NULL;
1033         struct sp_node *nd;
1034
1035         while (*p) {
1036                 parent = *p;
1037                 nd = rb_entry(parent, struct sp_node, nd);
1038                 if (new->start < nd->start)
1039                         p = &(*p)->rb_left;
1040                 else if (new->end > nd->end)
1041                         p = &(*p)->rb_right;
1042                 else
1043                         BUG();
1044         }
1045         rb_link_node(&new->nd, parent, p);
1046         rb_insert_color(&new->nd, &sp->root);
1047         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1048                  new->policy ? new->policy->policy : 0);
1049 }
1050
1051 /* Find shared policy intersecting idx */
1052 struct mempolicy *
1053 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1054 {
1055         struct mempolicy *pol = NULL;
1056         struct sp_node *sn;
1057
1058         if (!sp->root.rb_node)
1059                 return NULL;
1060         spin_lock(&sp->lock);
1061         sn = sp_lookup(sp, idx, idx+1);
1062         if (sn) {
1063                 mpol_get(sn->policy);
1064                 pol = sn->policy;
1065         }
1066         spin_unlock(&sp->lock);
1067         return pol;
1068 }
1069
1070 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1071 {
1072         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1073         rb_erase(&n->nd, &sp->root);
1074         mpol_free(n->policy);
1075         kmem_cache_free(sn_cache, n);
1076 }
1077
1078 struct sp_node *
1079 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1080 {
1081         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1082
1083         if (!n)
1084                 return NULL;
1085         n->start = start;
1086         n->end = end;
1087         mpol_get(pol);
1088         n->policy = pol;
1089         return n;
1090 }
1091
1092 /* Replace a policy range. */
1093 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1094                                  unsigned long end, struct sp_node *new)
1095 {
1096         struct sp_node *n, *new2 = NULL;
1097
1098 restart:
1099         spin_lock(&sp->lock);
1100         n = sp_lookup(sp, start, end);
1101         /* Take care of old policies in the same range. */
1102         while (n && n->start < end) {
1103                 struct rb_node *next = rb_next(&n->nd);
1104                 if (n->start >= start) {
1105                         if (n->end <= end)
1106                                 sp_delete(sp, n);
1107                         else
1108                                 n->start = end;
1109                 } else {
1110                         /* Old policy spanning whole new range. */
1111                         if (n->end > end) {
1112                                 if (!new2) {
1113                                         spin_unlock(&sp->lock);
1114                                         new2 = sp_alloc(end, n->end, n->policy);
1115                                         if (!new2)
1116                                                 return -ENOMEM;
1117                                         goto restart;
1118                                 }
1119                                 n->end = start;
1120                                 sp_insert(sp, new2);
1121                                 new2 = NULL;
1122                                 break;
1123                         } else
1124                                 n->end = start;
1125                 }
1126                 if (!next)
1127                         break;
1128                 n = rb_entry(next, struct sp_node, nd);
1129         }
1130         if (new)
1131                 sp_insert(sp, new);
1132         spin_unlock(&sp->lock);
1133         if (new2) {
1134                 mpol_free(new2->policy);
1135                 kmem_cache_free(sn_cache, new2);
1136         }
1137         return 0;
1138 }
1139
1140 int mpol_set_shared_policy(struct shared_policy *info,
1141                         struct vm_area_struct *vma, struct mempolicy *npol)
1142 {
1143         int err;
1144         struct sp_node *new = NULL;
1145         unsigned long sz = vma_pages(vma);
1146
1147         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1148                  vma->vm_pgoff,
1149                  sz, npol? npol->policy : -1,
1150                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1151
1152         if (npol) {
1153                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1154                 if (!new)
1155                         return -ENOMEM;
1156         }
1157         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1158         if (err && new)
1159                 kmem_cache_free(sn_cache, new);
1160         return err;
1161 }
1162
1163 /* Free a backing policy store on inode delete. */
1164 void mpol_free_shared_policy(struct shared_policy *p)
1165 {
1166         struct sp_node *n;
1167         struct rb_node *next;
1168
1169         if (!p->root.rb_node)
1170                 return;
1171         spin_lock(&p->lock);
1172         next = rb_first(&p->root);
1173         while (next) {
1174                 n = rb_entry(next, struct sp_node, nd);
1175                 next = rb_next(&n->nd);
1176                 rb_erase(&n->nd, &p->root);
1177                 mpol_free(n->policy);
1178                 kmem_cache_free(sn_cache, n);
1179         }
1180         spin_unlock(&p->lock);
1181 }
1182
1183 /* assumes fs == KERNEL_DS */
1184 void __init numa_policy_init(void)
1185 {
1186         policy_cache = kmem_cache_create("numa_policy",
1187                                          sizeof(struct mempolicy),
1188                                          0, SLAB_PANIC, NULL, NULL);
1189
1190         sn_cache = kmem_cache_create("shared_policy_node",
1191                                      sizeof(struct sp_node),
1192                                      0, SLAB_PANIC, NULL, NULL);
1193
1194         /* Set interleaving policy for system init. This way not all
1195            the data structures allocated at system boot end up in node zero. */
1196
1197         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1198                 printk("numa_policy_init: interleaving failed\n");
1199 }
1200
1201 /* Reset policy of current process to default */
1202 void numa_default_policy(void)
1203 {
1204         do_set_mempolicy(MPOL_DEFAULT, NULL);
1205 }
1206
1207 /* Migrate a policy to a different set of nodes */
1208 static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1209                                                         const nodemask_t *new)
1210 {
1211         nodemask_t tmp;
1212
1213         if (!pol)
1214                 return;
1215
1216         switch (pol->policy) {
1217         case MPOL_DEFAULT:
1218                 break;
1219         case MPOL_INTERLEAVE:
1220                 nodes_remap(tmp, pol->v.nodes, *old, *new);
1221                 pol->v.nodes = tmp;
1222                 current->il_next = node_remap(current->il_next, *old, *new);
1223                 break;
1224         case MPOL_PREFERRED:
1225                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1226                                                                 *old, *new);
1227                 break;
1228         case MPOL_BIND: {
1229                 nodemask_t nodes;
1230                 struct zone **z;
1231                 struct zonelist *zonelist;
1232
1233                 nodes_clear(nodes);
1234                 for (z = pol->v.zonelist->zones; *z; z++)
1235                         node_set((*z)->zone_pgdat->node_id, nodes);
1236                 nodes_remap(tmp, nodes, *old, *new);
1237                 nodes = tmp;
1238
1239                 zonelist = bind_zonelist(&nodes);
1240
1241                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1242                  * If that old zonelist has no remaining mems_allowed nodes,
1243                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1244                  */
1245
1246                 if (zonelist) {
1247                         /* Good - got mem - substitute new zonelist */
1248                         kfree(pol->v.zonelist);
1249                         pol->v.zonelist = zonelist;
1250                 }
1251                 break;
1252         }
1253         default:
1254                 BUG();
1255                 break;
1256         }
1257 }
1258
1259 /*
1260  * Someone moved this task to different nodes.  Fixup mempolicies.
1261  *
1262  * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
1263  * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1264  */
1265 void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
1266 {
1267         rebind_policy(current->mempolicy, old, new);
1268 }