/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case node -1 here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
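/*
 * Example (sketch, assuming a machine with memory on nodes 0 and 1): the
 * policies above are normally requested from user space through the
 * set_mempolicy(2) and mbind(2) system calls implemented further down in
 * this file, e.g. via the <numaif.h> wrappers shipped with libnuma:
 *
 *	#include <numaif.h>
 *
 *	unsigned long nodes = 0x3;	   (bit n set means node n is allowed)
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes));
 *
 * After this call, allocations made in process context reach
 * alloc_pages_current() below and are spread round robin over both nodes.
 */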
/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
   could replace all the switch()es with a mempolicy_ops structure.
*/
#include <linux/mempolicy.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/mempolicy.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>

#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;

#define PDprintk(fmt...)

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
int policy_zone = ZONE_DMA;

struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.policy = MPOL_DEFAULT,
};
/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
	int empty = nodes_empty(*nodes);

	switch (mode) {
	case MPOL_INTERLEAVE:
		/* Preferred will only use the first bit, but allow
		   more for now. */
		if (empty)
			return -EINVAL;
	}
	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}

/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
	struct zonelist *zl;
	int num, max, nd;

	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
	if (!zl)
		return NULL;
	num = 0;
	for_each_node_mask(nd, *nodes)
		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
	zl->zones[num] = NULL;
	return zl;
}

/* Create a new policy */
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
	struct mempolicy *policy;

	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
	if (mode == MPOL_DEFAULT)
		return NULL;
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	switch (mode) {
	case MPOL_INTERLEAVE:
		policy->v.nodes = *nodes;
		if (nodes_weight(*nodes) == 0) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-EINVAL);
		}
		break;
	case MPOL_PREFERRED:
		policy->v.preferred_node = first_node(*nodes);
		if (policy->v.preferred_node >= MAX_NUMNODES)
			policy->v.preferred_node = -1;
		break;
	case MPOL_BIND:
		policy->v.zonelist = bind_zonelist(nodes);
		if (policy->v.zonelist == NULL) {
			kmem_cache_free(policy_cache, policy);
			return ERR_PTR(-ENOMEM);
		}
		break;
	}
	policy->policy = mode;
	return policy;
}
/* Check if we are the only process mapping the page in question */
static inline int single_mm_mapping(struct mm_struct *mm,
		struct address_space *mapping)
{
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	int rc = 1;

	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
		if (mm != vma->vm_mm) {
			rc = 0;
			goto out;
		}
	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
		if (mm != vma->vm_mm) {
			rc = 0;
			goto out;
		}
out:
	spin_unlock(&mapping->i_mmap_lock);
	return rc;
}

/*
 * Add a page to be migrated to the pagelist
 */
static void migrate_page_add(struct vm_area_struct *vma,
	struct page *page, struct list_head *pagelist, unsigned long flags)
{
	/*
	 * Avoid migrating a page that is shared by others and not writable.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
	    mapping_writably_mapped(page->mapping) ||
	    single_mm_mapping(vma->vm_mm, page->mapping)) {
		int rc = isolate_lru_page(page);

		if (rc == 1)
			list_add(&page->lru, pagelist);
		/*
		 * If the isolate attempt was not successful then we just
		 * encountered an unswappable page. Something must be wrong.
		 */
		BUG_ON(rc == 0);
	}
}

static void gather_stats(struct page *, void *);

/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;
		unsigned int nid;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		nid = page_to_nid(page);
		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
			continue;

		if (flags & MPOL_MF_STATS)
			gather_stats(page, private);
		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
			migrate_page_add(vma, page, private, flags);
		else
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}
static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		if (check_pte_range(vma, pmd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (check_pmd_range(vma, pud, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (check_pud_range(vma, pgd, addr, next, nodes,
				    flags, private))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

/* Check if a vma is migratable */
static inline int vma_migratable(struct vm_area_struct *vma)
{
	if (vma->vm_flags & (
		VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
		return 0;
	return 1;
}
/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		const nodemask_t *nodes, unsigned long flags, void *private)
{
	int err;
	struct vm_area_struct *first, *vma, *prev;

	first = find_vma(mm, start);
	if (!first)
		return ERR_PTR(-EFAULT);
	prev = NULL;
	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
			if (!vma->vm_next && vma->vm_end < end)
				return ERR_PTR(-EFAULT);
			if (prev && prev->vm_end < vma->vm_start)
				return ERR_PTR(-EFAULT);
		}
		if (!is_vm_hugetlb_page(vma) &&
		    ((flags & MPOL_MF_STRICT) ||
		     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
				vma_migratable(vma)))) {
			unsigned long endvma = vma->vm_end;

			if (endvma > end)
				endvma = end;
			if (vma->vm_start > start)
				start = vma->vm_start;
			err = check_pgd_range(vma, start, endvma, nodes,
						flags, private);
			if (err) {
				first = ERR_PTR(err);
				break;
			}
		}
		prev = vma;
	}
	return first;
}

/* Apply policy to a single VMA */
static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
{
	int err = 0;
	struct mempolicy *old = vma->vm_policy;

	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	if (vma->vm_ops && vma->vm_ops->set_policy)
		err = vma->vm_ops->set_policy(vma, new);
	if (!err) {
		mpol_get(new);
		mpol_free(old);
		vma->vm_policy = new;
	}
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, struct mempolicy *new)
{
	struct vm_area_struct *next;
	int err;

	err = 0;
	for (; vma && vma->vm_start < end; vma = next) {
		next = vma->vm_next;
		if (vma->vm_start < start)
			err = split_vma(vma->vm_mm, vma, start, 1);
		if (!err && vma->vm_end > end)
			err = split_vma(vma->vm_mm, vma, end, 0);
		if (!err)
			err = policy_vma(vma, new);
		if (err)
			break;
	}
	return err;
}
static int contextualize_policy(int mode, nodemask_t *nodes)
{
	if (!nodes)
		return 0;

	/* Update current mems_allowed */
	cpuset_update_current_mems_allowed();
	/* Ignore nodes not set in current->mems_allowed */
	cpuset_restrict_to_mems_allowed(nodes->bits);
	return mpol_check_policy(mode, nodes);
}

static int swap_pages(struct list_head *pagelist)
{
	LIST_HEAD(moved);
	LIST_HEAD(failed);
	int n;

	n = migrate_pages(pagelist, NULL, &moved, &failed);
	putback_lru_pages(&failed);
	putback_lru_pages(&moved);

	return n;
}
long do_mbind(unsigned long start, unsigned long len,
		unsigned long mode, nodemask_t *nmask, unsigned long flags)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	LIST_HEAD(pagelist);

	if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
				      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
	    || mode > MPOL_MAX)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	if (mpol_check_policy(mode, nmask))
		return -EINVAL;

	new = mpol_new(mode, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
			mode,nodes_addr(nodes)[0]);

	down_write(&mm->mmap_sem);
	vma = check_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);

	err = PTR_ERR(vma);
	if (!IS_ERR(vma)) {
		int nr_failed = 0;

		err = mbind_range(vma, start, end, new);
		if (!list_empty(&pagelist))
			nr_failed = swap_pages(&pagelist);

		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
			err = -EIO;
	}
	if (!list_empty(&pagelist))
		putback_lru_pages(&pagelist);

	up_write(&mm->mmap_sem);
	mpol_free(new);
	return err;
}

/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
	struct mempolicy *new;

	if (contextualize_policy(mode, nodes))
		return -EINVAL;
	new = mpol_new(mode, nodes);
	if (IS_ERR(new))
		return PTR_ERR(new);
	mpol_free(current->mempolicy);
	current->mempolicy = new;
	if (new && new->policy == MPOL_INTERLEAVE)
		current->il_next = first_node(new->v.nodes);
	return 0;
}
/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
	int i;

	nodes_clear(*nodes);
	switch (p->policy) {
	case MPOL_BIND:
		for (i = 0; p->v.zonelist->zones[i]; i++)
			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
				*nodes);
		break;
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		/* or use current node instead of online map? */
		if (p->v.preferred_node < 0)
			*nodes = node_online_map;
		else
			node_set(p->v.preferred_node, *nodes);
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Retrieve NUMA policy */
long do_get_mempolicy(int *policy, nodemask_t *nmask,
			unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	cpuset_update_current_mems_allowed();
	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
		return -EINVAL;
	if (flags & MPOL_F_ADDR) {
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->policy == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else
		*policy = pol->policy;

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask)
		get_zonemask(pol, nmask);

 out:
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}
/*
 * For now migrate_pages simply swaps out the pages from nodes that are in
 * the source set but not in the target set. In the future, we would
 * want a function that moves pages between the two nodesets in such
 * a way as to preserve the physical layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm,
	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
	LIST_HEAD(pagelist);
	int count = 0;
	nodemask_t nodes;

	nodes_andnot(nodes, *from_nodes, *to_nodes);

	down_read(&mm->mmap_sem);
	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist)) {
		count = swap_pages(&pagelist);
		putback_lru_pages(&pagelist);
	}

	up_read(&mm->mmap_sem);
	return count;
}
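/*
 * Example (sketch): sys_migrate_pages() below exposes this as the
 * migrate_pages(2) system call; with the libnuma <numaif.h> wrapper a
 * process could push another task's pages off node 0 towards node 1 with:
 *
 *	unsigned long old = 1UL << 0, new = 1UL << 1;
 *	long left = migrate_pages(pid, 8 * sizeof(old), &old, &new);
 *
 * where a positive return value would be the number of pages that could
 * not be moved (here: swapped out rather than relocated, as noted above).
 */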
/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
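/*
 * Example (sketch): with the convention above, a caller passes a plain
 * array of unsigned longs plus the number of bits it contains; e.g. binding
 * a freshly mmap()ed region to node 0 only could look like:
 *
 *	void *p = mmap(NULL, len, PROT_READ|PROT_WRITE,
 *		       MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
 *	unsigned long mask = 1UL << 0;
 *	mbind(p, len, MPOL_BIND, &mask, 8 * sizeof(mask), 0);
 *
 * get_nodes() above copies and truncates such a mask into a nodemask_t,
 * and copy_nodes_to_user() performs the reverse for get_mempolicy(2).
 */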
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
			unsigned long mode,
			unsigned long __user *nmask, unsigned long maxnode,
			unsigned flags)
{
	nodemask_t nodes;
	int err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_mbind(start, len, mode, &nodes, flags);
}

/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
		unsigned long maxnode)
{
	int err;
	nodemask_t nodes;

	if (mode < 0 || mode > MPOL_MAX)
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, &nodes);
}
/* Macro needed until Paul implements this function in kernel/cpusets.c */
#define cpuset_mems_allowed(task) node_online_map

asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
		const unsigned long __user *old_nodes,
		const unsigned long __user *new_nodes)
{
	struct mm_struct *mm;
	struct task_struct *task;
	nodemask_t old;
	nodemask_t new;
	nodemask_t task_nodes;
	int err;

	err = get_nodes(&old, old_nodes, maxnode);
	if (err)
		return err;

	err = get_nodes(&new, new_nodes, maxnode);
	if (err)
		return err;

	/* Find the mm_struct */
	read_lock(&tasklist_lock);
	task = pid ? find_task_by_pid(pid) : current;
	if (!task) {
		read_unlock(&tasklist_lock);
		return -ESRCH;
	}
	mm = get_task_mm(task);
	read_unlock(&tasklist_lock);

	if (!mm)
		return -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	if ((current->euid != task->suid) && (current->euid != task->uid) &&
	    (current->uid != task->suid) && (current->uid != task->uid) &&
	    !capable(CAP_SYS_ADMIN)) {
		err = -EPERM;
		goto out;
	}

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
		err = -EPERM;
		goto out;
	}

	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
out:
	mmput(mm);
	return err;
}

/* Retrieve NUMA policy */
asmlinkage long sys_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr, unsigned long flags)
{
	int err, pval;
	nodemask_t nodes;

	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);
	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}
#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				     compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode,
				     compat_ulong_t addr, compat_ulong_t flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		err = copy_from_user(bm, nm, alloc_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				     compat_ulong_t maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
			     compat_ulong_t mode, compat_ulong_t __user *nmask,
			     compat_ulong_t maxnode, compat_ulong_t flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif
/* Return effective policy for a VMA */
static struct mempolicy *
get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = task->mempolicy;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else if (vma->vm_policy &&
				vma->vm_policy->policy != MPOL_DEFAULT)
			pol = vma->vm_policy;
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}

/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
	int nd;

	switch (policy->policy) {
	case MPOL_PREFERRED:
		nd = policy->v.preferred_node;
		if (nd < 0)
			nd = numa_node_id();
		break;
	case MPOL_BIND:
		/* Lower zones don't get a policy applied */
		/* Careful: current->mems_allowed might have moved */
		if (gfp_zone(gfp) >= policy_zone)
			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
				return policy->v.zonelist;
		/*FALL THROUGH*/
	case MPOL_INTERLEAVE: /* should not happen */
	case MPOL_DEFAULT:
		nd = numa_node_id();
		break;
	default:
		nd = 0;
		BUG();
	}
	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
}
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	me->il_next = next;
	return nid;
}

/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target = (unsigned)off % nnodes;
	int c;
	int nid = -1;

	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}

/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long addr, int shift)
{
	if (vma) {
		unsigned long off;

		off = vma->vm_pgoff;
		off += (addr - vma->vm_start) >> shift;
		return offset_il_node(pol, vma, off);
	} else
		return interleave_nodes(pol);
}
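/*
 * Worked example (sketch): for a VMA policy interleaving over nodes {0,2,5}
 * (nnodes == 3), a fault at an offset of 7 pages into the mapping gives
 * target = 7 % 3 = 1, and the do/while loop in offset_il_node() executes
 * twice, returning node 2 (the second node in the mask).  The same offset
 * therefore always maps to the same node, independent of allocation order.
 */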
/* Return a zonelist suitable for a huge page allocation. */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	if (pol->policy == MPOL_INTERLEAVE) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
	}
	return zonelist_policy(GFP_HIGHUSER, pol);
}

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zl->zones[0]) {
		zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
		put_cpu();
	}
	return page;
}

/**
 * alloc_page_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER     user allocation.
 *	%GFP_KERNEL   kernel allocations,
 *	%GFP_HIGHMEM  highmem/user allocations,
 *	%GFP_FS       allocation should not call back into a file system.
 *	%GFP_ATOMIC   don't sleep.
 *
 * @vma:  Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
 * mm_struct of the VMA to prevent it from going away. Should be used for
 * all allocations for pages that will be mapped into user space.
 * Returns NULL when no page can be allocated.
 *
 * Should be called with the mmap_sem of the vma held.
 */
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);

	cpuset_update_current_mems_allowed();

	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
		return alloc_page_interleave(gfp, 0, nid);
	}
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}

/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *	%GFP_USER     user allocation,
 *	%GFP_KERNEL   kernel allocation,
 *	%GFP_HIGHMEM  highmem allocation,
 *	%GFP_FS       don't call back into a file system.
 *	%GFP_ATOMIC   don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool. When not in
 * interrupt context, apply the current process NUMA policy.
 * Returns NULL when no page can be allocated.
 *
 * Don't call cpuset_update_current_mems_allowed() unless
 * 1) it's ok to take cpuset_sem (can WAIT), and
 * 2) allocating for current task (not interrupt).
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = current->mempolicy;

	if ((gfp & __GFP_WAIT) && !in_interrupt())
		cpuset_update_current_mems_allowed();
	if (!pol || in_interrupt())
		pol = &default_policy;
	if (pol->policy == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);
	*new = *old;
	atomic_set(&new->refcnt, 1);
	if (new->policy == MPOL_BIND) {
		int sz = ksize(old->v.zonelist);
		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
		if (!new->v.zonelist) {
			kmem_cache_free(policy_cache, new);
			return ERR_PTR(-ENOMEM);
		}
		memcpy(new->v.zonelist, old->v.zonelist, sz);
	}
	return new;
}

/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return 0;
	if (a->policy != b->policy)
		return 0;
	switch (a->policy) {
	case MPOL_DEFAULT:
		return 1;
	case MPOL_INTERLEAVE:
		return nodes_equal(a->v.nodes, b->v.nodes);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	case MPOL_BIND: {
		int i;
		for (i = 0; a->v.zonelist->zones[i]; i++)
			if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
				return 0;
		return b->v.zonelist->zones[i] == NULL;
	}
	default:
		BUG();
		return 0;
	}
}

/* Slow path of a mpol destructor. */
void __mpol_free(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	if (p->policy == MPOL_BIND)
		kfree(p->v.zonelist);
	p->policy = MPOL_DEFAULT;
	kmem_cache_free(policy_cache, p);
}
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */

/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}

/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->policy : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	spin_unlock(&sp->lock);
	return pol;
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	PDprintk("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	mpol_free(n->policy);
	kmem_cache_free(sn_cache, n);
}

static struct sp_node *
sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
{
	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);

	if (!n)
		return NULL;
	n->start = start;
	n->end = end;
	mpol_get(pol);
	n->policy = pol;
	return n;
}
/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n, *new2 = NULL;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!new2) {
					spin_unlock(&sp->lock);
					new2 = sp_alloc(end, n->end, n->policy);
					if (!new2)
						return -ENOMEM;
					goto restart;
				}
				n->end = start;
				sp_insert(sp, new2);
				new2 = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	if (new2) {
		mpol_free(new2->policy);
		kmem_cache_free(sn_cache, new2);
	}
	return 0;
}

int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->policy : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : -1);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		kmem_cache_free(sn_cache, new);
	return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	spin_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		rb_erase(&n->nd, &p->root);
		mpol_free(n->policy);
		kmem_cache_free(sn_cache, n);
	}
	spin_unlock(&p->lock);
}
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL, NULL);

	/* Set interleaving policy for system init. This way not all
	   the data structures allocated at system boot end up in node zero. */
	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
		printk("numa_policy_init: interleaving failed\n");
}

/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, NULL);
}
/* Migrate a policy to a different set of nodes */
static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
			  const nodemask_t *new)
{
	nodemask_t tmp;

	if (!pol)
		return;

	switch (pol->policy) {
	case MPOL_DEFAULT:
		break;
	case MPOL_INTERLEAVE:
		nodes_remap(tmp, pol->v.nodes, *old, *new);
		pol->v.nodes = tmp;
		current->il_next = node_remap(current->il_next, *old, *new);
		break;
	case MPOL_PREFERRED:
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   *old, *new);
		break;
	case MPOL_BIND: {
		nodemask_t nodes;
		struct zone **z;
		struct zonelist *zonelist;

		nodes_clear(nodes);
		for (z = pol->v.zonelist->zones; *z; z++)
			node_set((*z)->zone_pgdat->node_id, nodes);
		nodes_remap(tmp, nodes, *old, *new);
		nodes = tmp;

		zonelist = bind_zonelist(&nodes);

		/* If no mem, then zonelist is NULL and we keep old zonelist.
		 * If that old zonelist has no remaining mems_allowed nodes,
		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
		 */

		if (zonelist) {
			/* Good - got mem - substitute new zonelist */
			kfree(pol->v.zonelist);
			pol->v.zonelist = zonelist;
		}
		break;
	}
	default:
		BUG();
		break;
	}
}
/*
 * Someone moved this task to different nodes. Fixup mempolicies.
 *
 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
 */
void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
{
	rebind_policy(current->mempolicy, old, new);
}

/*
 * Display pages allocated per node and memory policy via /proc.
 */

static const char *policy_types[] = { "default", "prefer", "bind",
				      "interleave" };
/*
 * Convert a mempolicy into a string.
 * Returns the number of characters in buffer (if positive)
 * or an error (negative)
 */
static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
	char *p = buffer;
	int l;
	nodemask_t nodes;
	int mode = pol ? pol->policy : MPOL_DEFAULT;

	switch (mode) {
	case MPOL_DEFAULT:
		nodes_clear(nodes);
		break;

	case MPOL_PREFERRED:
		nodes_clear(nodes);
		node_set(pol->v.preferred_node, nodes);
		break;

	case MPOL_BIND:
		get_zonemask(pol, &nodes);
		break;

	case MPOL_INTERLEAVE:
		nodes = pol->v.nodes;
		break;

	default:
		BUG();
		return -EFAULT;
	}

	l = strlen(policy_types[mode]);
	if (buffer + maxlen < p + l + 1)
		return -ENOSPC;

	strcpy(p, policy_types[mode]);
	p += l;

	if (!nodes_empty(nodes)) {
		if (buffer + maxlen < p + 2)
			return -ENOSPC;
		*p++ = '=';
		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
	}
	return p - buffer;
}
struct numa_maps {
	unsigned long pages;
	unsigned long anon;
	unsigned long mapped;
	unsigned long mapcount_max;
	unsigned long node[MAX_NUMNODES];
};

static void gather_stats(struct page *page, void *private)
{
	struct numa_maps *md = private;
	int count = page_mapcount(page);

	if (count)
		md->mapped++;

	if (count > md->mapcount_max)
		md->mapcount_max = count;

	md->pages++;

	if (PageAnon(page))
		md->anon++;

	md->node[page_to_nid(page)]++;
}

int show_numa_map(struct seq_file *m, void *v)
{
	struct task_struct *task = m->private;
	struct vm_area_struct *vma = v;
	struct numa_maps *md;
	int n;
	char buffer[50];

	if (!vma->vm_mm)
		return 0;

	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
	if (!md)
		return 0;

	check_pgd_range(vma, vma->vm_start, vma->vm_end,
			&node_online_map, MPOL_MF_STATS, md);

	if (md->pages) {
		mpol_to_str(buffer, sizeof(buffer),
			    get_vma_policy(task, vma, vma->vm_start));

		seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
			   vma->vm_start, buffer, md->pages,
			   md->mapped, md->mapcount_max);

		if (md->anon)
			seq_printf(m," anon=%lu",md->anon);

		for_each_online_node(n)
			if (md->node[n])
				seq_printf(m, " N%d=%lu", n, md->node[n]);

		seq_putc(m, '\n');
	}
	kfree(md);

	if (m->count < m->size)
		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
	return 0;
}
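/*
 * Example (sketch): with the format strings above, a line of
 * /proc/<pid>/numa_maps for an interleaved mapping might look like
 *
 *	2aaaaac000 interleave=0-1 pages=512 mapped=512 maxref=1 N0=256 N1=256
 *
 * i.e. start address, the policy string produced by mpol_to_str(), then the
 * counters gathered by gather_stats() and the per-node page counts.
 */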