mm/mempolicy.c
1 /*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
7 *
8 * NUMA policy allows the user to give hints about which node(s) memory should
9 * be allocated on.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
19 * for anonymous memory. For process policy a process counter
20 * is used.
21 *
22 * bind Only allocate memory on a specific set of nodes,
23 * no fallback.
24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead
27 *
28 * preferred Try a specific node first before normal fallback.
29 * As a special case NUMA_NO_NODE here means do the allocation
30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default
32 * process policy.
33 *
34 * default Allocate on the local node first, or when on a VMA
35 * use the process policy. This is what Linux always did
36 * in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
49 * on systems with highmem, kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
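/*
 * Illustrative example (editorial addition, not part of the original
 * comment): user space selects these policies through the set_mempolicy(2),
 * mbind(2) and get_mempolicy(2) syscalls implemented later in this file,
 * e.g. via the <numaif.h> wrappers shipped with libnuma:
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	if (set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8 + 1))
 *		perror("set_mempolicy");
 *
 * which interleaves the calling task's future allocations across nodes 0
 * and 1 (the only nodes covered by the mask).
 */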
55
56 /* Notebook:
57 fix mmap readahead to honour policy and enable policy for any page cache
58 object
59 statistics for bigpages
60 global policy for page cache? currently it uses process policy. Requires
61 first item above.
62 handle mremap for shared memory (currently ignored for the policy)
63 grows down?
64 make bind policy root only? It can trigger oom much faster and the
65 kernel is not always grateful with that.
66 */
67
68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69
70 #include <linux/mempolicy.h>
71 #include <linux/mm.h>
72 #include <linux/highmem.h>
73 #include <linux/hugetlb.h>
74 #include <linux/kernel.h>
75 #include <linux/sched.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/slab.h>
79 #include <linux/string.h>
80 #include <linux/export.h>
81 #include <linux/nsproxy.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
85 #include <linux/swap.h>
86 #include <linux/seq_file.h>
87 #include <linux/proc_fs.h>
88 #include <linux/migrate.h>
89 #include <linux/ksm.h>
90 #include <linux/rmap.h>
91 #include <linux/security.h>
92 #include <linux/syscalls.h>
93 #include <linux/ctype.h>
94 #include <linux/mm_inline.h>
95 #include <linux/mmu_notifier.h>
96 #include <linux/printk.h>
97
98 #include <asm/tlbflush.h>
99 #include <asm/uaccess.h>
100 #include <linux/random.h>
101
102 #include "internal.h"
103
104 /* Internal flags */
105 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
106 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
107
108 static struct kmem_cache *policy_cache;
109 static struct kmem_cache *sn_cache;
110
111 /* Highest zone. A specific allocation for a zone below that is not
112 policied. */
113 enum zone_type policy_zone = 0;
114
115 /*
116 * run-time system-wide default policy => local allocation
117 */
118 static struct mempolicy default_policy = {
119 .refcnt = ATOMIC_INIT(1), /* never free it */
120 .mode = MPOL_PREFERRED,
121 .flags = MPOL_F_LOCAL,
122 };
123
124 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
125
126 struct mempolicy *get_task_policy(struct task_struct *p)
127 {
128 struct mempolicy *pol = p->mempolicy;
129 int node;
130
131 if (pol)
132 return pol;
133
134 node = numa_node_id();
135 if (node != NUMA_NO_NODE) {
136 pol = &preferred_node_policy[node];
137 /* preferred_node_policy is not initialised early in boot */
138 if (pol->mode)
139 return pol;
140 }
141
142 return &default_policy;
143 }
144
145 static const struct mempolicy_operations {
146 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
147 /*
148 * If read-side task has no lock to protect task->mempolicy, write-side
149 * task will rebind the task->mempolicy in two steps. The first step is
150 * setting all the newly allowed nodes, and the second step is cleaning all
151 * the disallowed nodes. In this way, we can avoid being left with no node
152 * to allocate a page from.
153 * If we have a lock to protect task->mempolicy in read-side, we do
154 * rebind directly.
155 *
156 * step:
157 * MPOL_REBIND_ONCE - do rebind work at once
158 * MPOL_REBIND_STEP1 - set all the newly allowed nodes
159 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
160 */
161 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
162 enum mpol_rebind_step step);
163 } mpol_ops[MPOL_MAX];
164
165 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
166 {
167 return pol->flags & MPOL_MODE_FLAGS;
168 }
169
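/*
 * Illustrative example (editorial addition): with MPOL_F_RELATIVE_NODES the
 * user's nodemask is interpreted relative to the allowed set.  For
 * orig = {0,2} and rel = {4,5,6}, nodes_fold() wraps orig into the weight
 * of rel (3 bits, leaving {0,2}) and nodes_onto() maps bit 0 onto the first
 * node of rel and bit 2 onto the third, so *ret = {4,6}.
 */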
170 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
171 const nodemask_t *rel)
172 {
173 nodemask_t tmp;
174 nodes_fold(tmp, *orig, nodes_weight(*rel));
175 nodes_onto(*ret, tmp, *rel);
176 }
177
178 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
179 {
180 if (nodes_empty(*nodes))
181 return -EINVAL;
182 pol->v.nodes = *nodes;
183 return 0;
184 }
185
186 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
187 {
188 if (!nodes)
189 pol->flags |= MPOL_F_LOCAL; /* local allocation */
190 else if (nodes_empty(*nodes))
191 return -EINVAL; /* no allowed nodes */
192 else
193 pol->v.preferred_node = first_node(*nodes);
194 return 0;
195 }
196
197 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
198 {
199 if (nodes_empty(*nodes))
200 return -EINVAL;
201 pol->v.nodes = *nodes;
202 return 0;
203 }
204
205 /*
206 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
207 * any, for the new policy. mpol_new() has already validated the nodes
208 * parameter with respect to the policy mode and flags. But, we need to
209 * handle an empty nodemask with MPOL_PREFERRED here.
210 *
211 * Must be called holding task's alloc_lock to protect task's mems_allowed
212 * and mempolicy. May also be called holding the mmap_semaphore for write.
213 */
214 static int mpol_set_nodemask(struct mempolicy *pol,
215 const nodemask_t *nodes, struct nodemask_scratch *nsc)
216 {
217 int ret;
218
219 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
220 if (pol == NULL)
221 return 0;
222 /* Check N_MEMORY */
223 nodes_and(nsc->mask1,
224 cpuset_current_mems_allowed, node_states[N_MEMORY]);
225
226 VM_BUG_ON(!nodes);
227 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
228 nodes = NULL; /* explicit local allocation */
229 else {
230 if (pol->flags & MPOL_F_RELATIVE_NODES)
231 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
232 else
233 nodes_and(nsc->mask2, *nodes, nsc->mask1);
234
235 if (mpol_store_user_nodemask(pol))
236 pol->w.user_nodemask = *nodes;
237 else
238 pol->w.cpuset_mems_allowed =
239 cpuset_current_mems_allowed;
240 }
241
242 if (nodes)
243 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
244 else
245 ret = mpol_ops[pol->mode].create(pol, NULL);
246 return ret;
247 }
248
249 /*
250 * This function just creates a new policy, does some checks and simple
251 * initialization. You must invoke mpol_set_nodemask() to set nodes.
252 */
253 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
254 nodemask_t *nodes)
255 {
256 struct mempolicy *policy;
257
258 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
259 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
260
261 if (mode == MPOL_DEFAULT) {
262 if (nodes && !nodes_empty(*nodes))
263 return ERR_PTR(-EINVAL);
264 return NULL;
265 }
266 VM_BUG_ON(!nodes);
267
268 /*
269 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
270 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
271 * All other modes require a valid pointer to a non-empty nodemask.
272 */
273 if (mode == MPOL_PREFERRED) {
274 if (nodes_empty(*nodes)) {
275 if (((flags & MPOL_F_STATIC_NODES) ||
276 (flags & MPOL_F_RELATIVE_NODES)))
277 return ERR_PTR(-EINVAL);
278 }
279 } else if (mode == MPOL_LOCAL) {
280 if (!nodes_empty(*nodes))
281 return ERR_PTR(-EINVAL);
282 mode = MPOL_PREFERRED;
283 } else if (nodes_empty(*nodes))
284 return ERR_PTR(-EINVAL);
285 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
286 if (!policy)
287 return ERR_PTR(-ENOMEM);
288 atomic_set(&policy->refcnt, 1);
289 policy->mode = mode;
290 policy->flags = flags;
291
292 return policy;
293 }
294
295 /* Slow path of a mpol destructor. */
296 void __mpol_put(struct mempolicy *p)
297 {
298 if (!atomic_dec_and_test(&p->refcnt))
299 return;
300 kmem_cache_free(policy_cache, p);
301 }
302
303 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
304 enum mpol_rebind_step step)
305 {
306 }
307
308 /*
309 * step:
310 * MPOL_REBIND_ONCE - do rebind work at once
311 * MPOL_REBIND_STEP1 - set all the newly allowed nodes
312 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
313 */
314 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
315 enum mpol_rebind_step step)
316 {
317 nodemask_t tmp;
318
319 if (pol->flags & MPOL_F_STATIC_NODES)
320 nodes_and(tmp, pol->w.user_nodemask, *nodes);
321 else if (pol->flags & MPOL_F_RELATIVE_NODES)
322 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
323 else {
324 /*
325 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
326 * result
327 */
328 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
329 nodes_remap(tmp, pol->v.nodes,
330 pol->w.cpuset_mems_allowed, *nodes);
331 pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
332 } else if (step == MPOL_REBIND_STEP2) {
333 tmp = pol->w.cpuset_mems_allowed;
334 pol->w.cpuset_mems_allowed = *nodes;
335 } else
336 BUG();
337 }
338
339 if (nodes_empty(tmp))
340 tmp = *nodes;
341
342 if (step == MPOL_REBIND_STEP1)
343 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
344 else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
345 pol->v.nodes = tmp;
346 else
347 BUG();
348
349 if (!node_isset(current->il_next, tmp)) {
350 current->il_next = next_node(current->il_next, tmp);
351 if (current->il_next >= MAX_NUMNODES)
352 current->il_next = first_node(tmp);
353 if (current->il_next >= MAX_NUMNODES)
354 current->il_next = numa_node_id();
355 }
356 }
357
358 static void mpol_rebind_preferred(struct mempolicy *pol,
359 const nodemask_t *nodes,
360 enum mpol_rebind_step step)
361 {
362 nodemask_t tmp;
363
364 if (pol->flags & MPOL_F_STATIC_NODES) {
365 int node = first_node(pol->w.user_nodemask);
366
367 if (node_isset(node, *nodes)) {
368 pol->v.preferred_node = node;
369 pol->flags &= ~MPOL_F_LOCAL;
370 } else
371 pol->flags |= MPOL_F_LOCAL;
372 } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
373 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
374 pol->v.preferred_node = first_node(tmp);
375 } else if (!(pol->flags & MPOL_F_LOCAL)) {
376 pol->v.preferred_node = node_remap(pol->v.preferred_node,
377 pol->w.cpuset_mems_allowed,
378 *nodes);
379 pol->w.cpuset_mems_allowed = *nodes;
380 }
381 }
382
383 /*
384 * mpol_rebind_policy - Migrate a policy to a different set of nodes
385 *
386 * If read-side task has no lock to protect task->mempolicy, write-side
387 * task will rebind the task->mempolicy in two steps. The first step is
388 * setting all the newly allowed nodes, and the second step is cleaning all
389 * the disallowed nodes. In this way, we can avoid being left with no node
390 * to allocate a page from.
391 * If we have a lock to protect task->mempolicy in read-side, we do
392 * rebind directly.
393 *
394 * step:
395 * MPOL_REBIND_ONCE - do rebind work at once
396 * MPOL_REBIND_STEP1 - set all the newly allowed nodes
397 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
398 */
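/*
 * Worked example (editorial addition): rebinding an MPOL_BIND policy whose
 * v.nodes = {0,1} from cpuset mems {0,1} to {2,3} with no read-side lock:
 * MPOL_REBIND_STEP1 grows v.nodes to {0,1,2,3}, MPOL_REBIND_STEP2 trims it
 * back to {2,3}, so a concurrent reader always sees at least one usable node.
 */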
399 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
400 enum mpol_rebind_step step)
401 {
402 if (!pol)
403 return;
404 if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
405 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
406 return;
407
408 if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
409 return;
410
411 if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
412 BUG();
413
414 if (step == MPOL_REBIND_STEP1)
415 pol->flags |= MPOL_F_REBINDING;
416 else if (step == MPOL_REBIND_STEP2)
417 pol->flags &= ~MPOL_F_REBINDING;
418 else if (step >= MPOL_REBIND_NSTEP)
419 BUG();
420
421 mpol_ops[pol->mode].rebind(pol, newmask, step);
422 }
423
424 /*
425 * Wrapper for mpol_rebind_policy() that just requires task
426 * pointer, and updates task mempolicy.
427 *
428 * Called with task's alloc_lock held.
429 */
430
431 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
432 enum mpol_rebind_step step)
433 {
434 mpol_rebind_policy(tsk->mempolicy, new, step);
435 }
436
437 /*
438 * Rebind each vma in mm to new nodemask.
439 *
440 * Call holding a reference to mm. Takes mm->mmap_sem during call.
441 */
442
443 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
444 {
445 struct vm_area_struct *vma;
446
447 down_write(&mm->mmap_sem);
448 for (vma = mm->mmap; vma; vma = vma->vm_next)
449 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
450 up_write(&mm->mmap_sem);
451 }
452
453 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
454 [MPOL_DEFAULT] = {
455 .rebind = mpol_rebind_default,
456 },
457 [MPOL_INTERLEAVE] = {
458 .create = mpol_new_interleave,
459 .rebind = mpol_rebind_nodemask,
460 },
461 [MPOL_PREFERRED] = {
462 .create = mpol_new_preferred,
463 .rebind = mpol_rebind_preferred,
464 },
465 [MPOL_BIND] = {
466 .create = mpol_new_bind,
467 .rebind = mpol_rebind_nodemask,
468 },
469 };
470
471 static void migrate_page_add(struct page *page, struct list_head *pagelist,
472 unsigned long flags);
473
474 /*
475 * Scan through pages, checking if they meet certain conditions,
476 * and move them to the pagelist if they do.
477 */
478 static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
479 unsigned long addr, unsigned long end,
480 const nodemask_t *nodes, unsigned long flags,
481 void *private)
482 {
483 pte_t *orig_pte;
484 pte_t *pte;
485 spinlock_t *ptl;
486
487 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
488 do {
489 struct page *page;
490 int nid;
491
492 if (!pte_present(*pte))
493 continue;
494 page = vm_normal_page(vma, addr, *pte);
495 if (!page)
496 continue;
497 /*
498 * vm_normal_page() filters out zero pages, but there might
499 * still be PageReserved pages to skip, perhaps in a VDSO.
500 */
501 if (PageReserved(page))
502 continue;
503 nid = page_to_nid(page);
504 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
505 continue;
506
507 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
508 migrate_page_add(page, private, flags);
509 else
510 break;
511 } while (pte++, addr += PAGE_SIZE, addr != end);
512 pte_unmap_unlock(orig_pte, ptl);
513 return addr != end;
514 }
515
516 static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
517 pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
518 void *private)
519 {
520 #ifdef CONFIG_HUGETLB_PAGE
521 int nid;
522 struct page *page;
523 spinlock_t *ptl;
524 pte_t entry;
525
526 ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
527 entry = huge_ptep_get((pte_t *)pmd);
528 if (!pte_present(entry))
529 goto unlock;
530 page = pte_page(entry);
531 nid = page_to_nid(page);
532 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
533 goto unlock;
534 /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
535 if (flags & (MPOL_MF_MOVE_ALL) ||
536 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
537 isolate_huge_page(page, private);
538 unlock:
539 spin_unlock(ptl);
540 #else
541 BUG();
542 #endif
543 }
544
545 static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
546 unsigned long addr, unsigned long end,
547 const nodemask_t *nodes, unsigned long flags,
548 void *private)
549 {
550 pmd_t *pmd;
551 unsigned long next;
552
553 pmd = pmd_offset(pud, addr);
554 do {
555 next = pmd_addr_end(addr, end);
556 if (!pmd_present(*pmd))
557 continue;
558 if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
559 queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
560 flags, private);
561 continue;
562 }
563 split_huge_page_pmd(vma, addr, pmd);
564 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
565 continue;
566 if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
567 flags, private))
568 return -EIO;
569 } while (pmd++, addr = next, addr != end);
570 return 0;
571 }
572
573 static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
574 unsigned long addr, unsigned long end,
575 const nodemask_t *nodes, unsigned long flags,
576 void *private)
577 {
578 pud_t *pud;
579 unsigned long next;
580
581 pud = pud_offset(pgd, addr);
582 do {
583 next = pud_addr_end(addr, end);
584 if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
585 continue;
586 if (pud_none_or_clear_bad(pud))
587 continue;
588 if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
589 flags, private))
590 return -EIO;
591 } while (pud++, addr = next, addr != end);
592 return 0;
593 }
594
595 static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
596 unsigned long addr, unsigned long end,
597 const nodemask_t *nodes, unsigned long flags,
598 void *private)
599 {
600 pgd_t *pgd;
601 unsigned long next;
602
603 pgd = pgd_offset(vma->vm_mm, addr);
604 do {
605 next = pgd_addr_end(addr, end);
606 if (pgd_none_or_clear_bad(pgd))
607 continue;
608 if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
609 flags, private))
610 return -EIO;
611 } while (pgd++, addr = next, addr != end);
612 return 0;
613 }
614
615 #ifdef CONFIG_NUMA_BALANCING
616 /*
617 * This is used to mark a range of virtual addresses to be inaccessible.
618 * These are later cleared by a NUMA hinting fault. Depending on these
619 * faults, pages may be migrated for better NUMA placement.
620 *
621 * This is assuming that NUMA faults are handled using PROT_NONE. If
622 * an architecture makes a different choice, it will need further
623 * changes to the core.
624 */
625 unsigned long change_prot_numa(struct vm_area_struct *vma,
626 unsigned long addr, unsigned long end)
627 {
628 int nr_updated;
629
630 nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
631 if (nr_updated)
632 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
633
634 return nr_updated;
635 }
636 #else
637 static unsigned long change_prot_numa(struct vm_area_struct *vma,
638 unsigned long addr, unsigned long end)
639 {
640 return 0;
641 }
642 #endif /* CONFIG_NUMA_BALANCING */
643
644 /*
645 * Walk through page tables and collect pages to be migrated.
646 *
647 * If pages found in a given range are on a set of nodes (determined by
648 * @nodes and @flags), they are isolated and queued to the pagelist which is
649 * passed via @private.
650 */
651 static int
652 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
653 const nodemask_t *nodes, unsigned long flags, void *private)
654 {
655 int err = 0;
656 struct vm_area_struct *vma, *prev;
657
658 vma = find_vma(mm, start);
659 if (!vma)
660 return -EFAULT;
661 prev = NULL;
662 for (; vma && vma->vm_start < end; vma = vma->vm_next) {
663 unsigned long endvma = vma->vm_end;
664
665 if (endvma > end)
666 endvma = end;
667 if (vma->vm_start > start)
668 start = vma->vm_start;
669
670 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
671 if (!vma->vm_next && vma->vm_end < end)
672 return -EFAULT;
673 if (prev && prev->vm_end < vma->vm_start)
674 return -EFAULT;
675 }
676
677 if (flags & MPOL_MF_LAZY) {
678 /* Similar to task_numa_work, skip inaccessible VMAs */
679 if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
680 change_prot_numa(vma, start, endvma);
681 goto next;
682 }
683
684 if ((flags & MPOL_MF_STRICT) ||
685 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
686 vma_migratable(vma))) {
687
688 err = queue_pages_pgd_range(vma, start, endvma, nodes,
689 flags, private);
690 if (err)
691 break;
692 }
693 next:
694 prev = vma;
695 }
696 return err;
697 }
698
699 /*
700 * Apply policy to a single VMA
701 * This must be called with the mmap_sem held for writing.
702 */
703 static int vma_replace_policy(struct vm_area_struct *vma,
704 struct mempolicy *pol)
705 {
706 int err;
707 struct mempolicy *old;
708 struct mempolicy *new;
709
710 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
711 vma->vm_start, vma->vm_end, vma->vm_pgoff,
712 vma->vm_ops, vma->vm_file,
713 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
714
715 new = mpol_dup(pol);
716 if (IS_ERR(new))
717 return PTR_ERR(new);
718
719 if (vma->vm_ops && vma->vm_ops->set_policy) {
720 err = vma->vm_ops->set_policy(vma, new);
721 if (err)
722 goto err_out;
723 }
724
725 old = vma->vm_policy;
726 vma->vm_policy = new; /* protected by mmap_sem */
727 mpol_put(old);
728
729 return 0;
730 err_out:
731 mpol_put(new);
732 return err;
733 }
734
735 /* Step 2: apply policy to a range and do splits. */
736 static int mbind_range(struct mm_struct *mm, unsigned long start,
737 unsigned long end, struct mempolicy *new_pol)
738 {
739 struct vm_area_struct *next;
740 struct vm_area_struct *prev;
741 struct vm_area_struct *vma;
742 int err = 0;
743 pgoff_t pgoff;
744 unsigned long vmstart;
745 unsigned long vmend;
746
747 vma = find_vma(mm, start);
748 if (!vma || vma->vm_start > start)
749 return -EFAULT;
750
751 prev = vma->vm_prev;
752 if (start > vma->vm_start)
753 prev = vma;
754
755 for (; vma && vma->vm_start < end; prev = vma, vma = next) {
756 next = vma->vm_next;
757 vmstart = max(start, vma->vm_start);
758 vmend = min(end, vma->vm_end);
759
760 if (mpol_equal(vma_policy(vma), new_pol))
761 continue;
762
763 pgoff = vma->vm_pgoff +
764 ((vmstart - vma->vm_start) >> PAGE_SHIFT);
765 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
766 vma->anon_vma, vma->vm_file, pgoff,
767 new_pol);
768 if (prev) {
769 vma = prev;
770 next = vma->vm_next;
771 if (mpol_equal(vma_policy(vma), new_pol))
772 continue;
773 /* vma_merge() joined vma && vma->next, case 8 */
774 goto replace;
775 }
776 if (vma->vm_start != vmstart) {
777 err = split_vma(vma->vm_mm, vma, vmstart, 1);
778 if (err)
779 goto out;
780 }
781 if (vma->vm_end != vmend) {
782 err = split_vma(vma->vm_mm, vma, vmend, 0);
783 if (err)
784 goto out;
785 }
786 replace:
787 err = vma_replace_policy(vma, new_pol);
788 if (err)
789 goto out;
790 }
791
792 out:
793 return err;
794 }
795
796 /* Set the process memory policy */
797 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
798 nodemask_t *nodes)
799 {
800 struct mempolicy *new, *old;
801 NODEMASK_SCRATCH(scratch);
802 int ret;
803
804 if (!scratch)
805 return -ENOMEM;
806
807 new = mpol_new(mode, flags, nodes);
808 if (IS_ERR(new)) {
809 ret = PTR_ERR(new);
810 goto out;
811 }
812
813 task_lock(current);
814 ret = mpol_set_nodemask(new, nodes, scratch);
815 if (ret) {
816 task_unlock(current);
817 mpol_put(new);
818 goto out;
819 }
820 old = current->mempolicy;
821 current->mempolicy = new;
822 if (new && new->mode == MPOL_INTERLEAVE &&
823 nodes_weight(new->v.nodes))
824 current->il_next = first_node(new->v.nodes);
825 task_unlock(current);
826 mpol_put(old);
827 ret = 0;
828 out:
829 NODEMASK_SCRATCH_FREE(scratch);
830 return ret;
831 }
832
833 /*
834 * Return nodemask for policy for get_mempolicy() query
835 *
836 * Called with task's alloc_lock held
837 */
838 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
839 {
840 nodes_clear(*nodes);
841 if (p == &default_policy)
842 return;
843
844 switch (p->mode) {
845 case MPOL_BIND:
846 /* Fall through */
847 case MPOL_INTERLEAVE:
848 *nodes = p->v.nodes;
849 break;
850 case MPOL_PREFERRED:
851 if (!(p->flags & MPOL_F_LOCAL))
852 node_set(p->v.preferred_node, *nodes);
853 /* else return empty node mask for local allocation */
854 break;
855 default:
856 BUG();
857 }
858 }
859
860 static int lookup_node(struct mm_struct *mm, unsigned long addr)
861 {
862 struct page *p;
863 int err;
864
865 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
866 if (err >= 0) {
867 err = page_to_nid(p);
868 put_page(p);
869 }
870 return err;
871 }
872
873 /* Retrieve NUMA policy */
874 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
875 unsigned long addr, unsigned long flags)
876 {
877 int err;
878 struct mm_struct *mm = current->mm;
879 struct vm_area_struct *vma = NULL;
880 struct mempolicy *pol = current->mempolicy;
881
882 if (flags &
883 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
884 return -EINVAL;
885
886 if (flags & MPOL_F_MEMS_ALLOWED) {
887 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
888 return -EINVAL;
889 *policy = 0; /* just so it's initialized */
890 task_lock(current);
891 *nmask = cpuset_current_mems_allowed;
892 task_unlock(current);
893 return 0;
894 }
895
896 if (flags & MPOL_F_ADDR) {
897 /*
898 * Do NOT fall back to task policy if the
899 * vma/shared policy at addr is NULL. We
900 * want to return MPOL_DEFAULT in this case.
901 */
902 down_read(&mm->mmap_sem);
903 vma = find_vma_intersection(mm, addr, addr+1);
904 if (!vma) {
905 up_read(&mm->mmap_sem);
906 return -EFAULT;
907 }
908 if (vma->vm_ops && vma->vm_ops->get_policy)
909 pol = vma->vm_ops->get_policy(vma, addr);
910 else
911 pol = vma->vm_policy;
912 } else if (addr)
913 return -EINVAL;
914
915 if (!pol)
916 pol = &default_policy; /* indicates default behavior */
917
918 if (flags & MPOL_F_NODE) {
919 if (flags & MPOL_F_ADDR) {
920 err = lookup_node(mm, addr);
921 if (err < 0)
922 goto out;
923 *policy = err;
924 } else if (pol == current->mempolicy &&
925 pol->mode == MPOL_INTERLEAVE) {
926 *policy = current->il_next;
927 } else {
928 err = -EINVAL;
929 goto out;
930 }
931 } else {
932 *policy = pol == &default_policy ? MPOL_DEFAULT :
933 pol->mode;
934 /*
935 * Internal mempolicy flags must be masked off before exposing
936 * the policy to userspace.
937 */
938 *policy |= (pol->flags & MPOL_MODE_FLAGS);
939 }
940
941 if (vma) {
942 up_read(&current->mm->mmap_sem);
943 vma = NULL;
944 }
945
946 err = 0;
947 if (nmask) {
948 if (mpol_store_user_nodemask(pol)) {
949 *nmask = pol->w.user_nodemask;
950 } else {
951 task_lock(current);
952 get_policy_nodemask(pol, nmask);
953 task_unlock(current);
954 }
955 }
956
957 out:
958 mpol_cond_put(pol);
959 if (vma)
960 up_read(&current->mm->mmap_sem);
961 return err;
962 }
963
964 #ifdef CONFIG_MIGRATION
965 /*
966 * page migration
967 */
968 static void migrate_page_add(struct page *page, struct list_head *pagelist,
969 unsigned long flags)
970 {
971 /*
972 * Avoid migrating a page that is shared with others.
973 */
974 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
975 if (!isolate_lru_page(page)) {
976 list_add_tail(&page->lru, pagelist);
977 inc_zone_page_state(page, NR_ISOLATED_ANON +
978 page_is_file_cache(page));
979 }
980 }
981 }
982
983 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
984 {
985 if (PageHuge(page))
986 return alloc_huge_page_node(page_hstate(compound_head(page)),
987 node);
988 else
989 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
990 }
991
992 /*
993 * Migrate pages from one node to a target node.
994 * Returns error or the number of pages not migrated.
995 */
996 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
997 int flags)
998 {
999 nodemask_t nmask;
1000 LIST_HEAD(pagelist);
1001 int err = 0;
1002
1003 nodes_clear(nmask);
1004 node_set(source, nmask);
1005
1006 /*
1007 * This does not "check" the range but isolates all pages that
1008 * need migration. Between passing in the full user address
1009 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1010 */
1011 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1012 queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1013 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1014
1015 if (!list_empty(&pagelist)) {
1016 err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1017 MIGRATE_SYNC, MR_SYSCALL);
1018 if (err)
1019 putback_movable_pages(&pagelist);
1020 }
1021
1022 return err;
1023 }
1024
1025 /*
1026 * Move pages between the two nodesets so as to preserve the physical
1027 * layout as much as possible.
1028 *
1029 * Returns the number of pages that could not be moved.
1030 */
1031 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1032 const nodemask_t *to, int flags)
1033 {
1034 int busy = 0;
1035 int err;
1036 nodemask_t tmp;
1037
1038 err = migrate_prep();
1039 if (err)
1040 return err;
1041
1042 down_read(&mm->mmap_sem);
1043
1044 err = migrate_vmas(mm, from, to, flags);
1045 if (err)
1046 goto out;
1047
1048 /*
1049 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1050 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1051 * bit in 'tmp', and return that <source, dest> pair for migration.
1052 * The pair of nodemasks 'to' and 'from' define the map.
1053 *
1054 * If no pair of bits is found that way, fall back to picking some
1055 * pair of 'source' and 'dest' bits that are not the same. If the
1056 * 'source' and 'dest' bits are the same, this represents a node
1057 * that will be migrating to itself, so no pages need move.
1058 *
1059 * If no bits are left in 'tmp', or if all remaining bits left
1060 * in 'tmp' correspond to the same bit in 'to', return false
1061 * (nothing left to migrate).
1062 *
1063 * This lets us pick a pair of nodes to migrate between, such that
1064 * if possible the dest node is not already occupied by some other
1065 * source node, minimizing the risk of overloading the memory on a
1066 * node that would happen if we migrated incoming memory to a node
1067 * before migrating outgoing memory sourced from that same node.
1068 *
1069 * A single scan of tmp is sufficient. As we go, we remember the
1070 * most recent <s, d> pair that moved (s != d). If we find a pair
1071 * that not only moved, but what's better, moved to an empty slot
1072 * (d is not set in tmp), then we break out then, with that pair.
1073 * Otherwise when we finish scanning from_tmp, we at least have the
1074 * most recent <s, d> pair that moved. If we get all the way through
1075 * the scan of tmp without finding any node that moved, much less
1076 * moved to an empty node, then there is nothing left worth migrating.
1077 */
1078
1079 tmp = *from;
1080 while (!nodes_empty(tmp)) {
1081 int s,d;
1082 int source = NUMA_NO_NODE;
1083 int dest = 0;
1084
1085 for_each_node_mask(s, tmp) {
1086
1087 /*
1088 * do_migrate_pages() tries to maintain the relative
1089 * node relationship of the pages established between
1090 * threads and memory areas.
1091 *
1092 * However if the number of source nodes is not equal to
1093 * the number of destination nodes we can not preserve
1094 * this node relative relationship. In that case, skip
1095 * copying memory from a node that is in the destination
1096 * mask.
1097 *
1098 * Example: [2,3,4] -> [3,4,5] moves everything.
1099 * [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1100 */
1101
1102 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1103 (node_isset(s, *to)))
1104 continue;
1105
1106 d = node_remap(s, *from, *to);
1107 if (s == d)
1108 continue;
1109
1110 source = s; /* Node moved. Memorize */
1111 dest = d;
1112
1113 /* dest not in remaining from nodes? */
1114 if (!node_isset(dest, tmp))
1115 break;
1116 }
1117 if (source == NUMA_NO_NODE)
1118 break;
1119
1120 node_clear(source, tmp);
1121 err = migrate_to_node(mm, source, dest, flags);
1122 if (err > 0)
1123 busy += err;
1124 if (err < 0)
1125 break;
1126 }
1127 out:
1128 up_read(&mm->mmap_sem);
1129 if (err < 0)
1130 return err;
1131 return busy;
1132
1133 }
1134
1135 /*
1136 * Allocate a new page for page migration based on vma policy.
1137 * Start by assuming the page is mapped by the same vma that contains @start.
1138 * Search forward from there, if not. N.B., this assumes that the
1139 * list of pages handed to migrate_pages()--which is how we get here--
1140 * is in virtual address order.
1141 */
1142 static struct page *new_page(struct page *page, unsigned long start, int **x)
1143 {
1144 struct vm_area_struct *vma;
1145 unsigned long uninitialized_var(address);
1146
1147 vma = find_vma(current->mm, start);
1148 while (vma) {
1149 address = page_address_in_vma(page, vma);
1150 if (address != -EFAULT)
1151 break;
1152 vma = vma->vm_next;
1153 }
1154
1155 if (PageHuge(page)) {
1156 BUG_ON(!vma);
1157 return alloc_huge_page_noerr(vma, address, 1);
1158 }
1159 /*
1160 * if !vma, alloc_page_vma() will use task or system default policy
1161 */
1162 return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1163 }
1164 #else
1165
1166 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1167 unsigned long flags)
1168 {
1169 }
1170
1171 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1172 const nodemask_t *to, int flags)
1173 {
1174 return -ENOSYS;
1175 }
1176
1177 static struct page *new_page(struct page *page, unsigned long start, int **x)
1178 {
1179 return NULL;
1180 }
1181 #endif
1182
1183 static long do_mbind(unsigned long start, unsigned long len,
1184 unsigned short mode, unsigned short mode_flags,
1185 nodemask_t *nmask, unsigned long flags)
1186 {
1187 struct mm_struct *mm = current->mm;
1188 struct mempolicy *new;
1189 unsigned long end;
1190 int err;
1191 LIST_HEAD(pagelist);
1192
1193 if (flags & ~(unsigned long)MPOL_MF_VALID)
1194 return -EINVAL;
1195 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1196 return -EPERM;
1197
1198 if (start & ~PAGE_MASK)
1199 return -EINVAL;
1200
1201 if (mode == MPOL_DEFAULT)
1202 flags &= ~MPOL_MF_STRICT;
1203
1204 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1205 end = start + len;
1206
1207 if (end < start)
1208 return -EINVAL;
1209 if (end == start)
1210 return 0;
1211
1212 new = mpol_new(mode, mode_flags, nmask);
1213 if (IS_ERR(new))
1214 return PTR_ERR(new);
1215
1216 if (flags & MPOL_MF_LAZY)
1217 new->flags |= MPOL_F_MOF;
1218
1219 /*
1220 * If we are using the default policy then operating
1221 * on discontinuous address spaces is okay after all.
1222 */
1223 if (!new)
1224 flags |= MPOL_MF_DISCONTIG_OK;
1225
1226 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1227 start, start + len, mode, mode_flags,
1228 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1229
1230 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1231
1232 err = migrate_prep();
1233 if (err)
1234 goto mpol_out;
1235 }
1236 {
1237 NODEMASK_SCRATCH(scratch);
1238 if (scratch) {
1239 down_write(&mm->mmap_sem);
1240 task_lock(current);
1241 err = mpol_set_nodemask(new, nmask, scratch);
1242 task_unlock(current);
1243 if (err)
1244 up_write(&mm->mmap_sem);
1245 } else
1246 err = -ENOMEM;
1247 NODEMASK_SCRATCH_FREE(scratch);
1248 }
1249 if (err)
1250 goto mpol_out;
1251
1252 err = queue_pages_range(mm, start, end, nmask,
1253 flags | MPOL_MF_INVERT, &pagelist);
1254 if (!err)
1255 err = mbind_range(mm, start, end, new);
1256
1257 if (!err) {
1258 int nr_failed = 0;
1259
1260 if (!list_empty(&pagelist)) {
1261 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1262 nr_failed = migrate_pages(&pagelist, new_page, NULL,
1263 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1264 if (nr_failed)
1265 putback_movable_pages(&pagelist);
1266 }
1267
1268 if (nr_failed && (flags & MPOL_MF_STRICT))
1269 err = -EIO;
1270 } else
1271 putback_movable_pages(&pagelist);
1272
1273 up_write(&mm->mmap_sem);
1274 mpol_out:
1275 mpol_put(new);
1276 return err;
1277 }
1278
1279 /*
1280 * User space interface with variable sized bitmaps for nodelists.
1281 */
1282
1283 /* Copy a node mask from user space. */
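/*
 * Worked example (editorial addition): on a 64-bit kernel a caller passing
 * maxnode = 65 means bits 0..63 are significant.  After the decrement below,
 * nlongs = BITS_TO_LONGS(64) = 1 and endmask = ~0UL, so exactly one
 * unsigned long is copied from user space and no trailing bits are masked.
 */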
1284 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1285 unsigned long maxnode)
1286 {
1287 unsigned long k;
1288 unsigned long nlongs;
1289 unsigned long endmask;
1290
1291 --maxnode;
1292 nodes_clear(*nodes);
1293 if (maxnode == 0 || !nmask)
1294 return 0;
1295 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1296 return -EINVAL;
1297
1298 nlongs = BITS_TO_LONGS(maxnode);
1299 if ((maxnode % BITS_PER_LONG) == 0)
1300 endmask = ~0UL;
1301 else
1302 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1303
1304 /* When the user specified more nodes than supported, just check
1305 that the unsupported part is all zero. */
1306 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1307 if (nlongs > PAGE_SIZE/sizeof(long))
1308 return -EINVAL;
1309 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1310 unsigned long t;
1311 if (get_user(t, nmask + k))
1312 return -EFAULT;
1313 if (k == nlongs - 1) {
1314 if (t & endmask)
1315 return -EINVAL;
1316 } else if (t)
1317 return -EINVAL;
1318 }
1319 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1320 endmask = ~0UL;
1321 }
1322
1323 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1324 return -EFAULT;
1325 nodes_addr(*nodes)[nlongs-1] &= endmask;
1326 return 0;
1327 }
1328
1329 /* Copy a kernel node mask to user space */
1330 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1331 nodemask_t *nodes)
1332 {
1333 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1334 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1335
1336 if (copy > nbytes) {
1337 if (copy > PAGE_SIZE)
1338 return -EINVAL;
1339 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1340 return -EFAULT;
1341 copy = nbytes;
1342 }
1343 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1344 }
1345
1346 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1347 unsigned long, mode, const unsigned long __user *, nmask,
1348 unsigned long, maxnode, unsigned, flags)
1349 {
1350 nodemask_t nodes;
1351 int err;
1352 unsigned short mode_flags;
1353
1354 mode_flags = mode & MPOL_MODE_FLAGS;
1355 mode &= ~MPOL_MODE_FLAGS;
1356 if (mode >= MPOL_MAX)
1357 return -EINVAL;
1358 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1359 (mode_flags & MPOL_F_RELATIVE_NODES))
1360 return -EINVAL;
1361 err = get_nodes(&nodes, nmask, maxnode);
1362 if (err)
1363 return err;
1364 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1365 }
1366
1367 /* Set the process memory policy */
1368 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1369 unsigned long, maxnode)
1370 {
1371 int err;
1372 nodemask_t nodes;
1373 unsigned short flags;
1374
1375 flags = mode & MPOL_MODE_FLAGS;
1376 mode &= ~MPOL_MODE_FLAGS;
1377 if ((unsigned int)mode >= MPOL_MAX)
1378 return -EINVAL;
1379 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1380 return -EINVAL;
1381 err = get_nodes(&nodes, nmask, maxnode);
1382 if (err)
1383 return err;
1384 return do_set_mempolicy(mode, flags, &nodes);
1385 }
1386
1387 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1388 const unsigned long __user *, old_nodes,
1389 const unsigned long __user *, new_nodes)
1390 {
1391 const struct cred *cred = current_cred(), *tcred;
1392 struct mm_struct *mm = NULL;
1393 struct task_struct *task;
1394 nodemask_t task_nodes;
1395 int err;
1396 nodemask_t *old;
1397 nodemask_t *new;
1398 NODEMASK_SCRATCH(scratch);
1399
1400 if (!scratch)
1401 return -ENOMEM;
1402
1403 old = &scratch->mask1;
1404 new = &scratch->mask2;
1405
1406 err = get_nodes(old, old_nodes, maxnode);
1407 if (err)
1408 goto out;
1409
1410 err = get_nodes(new, new_nodes, maxnode);
1411 if (err)
1412 goto out;
1413
1414 /* Find the mm_struct */
1415 rcu_read_lock();
1416 task = pid ? find_task_by_vpid(pid) : current;
1417 if (!task) {
1418 rcu_read_unlock();
1419 err = -ESRCH;
1420 goto out;
1421 }
1422 get_task_struct(task);
1423
1424 err = -EINVAL;
1425
1426 /*
1427 * Check if this process has the right to modify the specified
1428 * process. The right exists if the process has administrative
1429 * capabilities, superuser privileges or the same
1430 * userid as the target process.
1431 */
1432 tcred = __task_cred(task);
1433 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1434 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
1435 !capable(CAP_SYS_NICE)) {
1436 rcu_read_unlock();
1437 err = -EPERM;
1438 goto out_put;
1439 }
1440 rcu_read_unlock();
1441
1442 task_nodes = cpuset_mems_allowed(task);
1443 /* Is the user allowed to access the target nodes? */
1444 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1445 err = -EPERM;
1446 goto out_put;
1447 }
1448
1449 if (!nodes_subset(*new, node_states[N_MEMORY])) {
1450 err = -EINVAL;
1451 goto out_put;
1452 }
1453
1454 err = security_task_movememory(task);
1455 if (err)
1456 goto out_put;
1457
1458 mm = get_task_mm(task);
1459 put_task_struct(task);
1460
1461 if (!mm) {
1462 err = -EINVAL;
1463 goto out;
1464 }
1465
1466 err = do_migrate_pages(mm, old, new,
1467 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1468
1469 mmput(mm);
1470 out:
1471 NODEMASK_SCRATCH_FREE(scratch);
1472
1473 return err;
1474
1475 out_put:
1476 put_task_struct(task);
1477 goto out;
1478
1479 }
1480
1481
1482 /* Retrieve NUMA policy */
1483 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1484 unsigned long __user *, nmask, unsigned long, maxnode,
1485 unsigned long, addr, unsigned long, flags)
1486 {
1487 int err;
1488 int uninitialized_var(pval);
1489 nodemask_t nodes;
1490
1491 if (nmask != NULL && maxnode < MAX_NUMNODES)
1492 return -EINVAL;
1493
1494 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1495
1496 if (err)
1497 return err;
1498
1499 if (policy && put_user(pval, policy))
1500 return -EFAULT;
1501
1502 if (nmask)
1503 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1504
1505 return err;
1506 }
1507
1508 #ifdef CONFIG_COMPAT
1509
1510 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1511 compat_ulong_t __user *, nmask,
1512 compat_ulong_t, maxnode,
1513 compat_ulong_t, addr, compat_ulong_t, flags)
1514 {
1515 long err;
1516 unsigned long __user *nm = NULL;
1517 unsigned long nr_bits, alloc_size;
1518 DECLARE_BITMAP(bm, MAX_NUMNODES);
1519
1520 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1521 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1522
1523 if (nmask)
1524 nm = compat_alloc_user_space(alloc_size);
1525
1526 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1527
1528 if (!err && nmask) {
1529 unsigned long copy_size;
1530 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1531 err = copy_from_user(bm, nm, copy_size);
1532 /* ensure entire bitmap is zeroed */
1533 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1534 err |= compat_put_bitmap(nmask, bm, nr_bits);
1535 }
1536
1537 return err;
1538 }
1539
1540 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1541 compat_ulong_t, maxnode)
1542 {
1543 long err = 0;
1544 unsigned long __user *nm = NULL;
1545 unsigned long nr_bits, alloc_size;
1546 DECLARE_BITMAP(bm, MAX_NUMNODES);
1547
1548 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1549 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1550
1551 if (nmask) {
1552 err = compat_get_bitmap(bm, nmask, nr_bits);
1553 nm = compat_alloc_user_space(alloc_size);
1554 err |= copy_to_user(nm, bm, alloc_size);
1555 }
1556
1557 if (err)
1558 return -EFAULT;
1559
1560 return sys_set_mempolicy(mode, nm, nr_bits+1);
1561 }
1562
1563 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1564 compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1565 compat_ulong_t, maxnode, compat_ulong_t, flags)
1566 {
1567 long err = 0;
1568 unsigned long __user *nm = NULL;
1569 unsigned long nr_bits, alloc_size;
1570 nodemask_t bm;
1571
1572 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1573 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1574
1575 if (nmask) {
1576 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1577 nm = compat_alloc_user_space(alloc_size);
1578 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1579 }
1580
1581 if (err)
1582 return -EFAULT;
1583
1584 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1585 }
1586
1587 #endif
1588
1589 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1590 unsigned long addr)
1591 {
1592 struct mempolicy *pol = NULL;
1593
1594 if (vma) {
1595 if (vma->vm_ops && vma->vm_ops->get_policy) {
1596 pol = vma->vm_ops->get_policy(vma, addr);
1597 } else if (vma->vm_policy) {
1598 pol = vma->vm_policy;
1599
1600 /*
1601 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1602 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1603 * count on these policies which will be dropped by
1604 * mpol_cond_put() later
1605 */
1606 if (mpol_needs_cond_ref(pol))
1607 mpol_get(pol);
1608 }
1609 }
1610
1611 return pol;
1612 }
1613
1614 /*
1615 * get_vma_policy(@vma, @addr)
1616 * @vma: virtual memory area whose policy is sought
1617 * @addr: address in @vma for shared policy lookup
1618 *
1619 * Returns effective policy for a VMA at specified address.
1620 * Falls back to current->mempolicy or system default policy, as necessary.
1621 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1622 * count--added by the get_policy() vm_op, as appropriate--to protect against
1623 * freeing by another task. It is the caller's responsibility to free the
1624 * extra reference for shared policies.
1625 */
1626 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1627 unsigned long addr)
1628 {
1629 struct mempolicy *pol = __get_vma_policy(vma, addr);
1630
1631 if (!pol)
1632 pol = get_task_policy(current);
1633
1634 return pol;
1635 }
1636
1637 bool vma_policy_mof(struct vm_area_struct *vma)
1638 {
1639 struct mempolicy *pol;
1640
1641 if (vma->vm_ops && vma->vm_ops->get_policy) {
1642 bool ret = false;
1643
1644 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1645 if (pol && (pol->flags & MPOL_F_MOF))
1646 ret = true;
1647 mpol_cond_put(pol);
1648
1649 return ret;
1650 }
1651
1652 pol = vma->vm_policy;
1653 if (!pol)
1654 pol = get_task_policy(current);
1655
1656 return pol->flags & MPOL_F_MOF;
1657 }
1658
1659 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1660 {
1661 enum zone_type dynamic_policy_zone = policy_zone;
1662
1663 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1664
1665 /*
1666 * If policy->v.nodes has movable memory only,
1667 * we apply policy only when gfp_zone(gfp) == ZONE_MOVABLE.
1668 *
1669 * policy->v.nodes is intersected with node_states[N_MEMORY],
1670 * so if the following test fails, it implies
1671 * policy->v.nodes has movable memory only.
1672 */
1673 if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1674 dynamic_policy_zone = ZONE_MOVABLE;
1675
1676 return zone >= dynamic_policy_zone;
1677 }
1678
1679 /*
1680 * Return a nodemask representing a mempolicy for filtering nodes for
1681 * page allocation
1682 */
1683 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1684 {
1685 /* Lower zones don't get a nodemask applied for MPOL_BIND */
1686 if (unlikely(policy->mode == MPOL_BIND) &&
1687 apply_policy_zone(policy, gfp_zone(gfp)) &&
1688 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1689 return &policy->v.nodes;
1690
1691 return NULL;
1692 }
1693
1694 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1695 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1696 int nd)
1697 {
1698 switch (policy->mode) {
1699 case MPOL_PREFERRED:
1700 if (!(policy->flags & MPOL_F_LOCAL))
1701 nd = policy->v.preferred_node;
1702 break;
1703 case MPOL_BIND:
1704 /*
1705 * Normally, MPOL_BIND allocations are node-local within the
1706 * allowed nodemask. However, if __GFP_THISNODE is set and the
1707 * current node isn't part of the mask, we use the zonelist for
1708 * the first node in the mask instead.
1709 */
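/*
 * Worked example (editorial addition): a __GFP_THISNODE allocation asking
 * for node 0 while policy->v.nodes = {2,3} falls through to node 2's
 * zonelist instead.
 */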
1710 if (unlikely(gfp & __GFP_THISNODE) &&
1711 unlikely(!node_isset(nd, policy->v.nodes)))
1712 nd = first_node(policy->v.nodes);
1713 break;
1714 default:
1715 BUG();
1716 }
1717 return node_zonelist(nd, gfp);
1718 }
1719
1720 /* Do dynamic interleaving for a process */
1721 static unsigned interleave_nodes(struct mempolicy *policy)
1722 {
1723 unsigned nid, next;
1724 struct task_struct *me = current;
1725
1726 nid = me->il_next;
1727 next = next_node(nid, policy->v.nodes);
1728 if (next >= MAX_NUMNODES)
1729 next = first_node(policy->v.nodes);
1730 if (next < MAX_NUMNODES)
1731 me->il_next = next;
1732 return nid;
1733 }
1734
1735 /*
1736 * Depending on the memory policy, provide a node from which to allocate the
1737 * next slab entry.
1738 */
1739 unsigned int mempolicy_slab_node(void)
1740 {
1741 struct mempolicy *policy;
1742 int node = numa_mem_id();
1743
1744 if (in_interrupt())
1745 return node;
1746
1747 policy = current->mempolicy;
1748 if (!policy || policy->flags & MPOL_F_LOCAL)
1749 return node;
1750
1751 switch (policy->mode) {
1752 case MPOL_PREFERRED:
1753 /*
1754 * handled MPOL_F_LOCAL above
1755 */
1756 return policy->v.preferred_node;
1757
1758 case MPOL_INTERLEAVE:
1759 return interleave_nodes(policy);
1760
1761 case MPOL_BIND: {
1762 /*
1763 * Follow bind policy behavior and start allocation at the
1764 * first node.
1765 */
1766 struct zonelist *zonelist;
1767 struct zone *zone;
1768 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1769 zonelist = &NODE_DATA(node)->node_zonelists[0];
1770 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1771 &policy->v.nodes,
1772 &zone);
1773 return zone ? zone->node : node;
1774 }
1775
1776 default:
1777 BUG();
1778 }
1779 }
1780
1781 /* Do static interleaving for a VMA with known offset. */
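/*
 * Worked example (editorial addition): with pol->v.nodes = {1,3,5} and
 * off = 7, nnodes = 3 and target = 7 % 3 = 1, so the loop below advances
 * past node 1 and returns node 3 (the second node in the mask).
 */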
1782 static unsigned offset_il_node(struct mempolicy *pol,
1783 struct vm_area_struct *vma, unsigned long off)
1784 {
1785 unsigned nnodes = nodes_weight(pol->v.nodes);
1786 unsigned target;
1787 int c;
1788 int nid = NUMA_NO_NODE;
1789
1790 if (!nnodes)
1791 return numa_node_id();
1792 target = (unsigned int)off % nnodes;
1793 c = 0;
1794 do {
1795 nid = next_node(nid, pol->v.nodes);
1796 c++;
1797 } while (c <= target);
1798 return nid;
1799 }
1800
1801 /* Determine a node number for interleave */
1802 static inline unsigned interleave_nid(struct mempolicy *pol,
1803 struct vm_area_struct *vma, unsigned long addr, int shift)
1804 {
1805 if (vma) {
1806 unsigned long off;
1807
1808 /*
1809 * for small pages, there is no difference between
1810 * shift and PAGE_SHIFT, so the bit-shift is safe.
1811 * for huge pages, since vm_pgoff is in units of small
1812 * pages, we need to shift off the always 0 bits to get
1813 * a useful offset.
1814 */
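/*
 * Worked example (editorial addition): for a 2MB hugetlb mapping on a
 * 4KB-page kernel, shift == 21 and PAGE_SHIFT == 12, so vm_pgoff (counted
 * in 4KB pages) is shifted right by 9 (divided by 512) and the offset
 * within the VMA by 21 bits (2MB units), giving a huge-page index for
 * offset_il_node().
 */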
1815 BUG_ON(shift < PAGE_SHIFT);
1816 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1817 off += (addr - vma->vm_start) >> shift;
1818 return offset_il_node(pol, vma, off);
1819 } else
1820 return interleave_nodes(pol);
1821 }
1822
1823 /*
1824 * Return the bit number of a random bit set in the nodemask.
1825 * (returns NUMA_NO_NODE if nodemask is empty)
1826 */
1827 int node_random(const nodemask_t *maskp)
1828 {
1829 int w, bit = NUMA_NO_NODE;
1830
1831 w = nodes_weight(*maskp);
1832 if (w)
1833 bit = bitmap_ord_to_pos(maskp->bits,
1834 get_random_int() % w, MAX_NUMNODES);
1835 return bit;
1836 }
1837
1838 #ifdef CONFIG_HUGETLBFS
1839 /*
1840 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1841 * @vma: virtual memory area whose policy is sought
1842 * @addr: address in @vma for shared policy lookup and interleave policy
1843 * @gfp_flags: for requested zone
1844 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1845 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1846 *
1847 * Returns a zonelist suitable for a huge page allocation and a pointer
1848 * to the struct mempolicy for conditional unref after allocation.
1849 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1850 * @nodemask for filtering the zonelist.
1851 *
1852 * Must be protected by read_mems_allowed_begin()
1853 */
1854 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1855 gfp_t gfp_flags, struct mempolicy **mpol,
1856 nodemask_t **nodemask)
1857 {
1858 struct zonelist *zl;
1859
1860 *mpol = get_vma_policy(vma, addr);
1861 *nodemask = NULL; /* assume !MPOL_BIND */
1862
1863 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1864 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1865 huge_page_shift(hstate_vma(vma))), gfp_flags);
1866 } else {
1867 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1868 if ((*mpol)->mode == MPOL_BIND)
1869 *nodemask = &(*mpol)->v.nodes;
1870 }
1871 return zl;
1872 }
1873
1874 /*
1875 * init_nodemask_of_mempolicy
1876 *
1877 * If the current task's mempolicy is "default" [NULL], return 'false'
1878 * to indicate default policy. Otherwise, extract the policy nodemask
1879 * for 'bind' or 'interleave' policy into the argument nodemask, or
1880 * initialize the argument nodemask to contain the single node for
1881 * 'preferred' or 'local' policy and return 'true' to indicate presence
1882 * of non-default mempolicy.
1883 *
1884 * We don't bother with reference counting the mempolicy [mpol_get/put]
1885 * because the current task is examining its own mempolicy and a task's
1886 * mempolicy is only ever changed by the task itself.
1887 *
1888 * N.B., it is the caller's responsibility to free a returned nodemask.
1889 */
1890 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1891 {
1892 struct mempolicy *mempolicy;
1893 int nid;
1894
1895 if (!(mask && current->mempolicy))
1896 return false;
1897
1898 task_lock(current);
1899 mempolicy = current->mempolicy;
1900 switch (mempolicy->mode) {
1901 case MPOL_PREFERRED:
1902 if (mempolicy->flags & MPOL_F_LOCAL)
1903 nid = numa_node_id();
1904 else
1905 nid = mempolicy->v.preferred_node;
1906 init_nodemask_of_node(mask, nid);
1907 break;
1908
1909 case MPOL_BIND:
1910 /* Fall through */
1911 case MPOL_INTERLEAVE:
1912 *mask = mempolicy->v.nodes;
1913 break;
1914
1915 default:
1916 BUG();
1917 }
1918 task_unlock(current);
1919
1920 return true;
1921 }
1922 #endif
1923
1924 /*
1925 * mempolicy_nodemask_intersects
1926 *
1927 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1928 * policy. Otherwise, check for intersection between mask and the policy
1929 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
1930 * policy, always return true since it may allocate elsewhere on fallback.
1931 *
1932 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1933 */
1934 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1935 const nodemask_t *mask)
1936 {
1937 struct mempolicy *mempolicy;
1938 bool ret = true;
1939
1940 if (!mask)
1941 return ret;
1942 task_lock(tsk);
1943 mempolicy = tsk->mempolicy;
1944 if (!mempolicy)
1945 goto out;
1946
1947 switch (mempolicy->mode) {
1948 case MPOL_PREFERRED:
1949 /*
1950 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
1951 * allocate from; they may fall back to other nodes when oom.
1952 * Thus, it's possible for tsk to have allocated memory from
1953 * nodes in mask.
1954 */
1955 break;
1956 case MPOL_BIND:
1957 case MPOL_INTERLEAVE:
1958 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1959 break;
1960 default:
1961 BUG();
1962 }
1963 out:
1964 task_unlock(tsk);
1965 return ret;
1966 }
1967
1968 /* Allocate a page in interleaved policy.
1969 Own path because it needs to do special accounting. */
1970 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1971 unsigned nid)
1972 {
1973 struct zonelist *zl;
1974 struct page *page;
1975
1976 zl = node_zonelist(nid, gfp);
1977 page = __alloc_pages(gfp, order, zl);
1978 if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1979 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1980 return page;
1981 }
1982
1983 /**
1984 * alloc_pages_vma - Allocate a page for a VMA.
1985 *
1986 * @gfp:
1987 * %GFP_USER user allocation.
1988 * %GFP_KERNEL kernel allocations,
1989 * %GFP_HIGHMEM highmem/user allocations,
1990 * %GFP_FS allocation should not call back into a file system.
1991 * %GFP_ATOMIC don't sleep.
1992 *
1993 * @order:Order of the GFP allocation.
1994 * @vma: Pointer to VMA or NULL if not available.
1995 * @addr: Virtual Address of the allocation. Must be inside the VMA.
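 * @node: Which node to prefer for allocation (modulo policy).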
1996 *
1997 * This function allocates a page from the kernel page pool and applies
1998 * a NUMA policy associated with the VMA or the current process.
1999 * When VMA is not NULL the caller must hold down_read on the mmap_sem
2000 * of the VMA's mm_struct to prevent it from going away. Should be used
2001 * for all allocations for pages that will be mapped into user space.
2002 * Returns NULL when no page can be allocated.
2003 *
2004 * Should be called with the mmap_sem of the vma held.
2005 */
2006 struct page *
2007 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2008 unsigned long addr, int node)
2009 {
2010 struct mempolicy *pol;
2011 struct page *page;
2012 unsigned int cpuset_mems_cookie;
2013
2014 retry_cpuset:
2015 pol = get_vma_policy(vma, addr);
2016 cpuset_mems_cookie = read_mems_allowed_begin();
2017
2018 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
2019 unsigned nid;
2020
2021 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2022 mpol_cond_put(pol);
2023 page = alloc_page_interleave(gfp, order, nid);
2024 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2025 goto retry_cpuset;
2026
2027 return page;
2028 }
2029 page = __alloc_pages_nodemask(gfp, order,
2030 policy_zonelist(gfp, pol, node),
2031 policy_nodemask(gfp, pol));
2032 mpol_cond_put(pol);
2033 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2034 goto retry_cpuset;
2035 return page;
2036 }
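/*
 * Illustrative sketch (not part of this file): most callers reach this
 * through the alloc_page_vma() wrapper in gfp.h, e.g. an anonymous
 * fault allocating one movable highmem page:
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 *
 * which is expected to expand to
 *
 *	alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, address,
 *			numa_node_id());
 */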
2037
2038 /**
2039 * alloc_pages_current - Allocate pages.
2040 *
2041 * @gfp:
2042 * %GFP_USER user allocation,
2043 * %GFP_KERNEL kernel allocation,
2044 * %GFP_HIGHMEM highmem allocation,
2045 * %GFP_FS don't call back into a file system.
2046 * %GFP_ATOMIC don't sleep.
2047 * @order: Power of two of allocation size in pages. 0 is a single page.
2048 *
2049 * Allocate a page from the kernel page pool. When not in
2050 * interrupt context, apply the current process' NUMA policy.
2051 * Returns NULL when no page can be allocated.
2052 *
2053 * Don't call cpuset_update_task_memory_state() unless
2054 * 1) it's ok to take cpuset_sem (can WAIT), and
2055 * 2) allocating for current task (not interrupt).
2056 */
2057 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2058 {
2059 struct mempolicy *pol = &default_policy;
2060 struct page *page;
2061 unsigned int cpuset_mems_cookie;
2062
2063 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2064 pol = get_task_policy(current);
2065
2066 retry_cpuset:
2067 cpuset_mems_cookie = read_mems_allowed_begin();
2068
2069 /*
2070 * No reference counting needed for current->mempolicy
2071 * nor system default_policy
2072 */
2073 if (pol->mode == MPOL_INTERLEAVE)
2074 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2075 else
2076 page = __alloc_pages_nodemask(gfp, order,
2077 policy_zonelist(gfp, pol, numa_node_id()),
2078 policy_nodemask(gfp, pol));
2079
2080 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2081 goto retry_cpuset;
2082
2083 return page;
2084 }
2085 EXPORT_SYMBOL(alloc_pages_current);
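/*
 * Note: on CONFIG_NUMA kernels the generic alloc_pages() helper in
 * gfp.h resolves to alloc_pages_current(), so an ordinary
 *
 *	page = alloc_pages(GFP_KERNEL, 0);
 *
 * already honours the calling process' mempolicy; no mempolicy-aware
 * call is needed at most allocation sites.
 */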
2086
2087 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2088 {
2089 struct mempolicy *pol = mpol_dup(vma_policy(src));
2090
2091 if (IS_ERR(pol))
2092 return PTR_ERR(pol);
2093 dst->vm_policy = pol;
2094 return 0;
2095 }
2096
2097 /*
2098 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2099 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
2100 * with the mems_allowed returned by cpuset_mems_allowed(). This
2101 * keeps mempolicies cpuset-relative after their cpuset moves. See
2102 * further kernel/cpuset.c update_nodemask().
2103 *
2104 * current's mempolicy may be rebound by another task (the task that
2105 * changes the cpuset's mems), so we needn't rebind it for the current task.
2106 */
2107
2108 /* Slow path of a mempolicy duplicate */
2109 struct mempolicy *__mpol_dup(struct mempolicy *old)
2110 {
2111 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2112
2113 if (!new)
2114 return ERR_PTR(-ENOMEM);
2115
2116 /* task's mempolicy is protected by alloc_lock */
2117 if (old == current->mempolicy) {
2118 task_lock(current);
2119 *new = *old;
2120 task_unlock(current);
2121 } else
2122 *new = *old;
2123
2124 if (current_cpuset_is_being_rebound()) {
2125 nodemask_t mems = cpuset_mems_allowed(current);
2126 if (new->flags & MPOL_F_REBINDING)
2127 mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2128 else
2129 mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2130 }
2131 atomic_set(&new->refcnt, 1);
2132 return new;
2133 }
2134
2135 /* Slow path of a mempolicy comparison */
2136 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2137 {
2138 if (!a || !b)
2139 return false;
2140 if (a->mode != b->mode)
2141 return false;
2142 if (a->flags != b->flags)
2143 return false;
2144 if (mpol_store_user_nodemask(a))
2145 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2146 return false;
2147
2148 switch (a->mode) {
2149 case MPOL_BIND:
2150 /* Fall through */
2151 case MPOL_INTERLEAVE:
2152 return !!nodes_equal(a->v.nodes, b->v.nodes);
2153 case MPOL_PREFERRED:
2154 return a->v.preferred_node == b->v.preferred_node;
2155 default:
2156 BUG();
2157 return false;
2158 }
2159 }
2160
2161 /*
2162 * Shared memory backing store policy support.
2163 *
2164 * Remember policies even when nobody has shared memory mapped.
2165 * The policies are kept in Red-Black tree linked from the inode.
2166 * They are protected by the sp->lock spinlock, which should be held
2167 * for any accesses to the tree.
2168 */
2169
2170 /* lookup first element intersecting start-end */
2171 /* Caller holds sp->lock */
2172 static struct sp_node *
2173 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2174 {
2175 struct rb_node *n = sp->root.rb_node;
2176
2177 while (n) {
2178 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2179
2180 if (start >= p->end)
2181 n = n->rb_right;
2182 else if (end <= p->start)
2183 n = n->rb_left;
2184 else
2185 break;
2186 }
2187 if (!n)
2188 return NULL;
2189 for (;;) {
2190 struct sp_node *w = NULL;
2191 struct rb_node *prev = rb_prev(n);
2192 if (!prev)
2193 break;
2194 w = rb_entry(prev, struct sp_node, nd);
2195 if (w->end <= start)
2196 break;
2197 n = prev;
2198 }
2199 return rb_entry(n, struct sp_node, nd);
2200 }
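/*
 * Lookup semantics by example (illustrative): with stored ranges [2,5)
 * and [7,9), sp_lookup(sp, 4, 8) intersects both; the rb_prev() walk
 * above guarantees the lowest intersecting node, [2,5), is returned
 * rather than whichever node the initial descent happened to stop on.
 */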
2201
2202 /* Insert a new shared policy into the list. */
2203 /* Caller holds sp->lock */
2204 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2205 {
2206 struct rb_node **p = &sp->root.rb_node;
2207 struct rb_node *parent = NULL;
2208 struct sp_node *nd;
2209
2210 while (*p) {
2211 parent = *p;
2212 nd = rb_entry(parent, struct sp_node, nd);
2213 if (new->start < nd->start)
2214 p = &(*p)->rb_left;
2215 else if (new->end > nd->end)
2216 p = &(*p)->rb_right;
2217 else
2218 BUG();
2219 }
2220 rb_link_node(&new->nd, parent, p);
2221 rb_insert_color(&new->nd, &sp->root);
2222 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2223 new->policy ? new->policy->mode : 0);
2224 }
2225
2226 /* Find shared policy intersecting idx */
2227 struct mempolicy *
2228 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2229 {
2230 struct mempolicy *pol = NULL;
2231 struct sp_node *sn;
2232
2233 if (!sp->root.rb_node)
2234 return NULL;
2235 spin_lock(&sp->lock);
2236 sn = sp_lookup(sp, idx, idx+1);
2237 if (sn) {
2238 mpol_get(sn->policy);
2239 pol = sn->policy;
2240 }
2241 spin_unlock(&sp->lock);
2242 return pol;
2243 }
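/*
 * Illustrative sketch (not part of this file): shmem's ->get_policy
 * vm operation is an expected caller, translating the faulting address
 * into a file index first:
 *
 *	idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 *	pol = mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, idx);
 */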
2244
2245 static void sp_free(struct sp_node *n)
2246 {
2247 mpol_put(n->policy);
2248 kmem_cache_free(sn_cache, n);
2249 }
2250
2251 /**
2252 * mpol_misplaced - check whether current page node is valid in policy
2253 *
2254 * @page: page to be checked
2255 * @vma: vm area where page mapped
2256 * @addr: virtual address where page mapped
2257 *
2258 * Look up the current policy node id for vma,addr and compare it to
2259 * the page's node id.
2260 *
2261 * Returns:
2262 * -1 - not misplaced, page is in the right node
2263 * node - node id where the page should be
2264 *
2265 * Policy determination "mimics" alloc_page_vma().
2266 * Called from fault path where we know the vma and faulting address.
2267 */
2268 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2269 {
2270 struct mempolicy *pol;
2271 struct zone *zone;
2272 int curnid = page_to_nid(page);
2273 unsigned long pgoff;
2274 int thiscpu = raw_smp_processor_id();
2275 int thisnid = cpu_to_node(thiscpu);
2276 int polnid = -1;
2277 int ret = -1;
2278
2279 BUG_ON(!vma);
2280
2281 pol = get_vma_policy(vma, addr);
2282 if (!(pol->flags & MPOL_F_MOF))
2283 goto out;
2284
2285 switch (pol->mode) {
2286 case MPOL_INTERLEAVE:
2287 BUG_ON(addr >= vma->vm_end);
2288 BUG_ON(addr < vma->vm_start);
2289
2290 pgoff = vma->vm_pgoff;
2291 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2292 polnid = offset_il_node(pol, vma, pgoff);
2293 break;
2294
2295 case MPOL_PREFERRED:
2296 if (pol->flags & MPOL_F_LOCAL)
2297 polnid = numa_node_id();
2298 else
2299 polnid = pol->v.preferred_node;
2300 break;
2301
2302 case MPOL_BIND:
2303 /*
2304 * MPOL_BIND allows binding to multiple nodes. Use the current
2305 * page's node if it is in the policy nodemask, else select the
2306 * nearest allowed node, if any. If there are no allowed nodes,
2307 * use the current node [!misplaced].
2308 */
2309 if (node_isset(curnid, pol->v.nodes))
2310 goto out;
2311 (void)first_zones_zonelist(
2312 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2313 gfp_zone(GFP_HIGHUSER),
2314 &pol->v.nodes, &zone);
2315 polnid = zone->node;
2316 break;
2317
2318 default:
2319 BUG();
2320 }
2321
2322 /* Migrate the page towards the node whose CPU is referencing it */
2323 if (pol->flags & MPOL_F_MORON) {
2324 polnid = thisnid;
2325
2326 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2327 goto out;
2328 }
2329
2330 if (curnid != polnid)
2331 ret = polnid;
2332 out:
2333 mpol_cond_put(pol);
2334
2335 return ret;
2336 }
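/*
 * Illustrative sketch (not part of this file): the NUMA hinting fault
 * path in mm/memory.c is the expected caller:
 *
 *	target_nid = mpol_misplaced(page, vma, addr);
 *	if (target_nid != -1)
 *		...migrate_misplaced_page(page, vma, target_nid)...
 */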
2337
2338 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2339 {
2340 pr_debug("deleting %lx-%lx\n", n->start, n->end);
2341 rb_erase(&n->nd, &sp->root);
2342 sp_free(n);
2343 }
2344
2345 static void sp_node_init(struct sp_node *node, unsigned long start,
2346 unsigned long end, struct mempolicy *pol)
2347 {
2348 node->start = start;
2349 node->end = end;
2350 node->policy = pol;
2351 }
2352
2353 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2354 struct mempolicy *pol)
2355 {
2356 struct sp_node *n;
2357 struct mempolicy *newpol;
2358
2359 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2360 if (!n)
2361 return NULL;
2362
2363 newpol = mpol_dup(pol);
2364 if (IS_ERR(newpol)) {
2365 kmem_cache_free(sn_cache, n);
2366 return NULL;
2367 }
2368 newpol->flags |= MPOL_F_SHARED;
2369 sp_node_init(n, start, end, newpol);
2370
2371 return n;
2372 }
2373
2374 /* Replace a policy range. */
2375 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2376 unsigned long end, struct sp_node *new)
2377 {
2378 struct sp_node *n;
2379 struct sp_node *n_new = NULL;
2380 struct mempolicy *mpol_new = NULL;
2381 int ret = 0;
2382
2383 restart:
2384 spin_lock(&sp->lock);
2385 n = sp_lookup(sp, start, end);
2386 /* Take care of old policies in the same range. */
2387 while (n && n->start < end) {
2388 struct rb_node *next = rb_next(&n->nd);
2389 if (n->start >= start) {
2390 if (n->end <= end)
2391 sp_delete(sp, n);
2392 else
2393 n->start = end;
2394 } else {
2395 /* Old policy spanning whole new range. */
2396 if (n->end > end) {
2397 if (!n_new)
2398 goto alloc_new;
2399
2400 *mpol_new = *n->policy;
2401 atomic_set(&mpol_new->refcnt, 1);
2402 sp_node_init(n_new, end, n->end, mpol_new);
2403 n->end = start;
2404 sp_insert(sp, n_new);
2405 n_new = NULL;
2406 mpol_new = NULL;
2407 break;
2408 } else
2409 n->end = start;
2410 }
2411 if (!next)
2412 break;
2413 n = rb_entry(next, struct sp_node, nd);
2414 }
2415 if (new)
2416 sp_insert(sp, new);
2417 spin_unlock(&sp->lock);
2418 ret = 0;
2419
2420 err_out:
2421 if (mpol_new)
2422 mpol_put(mpol_new);
2423 if (n_new)
2424 kmem_cache_free(sn_cache, n_new);
2425
2426 return ret;
2427
2428 alloc_new:
2429 spin_unlock(&sp->lock);
2430 ret = -ENOMEM;
2431 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2432 if (!n_new)
2433 goto err_out;
2434 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2435 if (!mpol_new)
2436 goto err_out;
2437 goto restart;
2438 }
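/*
 * Note on the restart dance above: splitting an old range in two needs
 * a fresh sp_node and mempolicy, but neither can be allocated while
 * sp->lock is held. So on first need we drop the lock, preallocate at
 * alloc_new, and restart the lookup from scratch, because the tree may
 * have changed while the lock was released.
 */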
2439
2440 /**
2441 * mpol_shared_policy_init - initialize shared policy for inode
2442 * @sp: pointer to inode shared policy
2443 * @mpol: struct mempolicy to install
2444 *
2445 * Install non-NULL @mpol in inode's shared policy rb-tree.
2446 * On entry, the current task has a reference on a non-NULL @mpol.
2447 * This must be released on exit.
2448 * This is called at get_inode() time, so we can use GFP_KERNEL.
2449 */
2450 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2451 {
2452 int ret;
2453
2454 sp->root = RB_ROOT; /* empty tree == default mempolicy */
2455 spin_lock_init(&sp->lock);
2456
2457 if (mpol) {
2458 struct vm_area_struct pvma;
2459 struct mempolicy *new;
2460 NODEMASK_SCRATCH(scratch);
2461
2462 if (!scratch)
2463 goto put_mpol;
2464 /* contextualize the tmpfs mount point mempolicy */
2465 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2466 if (IS_ERR(new))
2467 goto free_scratch; /* no valid nodemask intersection */
2468
2469 task_lock(current);
2470 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2471 task_unlock(current);
2472 if (ret)
2473 goto put_new;
2474
2475 /* Create pseudo-vma that contains just the policy */
2476 memset(&pvma, 0, sizeof(struct vm_area_struct));
2477 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2478 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2479
2480 put_new:
2481 mpol_put(new); /* drop initial ref */
2482 free_scratch:
2483 NODEMASK_SCRATCH_FREE(scratch);
2484 put_mpol:
2485 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2486 }
2487 }
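/*
 * Illustrative sketch (not part of this file): tmpfs inode creation is
 * the expected caller, installing the mount's mpol= policy on each new
 * inode:
 *
 *	mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo));
 *
 * where shmem_get_sbmpol() returns a referenced copy of the superblock
 * mempolicy (or NULL), matching the reference consumed here.
 */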
2488
2489 int mpol_set_shared_policy(struct shared_policy *info,
2490 struct vm_area_struct *vma, struct mempolicy *npol)
2491 {
2492 int err;
2493 struct sp_node *new = NULL;
2494 unsigned long sz = vma_pages(vma);
2495
2496 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2497 vma->vm_pgoff,
2498 sz, npol ? npol->mode : -1,
2499 npol ? npol->flags : -1,
2500 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2501
2502 if (npol) {
2503 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2504 if (!new)
2505 return -ENOMEM;
2506 }
2507 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2508 if (err && new)
2509 sp_free(new);
2510 return err;
2511 }
2512
2513 /* Free a backing policy store on inode delete. */
2514 void mpol_free_shared_policy(struct shared_policy *p)
2515 {
2516 struct sp_node *n;
2517 struct rb_node *next;
2518
2519 if (!p->root.rb_node)
2520 return;
2521 spin_lock(&p->lock);
2522 next = rb_first(&p->root);
2523 while (next) {
2524 n = rb_entry(next, struct sp_node, nd);
2525 next = rb_next(&n->nd);
2526 sp_delete(p, n);
2527 }
2528 spin_unlock(&p->lock);
2529 }
2530
2531 #ifdef CONFIG_NUMA_BALANCING
2532 static int __initdata numabalancing_override;
2533
2534 static void __init check_numabalancing_enable(void)
2535 {
2536 bool numabalancing_default = false;
2537
2538 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2539 numabalancing_default = true;
2540
2541 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2542 if (numabalancing_override)
2543 set_numabalancing_state(numabalancing_override == 1);
2544
2545 if (nr_node_ids > 1 && !numabalancing_override) {
2546 pr_info("%s automatic NUMA balancing. "
2547 "Configure with numa_balancing= or the "
2548 "kernel.numa_balancing sysctl",
2549 numabalancing_default ? "Enabling" : "Disabling");
2550 set_numabalancing_state(numabalancing_default);
2551 }
2552 }
2553
2554 static int __init setup_numabalancing(char *str)
2555 {
2556 int ret = 0;
2557 if (!str)
2558 goto out;
2559
2560 if (!strcmp(str, "enable")) {
2561 numabalancing_override = 1;
2562 ret = 1;
2563 } else if (!strcmp(str, "disable")) {
2564 numabalancing_override = -1;
2565 ret = 1;
2566 }
2567 out:
2568 if (!ret)
2569 pr_warn("Unable to parse numa_balancing=\n");
2570
2571 return ret;
2572 }
2573 __setup("numa_balancing=", setup_numabalancing);
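/*
 * Illustrative usage: automatic NUMA balancing can be forced on or off
 * from the kernel command line, overriding the Kconfig default:
 *
 *	numa_balancing=enable
 *	numa_balancing=disable
 */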
2574 #else
2575 static inline void __init check_numabalancing_enable(void)
2576 {
2577 }
2578 #endif /* CONFIG_NUMA_BALANCING */
2579
2580 /* assumes fs == KERNEL_DS */
2581 void __init numa_policy_init(void)
2582 {
2583 nodemask_t interleave_nodes;
2584 unsigned long largest = 0;
2585 int nid, prefer = 0;
2586
2587 policy_cache = kmem_cache_create("numa_policy",
2588 sizeof(struct mempolicy),
2589 0, SLAB_PANIC, NULL);
2590
2591 sn_cache = kmem_cache_create("shared_policy_node",
2592 sizeof(struct sp_node),
2593 0, SLAB_PANIC, NULL);
2594
2595 for_each_node(nid) {
2596 preferred_node_policy[nid] = (struct mempolicy) {
2597 .refcnt = ATOMIC_INIT(1),
2598 .mode = MPOL_PREFERRED,
2599 .flags = MPOL_F_MOF | MPOL_F_MORON,
2600 .v = { .preferred_node = nid, },
2601 };
2602 }
2603
2604 /*
2605 * Set interleaving policy for system init. Interleaving is only
2606 * enabled across suitably sized nodes (default is >= 16MB), falling
2607 * back to the largest node if they're all smaller.
2608 */
2609 nodes_clear(interleave_nodes);
2610 for_each_node_state(nid, N_MEMORY) {
2611 unsigned long total_pages = node_present_pages(nid);
2612
2613 /* Preserve the largest node */
2614 if (largest < total_pages) {
2615 largest = total_pages;
2616 prefer = nid;
2617 }
2618
2619 /* Interleave this node? */
2620 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2621 node_set(nid, interleave_nodes);
2622 }
2623
2624 /* All too small, use the largest */
2625 if (unlikely(nodes_empty(interleave_nodes)))
2626 node_set(prefer, interleave_nodes);
2627
2628 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2629 pr_err("%s: interleaving failed\n", __func__);
2630
2631 check_numabalancing_enable();
2632 }
2633
2634 /* Reset policy of current process to default */
2635 void numa_default_policy(void)
2636 {
2637 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2638 }
2639
2640 /*
2641 * Parse and format mempolicy from/to strings
2642 */
2643
2644 /*
2645 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2646 */
2647 static const char * const policy_modes[] =
2648 {
2649 [MPOL_DEFAULT] = "default",
2650 [MPOL_PREFERRED] = "prefer",
2651 [MPOL_BIND] = "bind",
2652 [MPOL_INTERLEAVE] = "interleave",
2653 [MPOL_LOCAL] = "local",
2654 };
2655
2656
2657 #ifdef CONFIG_TMPFS
2658 /**
2659 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2660 * @str: string containing mempolicy to parse
2661 * @mpol: pointer to struct mempolicy pointer, returned on success.
2662 *
2663 * Format of input:
2664 * <mode>[=<flags>][:<nodelist>]
2665 *
2666 * On success, returns 0, else 1
2667 */
2668 int mpol_parse_str(char *str, struct mempolicy **mpol)
2669 {
2670 struct mempolicy *new = NULL;
2671 unsigned short mode;
2672 unsigned short mode_flags;
2673 nodemask_t nodes;
2674 char *nodelist = strchr(str, ':');
2675 char *flags = strchr(str, '=');
2676 int err = 1;
2677
2678 if (nodelist) {
2679 /* NUL-terminate mode or flags string */
2680 *nodelist++ = '\0';
2681 if (nodelist_parse(nodelist, nodes))
2682 goto out;
2683 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2684 goto out;
2685 } else
2686 nodes_clear(nodes);
2687
2688 if (flags)
2689 *flags++ = '\0'; /* terminate mode string */
2690
2691 for (mode = 0; mode < MPOL_MAX; mode++) {
2692 if (!strcmp(str, policy_modes[mode])) {
2693 break;
2694 }
2695 }
2696 if (mode >= MPOL_MAX)
2697 goto out;
2698
2699 switch (mode) {
2700 case MPOL_PREFERRED:
2701 /*
2702 * Insist on a nodelist of one node only
2703 */
2704 if (nodelist) {
2705 char *rest = nodelist;
2706 while (isdigit(*rest))
2707 rest++;
2708 if (*rest)
2709 goto out;
2710 }
2711 break;
2712 case MPOL_INTERLEAVE:
2713 /*
2714 * Default to online nodes with memory if no nodelist
2715 */
2716 if (!nodelist)
2717 nodes = node_states[N_MEMORY];
2718 break;
2719 case MPOL_LOCAL:
2720 /*
2721 * Don't allow a nodelist; mpol_new() checks flags
2722 */
2723 if (nodelist)
2724 goto out;
2725 mode = MPOL_PREFERRED;
2726 break;
2727 case MPOL_DEFAULT:
2728 /*
2729 * Insist on an empty nodelist
2730 */
2731 if (!nodelist)
2732 err = 0;
2733 goto out;
2734 case MPOL_BIND:
2735 /*
2736 * Insist on a nodelist
2737 */
2738 if (!nodelist)
2739 goto out;
2740 }
2741
2742 mode_flags = 0;
2743 if (flags) {
2744 /*
2745 * Currently, we only support two mutually exclusive
2746 * mode flags.
2747 */
2748 if (!strcmp(flags, "static"))
2749 mode_flags |= MPOL_F_STATIC_NODES;
2750 else if (!strcmp(flags, "relative"))
2751 mode_flags |= MPOL_F_RELATIVE_NODES;
2752 else
2753 goto out;
2754 }
2755
2756 new = mpol_new(mode, mode_flags, &nodes);
2757 if (IS_ERR(new))
2758 goto out;
2759
2760 /*
2761 * Save nodes for mpol_to_str() to show the tmpfs mount options
2762 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2763 */
2764 if (mode != MPOL_PREFERRED)
2765 new->v.nodes = nodes;
2766 else if (nodelist)
2767 new->v.preferred_node = first_node(nodes);
2768 else
2769 new->flags |= MPOL_F_LOCAL;
2770
2771 /*
2772 * Save nodes for contextualization: this will be used to "clone"
2773 * the mempolicy in a specific context [cpuset] at a later time.
2774 */
2775 new->w.user_nodemask = nodes;
2776
2777 err = 0;
2778
2779 out:
2780 /* Restore string for error message */
2781 if (nodelist)
2782 *--nodelist = ':';
2783 if (flags)
2784 *--flags = '=';
2785 if (!err)
2786 *mpol = new;
2787 return err;
2788 }
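/*
 * Examples of accepted strings, as seen in tmpfs mount options
 * (derived from the grammar above):
 *
 *	mpol=interleave:0-3	interleave over nodes 0-3
 *	mpol=bind=static:0,2	bind to nodes 0 and 2, static flag
 *	mpol=prefer:1		prefer node 1
 *	mpol=local		allocate close to the faulting CPU
 */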
2789 #endif /* CONFIG_TMPFS */
2790
2791 /**
2792 * mpol_to_str - format a mempolicy structure for printing
2793 * @buffer: to contain formatted mempolicy string
2794 * @maxlen: length of @buffer
2795 * @pol: pointer to mempolicy to be formatted
2796 *
2797 * Convert @pol into a string. If @buffer is too short, truncate the string.
2798 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2799 * longest flag, "relative", and to display at least a few node ids.
2800 */
2801 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2802 {
2803 char *p = buffer;
2804 nodemask_t nodes = NODE_MASK_NONE;
2805 unsigned short mode = MPOL_DEFAULT;
2806 unsigned short flags = 0;
2807
2808 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2809 mode = pol->mode;
2810 flags = pol->flags;
2811 }
2812
2813 switch (mode) {
2814 case MPOL_DEFAULT:
2815 break;
2816 case MPOL_PREFERRED:
2817 if (flags & MPOL_F_LOCAL)
2818 mode = MPOL_LOCAL;
2819 else
2820 node_set(pol->v.preferred_node, nodes);
2821 break;
2822 case MPOL_BIND:
2823 case MPOL_INTERLEAVE:
2824 nodes = pol->v.nodes;
2825 break;
2826 default:
2827 WARN_ON_ONCE(1);
2828 snprintf(p, maxlen, "unknown");
2829 return;
2830 }
2831
2832 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2833
2834 if (flags & MPOL_MODE_FLAGS) {
2835 p += snprintf(p, buffer + maxlen - p, "=");
2836
2837 /*
2838 * Currently, the only defined flags are mutually exclusive
2839 */
2840 if (flags & MPOL_F_STATIC_NODES)
2841 p += snprintf(p, buffer + maxlen - p, "static");
2842 else if (flags & MPOL_F_RELATIVE_NODES)
2843 p += snprintf(p, buffer + maxlen - p, "relative");
2844 }
2845
2846 if (!nodes_empty(nodes)) {
2847 p += snprintf(p, buffer + maxlen - p, ":");
2848 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2849 }
2850 }
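/*
 * Example outputs (derived from the cases above): "default", "local",
 * "prefer:1", "bind:0,2", "interleave=relative:0-3". These strings
 * appear for tmpfs mounts in /proc/mounts and in
 * /proc/<pid>/numa_maps.
 */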