[PATCH] mm: optimize numa policy handling in slab allocator

[deliverable/linux.git] / mm / mempolicy.c
diff --git a/mm/mempolicy.c b/mm/mempolicy.c

index 515bfeee027e1cdecad2618a5c61a17ef1962d5f..73790188b0eb27a91edd4d0d8efba6c90d4b28be 100644 (file)
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -180,12 +180,13 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
                 break;
         }
         policy->policy = mode;
+       policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
         return policy;
  }
  
  static void gather_stats(struct page *, void *);
-static void migrate_page_add(struct vm_area_struct *vma,
-       struct page *page, struct list_head *pagelist, unsigned long flags);
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+                               unsigned long flags);
  
  /* Scan through pages checking if pages follow certain conditions. */
  static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -207,17 +208,27 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                 page = vm_normal_page(vma, addr, *pte);
                 if (!page)
                         continue;
+               /*
+                * The check for PageReserved here is important to avoid
+                * handling zero pages and other pages that may have been
+                * marked special by the system.
+                *
+                * If PageReserved were not checked here then, for example,
+                * the location of the zero page could influence
+                * MPOL_MF_STRICT, zero pages would be counted in
+                * the per-node stats, and there would be useless attempts
+                * to put zero pages on the migration list.
+                */
+               if (PageReserved(page))
+                       continue;
                 nid = page_to_nid(page);
                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
                         continue;
  
                 if (flags & MPOL_MF_STATS)
                         gather_stats(page, private);
-               else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-                       spin_unlock(ptl);
-                       migrate_page_add(vma, page, private, flags);
-                       spin_lock(ptl);
-               }
+               else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+                       migrate_page_add(page, private, flags);
                 else
                         break;
         } while (pte++, addr += PAGE_SIZE, addr != end);
@@ -289,7 +300,7 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
  static inline int vma_migratable(struct vm_area_struct *vma)
  {
         if (vma->vm_flags & (
-               VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
+               VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
                 return 0;
         return 1;
  }
@@ -306,6 +317,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
         int err;
         struct vm_area_struct *first, *vma, *prev;
  
+       /* Drain pending per-CPU LRU additions so pages can be isolated */
+       if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+               lru_add_drain_all();
+
         first = find_vma(mm, start);
         if (!first)
                 return ERR_PTR(-EFAULT);
@@ -516,51 +531,15 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
   * page migration
   */
  
-/* Check if we are the only process mapping the page in question */
-static inline int single_mm_mapping(struct mm_struct *mm,
-                       struct address_space *mapping)
-{
-       struct vm_area_struct *vma;
-       struct prio_tree_iter iter;
-       int rc = 1;
-
-       spin_lock(&mapping->i_mmap_lock);
-       vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
-               if (mm != vma->vm_mm) {
-                       rc = 0;
-                       goto out;
-               }
-       list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
-               if (mm != vma->vm_mm) {
-                       rc = 0;
-                       goto out;
-               }
-out:
-       spin_unlock(&mapping->i_mmap_lock);
-       return rc;
-}
-
-/*
- * Add a page to be migrated to the pagelist
- */
-static void migrate_page_add(struct vm_area_struct *vma,
-       struct page *page, struct list_head *pagelist, unsigned long flags)
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+                               unsigned long flags)
  {
         /*
-        * Avoid migrating a page that is shared by others and not writable.
+        * Avoid migrating a page that is shared with others.
          */
-       if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
-           mapping_writably_mapped(page->mapping) ||
-           single_mm_mapping(vma->vm_mm, page->mapping)) {
-               int rc = isolate_lru_page(page);
-
-               if (rc == 1)
+       if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
+               if (isolate_lru_page(page))
                         list_add(&page->lru, pagelist);
-               /*
-                * If the isolate attempt was not successful then we just
-                * encountered an unswappable page. Something must be wrong.
-                */
-               WARN_ON(rc == 0);
         }
  }
  
@@ -772,9 +751,6 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
         return do_set_mempolicy(mode, &nodes);
  }
  
-/* Macro needed until Paul implements this function in kernel/cpusets.c */
-#define cpuset_mems_allowed(task) node_online_map
-
  asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
                 const unsigned long __user *old_nodes,
                 const unsigned long __user *new_nodes)
@@ -1000,6 +976,33 @@ static unsigned interleave_nodes(struct mempolicy *policy)
         return nid;
  }
  
+/*
+ * Choose the node for the next slab allocation according to @policy.
+ * Returns a node id; policies not listed below use numa_node_id().
+ */
+unsigned slab_node(struct mempolicy *policy)
+{
+       switch (policy->policy) {
+       case MPOL_INTERLEAVE:
+               return interleave_nodes(policy);
+
+       case MPOL_BIND:
+               /*
+                * Follow bind policy behavior and start allocation at the
+                * node of the first zone in the policy's zonelist.
+                */
+               return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+
+       case MPOL_PREFERRED:
+               if (policy->v.preferred_node >= 0)
+                       return policy->v.preferred_node;
+               /* Negative preferred_node: fall through to numa_node_id() */
+
+       default:
+               return numa_node_id();
+       }
+}
+
  /* Do static interleaving for a VMA with known offset. */
  static unsigned offset_il_node(struct mempolicy *pol,
                 struct vm_area_struct *vma, unsigned long off)
@@ -1133,6 +1136,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
  }
  EXPORT_SYMBOL(alloc_pages_current);
  
+/*
+ * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
+ * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
+ * with the mems_allowed returned by cpuset_mems_allowed().  This
+ * keeps mempolicies cpuset-relative after their cpuset moves.  See
+ * also update_nodemask() in kernel/cpuset.c.
+ */
+void *cpuset_being_rebound;
+
  /* Slow path of a mempolicy copy */
  struct mempolicy *__mpol_copy(struct mempolicy *old)
  {
@@ -1140,6 +1152,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
  
         if (!new)
                 return ERR_PTR(-ENOMEM);
+       if (current_cpuset_is_being_rebound()) {
+               nodemask_t mems = cpuset_mems_allowed(current);
+               mpol_rebind_policy(old, &mems);
+       }
         *new = *old;
         atomic_set(&new->refcnt, 1);
         if (new->policy == MPOL_BIND) {
@@ -1346,6 +1362,30 @@ restart:
         return 0;
  }
  
+void mpol_shared_policy_init(struct shared_policy *info, int policy,
+                               nodemask_t *policy_nodes)
+{
+       info->root = RB_ROOT;
+       spin_lock_init(&info->lock);
+
+       if (policy != MPOL_DEFAULT) {
+               struct mempolicy *newpol;
+
+               /* If mpol_new() fails, fall back to MPOL_DEFAULT */
+               newpol = mpol_new(policy, policy_nodes);
+               if (!IS_ERR(newpol)) {
+                       /* Zeroed on-stack pseudo-vma that only carries the range */
+                       struct vm_area_struct pvma;
+
+                       memset(&pvma, 0, sizeof(struct vm_area_struct));
+                       /* [0, TASK_SIZE): policy covers entire file */
+                       pvma.vm_end = TASK_SIZE;
+                       mpol_set_shared_policy(info, &pvma, newpol);
+                       mpol_free(newpol);
+               }
+       }
+}
+
  int mpol_set_shared_policy(struct shared_policy *info,
                         struct vm_area_struct *vma, struct mempolicy *npol)
  {
@@ -1414,25 +1454,31 @@ void numa_default_policy(void)
  }
  
  /* Migrate a policy to a different set of nodes */
-static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
-                                                       const nodemask_t *new)
+void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
  {
+       nodemask_t *mpolmask;
         nodemask_t tmp;
  
         if (!pol)
                 return;
+       mpolmask = &pol->cpuset_mems_allowed;
+       if (nodes_equal(*mpolmask, *newmask))
+               return;
  
         switch (pol->policy) {
         case MPOL_DEFAULT:
                 break;
         case MPOL_INTERLEAVE:
-               nodes_remap(tmp, pol->v.nodes, *old, *new);
+               nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
                 pol->v.nodes = tmp;
-               current->il_next = node_remap(current->il_next, *old, *new);
+               *mpolmask = *newmask;
+               current->il_next = node_remap(current->il_next,
+                                               *mpolmask, *newmask);
                 break;
         case MPOL_PREFERRED:
                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
-                                                               *old, *new);
+                                               *mpolmask, *newmask);
+               *mpolmask = *newmask;
                 break;
         case MPOL_BIND: {
                 nodemask_t nodes;
@@ -1442,7 +1488,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
                 nodes_clear(nodes);
                 for (z = pol->v.zonelist->zones; *z; z++)
                         node_set((*z)->zone_pgdat->node_id, nodes);
-               nodes_remap(tmp, nodes, *old, *new);
+               nodes_remap(tmp, nodes, *mpolmask, *newmask);
                 nodes = tmp;
  
                 zonelist = bind_zonelist(&nodes);
@@ -1457,6 +1503,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
                         kfree(pol->v.zonelist);
                         pol->v.zonelist = zonelist;
                 }
+               *mpolmask = *newmask;
                 break;
         }
         default:
@@ -1466,14 +1513,29 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
  }
  
  /*
- * Someone moved this task to different nodes.  Fixup mempolicies.
+ * Wrapper for mpol_rebind_policy() that just requires task
+ * pointer, and updates task mempolicy.
+ */
+
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
+{
+       mpol_rebind_policy(tsk->mempolicy, new);
+}
+
+/*
+ * Rebind each vma in mm to new nodemask.
   *
- * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
- * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
+ * Call holding a reference to mm.  Takes mm->mmap_sem during call.
   */
-void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
+
+void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
  {
-       rebind_policy(current->mempolicy, old, new);
+       struct vm_area_struct *vma;
+
+       down_write(&mm->mmap_sem);
+       for (vma = mm->mmap; vma; vma = vma->vm_next)
+               mpol_rebind_policy(vma->vm_policy, new);
+       up_write(&mm->mmap_sem);
  }
  
  /*