[PATCH] mm: split page table lock

[deliverable/linux.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index e1d3d77f4aeef89259d9e3b09e500042a42d7a87..a2995a5d012c7e924924e044b3d22f750a25c555 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -114,7 +114,8 @@ static void bad_page(const char *function, struct page *page)
                         1 << PG_reclaim |
                         1 << PG_slab    |
                         1 << PG_swapcache |
-                       1 << PG_writeback);
+                       1 << PG_writeback |
+                       1 << PG_reserved );
         set_page_count(page, 0);
         reset_page_mapcount(page);
         page->mapping = NULL;
@@ -153,7 +154,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
                 struct page *p = page + i;
  
                 SetPageCompound(p);
-               p->private = (unsigned long)page;
+               set_page_private(p, (unsigned long)page);
         }
  }
  
@@ -173,7 +174,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
  
                 if (!PageCompound(p))
                         bad_page(__FUNCTION__, page);
-               if (p->private != (unsigned long)page)
+               if (page_private(p) != (unsigned long)page)
                         bad_page(__FUNCTION__, page);
                 ClearPageCompound(p);
         }
@@ -186,18 +187,18 @@ static void destroy_compound_page(struct page *page, unsigned long order)
   * So, we don't need atomic page->flags operations here.
   */
  static inline unsigned long page_order(struct page *page) {
-       return page->private;
+       return page_private(page);
  }
  
  static inline void set_page_order(struct page *page, int order) {
-       page->private = order;
+       set_page_private(page, order);
         __SetPagePrivate(page);
  }
  
  static inline void rmv_page_order(struct page *page)
  {
         __ClearPagePrivate(page);
-       page->private = 0;
+       set_page_private(page, 0);
  }
  
  /*
@@ -237,14 +238,13 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
   * (a) the buddy is free &&
   * (b) the buddy is on the buddy system &&
   * (c) a page and its buddy have the same order.
- * for recording page's order, we use page->private and PG_private.
+ * for recording page's order, we use page_private(page) and PG_private.
   *
   */
  static inline int page_is_buddy(struct page *page, int order)
  {
         if (PagePrivate(page)           &&
             (page_order(page) == order) &&
-           !PageReserved(page)         &&
              page_count(page) == 0)
                 return 1;
         return 0;
@@ -264,7 +264,7 @@ static inline int page_is_buddy(struct page *page, int order)
   * parts of the VM system.
   * At each level, we keep a list of pages, which are heads of continuous
   * free pages of length of (1 << order) and marked with PG_Private.Page's
- * order is recorded in page->private field.
+ * order is recorded in page_private(page) field.
   * So when we are allocating or freeing one, we can derive the state of the
   * other.  That is, if we allocate a small block, and both were   
   * free, the remainder of the region must be split into blocks.   
@@ -327,7 +327,8 @@ static inline void free_pages_check(const char *function, struct page *page)
                         1 << PG_reclaim |
                         1 << PG_slab    |
                         1 << PG_swapcache |
-                       1 << PG_writeback )))
+                       1 << PG_writeback |
+                       1 << PG_reserved )))
                 bad_page(function, page);
         if (PageDirty(page))
                 __ClearPageDirty(page);
@@ -455,13 +456,14 @@ static void prep_new_page(struct page *page, int order)
                         1 << PG_reclaim |
                         1 << PG_slab    |
                         1 << PG_swapcache |
-                       1 << PG_writeback )))
+                       1 << PG_writeback |
+                       1 << PG_reserved )))
                 bad_page(__FUNCTION__, page);
  
         page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
                         1 << PG_referenced | 1 << PG_arch_1 |
                         1 << PG_checked | 1 << PG_mappedtodisk);
-       page->private = 0;
+       set_page_private(page, 0);
         set_page_refs(page, order);
         kernel_map_pages(page, 1 << order, 1);
  }
@@ -734,7 +736,7 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
   * of the allocation.
   */
  int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-                     int classzone_idx, int can_try_harder, int gfp_high)
+                     int classzone_idx, int can_try_harder, gfp_t gfp_high)
  {
         /* free_pages my go negative - that's OK */
         long min = mark, free_pages = z->free_pages - (1 << order) + 1;
@@ -777,7 +779,7 @@ struct page * fastcall
  __alloc_pages(gfp_t gfp_mask, unsigned int order,
                 struct zonelist *zonelist)
  {
-       const int wait = gfp_mask & __GFP_WAIT;
+       const gfp_t wait = gfp_mask & __GFP_WAIT;
         struct zone **zones, *z;
         struct page *page;
         struct reclaim_state reclaim_state;
@@ -996,7 +998,7 @@ fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
          * get_zeroed_page() returns a 32-bit address, which cannot represent
          * a highmem page
          */
-       BUG_ON(gfp_mask & __GFP_HIGHMEM);
+       BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
  
         page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
         if (page)
@@ -1016,7 +1018,7 @@ void __pagevec_free(struct pagevec *pvec)
  
  fastcall void __free_pages(struct page *page, unsigned int order)
  {
-       if (!PageReserved(page) && put_page_testzero(page)) {
+       if (put_page_testzero(page)) {
                 if (order == 0)
                         free_hot_page(page);
                 else
@@ -1089,7 +1091,7 @@ static unsigned int nr_free_zone_pages(int offset)
   */
  unsigned int nr_free_buffer_pages(void)
  {
-       return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK);
+       return nr_free_zone_pages(gfp_zone(GFP_USER));
  }
  
  /*
@@ -1097,7 +1099,7 @@ unsigned int nr_free_buffer_pages(void)
   */
  unsigned int nr_free_pagecache_pages(void)
  {
-       return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK);
+       return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
  }
  
  #ifdef CONFIG_HIGHMEM
@@ -1428,6 +1430,16 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zoneli
         return j;
  }
  
+static inline int highest_zone(int zone_bits)
+{
+       int res = ZONE_NORMAL;
+       if (zone_bits & (__force int)__GFP_HIGHMEM)
+               res = ZONE_HIGHMEM;
+       if (zone_bits & (__force int)__GFP_DMA)
+               res = ZONE_DMA;
+       return res;
+}
+
  #ifdef CONFIG_NUMA
  #define MAX_NODE_LOAD (num_online_nodes())
  static int __initdata node_load[MAX_NUMNODES];
@@ -1524,11 +1536,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
                         zonelist = pgdat->node_zonelists + i;
                         for (j = 0; zonelist->zones[j] != NULL; j++);
  
-                       k = ZONE_NORMAL;
-                       if (i & __GFP_HIGHMEM)
-                               k = ZONE_HIGHMEM;
-                       if (i & __GFP_DMA)
-                               k = ZONE_DMA;
+                       k = highest_zone(i);
  
                         j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
                         zonelist->zones[j] = NULL;
@@ -1549,12 +1557,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
                 zonelist = pgdat->node_zonelists + i;
  
                 j = 0;
-               k = ZONE_NORMAL;
-               if (i & __GFP_HIGHMEM)
-                       k = ZONE_HIGHMEM;
-               if (i & __GFP_DMA)
-                       k = ZONE_DMA;
-
+               k = highest_zone(i);
                 j = build_zonelists_node(pgdat, zonelist, j, k);
                 /*
                  * Now we build the zonelist so that it contains the zones
@@ -1673,7 +1676,7 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                         continue;
                 page = pfn_to_page(pfn);
                 set_page_links(page, zone, nid, pfn);
-               set_page_count(page, 0);
+               set_page_count(page, 1);
                 reset_page_mapcount(page);
                 SetPageReserved(page);
                 INIT_LIST_HEAD(&page->lru);
@@ -1720,29 +1723,29 @@ static int __devinit zone_batchsize(struct zone *zone)
  
         /*
          * The per-cpu-pages pools are set to around 1000th of the
-        * size of the zone.  But no more than 1/4 of a meg - there's
-        * no point in going beyond the size of L2 cache.
+        * size of the zone.  But no more than 1/2 of a meg.
          *
          * OK, so we don't know how big the cache is.  So guess.
          */
         batch = zone->present_pages / 1024;
-       if (batch * PAGE_SIZE > 256 * 1024)
-               batch = (256 * 1024) / PAGE_SIZE;
+       if (batch * PAGE_SIZE > 512 * 1024)
+               batch = (512 * 1024) / PAGE_SIZE;
         batch /= 4;             /* We effectively *= 4 below */
         if (batch < 1)
                 batch = 1;
  
         /*
-        * Clamp the batch to a 2^n - 1 value. Having a power
-        * of 2 value was found to be more likely to have
-        * suboptimal cache aliasing properties in some cases.
+        * We will be trying to allocate bigger chunks of contiguous
+        * memory of the order of fls(batch).  This should result in
+        * better cache coloring.
          *
-        * For example if 2 tasks are alternately allocating
-        * batches of pages, one task can end up with a lot
-        * of pages of one half of the possible page colors
-        * and the other with pages of the other colors.
+        * Also sanity-check that batch is still within limits.
          */
-       batch = (1 << fls(batch + batch/2)) - 1;
+       batch = (1 << fls(batch + batch/2));
+
+       if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
+               batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2);
+
         return batch;
  }
  
@@ -1754,7 +1757,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
  
         pcp = &p->pcp[0];               /* hot */
         pcp->count = 0;
-       pcp->low = 2 * batch;
+       pcp->low = 0;
         pcp->high = 6 * batch;
         pcp->batch = max(1UL, 1 * batch);
         INIT_LIST_HEAD(&pcp->list);
@@ -1763,7 +1766,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
         pcp->count = 0;
         pcp->low = 0;
         pcp->high = 2 * batch;
-       pcp->batch = max(1UL, 1 * batch);
+       pcp->batch = max(1UL, batch/2);
         INIT_LIST_HEAD(&pcp->list);
  }