mm: vmstat: account per-zone stalls and pages skipped during reclaim
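Pages that isolate_lru_pages() skips because they sit in a zone above sc->reclaim_idx were not accounted at all, and direct reclaim stalls were only counted with a single global ALLOCSTALL event. Account both per zone: skipped pages are counted under PGSCAN_SKIP for the zone they belong to, and do_try_to_free_pages() charges ALLOCSTALL to sc->reclaim_idx. The vmscan tracepoints gain the reclaim index so the per-zone behaviour shows up in traces.

As preparation, the classzone_idx plumbing through shrink_node(), compaction_ready() and kswapd_shrink_node() is replaced by sc->reclaim_idx, balance_pgdat() re-initialises sc.reclaim_idx on every priority iteration, and the premature-wakeup ("remaining") check moves from prepare_kswapd_sleep() to its caller.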
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1013f37cd815d7629abe0564cf32514da1743edc..d5ee6d998b5e68b734f5fb67564c4aaae5389ccb 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1394,6 +1394,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
        struct list_head *src = &lruvec->lists[lru];
        unsigned long nr_taken = 0;
        unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
+       unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
        unsigned long scan, nr_pages;
        LIST_HEAD(pages_skipped);
 
@@ -1408,6 +1409,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 
                if (page_zonenum(page) > sc->reclaim_idx) {
                        list_move(&page->lru, &pages_skipped);
+                       nr_skipped[page_zonenum(page)]++;
                        continue;
                }
 
@@ -1436,10 +1438,19 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
         * scanning would soon rescan the same pages to skip and put the
         * system at risk of premature OOM.
         */
-       if (!list_empty(&pages_skipped))
+       if (!list_empty(&pages_skipped)) {
+               int zid;
+
                list_splice(&pages_skipped, src);
+               for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+                       if (!nr_skipped[zid])
+                               continue;
+
+                       __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
+               }
+       }
        *nr_scanned = scan;
-       trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
+       trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan,
                                    nr_taken, mode, is_file_lru(lru));
        for (scan = 0; scan < MAX_NR_ZONES; scan++) {
                nr_pages = nr_zone_taken[scan];
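The __count_zid_vm_events() helper used above lives outside this file; it is assumed to map an (event, zone index) pair onto consecutive per-zone entries of enum vm_event_item, roughly as sketched below (the exact definition in include/linux/vmstat.h is an assumption, not part of this patch hunk):

    /*
     * Sketch only: relies on the per-zone items of an event (e.g.
     * PGSCAN_SKIP_DMA ... PGSCAN_SKIP_MOVABLE) being declared in the same
     * order as enum zone_type, so the zone id works as an offset from the
     * ZONE_NORMAL entry.
     */
    #define __count_zid_vm_events(item, zid, delta) \
            __count_vm_events(item##_NORMAL - ZONE_NORMAL + (zid), (delta))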
@@ -2428,8 +2439,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
        return true;
 }
 
-static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
-                       enum zone_type classzone_idx)
+static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 {
        struct reclaim_state *reclaim_state = current->reclaim_state;
        unsigned long nr_reclaimed, nr_scanned;
@@ -2524,7 +2534,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
  * Returns true if compaction should go ahead for a high-order request, or
  * the high-order allocation would succeed without compaction.
  */
-static inline bool compaction_ready(struct zone *zone, int order, int classzone_idx)
+static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
 {
        unsigned long watermark;
        bool watermark_ok;
@@ -2535,21 +2545,21 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_idx)
         * there is a buffer of free pages available to give compaction
         * a reasonable chance of completing and allocating the page
         */
-       watermark = high_wmark_pages(zone) + (2UL << order);
-       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, classzone_idx);
+       watermark = high_wmark_pages(zone) + (2UL << sc->order);
+       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
 
        /*
         * If compaction is deferred, reclaim up to a point where
         * compaction will have a chance of success when re-enabled
         */
-       if (compaction_deferred(zone, order))
+       if (compaction_deferred(zone, sc->order))
                return watermark_ok;
 
        /*
         * If compaction is not ready to start and allocation is not likely
         * to succeed without it, then keep reclaiming.
         */
-       if (compaction_suitable(zone, order, 0, classzone_idx) == COMPACT_SKIPPED)
+       if (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx) == COMPACT_SKIPPED)
                return false;
 
        return watermark_ok;
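compaction_ready() now reads the allocation order and the highest eligible zone straight from the scan_control instead of taking them as arguments. A partial view of the struct it relies on (declared near the top of mm/vmscan.c; only the two fields used here are shown and their comments are paraphrased):

    struct scan_control {
            /* ... */
            /* Allocation order that triggered this reclaim pass */
            int order;
            /* Highest zone to isolate pages for reclaim from */
            enum zone_type reclaim_idx;
            /* ... */
    };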
@@ -2570,7 +2580,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
        unsigned long nr_soft_reclaimed;
        unsigned long nr_soft_scanned;
        gfp_t orig_mask;
-       enum zone_type classzone_idx;
        pg_data_t *last_pgdat = NULL;
 
        /*
@@ -2581,7 +2590,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
        orig_mask = sc->gfp_mask;
        if (buffer_heads_over_limit) {
                sc->gfp_mask |= __GFP_HIGHMEM;
-               sc->reclaim_idx = classzone_idx = gfp_zone(sc->gfp_mask);
+               sc->reclaim_idx = gfp_zone(sc->gfp_mask);
        }
 
        for_each_zone_zonelist_nodemask(zone, z, zonelist,
@@ -2589,17 +2598,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                if (!populated_zone(zone))
                        continue;
 
-               /*
-                * Note that reclaim_idx does not change as it is the highest
-                * zone reclaimed from which for empty zones is a no-op but
-                * classzone_idx is used by shrink_node to test if the slabs
-                * should be shrunk on a given node.
-                */
-               classzone_idx = sc->reclaim_idx;
-               while (!populated_zone(zone->zone_pgdat->node_zones +
-                                                       classzone_idx))
-                       classzone_idx--;
-
                /*
                 * Take care memory controller reclaiming has small influence
                 * to global LRU.
@@ -2624,8 +2622,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                         */
                        if (IS_ENABLED(CONFIG_COMPACTION) &&
                            sc->order > PAGE_ALLOC_COSTLY_ORDER &&
-                           zonelist_zone_idx(z) <= classzone_idx &&
-                           compaction_ready(zone, sc->order, classzone_idx)) {
+                           compaction_ready(zone, sc)) {
                                sc->compaction_ready = true;
                                continue;
                        }
@@ -2658,7 +2655,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                if (zone->zone_pgdat == last_pgdat)
                        continue;
                last_pgdat = zone->zone_pgdat;
-               shrink_node(zone->zone_pgdat, sc, classzone_idx);
+               shrink_node(zone->zone_pgdat, sc);
        }
 
        /*
@@ -2694,7 +2691,7 @@ retry:
        delayacct_freepages_start();
 
        if (global_reclaim(sc))
-               count_vm_event(ALLOCSTALL);
+               __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
 
        do {
                vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
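For the per-zone ALLOCSTALL above (and the PGSCAN_SKIP counters earlier) to work with __count_zid_vm_events(), each event is presumably declared once per zone in enum vm_event_item. A sketch of the assumed layout, using the FOR_ALL_ZONES() pattern from include/linux/vm_event_item.h (the exact position and naming are assumptions):

    /*
     * Assumed layout: FOR_ALL_ZONES(xx) expands to xx_DMA, xx_DMA32,
     * xx_NORMAL, xx_HIGH, xx_MOVABLE (depending on config), which end up
     * as per-zone counters in /proc/vmstat.
     */
    enum vm_event_item {
            PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
            FOR_ALL_ZONES(PGALLOC),
            FOR_ALL_ZONES(ALLOCSTALL),
            FOR_ALL_ZONES(PGSCAN_SKIP),
            /* ... */
    };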
@@ -2903,7 +2900,8 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 
        trace_mm_vmscan_direct_reclaim_begin(order,
                                sc.may_writepage,
-                               gfp_mask);
+                               gfp_mask,
+                               sc.reclaim_idx);
 
        nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
@@ -2934,7 +2932,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 
        trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
                                                      sc.may_writepage,
-                                                     sc.gfp_mask);
+                                                     sc.gfp_mask,
+                                                     sc.reclaim_idx);
 
        /*
         * NOTE: Although we can get the priority field, using it
@@ -2982,7 +2981,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 
        trace_mm_vmscan_memcg_reclaim_begin(0,
                                            sc.may_writepage,
-                                           sc.gfp_mask);
+                                           sc.gfp_mask,
+                                           sc.reclaim_idx);
 
        nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
@@ -3035,15 +3035,10 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
  *
  * Returns true if kswapd is ready to sleep
  */
-static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
-                                       int classzone_idx)
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 {
        int i;
 
-       /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
-       if (remaining)
-               return false;
-
        /*
         * The throttled processes are normally woken up in balance_pgdat() as
         * soon as pfmemalloc_watermark_ok() is true. But there is a potential
@@ -3082,7 +3077,6 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
  * This is used to determine if the scanning priority needs to be raised.
  */
 static bool kswapd_shrink_node(pg_data_t *pgdat,
-                              int classzone_idx,
                               struct scan_control *sc)
 {
        struct zone *zone;
@@ -3090,7 +3084,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
 
        /* Reclaim a number of pages proportional to the number of zones */
        sc->nr_to_reclaim = 0;
-       for (z = 0; z <= classzone_idx; z++) {
+       for (z = 0; z <= sc->reclaim_idx; z++) {
                zone = pgdat->node_zones + z;
                if (!populated_zone(zone))
                        continue;
@@ -3102,7 +3096,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
         * Historically care was taken to put equal pressure on all zones but
         * now pressure is applied based on node LRU order.
         */
-       shrink_node(pgdat, sc, classzone_idx);
+       shrink_node(pgdat, sc);
 
        /*
         * Fragmentation may mean that the system cannot be rebalanced for
@@ -3143,7 +3137,6 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                .may_writepage = !laptop_mode,
                .may_unmap = 1,
                .may_swap = 1,
-               .reclaim_idx = classzone_idx,
        };
        count_vm_event(PAGEOUTRUN);
 
@@ -3151,12 +3144,17 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                bool raise_priority = true;
 
                sc.nr_reclaimed = 0;
+               sc.reclaim_idx = classzone_idx;
 
                /*
-                * If the number of buffer_heads in the machine exceeds the
-                * maximum allowed level then reclaim from all zones. This is
-                * not specific to highmem as highmem may not exist but it is
-                * it is expected that buffer_heads are stripped in writeback.
+                * If the number of buffer_heads exceeds the maximum allowed
+                * then consider reclaiming from all zones. This has a dual
+                * purpose -- on 64-bit systems it is expected that
+                * buffer_heads are stripped during active rotation. On 32-bit
+                * systems, highmem pages can pin lowmem memory and shrinking
+                * buffers can relieve lowmem pressure. Reclaim may still not
+                * go ahead if all eligible zones for the original allocation
+                * request are balanced to avoid excessive reclaim from kswapd.
                 */
                if (buffer_heads_over_limit) {
                        for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
@@ -3164,7 +3162,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                                if (!populated_zone(zone))
                                        continue;
 
-                               classzone_idx = i;
+                               sc.reclaim_idx = i;
                                break;
                        }
                }
@@ -3175,7 +3173,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                 * Scanning from low to high zone would allow congestion to be
                 * cleared during a very small window when a small low
                 * zone was balanced even under extreme pressure when the
-                * overall node may be congested.
+                * overall node may be congested. Note that sc.reclaim_idx
+                * is not used as buffer_heads_over_limit may have adjusted
+                * it.
                 */
                for (i = classzone_idx; i >= 0; i--) {
                        zone = pgdat->node_zones + i;
@@ -3213,7 +3213,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                 * enough pages are already being scanned that that high
                 * watermark would be met at 100% efficiency.
                 */
-               if (kswapd_shrink_node(pgdat, classzone_idx, &sc))
+               if (kswapd_shrink_node(pgdat, &sc))
                        raise_priority = false;
 
                /*
@@ -3259,7 +3259,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
        prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 
        /* Try to sleep for a short interval */
-       if (prepare_kswapd_sleep(pgdat, reclaim_order, remaining, classzone_idx)) {
+       if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
                /*
                 * Compaction records what page blocks it recently failed to
                 * isolate pages from and skips them in the future scanning.
@@ -3294,7 +3294,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
         * After a short sleep, check if it was a premature sleep. If not, then
         * go fully to sleep until explicitly woken up.
         */
-       if (prepare_kswapd_sleep(pgdat, reclaim_order, remaining, classzone_idx)) {
+       if (!remaining &&
+           prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
                trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
                /*
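With the "remaining" argument gone from prepare_kswapd_sleep(), the woken-within-HZ/10 check is performed by the caller instead. Condensed from the two hunks above, kswapd_try_to_sleep() now works roughly like this (a simplified sketch, not the verbatim function; the kcompactd wakeup and statistics are omitted):

    long remaining = 0;
    DEFINE_WAIT(wait);

    prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);

    /* Try to sleep for a short interval first. */
    if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx))
            remaining = schedule_timeout(HZ/10);

    /*
     * Only go fully to sleep if the short nap ran to completion, i.e.
     * no direct reclaimer woke kswapd within HZ/10.
     */
    if (!remaining &&
        prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx))
            schedule();

    finish_wait(&pgdat->kswapd_wait, &wait);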
@@ -3399,7 +3400,8 @@ kswapd_try_sleep:
                 * but kcompactd is woken to compact for the original
                 * request (alloc_order).
                 */
-               trace_mm_vmscan_kswapd_wake(pgdat->node_id, alloc_order);
+               trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
+                                               alloc_order);
                reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
                if (reclaim_order < alloc_order)
                        goto kswapd_try_sleep;
@@ -3676,7 +3678,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
                 * priorities until we have enough memory freed.
                 */
                do {
-                       shrink_node(pgdat, &sc, classzone_idx);
+                       shrink_node(pgdat, &sc);
                } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
        }
 