mm: vmstat: account per-zone stalls and pages skipped during reclaim
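The __count_zid_vm_events() calls introduced below rely on each zone-indexed event (PGSCAN_SKIP, ALLOCSTALL) occupying one vmstat slot per zone, with the zone id used as an offset into that run; the actual macro lives in the companion include/linux/vmstat.h change, which is not part of this diff. A standalone sketch of the idea (illustrative names only, not kernel code):

/*
 * Sketch: each zone-aware event gets one counter per zone, and the zone id
 * is an offset from the item's ZONE_NORMAL slot.  The kernel macro follows
 * the same pattern; the enums below are simplified for illustration.
 */
#include <stdio.h>

enum zone_type { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_MOVABLE, MAX_NR_ZONES };

enum vm_event_item {
	PGSCAN_SKIP_DMA, PGSCAN_SKIP_DMA32, PGSCAN_SKIP_NORMAL, PGSCAN_SKIP_MOVABLE,
	NR_VM_EVENT_ITEMS
};

static unsigned long vm_events[NR_VM_EVENT_ITEMS];

/* Count 'delta' events of 'item' for zone 'zid' by offsetting from the
 * item's ZONE_NORMAL slot, mirroring what __count_zid_vm_events() does. */
#define count_zid_vm_events(item, zid, delta) \
	(vm_events[item##_NORMAL - ZONE_NORMAL + (zid)] += (delta))

int main(void)
{
	count_zid_vm_events(PGSCAN_SKIP, ZONE_DMA32, 3);
	printf("pgskip_dma32 = %lu\n", vm_events[PGSCAN_SKIP_DMA32]);
	return 0;
}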
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a9bd8b2da2c8a679feeb8648d9089ff6a4822e96..d5ee6d998b5e68b734f5fb67564c4aaae5389ccb 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1394,6 +1394,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
        struct list_head *src = &lruvec->lists[lru];
        unsigned long nr_taken = 0;
        unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
+       unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
        unsigned long scan, nr_pages;
        LIST_HEAD(pages_skipped);
 
@@ -1408,6 +1409,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 
                if (page_zonenum(page) > sc->reclaim_idx) {
                        list_move(&page->lru, &pages_skipped);
+                       nr_skipped[page_zonenum(page)]++;
                        continue;
                }
 
@@ -1436,10 +1438,19 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
         * scanning would soon rescan the same pages to skip and put the
         * system at risk of premature OOM.
         */
-       if (!list_empty(&pages_skipped))
+       if (!list_empty(&pages_skipped)) {
+               int zid;
+
                list_splice(&pages_skipped, src);
+               for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+                       if (!nr_skipped[zid])
+                               continue;
+
+                       __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
+               }
+       }
        *nr_scanned = scan;
-       trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
+       trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan,
                                    nr_taken, mode, is_file_lru(lru));
        for (scan = 0; scan < MAX_NR_ZONES; scan++) {
                nr_pages = nr_zone_taken[scan];
@@ -2680,7 +2691,7 @@ retry:
        delayacct_freepages_start();
 
        if (global_reclaim(sc))
-               count_vm_event(ALLOCSTALL);
+               __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
 
        do {
                vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
@@ -2889,7 +2900,8 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 
        trace_mm_vmscan_direct_reclaim_begin(order,
                                sc.may_writepage,
-                               gfp_mask);
+                               gfp_mask,
+                               sc.reclaim_idx);
 
        nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
@@ -2920,7 +2932,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 
        trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
                                                      sc.may_writepage,
-                                                     sc.gfp_mask);
+                                                     sc.gfp_mask,
+                                                     sc.reclaim_idx);
 
        /*
         * NOTE: Although we can get the priority field, using it
@@ -2968,7 +2981,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 
        trace_mm_vmscan_memcg_reclaim_begin(0,
                                            sc.may_writepage,
-                                           sc.gfp_mask);
+                                           sc.gfp_mask,
+                                           sc.reclaim_idx);
 
        nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
@@ -3021,15 +3035,10 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
  *
  * Returns true if kswapd is ready to sleep
  */
-static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
-                                       int classzone_idx)
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 {
        int i;
 
-       /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
-       if (remaining)
-               return false;
-
        /*
         * The throttled processes are normally woken up in balance_pgdat() as
         * soon as pfmemalloc_watermark_ok() is true. But there is a potential
@@ -3128,7 +3137,6 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                .may_writepage = !laptop_mode,
                .may_unmap = 1,
                .may_swap = 1,
-               .reclaim_idx = classzone_idx,
        };
        count_vm_event(PAGEOUTRUN);
 
@@ -3136,12 +3144,17 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                bool raise_priority = true;
 
                sc.nr_reclaimed = 0;
+               sc.reclaim_idx = classzone_idx;
 
                /*
-                * If the number of buffer_heads in the machine exceeds the
-                * maximum allowed level then reclaim from all zones. This is
-                * not specific to highmem as highmem may not exist but it is
-                * it is expected that buffer_heads are stripped in writeback.
+                * If the number of buffer_heads exceeds the maximum allowed
+                * then consider reclaiming from all zones. This has a dual
+                * purpose -- on 64-bit systems it is expected that
+                * buffer_heads are stripped during active rotation. On 32-bit
+                * systems, highmem pages can pin lowmem memory and shrinking
+                * buffers can relieve lowmem pressure. Reclaim may still not
+                * go ahead if all eligible zones for the original allocation
+                * request are balanced to avoid excessive reclaim from kswapd.
                 */
                if (buffer_heads_over_limit) {
                        for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
@@ -3160,14 +3173,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                 * Scanning from low to high zone would allow congestion to be
                 * cleared during a very small window when a small low
                 * zone was balanced even under extreme pressure when the
-                * overall node may be congested.
+                * overall node may be congested. Note that sc.reclaim_idx
+                * is not used as buffer_heads_over_limit may have adjusted
+                * it.
                 */
-               for (i = sc.reclaim_idx; i >= 0; i--) {
+               for (i = classzone_idx; i >= 0; i--) {
                        zone = pgdat->node_zones + i;
                        if (!populated_zone(zone))
                                continue;
 
-                       if (zone_balanced(zone, sc.order, sc.reclaim_idx))
+                       if (zone_balanced(zone, sc.order, classzone_idx))
                                goto out;
                }
 
@@ -3244,7 +3259,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
        prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 
        /* Try to sleep for a short interval */
-       if (prepare_kswapd_sleep(pgdat, reclaim_order, remaining, classzone_idx)) {
+       if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
                /*
                 * Compaction records what page blocks it recently failed to
                 * isolate pages from and skips them in the future scanning.
@@ -3279,7 +3294,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
         * After a short sleep, check if it was a premature sleep. If not, then
         * go fully to sleep until explicitly woken up.
         */
-       if (prepare_kswapd_sleep(pgdat, reclaim_order, remaining, classzone_idx)) {
+       if (!remaining &&
+           prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
                trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
                /*
@@ -3384,7 +3400,8 @@ kswapd_try_sleep:
                 * but kcompactd is woken to compact for the original
                 * request (alloc_order).
                 */
-               trace_mm_vmscan_kswapd_wake(pgdat->node_id, alloc_order);
+               trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
+                                               alloc_order);
                reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
                if (reclaim_order < alloc_order)
                        goto kswapd_try_sleep;
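Once the companion vmstat changes export these events, the new per-zone counters should appear in /proc/vmstat; assuming they follow the usual per-zone naming (e.g. allocstall_normal, pgskip_dma32), a minimal userspace sketch to dump them:

/* Minimal sketch: print the per-zone allocstall/pgskip counters from
 * /proc/vmstat.  Assumes the allocstall_<zone>/pgskip_<zone> naming used
 * for zone-indexed events; adjust the prefixes if your kernel differs. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *fp = fopen("/proc/vmstat", "r");
	char line[256];

	if (!fp) {
		perror("fopen /proc/vmstat");
		return 1;
	}

	while (fgets(line, sizeof(line), fp)) {
		if (!strncmp(line, "allocstall_", 11) ||
		    !strncmp(line, "pgskip_", 7))
			fputs(line, stdout);
	}

	fclose(fp);
	return 0;
}

Running it after a burst of address-limited allocations shows which zones direct reclaim stalled on and how many pages were skipped as ineligible for the requested zone.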