X-Git-Url: http://drtracing.org/?a=blobdiff_plain;f=mm%2Fvmscan.c;h=a90c603a8d02937fd41bac6a7e72d2891f86bfaa;hb=70b50f94f1644e2aa7cb374819cfd93f3c28d725;hp=5c596654bd3731ef61f3c50b891cf4b77642a7de;hpb=a18bba061c789f5815c3efc3c80e6ac269911964;p=deliverable%2Flinux.git diff --git a/mm/vmscan.c b/mm/vmscan.c index 5c596654bd37..a90c603a8d02 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -633,13 +633,14 @@ redo: lru = LRU_UNEVICTABLE; add_page_to_unevictable_list(page); /* - * When racing with an mlock clearing (page is - * unlocked), make sure that if the other thread does - * not observe our setting of PG_lru and fails - * isolation, we see PG_mlocked cleared below and move + * When racing with an mlock or AS_UNEVICTABLE clearing + * (page is unlocked) make sure that if the other thread + * does not observe our setting of PG_lru and fails + * isolation/check_move_unevictable_page, + * we see PG_mlocked/AS_UNEVICTABLE cleared below and move * the page back to the evictable list. * - * The other side is TestClearPageMlocked(). + * The other side is TestClearPageMlocked() or shmem_lock(). */ smp_mb(); } @@ -750,7 +751,10 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages) */ static unsigned long shrink_page_list(struct list_head *page_list, struct zone *zone, - struct scan_control *sc) + struct scan_control *sc, + int priority, + unsigned long *ret_nr_dirty, + unsigned long *ret_nr_writeback) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); @@ -758,6 +762,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, unsigned long nr_dirty = 0; unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; + unsigned long nr_writeback = 0; cond_resched(); @@ -794,6 +799,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); if (PageWriteback(page)) { + nr_writeback++; /* * Synchronous reclaim cannot queue pages for * writeback due to the possibility of stack overflow @@ -856,10 +862,20 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* * Only kswapd can writeback filesystem pages to - * avoid risk of stack overflow + * avoid risk of stack overflow but do not writeback + * unless under significant pressure. */ - if (page_is_file_cache(page) && !current_is_kswapd()) { - inc_zone_page_state(page, NR_VMSCAN_WRITE_SKIP); + if (page_is_file_cache(page) && + (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) { + /* + * Immediately reclaim when written back. + * Similar in principal to deactivate_page() + * except we already have the page isolated + * and know it's dirty + */ + inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); + SetPageReclaim(page); + goto keep_locked; } @@ -997,6 +1013,8 @@ keep_lumpy: list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); + *ret_nr_dirty += nr_dirty; + *ret_nr_writeback += nr_writeback; return nr_reclaimed; } @@ -1457,6 +1475,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, unsigned long nr_taken; unsigned long nr_anon; unsigned long nr_file; + unsigned long nr_dirty = 0; + unsigned long nr_writeback = 0; isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; while (unlikely(too_many_isolated(zone, file, sc))) { @@ -1509,12 +1529,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, zone, sc); + nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority, + &nr_dirty, &nr_writeback); /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { set_reclaim_mode(priority, sc, true); - nr_reclaimed += shrink_page_list(&page_list, zone, sc); + nr_reclaimed += shrink_page_list(&page_list, zone, sc, + priority, &nr_dirty, &nr_writeback); } local_irq_disable(); @@ -1524,6 +1546,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); + /* + * If reclaim is isolating dirty pages under writeback, it implies + * that the long-lived page allocation rate is exceeding the page + * laundering rate. Either the global limits are not being effective + * at throttling processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing device. The + * only option is to throttle from reclaim context which is not ideal + * as there is no guarantee the dirtying process is throttled in the + * same way balance_dirty_pages() manages. + * + * This scales the number of dirty pages that must be under writeback + * before throttling depending on priority. It is a simple backoff + * function that has the most effect in the range DEF_PRIORITY to + * DEF_PRIORITY-2 which is the priority reclaim is considered to be + * in trouble and reclaim is considered to be in trouble. + * + * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle + * DEF_PRIORITY-1 50% must be PageWriteback + * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble + * ... + * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any + * isolated page is PageWriteback + */ + if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority))) + wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); + trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, zone_idx(zone), nr_scanned, nr_reclaimed, @@ -2055,14 +2103,19 @@ restart: * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. + * + * This function returns true if a zone is being reclaimed for a costly + * high-order allocation and compaction is either ready to begin or deferred. + * This indicates to the caller that it should retry the allocation or fail. */ -static void shrink_zones(int priority, struct zonelist *zonelist, +static bool shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; + bool should_abort_reclaim = false; for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -2077,6 +2130,23 @@ static void shrink_zones(int priority, struct zonelist *zonelist, continue; if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ + if (COMPACTION_BUILD) { + /* + * If we already have plenty of memory free for + * compaction in this zone, don't free any more. + * Even though compaction is invoked for any + * non-zero order, only frequent costly order + * reclamation is disruptive enough to become a + * noticable problem, like transparent huge page + * allocations. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER && + (compaction_suitable(zone, sc->order) || + compaction_deferred(zone))) { + should_abort_reclaim = true; + continue; + } + } /* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and @@ -2094,6 +2164,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist, shrink_zone(priority, zone, sc); } + + return should_abort_reclaim; } static bool zone_reclaimable(struct zone *zone) @@ -2158,7 +2230,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->nr_scanned = 0; if (!priority) disable_swap_token(sc->mem_cgroup); - shrink_zones(priority, zonelist, sc); + if (shrink_zones(priority, zonelist, sc)) + break; + /* * Don't shrink slabs when reclaiming memory from * over limit cgroups @@ -2701,6 +2775,8 @@ out: /* If balanced, clear the congested flag */ zone_clear_flag(zone, ZONE_CONGESTED); + if (i <= *classzone_idx) + balanced += zone->present_pages; } } @@ -2774,7 +2850,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) static int kswapd(void *p) { unsigned long order, new_order; + unsigned balanced_order; int classzone_idx, new_classzone_idx; + int balanced_classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -2805,7 +2883,9 @@ static int kswapd(void *p) set_freezable(); order = new_order = 0; + balanced_order = 0; classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; + balanced_classzone_idx = classzone_idx; for ( ; ; ) { int ret; @@ -2814,7 +2894,8 @@ static int kswapd(void *p) * new request of a similar or harder type will succeed soon * so consider going to sleep on the basis we reclaimed at */ - if (classzone_idx >= new_classzone_idx && order == new_order) { + if (balanced_classzone_idx >= new_classzone_idx && + balanced_order == new_order) { new_order = pgdat->kswapd_max_order; new_classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; @@ -2829,9 +2910,12 @@ static int kswapd(void *p) order = new_order; classzone_idx = new_classzone_idx; } else { - kswapd_try_to_sleep(pgdat, order, classzone_idx); + kswapd_try_to_sleep(pgdat, balanced_order, + balanced_classzone_idx); order = pgdat->kswapd_max_order; classzone_idx = pgdat->classzone_idx; + new_order = order; + new_classzone_idx = classzone_idx; pgdat->kswapd_max_order = 0; pgdat->classzone_idx = pgdat->nr_zones - 1; } @@ -2846,7 +2930,9 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - order = balance_pgdat(pgdat, order, &classzone_idx); + balanced_classzone_idx = classzone_idx; + balanced_order = balance_pgdat(pgdat, order, + &balanced_classzone_idx); } } return 0; @@ -3358,66 +3444,12 @@ void scan_mapping_unevictable_pages(struct address_space *mapping) } -/** - * scan_zone_unevictable_pages - check unevictable list for evictable pages - * @zone - zone of which to scan the unevictable list - * - * Scan @zone's unevictable LRU lists to check for pages that have become - * evictable. Move those that have to @zone's inactive list where they - * become candidates for reclaim, unless shrink_inactive_zone() decides - * to reactivate them. Pages that are still unevictable are rotated - * back onto @zone's unevictable list. - */ -#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ -static void scan_zone_unevictable_pages(struct zone *zone) +static void warn_scan_unevictable_pages(void) { - struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; - unsigned long scan; - unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); - - while (nr_to_scan > 0) { - unsigned long batch_size = min(nr_to_scan, - SCAN_UNEVICTABLE_BATCH_SIZE); - - spin_lock_irq(&zone->lru_lock); - for (scan = 0; scan < batch_size; scan++) { - struct page *page = lru_to_page(l_unevictable); - - if (!trylock_page(page)) - continue; - - prefetchw_prev_lru_page(page, l_unevictable, flags); - - if (likely(PageLRU(page) && PageUnevictable(page))) - check_move_unevictable_page(page, zone); - - unlock_page(page); - } - spin_unlock_irq(&zone->lru_lock); - - nr_to_scan -= batch_size; - } -} - - -/** - * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages - * - * A really big hammer: scan all zones' unevictable LRU lists to check for - * pages that have become evictable. Move those back to the zones' - * inactive list where they become candidates for reclaim. - * This occurs when, e.g., we have unswappable pages on the unevictable lists, - * and we add swap to the system. As such, it runs in the context of a task - * that has possibly/probably made some previously unevictable pages - * evictable. - */ -static void scan_all_zones_unevictable_pages(void) -{ - struct zone *zone; - - for_each_zone(zone) { - scan_zone_unevictable_pages(zone); - } + printk_once(KERN_WARNING + "The scan_unevictable_pages sysctl/node-interface has been " + "disabled for lack of a legitimate use case. If you have " + "one, please send an email to linux-mm@kvack.org.\n"); } /* @@ -3430,11 +3462,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { + warn_scan_unevictable_pages(); proc_doulongvec_minmax(table, write, buffer, length, ppos); - - if (write && *(unsigned long *)table->data) - scan_all_zones_unevictable_pages(); - scan_unevictable_pages = 0; return 0; } @@ -3449,6 +3478,7 @@ static ssize_t read_scan_unevictable_node(struct sys_device *dev, struct sysdev_attribute *attr, char *buf) { + warn_scan_unevictable_pages(); return sprintf(buf, "0\n"); /* always zero; should fit... */ } @@ -3456,19 +3486,7 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev, struct sysdev_attribute *attr, const char *buf, size_t count) { - struct zone *node_zones = NODE_DATA(dev->id)->node_zones; - struct zone *zone; - unsigned long res; - unsigned long req = strict_strtoul(buf, 10, &res); - - if (!req) - return 1; /* zero is no-op */ - - for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!populated_zone(zone)) - continue; - scan_zone_unevictable_pages(zone); - } + warn_scan_unevictable_pages(); return 1; }