mm/hwpoison: fix refcount of THP head page in no-injection case
[deliverable/linux.git] / mm / vmscan.c
index e61445dce04e3cc83e9704e84f3d5bf9074b31db..8276a3a615ca0049515affe9adbd4a0480ce8c4a 100644 (file)
@@ -175,7 +175,7 @@ static bool sane_reclaim(struct scan_control *sc)
        if (!memcg)
                return true;
 #ifdef CONFIG_CGROUP_WRITEBACK
-       if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
+       if (cgroup_on_dfl(memcg->css.cgroup))
                return true;
 #endif
        return false;
@@ -973,22 +973,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 *    caller can stall after page list has been processed.
                 *
                 * 2) Global or new memcg reclaim encounters a page that is
-                *    not marked for immediate reclaim or the caller does not
-                *    have __GFP_IO. In this case mark the page for immediate
+                *    not marked for immediate reclaim, or the caller does not
+                *    have __GFP_FS (or __GFP_IO if it's simply going to swap,
+                *    not to fs). In this case mark the page for immediate
                 *    reclaim and continue scanning.
                 *
-                *    __GFP_IO is checked  because a loop driver thread might
+                *    Require may_enter_fs because we would wait on fs, which
+                *    may not have submitted IO yet. And the loop driver might
                 *    enter reclaim, and deadlock if it waits on a page for
                 *    which it is needed to do the write (loop masks off
                 *    __GFP_IO|__GFP_FS for this reason); but more thought
                 *    would probably show more reasons.
                 *
-                *    Don't require __GFP_FS, since we're not going into the
-                *    FS, just waiting on its writeback completion. Worryingly,
-                *    ext4 gfs2 and xfs allocate pages with
-                *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
-                *    may_enter_fs here is liable to OOM on them.
-                *
                 * 3) Legacy memcg encounters a page that is not already marked
                 *    PageReclaim. memcg does not have any dirty pages
                 *    throttling so we could easily OOM just because too many
@@ -1005,7 +1001,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
                        /* Case 2 above */
                        } else if (sane_reclaim(sc) ||
-                           !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
+                           !PageReclaim(page) || !may_enter_fs) {
                                /*
                                 * This is slightly racy - end_page_writeback()
                                 * might have just cleared PageReclaim, then
@@ -1061,7 +1057,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 * processes. Try to unmap it here.
                 */
                if (page_mapped(page) && mapping) {
-                       switch (try_to_unmap(page, ttu_flags)) {
+                       switch (try_to_unmap(page,
+                                       ttu_flags|TTU_BATCH_FLUSH)) {
                        case SWAP_FAIL:
                                goto activate_locked;
                        case SWAP_AGAIN:
@@ -1101,7 +1098,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        if (!sc->may_writepage)
                                goto keep_locked;
 
-                       /* Page is dirty, try to write it out here */
+                       /*
+                        * Page is dirty. Flush the TLB if a writable entry
+                        * potentially exists to avoid CPU writes after IO
+                        * starts and then write it out here.
+                        */
+                       try_to_unmap_flush_dirty();
                        switch (pageout(page, mapping, sc)) {
                        case PAGE_KEEP:
                                goto keep_locked;
@@ -1194,7 +1196,7 @@ cull_mlocked:
                if (PageSwapCache(page))
                        try_to_free_swap(page);
                unlock_page(page);
-               putback_lru_page(page);
+               list_add(&page->lru, &ret_pages);
                continue;
 
 activate_locked:
@@ -1212,6 +1214,7 @@ keep:
        }
 
        mem_cgroup_uncharge_list(&free_pages);
+       try_to_unmap_flush();
        free_hot_cold_page_list(&free_pages, true);
 
        list_splice(&ret_pages, page_list);
@@ -1356,7 +1359,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
        unsigned long nr_taken = 0;
        unsigned long scan;
 
-       for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+       for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
+                                       !list_empty(src); scan++) {
                struct page *page;
                int nr_pages;
 
@@ -2155,6 +2159,23 @@ out:
        }
 }
 
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+static void init_tlb_ubc(void)
+{
+       /*
+        * This deliberately does not clear the cpumask as it's expensive
+        * and unnecessary. If there happens to be stale data in there, the
+        * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI, after
+        * which the cpumask will be cleared.
+        */
+       current->tlb_ubc.flush_required = false;
+}
+#else
+static inline void init_tlb_ubc(void)
+{
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
@@ -2189,6 +2210,8 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
        scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
                         sc->priority == DEF_PRIORITY);
 
+       init_tlb_ubc();
+
        blk_start_plug(&plug);
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                                        nr[LRU_INACTIVE_FILE]) {
This page took 0.084867 seconds and 5 git commands to generate.