thp: reduce usage of huge zero page's atomic counter
author Aaron Lu <aaron.lu@intel.com>
Sat, 10 Sep 2016 10:34:14 +0000 (20:34 +1000)
committer Stephen Rothwell <sfr@canb.auug.org.au>
Sat, 10 Sep 2016 10:34:14 +0000 (20:34 +1000)
The global zero page is used to satisfy an anonymous read fault.  If
THP (Transparent HugePage) is enabled then the global huge zero page is
used.  The global huge zero page uses an atomic counter for reference
counting and is allocated/freed dynamically according to its counter
value.

CPU time spent on that counter will greatly increase if there are a lot of
processes doing anonymous read faults.  This patch proposes a way to
reduce the access to the global counter so that the CPU load can be
reduced accordingly.

To do this, a new flag of the mm_struct is introduced:
MMF_HUGE_ZERO_PAGE.  With this flag, a process only needs to touch
the global counter in two cases:

1 The first time it uses the global huge zero page;
2 The time when mm_users of its mm_struct reaches zero.

Note that right now, the huge zero page is eligible to be freed as soon as
its last use goes away.  With this patch, the page will not be eligible to
be freed until the exit of the last process from which it was ever used.

And with the use of mm_users, a kthread is not eligible to use the huge zero
page either.  Since no kthread is using the huge zero page today, there is no
difference after applying this patch.  But if that is not desired, I can
change it to when mm_count reaches zero.

Case used for test on Haswell EP:
usemem -n 72 --readonly -j 0x200000 100G
Which spawns 72 processes and each will mmap 100G anonymous space and
then do read only access to that space sequentially with a step of 2MB.

CPU cycles from perf report for base commit:
    54.03%  usemem   [kernel.kallsyms]   [k] get_huge_zero_page
CPU cycles from perf report for this commit:
     0.11%  usemem   [kernel.kallsyms]   [k] mm_get_huge_zero_page

Performance(throughput) of the workload for base commit: 1784430792
Performance(throughput) of the workload for this commit: 4726928591
164% increase.

Runtime of the workload for base commit: 707592 us
Runtime of the workload for this commit: 303970 us
50% drop.

Link: http://lkml.kernel.org/r/fe51a88f-446a-4622-1363-ad1282d71385@intel.com
Signed-off-by: Aaron Lu <aaron.lu@intel.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
fs/dax.c
include/linux/huge_mm.h
include/linux/sched.h
kernel/fork.c
mm/huge_memory.c
mm/swap.c
mm/swap_state.c

index 993dc6fe0416e17e8a0ca5c8a432b8daf574df86..226c0d5eedaca1a14c133fb909b502c83f0d1c82 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1034,7 +1034,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
        if (!write && !buffer_mapped(&bh)) {
                spinlock_t *ptl;
                pmd_t entry;
-               struct page *zero_page = get_huge_zero_page();
+               struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm);
 
                if (unlikely(!zero_page)) {
                        dax_pmd_dbg(&bh, address, "no zero page");
index 4fca5263fd4253fcfb7ac93b9b0d5a8fe2aac8bb..9b9f65d9987393d456911f41eacb4bdfa9fe0284 100644 (file)
@@ -156,8 +156,8 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
        return is_huge_zero_page(pmd_page(pmd));
 }
 
-struct page *get_huge_zero_page(void);
-void put_huge_zero_page(void);
+struct page *mm_get_huge_zero_page(struct mm_struct *mm);
+void mm_put_huge_zero_page(struct mm_struct *mm);
 
 #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot))
 
@@ -220,9 +220,9 @@ static inline bool is_huge_zero_page(struct page *page)
        return false;
 }
 
-static inline void put_huge_zero_page(void)
+static inline void mm_put_huge_zero_page(struct mm_struct *mm)
 {
-       BUILD_BUG();
+       return;
 }
 
 static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
index b162048511c46ad76d856cc0624cd24afe3b058f..0e974b51a3f2736f3ac0b39335e9e3206fa6993f 100644 (file)
@@ -524,6 +524,7 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_RECALC_UPROBES     20      /* MMF_HAS_UPROBES can be wrong */
 #define MMF_OOM_SKIP           21      /* mm is of no interest for the OOM killer */
 #define MMF_UNSTABLE           22      /* mm is unstable for copy_from_user */
+#define MMF_HUGE_ZERO_PAGE     23      /* mm has ever used the global huge zero page */
 
 #define MMF_INIT_MASK          (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
 
index 56e049047ebb101ae60bf6984b7c6696e2574360..690a1aad95e4fa11bbf6b2605bfb7d94bbb666a7 100644 (file)
@@ -717,6 +717,7 @@ static inline void __mmput(struct mm_struct *mm)
        ksm_exit(mm);
        khugepaged_exit(mm); /* must run before exit_mmap */
        exit_mmap(mm);
+       mm_put_huge_zero_page(mm);
        set_mm_exe_file(mm, NULL);
        if (!list_empty(&mm->mmlist)) {
                spin_lock(&mmlist_lock);
index 883f0ee0e73b8d385d67ce0d0c4ab5919234a487..fc0d37e357c328411992ab229ae63c94a9e368ed 100644 (file)
@@ -59,7 +59,7 @@ static struct shrinker deferred_split_shrinker;
 static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 
-struct page *get_huge_zero_page(void)
+static struct page *get_huge_zero_page(void)
 {
        struct page *zero_page;
 retry:
@@ -86,7 +86,7 @@ retry:
        return READ_ONCE(huge_zero_page);
 }
 
-void put_huge_zero_page(void)
+static void put_huge_zero_page(void)
 {
        /*
         * Counter should never go to zero here. Only shrinker can put
@@ -95,6 +95,26 @@ void put_huge_zero_page(void)
        BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
+struct page *mm_get_huge_zero_page(struct mm_struct *mm)
+{
+       if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+               return READ_ONCE(huge_zero_page);
+
+       if (!get_huge_zero_page())
+               return NULL;
+
+       if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+               put_huge_zero_page();
+
+       return READ_ONCE(huge_zero_page);
+}
+
+void mm_put_huge_zero_page(struct mm_struct *mm)
+{
+       if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+               put_huge_zero_page();
+}
+
 static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
                                        struct shrink_control *sc)
 {
@@ -644,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
                pgtable = pte_alloc_one(vma->vm_mm, haddr);
                if (unlikely(!pgtable))
                        return VM_FAULT_OOM;
-               zero_page = get_huge_zero_page();
+               zero_page = mm_get_huge_zero_page(vma->vm_mm);
                if (unlikely(!zero_page)) {
                        pte_free(vma->vm_mm, pgtable);
                        count_vm_event(THP_FAULT_FALLBACK);
@@ -666,10 +686,8 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
                        }
                } else
                        spin_unlock(fe->ptl);
-               if (!set) {
+               if (!set)
                        pte_free(vma->vm_mm, pgtable);
-                       put_huge_zero_page();
-               }
                return ret;
        }
        gfp = alloc_hugepage_direct_gfpmask(vma);
@@ -823,7 +841,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                 * since we already have a zero page to copy. It just takes a
                 * reference.
                 */
-               zero_page = get_huge_zero_page();
+               zero_page = mm_get_huge_zero_page(dst_mm);
                set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
                                zero_page);
                ret = 0;
@@ -1081,7 +1099,6 @@ alloc:
                update_mmu_cache_pmd(vma, fe->address, fe->pmd);
                if (!page) {
                        add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-                       put_huge_zero_page();
                } else {
                        VM_BUG_ON_PAGE(!PageHead(page), page);
                        page_remove_rmap(page, true);
@@ -1545,7 +1562,6 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
        }
        smp_wmb(); /* make pte visible before pmd */
        pmd_populate(mm, pmd, pgtable);
-       put_huge_zero_page();
 }
 
 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
@@ -1568,8 +1584,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
        if (!vma_is_anonymous(vma)) {
                _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
-               if (is_huge_zero_pmd(_pmd))
-                       put_huge_zero_page();
                if (vma_is_dax(vma))
                        return;
                page = pmd_page(_pmd);
index 75c63bb2a1da1dc0c3e55600e9db1618949df87a..4dcf852e1e6d8f2e9f0eeca9ee39f620ea972957 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -748,10 +748,8 @@ void release_pages(struct page **pages, int nr, bool cold)
                        locked_pgdat = NULL;
                }
 
-               if (is_huge_zero_page(page)) {
-                       put_huge_zero_page();
+               if (is_huge_zero_page(page))
                        continue;
-               }
 
                page = compound_head(page);
                if (!put_page_testzero(page))
index 268b8191982b245e2adb0c68d3518f173b012cb4..8679c997eab63677a0d01547f54508ca04e64bc2 100644 (file)
@@ -254,9 +254,7 @@ static inline void free_swap_cache(struct page *page)
 void free_page_and_swap_cache(struct page *page)
 {
        free_swap_cache(page);
-       if (is_huge_zero_page(page))
-               put_huge_zero_page();
-       else
+       if (!is_huge_zero_page(page))
                put_page(page);
 }
 
This page took 0.034107 seconds and 5 git commands to generate.