Merge branch 'for-4.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/libata
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f98d40333c8529b3391c99ee5dacd3bbd1fb5f67..f0520bcf209442914eff0ed60d380fb3d2c66402 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -258,6 +258,248 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
                wb_put(wb);
 }
 
+/**
+ * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
+ * @inode: inode of interest with i_lock held
+ *
+ * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
+ * held on entry and is released on return.  The returned wb is guaranteed
+ * to stay @inode's associated wb until its list_lock is released.
+ */
+static struct bdi_writeback *
+locked_inode_to_wb_and_lock_list(struct inode *inode)
+       __releases(&inode->i_lock)
+       __acquires(&wb->list_lock)
+{
+       while (true) {
+               struct bdi_writeback *wb = inode_to_wb(inode);
+
+               /*
+                * inode_to_wb() association is protected by both
+                * @inode->i_lock and @wb->list_lock but list_lock nests
+                * outside i_lock.  Drop i_lock and verify that the
+                * association hasn't changed after acquiring list_lock.
+                */
+               wb_get(wb);
+               spin_unlock(&inode->i_lock);
+               spin_lock(&wb->list_lock);
+               wb_put(wb);             /* not gonna deref it anymore */
+
+               /* i_wb may have changed in between, can't use inode_to_wb() */
+               if (likely(wb == inode->i_wb))
+                       return wb;      /* @inode already has ref */
+
+               spin_unlock(&wb->list_lock);
+               cpu_relax();
+               spin_lock(&inode->i_lock);
+       }
+}
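
As an illustration of the intended calling convention, a minimal sketch with a hypothetical caller name; the same pattern is used by inode_wb_list_del() further down in this patch:

	/* illustrative sketch only, not part of the patch */
	static void example_remove_from_io_list(struct inode *inode)
	{
		struct bdi_writeback *wb;

		spin_lock(&inode->i_lock);
		/* drops i_lock; returns with wb->list_lock held, so i_wb can't change */
		wb = locked_inode_to_wb_and_lock_list(inode);
		inode_wb_list_del_locked(inode, wb);
		spin_unlock(&wb->list_lock);
	}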
+
+/**
+ * inode_to_wb_and_lock_list - determine an inode's wb and lock it
+ * @inode: inode of interest
+ *
+ * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
+ * on entry.
+ */
+static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
+       __acquires(&wb->list_lock)
+{
+       spin_lock(&inode->i_lock);
+       return locked_inode_to_wb_and_lock_list(inode);
+}
+
+struct inode_switch_wbs_context {
+       struct inode            *inode;
+       struct bdi_writeback    *new_wb;
+
+       struct rcu_head         rcu_head;
+       struct work_struct      work;
+};
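
The functions below implement the switch asynchronously; roughly, the lifecycle of this context structure is:

	/*
	 * Lifecycle sketch (summary of the code below):
	 *
	 *   inode_switch_wbs()          allocates isw, pins isw->new_wb, sets
	 *                               I_WB_SWITCH and queues an RCU callback
	 *   inode_switch_wbs_rcu_fn()   runs after a grace period; it needs to
	 *                               grab bh-unsafe locks, so it bounces to
	 *                               a work item
	 *   inode_switch_wbs_work_fn()  transfers stats and IO lists under
	 *                               list_lock, i_lock and tree_lock, clears
	 *                               I_WB_SWITCH, drops the refs, frees isw
	 */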
+
+static void inode_switch_wbs_work_fn(struct work_struct *work)
+{
+       struct inode_switch_wbs_context *isw =
+               container_of(work, struct inode_switch_wbs_context, work);
+       struct inode *inode = isw->inode;
+       struct address_space *mapping = inode->i_mapping;
+       struct bdi_writeback *old_wb = inode->i_wb;
+       struct bdi_writeback *new_wb = isw->new_wb;
+       struct radix_tree_iter iter;
+       bool switched = false;
+       void **slot;
+
+       /*
+        * By the time control reaches here, an RCU grace period has passed
+        * since the I_WB_SWITCH assertion and all wb stat update transactions
+        * between unlocked_inode_to_wb_begin/end() are guaranteed to be
+        * synchronizing against mapping->tree_lock.
+        *
+        * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock
+        * gives us exclusion against all wb related operations on @inode
+        * including IO list manipulations and stat updates.
+        */
+       if (old_wb < new_wb) {
+               spin_lock(&old_wb->list_lock);
+               spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
+       } else {
+               spin_lock(&new_wb->list_lock);
+               spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
+       }
+       spin_lock(&inode->i_lock);
+       spin_lock_irq(&mapping->tree_lock);
+
+       /*
+        * Once I_FREEING is visible under i_lock, the eviction path owns
+        * the inode and we shouldn't modify ->i_wb_list.
+        */
+       if (unlikely(inode->i_state & I_FREEING))
+               goto skip_switch;
+
+       /*
+        * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
+        * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
+        * pages actually under writeback.
+        */
+       radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+                                  PAGECACHE_TAG_DIRTY) {
+               struct page *page = radix_tree_deref_slot_protected(slot,
+                                                       &mapping->tree_lock);
+               if (likely(page) && PageDirty(page)) {
+                       __dec_wb_stat(old_wb, WB_RECLAIMABLE);
+                       __inc_wb_stat(new_wb, WB_RECLAIMABLE);
+               }
+       }
+
+       radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+                                  PAGECACHE_TAG_WRITEBACK) {
+               struct page *page = radix_tree_deref_slot_protected(slot,
+                                                       &mapping->tree_lock);
+               if (likely(page)) {
+                       WARN_ON_ONCE(!PageWriteback(page));
+                       __dec_wb_stat(old_wb, WB_WRITEBACK);
+                       __inc_wb_stat(new_wb, WB_WRITEBACK);
+               }
+       }
+
+       wb_get(new_wb);
+
+       /*
+        * Transfer to @new_wb's IO list if necessary.  The specific list
+        * @inode was on is ignored and the inode is put on ->b_dirty which
+        * is always correct including from ->b_dirty_time.  The transfer
+        * preserves @inode->dirtied_when ordering.
+        */
+       if (!list_empty(&inode->i_wb_list)) {
+               struct inode *pos;
+
+               inode_wb_list_del_locked(inode, old_wb);
+               inode->i_wb = new_wb;
+               list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list)
+                       if (time_after_eq(inode->dirtied_when,
+                                         pos->dirtied_when))
+                               break;
+               inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev);
+       } else {
+               inode->i_wb = new_wb;
+       }
+
+       /* ->i_wb_frn updates may race wbc_detach_inode() but that's harmless */
+       inode->i_wb_frn_winner = 0;
+       inode->i_wb_frn_avg_time = 0;
+       inode->i_wb_frn_history = 0;
+       switched = true;
+skip_switch:
+       /*
+        * Paired with the load_acquire in unlocked_inode_to_wb_begin() and
+        * ensures that readers which see !I_WB_SWITCH also see the new i_wb.
+        */
+       smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
+
+       spin_unlock_irq(&mapping->tree_lock);
+       spin_unlock(&inode->i_lock);
+       spin_unlock(&new_wb->list_lock);
+       spin_unlock(&old_wb->list_lock);
+
+       if (switched) {
+               wb_wakeup(new_wb);
+               wb_put(old_wb);
+       }
+       wb_put(new_wb);
+
+       iput(inode);
+       kfree(isw);
+}
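
The store_release at the end pairs with a load_acquire on the unlocked stat-update side. Roughly, the unlocked_inode_to_wb_begin/end() helpers referenced above look like the following sketch; the real definitions are added elsewhere in this series (in the backing-dev header) and may differ in detail:

	static inline struct bdi_writeback *
	sketch_unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
	{
		rcu_read_lock();
		/* paired with the smp_store_release() clearing I_WB_SWITCH */
		*lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
		if (unlikely(*lockedp))
			spin_lock_irq(&inode->i_mapping->tree_lock);
		return inode->i_wb;
	}

	static inline void
	sketch_unlocked_inode_to_wb_end(struct inode *inode, bool locked)
	{
		if (unlikely(locked))
			spin_unlock_irq(&inode->i_mapping->tree_lock);
		rcu_read_unlock();
	}

A stat updater that sees I_WB_SWITCH therefore serializes on tree_lock, which is exactly the lock the work function holds while transferring the counters.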
+
+static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
+{
+       struct inode_switch_wbs_context *isw = container_of(rcu_head,
+                               struct inode_switch_wbs_context, rcu_head);
+
+       /* needs to grab bh-unsafe locks, bounce to work item */
+       INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
+       schedule_work(&isw->work);
+}
+
+/**
+ * inode_switch_wbs - change the wb association of an inode
+ * @inode: target inode
+ * @new_wb_id: ID of the new wb
+ *
+ * Switch @inode's wb association to the wb identified by @new_wb_id.  The
+ * switching is performed asynchronously and may fail silently.
+ */
+static void inode_switch_wbs(struct inode *inode, int new_wb_id)
+{
+       struct backing_dev_info *bdi = inode_to_bdi(inode);
+       struct cgroup_subsys_state *memcg_css;
+       struct inode_switch_wbs_context *isw;
+
+       /* noop if a switch already seems to be in progress */
+       if (inode->i_state & I_WB_SWITCH)
+               return;
+
+       isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
+       if (!isw)
+               return;
+
+       /* find and pin the new wb */
+       rcu_read_lock();
+       memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
+       if (memcg_css)
+               isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+       rcu_read_unlock();
+       if (!isw->new_wb)
+               goto out_free;
+
+       /* while holding I_WB_SWITCH, no one else can update the association */
+       spin_lock(&inode->i_lock);
+       if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+           inode_to_wb(inode) == isw->new_wb) {
+               spin_unlock(&inode->i_lock);
+               goto out_free;
+       }
+       inode->i_state |= I_WB_SWITCH;
+       spin_unlock(&inode->i_lock);
+
+       ihold(inode);
+       isw->inode = inode;
+
+       /*
+        * In addition to synchronizing among switchers, I_WB_SWITCH tells
+        * the RCU protected stat update paths to grab the mapping's
+        * tree_lock so that stat transfer can synchronize against them.
+        * Let's continue after I_WB_SWITCH is guaranteed to be visible.
+        */
+       call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
+       return;
+
+out_free:
+       if (isw->new_wb)
+               wb_put(isw->new_wb);
+       kfree(isw);
+}
+
 /**
  * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
  * @wbc: writeback_control of interest
@@ -271,6 +513,11 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
 void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
                                 struct inode *inode)
 {
+       if (!inode_cgwb_enabled(inode)) {
+               spin_unlock(&inode->i_lock);
+               return;
+       }
+
        wbc->wb = inode_to_wb(inode);
        wbc->inode = inode;
 
@@ -283,6 +530,13 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 
        wb_get(wbc->wb);
        spin_unlock(&inode->i_lock);
+
+       /*
+        * A dying wb indicates that the memcg-blkcg mapping has changed
+        * and a new wb is already serving the memcg.  Switch immediately.
+        */
+       if (unlikely(wb_dying(wbc->wb)))
+               inode_switch_wbs(inode, wbc->wb_id);
 }
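
For context, the attach/detach pair is meant to bracket the actual page writeback so that per-wb foreign statistics accumulate in between. A minimal sketch with a hypothetical caller name (the real call sites are in the inode writeback paths of fs-writeback.c):

	/* illustrative sketch only */
	static void example_write_inode_pages(struct inode *inode,
					      struct writeback_control *wbc)
	{
		spin_lock(&inode->i_lock);
		wbc_attach_and_unlock_inode(wbc, inode);  /* records and pins inode's wb */

		do_writepages(inode->i_mapping, wbc);     /* wbc_account_io() tallies pages */

		wbc_detach_inode(wbc);                    /* evaluates stats, may switch wb */
	}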
 
 /**
@@ -326,11 +580,16 @@ void wbc_detach_inode(struct writeback_control *wbc)
 {
        struct bdi_writeback *wb = wbc->wb;
        struct inode *inode = wbc->inode;
-       u16 history = inode->i_wb_frn_history;
-       unsigned long avg_time = inode->i_wb_frn_avg_time;
-       unsigned long max_bytes, max_time;
+       unsigned long avg_time, max_bytes, max_time;
+       u16 history;
        int max_id;
 
+       if (!wb)
+               return;
+
+       history = inode->i_wb_frn_history;
+       avg_time = inode->i_wb_frn_avg_time;
+
        /* pick the winner of this round */
        if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
            wbc->wb_bytes >= wbc->wb_tcand_bytes) {
@@ -383,12 +642,8 @@ void wbc_detach_inode(struct writeback_control *wbc)
                 * is okay.  The main goal is avoiding keeping an inode on
                 * the wrong wb for an extended period of time.
                 */
-               if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) {
-                       /* switch */
-                       max_id = 0;
-                       avg_time = 0;
-                       history = 0;
-               }
+               if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
+                       inode_switch_wbs(inode, max_id);
        }
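
To make the threshold concrete: i_wb_frn_history acts as a 16-bit shift register in which set bits roughly mark recent slots dominated by a foreign wb, and the switch fires once more than WB_FRN_HIST_THR_SLOTS bits are set. A toy check, assuming the threshold is half of the 16 slots (the actual constants are defined near the top of fs-writeback.c):

	/* toy illustration; the threshold value here is an assumption */
	static bool example_should_switch(u16 history)
	{
		/* e.g. 0xff0f (12 bits set) switches, 0x00ff (8 bits) does not */
		return hweight32(history) > 16 / 2;
	}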
 
        /*
@@ -463,10 +718,18 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page,
  */
 int inode_congested(struct inode *inode, int cong_bits)
 {
-       if (inode) {
-               struct bdi_writeback *wb = inode_to_wb(inode);
-               if (wb)
-                       return wb_congested(wb, cong_bits);
+       /*
+        * Once set, ->i_wb never becomes NULL while the inode is alive.
+        * Start transaction iff ->i_wb is visible.
+        */
+       if (inode && inode_to_wb_is_valid(inode)) {
+               struct bdi_writeback *wb;
+               bool locked, congested;
+
+               wb = unlocked_inode_to_wb_begin(inode, &locked);
+               congested = wb_congested(wb, cong_bits);
+               unlocked_inode_to_wb_end(inode, locked);
+               return congested;
        }
 
        return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
@@ -603,6 +866,27 @@ restart:
 
 #else  /* CONFIG_CGROUP_WRITEBACK */
 
+static struct bdi_writeback *
+locked_inode_to_wb_and_lock_list(struct inode *inode)
+       __releases(&inode->i_lock)
+       __acquires(&wb->list_lock)
+{
+       struct bdi_writeback *wb = inode_to_wb(inode);
+
+       spin_unlock(&inode->i_lock);
+       spin_lock(&wb->list_lock);
+       return wb;
+}
+
+static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
+       __acquires(&wb->list_lock)
+{
+       struct bdi_writeback *wb = inode_to_wb(inode);
+
+       spin_lock(&wb->list_lock);
+       return wb;
+}
+
 static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
 {
        return nr_pages;
@@ -678,9 +962,9 @@ void wb_start_background_writeback(struct bdi_writeback *wb)
  */
 void inode_wb_list_del(struct inode *inode)
 {
-       struct bdi_writeback *wb = inode_to_wb(inode);
+       struct bdi_writeback *wb;
 
-       spin_lock(&wb->list_lock);
+       wb = inode_to_wb_and_lock_list(inode);
        inode_wb_list_del_locked(inode, wb);
        spin_unlock(&wb->list_lock);
 }
@@ -1784,12 +2068,11 @@ void __mark_inode_dirty(struct inode *inode, int flags)
                 * reposition it (that would break b_dirty time-ordering).
                 */
                if (!was_dirty) {
-                       struct bdi_writeback *wb = inode_to_wb(inode);
+                       struct bdi_writeback *wb;
                        struct list_head *dirty_list;
                        bool wakeup_bdi = false;
 
-                       spin_unlock(&inode->i_lock);
-                       spin_lock(&wb->list_lock);
+                       wb = locked_inode_to_wb_and_lock_list(inode);
 
                        WARN(bdi_cap_writeback_dirty(wb->bdi) &&
                             !test_bit(WB_registered, &wb->state),