Merge branch 'for-4.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/libata
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 08f5496fcf1b5a6dede24a03385bee5c24b7e1be..f0520bcf209442914eff0ed60d380fb3d2c66402 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -285,7 +285,8 @@ locked_inode_to_wb_and_lock_list(struct inode *inode)
                spin_lock(&wb->list_lock);
                wb_put(wb);             /* not gonna deref it anymore */
 
-               if (likely(wb == inode_to_wb(inode)))
+               /* i_wb may have changed in between, can't use inode_to_wb() */
+               if (likely(wb == inode->i_wb))
                        return wb;      /* @inode already has ref */
 
                spin_unlock(&wb->list_lock);
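
The hunk above hinges on a lock-and-revalidate retry idiom: take the candidate wb's list_lock, then recheck that the inode still points at that wb, since the association may have been switched while no lock was held. A minimal userspace sketch of the same idiom, with hypothetical obj/owner types standing in for inode/wb and pthread mutexes for spinlocks (the refcounting the kernel does across the unlocked window is elided):

    #include <pthread.h>
    #include <stdatomic.h>

    struct owner { pthread_mutex_t lock; };
    struct obj   { _Atomic(struct owner *) owner; };

    /* Return the object's current owner with owner->lock held. */
    static struct owner *lock_current_owner(struct obj *o)
    {
            for (;;) {
                    struct owner *w = atomic_load(&o->owner); /* unlocked snapshot */

                    pthread_mutex_lock(&w->lock);
                    if (w == atomic_load(&o->owner))
                            return w;               /* association still valid */
                    pthread_mutex_unlock(&w->lock); /* switched in between: retry */
            }
    }
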
@@ -321,30 +322,112 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
        struct inode_switch_wbs_context *isw =
                container_of(work, struct inode_switch_wbs_context, work);
        struct inode *inode = isw->inode;
+       struct address_space *mapping = inode->i_mapping;
+       struct bdi_writeback *old_wb = inode->i_wb;
        struct bdi_writeback *new_wb = isw->new_wb;
+       struct radix_tree_iter iter;
+       bool switched = false;
+       void **slot;
 
        /*
         * By the time control reaches here, RCU grace period has passed
         * since I_WB_SWITCH assertion and all wb stat update transactions
         * between unlocked_inode_to_wb_begin/end() are guaranteed to be
         * synchronizing against mapping->tree_lock.
+        *
+        * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock
+        * gives us exclusion against all wb related operations on @inode
+        * including IO list manipulations and stat updates.
         */
+       if (old_wb < new_wb) {
+               spin_lock(&old_wb->list_lock);
+               spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
+       } else {
+               spin_lock(&new_wb->list_lock);
+               spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
+       }
        spin_lock(&inode->i_lock);
+       spin_lock_irq(&mapping->tree_lock);
+
+       /*
+        * Once I_FREEING is visible under i_lock, the eviction path owns
+        * the inode and we shouldn't modify ->i_wb_list.
+        */
+       if (unlikely(inode->i_state & I_FREEING))
+               goto skip_switch;
+
+       /*
+        * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
+        * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
+        * pages actually under writeback.
+        */
+       radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+                                  PAGECACHE_TAG_DIRTY) {
+               struct page *page = radix_tree_deref_slot_protected(slot,
+                                                       &mapping->tree_lock);
+               if (likely(page) && PageDirty(page)) {
+                       __dec_wb_stat(old_wb, WB_RECLAIMABLE);
+                       __inc_wb_stat(new_wb, WB_RECLAIMABLE);
+               }
+       }
+
+       radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+                                  PAGECACHE_TAG_WRITEBACK) {
+               struct page *page = radix_tree_deref_slot_protected(slot,
+                                                       &mapping->tree_lock);
+               if (likely(page)) {
+                       WARN_ON_ONCE(!PageWriteback(page));
+                       __dec_wb_stat(old_wb, WB_WRITEBACK);
+                       __inc_wb_stat(new_wb, WB_WRITEBACK);
+               }
+       }
+
+       wb_get(new_wb);
+
+       /*
+        * Transfer to @new_wb's IO list if necessary.  The specific list
+        * @inode was on is ignored and the inode is put on ->b_dirty which
+        * is always correct including from ->b_dirty_time.  The transfer
+        * preserves @inode->dirtied_when ordering.
+        */
+       if (!list_empty(&inode->i_wb_list)) {
+               struct inode *pos;
+
+               inode_wb_list_del_locked(inode, old_wb);
+               inode->i_wb = new_wb;
+               list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list)
+                       if (time_after_eq(inode->dirtied_when,
+                                         pos->dirtied_when))
+                               break;
+               inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev);
+       } else {
+               inode->i_wb = new_wb;
+       }
 
+       /* ->i_wb_frn updates may race wbc_detach_inode() but it doesn't matter */
        inode->i_wb_frn_winner = 0;
        inode->i_wb_frn_avg_time = 0;
        inode->i_wb_frn_history = 0;
-
+       switched = true;
+skip_switch:
        /*
         * Paired with load_acquire in unlocked_inode_to_wb_begin() and
         * ensures that the new wb is visible if they see !I_WB_SWITCH.
         */
        smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
 
+       spin_unlock_irq(&mapping->tree_lock);
        spin_unlock(&inode->i_lock);
+       spin_unlock(&new_wb->list_lock);
+       spin_unlock(&old_wb->list_lock);
 
-       iput(inode);
+       if (switched) {
+               wb_wakeup(new_wb);
+               wb_put(old_wb);
+       }
        wb_put(new_wb);
+
+       iput(inode);
        kfree(isw);
 }
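
inode_switch_wbs_work_fn() above takes two list_locks of the same lock class, so it orders the acquisitions by address to rule out ABBA deadlock between concurrent switchers. A minimal userspace sketch of that ordering discipline (pthread mutexes standing in for the wb list_locks; this is an illustration, not kernel code):

    #include <stdint.h>
    #include <pthread.h>

    /* Acquire two distinct same-class locks in a globally consistent order.
     * a and b must differ, just as old_wb and new_wb do in the patch. */
    static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
    {
            if ((uintptr_t)a < (uintptr_t)b) {
                    pthread_mutex_lock(a);
                    pthread_mutex_lock(b);
            } else {
                    pthread_mutex_lock(b);
                    pthread_mutex_lock(a);
            }
    }

    static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
    {
            /* Release order is irrelevant for deadlock avoidance. */
            pthread_mutex_unlock(a);
            pthread_mutex_unlock(b);
    }

Two threads calling lock_pair() on the same pair in opposite argument orders still acquire in the same global (address) order, so neither can hold one lock while waiting for the other's second lock.
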
 
@@ -430,6 +513,11 @@ out_free:
 void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
                                 struct inode *inode)
 {
+       if (!inode_cgwb_enabled(inode)) {
+               spin_unlock(&inode->i_lock);
+               return;
+       }
+
        wbc->wb = inode_to_wb(inode);
        wbc->inode = inode;
 
@@ -442,6 +530,13 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 
        wb_get(wbc->wb);
        spin_unlock(&inode->i_lock);
+
+       /*
+        * A dying wb indicates that the memcg-blkcg mapping has changed
+        * and a new wb is already serving the memcg.  Switch immediately.
+        */
+       if (unlikely(wb_dying(wbc->wb)))
+               inode_switch_wbs(inode, wbc->wb_id);
 }
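
Note the contract here: wbc_attach_and_unlock_inode() is entered with inode->i_lock held and must release it on every path, so the new !inode_cgwb_enabled() bailout still unlocks before returning and simply leaves wbc->wb at its zero-initialized NULL. A hypothetical userspace sketch of the same "unlock on all paths" shape (res/ctx/node are stand-ins, not kernel types):

    #include <stddef.h>
    #include <stdbool.h>
    #include <pthread.h>

    struct res  { int dummy; };     /* stands in for bdi_writeback */
    struct ctx  { struct res *r; }; /* stands in for wbc, zeroed by caller */
    struct node {                   /* stands in for inode */
            pthread_mutex_t lock;
            struct res *r;
            bool cgwb_enabled;
    };

    /* Called with n->lock held; always returns with it released. */
    static void attach_and_unlock(struct ctx *c, struct node *n)
    {
            if (!n->cgwb_enabled) {
                    pthread_mutex_unlock(&n->lock);
                    return;         /* c->r stays NULL: detach must no-op */
            }
            c->r = n->r;
            pthread_mutex_unlock(&n->lock);
    }
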
 
 /**
@@ -485,11 +580,16 @@ void wbc_detach_inode(struct writeback_control *wbc)
 {
        struct bdi_writeback *wb = wbc->wb;
        struct inode *inode = wbc->inode;
-       u16 history = inode->i_wb_frn_history;
-       unsigned long avg_time = inode->i_wb_frn_avg_time;
-       unsigned long max_bytes, max_time;
+       unsigned long avg_time, max_bytes, max_time;
+       u16 history;
        int max_id;
 
+       if (!wb)
+               return;
+
+       history = inode->i_wb_frn_history;
+       avg_time = inode->i_wb_frn_avg_time;
+
        /* pick the winner of this round */
        if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
            wbc->wb_bytes >= wbc->wb_tcand_bytes) {
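
The !wb early return added in this hunk is the other half of that contract: when cgroup writeback is disabled, attach never set wbc->wb, so detach must degrade to a no-op before touching the foreign-inode bookkeeping fields. Continuing the hypothetical sketch from above:

    /* Pairs with attach_and_unlock(); safe when attach bailed out early. */
    static void detach(struct ctx *c)
    {
            if (!c->r)
                    return;         /* feature disabled: nothing attached */

            /* ... transfer stats / pick the foreign-owner winner here ... */
            c->r = NULL;
    }
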
@@ -618,10 +718,18 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page,
  */
 int inode_congested(struct inode *inode, int cong_bits)
 {
-       if (inode) {
-               struct bdi_writeback *wb = inode_to_wb(inode);
-               if (wb)
-                       return wb_congested(wb, cong_bits);
+       /*
+        * Once set, ->i_wb never becomes NULL while the inode is alive.
+        * Start transaction iff ->i_wb is visible.
+        */
+       if (inode && inode_to_wb_is_valid(inode)) {
+               struct bdi_writeback *wb;
+               bool locked, congested;
+
+               wb = unlocked_inode_to_wb_begin(inode, &locked);
+               congested = wb_congested(wb, cong_bits);
+               unlocked_inode_to_wb_end(inode, locked);
+               return congested;
        }
 
        return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
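
inode_congested() may be called from paths where unconditional locking is undesirable, so it uses the unlocked_inode_to_wb_begin/end() transaction: run lockless in the common case, and take the lock only while an i_wb switch is flagged as in flight. A simplified userspace sketch of that shape; note the kernel additionally relies on an RCU grace period before the switch actually starts to close the race between checking the flag and the switch beginning, which this sketch elides:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    struct owner { int congested; };
    struct obj {
            pthread_mutex_t lock;
            atomic_bool switching;            /* mirrors I_WB_SWITCH */
            _Atomic(struct owner *) owner;    /* mirrors inode->i_wb */
    };

    static struct owner *read_owner_begin(struct obj *o, bool *locked)
    {
            *locked = atomic_load(&o->switching);
            if (*locked)
                    pthread_mutex_lock(&o->lock);  /* switch in flight */
            return atomic_load(&o->owner);
    }

    static void read_owner_end(struct obj *o, bool locked)
    {
            if (locked)
                    pthread_mutex_unlock(&o->lock);
    }

    /* Usage, mirroring inode_congested() above. */
    static int query_congested(struct obj *o)
    {
            bool locked;
            struct owner *w = read_owner_begin(o, &locked);
            int ret = w->congested;

            read_owner_end(o, locked);
            return ret;
    }
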