spin_lock(&wb->list_lock);
wb_put(wb); /* not gonna deref it anymore */
- if (likely(wb == inode_to_wb(inode)))
+ /* i_wb may have changed in between, can't use inode_to_wb() */
+ if (likely(wb == inode->i_wb))
return wb; /* @inode already has ref */
spin_unlock(&wb->list_lock);
struct inode_switch_wbs_context *isw =
container_of(work, struct inode_switch_wbs_context, work);
struct inode *inode = isw->inode;
+ struct address_space *mapping = inode->i_mapping;
+ struct bdi_writeback *old_wb = inode->i_wb;
struct bdi_writeback *new_wb = isw->new_wb;
+ struct radix_tree_iter iter;
+ bool switched = false;
+ void **slot;
/*
* By the time control reaches here, RCU grace period has passed
* since I_WB_SWITCH assertion and all wb stat update transactions
* between unlocked_inode_to_wb_begin/end() are guaranteed to be
* synchronizing against mapping->tree_lock.
+ *
+ * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock
+ * gives us exclusion against all wb related operations on @inode
+ * including IO list manipulations and stat updates.
*/
+ if (old_wb < new_wb) {
+ spin_lock(&old_wb->list_lock);
+ spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock(&new_wb->list_lock);
+ spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
+ }
spin_lock(&inode->i_lock);
+ spin_lock_irq(&mapping->tree_lock);
+
+ /*
+ * Once I_FREEING is visible under i_lock, the eviction path owns
+ * the inode and we shouldn't modify ->i_wb_list.
+ */
+ if (unlikely(inode->i_state & I_FREEING))
+ goto skip_switch;
+
+ /*
+ * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points
+ * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
+ * pages actually under writeback.
+ */
+ radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+ PAGECACHE_TAG_DIRTY) {
+ struct page *page = radix_tree_deref_slot_protected(slot,
+ &mapping->tree_lock);
+ if (likely(page) && PageDirty(page)) {
+ __dec_wb_stat(old_wb, WB_RECLAIMABLE);
+ __inc_wb_stat(new_wb, WB_RECLAIMABLE);
+ }
+ }
+
+ radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+ PAGECACHE_TAG_WRITEBACK) {
+ struct page *page = radix_tree_deref_slot_protected(slot,
+ &mapping->tree_lock);
+ if (likely(page)) {
+ WARN_ON_ONCE(!PageWriteback(page));
+ __dec_wb_stat(old_wb, WB_WRITEBACK);
+ __inc_wb_stat(new_wb, WB_WRITEBACK);
+ }
+ }
+
+ wb_get(new_wb);
+
+ /*
+ * Transfer to @new_wb's IO list if necessary. The specific list
+ * @inode was on is ignored and the inode is put on ->b_dirty which
+ * is always correct including from ->b_dirty_time. The transfer
+ * preserves @inode->dirtied_when ordering.
+ */
+ if (!list_empty(&inode->i_wb_list)) {
+ struct inode *pos;
+
+ inode_wb_list_del_locked(inode, old_wb);
+ inode->i_wb = new_wb;
+ list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list)
+ if (time_after_eq(inode->dirtied_when,
+ pos->dirtied_when))
+ break;
+ inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev);
+ } else {
+ inode->i_wb = new_wb;
+ }
+ /* ->i_wb_frn updates may race with wbc_detach_inode() but that doesn't matter */
inode->i_wb_frn_winner = 0;
inode->i_wb_frn_avg_time = 0;
inode->i_wb_frn_history = 0;
-
+ switched = true;
+skip_switch:
/*
* Paired with load_acquire in unlocked_inode_to_wb_begin() and
* ensures that the new wb is visible if they see !I_WB_SWITCH.
*/
smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
+ spin_unlock_irq(&mapping->tree_lock);
spin_unlock(&inode->i_lock);
+ spin_unlock(&new_wb->list_lock);
+ spin_unlock(&old_wb->list_lock);
- iput(inode);
+ if (switched) {
+ wb_wakeup(new_wb);
+ wb_put(old_wb);
+ }
wb_put(new_wb);
+
+ iput(inode);
kfree(isw);
}
void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
struct inode *inode)
{
+ if (!inode_cgwb_enabled(inode)) {
+ spin_unlock(&inode->i_lock);
+ return;
+ }
+
wbc->wb = inode_to_wb(inode);
wbc->inode = inode;
wb_get(wbc->wb);
spin_unlock(&inode->i_lock);
+
+ /*
+ * A dying wb indicates that the memcg-blkcg mapping has changed
+ * and a new wb is already serving the memcg. Switch immediately.
+ *
+ * This check runs only after inode->i_lock has been dropped above;
+ * the wb reference taken by wb_get() is still held at this point,
+ * as nothing in between puts it. Note that the early return for
+ * !inode_cgwb_enabled() also drops inode->i_lock, so every path
+ * out of this function leaves the lock released.
+ */
+ if (unlikely(wb_dying(wbc->wb)))
+ inode_switch_wbs(inode, wbc->wb_id);
}
/**
{
struct bdi_writeback *wb = wbc->wb;
struct inode *inode = wbc->inode;
- u16 history = inode->i_wb_frn_history;
- unsigned long avg_time = inode->i_wb_frn_avg_time;
- unsigned long max_bytes, max_time;
+ unsigned long avg_time, max_bytes, max_time;
+ u16 history;
int max_id;
+ if (!wb)
+ return;
+
+ history = inode->i_wb_frn_history;
+ avg_time = inode->i_wb_frn_avg_time;
+
/* pick the winner of this round */
if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
wbc->wb_bytes >= wbc->wb_tcand_bytes) {
*/
int inode_congested(struct inode *inode, int cong_bits)
{
- if (inode) {
- struct bdi_writeback *wb = inode_to_wb(inode);
- if (wb)
- return wb_congested(wb, cong_bits);
+ /*
+ * Once set, ->i_wb never becomes NULL while the inode is alive.
+ * Start transaction iff ->i_wb is visible.
+ */
+ if (inode && inode_to_wb_is_valid(inode)) {
+ struct bdi_writeback *wb;
+ bool locked, congested;
+
+ wb = unlocked_inode_to_wb_begin(inode, &locked);
+ congested = wb_congested(wb, cong_bits);
+ unlocked_inode_to_wb_end(inode, locked);
+ return congested;
}
return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);