[deliverable/linux.git] / fs / xfs / linux-2.6 / xfs_sync.c

/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_btree.h"
#include "xfs_dir2_sf.h"
#include "xfs_attr_sf.h"
#include "xfs_inode.h"
#include "xfs_dinode.h"
#include "xfs_error.h"
#include "xfs_mru_cache.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
#include "xfs_utils.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_rw.h"

#include <linux/kthread.h>
#include <linux/freezer.h>

/*
 * xfs_sync flushes any pending I/O to file system vfsp.
 *
 * This routine is called by vfs_sync() to make sure that things make it
 * out to disk eventually, on sync() system calls to flush out everything,
 * and when the file system is unmounted.  For the vfs_sync() case, all
 * we really need to do is sync out the log to make all of our meta-data
 * updates permanent (except for timestamps).  For calls from pflushd(),
 * dirty pages are kept moving by calling pdflush() on the inodes
 * containing them.  We also flush the inodes that we can lock without
 * sleeping and the superblock if we can lock it without sleeping from
 * vfs_sync() so that items at the tail of the log are always moving out.
 *
 * Flags:
 *      SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want
 *		       to sleep if we can help it.  All we really need
 *		       to do is ensure that the log is synced at least
 *		       periodically.  We also push the inodes and
 *		       superblock if we can lock them without sleeping
 *			and they are not pinned.
 *      SYNC_ATTR    - We need to flush the inodes.  If SYNC_BDFLUSH is not
 *		       set, then we really want to lock each inode and flush
 *		       it.
 *      SYNC_WAIT    - All the flushes that take place in this call should
 *		       be synchronous.
 *      SYNC_DELWRI  - This tells us to push dirty pages associated with
 *		       inodes.  SYNC_WAIT and SYNC_BDFLUSH are used to
 *		       determine if they should be flushed sync, async, or
 *		       delwri.
 *      SYNC_CLOSE   - This flag is passed when the system is being
 *		       unmounted.  We should sync and invalidate everything.
 *      SYNC_FSDATA  - This indicates that the caller would like to make
 *		       sure the superblock is safe on disk.  We can ensure
 *		       this by simply making sure the log gets flushed
 *		       if SYNC_BDFLUSH is set, and by actually writing it
 *		       out otherwise.
 *	SYNC_IOWAIT  - The caller wants us to wait for all data I/O to complete
 *		       before we return (including direct I/O). Forms the drain
 *		       side of the write barrier needed to safely quiesce the
 *		       filesystem.
 *
 */
int
xfs_sync(
	xfs_mount_t	*mp,
	int		flags)
{
	/*
	 * Dquots go first.  Per the quota manager contract described in
	 * this file, XFS_QM_DQSYNC yields zero when quota support is not
	 * built in or this mount does not use quotas.
	 */
	int		error = XFS_QM_DQSYNC(mp, flags);

	if (error) {
		/*
		 * An I/O error is expected to have triggered a forced
		 * shutdown; in that case there is nothing left to sync.
		 * Any other dquot error is dropped and the sync carries on.
		 */
		ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp));
		if (XFS_FORCED_SHUTDOWN(mp))
			return XFS_ERROR(error);
	}

	/* Callers that drain all data I/O also flush the filestreams. */
	if (flags & SYNC_IOWAIT)
		xfs_filestream_flush(mp);

	/* No bypass counter is requested on this path. */
	return xfs_syncsub(mp, flags, NULL);
}

/*
 * Sync all the inodes in the given AG according to the
 * direction given by the flags.
 *
 * The per-AG in-core inode radix tree (pag_ici_root) is walked one inode
 * per lookup, under pag_ici_lock held for read.  For each inode found:
 *   - SYNC_CLOSE:  pages are flushed and invalidated (or simply tossed if
 *                  the filesystem has been forcibly shut down);
 *   - SYNC_DELWRI: dirty pages are flushed using fflag;
 *   - SYNC_ATTR:   a dirty inode is flushed, synchronously under SYNC_WAIT,
 *                  otherwise delayed-write if the flush lock can be taken
 *                  without blocking;
 *   - SYNC_IOWAIT: vn_iowait() is called after each data flush.
 *
 * mp:       mount being synced.
 * ag:       index into mp->m_perag of the AG to walk.
 * flags:    SYNC_* flags as above.
 * bypassed: optional counter, incremented for every inode whose flush lock
 *           could not be taken in the non-SYNC_WAIT case.
 *
 * Returns the most recent non-zero error seen, EFSCORRUPTED immediately
 * when it is encountered, or 0.
 */
STATIC int
xfs_sync_inodes_ag(
	xfs_mount_t	*mp,
	int		ag,
	int		flags,
	int		*bypassed)
{
	xfs_inode_t	*ip = NULL;
	struct inode	*vp = NULL;
	xfs_perag_t	*pag = &mp->m_perag[ag];
	boolean_t	vnode_refed = B_FALSE;
	int		nr_found;
	int		first_index = 0;
	int		error = 0;
	int		last_error = 0;
	int		fflag = XFS_B_ASYNC;
	int		lock_flags = XFS_ILOCK_SHARED;

	/* pick the data flush mode: async by default, delwri, or sync */
	if (flags & SYNC_DELWRI)
		fflag = XFS_B_DELWRI;
	if (flags & SYNC_WAIT)
		fflag = 0;		/* synchronous overrides all */

	if (flags & (SYNC_DELWRI | SYNC_CLOSE)) {
		/*
		 * We need the I/O lock if we're going to call any of
		 * the flush/inval routines.
		 */
		lock_flags |= XFS_IOLOCK_SHARED;
	}

	do {
		/*
		 * use a gang lookup to find the next inode in the tree
		 * as the tree is sparse and a gang lookup walks to find
		 * the number of objects requested.
		 */
		read_lock(&pag->pag_ici_lock);
		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
				(void**)&ip, first_index, 1);

		if (!nr_found) {
			read_unlock(&pag->pag_ici_lock);
			break;
		}

		/*
		 * update the index for the next lookup
		 *
		 * NOTE(review): nothing here guards against the computed
		 * AG-relative index wrapping back below the current inode;
		 * confirm that cannot happen for the last inode of an AG.
		 */
		first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);

		/*
		 * skip inodes in reclaim. Let xfs_syncsub do that for
		 * us so we don't need to worry.
		 */
		vp = VFS_I(ip);
		if (!vp) {
			read_unlock(&pag->pag_ici_lock);
			continue;
		}

		/* bad inodes are dealt with elsewhere */
		if (VN_BAD(vp)) {
			read_unlock(&pag->pag_ici_lock);
			continue;
		}

		/*
		 * nothing to sync during shutdown
		 *
		 * NOTE(review): this returns 0 even if an earlier inode in
		 * this walk already recorded an error in last_error.
		 */
		if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
			read_unlock(&pag->pag_ici_lock);
			return 0;
		}

		/*
		 * The inode lock here actually coordinates with the almost
		 * spurious inode lock in xfs_ireclaim() to prevent the vnode
		 * we handle here without a reference from being freed while we
		 * reference it.  If we lock the inode while it's on the mount
		 * list here, then the spurious inode lock in xfs_ireclaim()
		 * after the inode is pulled from the mount list will sleep
		 * until we release it here.  This keeps the vnode from being
		 * freed while we reference it.
		 */
		if (xfs_ilock_nowait(ip, lock_flags) == 0) {
			/*
			 * The non-blocking lock attempt failed.  Take a vnode
			 * reference while still under pag_ici_lock, then drop
			 * the tree lock before sleeping on the inode lock.
			 */
			vp = vn_grab(vp);
			read_unlock(&pag->pag_ici_lock);
			if (!vp)
				continue;
			xfs_ilock(ip, lock_flags);

			ASSERT(vp == VFS_I(ip));
			ASSERT(ip->i_mount == mp);

			vnode_refed = B_TRUE;
		} else {
			/*
			 * The non-blocking lock attempt succeeded.  No vnode
			 * reference is taken on this path (vnode_refed stays
			 * false); the inode lock we now hold is what the
			 * comment above relies on, so the tree lock can go.
			 */
			read_unlock(&pag->pag_ici_lock);
		}
		/*
		 * If we have to flush data or wait for I/O completion
		 * we need to drop the ilock that we currently hold.
		 * Only XFS_ILOCK_SHARED is dropped and retaken below; the
		 * XFS_IOLOCK_SHARED added to lock_flags above stays held
		 * across the flush calls.
		 *
		 * NOTE(review): each step below assigns to the same 'error'
		 * variable, so a failure in an earlier step is overwritten
		 * by the result of a later one, and 'error' is not reset at
		 * the top of the loop.
		 */
		if (flags & SYNC_CLOSE) {
			xfs_iunlock(ip, XFS_ILOCK_SHARED);
			if (XFS_FORCED_SHUTDOWN(mp))
				xfs_tosspages(ip, 0, -1, FI_REMAPF);
			else
				error = xfs_flushinval_pages(ip, 0, -1,
							FI_REMAPF);
			/* wait for I/O on freeze */
			if (flags & SYNC_IOWAIT)
				vn_iowait(ip);

			xfs_ilock(ip, XFS_ILOCK_SHARED);
		}

		/* push dirty data pages using the flush mode chosen above */
		if ((flags & SYNC_DELWRI) && VN_DIRTY(vp)) {
			xfs_iunlock(ip, XFS_ILOCK_SHARED);
			error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
			if (flags & SYNC_IOWAIT)
				vn_iowait(ip);
			xfs_ilock(ip, XFS_ILOCK_SHARED);
		}

		/*
		 * Flush the inode itself.  The clean check is repeated once
		 * the flush lock is held; if the inode turned out clean the
		 * flush lock is released here, otherwise it is handed to
		 * xfs_iflush().
		 */
		if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
			if (flags & SYNC_WAIT) {
				xfs_iflock(ip);
				if (!xfs_inode_clean(ip))
					error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
				else
					xfs_ifunlock(ip);
			} else if (xfs_iflock_nowait(ip)) {
				if (!xfs_inode_clean(ip))
					error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
				else
					xfs_ifunlock(ip);
			} else if (bypassed) {
				/* could not get the flush lock; count it */
				(*bypassed)++;
			}
		}

		/* lock_flags always contains at least XFS_ILOCK_SHARED */
		if (lock_flags)
			xfs_iunlock(ip, lock_flags);

		/* drop the reference taken on the blocking-lock path */
		if (vnode_refed) {
			IRELE(ip);
			vnode_refed = B_FALSE;
		}

		if (error)
			last_error = error;
		/*
		 * bail out if the filesystem is corrupted.
		 */
		if (error == EFSCORRUPTED)
			return XFS_ERROR(error);

	} while (nr_found);

	return last_error;
}

/*
 * Sync the in-core inodes of every allocation group whose inode cache
 * has been initialised (pag_ici_init).
 *
 * When supplied, *bypassed is zeroed first and then accumulates the
 * per-AG bypass counts.  A read-only mount is skipped entirely.
 *
 * Returns the most recent per-AG error; the walk stops as soon as an AG
 * reports EFSCORRUPTED.
 */
int
xfs_sync_inodes(
	xfs_mount_t	*mp,
	int		flags,
	int		*bypassed)
{
	int		last_error = 0;
	int		agno;

	if (bypassed)
		*bypassed = 0;
	if (mp->m_flags & XFS_MOUNT_RDONLY)
		return 0;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		int	error;

		if (mp->m_perag[agno].pag_ici_init) {
			error = xfs_sync_inodes_ag(mp, agno, flags, bypassed);
			if (error) {
				last_error = error;
				/* corruption ends the walk immediately */
				if (error == EFSCORRUPTED)
					break;
			}
		}
	}

	return XFS_ERROR(last_error);
}

/*
 * Commit a dummy transaction that logs the core of the root inode, then
 * force the log with the given log_flags.
 *
 * mp:        mount whose log is being covered.
 * log_flags: flags passed to xfs_log_force() after the commit.
 *
 * Returns 0 on success, the xfs_trans_reserve() error if the reservation
 * fails (the transaction is cancelled and nothing is logged), or the
 * xfs_trans_commit() error.  The root inode is unlocked and the log is
 * forced whether or not the commit succeeded.
 */
STATIC int
xfs_commit_dummy_trans(
	struct xfs_mount	*mp,
	uint			log_flags)
{
	struct xfs_inode	*ip = mp->m_rootip;
	struct xfs_trans	*tp;
	int			error;

	/*
	 * Put a dummy transaction in the log to tell recovery
	 * that all others are OK.
	 */
	tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
	if (error) {
		xfs_trans_cancel(tp, 0);
		return error;
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL);

	/*
	 * Join the root inode to the transaction but hold it, so the
	 * commit does not release the lock we drop explicitly below.
	 */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_ihold(tp, ip);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	error = xfs_trans_commit(tp, 0);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	/*
	 * Force the log as before, then report a failed commit to the
	 * caller instead of silently returning success.
	 */
	xfs_log_force(mp, 0, log_flags);
	return error;
}

/*
 * Write the superblock buffer to disk.
 *
 * With SYNC_BDFLUSH the write is opportunistic: it is skipped (returning
 * 0) when the buffer cannot be locked without sleeping, has no dirty log
 * item, or is pinned.  Otherwise the buffer is always written, forcing
 * the log first if it is pinned.  SYNC_WAIT selects a synchronous write.
 *
 * Returns the result of xfs_bwrite(), or 0 when the write was skipped.
 */
STATIC int
xfs_sync_fsdata(
	struct xfs_mount	*mp,
	int			flags)
{
	struct xfs_buf		*bp;

	if (!(flags & SYNC_BDFLUSH)) {
		bp = xfs_getsb(mp, 0);

		/*
		 * A pinned buffer would leave the write waiting for
		 * someone, possibly ourselves, to flush the log, so push
		 * the log now.  An earlier log force does not help: the
		 * superblock buffer was not locked then and may have
		 * become pinned since.
		 */
		if (XFS_BUF_ISPINNED(bp))
			xfs_log_force(mp, 0, XFS_LOG_FORCE);
	} else {
		struct xfs_buf_log_item	*bip;

		/*
		 * xfssyncd() case: only sync the superblock if it can be
		 * locked without sleeping and it is not pinned.
		 */
		ASSERT(!(flags & SYNC_WAIT));

		bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
		if (!bp)
			return 0;

		bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
		if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp)) {
			/* nothing to write, or not writable right now */
			xfs_buf_relse(bp);
			return 0;
		}
	}

	if (flags & SYNC_WAIT)
		XFS_BUF_UNASYNC(bp);
	else
		XFS_BUF_ASYNC(bp);

	return xfs_bwrite(mp, bp);
}

/*
 * xfs sync routine for internal use
 *
 * Accepts every flag of the generic vfs_sync interface, as described in
 * the comment above xfs_sync().  In order, it forces the log, syncs or
 * reclaims inodes, forces the log again after a data flush, writes the
 * superblock, covers the log with a dummy transaction when needed, and
 * flushes the buffer targets on a synchronous close.
 *
 * Returns the superblock write error, if any.  A failing dummy
 * transaction returns its error immediately and skips the final flush.
 */
int
xfs_syncsub(
	xfs_mount_t	*mp,
	int		flags,
	int		*bypassed)
{
	const int	close_and_wait = SYNC_CLOSE | SYNC_WAIT;
	int		last_error = 0;
	int		error;
	uint		log_flags;

	/*
	 * Sync out the log.  Doing this on every call keeps the log
	 * periodically flushed even when there is too little activity
	 * to fill it up.
	 */
	log_flags = (flags & SYNC_WAIT) ?
			(XFS_LOG_FORCE | XFS_LOG_SYNC) : XFS_LOG_FORCE;
	xfs_log_force(mp, (xfs_lsn_t)0, log_flags);

	if (flags & (SYNC_ATTR|SYNC_DELWRI)) {
		if (!(flags & SYNC_BDFLUSH)) {
			/*
			 * NOTE(review): the result of the inode sync never
			 * reaches the return value of this function.
			 */
			(void)xfs_sync_inodes(mp, flags, bypassed);
		} else {
			xfs_finish_reclaim_all(mp, 1,
					XFS_IFLUSH_DELWRI_ELSE_ASYNC);
		}
	}

	/*
	 * Flushing dirty data above probably generated more log
	 * activity, so push the log once more in that case.
	 */
	if (flags & SYNC_DELWRI)
		xfs_log_force(mp, 0, log_flags);

	if (flags & SYNC_FSDATA)
		last_error = xfs_sync_fsdata(mp, flags);

	/*
	 * Now check to see if the log needs a "dummy" transaction.
	 */
	if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
		error = xfs_commit_dummy_trans(mp, log_flags);
		if (error)
			return error;
	}

	/*
	 * When shutting down, we need to ensure that the AIL is pushed
	 * to disk or the filesystem can appear corrupt from the PROM.
	 */
	if ((flags & close_and_wait) == close_and_wait) {
		XFS_bflush(mp->m_ddev_targp);
		if (mp->m_rtdev_targp)
			XFS_bflush(mp->m_rtdev_targp);
	}

	return XFS_ERROR(last_error);
}

/*
 * Hand a work item to the xfssyncd thread of this mount.
 *
 * Deferring work this way has two advantages:
 * - it saves stack space, which is tight in certain situations;
 * - it can be used (with care) as a mechanism to avoid deadlocks.
 * Flushing while allocating in a full filesystem requires both.
 *
 * The item is allocated here with KM_SLEEP, appended to mp->m_sync_list
 * under m_sync_lock, and the sync thread is woken.  xfssyncd() frees the
 * item after calling syncer(mp, data).
 */
STATIC void
xfs_syncd_queue_work(
	struct xfs_mount *mp,
	void		*data,
	void		(*syncer)(struct xfs_mount *, void *))
{
	struct bhv_vfs_sync_work *item;

	item = kmem_alloc(sizeof(*item), KM_SLEEP);
	item->w_mount = mp;
	item->w_data = data;
	item->w_syncer = syncer;
	INIT_LIST_HEAD(&item->w_list);

	spin_lock(&mp->m_sync_lock);
	list_add_tail(&item->w_list, &mp->m_sync_list);
	spin_unlock(&mp->m_sync_lock);

	wake_up_process(mp->m_sync_task);
}

/*
 * Sync-thread callback queued by xfs_flush_inode().
 *
 * Flush delayed allocate data, attempting to free up reserved space
 * from existing allocations.  By the time this runs, a new allocation
 * attempt has already failed with ENOSPC and the caller is looking for
 * more room.
 *
 * arg is the struct inode to flush; the reference taken when the work
 * was queued is dropped here.
 */
STATIC void
xfs_flush_inode_work(
	struct xfs_mount *mp,
	void		*arg)
{
	struct inode	*target = arg;

	filemap_flush(target->i_mapping);
	iput(target);
}

/*
 * Queue a flush of this inode's page cache on the sync thread, then
 * sleep for 500ms.
 *
 * An inode reference is taken with igrab() (its return value is not
 * checked) and released by xfs_flush_inode_work() once the flush has
 * been issued.
 */
void
xfs_flush_inode(
	xfs_inode_t	*ip)
{
	struct inode	*vfs_inode = VFS_I(ip);

	igrab(vfs_inode);
	xfs_syncd_queue_work(ip->i_mount, vfs_inode, xfs_flush_inode_work);

	delay(msecs_to_jiffies(500));
}

/*
 * Sync-thread callback queued by xfs_flush_device().
 *
 * This is the "bigger hammer" version of xfs_flush_inode_work...
 * (IOW, "If at first you don't succeed, use a Bigger Hammer"): it syncs
 * the whole block device behind the mount's superblock rather than one
 * inode's mapping.
 *
 * arg is the struct inode whose reference was taken when the work was
 * queued; it is only used here to drop that reference.
 */
STATIC void
xfs_flush_device_work(
	struct xfs_mount *mp,
	void		*arg)
{
	struct inode	*held = arg;

	sync_blockdev(mp->m_super->s_bdev);
	iput(held);
}

/*
 * Queue a sync of the whole block device on the sync thread, sleep for
 * 500ms, then force the log synchronously.
 *
 * An inode reference is taken with igrab() (its return value is not
 * checked) and released by xfs_flush_device_work().
 */
void
xfs_flush_device(
	xfs_inode_t	*ip)
{
	struct inode	*vfs_inode = VFS_I(ip);

	igrab(vfs_inode);
	xfs_syncd_queue_work(ip->i_mount, vfs_inode, xfs_flush_device_work);

	delay(msecs_to_jiffies(500));

	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
}

/*
 * Periodic work item run by xfssyncd().
 *
 * On a writable mount this performs a non-blocking sync of the
 * superblock and inode attributes; the result of xfs_sync() is not
 * acted upon.  In all cases the sync sequence number is bumped and
 * anyone sleeping on m_wait_single_sync_task is woken.
 */
STATIC void
xfs_sync_worker(
	struct xfs_mount *mp,
	void		*unused)
{
	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
		/* return value is dropped, as it always has been */
		xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR);
	}

	mp->m_sync_seq++;
	wake_up(&mp->m_wait_single_sync_task);
}

/*
 * Main loop of the per-mount "xfssyncd" kernel thread.
 *
 * arg is the struct xfs_mount this thread serves.  The thread sleeps for
 * up to xfs_syncd_centisecs (converted to jiffies) at a time, then runs
 * every work item queued on mp->m_sync_list.  When the timeout expired,
 * or when it was woken with nothing queued, it also queues the mount's
 * own periodic item (mp->m_sync_work).
 *
 * The loop exits, and the thread returns 0, once kthread_should_stop()
 * is true and the work list is empty.
 */
STATIC int
xfssyncd(
	void			*arg)
{
	struct xfs_mount	*mp = arg;
	long			timeleft;
	bhv_vfs_sync_work_t	*work, *n;
	LIST_HEAD		(tmp);

	set_freezable();
	/* one centisecond is 10ms, hence msecs_to_jiffies(10) per unit */
	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
	for (;;) {
		/* returns the unexpired part of the timeout if woken early */
		timeleft = schedule_timeout_interruptible(timeleft);
		/* swsusp */
		try_to_freeze();
		/* only stop once all queued work has been drained */
		if (kthread_should_stop() && list_empty(&mp->m_sync_list))
			break;

		spin_lock(&mp->m_sync_lock);
		/*
		 * We can get woken by laptop mode, to do a sync -
		 * that's the (only!) case where the list would be
		 * empty with time remaining.
		 */
		if (!timeleft || list_empty(&mp->m_sync_list)) {
			/* restart the interval only if it actually expired */
			if (!timeleft)
				timeleft = xfs_syncd_centisecs *
							msecs_to_jiffies(10);
			/* queue the mount's built-in periodic sync item */
			INIT_LIST_HEAD(&mp->m_sync_work.w_list);
			list_add_tail(&mp->m_sync_work.w_list,
					&mp->m_sync_list);
		}
		/*
		 * Move everything onto a private list so the syncers below
		 * run without m_sync_lock held.
		 */
		list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
			list_move(&work->w_list, &tmp);
		spin_unlock(&mp->m_sync_lock);

		list_for_each_entry_safe(work, n, &tmp, w_list) {
			(*work->w_syncer)(mp, work->w_data);
			list_del(&work->w_list);
			/* m_sync_work is embedded in the mount; never free it */
			if (work == &mp->m_sync_work)
				continue;
			kmem_free(work);
		}
	}

	return 0;
}

/*
 * Set up the mount's periodic sync work item and start its "xfssyncd"
 * thread.
 *
 * Returns 0 on success, or the negated PTR_ERR() value of the failed
 * kthread_run() call.  On failure mp->m_sync_task is left holding the
 * error pointer.
 */
int
xfs_syncd_init(
	struct xfs_mount	*mp)
{
	mp->m_sync_work.w_syncer = xfs_sync_worker;
	mp->m_sync_work.w_mount = mp;

	mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");

	return IS_ERR(mp->m_sync_task) ? -PTR_ERR(mp->m_sync_task) : 0;
}

/*
 * Stop the mount's xfssyncd thread.  The thread's loop only exits once
 * its work list is empty, so already-queued items are run first.
 */
void
xfs_syncd_stop(
	struct xfs_mount	*mp)
{
	struct task_struct	*syncd = mp->m_sync_task;

	kthread_stop(syncd);
}
Commit	Line	Data
fe4fa4b8 DC	1	/*
	2	* Copyright (c) 2000-2005 Silicon Graphics, Inc.
	3	* All Rights Reserved.
	4	*
	5	* This program is free software; you can redistribute it and/or
	6	* modify it under the terms of the GNU General Public License as
	7	* published by the Free Software Foundation.
	8	*
	9	* This program is distributed in the hope that it would be useful,
	10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	* GNU General Public License for more details.
	13	*
	14	* You should have received a copy of the GNU General Public License
	15	* along with this program; if not, write the Free Software Foundation,
	16	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
	17	*/
	18	#include "xfs.h"
	19	#include "xfs_fs.h"
	20	#include "xfs_types.h"
	21	#include "xfs_bit.h"
	22	#include "xfs_log.h"
	23	#include "xfs_inum.h"
	24	#include "xfs_trans.h"
	25	#include "xfs_sb.h"
	26	#include "xfs_ag.h"
	27	#include "xfs_dir2.h"
	28	#include "xfs_dmapi.h"
	29	#include "xfs_mount.h"
	30	#include "xfs_bmap_btree.h"
	31	#include "xfs_alloc_btree.h"
	32	#include "xfs_ialloc_btree.h"
	33	#include "xfs_btree.h"
	34	#include "xfs_dir2_sf.h"
	35	#include "xfs_attr_sf.h"
	36	#include "xfs_inode.h"
	37	#include "xfs_dinode.h"
	38	#include "xfs_error.h"
	39	#include "xfs_mru_cache.h"
	40	#include "xfs_filestream.h"
	41	#include "xfs_vnodeops.h"
	42	#include "xfs_utils.h"
	43	#include "xfs_buf_item.h"
	44	#include "xfs_inode_item.h"
	45	#include "xfs_rw.h"
	46
a167b17e DC	47	#include <linux/kthread.h>
	48	#include <linux/freezer.h>
	49
fe4fa4b8 DC	50	/*
	51	* xfs_sync flushes any pending I/O to file system vfsp.
	52	*
	53	* This routine is called by vfs_sync() to make sure that things make it
	54	* out to disk eventually, on sync() system calls to flush out everything,
	55	* and when the file system is unmounted. For the vfs_sync() case, all
	56	* we really need to do is sync out the log to make all of our meta-data
	57	* updates permanent (except for timestamps). For calls from pflushd(),
	58	* dirty pages are kept moving by calling pdflush() on the inodes
	59	* containing them. We also flush the inodes that we can lock without
	60	* sleeping and the superblock if we can lock it without sleeping from
	61	* vfs_sync() so that items at the tail of the log are always moving out.
	62	*
	63	* Flags:
	64	* SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want
	65	* to sleep if we can help it. All we really need
	66	* to do is ensure that the log is synced at least
	67	* periodically. We also push the inodes and
	68	* superblock if we can lock them without sleeping
	69	* and they are not pinned.
	70	* SYNC_ATTR - We need to flush the inodes. If SYNC_BDFLUSH is not
	71	* set, then we really want to lock each inode and flush
	72	* it.
	73	* SYNC_WAIT - All the flushes that take place in this call should
	74	* be synchronous.
	75	* SYNC_DELWRI - This tells us to push dirty pages associated with
	76	* inodes. SYNC_WAIT and SYNC_BDFLUSH are used to
	77	* determine if they should be flushed sync, async, or
	78	* delwri.
	79	* SYNC_CLOSE - This flag is passed when the system is being
	80	* unmounted. We should sync and invalidate everything.
	81	* SYNC_FSDATA - This indicates that the caller would like to make
	82	* sure the superblock is safe on disk. We can ensure
	83	* this by simply making sure the log gets flushed
	84	* if SYNC_BDFLUSH is set, and by actually writing it
	85	* out otherwise.
	86	* SYNC_IOWAIT - The caller wants us to wait for all data I/O to complete
	87	* before we return (including direct I/O). Forms the drain
	88	* side of the write barrier needed to safely quiesce the
	89	* filesystem.
	90	*
	91	*/
	92	int
	93	xfs_sync(
	94	xfs_mount_t *mp,
	95	int flags)
	96	{
	97	int error;
	98
	99	/*
	100	* Get the Quota Manager to flush the dquots.
	101	*
	102	* If XFS quota support is not enabled or this filesystem
	103	* instance does not use quotas XFS_QM_DQSYNC will always
	104	* return zero.
	105	*/
	106	error = XFS_QM_DQSYNC(mp, flags);
	107	if (error) {
	108	/*
	109	* If we got an IO error, we will be shutting down.
	110	* So, there's nothing more for us to do here.
	111	*/
	112	ASSERT(error != EIO \|\| XFS_FORCED_SHUTDOWN(mp));
	113	if (XFS_FORCED_SHUTDOWN(mp))
114	return XFS_ERROR(error);
115	}
116
117	if (flags & SYNC_IOWAIT)
118	xfs_filestream_flush(mp);
119
120	return xfs_syncsub(mp, flags, NULL);
121	}
122
123	/*
683a8970 DC	124	* Sync all the inodes in the given AG according to the
683a8970 DC	125	* direction given by the flags.
fe4fa4b8	126	*/
683a8970 DC	127	STATIC int
683a8970 DC	128	xfs_sync_inodes_ag(
fe4fa4b8	129	xfs_mount_t *mp,
683a8970	130	int ag,
fe4fa4b8	131	int flags,
683a8970	132	int *bypassed)
fe4fa4b8 DC	133	{
	134	xfs_inode_t *ip = NULL;
	135	struct inode *vp = NULL;
683a8970 DC	136	xfs_perag_t *pag = &mp->m_perag[ag];
	137	boolean_t vnode_refed = B_FALSE;
	138	int nr_found;
	139	int first_index = 0;
	140	int error = 0;
	141	int last_error = 0;
	142	int fflag = XFS_B_ASYNC;
	143	int lock_flags = XFS_ILOCK_SHARED;
fe4fa4b8	144
fe4fa4b8 DC	145	if (flags & SYNC_DELWRI)
	146	fflag = XFS_B_DELWRI;
	147	if (flags & SYNC_WAIT)
	148	fflag = 0; /* synchronous overrides all */
	149
fe4fa4b8 DC	150	if (flags & (SYNC_DELWRI \| SYNC_CLOSE)) {
	151	/*
	152	* We need the I/O lock if we're going to call any of
	153	* the flush/inval routines.
	154	*/
683a8970	155	lock_flags \|= XFS_IOLOCK_SHARED;
fe4fa4b8 DC	156	}
fe4fa4b8 DC	157
fe4fa4b8	158	do {
fe4fa4b8	159	/*
683a8970 DC	160	* use a gang lookup to find the next inode in the tree
	161	* as the tree is sparse and a gang lookup walks to find
	162	* the number of objects requested.
fe4fa4b8	163	*/
683a8970 DC	164	read_lock(&pag->pag_ici_lock);
	165	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
	166	(void**)&ip, first_index, 1);
fe4fa4b8	167
683a8970 DC	168	if (!nr_found) {
	169	read_unlock(&pag->pag_ici_lock);
	170	break;
fe4fa4b8 DC	171	}
fe4fa4b8 DC	172
683a8970 DC	173	/* update the index for the next lookup */
683a8970 DC	174	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
fe4fa4b8 DC	175
fe4fa4b8 DC	176	/*
683a8970 DC	177	* skip inodes in reclaim. Let xfs_syncsub do that for
683a8970 DC	178	* us so we don't need to worry.
fe4fa4b8	179	*/
683a8970 DC	180	vp = VFS_I(ip);
	181	if (!vp) {
	182	read_unlock(&pag->pag_ici_lock);
fe4fa4b8 DC	183	continue;
	184	}
	185
683a8970	186	/* bad inodes are dealt with elsewhere */
fe4fa4b8	187	if (VN_BAD(vp)) {
683a8970	188	read_unlock(&pag->pag_ici_lock);
fe4fa4b8 DC	189	continue;
	190	}
	191
683a8970	192	/* nothing to sync during shutdown */
fe4fa4b8	193	if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
683a8970	194	read_unlock(&pag->pag_ici_lock);
fe4fa4b8 DC	195	return 0;
	196	}
	197
	198	/*
683a8970 DC	199	* The inode lock here actually coordinates with the almost
	200	* spurious inode lock in xfs_ireclaim() to prevent the vnode
	201	* we handle here without a reference from being freed while we
	202	* reference it. If we lock the inode while it's on the mount
	203	* list here, then the spurious inode lock in xfs_ireclaim()
	204	* after the inode is pulled from the mount list will sleep
	205	* until we release it here. This keeps the vnode from being
	206	* freed while we reference it.
fe4fa4b8 DC	207	*/
fe4fa4b8 DC	208	if (xfs_ilock_nowait(ip, lock_flags) == 0) {
fe4fa4b8	209	vp = vn_grab(vp);
683a8970 DC	210	read_unlock(&pag->pag_ici_lock);
683a8970 DC	211	if (!vp)
fe4fa4b8	212	continue;
fe4fa4b8 DC	213	xfs_ilock(ip, lock_flags);
	214
	215	ASSERT(vp == VFS_I(ip));
	216	ASSERT(ip->i_mount == mp);
	217
	218	vnode_refed = B_TRUE;
683a8970 DC	219	} else {
	220	/* safe to unlock here as we have a reference */
	221	read_unlock(&pag->pag_ici_lock);
fe4fa4b8	222	}
fe4fa4b8 DC	223	/*
	224	* If we have to flush data or wait for I/O completion
	225	* we need to drop the ilock that we currently hold.
	226	* If we need to drop the lock, insert a marker if we
	227	* have not already done so.
	228	*/
683a8970	229	if (flags & SYNC_CLOSE) {
fe4fa4b8	230	xfs_iunlock(ip, XFS_ILOCK_SHARED);
683a8970 DC	231	if (XFS_FORCED_SHUTDOWN(mp))
	232	xfs_tosspages(ip, 0, -1, FI_REMAPF);
	233	else
	234	error = xfs_flushinval_pages(ip, 0, -1,
	235	FI_REMAPF);
	236	/* wait for I/O on freeze */
fe4fa4b8 DC	237	if (flags & SYNC_IOWAIT)
	238	vn_iowait(ip);
	239
	240	xfs_ilock(ip, XFS_ILOCK_SHARED);
	241	}
	242
683a8970 DC	243	if ((flags & SYNC_DELWRI) && VN_DIRTY(vp)) {
	244	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	245	error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
	246	if (flags & SYNC_IOWAIT)
	247	vn_iowait(ip);
	248	xfs_ilock(ip, XFS_ILOCK_SHARED);
	249	}
fe4fa4b8	250
683a8970	251	if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
fe4fa4b8 DC	252	if (flags & SYNC_WAIT) {
fe4fa4b8 DC	253	xfs_iflock(ip);
683a8970 DC	254	if (!xfs_inode_clean(ip))
	255	error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
	256	else
	257	xfs_ifunlock(ip);
fe4fa4b8	258	} else if (xfs_iflock_nowait(ip)) {
683a8970 DC	259	if (!xfs_inode_clean(ip))
	260	error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
	261	else
	262	xfs_ifunlock(ip);
fe4fa4b8 DC	263	} else if (bypassed) {
	264	(*bypassed)++;
	265	}
	266	}
	267
683a8970	268	if (lock_flags)
fe4fa4b8	269	xfs_iunlock(ip, lock_flags);
fe4fa4b8 DC	270
fe4fa4b8 DC	271	if (vnode_refed) {
fe4fa4b8	272	IRELE(ip);
fe4fa4b8 DC	273	vnode_refed = B_FALSE;
	274	}
	275
683a8970	276	if (error)
fe4fa4b8	277	last_error = error;
fe4fa4b8 DC	278	/*
	279	* bail out if the filesystem is corrupted.
	280	*/
683a8970	281	if (error == EFSCORRUPTED)
fe4fa4b8	282	return XFS_ERROR(error);
fe4fa4b8	283
683a8970	284	} while (nr_found);
fe4fa4b8	285
683a8970 DC	286	return last_error;
683a8970 DC	287	}
fe4fa4b8	288
683a8970 DC	289	int
	290	xfs_sync_inodes(
	291	xfs_mount_t *mp,
	292	int flags,
	293	int *bypassed)
	294	{
	295	int error;
	296	int last_error;
	297	int i;
fe4fa4b8	298
683a8970 DC	299	if (bypassed)
	300	*bypassed = 0;
	301	if (mp->m_flags & XFS_MOUNT_RDONLY)
	302	return 0;
	303	error = 0;
	304	last_error = 0;
fe4fa4b8	305
683a8970 DC	306	for (i = 0; i < mp->m_sb.sb_agcount; i++) {
	307	if (!mp->m_perag[i].pag_ici_init)
	308	continue;
	309	error = xfs_sync_inodes_ag(mp, i, flags, bypassed);
	310	if (error)
	311	last_error = error;
	312	if (error == EFSCORRUPTED)
	313	break;
	314	}
fe4fa4b8 DC	315	return XFS_ERROR(last_error);
	316	}
	317
2af75df7 CH	318	STATIC int
	319	xfs_commit_dummy_trans(
	320	struct xfs_mount *mp,
	321	uint log_flags)
	322	{
	323	struct xfs_inode *ip = mp->m_rootip;
	324	struct xfs_trans *tp;
	325	int error;
	326
	327	/*
	328	* Put a dummy transaction in the log to tell recovery
	329	* that all others are OK.
	330	*/
	331	tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
	332	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
	333	if (error) {
	334	xfs_trans_cancel(tp, 0);
	335	return error;
	336	}
	337
	338	xfs_ilock(ip, XFS_ILOCK_EXCL);
	339
	340	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	341	xfs_trans_ihold(tp, ip);
	342	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	343	/* XXX(hch): ignoring the error here.. */
	344	error = xfs_trans_commit(tp, 0);
	345
	346	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	347
	348	xfs_log_force(mp, 0, log_flags);
	349	return 0;
	350	}
	351
	352	STATIC int
	353	xfs_sync_fsdata(
	354	struct xfs_mount *mp,
	355	int flags)
	356	{
	357	struct xfs_buf *bp;
	358	struct xfs_buf_log_item *bip;
	359	int error = 0;
	360
	361	/*
	362	* If this is xfssyncd() then only sync the superblock if we can
	363	* lock it without sleeping and it is not pinned.
	364	*/
	365	if (flags & SYNC_BDFLUSH) {
	366	ASSERT(!(flags & SYNC_WAIT));
	367
	368	bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
	369	if (!bp)
	370	goto out;
	371
	372	bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
	373	if (!bip \|\| !xfs_buf_item_dirty(bip) \|\| XFS_BUF_ISPINNED(bp))
	374	goto out_brelse;
	375	} else {
	376	bp = xfs_getsb(mp, 0);
	377
	378	/*
	379	* If the buffer is pinned then push on the log so we won't
	380	* get stuck waiting in the write for someone, maybe
	381	* ourselves, to flush the log.
382	*
383	* Even though we just pushed the log above, we did not have
384	* the superblock buffer locked at that point so it can
385	* become pinned in between there and here.
386	*/
387	if (XFS_BUF_ISPINNED(bp))
388	xfs_log_force(mp, 0, XFS_LOG_FORCE);
389	}
390
391
392	if (flags & SYNC_WAIT)
393	XFS_BUF_UNASYNC(bp);
394	else
395	XFS_BUF_ASYNC(bp);
396
397	return xfs_bwrite(mp, bp);
398
399	out_brelse:
400	xfs_buf_relse(bp);
401	out:
402	return error;
403	}
404
fe4fa4b8 DC	405	/*
	406	* xfs sync routine for internal use
	407	*
	408	* This routine supports all of the flags defined for the generic vfs_sync
	409	* interface as explained above under xfs_sync.
	410	*
	411	*/
	412	int
	413	xfs_syncsub(
	414	xfs_mount_t *mp,
	415	int flags,
	416	int *bypassed)
	417	{
	418	int error = 0;
	419	int last_error = 0;
	420	uint log_flags = XFS_LOG_FORCE;
fe4fa4b8 DC	421
	422	/*
	423	* Sync out the log. This ensures that the log is periodically
	424	* flushed even if there is not enough activity to fill it up.
	425	*/
	426	if (flags & SYNC_WAIT)
	427	log_flags \|= XFS_LOG_SYNC;
	428
	429	xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
	430
	431	if (flags & (SYNC_ATTR\|SYNC_DELWRI)) {
	432	if (flags & SYNC_BDFLUSH)
75c68f41	433	xfs_finish_reclaim_all(mp, 1, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
fe4fa4b8 DC	434	else
	435	error = xfs_sync_inodes(mp, flags, bypassed);
	436	}
	437
	438	/*
	439	* Flushing out dirty data above probably generated more
	440	* log activity, so if this isn't vfs_sync() then flush
	441	* the log again.
	442	*/
2af75df7 CH	443	if (flags & SYNC_DELWRI)
2af75df7 CH	444	xfs_log_force(mp, 0, log_flags);
fe4fa4b8 DC	445
fe4fa4b8 DC	446	if (flags & SYNC_FSDATA) {
2af75df7 CH	447	error = xfs_sync_fsdata(mp, flags);
2af75df7 CH	448	if (error)
fe4fa4b8	449	last_error = error;
fe4fa4b8 DC	450	}
	451
	452	/*
	453	* Now check to see if the log needs a "dummy" transaction.
	454	*/
	455	if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
2af75df7 CH	456	error = xfs_commit_dummy_trans(mp, log_flags);
2af75df7 CH	457	if (error)
fe4fa4b8	458	return error;
fe4fa4b8 DC	459	}
	460
	461	/*
	462	* When shutting down, we need to insure that the AIL is pushed
	463	* to disk or the filesystem can appear corrupt from the PROM.
	464	*/
	465	if ((flags & (SYNC_CLOSE\|SYNC_WAIT)) == (SYNC_CLOSE\|SYNC_WAIT)) {
	466	XFS_bflush(mp->m_ddev_targp);
	467	if (mp->m_rtdev_targp) {
	468	XFS_bflush(mp->m_rtdev_targp);
	469	}
	470	}
	471
	472	return XFS_ERROR(last_error);
	473	}
a167b17e DC	474
	475	/*
	476	* Enqueue a work item to be picked up by the vfs xfssyncd thread.
	477	* Doing this has two advantages:
	478	* - It saves on stack space, which is tight in certain situations
	479	* - It can be used (with care) as a mechanism to avoid deadlocks.
	480	* Flushing while allocating in a full filesystem requires both.
	481	*/
	482	STATIC void
	483	xfs_syncd_queue_work(
	484	struct xfs_mount *mp,
	485	void *data,
	486	void (syncer)(struct xfs_mount , void *))
	487	{
	488	struct bhv_vfs_sync_work *work;
	489
	490	work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
	491	INIT_LIST_HEAD(&work->w_list);
	492	work->w_syncer = syncer;
	493	work->w_data = data;
	494	work->w_mount = mp;
	495	spin_lock(&mp->m_sync_lock);
	496	list_add_tail(&work->w_list, &mp->m_sync_list);
	497	spin_unlock(&mp->m_sync_lock);
	498	wake_up_process(mp->m_sync_task);
	499	}
	500
	501	/*
	502	* Flush delayed allocate data, attempting to free up reserved space
	503	* from existing allocations. At this point a new allocation attempt
	504	* has failed with ENOSPC and we are in the process of scratching our
	505	* heads, looking about for more room...
	506	*/
	507	STATIC void
	508	xfs_flush_inode_work(
	509	struct xfs_mount *mp,
	510	void *arg)
	511	{
	512	struct inode *inode = arg;
	513	filemap_flush(inode->i_mapping);
	514	iput(inode);
	515	}
	516
	517	void
	518	xfs_flush_inode(
	519	xfs_inode_t *ip)
	520	{
	521	struct inode *inode = VFS_I(ip);
	522
	523	igrab(inode);
	524	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
	525	delay(msecs_to_jiffies(500));
	526	}
	527
	528	/*
	529	* This is the "bigger hammer" version of xfs_flush_inode_work...
	530	* (IOW, "If at first you don't succeed, use a Bigger Hammer").
	531	*/
	532	STATIC void
	533	xfs_flush_device_work(
	534	struct xfs_mount *mp,
	535	void *arg)
	536	{
	537	struct inode *inode = arg;
538	sync_blockdev(mp->m_super->s_bdev);
539	iput(inode);
540	}
541
542	void
543	xfs_flush_device(
544	xfs_inode_t *ip)
545	{
546	struct inode *inode = VFS_I(ip);
547
548	igrab(inode);
549	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
550	delay(msecs_to_jiffies(500));
551	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE\|XFS_LOG_SYNC);
552	}
553
554	STATIC void
555	xfs_sync_worker(
556	struct xfs_mount *mp,
557	void *unused)
558	{
559	int error;
560
561	if (!(mp->m_flags & XFS_MOUNT_RDONLY))
562	error = xfs_sync(mp, SYNC_FSDATA \| SYNC_BDFLUSH \| SYNC_ATTR);
563	mp->m_sync_seq++;
564	wake_up(&mp->m_wait_single_sync_task);
565	}
566
567	STATIC int
568	xfssyncd(
569	void *arg)
570	{
571	struct xfs_mount *mp = arg;
572	long timeleft;
573	bhv_vfs_sync_work_t work, n;
574	LIST_HEAD (tmp);
575
576	set_freezable();
577	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
578	for (;;) {
579	timeleft = schedule_timeout_interruptible(timeleft);
580	/* swsusp */
581	try_to_freeze();
582	if (kthread_should_stop() && list_empty(&mp->m_sync_list))
583	break;
584
585	spin_lock(&mp->m_sync_lock);
586	/*
587	* We can get woken by laptop mode, to do a sync -
588	* that's the (only!) case where the list would be
589	* empty with time remaining.
590	*/
591	if (!timeleft \|\| list_empty(&mp->m_sync_list)) {
592	if (!timeleft)
593	timeleft = xfs_syncd_centisecs *
594	msecs_to_jiffies(10);
595	INIT_LIST_HEAD(&mp->m_sync_work.w_list);
596	list_add_tail(&mp->m_sync_work.w_list,
597	&mp->m_sync_list);
598	}
599	list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
600	list_move(&work->w_list, &tmp);
601	spin_unlock(&mp->m_sync_lock);
602
603	list_for_each_entry_safe(work, n, &tmp, w_list) {
604	(*work->w_syncer)(mp, work->w_data);
605	list_del(&work->w_list);
606	if (work == &mp->m_sync_work)
607	continue;
608	kmem_free(work);
609	}
610	}
611
612	return 0;
613	}
614
615	int
616	xfs_syncd_init(
617	struct xfs_mount *mp)
618	{
619	mp->m_sync_work.w_syncer = xfs_sync_worker;
620	mp->m_sync_work.w_mount = mp;
621	mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
622	if (IS_ERR(mp->m_sync_task))
623	return -PTR_ERR(mp->m_sync_task);
624	return 0;
625	}
626
627	void
628	xfs_syncd_stop(
629	struct xfs_mount *mp)
630	{
631	kthread_stop(mp->m_sync_task);
632	}
633