xfs: kill xfs_vnodeops.[ch]

[deliverable/linux.git] / fs / xfs / xfs_inode.c
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c

index 1029f5670207ede4e88f0bbb2cdcc0f7d5b909a8..73a7a1d84243cb2dc588975b52709105788c3774 100644 (file)
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -23,14 +23,19 @@
  #include "xfs_log.h"
  #include "xfs_inum.h"
  #include "xfs_trans.h"
+#include "xfs_trans_space.h"
  #include "xfs_trans_priv.h"
  #include "xfs_sb.h"
  #include "xfs_ag.h"
  #include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2.h"
  #include "xfs_bmap_btree.h"
  #include "xfs_alloc_btree.h"
  #include "xfs_ialloc_btree.h"
  #include "xfs_attr_sf.h"
+#include "xfs_attr.h"
  #include "xfs_dinode.h"
  #include "xfs_inode.h"
  #include "xfs_buf_item.h"
@@ -44,10 +49,10 @@
  #include "xfs_utils.h"
  #include "xfs_quota.h"
  #include "xfs_filestream.h"
-#include "xfs_vnodeops.h"
  #include "xfs_cksum.h"
  #include "xfs_trace.h"
  #include "xfs_icache.h"
+#include "xfs_symlink.h"
  
  kmem_zone_t *xfs_inode_zone;
  
@@ -307,6 +312,188 @@ xfs_isilocked(
  }
  #endif
  
+#ifdef DEBUG
+int xfs_locked_n;              /* xfs_lock_inodes() calls with no failed trylock */
+int xfs_small_retries;         /* ... calls with 1-4 failed trylocks */
+int xfs_middle_retries;        /* ... calls with 5-99 failed trylocks */
+int xfs_lots_retries;          /* ... calls with 100 or more failed trylocks */
+int xfs_lock_delays;           /* delay(1) back-offs in xfs_lock_inodes() */
+#endif
+
+/*
+ * OR (subclass + XFS_LOCK_INUMORDER) into lock_mode at the iolock and/or
+ * ilock shift so xfs_lock_inodes() acquires each lock with a different value.
+ */
+static inline int
+xfs_lock_inumorder(int lock_mode, int subclass)
+{
+       if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+               lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
+       if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
+               lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
+
+       return lock_mode;       /* unchanged if neither lock type was requested */
+}
+
+/*
+ * Lock the "inodes" inodes in ips[] using lock_mode. The caller must pass
+ * them in i_ino order; adjacent duplicate pointers are locked only once.
+ *
+ * We need to avoid the deadlock where an inode that we have locked is in
+ * the AIL and we start waiting for another inode that is locked by a thread
+ * in a long running transaction (such as truncate). That transaction might
+ * need to wait for the inode we just locked in order to push the tail and
+ * free space in the log. So once a held inode is in the AIL, later locks
+ * are only tried; on failure everything is dropped and we start over.
+ */
+void
+xfs_lock_inodes(
+       xfs_inode_t     **ips,          /* inodes to lock, in i_ino order */
+       int             inodes,         /* number of entries in ips[] */
+       uint            lock_mode)      /* lock flags applied to every inode */
+{
+       int             attempts = 0, i, j, try_lock;
+       xfs_log_item_t  *lp;
+
+       ASSERT(ips && (inodes >= 2)); /* we need at least two */
+
+       try_lock = 0;
+       i = 0;
+
+again:
+       for (; i < inodes; i++) {
+               ASSERT(ips[i]);
+
+               if (i && (ips[i] == ips[i-1]))  /* Already locked */
+                       continue;
+
+               /*
+                * If try_lock is not set yet, check the inodes we already
+                * hold (ips[0..i-1]) for one that is in the AIL.
+                * If any is, set try_lock to be used later.
+                */
+
+               if (!try_lock) {
+                       for (j = (i - 1); j >= 0 && !try_lock; j--) {
+                               lp = (xfs_log_item_t *)ips[j]->i_itemp;
+                               if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+                                       try_lock++;
+                               }
+                       }
+               }
+
+               /*
+                * If any of the inodes we have already locked is in the AIL,
+                * we must TRY to get the second and subsequent locks. If
+                * we can't get any, we must release all we have
+                * and try again.
+                */
+
+               if (try_lock) {
+                       /* try_lock must be 0 if i is 0. */
+                       /*
+                        * try_lock means we have an inode locked
+                        * that is in the AIL.
+                        */
+                       ASSERT(i != 0);
+                       if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
+                               attempts++;
+
+                               /*
+                                * Unlock all previous guys and try again.
+                                * xfs_iunlock will try to push the tail
+                                * if the inode is in the AIL.
+                                */
+
+                               for(j = i - 1; j >= 0; j--) {
+
+                                       /*
+                                        * Skip an inode we have already
+                                        * unlocked in this pass: it is
+                                        * not the first one going back,
+                                        * and the inode ptr is the same.
+                                        */
+                                       if ((j != (i - 1)) && ips[j] ==
+                                                               ips[j+1])
+                                               continue;
+
+                                       xfs_iunlock(ips[j], lock_mode);
+                               }
+
+                               if ((attempts % 5) == 0) {
+                                       delay(1); /* Don't just spin the CPU */
+#ifdef DEBUG
+                                       xfs_lock_delays++;
+#endif
+                               }
+                               i = 0;          /* restart from the first inode */
+                               try_lock = 0;   /* nothing is held any more */
+                               goto again;
+                       }
+               } else {
+                       xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
+               }
+       }
+
+#ifdef DEBUG
+       if (attempts) {
+               if (attempts < 5) xfs_small_retries++;
+               else if (attempts < 100) xfs_middle_retries++;
+               else xfs_lots_retries++;
+       } else {
+               xfs_locked_n++;
+       }
+#endif
+}
+
+/*
+ * Lock two different inodes, lowest i_ino first. lock_mode may select the
+ * iolock or the ilock, but not both at once: if we lock both at once,
+ * lockdep will report false positives saying we have violated locking
+ * orders.
+ */
+void
+xfs_lock_two_inodes(
+       xfs_inode_t             *ip0,
+       xfs_inode_t             *ip1,
+       uint                    lock_mode)
+{
+       xfs_inode_t             *temp;
+       int                     attempts = 0;   /* failed trylocks of ip1 */
+       xfs_log_item_t          *lp;
+
+       if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+               ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
+       ASSERT(ip0->i_ino != ip1->i_ino);
+
+       if (ip0->i_ino > ip1->i_ino) {  /* make ip0 the lower inode number */
+               temp = ip0;
+               ip0 = ip1;
+               ip1 = temp;
+       }
+
+ again:
+       xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
+
+       /*
+        * If the first lock we have locked is in the AIL, we must TRY to get
+        * the second lock. If we can't get it, we must release the first one
+        * and try again.
+        */
+       lp = (xfs_log_item_t *)ip0->i_itemp;
+       if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+               if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
+                       xfs_iunlock(ip0, lock_mode);
+                       if ((++attempts % 5) == 0)      /* every 5th failure */
+                               delay(1); /* Don't just spin the CPU */
+                       goto again;
+               }
+       } else {
+               xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
+       }
+}
+
+
  void
  __xfs_iflock(
         struct xfs_inode        *ip)
@@ -381,6 +568,49 @@ xfs_dic2xflags(
                                 (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
  }
  
+/*
+ * Looks up an inode from "name". If ci_name is not NULL, then a CI match
+ * is allowed, otherwise it has to be an exact match. If a CI match is found,
+ * ci_name->name will point to the actual name (caller must free) or
+ * will be set to NULL if an exact match is found.
+ */
+int                                    /* 0 on success, else an error */
+xfs_lookup(
+       xfs_inode_t             *dp,            /* directory to search */
+       struct xfs_name         *name,          /* name to look up */
+       xfs_inode_t             **ipp,          /* out: inode that was found */
+       struct xfs_name         *ci_name)       /* NULL, or out: CI match */
+{
+       xfs_ino_t               inum;
+       int                     error;
+       uint                    lock_mode;
+
+       trace_xfs_lookup(dp, name);
+
+       if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+               return XFS_ERROR(EIO);  /* *ipp is not touched on this path */
+
+       lock_mode = xfs_ilock_map_shared(dp);
+       error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
+       xfs_iunlock_map_shared(dp, lock_mode);
+
+       if (error)
+               goto out;
+
+       error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
+       if (error)
+               goto out_free_name;
+
+       return 0;
+
+out_free_name:
+       if (ci_name)
+               kmem_free(ci_name->name);
+out:
+       *ipp = NULL;            /* the directory lookup or xfs_iget() failed */
+       return error;
+}
+
  /*
   * Allocate an inode on disk and return a copy of its in-core version.
   * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
@@ -632,6 +862,308 @@ xfs_ialloc(
         return 0;
  }
  
+int                                    /* 0 on success, else an error */
+xfs_create(
+       xfs_inode_t             *dp,    /* directory to create the entry in */
+       struct xfs_name         *name,  /* name of the new entry */
+       umode_t                 mode,   /* S_ISDIR(mode) selects mkdir */
+       xfs_dev_t               rdev,   /* forced to 0 for directories */
+       xfs_inode_t             **ipp)  /* out: new inode, on success only */
+{
+       int                     is_dir = S_ISDIR(mode);
+       struct xfs_mount        *mp = dp->i_mount;
+       struct xfs_inode        *ip = NULL;
+       struct xfs_trans        *tp = NULL;
+       int                     error;
+       xfs_bmap_free_t         free_list;
+       xfs_fsblock_t           first_block;
+       bool                    unlock_dp_on_error = false;
+       uint                    cancel_flags;
+       int                     committed;
+       prid_t                  prid;
+       struct xfs_dquot        *udqp = NULL;
+       struct xfs_dquot        *gdqp = NULL;
+       struct xfs_dquot        *pdqp = NULL;
+       uint                    resblks;
+       uint                    log_res;
+       uint                    log_count;
+
+       trace_xfs_create(dp, name);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
+       if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
+               prid = xfs_get_projid(dp);
+       else
+               prid = XFS_PROJID_DEFAULT;
+
+       /*
+        * Make sure that we have allocated dquot(s) on disk.
+        */
+       error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
+                                       XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+                                       &udqp, &gdqp, &pdqp);
+       if (error)
+               return error;
+
+       if (is_dir) {
+               rdev = 0;
+               resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
+               log_res = XFS_MKDIR_LOG_RES(mp);
+               log_count = XFS_MKDIR_LOG_COUNT;
+               tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
+       } else {
+               resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+               log_res = XFS_CREATE_LOG_RES(mp);
+               log_count = XFS_CREATE_LOG_COUNT;
+               tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
+       }
+
+       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+
+       /*
+        * Initially assume that the file does not exist and
+        * reserve the resources for that case.  If that is not
+        * the case we'll drop the one we have and get a more
+        * appropriate transaction later.
+        */
+       error = xfs_trans_reserve(tp, resblks, log_res, 0,
+                       XFS_TRANS_PERM_LOG_RES, log_count);
+       if (error == ENOSPC) {
+               /* flush outstanding delalloc blocks and retry */
+               xfs_flush_inodes(mp);
+               error = xfs_trans_reserve(tp, resblks, log_res, 0,
+                               XFS_TRANS_PERM_LOG_RES, log_count);
+       }
+       if (error == ENOSPC) {
+               /* No space at all so try a "no-allocation" reservation */
+               resblks = 0;
+               error = xfs_trans_reserve(tp, 0, log_res, 0,
+                               XFS_TRANS_PERM_LOG_RES, log_count);
+       }
+       if (error) {
+               cancel_flags = 0;       /* reserve failed: cancel with no flags */
+               goto out_trans_cancel;
+       }
+
+       xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+       unlock_dp_on_error = true;
+
+       xfs_bmap_init(&free_list, &first_block);
+
+       /*
+        * Reserve disk quota and the inode.
+        */
+       error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
+                                               pdqp, resblks, 1, 0);
+       if (error)
+               goto out_trans_cancel;
+
+       error = xfs_dir_canenter(tp, dp, name, resblks);
+       if (error)
+               goto out_trans_cancel;
+
+       /*
+        * A newly created regular or special file just has one directory
+        * entry pointing to it, but a directory also has the "." entry
+        * pointing to itself.
+        */
+       error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
+                              prid, resblks > 0, &ip, &committed);
+       if (error) {
+               if (error == ENOSPC)
+                       goto out_trans_cancel;  /* cancel without XFS_TRANS_ABORT */
+               goto out_trans_abort;
+       }
+
+       /*
+        * Now we join the directory inode to the transaction.  We do not do it
+        * earlier because xfs_dir_ialloc might commit the previous transaction
+        * (and release all the locks).  An error from here on will result in
+        * the transaction cancel unlocking dp so don't do it explicitly in the
+        * error path.
+        */
+       xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+       unlock_dp_on_error = false;
+
+       error = xfs_dir_createname(tp, dp, name, ip->i_ino,
+                                       &first_block, &free_list, resblks ?
+                                       resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+       if (error) {
+               ASSERT(error != ENOSPC);
+               goto out_trans_abort;
+       }
+       xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+       xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+       if (is_dir) {
+               error = xfs_dir_init(tp, ip, dp);
+               if (error)
+                       goto out_bmap_cancel;
+
+               error = xfs_bumplink(tp, dp);
+               if (error)
+                       goto out_bmap_cancel;
+       }
+
+       /*
+        * If this is a synchronous mount, make sure that the
+        * create transaction goes to disk before returning to
+        * the user.
+        */
+       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+               xfs_trans_set_sync(tp);
+
+       /*
+        * Attach the dquot(s) to the inodes and modify them incore.
+        * These ids of the inode couldn't have changed since the new
+        * inode has been locked ever since it was created.
+        */
+       xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+
+       error = xfs_bmap_finish(&tp, &free_list, &committed);
+       if (error)
+               goto out_bmap_cancel;
+
+       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       if (error)
+               goto out_release_inode; /* commit failed: skip the cancel */
+
+       xfs_qm_dqrele(udqp);
+       xfs_qm_dqrele(gdqp);
+       xfs_qm_dqrele(pdqp);
+
+       *ipp = ip;
+       return 0;
+
+ out_bmap_cancel:
+       xfs_bmap_cancel(&free_list);
+ out_trans_abort:
+       cancel_flags |= XFS_TRANS_ABORT;
+ out_trans_cancel:
+       xfs_trans_cancel(tp, cancel_flags);
+ out_release_inode:
+       /*
+        * Wait until after the current transaction is aborted to
+        * release the inode.  This prevents recursive transactions
+        * and deadlocks from xfs_inactive.
+        */
+       if (ip)
+               IRELE(ip);
+
+       xfs_qm_dqrele(udqp);
+       xfs_qm_dqrele(gdqp);
+       xfs_qm_dqrele(pdqp);
+
+       if (unlock_dp_on_error) /* dp was locked but never joined to tp */
+               xfs_iunlock(dp, XFS_ILOCK_EXCL);
+       return error;
+}
+
+int                                    /* 0 on success, else an error */
+xfs_link(
+       xfs_inode_t             *tdp,   /* directory that gets the new entry */
+       xfs_inode_t             *sip,   /* inode to link; not a directory */
+       struct xfs_name         *target_name)   /* name of the new entry */
+{
+       xfs_mount_t             *mp = tdp->i_mount;
+       xfs_trans_t             *tp;
+       int                     error;
+       xfs_bmap_free_t         free_list;
+       xfs_fsblock_t           first_block;
+       int                     cancel_flags;
+       int                     committed;
+       int                     resblks;
+
+       trace_xfs_link(tdp, target_name);
+
+       ASSERT(!S_ISDIR(sip->i_d.di_mode));
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
+       error = xfs_qm_dqattach(sip, 0);
+       if (error)
+               goto std_return;
+
+       error = xfs_qm_dqattach(tdp, 0);
+       if (error)
+               goto std_return;
+
+       tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
+       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+       resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
+       error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
+                       XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
+       if (error == ENOSPC) {
+               resblks = 0;    /* retry with no block reservation */
+               error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
+                               XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
+       }
+       if (error) {
+               cancel_flags = 0;       /* reserve failed: cancel with no flags */
+               goto error_return;
+       }
+
+       xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
+
+       xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+
+       /*
+        * If we are using project inheritance, we only allow hard link
+        * creation in our tree when the project IDs are the same; else
+        * the tree quota mechanism could be circumvented.
+        */
+       if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
+                    (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
+               error = XFS_ERROR(EXDEV);
+               goto error_return;
+       }
+
+       error = xfs_dir_canenter(tp, tdp, target_name, resblks);
+       if (error)
+               goto error_return;
+
+       xfs_bmap_init(&free_list, &first_block);
+
+       error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
+                                       &first_block, &free_list, resblks);
+       if (error)
+               goto abort_return;
+       xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+       xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
+
+       error = xfs_bumplink(tp, sip);
+       if (error)
+               goto abort_return;
+
+       /*
+        * If this is a synchronous mount, make sure that the
+        * link transaction goes to disk before returning to
+        * the user.
+        */
+       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+               xfs_trans_set_sync(tp);
+       }
+
+       error = xfs_bmap_finish (&tp, &free_list, &committed);
+       if (error) {
+               xfs_bmap_cancel(&free_list);
+               goto abort_return;
+       }
+
+       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+ abort_return:
+       cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+       xfs_trans_cancel(tp, cancel_flags);
+ std_return:
+       return error;
+}
+
  /*
   * Free up the underlying blocks past new_size.  The new size must be smaller
   * than the current size.  This routine can be used both for the attribute and
@@ -771,6 +1303,276 @@ out_bmap_cancel:
         goto out;
  }
  
+int                            /* 0, or a flush/xfs_free_eofblocks() error */
+xfs_release(
+       xfs_inode_t     *ip)    /* inode to process; only regular files act */
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       int             error;
+
+       if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
+               return 0;
+
+       /* If this is a read-only mount, don't do this (would generate I/O) */
+       if (mp->m_flags & XFS_MOUNT_RDONLY)
+               return 0;
+
+       if (!XFS_FORCED_SHUTDOWN(mp)) {
+               int truncated;
+
+               /*
+                * If we are using filestreams, and we have an unlinked
+                * file that we are processing the last close on, then nothing
+                * will be able to reopen and write to this file. Purge this
+                * inode from the filestreams cache so that it doesn't delay
+                * teardown of the inode.
+                */
+               if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
+                       xfs_filestream_deassociate(ip);
+
+               /*
+                * If we previously truncated this file and removed old data
+                * in the process, we want to initiate "early" writeout on
+                * the last close.  This is an attempt to combat the notorious
+                * NULL files problem which is particularly noticeable from a
+                * truncate down, buffered (re-)write (delalloc), followed by
+                * a crash.  What we are effectively doing here is
+                * significantly reducing the time window where we'd otherwise
+                * be exposed to that problem.
+                */
+               truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
+               if (truncated) {
+                       xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
+                       if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
+                               error = -filemap_flush(VFS_I(ip)->i_mapping);
+                               if (error)
+                                       return error;
+                       }
+               }
+       }
+
+       if (ip->i_d.di_nlink == 0)      /* unlinked: leave post-EOF blocks */
+               return 0;
+
+       if (xfs_can_free_eofblocks(ip, false)) {
+
+               /*
+                * If we can't get the iolock just skip truncating the blocks
+                * past EOF because we could deadlock with the mmap_sem
+                * otherwise.  We'll get another chance to drop them once the
+                * last reference to the inode is dropped, so we'll never leak
+                * blocks permanently.
+                *
+                * Further, check if the inode is being opened, written and
+                * closed frequently and we have delayed allocation blocks
+                * outstanding (e.g. streaming writes from the NFS server),
+                * truncating the blocks past EOF will cause fragmentation to
+                * occur.
+                *
+                * In this case don't do the truncation, either, but we have to
+                * be careful how we detect this case. Blocks beyond EOF show
+                * up as i_delayed_blks even when the inode is clean, so we
+                * need to truncate them away first before checking for a dirty
+                * release. Hence on the first dirty close we will still remove
+                * the speculative allocation, but after that we will leave it
+                * in place.
+                */
+               if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
+                       return 0;
+
+               error = xfs_free_eofblocks(mp, ip, true);
+               if (error && error != EAGAIN)   /* EAGAIN is not reported */
+                       return error;
+
+               /* delalloc blocks after truncation means it really is dirty */
+               if (ip->i_delayed_blks)
+                       xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
+       }
+       return 0;
+}
+
+/*
+ * xfs_inactive
+ *
+ * This is called when the vnode reference count for the vnode goes to
+ * zero.  A file that is still linked only has its post-EOF blocks freed.
+ * An unlinked inode has its data and attribute fork removed and is freed.
+ * Always returns VN_INACTIVE_CACHE; errors are not passed to the caller.
+ */
+int
+xfs_inactive(
+       xfs_inode_t     *ip)
+{
+       xfs_bmap_free_t free_list;
+       xfs_fsblock_t   first_block;
+       int             committed;
+       xfs_trans_t     *tp;
+       xfs_mount_t     *mp;
+       int             error;
+       int             truncate = 0;   /* set if the data fork needs truncating */
+
+       /*
+        * If the inode is already free, then there can be nothing
+        * to clean up here.
+        */
+       if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
+               ASSERT(ip->i_df.if_real_bytes == 0);
+               ASSERT(ip->i_df.if_broot_bytes == 0);
+               return VN_INACTIVE_CACHE;
+       }
+
+       mp = ip->i_mount;
+
+       error = 0;
+
+       /* If this is a read-only mount, don't do this (would generate I/O) */
+       if (mp->m_flags & XFS_MOUNT_RDONLY)
+               goto out;
+
+       if (ip->i_d.di_nlink != 0) {
+               /*
+                * force is true because we are evicting an inode from the
+                * cache. Post-eof blocks must be freed, lest we end up with
+                * broken free space accounting.
+                */
+               if (xfs_can_free_eofblocks(ip, true)) {
+                       error = xfs_free_eofblocks(mp, ip, false);
+                       if (error)
+                               return VN_INACTIVE_CACHE;
+               }
+               goto out;
+       }
+
+       if (S_ISREG(ip->i_d.di_mode) &&
+           (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
+            ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
+               truncate = 1;
+
+       error = xfs_qm_dqattach(ip, 0);
+       if (error)
+               return VN_INACTIVE_CACHE;
+
+       tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+       error = xfs_trans_reserve(tp, 0,
+                       (truncate || S_ISLNK(ip->i_d.di_mode)) ?
+                               XFS_ITRUNCATE_LOG_RES(mp) :
+                               XFS_IFREE_LOG_RES(mp),
+                       0,
+                       XFS_TRANS_PERM_LOG_RES,
+                       XFS_ITRUNCATE_LOG_COUNT);
+       if (error) {
+               ASSERT(XFS_FORCED_SHUTDOWN(mp));
+               xfs_trans_cancel(tp, 0);
+               return VN_INACTIVE_CACHE;
+       }
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, ip, 0);
+
+       if (S_ISLNK(ip->i_d.di_mode)) {
+               error = xfs_inactive_symlink(ip, &tp);
+               if (error)
+                       goto out_cancel;
+       } else if (truncate) {
+               ip->i_d.di_size = 0;
+               xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+               error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
+               if (error)
+                       goto out_cancel;
+
+               ASSERT(ip->i_d.di_nextents == 0);
+       }
+
+       /*
+        * If there are attributes associated with the file then blow them away
+        * now.  The code calls a routine that recursively deconstructs the
+        * attribute fork.  We need to just commit the current transaction
+        * because we can't use it for xfs_attr_inactive().
+        */
+       if (ip->i_d.di_anextents > 0) {
+               ASSERT(ip->i_d.di_forkoff != 0);
+
+               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               if (error)
+                       goto out_unlock;
+
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+               error = xfs_attr_inactive(ip);
+               if (error)
+                       goto out;       /* ILOCK was already dropped above */
+
+               tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+               error = xfs_trans_reserve(tp, 0,
+                                         XFS_IFREE_LOG_RES(mp),
+                                         0, XFS_TRANS_PERM_LOG_RES,
+                                         XFS_INACTIVE_LOG_COUNT);
+               if (error) {
+                       xfs_trans_cancel(tp, 0);
+                       goto out;       /* ILOCK has not been retaken yet */
+               }
+
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               xfs_trans_ijoin(tp, ip, 0);
+       }
+
+       if (ip->i_afp)
+               xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+       ASSERT(ip->i_d.di_anextents == 0);
+
+       /*
+        * Free the inode.
+        */
+       xfs_bmap_init(&free_list, &first_block);
+       error = xfs_ifree(tp, ip, &free_list);
+       if (error) {
+               /*
+                * If we fail to free the inode, shut down.  The cancel
+                * might do that, we need to make sure.  Otherwise the
+                * inode might be lost for a long time or forever.
+                */
+               if (!XFS_FORCED_SHUTDOWN(mp)) {
+                       xfs_notice(mp, "%s: xfs_ifree returned error %d",
+                               __func__, error);
+                       xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+               }
+               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+       } else {
+               /*
+                * Credit the quota account(s). The inode is gone.
+                */
+               xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
+
+               /*
+                * Just ignore errors at this point.  There is nothing we can
+                * do except to try to keep going. Make sure it's not a silent
+                * error.
+                */
+               error = xfs_bmap_finish(&tp,  &free_list, &committed);
+               if (error)
+                       xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
+                               __func__, error);
+               error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+               if (error)
+                       xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
+                               __func__, error);
+       }
+
+       /*
+        * Release the dquots held by inode, if any.
+        */
+       xfs_qm_dqdetach(ip);
+out_unlock:
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out:
+       return VN_INACTIVE_CACHE;
+out_cancel:
+       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       goto out_unlock;
+}
+
  /*
   * This is called when the inode's link count goes to 0.
   * We place the on-disk inode on a list in the AGI.  It
@@ -1302,6 +2104,170 @@ xfs_iunpin_wait(
                 __xfs_iunpin_wait(ip);
  }
  
+int                                    /* 0 on success, else error code */
+xfs_remove(                            /* remove entry 'name' for ip from dp */
+       xfs_inode_t             *dp,    /* directory holding the entry */
+       struct xfs_name         *name,  /* name of the entry to remove */
+       xfs_inode_t             *ip)    /* inode the entry points at */
+{
+       xfs_mount_t             *mp = dp->i_mount;
+       xfs_trans_t             *tp = NULL;     /* allocated below */
+       int                     is_dir = S_ISDIR(ip->i_d.di_mode);
+       int                     error = 0;
+       xfs_bmap_free_t         free_list;
+       xfs_fsblock_t           first_block;
+       int                     cancel_flags;   /* for xfs_trans_cancel() */
+       int                     committed;      /* xfs_bmap_finish out-param, not read */
+       int                     link_zero;      /* ip's link count dropped to 0 */
+       uint                    resblks;        /* block reservation, 0 after ENOSPC */
+       uint                    log_count;      /* passed to xfs_trans_reserve() */
+
+       trace_xfs_remove(dp, name);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
+       error = xfs_qm_dqattach(dp, 0);
+       if (error)
+               goto std_return;        /* no transaction exists yet */
+
+       error = xfs_qm_dqattach(ip, 0);
+       if (error)
+               goto std_return;        /* no transaction exists yet */
+
+       if (is_dir) {
+               tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
+               log_count = XFS_DEFAULT_LOG_COUNT;
+       } else {
+               tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
+               log_count = XFS_REMOVE_LOG_COUNT;
+       }
+       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+
+       /*
+        * We try to get the real space reservation first,
+        * allowing for directory btree deletion(s) implying
+        * possible bmap insert(s).  If we can't get the space
+        * reservation then we use 0 instead, and avoid the bmap
+        * btree insert(s) in the directory code by, if the bmap
+        * insert tries to happen, instead trimming the LAST
+        * block from the directory.
+        *
+        * resblks is handed to xfs_dir_removename() further down, so
+        * the directory code sees whether the reservation was obtained.
+        */
+       resblks = XFS_REMOVE_SPACE_RES(mp);
+       error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
+                                 XFS_TRANS_PERM_LOG_RES, log_count);
+       if (error == ENOSPC) {
+               resblks = 0;            /* retry with no block reservation */
+               error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
+                                         XFS_TRANS_PERM_LOG_RES, log_count);
+       }
+       if (error) {
+               ASSERT(error != ENOSPC);
+               cancel_flags = 0;       /* reserve failed: cancel with no flags */
+               goto out_trans_cancel;
+       }
+
+       xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
+
+       xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+       /*
+        * If we're removing a directory perform some additional validation.
+        * The link-dropping code below accounts for exactly two links on a
+        * removable directory (its own "." and the entry in dp), so any
+        * other count, or any remaining entries, yields ENOTEMPTY.  These
+        * exits cancel with XFS_TRANS_RELEASE_LOG_RES only; XFS_TRANS_ABORT
+        * is added solely on the out_bmap_cancel path.
+        *
+        * NOTE(review): nothing in this function calls xfs_iunlock();
+        * presumably joining with XFS_ILOCK_EXCL above hands the unlock to
+        * transaction commit/cancel -- confirm.
+        */
+       if (is_dir) {
+               ASSERT(ip->i_d.di_nlink >= 2);
+               if (ip->i_d.di_nlink != 2) {
+                       error = XFS_ERROR(ENOTEMPTY);
+                       goto out_trans_cancel;
+               }
+               if (!xfs_dir_isempty(ip)) {
+                       error = XFS_ERROR(ENOTEMPTY);
+                       goto out_trans_cancel;
+               }
+       }
+
+       xfs_bmap_init(&free_list, &first_block);
+       error = xfs_dir_removename(tp, dp, name, ip->i_ino,
+                                       &first_block, &free_list, resblks);
+       if (error) {
+               ASSERT(error != ENOENT);
+               goto out_bmap_cancel;
+       }
+       xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+       if (is_dir) {
+               /*
+                * Drop the link from ip's "..".
+                */
+               error = xfs_droplink(tp, dp);
+               if (error)
+                       goto out_bmap_cancel;
+
+               /*
+                * Drop the "." link from ip to self.
+                */
+               error = xfs_droplink(tp, ip);
+               if (error)
+                       goto out_bmap_cancel;
+       } else {
+               /*
+                * When removing a non-directory we need to log the parent
+                * inode here.  For a directory this is done implicitly
+                * by the xfs_droplink call for the ".." entry.
+                */
+               xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+       }
+
+       /*
+        * Drop the link from dp to ip.
+        */
+       error = xfs_droplink(tp, ip);
+       if (error)
+               goto out_bmap_cancel;
+
+       /*
+        * Determine if this is the last link while
+        * we are in the transaction.  The result is only consulted
+        * after the commit, by the filestream check below.
+        */
+       link_zero = (ip->i_d.di_nlink == 0);
+
+       /*
+        * If this is a synchronous mount, make sure that the
+        * remove transaction goes to disk before returning to
+        * the user.
+        */
+       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+               xfs_trans_set_sync(tp);
+
+       error = xfs_bmap_finish(&tp, &free_list, &committed);
+       if (error)
+               goto out_bmap_cancel;
+
+       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       if (error)
+               goto std_return;        /* no cancel after a failed commit */
+
+       /*
+        * If we are using filestreams, kill the stream association.
+        * If the file is still open it may get a new one but that
+        * will get killed on last close in xfs_close() so we don't
+        * have to worry about that.
+        */
+       if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
+               xfs_filestream_deassociate(ip);
+
+       return 0;
+
+ out_bmap_cancel:                      /* failure after xfs_bmap_init() */
+       xfs_bmap_cancel(&free_list);
+       cancel_flags |= XFS_TRANS_ABORT;        /* then fall through */
+ out_trans_cancel:                     /* failure with tp allocated */
+       xfs_trans_cancel(tp, cancel_flags);
+ std_return:
+       return error;
+}
+
  STATIC int
  xfs_iflush_cluster(
         xfs_inode_t     *ip,
@@ -1746,39 +2712,3 @@ xfs_iflush_int(
  corrupt_out:
         return XFS_ERROR(EFSCORRUPTED);
  }
-
-/*
- * Test whether it is appropriate to check an inode for and free post EOF
- * blocks. The 'force' parameter determines whether we should also consider
- * regular files that are marked preallocated or append-only.
- */
-bool
-xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
-{
-       /* prealloc/delalloc exists only on regular files */
-       if (!S_ISREG(ip->i_d.di_mode))
-               return false;
-
-       /*
-        * Zero sized files with no cached pages and delalloc blocks will not
-        * have speculative prealloc/delalloc blocks to remove.
-        */
-       if (VFS_I(ip)->i_size == 0 &&
-           VN_CACHED(VFS_I(ip)) == 0 &&
-           ip->i_delayed_blks == 0)
-               return false;
-
-       /* If we haven't read in the extent list, then don't do it now. */
-       if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
-               return false;
-
-       /*
-        * Do not free real preallocated or append-only files unless the file
-        * has delalloc blocks and we are forced to remove them.
-        */
-       if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
-               if (!force || ip->i_delayed_blks == 0)
-                       return false;
-
-       return true;
-}