Merge branch 'xfs-dax-support' into for-next
author	Dave Chinner <david@fromorbit.com>
Thu, 4 Jun 2015 03:01:49 +0000 (13:01 +1000)
committer	Dave Chinner <david@fromorbit.com>
Thu, 4 Jun 2015 03:01:49 +0000 (13:01 +1000)
12 files changed:
fs/dax.c
fs/ext2/file.c
fs/ext4/file.c
fs/ext4/inode.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_aops.h
fs/xfs/xfs_bmap_util.c
fs/xfs/xfs_file.c
fs/xfs/xfs_iops.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_super.c
include/linux/fs.h

index 6f65f00e58ecdc695284de68b832f3822247f594..99b5fbc38992db1f88be1a1e48dad4fda584a0c1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -309,14 +309,21 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
  out:
        i_mmap_unlock_read(mapping);
 
-       if (bh->b_end_io)
-               bh->b_end_io(bh, 1);
-
        return error;
 }
 
-static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-                       get_block_t get_block)
+/**
+ * __dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
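+ * @complete_unwritten: The filesystem method used to convert unwritten
+ *     extents to written; also called when the mapping insert fails (with
+ *     uptodate == 0) so private resources on the mapping buffer can be
+ *     released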
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * fault handler for DAX files. __dax_fault() assumes the caller has done all
+ * the necessary locking for the page fault to proceed successfully.
+ */
+int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+                       get_block_t get_block, dax_iodone_t complete_unwritten)
 {
        struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
@@ -417,7 +424,19 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                page_cache_release(page);
        }
 
+       /*
+        * If we successfully insert the new mapping over an unwritten extent,
+        * we need to ensure we convert the unwritten extent. If there is an
+        * error inserting the mapping, the filesystem needs to leave it as
+        * unwritten to prevent exposure of the stale underlying data to
+        * userspace, but we still need to call the completion function so
+        * the private resources on the mapping buffer can be released. We
+        * indicate what the callback should do via the uptodate variable, same
+        * as for normal BH based IO completions.
+        */
        error = dax_insert_mapping(inode, &bh, vma, vmf);
+       if (buffer_unwritten(&bh))
+               complete_unwritten(&bh, !error);
 
  out:
        if (error == -ENOMEM)
@@ -434,6 +453,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
        }
        goto out;
 }
+EXPORT_SYMBOL(__dax_fault);
 
 /**
  * dax_fault - handle a page fault on a DAX file
@@ -445,7 +465,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
  * fault handler for DAX files.
  */
 int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-                       get_block_t get_block)
+             get_block_t get_block, dax_iodone_t complete_unwritten)
 {
        int result;
        struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -454,7 +474,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                sb_start_pagefault(sb);
                file_update_time(vma->vm_file);
        }
-       result = do_dax_fault(vma, vmf, get_block);
+       result = __dax_fault(vma, vmf, get_block, complete_unwritten);
        if (vmf->flags & FAULT_FLAG_WRITE)
                sb_end_pagefault(sb);
 
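To make the new calling convention concrete, here is a minimal sketch of a DAX
fault handler for a hypothetical filesystem (all foofs_* names, including the
foofs_get_block() and foofs_convert_unwritten() helpers, are illustrative and
not part of this series; the ext2 and ext4 hunks below show the real wiring):

static void foofs_end_unwritten(struct buffer_head *bh, int uptodate)
{
	/* uptodate == 0 means the zeroing/insert failed: leave it unwritten */
	if (uptodate)
		foofs_convert_unwritten(bh);	/* hypothetical helper */
}

static int foofs_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	/*
	 * For write faults, dax_fault() takes freeze protection and updates
	 * the file time itself; callers that need to do their own locking
	 * call __dax_fault() directly, as the XFS __dax_mkwrite() call below
	 * does.
	 */
	return dax_fault(vma, vmf, foofs_get_block, foofs_end_unwritten);
}
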
index 3a0a6c6406d000560c5c5971c733751e091c3382..3b57c9f83c9b9b6469b014a317b0e50c1b1f1cf4 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
 #ifdef CONFIG_FS_DAX
 static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-       return dax_fault(vma, vmf, ext2_get_block);
+       return dax_fault(vma, vmf, ext2_get_block, NULL);
 }
 
 static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-       return dax_mkwrite(vma, vmf, ext2_get_block);
+       return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
 }
 
 static const struct vm_operations_struct ext2_dax_vm_ops = {
index 0613c256c3441a208f51291dbc2691489cf4dd9c..f713cfcc43a293a45d3bd5b51bad1ef2fbc3916d 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -192,15 +192,27 @@ out:
 }
 
 #ifdef CONFIG_FS_DAX
+static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
+{
+       struct inode *inode = bh->b_assoc_map->host;
+       /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+       loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
+       int err;
+       if (!uptodate)
+               return;
+       WARN_ON(!buffer_unwritten(bh));
+       err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
+}
+
 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-       return dax_fault(vma, vmf, ext4_get_block);
+       return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
                                        /* Is this the right get_block? */
 }
 
 static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-       return dax_mkwrite(vma, vmf, ext4_get_block);
+       return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
 }
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
index 55b187c3bac1f1626cf0267734aa805b6f9a1263..7c38ed3494cb61c91a426723f75f3e7023d1a3f5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -656,18 +656,6 @@ has_zeroout:
        return retval;
 }
 
-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
-{
-       struct inode *inode = bh->b_assoc_map->host;
-       /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
-       loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
-       int err;
-       if (!uptodate)
-               return;
-       WARN_ON(!buffer_unwritten(bh));
-       err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
-}
-
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
@@ -705,10 +693,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 
                map_bh(bh, inode->i_sb, map.m_pblk);
                bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
-               if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
+               if (IS_DAX(inode) && buffer_unwritten(bh)) {
+                       /*
+                        * dgc: I suspect unwritten conversion on ext4+DAX is
+                        * fundamentally broken here when there are concurrent
+                        * read/write in progress on this inode.
+                        */
+                       WARN_ON_ONCE(io_end);
                        bh->b_assoc_map = inode->i_mapping;
                        bh->b_private = (void *)(unsigned long)iblock;
-                       bh->b_end_io = ext4_end_io_unwritten;
                }
                if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
                        set_buffer_defer_completion(bh);
index a56960dd16847bccd3fea619f28aedad0f744f31..e5e9fc23f230b4225f5d8d48221ed8ab9a56aeb0 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1349,7 +1349,7 @@ __xfs_get_blocks(
        sector_t                iblock,
        struct buffer_head      *bh_result,
        int                     create,
-       int                     direct)
+       bool                    direct)
 {
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
@@ -1414,6 +1414,7 @@ __xfs_get_blocks(
                        if (error)
                                return error;
                        new = 1;
+
                } else {
                        /*
                         * Delalloc reservations do not require a transaction,
@@ -1508,49 +1509,29 @@ xfs_get_blocks(
        struct buffer_head      *bh_result,
        int                     create)
 {
-       return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
+       return __xfs_get_blocks(inode, iblock, bh_result, create, false);
 }
 
-STATIC int
+int
 xfs_get_blocks_direct(
        struct inode            *inode,
        sector_t                iblock,
        struct buffer_head      *bh_result,
        int                     create)
 {
-       return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
+       return __xfs_get_blocks(inode, iblock, bh_result, create, true);
 }
 
-/*
- * Complete a direct I/O write request.
- *
- * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
- * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
- * wholly within the EOF and so there is nothing for us to do. Note that in this
- * case the completion can be called in interrupt context, whereas if we have an
- * ioend we will always be called in task context (i.e. from a workqueue).
- */
-STATIC void
-xfs_end_io_direct_write(
-       struct kiocb            *iocb,
+static void
+__xfs_end_io_direct_write(
+       struct inode            *inode,
+       struct xfs_ioend        *ioend,
        loff_t                  offset,
-       ssize_t                 size,
-       void                    *private)
+       ssize_t                 size)
 {
-       struct inode            *inode = file_inode(iocb->ki_filp);
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_ioend        *ioend = private;
-
-       trace_xfs_gbmap_direct_endio(ip, offset, size,
-                                    ioend ? ioend->io_type : 0, NULL);
+       struct xfs_mount        *mp = XFS_I(inode)->i_mount;
 
-       if (!ioend) {
-               ASSERT(offset + size <= i_size_read(inode));
-               return;
-       }
-
-       if (XFS_FORCED_SHUTDOWN(mp))
+       if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
                goto out_end_io;
 
        /*
@@ -1587,10 +1568,10 @@ xfs_end_io_direct_write(
         * here can result in EOF moving backwards and Bad Things Happen when
         * that occurs.
         */
-       spin_lock(&ip->i_flags_lock);
+       spin_lock(&XFS_I(inode)->i_flags_lock);
        if (offset + size > i_size_read(inode))
                i_size_write(inode, offset + size);
-       spin_unlock(&ip->i_flags_lock);
+       spin_unlock(&XFS_I(inode)->i_flags_lock);
 
        /*
         * If we are doing an append IO that needs to update the EOF on disk,
@@ -1607,6 +1588,98 @@ out_end_io:
        return;
 }
 
+/*
+ * Complete a direct I/O write request.
+ *
+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
+ * wholly within the EOF and so there is nothing for us to do. Note that in this
+ * case the completion can be called in interrupt context, whereas if we have an
+ * ioend we will always be called in task context (i.e. from a workqueue).
+ */
+STATIC void
+xfs_end_io_direct_write(
+       struct kiocb            *iocb,
+       loff_t                  offset,
+       ssize_t                 size,
+       void                    *private)
+{
+       struct inode            *inode = file_inode(iocb->ki_filp);
+       struct xfs_ioend        *ioend = private;
+
+       trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
+                                    ioend ? ioend->io_type : 0, NULL);
+
+       if (!ioend) {
+               ASSERT(offset + size <= i_size_read(inode));
+               return;
+       }
+
+       __xfs_end_io_direct_write(inode, ioend, offset, size);
+}
+
+/*
+ * For DAX we need a mapping buffer callback for unwritten extent conversion
+ * when page faults allocate blocks and then zero them. Note that in this
+ * case the mapping indicated by the ioend may extend beyond EOF. We most
+ * definitely do not want to extend EOF here, so we trim back the ioend size to
+ * EOF.
+ */
+#ifdef CONFIG_FS_DAX
+void
+xfs_end_io_dax_write(
+       struct buffer_head      *bh,
+       int                     uptodate)
+{
+       struct xfs_ioend        *ioend = bh->b_private;
+       struct inode            *inode = ioend->io_inode;
+       ssize_t                 size = ioend->io_size;
+
+       ASSERT(IS_DAX(ioend->io_inode));
+
+       /* if there was an error zeroing, then don't convert it */
+       if (!uptodate)
+               ioend->io_error = -EIO;
+
+       /*
+        * Trim update to EOF, so we don't extend EOF during unwritten extent
+        * conversion of partial EOF blocks.
+        */
+       spin_lock(&XFS_I(inode)->i_flags_lock);
+       if (ioend->io_offset + size > i_size_read(inode))
+               size = i_size_read(inode) - ioend->io_offset;
+       spin_unlock(&XFS_I(inode)->i_flags_lock);
+
+       __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
+
+}
+#else
+void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
+#endif
+
+static inline ssize_t
+xfs_vm_do_dio(
+       struct inode            *inode,
+       struct kiocb            *iocb,
+       struct iov_iter         *iter,
+       loff_t                  offset,
+       void                    (*endio)(struct kiocb   *iocb,
+                                        loff_t         offset,
+                                        ssize_t        size,
+                                        void           *private),
+       int                     flags)
+{
+       struct block_device     *bdev;
+
+       if (IS_DAX(inode))
+               return dax_do_io(iocb, inode, iter, offset,
+                                xfs_get_blocks_direct, endio, 0);
+
+       bdev = xfs_find_bdev_for_inode(inode);
+       return  __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
+                                    xfs_get_blocks_direct, endio, NULL, flags);
+}
+
 STATIC ssize_t
 xfs_vm_direct_IO(
        struct kiocb            *iocb,
@@ -1614,16 +1687,11 @@ xfs_vm_direct_IO(
        loff_t                  offset)
 {
        struct inode            *inode = iocb->ki_filp->f_mapping->host;
-       struct block_device     *bdev = xfs_find_bdev_for_inode(inode);
 
-       if (iov_iter_rw(iter) == WRITE) {
-               return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
-                                           xfs_get_blocks_direct,
-                                           xfs_end_io_direct_write, NULL,
-                                           DIO_ASYNC_EXTEND);
-       }
-       return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
-                                   xfs_get_blocks_direct, NULL, NULL, 0);
+       if (iov_iter_rw(iter) == WRITE)
+               return xfs_vm_do_dio(inode, iocb, iter, offset,
+                                    xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
+       return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
 }
 
 /*
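
The EOF trim in xfs_end_io_dax_write() above is worth a worked example. This
small userspace illustration uses made-up sizes (it is not part of the patch)
to show why a mapping that extends past EOF only has its in-file portion
converted:

#include <stdio.h>

int main(void)
{
	long long i_size = 10000;	/* i_size_read(inode), assumed */
	long long offset = 8192;	/* ioend->io_offset */
	long long size   = 4096;	/* ioend->io_size: runs past EOF */

	if (offset + size > i_size)
		size = i_size - offset;

	/* converts 1808 bytes; the completion never moves EOF */
	printf("convert %lld bytes at offset %lld\n", size, offset);
	return 0;
}
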
index ac644e0137a49f021ba3c3d840e4b91017674095..86afd1ac7895f8d225fa2da2285c03f7014666e3 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -53,7 +53,12 @@ typedef struct xfs_ioend {
 } xfs_ioend_t;
 
 extern const struct address_space_operations xfs_address_space_operations;
-extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
+
+int    xfs_get_blocks(struct inode *inode, sector_t offset,
+                      struct buffer_head *map_bh, int create);
+int    xfs_get_blocks_direct(struct inode *inode, sector_t offset,
+                             struct buffer_head *map_bh, int create);
+void   xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
 
 extern void xfs_count_page_state(struct page *, int *, int *);
 
index a52bbd3abc7df3ac7fa07a12b5dc9ec336d9ab49..4a2965515ca883a92df0313ba908893103fd1ddf 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1133,14 +1133,29 @@ xfs_zero_remaining_bytes(
                        break;
                ASSERT(imap.br_blockcount >= 1);
                ASSERT(imap.br_startoff == offset_fsb);
+               ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+
+               if (imap.br_startblock == HOLESTARTBLOCK ||
+                   imap.br_state == XFS_EXT_UNWRITTEN) {
+                       /* skip the entire extent */
+                       lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +
+                                                     imap.br_blockcount) - 1;
+                       continue;
+               }
+
                lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
                if (lastoffset > endoff)
                        lastoffset = endoff;
-               if (imap.br_startblock == HOLESTARTBLOCK)
-                       continue;
-               ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-               if (imap.br_state == XFS_EXT_UNWRITTEN)
+
+               /* DAX can just zero the backing device directly */
+               if (IS_DAX(VFS_I(ip))) {
+                       error = dax_zero_page_range(VFS_I(ip), offset,
+                                                   lastoffset - offset + 1,
+                                                   xfs_get_blocks_direct);
+                       if (error)
+                               return error;
                        continue;
+               }
 
                error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
                                mp->m_rtdev_targp : mp->m_ddev_targp,
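
Note the "+ 1" above: the length passed to dax_zero_page_range() comes from an
inclusive lastoffset. A small userspace sketch of the block-to-byte arithmetic,
assuming 4k filesystem blocks and a block-aligned offset (the real code derives
this with XFS_FSB_TO_B(), which depends on the mount geometry):

#include <stdio.h>

#define FSB_TO_B(fsb)	((long long)(fsb) << 12)	/* 4k blocks assumed */

int main(void)
{
	long long offset     = FSB_TO_B(5);		/* first byte of block 5 */
	long long lastoffset = FSB_TO_B(5 + 1) - 1;	/* last byte of block 5 */

	/* length handed to dax_zero_page_range() */
	printf("zero %lld bytes at %lld\n", lastoffset - offset + 1, offset);
	return 0;	/* prints: zero 4096 bytes at 20480 */
}
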
index 3b7591224f4a6698d32371a927e70cb2a391f4a9..84b2421b22ae51d4fb66b84baa6c49bf03e31226 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -79,14 +79,15 @@ xfs_rw_ilock_demote(
 }
 
 /*
- *     xfs_iozero
+ * xfs_iozero clears the specified range supplied via the page cache (except in
+ * the DAX case). Writes through the page cache will allocate blocks over holes,
+ * though the callers usually map the holes first and avoid them. If a block is
+ * not completely zeroed, then it will be read from disk before being partially
+ * zeroed.
  *
- *     xfs_iozero clears the specified range of buffer supplied,
- *     and marks all the affected blocks as valid and modified.  If
- *     an affected block is not allocated, it will be allocated.  If
- *     an affected block is not completely overwritten, and is not
- *     valid before the operation, it will be read from disk before
- *     being partially zeroed.
+ * In the DAX case, we can just directly write to the underlying pages. This
+ * will not allocate blocks, but will avoid holes and unwritten extents and so
+ * not do unnecessary work.
  */
 int
 xfs_iozero(
@@ -96,7 +97,8 @@ xfs_iozero(
 {
        struct page             *page;
        struct address_space    *mapping;
-       int                     status;
+       int                     status = 0;
+
 
        mapping = VFS_I(ip)->i_mapping;
        do {
@@ -108,20 +110,27 @@ xfs_iozero(
                if (bytes > count)
                        bytes = count;
 
-               status = pagecache_write_begin(NULL, mapping, pos, bytes,
-                                       AOP_FLAG_UNINTERRUPTIBLE,
-                                       &page, &fsdata);
-               if (status)
-                       break;
+               if (IS_DAX(VFS_I(ip))) {
+                       status = dax_zero_page_range(VFS_I(ip), pos, bytes,
+                                                    xfs_get_blocks_direct);
+                       if (status)
+                               break;
+               } else {
+                       status = pagecache_write_begin(NULL, mapping, pos, bytes,
+                                               AOP_FLAG_UNINTERRUPTIBLE,
+                                               &page, &fsdata);
+                       if (status)
+                               break;
 
-               zero_user(page, offset, bytes);
+                       zero_user(page, offset, bytes);
 
-               status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
-                                       page, fsdata);
-               WARN_ON(status <= 0); /* can't return less than zero! */
+                       status = pagecache_write_end(NULL, mapping, pos, bytes,
+                                               bytes, page, fsdata);
+                       WARN_ON(status <= 0); /* can't return less than zero! */
+                       status = 0;
+               }
                pos += bytes;
                count -= bytes;
-               status = 0;
        } while (count);
 
        return status;
@@ -284,7 +293,7 @@ xfs_file_read_iter(
        if (file->f_mode & FMODE_NOCMTIME)
                ioflags |= XFS_IO_INVIS;
 
-       if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
+       if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
                xfs_buftarg_t   *target =
                        XFS_IS_REALTIME_INODE(ip) ?
                                mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -378,7 +387,11 @@ xfs_file_splice_read(
 
        trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
 
-       ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+       /* for dax, we need to avoid the page cache */
+       if (IS_DAX(VFS_I(ip)))
+               ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
+       else
+               ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
        if (ret > 0)
                XFS_STATS_ADD(xs_read_bytes, ret);
 
@@ -672,7 +685,7 @@ xfs_file_dio_aio_write(
                                        mp->m_rtdev_targp : mp->m_ddev_targp;
 
        /* DIO must be aligned to device logical sector size */
-       if ((pos | count) & target->bt_logical_sectormask)
+       if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
                return -EINVAL;
 
        /* "unaligned" here means not aligned to a filesystem block */
@@ -758,8 +771,11 @@ xfs_file_dio_aio_write(
 out:
        xfs_rw_iunlock(ip, iolock);
 
-       /* No fallback to buffered IO on errors for XFS. */
-       ASSERT(ret < 0 || ret == count);
+       /*
+        * No fallback to buffered IO on errors for XFS. DAX can result in
+        * partial writes, but direct IO will either complete fully or fail.
+        */
+       ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
        return ret;
 }
 
@@ -842,7 +858,7 @@ xfs_file_write_iter(
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
 
-       if (unlikely(iocb->ki_flags & IOCB_DIRECT))
+       if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
                ret = xfs_file_dio_aio_write(iocb, from);
        else
                ret = xfs_file_buffered_aio_write(iocb, from);
@@ -1063,17 +1079,6 @@ xfs_file_readdir(
        return xfs_readdir(ip, ctx, bufsize);
 }
 
-STATIC int
-xfs_file_mmap(
-       struct file     *filp,
-       struct vm_area_struct *vma)
-{
-       vma->vm_ops = &xfs_file_vm_ops;
-
-       file_accessed(filp);
-       return 0;
-}
-
 /*
  * This type is designed to indicate the type of offset we would like
  * to search from page cache for xfs_seek_hole_data().
@@ -1454,48 +1459,83 @@ xfs_file_llseek(
  * ordering of:
  *
  * mmap_sem (MM)
- *   i_mmap_lock (XFS - truncate serialisation)
- *     page_lock (MM)
- *       i_lock (XFS - extent map serialisation)
+ *   sb_start_pagefault(vfs, freeze)
+ *     i_mmap_lock (XFS - truncate serialisation)
+ *       page_lock (MM)
+ *         i_lock (XFS - extent map serialisation)
+ */
+
+/*
+ * A mmap()d file has taken a write protection fault and is being made writable.
+ * can set the page state up correctly for a writable page, which means we can
+ * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
+ * mapping.
  */
 STATIC int
-xfs_filemap_fault(
+xfs_filemap_page_mkwrite(
        struct vm_area_struct   *vma,
        struct vm_fault         *vmf)
 {
-       struct xfs_inode        *ip = XFS_I(vma->vm_file->f_mapping->host);
-       int                     error;
+       struct inode            *inode = file_inode(vma->vm_file);
+       int                     ret;
 
-       trace_xfs_filemap_fault(ip);
+       trace_xfs_filemap_page_mkwrite(XFS_I(inode));
 
-       xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-       error = filemap_fault(vma, vmf);
-       xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+       sb_start_pagefault(inode->i_sb);
+       file_update_time(vma->vm_file);
+       xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
-       return error;
+       if (IS_DAX(inode)) {
+               ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
+                                   xfs_end_io_dax_write);
+       } else {
+               ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+               ret = block_page_mkwrite_return(ret);
+       }
+
+       xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+       sb_end_pagefault(inode->i_sb);
+
+       return ret;
 }
 
-/*
- * mmap()d file has taken write protection fault and is being made writable. We
- * can set the page state up correctly for a writable page, which means we can
- * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
- * mapping.
- */
 STATIC int
-xfs_filemap_page_mkwrite(
+xfs_filemap_fault(
        struct vm_area_struct   *vma,
        struct vm_fault         *vmf)
 {
-       struct xfs_inode        *ip = XFS_I(vma->vm_file->f_mapping->host);
-       int                     error;
+       struct xfs_inode        *ip = XFS_I(file_inode(vma->vm_file));
+       int                     ret;
+
+       trace_xfs_filemap_fault(ip);
 
-       trace_xfs_filemap_page_mkwrite(ip);
+       /* DAX can shortcut the normal fault path on write faults! */
+       if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip)))
+               return xfs_filemap_page_mkwrite(vma, vmf);
 
        xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-       error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+       ret = filemap_fault(vma, vmf);
        xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
 
-       return error;
+       return ret;
+}
+
+static const struct vm_operations_struct xfs_file_vm_ops = {
+       .fault          = xfs_filemap_fault,
+       .map_pages      = filemap_map_pages,
+       .page_mkwrite   = xfs_filemap_page_mkwrite,
+};
+
+STATIC int
+xfs_file_mmap(
+       struct file     *filp,
+       struct vm_area_struct *vma)
+{
+       file_accessed(filp);
+       vma->vm_ops = &xfs_file_vm_ops;
+       if (IS_DAX(file_inode(filp)))
+               vma->vm_flags |= VM_MIXEDMAP;
+       return 0;
 }
 
 const struct file_operations xfs_file_operations = {
@@ -1526,9 +1566,3 @@ const struct file_operations xfs_dir_file_operations = {
 #endif
        .fsync          = xfs_dir_fsync,
 };
-
-static const struct vm_operations_struct xfs_file_vm_ops = {
-       .fault          = xfs_filemap_fault,
-       .map_pages      = filemap_map_pages,
-       .page_mkwrite   = xfs_filemap_page_mkwrite,
-};
index f4cd7204e23667724c01a4c4b8efe8c1d48b1cb3..3e8d32d41f350486be9f92a0d4db0eeb6795249c 100644 (file)
@@ -851,7 +851,11 @@ xfs_setattr_size(
         * to hope that the caller sees ENOMEM and retries the truncate
         * operation.
         */
-       error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
+       if (IS_DAX(inode))
+               error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
+       else
+               error = block_truncate_page(inode->i_mapping, newsize,
+                                           xfs_get_blocks);
        if (error)
                return error;
        truncate_setsize(inode, newsize);
@@ -1191,22 +1195,22 @@ xfs_diflags_to_iflags(
        struct inode            *inode,
        struct xfs_inode        *ip)
 {
-       if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
+       uint16_t                flags = ip->i_d.di_flags;
+
+       inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC |
+                           S_NOATIME | S_DAX);
+
+       if (flags & XFS_DIFLAG_IMMUTABLE)
                inode->i_flags |= S_IMMUTABLE;
-       else
-               inode->i_flags &= ~S_IMMUTABLE;
-       if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
+       if (flags & XFS_DIFLAG_APPEND)
                inode->i_flags |= S_APPEND;
-       else
-               inode->i_flags &= ~S_APPEND;
-       if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
+       if (flags & XFS_DIFLAG_SYNC)
                inode->i_flags |= S_SYNC;
-       else
-               inode->i_flags &= ~S_SYNC;
-       if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
+       if (flags & XFS_DIFLAG_NOATIME)
                inode->i_flags |= S_NOATIME;
-       else
-               inode->i_flags &= ~S_NOATIME;
+       /* XXX: Also needs an on-disk per inode flag! */
+       if (ip->i_mount->m_flags & XFS_MOUNT_DAX)
+               inode->i_flags |= S_DAX;
 }
 
 /*
index df209c2902585d65ebea39c602b8122ae9f6ed87..7999e91cd49ad862d5fbeb29a5e5487ed94509a8 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -181,6 +181,8 @@ typedef struct xfs_mount {
                                                   allocator */
 #define XFS_MOUNT_NOATTR2      (1ULL << 25)    /* disable use of attr2 format */
 
+#define XFS_MOUNT_DAX          (1ULL << 62)    /* TEST ONLY! */
+
 
 /*
  * Default minimum read and write sizes.
index 858e1e62bbaa390d0ea34a5b7766274f04361fd2..1fb16562c159947ac27adae43f6abb4f195e1cfa 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -112,6 +112,8 @@ static struct xfs_kobj xfs_dbg_kobj;        /* global debug sysfs attrs */
 #define MNTOPT_DISCARD    "discard"    /* Discard unused blocks */
 #define MNTOPT_NODISCARD   "nodiscard" /* Do not discard unused blocks */
 
+#define MNTOPT_DAX     "dax"           /* Enable direct access to bdev pages */
+
 /*
  * Table driven mount option parser.
  *
@@ -363,6 +365,10 @@ xfs_parseargs(
                        mp->m_flags |= XFS_MOUNT_DISCARD;
                } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
                        mp->m_flags &= ~XFS_MOUNT_DISCARD;
+#ifdef CONFIG_FS_DAX
+               } else if (!strcmp(this_char, MNTOPT_DAX)) {
+                       mp->m_flags |= XFS_MOUNT_DAX;
+#endif
                } else {
                        xfs_warn(mp, "unknown mount option [%s].", this_char);
                        return -EINVAL;
@@ -452,8 +458,8 @@ done:
 }
 
 struct proc_xfs_info {
-       int     flag;
-       char    *str;
+       uint64_t        flag;
+       char            *str;
 };
 
 STATIC int
@@ -474,6 +480,7 @@ xfs_showargs(
                { XFS_MOUNT_GRPID,              "," MNTOPT_GRPID },
                { XFS_MOUNT_DISCARD,            "," MNTOPT_DISCARD },
                { XFS_MOUNT_SMALL_INUMS,        "," MNTOPT_32BITINODE },
+               { XFS_MOUNT_DAX,                "," MNTOPT_DAX },
                { 0, NULL }
        };
        static struct proc_xfs_info xfs_info_unset[] = {
@@ -1507,6 +1514,20 @@ xfs_fs_fill_super(
        if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
                sb->s_flags |= MS_I_VERSION;
 
+       if (mp->m_flags & XFS_MOUNT_DAX) {
+               xfs_warn(mp,
+       "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+               if (sb->s_blocksize != PAGE_SIZE) {
+                       xfs_alert(mp,
+               "Filesystem block size invalid for DAX. Turning DAX off.");
+                       mp->m_flags &= ~XFS_MOUNT_DAX;
+               } else if (!sb->s_bdev->bd_disk->fops->direct_access) {
+                       xfs_alert(mp,
+               "Block device does not support DAX. Turning DAX off.");
+                       mp->m_flags &= ~XFS_MOUNT_DAX;
+               }
+       }
+
        error = xfs_mountfs(mp);
        if (error)
                goto out_filestream_unmount;
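
Distilling the two checks above: DAX stays enabled at mount only when the
filesystem block size matches the page size and the lower block device
implements ->direct_access. A hedged sketch of that predicate (the helper name
is made up; xfs_fs_fill_super() open-codes the logic):

static bool can_enable_dax(struct super_block *sb)
{
	if (sb->s_blocksize != PAGE_SIZE)
		return false;	/* DAX mappings are page granularity */
	if (!sb->s_bdev->bd_disk->fops->direct_access)
		return false;	/* device memory cannot be accessed directly */
	return true;
}
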
index 35ec87e490b1a41ff0bc3ba20b06ac9d958f972a..5784377e7c56361caa5f6bba11cd9aff575182f6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -70,6 +70,7 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create);
 typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
                        ssize_t bytes, void *private);
+typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
 
 #define MAY_EXEC               0x00000001
 #define MAY_WRITE              0x00000002
@@ -2627,9 +2628,13 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
 int dax_clear_blocks(struct inode *, sector_t block, long size);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+               dax_iodone_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+               dax_iodone_t);
 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb)      dax_fault(vma, vmf, gb)
+#define dax_mkwrite(vma, vmf, gb, iod)         dax_fault(vma, vmf, gb, iod)
+#define __dax_mkwrite(vma, vmf, gb, iod)       __dax_fault(vma, vmf, gb, iod)
 
 #ifdef CONFIG_BLOCK
 typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,