fs: fix data-loss on error
diff --git a/fs/buffer.c b/fs/buffer.c
index 75b51dfa5e0396dd2fa17756819bd678fda00208..9ece6c2086d0588c4a1e9f5bf276c5ca20df443d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -110,10 +110,14 @@ static void buffer_io_error(struct buffer_head *bh)
 }
 
 /*
- * Default synchronous end-of-IO handler..  Just mark it up-to-date and
- * unlock the buffer. This is what ll_rw_block uses too.
+ * End-of-IO handler helper function which does not touch the bh after
+ * unlocking it.
+ * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
+ * a race there is benign: unlock_buffer() only uses the bh's address for
+ * hashing after unlocking the buffer, so it doesn't actually touch the bh
+ * itself.
  */
-void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
+static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
 {
        if (uptodate) {
                set_buffer_uptodate(bh);
@@ -122,6 +126,15 @@ void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
                clear_buffer_uptodate(bh);
        }
        unlock_buffer(bh);
+}
+
+/*
+ * Default synchronous end-of-IO handler..  Just mark it up-to-date and
+ * unlock the buffer. This is what ll_rw_block uses too.
+ */
+void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
+{
+       __end_buffer_read_notouch(bh, uptodate);
        put_bh(bh);
 }
 
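The rule the new comment spells out is worth making concrete: once __end_buffer_read_notouch() has unlocked the buffer, the waiter (who holds the page lock and owns the buffer_head) may free it immediately, so the handler must not dereference it after unlock_buffer(); unlock_buffer() itself only needs the bh's address to pick a wait queue. Below is a minimal user-space sketch of the same publish-then-hands-off rule, written with C11 atomics and pthreads; struct request, complete_request() and the other names are invented for illustration and are not kernel API.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for a buffer_head: the waiter owns it and frees it as
 * soon as it observes "done". */
struct request {
        atomic_int done;
        int payload;
};

/* Completion routine: after the release store below, the waiter may
 * free *req at any moment, so nothing may touch it afterwards.  Same
 * discipline as not touching the bh after unlock_buffer(). */
static void complete_request(struct request *req)
{
        req->payload = 42;      /* last access to *req */
        atomic_store_explicit(&req->done, 1, memory_order_release);
        /* no accesses to *req past this point */
}

static void *worker(void *arg)
{
        complete_request(arg);
        return NULL;
}

int main(void)
{
        struct request *req = calloc(1, sizeof(*req));
        pthread_t t;

        if (!req || pthread_create(&t, NULL, worker, req))
                return 1;

        /* The waiter: once "done" is visible it owns req outright and
         * frees it, just as the nobh path frees its buffer_heads right
         * after wait_on_buffer() returns. */
        while (!atomic_load_explicit(&req->done, memory_order_acquire))
                ;
        printf("payload = %d\n", req->payload);
        free(req);

        pthread_join(t, NULL);
        return 0;
}

This is the property that lets end_buffer_read_nobh() further down be a plain wrapper with no put_bh(): the nobh path frees its buffer_heads while still under the page lock, so any access after the unlock would be a use after free.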
@@ -1800,7 +1813,9 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
                                unmap_underlying_metadata(bh->b_bdev,
                                                        bh->b_blocknr);
                                if (PageUptodate(page)) {
+                                       clear_buffer_new(bh);
                                        set_buffer_uptodate(bh);
+                                       mark_buffer_dirty(bh);
                                        continue;
                                }
                                if (block_end > to || block_start < from) {
@@ -2245,21 +2260,10 @@ out_unlock:
  * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
  * immediately, while under the page lock.  So it needs a special end_io
  * handler which does not touch the bh after unlocking it.
- *
- * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
- * a race there is benign: unlock_buffer() only use the bh's address for
- * hashing after unlocking the buffer, so it doesn't actually touch the bh
- * itself.
  */
 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
 {
-       if (uptodate) {
-               set_buffer_uptodate(bh);
-       } else {
-               /* This happens, due to failed READA attempts. */
-               clear_buffer_uptodate(bh);
-       }
-       unlock_buffer(bh);
+       __end_buffer_read_notouch(bh, uptodate);
 }
 
 /*
@@ -2272,51 +2276,64 @@ int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
        struct inode *inode = page->mapping->host;
        const unsigned blkbits = inode->i_blkbits;
        const unsigned blocksize = 1 << blkbits;
-       struct buffer_head map_bh;
-       struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
+       struct buffer_head *head, *bh;
        unsigned block_in_page;
-       unsigned block_start;
+       unsigned block_start, block_end;
        sector_t block_in_file;
        char *kaddr;
        int nr_reads = 0;
-       int i;
        int ret = 0;
        int is_mapped_to_disk = 1;
 
+       if (page_has_buffers(page))
+               return block_prepare_write(page, from, to, get_block);
+
        if (PageMappedToDisk(page))
                return 0;
 
+       /*
+        * Allocate buffers so that we can keep track of state, and potentially
+        * attach them to the page if an error occurs. In the common case of
+        * no error, they will just be freed again without ever being attached
+        * to the page (which is all OK, because we're under the page lock).
+        *
+        * Be careful: the buffer linked list is a NULL terminated one, rather
+        * than the circular one we're used to.
+        */
+       head = alloc_page_buffers(page, blocksize, 0);
+       if (!head)
+               return -ENOMEM;
+
        block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
-       map_bh.b_page = page;
 
        /*
         * We loop across all blocks in the page, whether or not they are
         * part of the affected region.  This is so we can discover if the
         * page is fully mapped-to-disk.
         */
-       for (block_start = 0, block_in_page = 0;
+       for (block_start = 0, block_in_page = 0, bh = head;
                  block_start < PAGE_CACHE_SIZE;
-                 block_in_page++, block_start += blocksize) {
-               unsigned block_end = block_start + blocksize;
+                 block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
                int create;
 
-               map_bh.b_state = 0;
+               block_end = block_start + blocksize;
+               bh->b_state = 0;
                create = 1;
                if (block_start >= to)
                        create = 0;
-               map_bh.b_size = blocksize;
                ret = get_block(inode, block_in_file + block_in_page,
-                                       &map_bh, create);
+                                       bh, create);
                if (ret)
                        goto failed;
-               if (!buffer_mapped(&map_bh))
+               if (!buffer_mapped(bh))
                        is_mapped_to_disk = 0;
-               if (buffer_new(&map_bh))
-                       unmap_underlying_metadata(map_bh.b_bdev,
-                                                       map_bh.b_blocknr);
-               if (PageUptodate(page))
+               if (buffer_new(bh))
+                       unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+               if (PageUptodate(page)) {
+                       set_buffer_uptodate(bh);
                        continue;
-               if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
+               }
+               if (buffer_new(bh) || !buffer_mapped(bh)) {
                        kaddr = kmap_atomic(page, KM_USER0);
                        if (block_start < from)
                                memset(kaddr+block_start, 0, from-block_start);
@@ -2326,49 +2343,26 @@ int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
                        kunmap_atomic(kaddr, KM_USER0);
                        continue;
                }
-               if (buffer_uptodate(&map_bh))
+               if (buffer_uptodate(bh))
                        continue;       /* reiserfs does this */
                if (block_start < from || block_end > to) {
-                       struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
-
-                       if (!bh) {
-                               ret = -ENOMEM;
-                               goto failed;
-                       }
-                       bh->b_state = map_bh.b_state;
-                       atomic_set(&bh->b_count, 0);
-                       bh->b_this_page = NULL;
-                       bh->b_page = page;
-                       bh->b_blocknr = map_bh.b_blocknr;
-                       bh->b_size = blocksize;
-                       bh->b_data = (char *)(long)block_start;
-                       bh->b_bdev = map_bh.b_bdev;
-                       bh->b_private = NULL;
-                       read_bh[nr_reads++] = bh;
+                       lock_buffer(bh);
+                       bh->b_end_io = end_buffer_read_nobh;
+                       submit_bh(READ, bh);
+                       nr_reads++;
                }
        }
 
        if (nr_reads) {
-               struct buffer_head *bh;
-
                /*
                 * The page is locked, so these buffers are protected from
                 * any VM or truncate activity.  Hence we don't need to care
                 * for the buffer_head refcounts.
                 */
-               for (i = 0; i < nr_reads; i++) {
-                       bh = read_bh[i];
-                       lock_buffer(bh);
-                       bh->b_end_io = end_buffer_read_nobh;
-                       submit_bh(READ, bh);
-               }
-               for (i = 0; i < nr_reads; i++) {
-                       bh = read_bh[i];
+               for (bh = head; bh; bh = bh->b_this_page) {
                        wait_on_buffer(bh);
                        if (!buffer_uptodate(bh))
                                ret = -EIO;
-                       free_buffer_head(bh);
-                       read_bh[i] = NULL;
                }
                if (ret)
                        goto failed;
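The read-and-wait loop above leans on the detail called out earlier in the function: until attach_page_buffers() is called, the chain returned by alloc_page_buffers() is NULL terminated rather than circular, so "for (bh = head; bh; bh = bh->b_this_page)" is the right walk, and the success path can free it with a simple do/while. Here is a stand-alone sketch of that idiom with invented names (struct fake_bh, alloc_chain()); it is a model of the list handling, not kernel code.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for struct buffer_head: just the fields the walk needs. */
struct fake_bh {
        struct fake_bh *b_this_page;    /* NULL terminated until "attached" */
        bool uptodate;
};

/* Build a NULL-terminated chain of n buffers, the rough shape
 * alloc_page_buffers() hands back before attachment. */
static struct fake_bh *alloc_chain(int n)
{
        struct fake_bh *head = NULL;

        while (n--) {
                struct fake_bh *bh = calloc(1, sizeof(*bh));

                if (!bh)
                        exit(1);        /* a sketch, not real error handling */
                bh->b_this_page = head;
                head = bh;
        }
        return head;
}

int main(void)
{
        struct fake_bh *head = alloc_chain(4);
        struct fake_bh *bh;
        int ret = 0;

        /* Pretend all reads completed but the second one failed. */
        for (bh = head; bh; bh = bh->b_this_page)
                bh->uptodate = true;
        head->b_this_page->uptodate = false;

        /* Same shape as the wait loop in the hunk: walk until NULL and
         * remember that an error happened, but keep walking. */
        for (bh = head; bh; bh = bh->b_this_page) {
                if (!bh->uptodate)
                        ret = -5;       /* stand-in for -EIO */
        }
        printf("ret = %d\n", ret);

        /* Free the chain the way the success path does. */
        do {
                bh = head;
                head = head->b_this_page;
                free(bh);
        } while (head);

        return 0;
}

In the real function no refcounting is needed for this walk because, as the comment in the hunk notes, the page lock keeps truncate and the VM away from these buffers until they are either freed or attached.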
@@ -2377,21 +2371,54 @@ int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
        if (is_mapped_to_disk)
                SetPageMappedToDisk(page);
 
+       do {
+               bh = head;
+               head = head->b_this_page;
+               free_buffer_head(bh);
+       } while (head);
+
        return 0;
 
 failed:
-       for (i = 0; i < nr_reads; i++) {
-               if (read_bh[i])
-                       free_buffer_head(read_bh[i]);
-       }
-
        /*
-        * Error recovery is pretty slack.  Clear the page and mark it dirty
-        * so we'll later zero out any blocks which _were_ allocated.
+        * Error recovery is a bit difficult. We need to zero out blocks that
+        * were newly allocated, and dirty them to ensure they get written out.
+        * Buffers need to be attached to the page at this point, otherwise
+        * the handling of potential IO errors during writeout would be hard
+        * (could try doing synchronous writeout, but what if that fails too?)
         */
-       zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
-       SetPageUptodate(page);
-       set_page_dirty(page);
+       spin_lock(&page->mapping->private_lock);
+       bh = head;
+       block_start = 0;
+       do {
+               if (PageUptodate(page))
+                       set_buffer_uptodate(bh);
+               if (PageDirty(page))
+                       set_buffer_dirty(bh);
+
+               block_end = block_start+blocksize;
+               if (block_end <= from)
+                       goto next;
+               if (block_start >= to)
+                       goto next;
+
+               if (buffer_new(bh)) {
+                       clear_buffer_new(bh);
+                       if (!buffer_uptodate(bh)) {
+                               zero_user_page(page, block_start, bh->b_size, KM_USER0);
+                               set_buffer_uptodate(bh);
+                       }
+                       mark_buffer_dirty(bh);
+               }
+next:
+               block_start = block_end;
+               if (!bh->b_this_page)
+                       bh->b_this_page = head;
+               bh = bh->b_this_page;
+       } while (bh != head);
+       attach_page_buffers(page, head);
+       spin_unlock(&page->mapping->private_lock);
+
        return ret;
 }
 EXPORT_SYMBOL(nobh_prepare_write);
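The failed: path does three things in one pass over the chain: it copies the page's uptodate/dirty state into each buffer, it zeroes and dirties any newly allocated block so it cannot be lost, and it closes the NULL-terminated chain into the circular list that attach_page_buffers() expects via the "if (!bh->b_this_page) bh->b_this_page = head;" step, all before publishing the buffers under mapping->private_lock. A minimal stand-alone model of that close-the-ring pass follows, ignoring locking and page state; every name in it is invented.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_bh {
        struct fake_bh *b_this_page;
        bool is_new;            /* block was freshly allocated */
        bool uptodate;
        bool dirty;
};

static struct fake_bh *alloc_chain(int n)
{
        struct fake_bh *head = NULL;

        while (n--) {
                struct fake_bh *bh = calloc(1, sizeof(*bh));

                if (!bh)
                        exit(1);
                bh->b_this_page = head;
                head = bh;
        }
        return head;
}

int main(void)
{
        struct fake_bh *head = alloc_chain(4);
        struct fake_bh *bh = head;
        int n = 0;

        head->is_new = true;    /* pretend one block was newly allocated */

        /* One pass: fix up per-buffer state and close the chain into a
         * ring, the same shape as the failed: loop in the patch. */
        do {
                if (bh->is_new) {
                        bh->is_new = false;
                        bh->uptodate = true;    /* stands in for zeroing the block */
                        bh->dirty = true;       /* make sure it gets written out */
                }
                if (!bh->b_this_page)
                        bh->b_this_page = head; /* last node: close the ring */
                bh = bh->b_this_page;
        } while (bh != head);

        /* The list is now circular: walking it comes back to head. */
        bh = head;
        do {
                n++;
                bh = bh->b_this_page;
        } while (bh != head);
        assert(n == 4);
        printf("ring has %d buffers, head dirty=%d\n", n, head->dirty);

        /* Tear the ring down. */
        bh = head->b_this_page;
        while (bh != head) {
                struct fake_bh *next = bh->b_this_page;

                free(bh);
                bh = next;
        }
        free(head);
        return 0;
}

Closing the ring inside the same loop is just a convenience (the walk already has to visit the last element); the important part is that the buffers end up attached at all, so a later writeout failure on these blocks is handled by the normal buffer paths instead of being silently dropped, which is the kind of data loss the subject line is about.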
@@ -2406,6 +2433,9 @@ int nobh_commit_write(struct file *file, struct page *page,
        struct inode *inode = page->mapping->host;
        loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
 
+       if (page_has_buffers(page))
+               return generic_commit_write(file, page, from, to);
+
        SetPageUptodate(page);
        set_page_dirty(page);
        if (pos > inode->i_size) {