fs/ext4/inode.c

   1 /*
   2  *  linux/fs/ext4/inode.c
   3  *
   4  * Copyright (C) 1992, 1993, 1994, 1995
   5  * Remy Card (card@masi.ibp.fr)
   6  * Laboratoire MASI - Institut Blaise Pascal
   7  * Universite Pierre et Marie Curie (Paris VI)
   8  *
   9  *  from
  10  *
  11  *  linux/fs/minix/inode.c
  12  *
  13  *  Copyright (C) 1991, 1992  Linus Torvalds
  14  *
  15  *  64-bit file support on 64-bit platforms by Jakub Jelinek
  16  *      (jj@sunsite.ms.mff.cuni.cz)
  17  *
  18  *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
  19  */
  20
  21 #include <linux/module.h>
  22 #include <linux/fs.h>
  23 #include <linux/time.h>
  24 #include <linux/jbd2.h>
  25 #include <linux/highuid.h>
  26 #include <linux/pagemap.h>
  27 #include <linux/quotaops.h>
  28 #include <linux/string.h>
  29 #include <linux/buffer_head.h>
  30 #include <linux/writeback.h>
  31 #include <linux/pagevec.h>
  32 #include <linux/mpage.h>
  33 #include <linux/namei.h>
  34 #include <linux/uio.h>
  35 #include <linux/bio.h>
  36 #include <linux/workqueue.h>
  37 #include <linux/kernel.h>
  38 #include <linux/printk.h>
  39 #include <linux/slab.h>
  40 #include <linux/ratelimit.h>
  41
  42 #include "ext4_jbd2.h"
  43 #include "xattr.h"
  44 #include "acl.h"
  45 #include "truncate.h"
  46
  47 #include <trace/events/ext4.h>
  48
   /*
    * NOTE(review): no user of this flag appears near its definition; the name
    * suggests a flag bit for the delayed-allocation mpage writeback code -
    * confirm against its users before changing the value.
    */
   49 #define MPAGE_DA_EXTENT_TAIL 0x01
  50
  51 static inline int ext4_begin_ordered_truncate(struct inode *inode,
  52                                               loff_t new_size)
  53 {
  54         trace_ext4_begin_ordered_truncate(inode, new_size);
  55         /*
  56          * If jinode is zero, then we never opened the file for
  57          * writing, so there's no need to call
  58          * jbd2_journal_begin_ordered_truncate() since there's no
  59          * outstanding writes we need to flush.
  60          */
  61         if (!EXT4_I(inode)->jinode)
  62                 return 0;
  63         return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
  64                                                    EXT4_I(inode)->jinode,
  65                                                    new_size);
  66 }
  67
   /*
    * Forward declarations of file-local (static) helpers that are defined
    * further down in this file.
    */
   68 static void ext4_invalidatepage(struct page *page, unsigned long offset);
   69 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
   70                                    struct buffer_head *bh_result, int create);
   71 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
   72 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
   73 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
   74 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
  75
  76 /*
  77  * Test whether an inode is a fast symlink.
  78  */
  79 static int ext4_inode_is_fast_symlink(struct inode *inode)
  80 {
  81         int ea_blocks = EXT4_I(inode)->i_file_acl ?
  82                 (inode->i_sb->s_blocksize >> 9) : 0;
  83
  84         return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
  85 }
  86
  87 /*
  88  * Restart the transaction associated with *handle.  This does a commit,
  89  * so before we call here everything must be consistently dirtied against
  90  * this transaction.
  91  */
  92 int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
  93                                  int nblocks)
  94 {
  95         int ret;
  96
  97         /*
  98          * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
  99          * moment, get_block can be called only for blocks inside i_size since
 100          * page cache has been already dropped and writes are blocked by
 101          * i_mutex. So we can safely drop the i_data_sem here.
 102          */
 103         BUG_ON(EXT4_JOURNAL(inode) == NULL);
 104         jbd_debug(2, "restarting handle %p\n", handle);
 105         up_write(&EXT4_I(inode)->i_data_sem);
 106         ret = ext4_journal_restart(handle, nblocks);
 107         down_write(&EXT4_I(inode)->i_data_sem);
 108         ext4_discard_preallocations(inode);
 109
 110         return ret;
 111 }
 112
 113 /*
 114  * Called at the last iput() if i_nlink is zero.
 115  */
 116 void ext4_evict_inode(struct inode *inode)
 117 {
 118         handle_t *handle;
 119         int err;
 120
 121         trace_ext4_evict_inode(inode);
 122
 123         ext4_ioend_wait(inode);
 124
 125         if (inode->i_nlink) {
 126                 /*
 127                  * When journalling data dirty buffers are tracked only in the
 128                  * journal. So although mm thinks everything is clean and
 129                  * ready for reaping the inode might still have some pages to
 130                  * write in the running transaction or waiting to be
 131                  * checkpointed. Thus calling jbd2_journal_invalidatepage()
 132                  * (via truncate_inode_pages()) to discard these buffers can
 133                  * cause data loss. Also even if we did not discard these
 134                  * buffers, we would have no way to find them after the inode
 135                  * is reaped and thus user could see stale data if he tries to
 136                  * read them before the transaction is checkpointed. So be
 137                  * careful and force everything to disk here... We use
 138                  * ei->i_datasync_tid to store the newest transaction
 139                  * containing inode's data.
 140                  *
 141                  * Note that directories do not have this problem because they
 142                  * don't use page cache.
 143                  */
 144                 if (ext4_should_journal_data(inode) &&
 145                     (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
 146                         journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
 147                         tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
 148
 149                         jbd2_log_start_commit(journal, commit_tid);
 150                         jbd2_log_wait_commit(journal, commit_tid);
 151                         filemap_write_and_wait(&inode->i_data);
 152                 }
 153                 truncate_inode_pages(&inode->i_data, 0);
 154                 goto no_delete;
 155         }
 156
 157         if (!is_bad_inode(inode))
 158                 dquot_initialize(inode);
 159
 160         if (ext4_should_order_data(inode))
 161                 ext4_begin_ordered_truncate(inode, 0);
 162         truncate_inode_pages(&inode->i_data, 0);
 163
 164         if (is_bad_inode(inode))
 165                 goto no_delete;
 166
 167         handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
 168         if (IS_ERR(handle)) {
 169                 ext4_std_error(inode->i_sb, PTR_ERR(handle));
 170                 /*
 171                  * If we're going to skip the normal cleanup, we still need to
 172                  * make sure that the in-core orphan linked list is properly
 173                  * cleaned up.
 174                  */
 175                 ext4_orphan_del(NULL, inode);
 176                 goto no_delete;
 177         }
 178
 179         if (IS_SYNC(inode))
 180                 ext4_handle_sync(handle);
 181         inode->i_size = 0;
 182         err = ext4_mark_inode_dirty(handle, inode);
 183         if (err) {
 184                 ext4_warning(inode->i_sb,
 185                              "couldn't mark inode dirty (err %d)", err);
 186                 goto stop_handle;
 187         }
 188         if (inode->i_blocks)
 189                 ext4_truncate(inode);
 190
 191         /*
 192          * ext4_ext_truncate() doesn't reserve any slop when it
 193          * restarts journal transactions; therefore there may not be
 194          * enough credits left in the handle to remove the inode from
 195          * the orphan list and set the dtime field.
 196          */
 197         if (!ext4_handle_has_enough_credits(handle, 3)) {
 198                 err = ext4_journal_extend(handle, 3);
 199                 if (err > 0)
 200                         err = ext4_journal_restart(handle, 3);
 201                 if (err != 0) {
 202                         ext4_warning(inode->i_sb,
 203                                      "couldn't extend journal (err %d)", err);
 204                 stop_handle:
 205                         ext4_journal_stop(handle);
 206                         ext4_orphan_del(NULL, inode);
 207                         goto no_delete;
 208                 }
 209         }
 210
 211         /*
 212          * Kill off the orphan record which ext4_truncate created.
 213          * AKPM: I think this can be inside the above `if'.
 214          * Note that ext4_orphan_del() has to be able to cope with the
 215          * deletion of a non-existent orphan - this is because we don't
 216          * know if ext4_truncate() actually created an orphan record.
 217          * (Well, we could do this if we need to, but heck - it works)
 218          */
 219         ext4_orphan_del(handle, inode);
 220         EXT4_I(inode)->i_dtime  = get_seconds();
 221
 222         /*
 223          * One subtle ordering requirement: if anything has gone wrong
 224          * (transaction abort, IO errors, whatever), then we can still
 225          * do these next steps (the fs will already have been marked as
 226          * having errors), but we can't free the inode if the mark_dirty
 227          * fails.
 228          */
 229         if (ext4_mark_inode_dirty(handle, inode))
 230                 /* If that failed, just do the required in-core inode clear. */
 231                 ext4_clear_inode(inode);
 232         else
 233                 ext4_free_inode(handle, inode);
 234         ext4_journal_stop(handle);
 235         return;
 236 no_delete:
 237         ext4_clear_inode(inode);        /* We must guarantee clearing of inode... */
 238 }
 239
 240 #ifdef CONFIG_QUOTA
 241 qsize_t *ext4_get_reserved_space(struct inode *inode)
 242 {
 243         return &EXT4_I(inode)->i_reserved_quota;
 244 }
 245 #endif
 246
 247 /*
 248  * Calculate the number of metadata blocks need to reserve
 249  * to allocate a block located at @lblock
 250  */
 251 static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
 252 {
 253         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 254                 return ext4_ext_calc_metadata_amount(inode, lblock);
 255
 256         return ext4_ind_calc_metadata_amount(inode, lblock);
 257 }
 258
 259 /*
 260  * Called with i_data_sem down, which is important since we can call
 261  * ext4_discard_preallocations() from here.
 262  */
 263 void ext4_da_update_reserve_space(struct inode *inode,
 264                                         int used, int quota_claim)
 265 {
 266         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 267         struct ext4_inode_info *ei = EXT4_I(inode);
 268
 269         spin_lock(&ei->i_block_reservation_lock);
 270         trace_ext4_da_update_reserve_space(inode, used, quota_claim);
 271         if (unlikely(used > ei->i_reserved_data_blocks)) {
 272                 ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
 273                          "with only %d reserved data blocks\n",
 274                          __func__, inode->i_ino, used,
 275                          ei->i_reserved_data_blocks);
 276                 WARN_ON(1);
 277                 used = ei->i_reserved_data_blocks;
 278         }
 279
 280         /* Update per-inode reservations */
 281         ei->i_reserved_data_blocks -= used;
 282         ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
 283         percpu_counter_sub(&sbi->s_dirtyclusters_counter,
 284                            used + ei->i_allocated_meta_blocks);
 285         ei->i_allocated_meta_blocks = 0;
 286
 287         if (ei->i_reserved_data_blocks == 0) {
 288                 /*
 289                  * We can release all of the reserved metadata blocks
 290                  * only when we have written all of the delayed
 291                  * allocation blocks.
 292                  */
 293                 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
 294                                    ei->i_reserved_meta_blocks);
 295                 ei->i_reserved_meta_blocks = 0;
 296                 ei->i_da_metadata_calc_len = 0;
 297         }
 298         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 299
 300         /* Update quota subsystem for data blocks */
 301         if (quota_claim)
 302                 dquot_claim_block(inode, EXT4_C2B(sbi, used));
 303         else {
 304                 /*
 305                  * We did fallocate with an offset that is already delayed
 306                  * allocated. So on delayed allocated writeback we should
 307                  * not re-claim the quota for fallocated blocks.
 308                  */
 309                 dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
 310         }
 311
 312         /*
 313          * If we have done all the pending block allocations and if
 314          * there aren't any writers on the inode, we can discard the
 315          * inode's preallocations.
 316          */
 317         if ((ei->i_reserved_data_blocks == 0) &&
 318             (atomic_read(&inode->i_writecount) == 0))
 319                 ext4_discard_preallocations(inode);
 320 }
 321
 322 static int __check_block_validity(struct inode *inode, const char *func,
 323                                 unsigned int line,
 324                                 struct ext4_map_blocks *map)
 325 {
 326         if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
 327                                    map->m_len)) {
 328                 ext4_error_inode(inode, func, line, map->m_pblk,
 329                                  "lblock %lu mapped to illegal pblock "
 330                                  "(length %d)", (unsigned long) map->m_lblk,
 331                                  map->m_len);
 332                 return -EIO;
 333         }
 334         return 0;
 335 }
 336
 337 #define check_block_validity(inode, map)        \
 338         __check_block_validity((inode), __func__, __LINE__, (map))
 339
 340 /*
 341  * Return the number of contiguous dirty pages in a given inode
 342  * starting at page frame idx.
 343  */
 344 static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 345                                     unsigned int max_pages)
 346 {
 347         struct address_space *mapping = inode->i_mapping;
 348         pgoff_t index;
 349         struct pagevec pvec;
 350         pgoff_t num = 0;
 351         int i, nr_pages, done = 0;
 352
 353         if (max_pages == 0)
 354                 return 0;
 355         pagevec_init(&pvec, 0);
 356         while (!done) {
 357                 index = idx;
 358                 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 359                                               PAGECACHE_TAG_DIRTY,
 360                                               (pgoff_t)PAGEVEC_SIZE);
 361                 if (nr_pages == 0)
 362                         break;
 363                 for (i = 0; i < nr_pages; i++) {
 364                         struct page *page = pvec.pages[i];
 365                         struct buffer_head *bh, *head;
 366
 367                         lock_page(page);
 368                         if (unlikely(page->mapping != mapping) ||
 369                             !PageDirty(page) ||
 370                             PageWriteback(page) ||
 371                             page->index != idx) {
 372                                 done = 1;
 373                                 unlock_page(page);
 374                                 break;
 375                         }
 376                         if (page_has_buffers(page)) {
 377                                 bh = head = page_buffers(page);
 378                                 do {
 379                                         if (!buffer_delay(bh) &&
 380                                             !buffer_unwritten(bh))
 381                                                 done = 1;
 382                                         bh = bh->b_this_page;
 383                                 } while (!done && (bh != head));
 384                         }
 385                         unlock_page(page);
 386                         if (done)
 387                                 break;
 388                         idx++;
 389                         num++;
 390                         if (num >= max_pages) {
 391                                 done = 1;
 392                                 break;
 393                         }
 394                 }
 395                 pagevec_release(&pvec);
 396         }
 397         return num;
 398 }
 399
 400 /*
 401  * The ext4_map_blocks() function tries to look up the requested blocks,
 402  * and returns if the blocks are already mapped.
 403  *
 404  * Otherwise it takes the write lock of the i_data_sem and allocate blocks
 405  * and store the allocated blocks in the result buffer head and mark it
 406  * mapped.
 407  *
 408  * If file type is extents based, it will call ext4_ext_map_blocks(),
 409  * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
 410  * based files
 411  *
 412  * On success, it returns the number of blocks being mapped or allocate.
 413  * if create==0 and the blocks are pre-allocated and uninitialized block,
 414  * the result buffer head is unmapped. If the create ==1, it will make sure
 415  * the buffer head is mapped.
 416  *
 417  * It returns 0 if plain look up failed (blocks have not been allocated), in
 418  * that casem, buffer head is unmapped
 419  *
 420  * It returns the error in case of allocation failure.
 421  */
 422 int ext4_map_blocks(handle_t *handle, struct inode *inode,
 423                     struct ext4_map_blocks *map, int flags)
 424 {
 425         int retval;
 426
 427         map->m_flags = 0;
 428         ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
 429                   "logical block %lu\n", inode->i_ino, flags, map->m_len,
 430                   (unsigned long) map->m_lblk);
 431         /*
 432          * Try to see if we can get the block without requesting a new
 433          * file system block.
 434          */
 435         down_read((&EXT4_I(inode)->i_data_sem));
 436         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 437                 retval = ext4_ext_map_blocks(handle, inode, map, 0);
 438         } else {
 439                 retval = ext4_ind_map_blocks(handle, inode, map, 0);
 440         }
 441         up_read((&EXT4_I(inode)->i_data_sem));
 442
 443         if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
 444                 int ret = check_block_validity(inode, map);
 445                 if (ret != 0)
 446                         return ret;
 447         }
 448
 449         /* If it is only a block(s) look up */
 450         if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
 451                 return retval;
 452
 453         /*
 454          * Returns if the blocks have already allocated
 455          *
 456          * Note that if blocks have been preallocated
 457          * ext4_ext_get_block() returns th create = 0
 458          * with buffer head unmapped.
 459          */
 460         if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
 461                 return retval;
 462
 463         /*
 464          * When we call get_blocks without the create flag, the
 465          * BH_Unwritten flag could have gotten set if the blocks
 466          * requested were part of a uninitialized extent.  We need to
 467          * clear this flag now that we are committed to convert all or
 468          * part of the uninitialized extent to be an initialized
 469          * extent.  This is because we need to avoid the combination
 470          * of BH_Unwritten and BH_Mapped flags being simultaneously
 471          * set on the buffer_head.
 472          */
 473         map->m_flags &= ~EXT4_MAP_UNWRITTEN;
 474
 475         /*
 476          * New blocks allocate and/or writing to uninitialized extent
 477          * will possibly result in updating i_data, so we take
 478          * the write lock of i_data_sem, and call get_blocks()
 479          * with create == 1 flag.
 480          */
 481         down_write((&EXT4_I(inode)->i_data_sem));
 482
 483         /*
 484          * if the caller is from delayed allocation writeout path
 485          * we have already reserved fs blocks for allocation
 486          * let the underlying get_block() function know to
 487          * avoid double accounting
 488          */
 489         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 490                 ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 491         /*
 492          * We need to check for EXT4 here because migrate
 493          * could have changed the inode type in between
 494          */
 495         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 496                 retval = ext4_ext_map_blocks(handle, inode, map, flags);
 497         } else {
 498                 retval = ext4_ind_map_blocks(handle, inode, map, flags);
 499
 500                 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
 501                         /*
 502                          * We allocated new blocks which will result in
 503                          * i_data's format changing.  Force the migrate
 504                          * to fail by clearing migrate flags
 505                          */
 506                         ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
 507                 }
 508
 509                 /*
 510                  * Update reserved blocks/metadata blocks after successful
 511                  * block allocation which had been deferred till now. We don't
 512                  * support fallocate for non extent files. So we can update
 513                  * reserve space here.
 514                  */
 515                 if ((retval > 0) &&
 516                         (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
 517                         ext4_da_update_reserve_space(inode, retval, 1);
 518         }
 519         if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 520                 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 521
 522         up_write((&EXT4_I(inode)->i_data_sem));
 523         if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
 524                 int ret = check_block_validity(inode, map);
 525                 if (ret != 0)
 526                         return ret;
 527         }
 528         return retval;
 529 }
 530
 531 /* Maximum number of blocks we map for direct IO at once. */
 532 #define DIO_MAX_BLOCKS 4096
 533
 534 static int _ext4_get_block(struct inode *inode, sector_t iblock,
 535                            struct buffer_head *bh, int flags)
 536 {
 537         handle_t *handle = ext4_journal_current_handle();
 538         struct ext4_map_blocks map;
 539         int ret = 0, started = 0;
 540         int dio_credits;
 541
 542         map.m_lblk = iblock;
 543         map.m_len = bh->b_size >> inode->i_blkbits;
 544
 545         if (flags && !handle) {
 546                 /* Direct IO write... */
 547                 if (map.m_len > DIO_MAX_BLOCKS)
 548                         map.m_len = DIO_MAX_BLOCKS;
 549                 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
 550                 handle = ext4_journal_start(inode, dio_credits);
 551                 if (IS_ERR(handle)) {
 552                         ret = PTR_ERR(handle);
 553                         return ret;
 554                 }
 555                 started = 1;
 556         }
 557
 558         ret = ext4_map_blocks(handle, inode, &map, flags);
 559         if (ret > 0) {
 560                 map_bh(bh, inode->i_sb, map.m_pblk);
 561                 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
 562                 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
 563                 ret = 0;
 564         }
 565         if (started)
 566                 ext4_journal_stop(handle);
 567         return ret;
 568 }
 569
 570 int ext4_get_block(struct inode *inode, sector_t iblock,
 571                    struct buffer_head *bh, int create)
 572 {
 573         return _ext4_get_block(inode, iblock, bh,
 574                                create ? EXT4_GET_BLOCKS_CREATE : 0);
 575 }
 576
 577 /*
 578  * `handle' can be NULL if create is zero
 579  */
 580 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 581                                 ext4_lblk_t block, int create, int *errp)
 582 {
 583         struct ext4_map_blocks map;
 584         struct buffer_head *bh;
 585         int fatal = 0, err;
 586
 587         J_ASSERT(handle != NULL || create == 0);
 588
 589         map.m_lblk = block;
 590         map.m_len = 1;
 591         err = ext4_map_blocks(handle, inode, &map,
 592                               create ? EXT4_GET_BLOCKS_CREATE : 0);
 593
 594         if (err < 0)
 595                 *errp = err;
 596         if (err <= 0)
 597                 return NULL;
 598         *errp = 0;
 599
 600         bh = sb_getblk(inode->i_sb, map.m_pblk);
 601         if (!bh) {
 602                 *errp = -EIO;
 603                 return NULL;
 604         }
 605         if (map.m_flags & EXT4_MAP_NEW) {
 606                 J_ASSERT(create != 0);
 607                 J_ASSERT(handle != NULL);
 608
 609                 /*
 610                  * Now that we do not always journal data, we should
 611                  * keep in mind whether this should always journal the
 612                  * new buffer as metadata.  For now, regular file
 613                  * writes use ext4_get_block instead, so it's not a
 614                  * problem.
 615                  */
 616                 lock_buffer(bh);
 617                 BUFFER_TRACE(bh, "call get_create_access");
 618                 fatal = ext4_journal_get_create_access(handle, bh);
 619                 if (!fatal && !buffer_uptodate(bh)) {
 620                         memset(bh->b_data, 0, inode->i_sb->s_blocksize);
 621                         set_buffer_uptodate(bh);
 622                 }
 623                 unlock_buffer(bh);
 624                 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
 625                 err = ext4_handle_dirty_metadata(handle, inode, bh);
 626                 if (!fatal)
 627                         fatal = err;
 628         } else {
 629                 BUFFER_TRACE(bh, "not a new buffer");
 630         }
 631         if (fatal) {
 632                 *errp = fatal;
 633                 brelse(bh);
 634                 bh = NULL;
 635         }
 636         return bh;
 637 }
 638
 639 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
 640                                ext4_lblk_t block, int create, int *err)
 641 {
 642         struct buffer_head *bh;
 643
 644         bh = ext4_getblk(handle, inode, block, create, err);
 645         if (!bh)
 646                 return bh;
 647         if (buffer_uptodate(bh))
 648                 return bh;
 649         ll_rw_block(READ_META, 1, &bh);
 650         wait_on_buffer(bh);
 651         if (buffer_uptodate(bh))
 652                 return bh;
 653         put_bh(bh);
 654         *err = -EIO;
 655         return NULL;
 656 }
 657
 658 static int walk_page_buffers(handle_t *handle,
 659                              struct buffer_head *head,
 660                              unsigned from,
 661                              unsigned to,
 662                              int *partial,
 663                              int (*fn)(handle_t *handle,
 664                                        struct buffer_head *bh))
 665 {
 666         struct buffer_head *bh;
 667         unsigned block_start, block_end;
 668         unsigned blocksize = head->b_size;
 669         int err, ret = 0;
 670         struct buffer_head *next;
 671
 672         for (bh = head, block_start = 0;
 673              ret == 0 && (bh != head || !block_start);
 674              block_start = block_end, bh = next) {
 675                 next = bh->b_this_page;
 676                 block_end = block_start + blocksize;
 677                 if (block_end <= from || block_start >= to) {
 678                         if (partial && !buffer_uptodate(bh))
 679                                 *partial = 1;
 680                         continue;
 681                 }
 682                 err = (*fn)(handle, bh);
 683                 if (!ret)
 684                         ret = err;
 685         }
 686         return ret;
 687 }
 688
 689 /*
 690  * To preserve ordering, it is essential that the hole instantiation and
 691  * the data write be encapsulated in a single transaction.  We cannot
 692  * close off a transaction and start a new one between the ext4_get_block()
 693  * and the commit_write().  So doing the jbd2_journal_start at the start of
 694  * prepare_write() is the right place.
 695  *
 696  * Also, this function can nest inside ext4_writepage() ->
 697  * block_write_full_page(). In that case, we *know* that ext4_writepage()
 698  * has generated enough buffer credits to do the whole page.  So we won't
 699  * block on the journal in that case, which is good, because the caller may
 700  * be PF_MEMALLOC.
 701  *
 702  * By accident, ext4 can be reentered when a transaction is open via
 703  * quota file writes.  If we were to commit the transaction while thus
 704  * reentered, there can be a deadlock - we would be holding a quota
 705  * lock, and the commit would never complete if another thread had a
 706  * transaction open and was blocking on the quota lock - a ranking
 707  * violation.
 708  *
 709  * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
 710  * will _not_ run commit under these circumstances because handle->h_ref
 711  * is elevated.  We'll still have enough credits for the tiny quotafile
 712  * write.
 713  */
 714 static int do_journal_get_write_access(handle_t *handle,
 715                                        struct buffer_head *bh)
 716 {
 717         int dirty = buffer_dirty(bh);
 718         int ret;
 719
 720         if (!buffer_mapped(bh) || buffer_freed(bh))
 721                 return 0;
 722         /*
 723          * __block_write_begin() could have dirtied some buffers. Clean
 724          * the dirty bit as jbd2_journal_get_write_access() could complain
 725          * otherwise about fs integrity issues. Setting of the dirty bit
 726          * by __block_write_begin() isn't a real problem here as we clear
 727          * the bit before releasing a page lock and thus writeback cannot
 728          * ever write the buffer.
 729          */
 730         if (dirty)
 731                 clear_buffer_dirty(bh);
 732         ret = ext4_journal_get_write_access(handle, bh);
 733         if (!ret && dirty)
 734                 ret = ext4_handle_dirty_metadata(handle, NULL, bh);
 735         return ret;
 736 }
 737
  /*
   * Forward declaration: ext4_write_begin() below passes this as the
   * get_block callback to __block_write_begin() when
   * ext4_should_dioread_nolock() is true for the inode.
   */
  738 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
  739                    struct buffer_head *bh_result, int create);
 740 static int ext4_write_begin(struct file *file, struct address_space *mapping,
 741                             loff_t pos, unsigned len, unsigned flags,
 742                             struct page **pagep, void **fsdata)
 743 {
 744         struct inode *inode = mapping->host;
 745         int ret, needed_blocks;
 746         handle_t *handle;
 747         int retries = 0;
 748         struct page *page;
 749         pgoff_t index;
 750         unsigned from, to;
 751
 752         trace_ext4_write_begin(inode, pos, len, flags);
 753         /*
 754          * Reserve one block more for addition to orphan list in case
 755          * we allocate blocks but write fails for some reason
 756          */
 757         needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
 758         index = pos >> PAGE_CACHE_SHIFT;
 759         from = pos & (PAGE_CACHE_SIZE - 1);
 760         to = from + len;
 761
 762 retry:
 763         handle = ext4_journal_start(inode, needed_blocks);
 764         if (IS_ERR(handle)) {
 765                 ret = PTR_ERR(handle);
 766                 goto out;
 767         }
 768
 769         /* We cannot recurse into the filesystem as the transaction is already
 770          * started */
 771         flags |= AOP_FLAG_NOFS;
 772
 773         page = grab_cache_page_write_begin(mapping, index, flags);
 774         if (!page) {
 775                 ext4_journal_stop(handle);
 776                 ret = -ENOMEM;
 777                 goto out;
 778         }
 779         *pagep = page;
 780
 781         if (ext4_should_dioread_nolock(inode))
 782                 ret = __block_write_begin(page, pos, len, ext4_get_block_write);
 783         else
 784                 ret = __block_write_begin(page, pos, len, ext4_get_block);
 785
 786         if (!ret && ext4_should_journal_data(inode)) {
 787                 ret = walk_page_buffers(handle, page_buffers(page),
 788                                 from, to, NULL, do_journal_get_write_access);
 789         }
 790
 791         if (ret) {
 792                 unlock_page(page);
 793                 page_cache_release(page);
 794                 /*
 795                  * __block_write_begin may have instantiated a few blocks
 796                  * outside i_size.  Trim these off again. Don't need
 797                  * i_size_read because we hold i_mutex.
 798                  *
 799                  * Add inode to orphan list in case we crash before
 800                  * truncate finishes
 801                  */
 802                 if (pos + len > inode->i_size && ext4_can_truncate(inode))
 803                         ext4_orphan_add(handle, inode);
 804
 805                 ext4_journal_stop(handle);
 806                 if (pos + len > inode->i_size) {
 807                         ext4_truncate_failed_write(inode);
 808                         /*
 809                          * If truncate failed early the inode might
 810                          * still be on the orphan list; we need to
 811                          * make sure the inode is removed from the
 812                          * orphan list in that case.
 813                          */
 814                         if (inode->i_nlink)
 815                                 ext4_orphan_del(NULL, inode);
 816                 }
 817         }
 818
 819         if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 820                 goto retry;
 821 out:
 822         return ret;
 823 }
 824
 825 /* For write_end() in data=journal mode */
 826 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 827 {
 828         if (!buffer_mapped(bh) || buffer_freed(bh))
 829                 return 0;
 830         set_buffer_uptodate(bh);
 831         return ext4_handle_dirty_metadata(handle, NULL, bh);
 832 }
 833
 834 static int ext4_generic_write_end(struct file *file,
 835                                   struct address_space *mapping,
 836                                   loff_t pos, unsigned len, unsigned copied,
 837                                   struct page *page, void *fsdata)
 838 {
 839         int i_size_changed = 0;
 840         struct inode *inode = mapping->host;
 841         handle_t *handle = ext4_journal_current_handle();
 842
 843         copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
 844
 845         /*
 846          * No need to use i_size_read() here, the i_size
 847          * cannot change under us because we hold i_mutex.
 848          *
 849          * But it's important to update i_size while still holding page lock:
 850          * page writeout could otherwise come in and zero beyond i_size.
 851          */
 852         if (pos + copied > inode->i_size) {
 853                 i_size_write(inode, pos + copied);
 854                 i_size_changed = 1;
 855         }
 856
 857         if (pos + copied >  EXT4_I(inode)->i_disksize) {
 858                 /* We need to mark inode dirty even if
 859                  * new_i_size is less that inode->i_size
 860                  * bu greater than i_disksize.(hint delalloc)
 861                  */
 862                 ext4_update_i_disksize(inode, (pos + copied));
 863                 i_size_changed = 1;
 864         }
 865         unlock_page(page);
 866         page_cache_release(page);
 867
 868         /*
 869          * Don't mark the inode dirty under page lock. First, it unnecessarily
 870          * makes the holding time of page lock longer. Second, it forces lock
 871          * ordering of page lock and transaction start for journaling
 872          * filesystems.
 873          */
 874         if (i_size_changed)
 875                 ext4_mark_inode_dirty(handle, inode);
 876
 877         return copied;
 878 }
 879
 880 /*
 881  * We need to pick up the new inode size which generic_commit_write gave us
 882  * `file' can be NULL - eg, when called from page_symlink().
 883  *
 884  * ext4 never places buffers on inode->i_mapping->private_list.  metadata
 885  * buffers are managed internally.
 886  */
 887 static int ext4_ordered_write_end(struct file *file,
 888                                   struct address_space *mapping,
 889                                   loff_t pos, unsigned len, unsigned copied,
 890                                   struct page *page, void *fsdata)
 891 {
 892         handle_t *handle = ext4_journal_current_handle();
 893         struct inode *inode = mapping->host;
 894         int ret = 0, ret2;
 895
 896         trace_ext4_ordered_write_end(inode, pos, len, copied);
 897         ret = ext4_jbd2_file_inode(handle, inode);
 898
 899         if (ret == 0) {
 900                 ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
 901                                                         page, fsdata);
 902                 copied = ret2;
 903                 if (pos + len > inode->i_size && ext4_can_truncate(inode))
 904                         /* if we have allocated more blocks and copied
 905                          * less. We will have blocks allocated outside
 906                          * inode->i_size. So truncate them
 907                          */
 908                         ext4_orphan_add(handle, inode);
 909                 if (ret2 < 0)
 910                         ret = ret2;
 911         }
 912         ret2 = ext4_journal_stop(handle);
 913         if (!ret)
 914                 ret = ret2;
 915
 916         if (pos + len > inode->i_size) {
 917                 ext4_truncate_failed_write(inode);
 918                 /*
 919                  * If truncate failed early the inode might still be
 920                  * on the orphan list; we need to make sure the inode
 921                  * is removed from the orphan list in that case.
 922                  */
 923                 if (inode->i_nlink)
 924                         ext4_orphan_del(NULL, inode);
 925         }
 926
 927
 928         return ret ? ret : copied;
 929 }
 930
 931 static int ext4_writeback_write_end(struct file *file,
 932                                     struct address_space *mapping,
 933                                     loff_t pos, unsigned len, unsigned copied,
 934                                     struct page *page, void *fsdata)
 935 {
 936         handle_t *handle = ext4_journal_current_handle();
 937         struct inode *inode = mapping->host;
 938         int ret = 0, ret2;
 939
 940         trace_ext4_writeback_write_end(inode, pos, len, copied);
 941         ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
 942                                                         page, fsdata);
 943         copied = ret2;
 944         if (pos + len > inode->i_size && ext4_can_truncate(inode))
 945                 /* if we have allocated more blocks and copied
 946                  * less. We will have blocks allocated outside
 947                  * inode->i_size. So truncate them
 948                  */
 949                 ext4_orphan_add(handle, inode);
 950
 951         if (ret2 < 0)
 952                 ret = ret2;
 953
 954         ret2 = ext4_journal_stop(handle);
 955         if (!ret)
 956                 ret = ret2;
 957
 958         if (pos + len > inode->i_size) {
 959                 ext4_truncate_failed_write(inode);
 960                 /*
 961                  * If truncate failed early the inode might still be
 962                  * on the orphan list; we need to make sure the inode
 963                  * is removed from the orphan list in that case.
 964                  */
 965                 if (inode->i_nlink)
 966                         ext4_orphan_del(NULL, inode);
 967         }
 968
 969         return ret ? ret : copied;
 970 }
 971
 972 static int ext4_journalled_write_end(struct file *file,
 973                                      struct address_space *mapping,
 974                                      loff_t pos, unsigned len, unsigned copied,
 975                                      struct page *page, void *fsdata)
 976 {
 977         handle_t *handle = ext4_journal_current_handle();
 978         struct inode *inode = mapping->host;
 979         int ret = 0, ret2;
 980         int partial = 0;
 981         unsigned from, to;
 982         loff_t new_i_size;
 983
 984         trace_ext4_journalled_write_end(inode, pos, len, copied);
 985         from = pos & (PAGE_CACHE_SIZE - 1);
 986         to = from + len;
 987
 988         BUG_ON(!ext4_handle_valid(handle));
 989
 990         if (copied < len) {
 991                 if (!PageUptodate(page))
 992                         copied = 0;
 993                 page_zero_new_buffers(page, from+copied, to);
 994         }
 995
 996         ret = walk_page_buffers(handle, page_buffers(page), from,
 997                                 to, &partial, write_end_fn);
 998         if (!partial)
 999                 SetPageUptodate(page);
1000         new_i_size = pos + copied;
1001         if (new_i_size > inode->i_size)
1002                 i_size_write(inode, pos+copied);
1003         ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1004         EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
1005         if (new_i_size > EXT4_I(inode)->i_disksize) {
1006                 ext4_update_i_disksize(inode, new_i_size);
1007                 ret2 = ext4_mark_inode_dirty(handle, inode);
1008                 if (!ret)
1009                         ret = ret2;
1010         }
1011
1012         unlock_page(page);
1013         page_cache_release(page);
1014         if (pos + len > inode->i_size && ext4_can_truncate(inode))
1015                 /* if we have allocated more blocks and copied
1016                  * less. We will have blocks allocated outside
1017                  * inode->i_size. So truncate them
1018                  */
1019                 ext4_orphan_add(handle, inode);
1020
1021         ret2 = ext4_journal_stop(handle);
1022         if (!ret)
1023                 ret = ret2;
1024         if (pos + len > inode->i_size) {
1025                 ext4_truncate_failed_write(inode);
1026                 /*
1027                  * If truncate failed early the inode might still be
1028                  * on the orphan list; we need to make sure the inode
1029                  * is removed from the orphan list in that case.
1030                  */
1031                 if (inode->i_nlink)
1032                         ext4_orphan_del(NULL, inode);
1033         }
1034
1035         return ret ? ret : copied;
1036 }
1037
1038 /*
1039  * Reserve a single cluster located at lblock
1040  */
1041 int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1042 {
1043         int retries = 0;
1044         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1045         struct ext4_inode_info *ei = EXT4_I(inode);
1046         unsigned int md_needed;
1047         int ret;
1048
1049         /*
1050          * recalculate the amount of metadata blocks to reserve
1051          * in order to allocate nrblocks
1052          * worse case is one extent per block
1053          */
1054 repeat:
1055         spin_lock(&ei->i_block_reservation_lock);
1056         md_needed = EXT4_NUM_B2C(sbi,
1057                                  ext4_calc_metadata_amount(inode, lblock));
1058         trace_ext4_da_reserve_space(inode, md_needed);
1059         spin_unlock(&ei->i_block_reservation_lock);
1060
1061         /*
1062          * We will charge metadata quota at writeout time; this saves
1063          * us from metadata over-estimation, though we may go over by
1064          * a small amount in the end.  Here we just reserve for data.
1065          */
1066         ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
1067         if (ret)
1068                 return ret;
1069         /*
1070          * We do still charge estimated metadata to the sb though;
1071          * we cannot afford to run out of free blocks.
1072          */
1073         if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) {
1074                 dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
1075                 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1076                         yield();
1077                         goto repeat;
1078                 }
1079                 return -ENOSPC;
1080         }
1081         spin_lock(&ei->i_block_reservation_lock);
1082         ei->i_reserved_data_blocks++;
1083         ei->i_reserved_meta_blocks += md_needed;
1084         spin_unlock(&ei->i_block_reservation_lock);
1085
1086         return 0;       /* success */
1087 }
1088
1089 static void ext4_da_release_space(struct inode *inode, int to_free)
1090 {
1091         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1092         struct ext4_inode_info *ei = EXT4_I(inode);
1093
1094         if (!to_free)
1095                 return;         /* Nothing to release, exit */
1096
1097         spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1098
1099         trace_ext4_da_release_space(inode, to_free);
1100         if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1101                 /*
1102                  * if there aren't enough reserved blocks, then the
1103                  * counter is messed up somewhere.  Since this
1104                  * function is called from invalidate page, it's
1105                  * harmless to return without any action.
1106                  */
1107                 ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
1108                          "ino %lu, to_free %d with only %d reserved "
1109                          "data blocks\n", inode->i_ino, to_free,
1110                          ei->i_reserved_data_blocks);
1111                 WARN_ON(1);
1112                 to_free = ei->i_reserved_data_blocks;
1113         }
1114         ei->i_reserved_data_blocks -= to_free;
1115
1116         if (ei->i_reserved_data_blocks == 0) {
1117                 /*
1118                  * We can release all of the reserved metadata blocks
1119                  * only when we have written all of the delayed
1120                  * allocation blocks.
1121                  * Note that in case of bigalloc, i_reserved_meta_blocks,
1122                  * i_reserved_data_blocks, etc. refer to number of clusters.
1123                  */
1124                 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
1125                                    ei->i_reserved_meta_blocks);
1126                 ei->i_reserved_meta_blocks = 0;
1127                 ei->i_da_metadata_calc_len = 0;
1128         }
1129
1130         /* update fs dirty data blocks counter */
1131         percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
1132
1133         spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1134
1135         dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
1136 }
1137
1138 static void ext4_da_page_release_reservation(struct page *page,
1139                                              unsigned long offset)
1140 {
1141         int to_release = 0;
1142         struct buffer_head *head, *bh;
1143         unsigned int curr_off = 0;
1144         struct inode *inode = page->mapping->host;
1145         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1146         int num_clusters;
1147
1148         head = page_buffers(page);
1149         bh = head;
1150         do {
1151                 unsigned int next_off = curr_off + bh->b_size;
1152
1153                 if ((offset <= curr_off) && (buffer_delay(bh))) {
1154                         to_release++;
1155                         clear_buffer_delay(bh);
1156                 }
1157                 curr_off = next_off;
1158         } while ((bh = bh->b_this_page) != head);
1159
1160         /* If we have released all the blocks belonging to a cluster, then we
1161          * need to release the reserved space for that cluster. */
1162         num_clusters = EXT4_NUM_B2C(sbi, to_release);
1163         while (num_clusters > 0) {
1164                 ext4_fsblk_t lblk;
1165                 lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
1166                         ((num_clusters - 1) << sbi->s_cluster_bits);
1167                 if (sbi->s_cluster_ratio == 1 ||
1168                     !ext4_find_delalloc_cluster(inode, lblk, 1))
1169                         ext4_da_release_space(inode, 1);
1170
1171                 num_clusters--;
1172         }
1173 }
1174
1175 /*
1176  * Delayed allocation stuff
1177  */
1178
1179 /*
1180  * mpage_da_submit_io - walks through extent of pages and try to write
1181  * them with writepage() call back
1182  *
1183  * @mpd->inode: inode
1184  * @mpd->first_page: first page of the extent
1185  * @mpd->next_page: page after the last page of the extent
1186  *
1187  * By the time mpage_da_submit_io() is called we expect all blocks
1188  * to be allocated. this may be wrong if allocation failed.
1189  *
1190  * As pages are already locked by write_cache_pages(), we can't use it
1191  */
1192 static int mpage_da_submit_io(struct mpage_da_data *mpd,
1193                               struct ext4_map_blocks *map)
1194 {
1195         struct pagevec pvec;
1196         unsigned long index, end;
1197         int ret = 0, err, nr_pages, i;
1198         struct inode *inode = mpd->inode;
1199         struct address_space *mapping = inode->i_mapping;
1200         loff_t size = i_size_read(inode);
1201         unsigned int len, block_start;
1202         struct buffer_head *bh, *page_bufs = NULL;
1203         int journal_data = ext4_should_journal_data(inode);
1204         sector_t pblock = 0, cur_logical = 0;
1205         struct ext4_io_submit io_submit;
1206
1207         BUG_ON(mpd->next_page <= mpd->first_page);
1208         memset(&io_submit, 0, sizeof(io_submit));
1209         /*
1210          * We need to start from the first_page to the next_page - 1
1211          * to make sure we also write the mapped dirty buffer_heads.
1212          * If we look at mpd->b_blocknr we would only be looking
1213          * at the currently mapped buffer_heads.
1214          */
1215         index = mpd->first_page;
1216         end = mpd->next_page - 1;
1217
1218         pagevec_init(&pvec, 0);
1219         while (index <= end) {
1220                 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1221                 if (nr_pages == 0)
1222                         break;
1223                 for (i = 0; i < nr_pages; i++) {
1224                         int commit_write = 0, skip_page = 0;
1225                         struct page *page = pvec.pages[i];
1226
1227                         index = page->index;
1228                         if (index > end)
1229                                 break;
1230
1231                         if (index == size >> PAGE_CACHE_SHIFT)
1232                                 len = size & ~PAGE_CACHE_MASK;
1233                         else
1234                                 len = PAGE_CACHE_SIZE;
1235                         if (map) {
1236                                 cur_logical = index << (PAGE_CACHE_SHIFT -
1237                                                         inode->i_blkbits);
1238                                 pblock = map->m_pblk + (cur_logical -
1239                                                         map->m_lblk);
1240                         }
1241                         index++;
1242
1243                         BUG_ON(!PageLocked(page));
1244                         BUG_ON(PageWriteback(page));
1245
1246                         /*
1247                          * If the page does not have buffers (for
1248                          * whatever reason), try to create them using
1249                          * __block_write_begin.  If this fails,
1250                          * skip the page and move on.
1251                          */
1252                         if (!page_has_buffers(page)) {
1253                                 if (__block_write_begin(page, 0, len,
1254                                                 noalloc_get_block_write)) {
1255                                 skip_page:
1256                                         unlock_page(page);
1257                                         continue;
1258                                 }
1259                                 commit_write = 1;
1260                         }
1261
1262                         bh = page_bufs = page_buffers(page);
1263                         block_start = 0;
1264                         do {
1265                                 if (!bh)
1266                                         goto skip_page;
1267                                 if (map && (cur_logical >= map->m_lblk) &&
1268                                     (cur_logical <= (map->m_lblk +
1269                                                      (map->m_len - 1)))) {
1270                                         if (buffer_delay(bh)) {
1271                                                 clear_buffer_delay(bh);
1272                                                 bh->b_blocknr = pblock;
1273                                         }
1274                                         if (buffer_unwritten(bh) ||
1275                                             buffer_mapped(bh))
1276                                                 BUG_ON(bh->b_blocknr != pblock);
1277                                         if (map->m_flags & EXT4_MAP_UNINIT)
1278                                                 set_buffer_uninit(bh);
1279                                         clear_buffer_unwritten(bh);
1280                                 }
1281
1282                                 /* skip page if block allocation undone */
1283                                 if (buffer_delay(bh) || buffer_unwritten(bh))
1284                                         skip_page = 1;
1285                                 bh = bh->b_this_page;
1286                                 block_start += bh->b_size;
1287                                 cur_logical++;
1288                                 pblock++;
1289                         } while (bh != page_bufs);
1290
1291                         if (skip_page)
1292                                 goto skip_page;
1293
1294                         if (commit_write)
1295                                 /* mark the buffer_heads as dirty & uptodate */
1296                                 block_commit_write(page, 0, len);
1297
1298                         clear_page_dirty_for_io(page);
1299                         /*
1300                          * Delalloc doesn't support data journalling,
1301                          * but eventually maybe we'll lift this
1302                          * restriction.
1303                          */
1304                         if (unlikely(journal_data && PageChecked(page)))
1305                                 err = __ext4_journalled_writepage(page, len);
1306                         else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
1307                                 err = ext4_bio_write_page(&io_submit, page,
1308                                                           len, mpd->wbc);
1309                         else if (buffer_uninit(page_bufs)) {
1310                                 ext4_set_bh_endio(page_bufs, inode);
1311                                 err = block_write_full_page_endio(page,
1312                                         noalloc_get_block_write,
1313                                         mpd->wbc, ext4_end_io_buffer_write);
1314                         } else
1315                                 err = block_write_full_page(page,
1316                                         noalloc_get_block_write, mpd->wbc);
1317
1318                         if (!err)
1319                                 mpd->pages_written++;
1320                         /*
1321                          * In error case, we have to continue because
1322                          * remaining pages are still locked
1323                          */
1324                         if (ret == 0)
1325                                 ret = err;
1326                 }
1327                 pagevec_release(&pvec);
1328         }
1329         ext4_io_submit(&io_submit);
1330         return ret;
1331 }
1332
1333 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
1334 {
1335         int nr_pages, i;
1336         pgoff_t index, end;
1337         struct pagevec pvec;
1338         struct inode *inode = mpd->inode;
1339         struct address_space *mapping = inode->i_mapping;
1340
1341         index = mpd->first_page;
1342         end   = mpd->next_page - 1;
1343         while (index <= end) {
1344                 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1345                 if (nr_pages == 0)
1346                         break;
1347                 for (i = 0; i < nr_pages; i++) {
1348                         struct page *page = pvec.pages[i];
1349                         if (page->index > end)
1350                                 break;
1351                         BUG_ON(!PageLocked(page));
1352                         BUG_ON(PageWriteback(page));
1353                         block_invalidatepage(page, 0);
1354                         ClearPageUptodate(page);
1355                         unlock_page(page);
1356                 }
1357                 index = pvec.pages[nr_pages - 1]->index + 1;
1358                 pagevec_release(&pvec);
1359         }
1360         return;
1361 }
1362
1363 static void ext4_print_free_blocks(struct inode *inode)
1364 {
1365         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1366         printk(KERN_CRIT "Total free blocks count %lld\n",
1367                EXT4_C2B(EXT4_SB(inode->i_sb),
1368                         ext4_count_free_clusters(inode->i_sb)));
1369         printk(KERN_CRIT "Free/Dirty block details\n");
1370         printk(KERN_CRIT "free_blocks=%lld\n",
1371                (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
1372                 percpu_counter_sum(&sbi->s_freeclusters_counter)));
1373         printk(KERN_CRIT "dirty_blocks=%lld\n",
1374                (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
1375                 percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
1376         printk(KERN_CRIT "Block reservation details\n");
1377         printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
1378                EXT4_I(inode)->i_reserved_data_blocks);
1379         printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
1380                EXT4_I(inode)->i_reserved_meta_blocks);
1381         return;
1382 }
1383
1384 /*
1385  * mpage_da_map_and_submit - go through given space, map them
1386  *       if necessary, and then submit them for I/O
1387  *
1388  * @mpd - bh describing space
1389  *
1390  * The function skips space we know is already mapped to disk blocks.
1391  *
1392  */
1393 static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
1394 {
1395         int err, blks, get_blocks_flags;
1396         struct ext4_map_blocks map, *mapp = NULL;
1397         sector_t next = mpd->b_blocknr;
1398         unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
1399         loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
1400         handle_t *handle = NULL;
1401
1402         /*
1403          * If the blocks are mapped already, or we couldn't accumulate
1404          * any blocks, then proceed immediately to the submission stage.
1405          */
1406         if ((mpd->b_size == 0) ||
1407             ((mpd->b_state  & (1 << BH_Mapped)) &&
1408              !(mpd->b_state & (1 << BH_Delay)) &&
1409              !(mpd->b_state & (1 << BH_Unwritten))))
1410                 goto submit_io;
1411
1412         handle = ext4_journal_current_handle();
1413         BUG_ON(!handle);
1414
1415         /*
1416          * Call ext4_map_blocks() to allocate any delayed allocation
1417          * blocks, or to convert an uninitialized extent to be
1418          * initialized (in the case where we have written into
1419          * one or more preallocated blocks).
1420          *
1421          * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
1422          * indicate that we are on the delayed allocation path.  This
1423          * affects functions in many different parts of the allocation
1424          * call path.  This flag exists primarily because we don't
1425          * want to change *many* call functions, so ext4_map_blocks()
1426          * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
1427          * inode's allocation semaphore is taken.
1428          *
1429          * If the blocks in questions were delalloc blocks, set
1430          * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
1431          * variables are updated after the blocks have been allocated.
1432          */
1433         map.m_lblk = next;
1434         map.m_len = max_blocks;
1435         get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
1436         if (ext4_should_dioread_nolock(mpd->inode))
1437                 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
1438         if (mpd->b_state & (1 << BH_Delay))
1439                 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
1440
1441         blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
1442         if (blks < 0) {
1443                 struct super_block *sb = mpd->inode->i_sb;
1444
1445                 err = blks;
1446                 /*
1447                  * If get block returns EAGAIN or ENOSPC and there
1448                  * appears to be free blocks we will just let
1449                  * mpage_da_submit_io() unlock all of the pages.
1450                  */
1451                 if (err == -EAGAIN)
1452                         goto submit_io;
1453
1454                 if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
1455                         mpd->retval = err;
1456                         goto submit_io;
1457                 }
1458
1459                 /*
1460                  * get block failure will cause us to loop in
1461                  * writepages, because a_ops->writepage won't be able
1462                  * to make progress. The page will be redirtied by
1463                  * writepage and writepages will again try to write
1464                  * the same.
1465                  */
1466                 if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
1467                         ext4_msg(sb, KERN_CRIT,
1468                                  "delayed block allocation failed for inode %lu "
1469                                  "at logical offset %llu with max blocks %zd "
1470                                  "with error %d", mpd->inode->i_ino,
1471                                  (unsigned long long) next,
1472                                  mpd->b_size >> mpd->inode->i_blkbits, err);
1473                         ext4_msg(sb, KERN_CRIT,
1474                                 "This should not happen!! Data will be lost\n");
1475                         if (err == -ENOSPC)
1476                                 ext4_print_free_blocks(mpd->inode);
1477                 }
1478                 /* invalidate all the pages */
1479                 ext4_da_block_invalidatepages(mpd);
1480
1481                 /* Mark this page range as having been completed */
1482                 mpd->io_done = 1;
1483                 return;
1484         }
1485         BUG_ON(blks == 0);
1486
1487         mapp = &map;
1488         if (map.m_flags & EXT4_MAP_NEW) {
1489                 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
1490                 int i;
1491
1492                 for (i = 0; i < map.m_len; i++)
1493                         unmap_underlying_metadata(bdev, map.m_pblk + i);
1494
1495                 if (ext4_should_order_data(mpd->inode)) {
1496                         err = ext4_jbd2_file_inode(handle, mpd->inode);
1497                         if (err)
1498                                 /* Only if the journal is aborted */
1499                                 return;
1500                 }
1501         }
1502
1503         /*
1504          * Update on-disk size along with block allocation.
1505          */
1506         disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
1507         if (disksize > i_size_read(mpd->inode))
1508                 disksize = i_size_read(mpd->inode);
1509         if (disksize > EXT4_I(mpd->inode)->i_disksize) {
1510                 ext4_update_i_disksize(mpd->inode, disksize);
1511                 err = ext4_mark_inode_dirty(handle, mpd->inode);
1512                 if (err)
1513                         ext4_error(mpd->inode->i_sb,
1514                                    "Failed to mark inode %lu dirty",
1515                                    mpd->inode->i_ino);
1516         }
1517
1518 submit_io:
1519         mpage_da_submit_io(mpd, mapp);
1520         mpd->io_done = 1;
1521 }
1522
/*
 * Buffer-head state bits that are recorded in mpd->b_state and compared
 * when mpage_add_bh_to_extent() decides whether a block may be merged
 * into the extent being accumulated.
 */
1523 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
1524                 (1 << BH_Delay) | (1 << BH_Unwritten))
1525
1526 /*
1527  * mpage_add_bh_to_extent - try to add one more block to extent of blocks
1528  *
1529  * @mpd - extent being accumulated (mpd->b_blocknr, b_size, b_state)
1530  * @logical - logical number of the block in the file
1531  * @b_size - size in bytes of the buffer being added
 * @b_state - state bits of the buffer; only the BH_FLAGS bits are kept
1532  *
1533  * the function is used to collect contig. blocks in same state
 *
 * When the block cannot be merged (size limit reached, not contiguous,
 * or different state) the current extent is handed to
 * mpage_da_map_and_submit() and the new block is NOT recorded in @mpd.
1534  */
1535 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
1536                                    sector_t logical, size_t b_size,
1537                                    unsigned long b_state)
1538 {
1539         sector_t next;
        /* number of blocks already accumulated in the extent */
1540         int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
1541
1542         /*
1543          * XXX Don't go larger than mballoc is willing to allocate
1544          * This is a stopgap solution.  We eventually need to fold
1545          * mpage_da_submit_io() into this function and then call
1546          * ext4_map_blocks() multiple times in a loop
1547          */
1548         if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
1549                 goto flush_it;
1550
1551         /* check if the reserved journal credits might overflow */
1552         if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
1553                 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
1554                         /*
1555                          * With non-extent format we are limited by the journal
1556                          * credit available.  Total credit needed to insert
1557                          * nrblocks contiguous blocks is dependent on the
1558                          * nrblocks.  So limit nrblocks.
1559                          */
1560                         goto flush_it;
1561                 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
1562                                 EXT4_MAX_TRANS_DATA) {
1563                         /*
1564                          * Adding the new buffer_head would make it cross the
1565                          * allowed limit for which we have journal credit
1566                          * reserved. So limit the new bh->b_size
1567                          */
1568                         b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
1569                                                 mpd->inode->i_blkbits;
1570                         /* we will do mpage_da_submit_io in the next loop */
1571                 }
1572         }
1573         /*
1574          * First block in the extent
1575          */
1576         if (mpd->b_size == 0) {
1577                 mpd->b_blocknr = logical;
1578                 mpd->b_size = b_size;
1579                 mpd->b_state = b_state & BH_FLAGS;
1580                 return;
1581         }
1582
        /* first logical block past the end of the current extent */
1583         next = mpd->b_blocknr + nrblocks;
1584         /*
1585          * Can we merge the block to our big extent?
1586          */
1587         if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
1588                 mpd->b_size += b_size;
1589                 return;
1590         }
1591
1592 flush_it:
1593         /*
1594          * We couldn't merge the block to our extent, so we
1595          * need to flush current  extent and start new one
1596          */
1597         mpage_da_map_and_submit(mpd);
1598         return;
1599 }
1600
/*
 * Buffer predicate in the walk_page_buffers() callback form: returns
 * non-zero when @bh is dirty and is either delayed-allocated or
 * unwritten.  @handle is not used.
 */
1601 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
1602 {
1603         return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
1604 }
1605
1606 /*
1607  * This is a special get_block_t callback which is used by
1608  * ext4_da_write_begin().  It will either return mapped block or
1609  * reserve space for a single block.
1610  *
1611  * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
1612  * We also have b_blocknr = invalid_block (see below) and b_bdev
 * initialized properly.
1613  *
1614  * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
1615  * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
1616  * initialized properly.
 *
 * @inode - inode the block belongs to
 * @iblock - logical block number within the file
 * @bh - buffer head to fill in; b_size must equal the fs block size
 * @create - must be non-zero
 *
 * Returns 0 on success, or the error returned by ext4_map_blocks() or
 * ext4_da_reserve_space().
1617  */
1618 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1619                                   struct buffer_head *bh, int create)
1620 {
1621         struct ext4_map_blocks map;
1622         int ret = 0;
        /*
         * Placeholder block number stored in delayed buffers.  If the
         * filesystem has enough blocks that this value falls inside its
         * block range, ~0 is used instead.
         */
1623         sector_t invalid_block = ~((sector_t) 0xffff);
1624
1625         if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
1626                 invalid_block = ~0;
1627
1628         BUG_ON(create == 0);
1629         BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
1630
1631         map.m_lblk = iblock;
1632         map.m_len = 1;
1633
1634         /*
1635          * first, we need to know whether the block is allocated already
1636          * preallocated blocks are unmapped but should be treated
1637          * the same as allocated blocks.
1638          */
1639         ret = ext4_map_blocks(NULL, inode, &map, 0);
1640         if (ret < 0)
1641                 return ret;
1642         if (ret == 0) {
1643                 if (buffer_delay(bh))
1644                         return 0; /* Not sure this could or should happen */
1645                 /*
1646                  * XXX: __block_write_begin() unmaps passed block, is it OK?
1647                  */
1648                 /* If the block was allocated from previously allocated cluster,
1649                  * then we don't need to reserve it again. */
1650                 if (!(map.m_flags & EXT4_MAP_FROM_CLUSTER)) {
1651                         ret = ext4_da_reserve_space(inode, iblock);
1652                         if (ret)
1653                                 /* not enough space to reserve */
1654                                 return ret;
1655                 }
1656
1657                 map_bh(bh, inode->i_sb, invalid_block);
1658                 set_buffer_new(bh);
1659                 set_buffer_delay(bh);
1660                 return 0;
1661         }
1662
        /* Block is already mapped: copy the mapping and its flags into @bh. */
1663         map_bh(bh, inode->i_sb, map.m_pblk);
1664         bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
1665
1666         if (buffer_unwritten(bh)) {
1667                 /* A delayed write to unwritten bh should be marked
1668                  * new and mapped.  Mapped ensures that we don't do
1669                  * get_block multiple times when we write to the same
1670                  * offset and new ensures that we do proper zero out
1671                  * for partial write.
1672                  */
1673                 set_buffer_new(bh);
1674                 set_buffer_mapped(bh);
1675         }
1676         return 0;
1677 }
1678
1679 /*
1680  * This function is used as a standard get_block_t callback function
1681  * when there is no desire to allocate any blocks.  It is used as a
1682  * callback function for block_write_begin() and block_write_full_page().
1683  * These functions should only try to map a single block at a time.
1684  *
1685  * Since this function doesn't do block allocations even if the caller
1686  * requests it by passing in create=1, it is critically important that
1687  * any caller checks to make sure that any buffer heads returned
1688  * by this function are either all already mapped or marked for
1689  * delayed allocation before calling  block_write_full_page().  Otherwise,
1690  * b_blocknr could be left uninitialized, and the page write functions will
1691  * be taken by surprise.
1692  */
1693 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
1694                                    struct buffer_head *bh_result, int create)
1695 {
1696         BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
        /* @create is deliberately ignored: 0 is always passed down */
1697         return _ext4_get_block(inode, iblock, bh_result, 0);
1698 }
1699
/*
 * walk_page_buffers() callback: take a reference on @bh via get_bh().
 * @handle is not used.  Always returns 0.
 */
1700 static int bget_one(handle_t *handle, struct buffer_head *bh)
1701 {
1702         get_bh(bh);
1703         return 0;
1704 }
1705
/*
 * walk_page_buffers() callback: drop a reference on @bh via put_bh(),
 * the counterpart of bget_one().  @handle is not used.  Always returns 0.
 */
1706 static int bput_one(handle_t *handle, struct buffer_head *bh)
1707 {
1708         put_bh(bh);
1709         return 0;
1710 }
1711
/*
 * __ext4_journalled_writepage - journal the buffers of one page
 *
 * @page - locked page that has buffers attached; it is unlocked here
 * @len - number of bytes of the page, starting at offset 0, to process
 *
 * Called by ext4_writepage() for PageChecked pages when data is
 * journalled.  The buffers in [0, @len) are pinned with bget_one()
 * before the page is unlocked, then passed through
 * do_journal_get_write_access() and write_end_fn() under a freshly
 * started handle.  The buffer references are dropped on every exit
 * path, including failure to start the handle.
 *
 * Returns 0 on success, otherwise the first error encountered.
 */
1712 static int __ext4_journalled_writepage(struct page *page,
1713                                        unsigned int len)
1714 {
1715         struct address_space *mapping = page->mapping;
1716         struct inode *inode = mapping->host;
1717         struct buffer_head *page_bufs;
1718         handle_t *handle = NULL;
1719         int ret = 0;
1720         int err;
1721
1722         ClearPageChecked(page);
1723         page_bufs = page_buffers(page);
1724         BUG_ON(!page_bufs);
1725         walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
1726         /* As soon as we unlock the page, it can go away, but we have
1727          * references to buffers so we are safe */
1728         unlock_page(page);
1729
1730         handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
1731         if (IS_ERR(handle)) {
1732                 ret = PTR_ERR(handle);
                /* the buffer references taken above must still be dropped */
1733                 goto out;
1734         }
1735
1736         BUG_ON(!ext4_handle_valid(handle));
1737
1738         ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
1739                                 do_journal_get_write_access);
1740
1741         err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
1742                                 write_end_fn);
1743         if (ret == 0)
1744                 ret = err;
1745         EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
1746         err = ext4_journal_stop(handle);
1747         if (!ret)
1748                 ret = err;
1749
1750         ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1751 out:
        /*
         * Undo bget_one().  bput_one() ignores the handle argument, so
         * pass NULL rather than a stopped or ERR_PTR handle.
         */
1752         walk_page_buffers(NULL, page_bufs, 0, len, NULL, bput_one);
1753         return ret;
1754 }
1755
/* Forward declarations of helpers used by ext4_writepage() below. */
1756 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
1757 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
1758
1759 /*
1760  * Note that we don't need to start a transaction unless we're journaling data
1761  * because we should have holes filled from ext4_page_mkwrite(). We even don't
1762  * need to file the inode to the transaction's list in ordered mode because if
1763  * we are writing back data added by write(), the inode is already there and if
1764  * we are writing back data modified via mmap(), no one guarantees in which
1765  * transaction the data will hit the disk. In case we are journaling data, we
1766  * cannot start transaction directly because transaction start ranks above page
1767  * lock so we have to do some magic.
1768  *
1769  * This function can get called via...
1770  *   - ext4_da_writepages after taking page lock (have journal handle)
1771  *   - journal_submit_inode_data_buffers (no journal handle)
1772  *   - shrink_page_list via pdflush (no journal handle)
1773  *   - grab_page_cache when doing write_begin (have journal handle)
1774  *
1775  * We don't do any block allocation in this function. If we have page with
1776  * multiple blocks we need to write those buffer_heads that are mapped. This
1777  * is important for mmapped based write. So if we do with blocksize 1K
1778  * truncate(f, 1024);
1779  * a = mmap(f, 0, 4096);
1780  * a[0] = 'a';
1781  * truncate(f, 4096);
1782  * we have in the page first buffer_head mapped via page_mkwrite call back
1783  * but other buffer_heads would be unmapped but dirty (dirty done via the
1784  * do_wp_page). So writepage should write the first block. If we modify
1785  * the mmap area beyond 1024 we will again get a page_fault and the
1786  * page_mkwrite callback will do the block allocation and mark the
1787  * buffer_heads mapped.
1788  *
1789  * We redirty the page if any buffer_head in the page is either delay or
1790  * unwritten.
1791  *
1792  * We can get recursively called as shown below.
1793  *
1794  *      ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1795  *              ext4_writepage()
1796  *
1797  * But since we don't do any block allocation we should not deadlock.
1798  * Page also have the dirty flag cleared so we don't get recursive page_lock.
 *
 * @page - locked page to write
 * @wbc - writeback control, passed on to the block layer helpers
 *
 * Returns 0 when the page was redirtied instead of written; otherwise the
 * result of __ext4_journalled_writepage() or block_write_full_page*().
1799  */
1800 static int ext4_writepage(struct page *page,
1801                           struct writeback_control *wbc)
1802 {
1803         int ret = 0, commit_write = 0;
1804         loff_t size;
1805         unsigned int len;
1806         struct buffer_head *page_bufs = NULL;
1807         struct inode *inode = page->mapping->host;
1808
1809         trace_ext4_writepage(page);
1810         size = i_size_read(inode);
        /*
         * For the page that contains i_size, limit len to the in-page
         * offset of i_size; every other page is handled in full.
         */
1811         if (page->index == size >> PAGE_CACHE_SHIFT)
1812                 len = size & ~PAGE_CACHE_MASK;
1813         else
1814                 len = PAGE_CACHE_SIZE;
1815
1816         /*
1817          * If the page does not have buffers (for whatever reason),
1818          * try to create them using __block_write_begin.  If this
1819          * fails, redirty the page and move on.
1820          */
1821         if (!page_has_buffers(page)) {
1822                 if (__block_write_begin(page, 0, len,
1823                                         noalloc_get_block_write)) {
                /* also the target of the goto in the delay/unwritten check */
1824                 redirty_page:
1825                         redirty_page_for_writepage(wbc, page);
1826                         unlock_page(page);
1827                         return 0;
1828                 }
1829                 commit_write = 1;
1830         }
1831         page_bufs = page_buffers(page);
1832         if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
1833                               ext4_bh_delay_or_unwritten)) {
1834                 /*
1835                  * We don't want to do block allocation, so redirty
1836                  * the page and return.  We may reach here when we do
1837                  * a journal commit via journal_submit_inode_data_buffers.
1838                  * We can also reach here via shrink_page_list
1839                  */
1840                 goto redirty_page;
1841         }
1842         if (commit_write)
1843                 /* now mark the buffer_heads as dirty and uptodate */
1844                 block_commit_write(page, 0, len);
1845
1846         if (PageChecked(page) && ext4_should_journal_data(inode))
1847                 /*
1848                  * It's mmapped pagecache.  Add buffers and journal it.  There
1849                  * doesn't seem much point in redirtying the page here.
1850                  */
1851                 return __ext4_journalled_writepage(page, len);
1852
        /*
         * When the first buffer is flagged uninit, write the page with
         * ext4's own end_io handler; otherwise use the default one.
         */
1853         if (buffer_uninit(page_bufs)) {
1854                 ext4_set_bh_endio(page_bufs, inode);
1855                 ret = block_write_full_page_endio(page, noalloc_get_block_write,
1856                                             wbc, ext4_end_io_buffer_write);
1857         } else
1858                 ret = block_write_full_page(page, noalloc_get_block_write,
1859                                             wbc);
1860
1861         return ret;
1862 }
1863
1864 /*
1865  * Called from ext4_da_writepages() to work out how many journal
1866  * credits must be reserved so that a single extent allocation
1867  * fits into a single transaction.  ext4_da_writepages() calls
1868  * this on every pass of its loop, before the block allocation
1869  * is attempted.
1870  */
1871
1872 static int ext4_da_writepages_trans_blocks(struct inode *inode)
1873 {
1874         int nr_blocks = EXT4_I(inode)->i_reserved_data_blocks;
1875
1876         /*
1877          * Inodes that are not extent-mapped need journal credits in
1878          * proportion to the length of the contiguous run being
1879          * inserted, so the run length we ask credits for is capped
1880          * at EXT4_MAX_TRANS_DATA for them.
1881          */
1882         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
1883             nr_blocks > EXT4_MAX_TRANS_DATA)
1884                 nr_blocks = EXT4_MAX_TRANS_DATA;
1885
1886         return ext4_chunk_trans_blocks(inode, nr_blocks);
1887 }
1888
1889 /*
1890  * write_cache_pages_da - walk the list of dirty pages of the given
1891  * address space and accumulate pages that need writing, and call
1892  * mpage_da_map_and_submit to map a single contiguous memory region
1893  * and then write them.
 *
 * @mapping - address space whose pages are scanned
 * @wbc - writeback control; supplies the range, sync mode and nr_to_write
 * @mpd - extent state; zeroed here and then filled in
 * @done_index - set to the start index, then to the index following each
 *               page that is examined
 *
 * Returns MPAGE_DA_EXTENT_TAIL when the scan stopped because an extent
 * was handed to mpage_da_map_and_submit(), 0 otherwise.
1894  */
1895 static int write_cache_pages_da(struct address_space *mapping,
1896                                 struct writeback_control *wbc,
1897                                 struct mpage_da_data *mpd,
1898                                 pgoff_t *done_index)
1899 {
1900         struct buffer_head      *bh, *head;
1901         struct inode            *inode = mapping->host;
1902         struct pagevec          pvec;
1903         unsigned int            nr_pages;
1904         sector_t                logical;
1905         pgoff_t                 index, end;
        /* local countdown; wbc->nr_to_write itself is not modified here */
1906         long                    nr_to_write = wbc->nr_to_write;
1907         int                     i, tag, ret = 0;
1908
1909         memset(mpd, 0, sizeof(struct mpage_da_data));
1910         mpd->wbc = wbc;
1911         mpd->inode = inode;
1912         pagevec_init(&pvec, 0);
1913         index = wbc->range_start >> PAGE_CACHE_SHIFT;
1914         end = wbc->range_end >> PAGE_CACHE_SHIFT;
1915
        /* integrity sync and tagged writeback look up TOWRITE-tagged pages */
1916         if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
1917                 tag = PAGECACHE_TAG_TOWRITE;
1918         else
1919                 tag = PAGECACHE_TAG_DIRTY;
1920
1921         *done_index = index;
1922         while (index <= end) {
1923                 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
1924                               min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1925                 if (nr_pages == 0)
1926                         return 0;
1927
1928                 for (i = 0; i < nr_pages; i++) {
1929                         struct page *page = pvec.pages[i];
1930
1931                         /*
1932                          * At this point, the page may be truncated or
1933                          * invalidated (changing page->mapping to NULL), or
1934                          * even swizzled back from swapper_space to tmpfs file
1935                          * mapping. However, page->index will not change
1936                          * because we have a reference on the page.
1937                          */
1938                         if (page->index > end)
1939                                 goto out;
1940
1941                         *done_index = page->index + 1;
1942
1943                         /*
1944                          * If we can't merge this page, and we have
1945                          * accumulated a contiguous region, write it
1946                          */
1947                         if ((mpd->next_page != page->index) &&
1948                             (mpd->next_page != mpd->first_page)) {
1949                                 mpage_da_map_and_submit(mpd);
1950                                 goto ret_extent_tail;
1951                         }
1952
1953                         lock_page(page);
1954
1955                         /*
1956                          * If the page is no longer dirty, or its
1957                          * mapping no longer corresponds to inode we
1958                          * are writing (which means it has been
1959                          * truncated or invalidated), or the page is
1960                          * already under writeback and we are not
1961                          * doing a data integrity writeback, skip the page
1962                          */
1963                         if (!PageDirty(page) ||
1964                             (PageWriteback(page) &&
1965                              (wbc->sync_mode == WB_SYNC_NONE)) ||
1966                             unlikely(page->mapping != mapping)) {
1967                                 unlock_page(page);
1968                                 continue;
1969                         }
1970
1971                         wait_on_page_writeback(page);
1972                         BUG_ON(PageWriteback(page));
1973
                        /* start a new page run unless this page extends it */
1974                         if (mpd->next_page != page->index)
1975                                 mpd->first_page = page->index;
1976                         mpd->next_page = page->index + 1;
                        /* logical block number of the first block in the page */
1977                         logical = (sector_t) page->index <<
1978                                 (PAGE_CACHE_SHIFT - inode->i_blkbits);
1979
1980                         if (!page_has_buffers(page)) {
                                /*
                                 * No buffers: offer the whole page as one
                                 * dirty, uptodate chunk.
                                 */
1981                                 mpage_add_bh_to_extent(mpd, logical,
1982                                                        PAGE_CACHE_SIZE,
1983                                                        (1 << BH_Dirty) | (1 << BH_Uptodate));
1984                                 if (mpd->io_done)
1985                                         goto ret_extent_tail;
1986                         } else {
1987                                 /*
1988                                  * Page with regular buffer heads,
1989                                  * just add all dirty ones
1990                                  */
1991                                 head = page_buffers(page);
1992                                 bh = head;
1993                                 do {
1994                                         BUG_ON(buffer_locked(bh));
1995                                         /*
1996                                          * We need to try to allocate
1997                                          * unmapped blocks in the same page.
1998                                          * Otherwise we won't make progress
1999                                          * with the page in ext4_writepage
2000                                          */
2001                                         if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2002                                                 mpage_add_bh_to_extent(mpd, logical,
2003                                                                        bh->b_size,
2004                                                                        bh->b_state);
2005                                                 if (mpd->io_done)
2006                                                         goto ret_extent_tail;
2007                                         } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2008                                                 /*
2009                                                  * mapped dirty buffer. We need
2010                                                  * to update the b_state
2011                                                  * because we look at b_state
2012                                                  * in mpage_da_map_and_submit.  We
2013                                                  * don't update b_size because
2014                                                  * if we find an unmapped
2015                                                  * buffer_head later we need to
2016                                                  * use the b_state flag of that
2017                                                  * buffer_head.
2018                                                  */
2019                                                 if (mpd->b_size == 0)
2020                                                         mpd->b_state = bh->b_state & BH_FLAGS;
2021                                         }
2022                                         logical++;
2023                                 } while ((bh = bh->b_this_page) != head);
2024                         }
2025
2026                         if (nr_to_write > 0) {
2027                                 nr_to_write--;
2028                                 if (nr_to_write == 0 &&
2029                                     wbc->sync_mode == WB_SYNC_NONE)
2030                                         /*
2031                                          * We stop writing back only if we are
2032                                          * not doing integrity sync. In case of
2033                                          * integrity sync we have to keep going
2034                                          * because someone may be concurrently
2035                                          * dirtying pages, and we might have
2036                                          * synced a lot of newly appeared dirty
2037                                          * pages, but have not synced all of the
2038                                          * old dirty pages.
2039                                          */
2040                                         goto out;
2041                         }
2042                 }
2043                 pagevec_release(&pvec);
2044                 cond_resched();
2045         }
2046         return 0;
2047 ret_extent_tail:
2048         ret = MPAGE_DA_EXTENT_TAIL;
2049 out:
2050         pagevec_release(&pvec);
2051         cond_resched();
2052         return ret;
2053 }
2054
2055
2056 static int ext4_da_writepages(struct address_space *mapping,
2057                               struct writeback_control *wbc)
2058 {
2059         pgoff_t index;
2060         int range_whole = 0;
2061         handle_t *handle = NULL;
2062         struct mpage_da_data mpd;
2063         struct inode *inode = mapping->host;
2064         int pages_written = 0;
2065         unsigned int max_pages;
2066         int range_cyclic, cycled = 1, io_done = 0;
2067         int needed_blocks, ret = 0;
2068         long desired_nr_to_write, nr_to_writebump = 0;
2069         loff_t range_start = wbc->range_start;
2070         struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2071         pgoff_t done_index = 0;
2072         pgoff_t end;
2073
2074         trace_ext4_da_writepages(inode, wbc);
2075
2076         /*
2077          * No pages to write? This is mainly a kludge to avoid starting
2078          * a transaction for special inodes like journal inode on last iput()
2079          * because that could violate lock ordering on umount
2080          */
2081         if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2082                 return 0;
2083
2084         /*
2085          * If the filesystem has aborted, it is read-only, so return
2086          * right away instead of dumping stack traces later on that
2087          * will obscure the real source of the problem.  We test
2088          * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
2089          * the latter could be true if the filesystem is mounted
2090          * read-only, and in that case, ext4_da_writepages should
2091          * *never* be called, so if that ever happens, we would want
2092          * the stack trace.
2093          */
2094         if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2095                 return -EROFS;
2096
2097         if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2098                 range_whole = 1;
2099
2100         range_cyclic = wbc->range_cyclic;
2101         if (wbc->range_cyclic) {
2102                 index = mapping->writeback_index;
2103                 if (index)
2104                         cycled = 0;
2105                 wbc->range_start = index << PAGE_CACHE_SHIFT;
2106                 wbc->range_end  = LLONG_MAX;
2107                 wbc->range_cyclic = 0;
2108                 end = -1;
2109         } else {
2110                 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2111                 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2112         }
2113
2114         /*
2115          * This works around two forms of stupidity.  The first is in
2116          * the writeback code, which caps the maximum number of pages
2117          * written to be 1024 pages.  This is wrong on multiple
2118          * levels; different architectues have a different page size,
2119          * which changes the maximum amount of data which gets
2120          * written.  Secondly, 4 megabytes is way too small.  XFS
2121          * forces this value to be 16 megabytes by multiplying
2122          * nr_to_write parameter by four, and then relies on its
2123          * allocator to allocate larger extents to make them
2124          * contiguous.  Unfortunately this brings us to the second
2125          * stupidity, which is that ext4's mballoc code only allocates
2126          * at most 2048 blocks.  So we force contiguous writes up to
2127          * the number of dirty blocks in the inode, or
2128          * sbi->max_writeback_mb_bump whichever is smaller.
2129          */
2130         max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2131         if (!range_cyclic && range_whole) {
2132                 if (wbc->nr_to_write == LONG_MAX)
2133                         desired_nr_to_write = wbc->nr_to_write;
2134                 else
2135                         desired_nr_to_write = wbc->nr_to_write * 8;
2136         } else
2137                 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2138                                                            max_pages);
2139         if (desired_nr_to_write > max_pages)
2140                 desired_nr_to_write = max_pages;
2141
2142         if (wbc->nr_to_write < desired_nr_to_write) {
2143                 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2144                 wbc->nr_to_write = desired_nr_to_write;
2145         }
2146
2147 retry:
2148         if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2149                 tag_pages_for_writeback(mapping, index, end);
2150
2151         while (!ret && wbc->nr_to_write > 0) {
2152
2153                 /*
2154                  * we  insert one extent at a time. So we need
2155                  * credit needed for single extent allocation.
2156                  * journalled mode is currently not supported
2157                  * by delalloc
2158                  */
2159                 BUG_ON(ext4_should_journal_data(inode));
2160                 needed_blocks = ext4_da_writepages_trans_blocks(inode);
2161
2162                 /* start a new transaction*/
2163                 handle = ext4_journal_start(inode, needed_blocks);
2164                 if (IS_ERR(handle)) {
2165                         ret = PTR_ERR(handle);
2166                         ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2167                                "%ld pages, ino %lu; err %d", __func__,
2168                                 wbc->nr_to_write, inode->i_ino, ret);
2169                         goto out_writepages;
2170                 }
2171
2172                 /*
2173                  * Now call write_cache_pages_da() to find the next
2174                  * contiguous region of logical blocks that need
2175                  * blocks to be allocated by ext4 and submit them.
2176                  */
2177                 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
2178                 /*
2179                  * If we have a contiguous extent of pages and we
2180                  * haven't done the I/O yet, map the blocks and submit
2181                  * them for I/O.
2182                  */
2183                 if (!mpd.io_done && mpd.next_page != mpd.first_page) {
2184                         mpage_da_map_and_submit(&mpd);
2185                         ret = MPAGE_DA_EXTENT_TAIL;
2186                 }
2187                 trace_ext4_da_write_pages(inode, &mpd);
2188                 wbc->nr_to_write -= mpd.pages_written;
2189
2190                 ext4_journal_stop(handle);
2191
2192                 if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
2193                         /* commit the transaction which would
2194                          * free blocks released in the transaction
2195                          * and try again
2196                          */
2197                         jbd2_journal_force_commit_nested(sbi->s_journal);
2198                         ret = 0;
2199                 } else if (ret == MPAGE_DA_EXTENT_TAIL) {
2200                         /*
2201                          * got one extent now try with
2202                          * rest of the pages
2203                          */
2204                         pages_written += mpd.pages_written;
2205                         ret = 0;
2206                         io_done = 1;
2207                 } else if (wbc->nr_to_write)
2208                         /*
2209                          * There is no more writeout needed
2210                          * or we requested for a noblocking writeout
2211                          * and we found the device congested
2212                          */
2213                         break;
2214         }
2215         if (!io_done && !cycled) {
2216                 cycled = 1;
2217                 index = 0;
2218                 wbc->range_start = index << PAGE_CACHE_SHIFT;
2219                 wbc->range_end  = mapping->writeback_index - 1;
2220                 goto retry;
2221         }
2222
2223         /* Update index */
2224         wbc->range_cyclic = range_cyclic;
2225         if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2226                 /*
2227                  * set the writeback_index so that range_cyclic
2228                  * mode will write it back later
2229                  */
2230                 mapping->writeback_index = done_index;
2231
2232 out_writepages:
2233         wbc->nr_to_write -= nr_to_writebump;
2234         wbc->range_start = range_start;
2235         trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2236         return ret;
2237 }
2238
2239 #define FALL_BACK_TO_NONDELALLOC 1
2240 /*
2241  * Decide whether a write should bypass delayed allocation because free
2242  * space is running low.  Returns 1 to fall back to non-delalloc, else 0.
2243  */
2244 static int ext4_nonda_switch(struct super_block *sb)
2245 {
2246         struct ext4_sb_info *sbi = EXT4_SB(sb);
2247         s64 free_clusters, dirty_clusters;
2248
2249         /*
2250          * The percpu counters are only approximate: deltas of up to
2251          * percpu_counter_batch can sit on each CPU without reaching the
2252          * global count.  Delalloc needs accurate free space accounting,
2253          * so switch to non-delalloc once we get near that error range.
2254          *
2255          * Both counters and the watermark are in clusters, so compare
2256          * them in clusters.  Mixing in a block count would overstate the
2257          * free space on bigalloc file systems.
2258          */
2259         free_clusters =
2260                 percpu_counter_read_positive(&sbi->s_freeclusters_counter);
2261         dirty_clusters =
2262                 percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
2263         /* free < 150% of dirty, or free within the watermark of dirty */
2264         if (2 * free_clusters < 3 * dirty_clusters ||
2265             free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK))
2266                 return 1;
2267
2268         /* Nearing capacity: push delalloc once 1/2 of free is dirty */
2269         if (free_clusters < 2 * dirty_clusters)
2270                 writeback_inodes_sb_if_idle(sb);
2271         return 0;
2272 }
2273
2274 /*
2275  * ->write_begin for delalloc.  Falls back to ext4_write_begin() (noted in
2276  * *fsdata) when ext4_nonda_switch() asks for it.  Otherwise returns 0 with
2277  * the prepared page in *pagep, or a negative errno with the page unlocked
2278  * and released and the journal handle stopped.
2279  */
2280 static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2281                                loff_t pos, unsigned len, unsigned flags,
2282                                struct page **pagep, void **fsdata)
2283 {
2284         int ret, retries = 0;
2285         struct page *page;
2286         pgoff_t index;
2287         struct inode *inode = mapping->host;
2288         handle_t *handle;
2289         loff_t page_len;
2290
2291         index = pos >> PAGE_CACHE_SHIFT;
2292
2293         if (ext4_nonda_switch(inode->i_sb)) {
2294                 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
2295                 return ext4_write_begin(file, mapping, pos,
2296                                         len, flags, pagep, fsdata);
2297         }
2298         *fsdata = (void *)0;
2299         trace_ext4_da_write_begin(inode, pos, len, flags);
2300 retry:
2301         /*
2302          * With delayed allocation we normally don't log the i_disksize
2303          * update, but a write to the end of a file whose buffer is
2304          * already mapped still has to journal it.
2305          */
2306         handle = ext4_journal_start(inode, 1);
2307         if (IS_ERR(handle))
2308                 return PTR_ERR(handle);
2309
2310         /* The transaction is running: no recursion into the filesystem */
2311         flags |= AOP_FLAG_NOFS;
2312
2313         page = grab_cache_page_write_begin(mapping, index, flags);
2314         if (!page) {
2315                 ext4_journal_stop(handle);
2316                 return -ENOMEM;
2317         }
2318         *pagep = page;
2319
2320         ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
2321         page_len = pos & (PAGE_CACHE_SIZE - 1);
2322         if (ret >= 0 && page_len > 0)
2323                 ret = ext4_discard_partial_page_buffers_no_lock(handle,
2324                         inode, page, pos - page_len, page_len,
2325                         EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED);
2326         if (ret < 0) {
2327                 /* The write is failing: unlock and drop what we took */
2328                 unlock_page(page);
2329                 ext4_journal_stop(handle);
2330                 page_cache_release(page);
2331                 /*
2332                  * Blocks may have been instantiated outside i_size.
2333                  * Trim them off again; i_mutex is held, so there is
2334                  * no need for i_size_read().
2335                  */
2336                 if (pos + len > inode->i_size)
2337                         ext4_truncate_failed_write(inode);
2338         }
2339
2340         if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
2341                 goto retry;
2342         return ret;
2343 }
2344
2345 /*
2346  * Tell whether a write ending at @offset within @page may bump i_disksize
2347  * right away, i.e. the buffer under @offset needs no block allocation.
2348  * Returns 1 if that buffer is mapped and neither delayed nor unwritten.
2349  */
2350 static int ext4_da_should_update_i_disksize(struct page *page,
2351                                             unsigned long offset)
2352 {
2353         struct inode *inode = page->mapping->host;
2354         struct buffer_head *bh = page_buffers(page);
2355         unsigned int skip = offset >> inode->i_blkbits;
2356
2357         /* Step through the page's buffers to the one covering @offset */
2358         while (skip--)
2359                 bh = bh->b_this_page;
2360
2361         /* Unmapped, delayed or unwritten: i_disksize has to wait */
2362         if (buffer_mapped(bh) && !buffer_delay(bh) && !buffer_unwritten(bh))
2363                 return 1;
2364
2365         return 0;
2366 }
2367
2368 /*
2369  * ->write_end for delalloc.  Hands off to the ordered/writeback write_end
2370  * when write_begin fell back to non-delalloc.  Otherwise bumps i_disksize
2371  * if the write extended it without needing allocation, finishes the page
2372  * via generic_write_end() and stops the handle.  Returns the bytes copied
2373  * or a negative errno.
2374  */
2375 static int ext4_da_write_end(struct file *file,
2376                              struct address_space *mapping,
2377                              loff_t pos, unsigned len, unsigned copied,
2378                              struct page *page, void *fsdata)
2379 {
2380         struct inode *inode = mapping->host;
2381         int ret = 0, ret2, err;
2382         handle_t *handle = ext4_journal_current_handle();
2383         loff_t new_i_size;
2384         unsigned long start, end;
2385         int write_mode = (int)(unsigned long)fsdata;
2386         loff_t page_len;
2387
2388         if (write_mode == FALL_BACK_TO_NONDELALLOC) {
2389                 if (ext4_should_order_data(inode))
2390                         return ext4_ordered_write_end(file, mapping, pos,
2391                                         len, copied, page, fsdata);
2392                 if (ext4_should_writeback_data(inode))
2393                         return ext4_writeback_write_end(file, mapping, pos,
2394                                         len, copied, page, fsdata);
2395                 BUG();
2396         }
2397
2398         trace_ext4_da_write_end(inode, pos, len, copied);
2399         start = pos & (PAGE_CACHE_SIZE - 1);
2400         end = start + copied - 1;
2401
2402         /*
2403          * generic_write_end() runs mark_inode_dirty() if i_size changes,
2404          * so the i_disksize update piggybacks on that.
2405          */
2406         new_i_size = pos + copied;
2407         if (new_i_size > EXT4_I(inode)->i_disksize &&
2408             ext4_da_should_update_i_disksize(page, end)) {
2409                 down_write(&EXT4_I(inode)->i_data_sem);
2410                 if (new_i_size > EXT4_I(inode)->i_disksize) {
2411                         /* Extending the file without block allocation */
2412                         if (ext4_should_order_data(inode))
2413                                 ret = ext4_jbd2_file_inode(handle, inode);
2414                         EXT4_I(inode)->i_disksize = new_i_size;
2415                 }
2416                 up_write(&EXT4_I(inode)->i_data_sem);
2417                 /*
2418                  * Mark the inode dirty even if new_i_size is below
2419                  * inode->i_size but above i_disksize (hint: delalloc).
2420                  */
2421                 ext4_mark_inode_dirty(handle, inode);
2422         }
2423         ret2 = generic_write_end(file, mapping, pos, len, copied,
2424                                                         page, fsdata);
2425
2426         page_len = PAGE_CACHE_SIZE -
2427                         ((pos + copied - 1) & (PAGE_CACHE_SIZE - 1));
2428         if (page_len > 0) {
2429                 /* NOTE(review): @page still valid after generic_write_end()? */
2430                 err = ext4_discard_partial_page_buffers_no_lock(handle,
2431                         inode, page, pos + copied - 1, page_len,
2432                         EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED);
2433                 /* Keep an earlier ext4_jbd2_file_inode() failure */
2434                 if (!ret)
2435                         ret = err;
2436         }
2437
2438         copied = ret2;
2439         if (ret2 < 0)
2440                 ret = ret2;
2441         ret2 = ext4_journal_stop(handle);
2442         if (!ret)
2443                 ret = ret2;
2444
2445         return ret ? ret : copied;
2446 }
2447
2448 /*
2449  * Invalidate @page from @offset on.
2450  *
2451  * Any delalloc reservation held by the page's buffers is dropped first,
2452  * then the regular ext4 invalidation runs.  The page must be locked.
2453  */
2454 static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
2455 {
2456         BUG_ON(!PageLocked(page));
2457
2458         /* Only a page with buffers can be holding reserved blocks */
2459         if (page_has_buffers(page))
2460                 ext4_da_page_release_reservation(page, offset);
2461
2462         ext4_invalidatepage(page, offset);
2463 }
2464
2465 /*
2466  * Force all delayed allocation blocks to be allocated for a given inode.
2467  * Returns 0 when nothing is reserved, otherwise whatever filemap_flush()
2468  * returns.
2469  */
2470 int ext4_alloc_da_blocks(struct inode *inode)
2471 {
2472         struct ext4_inode_info *ei = EXT4_I(inode);
2473
2474         trace_ext4_alloc_da_blocks(inode);
2475
2476         /* No data or metadata blocks reserved: nothing to allocate */
2477         if (!ei->i_reserved_data_blocks && !ei->i_reserved_meta_blocks)
2478                 return 0;
2479
2480         /*
2481          * Keep it simple for now and cheat with filemap_flush(): it maps
2482          * the blocks and starts the I/O without waiting for completion.
2483          * Starting the data write is not strictly necessary (and, for
2484          * users of laptop_mode, not even desirable).
2485          *
2486          * Allocating without writing would mean replicating the code
2487          * paths in:
2488          *
2489          * ext4_da_writepages() ->
2490          *    write_cache_pages() ---> (via passed in callback function)
2491          *        __mpage_da_writepage() -->
2492          *           mpage_add_bh_to_extent()
2493          *           mpage_da_map_blocks()
2494          *
2495          * write_cache_pages(), located in mm/page-writeback.c, marks
2496          * pages clean in preparation for doing I/O, which is not what we
2497          * want if no I/O is planned.  Calling it and then redirtying
2498          * every page with redirty_page_for_writepage() would be ugly in
2499          * the extreme.
2500          *
2501          * So the relevant parts of those functions would have to be
2502          * copied in simplified form: collect contiguous logical block
2503          * extents, call the multi-block allocator, and update the buffer
2504          * heads with the block allocations, without ever submitting the
2505          * pages for I/O.
2506          */
2507         return filemap_flush(inode->i_mapping);
2508 }
2509
2510 /*
2511  * bmap() is special: applications such as lilo, and the swapper, use it
2512  * to find the on-disk block behind a specific piece of data.
2513  *
2514  * That is dangerous while the block concerned is still in the journal.
2515  * Somebody who makes a swapfile on an ext4 data-journaling filesystem
2516  * and enables swap may get a nasty shock when data swapped out to that
2517  * file is suddenly overwritten by the original zeros, which were written
2518  * to the journal earlier and are still awaiting writeback in the
2519  * kernel's buffer cache.
2520  *
2521  * So for any bmap call on a modified, data-journaled file we take extra
2522  * steps to flush blocks which might still be in the cache.
2523  *
2524  * Returns 0 if the journal flush fails, otherwise whatever
2525  * generic_block_bmap() returns.
2526  */
2527 static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2528 {
2529         struct inode *inode = mapping->host;
2530         journal_t *journal;
2531
2532         /*
2533          * With delalloc, sync the file first so that blocks get
2534          * allocated for it.
2535          */
2536         if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
2537             test_opt(inode->i_sb, DELALLOC))
2538                 filemap_write_and_wait(mapping);
2539
2540         journal = EXT4_JOURNAL(inode);
2541         if (journal && ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
2542                 int err;
2543
2544                 /*
2545                  * This is a REALLY heavyweight approach, but bmap on a
2546                  * dirty file is expected to be extremely rare: only
2547                  * lilo or swapon run on a freshly made file should
2548                  * ever get here.
2549                  *
2550                  * (bmap requires CAP_SYS_RAWIO, so this is not an
2551                  * unprivileged user DOS attack --- we'd be in trouble
2552                  * if mortal users could trigger this path at will.)
2553                  *
2554                  * NB. EXT4_STATE_JDATA is only ever set on regular
2555                  * files.  Anyone who bmaps a directory or symlink and
2556                  * gets confused because the buffer hasn't been flushed
2557                  * to disk yet deserves everything they get.
2558                  */
2559                 ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
2560                 /* Flush the whole journal while updates are held off */
2561                 jbd2_journal_lock_updates(journal);
2562                 err = jbd2_journal_flush(journal);
2563                 jbd2_journal_unlock_updates(journal);
2564
2565                 /* Flush failed: report no block */
2566                 if (err)
2567                         return 0;
2568         }
2569
2570         return generic_block_bmap(mapping, block, ext4_get_block);
2571 }
2572
2573 static int ext4_readpage(struct file *file, struct page *page)
2574 {
2575         trace_ext4_readpage(page);  /* tracepoint; read is delegated below */
2576         return mpage_readpage(page, ext4_get_block);
2577 }
2578
2579 /* Multi-page read: hand the whole list to mpage with ext4_get_block. */
2580 static int ext4_readpages(struct file *file, struct address_space *mapping,
2581                           struct list_head *pages, unsigned nr_pages)
2582 {
2583         return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
2584 }
2585
2586 /* Free the io_end of every uninit buffer of @page at or past @offset. */
2587 static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
2588 {
2589         struct buffer_head *first, *cur;
2590         unsigned int pos = 0;
2591
2592         if (!page_has_buffers(page))
2593                 return;
2594         first = cur = page_buffers(page);
2595         do {
2596                 if (pos >= offset && test_clear_buffer_uninit(cur) &&
2597                     cur->b_private) {
2598                         ext4_free_io_end(cur->b_private);
2599                         cur->b_private = NULL;
2600                         cur->b_end_io = NULL;
2601                 }
2602                 pos += cur->b_size;
2603         } while ((cur = cur->b_this_page) != first);
2604 }
2605
2606 /* Invalidate @page from @offset on, via jbd2 if there is a journal. */
2607 static void ext4_invalidatepage(struct page *page, unsigned long offset)
2608 {
2609         struct inode *inode = page->mapping->host;
2610         journal_t *journal = EXT4_JOURNAL(inode);
2611
2612         trace_ext4_invalidatepage(page, offset);
2613
2614         /* Free any io_end allocated for buffers about to be discarded */
2615         if (ext4_should_dioread_nolock(inode))
2616                 ext4_invalidatepage_free_endio(page, offset);
2617
2618         /* A full truncate simply forgets the pending dirtying */
2619         if (offset == 0)
2620                 ClearPageChecked(page);
2621
2622         if (!journal) {
2623                 block_invalidatepage(page, offset);
2624                 return;
2625         }
2626         jbd2_journal_invalidatepage(journal, page, offset);
2627 }
2628
2629 /* Try to free @page's buffers; returns 0 if the page has none. */
2630 static int ext4_releasepage(struct page *page, gfp_t wait)
2631 {
2632         journal_t *journal = EXT4_JOURNAL(page->mapping->host);
2633
2634         trace_ext4_releasepage(page);
2635         WARN_ON(PageChecked(page));
2636
2637         if (!page_has_buffers(page))
2638                 return 0;
2639         if (!journal)
2640                 return try_to_free_buffers(page);
2641         return jbd2_journal_try_to_free_buffers(journal, page, wait);
2642 }
2643
2644 /*
2645  * get_block for DIO and buffered writes: unallocated blocks get an
2646  * uninitialized extent that is converted once the IO completes.
2647  */
2648 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
2649                                 struct buffer_head *bh_result, int create)
2650 {
2651         const int flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
2652
2653         ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
2654                    inode->i_ino, create);
2655         return _ext4_get_block(inode, iblock, bh_result, flags);
2656 }
2657
2658 /* DIO end_io: queue io_ends with unwritten extents, else complete now. */
2659 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
2660                             ssize_t size, void *private, int ret,
2661                             bool is_async)
2662 {
2663         struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
2664         ext4_io_end_t *io_end = iocb->private;
2665         struct workqueue_struct *wq;
2666         struct ext4_inode_info *ei;
2667         unsigned long flags;
2668
2669         /* No io_end (not async DIO) or a 0 byte write: nothing to convert */
2670         if (!io_end || !size)
2671                 goto complete;
2672         ext_debug("ext4_end_io_dio(): io_end 0x%p"
2673                   "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
2674                   iocb->private, io_end->inode->i_ino, iocb, offset,
2675                   size);
2676
2677         /* No unwritten extents behind this io_end: free it and finish */
2678         if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
2679                 ext4_free_io_end(io_end);
2680                 iocb->private = NULL;
2681                 goto complete;
2682         }
2683
2684         io_end->offset = offset;
2685         io_end->size = size;
2686         if (is_async) {
2687                 io_end->iocb = iocb;
2688                 io_end->result = ret;
2689         }
2690         wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
2691
2692         /* Put the io_end on the per-inode list of completed aio dio */
2693         ei = EXT4_I(io_end->inode);
2694         spin_lock_irqsave(&ei->i_completed_io_lock, flags);
2695         list_add_tail(&io_end->list, &ei->i_completed_io_list);
2696         spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
2697
2698         /* The queued work converts the unwritten extents to written */
2699         queue_work(wq, &io_end->work);
2700         iocb->private = NULL;
2701         /* XXX: probably should move into the real I/O completion handler */
2702         inode_dio_done(inode);
2703         return;
2704 complete:
2705         if (is_async)
2706                 aio_complete(iocb, ret, 0);
2707         inode_dio_done(inode);
2708 }
2709
2710 /* b_end_io for uninit buffers: queue the io_end, then finish the write. */
2711 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
2712 {
2713         ext4_io_end_t *io_end = bh->b_private;
2714         struct ext4_inode_info *ei;
2715         struct inode *inode;
2716         unsigned long flags;
2717
2718         if (!test_clear_buffer_uninit(bh) || !io_end)
2719                 goto out;
2720
2721         inode = io_end->inode;
2722         if (!(inode->i_sb->s_flags & MS_ACTIVE)) {
2723                 printk("sb umounted, discard end_io request for inode %lu\n",
2724                         inode->i_ino);
2725                 ext4_free_io_end(io_end);
2726                 goto out;
2727         }
2728         ei = EXT4_I(inode);
2729         /*
2730          * Checking EXT4_IO_END_UNWRITTEN here may be over-defensive,
2731          * but it keeps future changes safe.
2732          */
2733         if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
2734                 io_end->flag |= EXT4_IO_END_UNWRITTEN;
2735                 atomic_inc(&ei->i_aiodio_unwritten);
2736         }
2737
2738         /* Add the io_end to the per-inode completed io list */
2739         spin_lock_irqsave(&ei->i_completed_io_lock, flags);
2740         list_add_tail(&io_end->list, &ei->i_completed_io_list);
2741         spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
2742
2743         /* Queue the work to convert unwritten extents to written */
2744         queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io_end->work);
2745 out:
2746         bh->b_private = NULL;
2747         bh->b_end_io = NULL;
2748         clear_buffer_uninit(bh);
2749         end_buffer_async_write(bh, uptodate);
2750 }
2751
2752 /*
2753  * Give @bh a fresh io_end and ext4_end_io_buffer_write() as its end_io.
2754  * The allocation is retried until it succeeds; always returns 0.
2755  */
2756 static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
2757 {
2758         struct page *page = bh->b_page;
2759         loff_t off = (sector_t)page->index << PAGE_CACHE_SHIFT;
2760         size_t bytes = bh->b_size;
2761         ext4_io_end_t *io_end;
2762
2763         while (!(io_end = ext4_init_io_end(inode, GFP_ATOMIC))) {
2764                 pr_warn_ratelimited("%s: allocation fail\n", __func__);
2765                 schedule();
2766         }
2767         io_end->offset = off;
2768         io_end->size = bytes;
2769         /*
2770          * Hold a reference on the page so it cannot be evicted before
2771          * ext4_end_io_work() has had a chance to convert the extent.
2772          */
2773         io_end->page = page;
2774         get_page(page);
2775
2776         bh->b_private = io_end;
2777         bh->b_end_io = ext4_end_io_buffer_write;
2778         return 0;
2779 }
2780
2781 /*
2782  * Direct I/O for extent-mapped files.  A write that stays within i_size
2783  * may go to holes and preallocated extents without falling back to
2784  * buffered IO.
2785  *
2786  * Blocks allocated to fill a hole are marked uninitialized.  Blocks that
2787  * were preallocated have their extent split as needed, and the range
2788  * being written is kept uninitialized.
2789  *
2790  * The unwritten extents are converted to written once the DIO completes.
2791  * Async direct IO may still be pending when we return, so an io_end is
2792  * hooked to the iocb and the end_io callback does the conversion when
2793  * the IO finishes.
2794  *
2795  * Reads, and writes that reach past i_size, are handed over to
2796  * ext4_ind_direct_IO(); the original note here says an extending
2797  * O_DIRECT write puts the inode on the orphan list so that recovery can
2798  * truncate it back after a crash (not visible in this function).
2799  */
2800 static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
2801                               const struct iovec *iov, loff_t offset,
2802                               unsigned long nr_segs)
2803 {
2804         struct file *file = iocb->ki_filp;
2805         struct inode *inode = file->f_mapping->host;
2806         ssize_t ret;
2807         size_t count = iov_length(iov, nr_segs);
2808
2809         loff_t final_size = offset + count;
2810         if (rw == WRITE && final_size <= inode->i_size) {
2811                 /*
2812                  * We can write directly to holes and fallocated space.
2813                  *
2814                  * Blocks allocated to fill a hole are marked uninitialized
2815                  * so that a parallel buffered read cannot expose stale
2816                  * data before the DIO has completed the data IO.
2817                  *
2818                  * For previously fallocated extents, ext4 get_block
2819                  * simply marks the buffer mapped and keeps the extents
2820                  * uninitialized.
2821                  *
2822                  * In the non-AIO case the unwritten extents are converted
2823                  * to written after __blockdev_direct_IO() returns.
2824                  *
2825                  * For async DIO the conversion has to be deferred until
2826                  * the IO completes; the ext4 end_io callback is called
2827                  * to take care of the conversion work.  For that case
2828                  * an io_end structure is allocated here and hooked to
2829                  * the iocb.
2830                  */
2831                 iocb->private = NULL;
2832                 EXT4_I(inode)->cur_aio_dio = NULL;
2833                 if (!is_sync_kiocb(iocb)) {
2834                         iocb->private = ext4_init_io_end(inode, GFP_NOFS);
2835                         if (!iocb->private)
2836                                 return -ENOMEM;
2837                         /*
2838                          * Save the io_end of the current async direct
2839                          * IO so that ext4_map_blocks() can later flag
2840                          * in it whether there are unwritten extents
2841                          * that need to be converted when the IO
2842                          * completes.
2843                          */
2844                         EXT4_I(inode)->cur_aio_dio = iocb->private;
2845                 }
2846
2847                 ret = __blockdev_direct_IO(rw, iocb, inode,
2848                                          inode->i_sb->s_bdev, iov,
2849                                          offset, nr_segs,
2850                                          ext4_get_block_write,
2851                                          ext4_end_io_dio,
2852                                          NULL,
2853                                          DIO_LOCKING | DIO_SKIP_HOLES);
2854                 if (iocb->private)
2855                         EXT4_I(inode)->cur_aio_dio = NULL;
2856                 /*
2857                  * The io_end structure takes a reference to the inode.
2858                  * The structure has to be destroyed and the inode
2859                  * reference dropped once the IO is complete, even for
2860                  * a 0 byte write or a failed one.
2861                  *
2862                  * In the successful AIO DIO case the io_end structure is
2863                  * destroyed and the inode reference dropped after the
2864                  * end_io callback has been called.
2865                  *
2866                  * For a 0 byte write or an error, VFS direct IO does not
2867                  * invoke the end_io callback, so the io_end structure
2868                  * has to be freed here.
2869                  */
2870                 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
2871                         ext4_free_io_end(iocb->private);
2872                         iocb->private = NULL;
2873                 } else if (ret > 0 && ext4_test_inode_state(inode,
2874                                                 EXT4_STATE_DIO_UNWRITTEN)) {
2875                         int err;
2876                         /*
2877                          * Non-AIO case: the IO has already completed,
2878                          * so the conversion can be done right here.
2879                          */
2880                         err = ext4_convert_unwritten_extents(inode,
2881                                                              offset, ret);
2882                         if (err < 0)
2883                                 ret = err;
2884                         ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
2885                 }
2886                 return ret;
2887         }
2888
2889         /* reads and writes past i_size fall back to the old way */
2890         return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
2891 }
2892
2893 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
2894                               const struct iovec *iov, loff_t offset,
2895                               unsigned long nr_segs)
2896 {
2897         struct file *file = iocb->ki_filp;
2898         struct inode *inode = file->f_mapping->host;
2899         ssize_t ret;
2900
2901         /*
2902          * If we are doing data journalling we don't support O_DIRECT
2903          */
2904         if (ext4_should_journal_data(inode))
2905                 return 0;
2906
2907         trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
2908         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
2909                 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
2910         else
2911                 ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
2912         trace_ext4_direct_IO_exit(inode, offset,
2913                                 iov_length(iov, nr_segs), rw, ret);
2914         return ret;
2915 }
2916
/*
 * ->set_page_dirty for the data-journalled aops.
 *
 * Pages can be marked dirty completely asynchronously from ext4's journalling
 * activity (by filemap_sync_pte(), try_to_unmap_one(), etc.).  There is not
 * much we can do here, because ->set_page_dirty is called under VFS locks and
 * the page is not necessarily locked.
 *
 * Dirtying the page while leaving its attached buffers clean is not an
 * option, since the buffers' dirty state is "definitive".  Setting the
 * buffers dirty or jbddirty is not an option either, because all the
 * journalling code would explode.
 *
 * So the page is marked "pending dirty" (PageChecked), and the next time
 * writepage is called that state is propagated into the buffers
 * appropriately.
 */
static int ext4_journalled_set_page_dirty(struct page *page)
{
	/* Leave the "pending dirty" marker for writepage to act on. */
	SetPageChecked(page);

	/* Dirty the page itself without touching any buffer state. */
	return __set_page_dirty_nobuffers(page);
}
2935
2936 static const struct address_space_operations ext4_ordered_aops = {
2937         .readpage               = ext4_readpage,
2938         .readpages              = ext4_readpages,
2939         .writepage              = ext4_writepage,
2940         .write_begin            = ext4_write_begin,
2941         .write_end              = ext4_ordered_write_end,
2942         .bmap                   = ext4_bmap,
2943         .invalidatepage         = ext4_invalidatepage,
2944         .releasepage            = ext4_releasepage,
2945         .direct_IO              = ext4_direct_IO,
2946         .migratepage            = buffer_migrate_page,
2947         .is_partially_uptodate  = block_is_partially_uptodate,
2948         .error_remove_page      = generic_error_remove_page,
2949 };
2950
2951 static const struct address_space_operations ext4_writeback_aops = {
2952         .readpage               = ext4_readpage,
2953         .readpages              = ext4_readpages,
2954         .writepage              = ext4_writepage,
2955         .write_begin            = ext4_write_begin,
2956         .write_end              = ext4_writeback_write_end,
2957         .bmap                   = ext4_bmap,
2958         .invalidatepage         = ext4_invalidatepage,
2959         .releasepage            = ext4_releasepage,
2960         .direct_IO              = ext4_direct_IO,
2961         .migratepage            = buffer_migrate_page,
2962         .is_partially_uptodate  = block_is_partially_uptodate,
2963         .error_remove_page      = generic_error_remove_page,
2964 };
2965
2966 static const struct address_space_operations ext4_journalled_aops = {
2967         .readpage               = ext4_readpage,
2968         .readpages              = ext4_readpages,
2969         .writepage              = ext4_writepage,
2970         .write_begin            = ext4_write_begin,
2971         .write_end              = ext4_journalled_write_end,
2972         .set_page_dirty         = ext4_journalled_set_page_dirty,
2973         .bmap                   = ext4_bmap,
2974         .invalidatepage         = ext4_invalidatepage,
2975         .releasepage            = ext4_releasepage,
2976         .direct_IO              = ext4_direct_IO,
2977         .is_partially_uptodate  = block_is_partially_uptodate,
2978         .error_remove_page      = generic_error_remove_page,
2979 };
2980
2981 static const struct address_space_operations ext4_da_aops = {
2982         .readpage               = ext4_readpage,
2983         .readpages              = ext4_readpages,
2984         .writepage              = ext4_writepage,
2985         .writepages             = ext4_da_writepages,
2986         .write_begin            = ext4_da_write_begin,
2987         .write_end              = ext4_da_write_end,
2988         .bmap                   = ext4_bmap,
2989         .invalidatepage         = ext4_da_invalidatepage,
2990         .releasepage            = ext4_releasepage,
2991         .direct_IO              = ext4_direct_IO,
2992         .migratepage            = buffer_migrate_page,
2993         .is_partially_uptodate  = block_is_partially_uptodate,
2994         .error_remove_page      = generic_error_remove_page,
2995 };
2996
2997 void ext4_set_aops(struct inode *inode)
2998 {
2999         if (ext4_should_order_data(inode) &&
3000                 test_opt(inode->i_sb, DELALLOC))
3001                 inode->i_mapping->a_ops = &ext4_da_aops;
3002         else if (ext4_should_order_data(inode))
3003                 inode->i_mapping->a_ops = &ext4_ordered_aops;
3004         else if (ext4_should_writeback_data(inode) &&
3005                  test_opt(inode->i_sb, DELALLOC))
3006                 inode->i_mapping->a_ops = &ext4_da_aops;
3007         else if (ext4_should_writeback_data(inode))
3008                 inode->i_mapping->a_ops = &ext4_writeback_aops;
3009         else
3010                 inode->i_mapping->a_ops = &ext4_journalled_aops;
3011 }
3012
3013
3014 /*
3015  * ext4_discard_partial_page_buffers()
3016  * Wrapper function for ext4_discard_partial_page_buffers_no_lock.
3017  * This function finds and locks the page containing the offset
3018  * "from" and passes it to ext4_discard_partial_page_buffers_no_lock.
3019  * Calling functions that already have the page locked should call
3020  * ext4_discard_partial_page_buffers_no_lock directly.
3021  */
3022 int ext4_discard_partial_page_buffers(handle_t *handle,
3023                 struct address_space *mapping, loff_t from,
3024                 loff_t length, int flags)
3025 {
3026         struct inode *inode = mapping->host;
3027         struct page *page;
3028         int err = 0;
3029
3030         page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3031                                    mapping_gfp_mask(mapping) & ~__GFP_FS);
3032         if (!page)
3033                 return -EINVAL;
3034
3035         err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
3036                 from, length, flags);
3037
3038         unlock_page(page);
3039         page_cache_release(page);
3040         return err;
3041 }
3042
3043 /*
3044  * ext4_discard_partial_page_buffers_no_lock()
3045  * Zeros a page range of length 'length' starting from offset 'from'.
3046  * Buffer heads that correspond to the block aligned regions of the
3047  * zeroed range will be unmapped.  Unblock aligned regions
3048  * will have the corresponding buffer head mapped if needed so that
3049  * that region of the page can be updated with the partial zero out.
3050  *
3051  * This function assumes that the page has already been  locked.  The
3052  * The range to be discarded must be contained with in the given page.
3053  * If the specified range exceeds the end of the page it will be shortened
3054  * to the end of the page that corresponds to 'from'.  This function is
3055  * appropriate for updating a page and it buffer heads to be unmapped and
3056  * zeroed for blocks that have been either released, or are going to be
3057  * released.
3058  *
3059  * handle: The journal handle
3060  * inode:  The files inode
3061  * page:   A locked page that contains the offset "from"
3062  * from:   The starting byte offset (from the begining of the file)
3063  *         to begin discarding
3064  * len:    The length of bytes to discard
3065  * flags:  Optional flags that may be used:
3066  *
3067  *         EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
3068  *         Only zero the regions of the page whose buffer heads
3069  *         have already been unmapped.  This flag is appropriate
3070  *         for updateing the contents of a page whose blocks may
3071  *         have already been released, and we only want to zero
3072  *         out the regions that correspond to those released blocks.
3073  *
3074  * Returns zero on sucess or negative on failure.
3075  */
3076 int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
3077                 struct inode *inode, struct page *page, loff_t from,
3078                 loff_t length, int flags)
3079 {
3080         ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3081         unsigned int offset = from & (PAGE_CACHE_SIZE-1);
3082         unsigned int blocksize, max, pos;
3083         unsigned int end_of_block, range_to_discard;
3084         ext4_lblk_t iblock;
3085         struct buffer_head *bh;
3086         int err = 0;
3087
3088         blocksize = inode->i_sb->s_blocksize;
3089         max = PAGE_CACHE_SIZE - offset;
3090
3091         if (index != page->index)
3092                 return -EINVAL;
3093
3094         /*
3095          * correct length if it does not fall between
3096          * 'from' and the end of the page
3097          */
3098         if (length > max || length < 0)
3099                 length = max;
3100
3101         iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3102
3103         if (!page_has_buffers(page)) {
3104                 /*
3105                  * If the range to be discarded covers a partial block
3106                  * we need to get the page buffers.  This is because
3107                  * partial blocks cannot be released and the page needs
3108                  * to be updated with the contents of the block before
3109                  * we write the zeros on top of it.
3110                  */
3111                 if (!(from & (blocksize - 1)) ||
3112                     !((from + length) & (blocksize - 1))) {
3113                         create_empty_buffers(page, blocksize, 0);
3114                 } else {
3115                         /*
3116                          * If there are no partial blocks,
3117                          * there is nothing to update,
3118                          * so we can return now
3119                          */
3120                         return 0;
3121                 }
3122         }
3123
3124         /* Find the buffer that contains "offset" */
3125         bh = page_buffers(page);
3126         pos = blocksize;
3127         while (offset >= pos) {
3128                 bh = bh->b_this_page;
3129                 iblock++;
3130                 pos += blocksize;
3131         }
3132
3133         pos = offset;
3134         while (pos < offset + length) {
3135                 err = 0;
3136
3137                 /* The length of space left to zero and unmap */
3138                 range_to_discard = offset + length - pos;
3139
3140                 /* The length of space until the end of the block */
3141                 end_of_block = blocksize - (pos & (blocksize-1));
3142
3143                 /*
3144                  * Do not unmap or zero past end of block
3145                  * for this buffer head
3146                  */
3147                 if (range_to_discard > end_of_block)
3148                         range_to_discard = end_of_block;
3149
3150
3151                 /*
3152                  * Skip this buffer head if we are only zeroing unampped
3153                  * regions of the page
3154                  */
3155                 if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
3156                         buffer_mapped(bh))
3157                                 goto next;
3158
3159                 /* If the range is block aligned, unmap */
3160                 if (range_to_discard == blocksize) {
3161                         clear_buffer_dirty(bh);
3162                         bh->b_bdev = NULL;
3163                         clear_buffer_mapped(bh);
3164                         clear_buffer_req(bh);
3165                         clear_buffer_new(bh);
3166                         clear_buffer_delay(bh);
3167                         clear_buffer_unwritten(bh);
3168                         clear_buffer_uptodate(bh);
3169                         zero_user(page, pos, range_to_discard);
3170                         BUFFER_TRACE(bh, "Buffer discarded");
3171                         goto next;
3172                 }
3173
3174                 /*
3175                  * If this block is not completely contained in the range
3176                  * to be discarded, then it is not going to be released. Because
3177                  * we need to keep this block, we need to make sure this part
3178                  * of the page is uptodate before we modify it by writeing
3179                  * partial zeros on it.
3180                  */
3181                 if (!buffer_mapped(bh)) {
3182                         /*
3183                          * Buffer head must be mapped before we can read
3184                          * from the block
3185                          */
3186                         BUFFER_TRACE(bh, "unmapped");
3187                         ext4_get_block(inode, iblock, bh, 0);
3188                         /* unmapped? It's a hole - nothing to do */
3189                         if (!buffer_mapped(bh)) {
3190                                 BUFFER_TRACE(bh, "still unmapped");
3191                                 goto next;
3192                         }
3193                 }
3194
3195                 /* Ok, it's mapped. Make sure it's up-to-date */
3196                 if (PageUptodate(page))
3197                         set_buffer_uptodate(bh);
3198
3199                 if (!buffer_uptodate(bh)) {
3200                         err = -EIO;
3201                         ll_rw_block(READ, 1, &bh);
3202                         wait_on_buffer(bh);
3203                         /* Uhhuh. Read error. Complain and punt.*/
3204                         if (!buffer_uptodate(bh))
3205                                 goto next;
3206                 }
3207
3208                 if (ext4_should_journal_data(inode)) {
3209                         BUFFER_TRACE(bh, "get write access");
3210                         err = ext4_journal_get_write_access(handle, bh);
3211                         if (err)
3212                                 goto next;
3213                 }
3214
3215                 zero_user(page, pos, range_to_discard);
3216
3217                 err = 0;
3218                 if (ext4_should_journal_data(inode)) {
3219                         err = ext4_handle_dirty_metadata(handle, inode, bh);
3220                 } else
3221                         mark_buffer_dirty(bh);
3222
3223                 BUFFER_TRACE(bh, "Partial buffer zeroed");
3224 next:
3225                 bh = bh->b_this_page;
3226                 iblock++;
3227                 pos += range_to_discard;
3228         }
3229
3230         return err;
3231 }
3232
3233 /*
3234  * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3235  * up to the end of the block which corresponds to `from'.
3236  * This required during truncate. We need to physically zero the tail end
3237  * of that block so it doesn't yield old data if the file is later grown.
3238  */
3239 int ext4_block_truncate_page(handle_t *handle,
3240                 struct address_space *mapping, loff_t from)
3241 {
3242         unsigned offset = from & (PAGE_CACHE_SIZE-1);
3243         unsigned length;
3244         unsigned blocksize;
3245         struct inode *inode = mapping->host;
3246
3247         blocksize = inode->i_sb->s_blocksize;
3248         length = blocksize - (offset & (blocksize - 1));
3249
3250         return ext4_block_zero_page_range(handle, mapping, from, length);
3251 }
3252
3253 /*
3254  * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3255  * starting from file offset 'from'.  The range to be zero'd must
3256  * be contained with in one block.  If the specified range exceeds
3257  * the end of the block it will be shortened to end of the block
3258  * that cooresponds to 'from'
3259  */
3260 int ext4_block_zero_page_range(handle_t *handle,
3261                 struct address_space *mapping, loff_t from, loff_t length)
3262 {
3263         ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3264         unsigned offset = from & (PAGE_CACHE_SIZE-1);
3265         unsigned blocksize, max, pos;
3266         ext4_lblk_t iblock;
3267         struct inode *inode = mapping->host;
3268         struct buffer_head *bh;
3269         struct page *page;
3270         int err = 0;
3271
3272         page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3273                                    mapping_gfp_mask(mapping) & ~__GFP_FS);
3274         if (!page)
3275                 return -EINVAL;
3276
3277         blocksize = inode->i_sb->s_blocksize;
3278         max = blocksize - (offset & (blocksize - 1));
3279
3280         /*
3281          * correct length if it does not fall between
3282          * 'from' and the end of the block
3283          */
3284         if (length > max || length < 0)
3285                 length = max;
3286
3287         iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3288
3289         if (!page_has_buffers(page))
3290                 create_empty_buffers(page, blocksize, 0);
3291
3292         /* Find the buffer that contains "offset" */
3293         bh = page_buffers(page);
3294         pos = blocksize;
3295         while (offset >= pos) {
3296                 bh = bh->b_this_page;
3297                 iblock++;
3298                 pos += blocksize;
3299         }
3300
3301         err = 0;
3302         if (buffer_freed(bh)) {
3303                 BUFFER_TRACE(bh, "freed: skip");
3304                 goto unlock;
3305         }
3306
3307         if (!buffer_mapped(bh)) {
3308                 BUFFER_TRACE(bh, "unmapped");
3309                 ext4_get_block(inode, iblock, bh, 0);
3310                 /* unmapped? It's a hole - nothing to do */
3311                 if (!buffer_mapped(bh)) {
3312                         BUFFER_TRACE(bh, "still unmapped");
3313                         goto unlock;
3314                 }
3315         }
3316
3317         /* Ok, it's mapped. Make sure it's up-to-date */
3318         if (PageUptodate(page))
3319                 set_buffer_uptodate(bh);
3320
3321         if (!buffer_uptodate(bh)) {
3322                 err = -EIO;
3323                 ll_rw_block(READ, 1, &bh);
3324                 wait_on_buffer(bh);
3325                 /* Uhhuh. Read error. Complain and punt. */
3326                 if (!buffer_uptodate(bh))
3327                         goto unlock;
3328         }
3329
3330         if (ext4_should_journal_data(inode)) {
3331                 BUFFER_TRACE(bh, "get write access");
3332                 err = ext4_journal_get_write_access(handle, bh);
3333                 if (err)
3334                         goto unlock;
3335         }
3336
3337         zero_user(page, offset, length);
3338
3339         BUFFER_TRACE(bh, "zeroed end of block");
3340
3341         err = 0;
3342         if (ext4_should_journal_data(inode)) {
3343                 err = ext4_handle_dirty_metadata(handle, inode, bh);
3344         } else
3345                 mark_buffer_dirty(bh);
3346
3347 unlock:
3348         unlock_page(page);
3349         page_cache_release(page);
3350         return err;
3351 }
3352
3353 int ext4_can_truncate(struct inode *inode)
3354 {
3355         if (S_ISREG(inode->i_mode))
3356                 return 1;
3357         if (S_ISDIR(inode->i_mode))
3358                 return 1;
3359         if (S_ISLNK(inode->i_mode))
3360                 return !ext4_inode_is_fast_symlink(inode);
3361         return 0;
3362 }
3363
3364 /*
3365  * ext4_punch_hole: punches a hole in a file by releaseing the blocks
3366  * associated with the given offset and length
3367  *
3368  * @inode:  File inode
3369  * @offset: The offset where the hole will begin
3370  * @len:    The length of the hole
3371  *
3372  * Returns: 0 on sucess or negative on failure
3373  */
3374
3375 int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3376 {
3377         struct inode *inode = file->f_path.dentry->d_inode;
3378         if (!S_ISREG(inode->i_mode))
3379                 return -ENOTSUPP;
3380
3381         if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
3382                 /* TODO: Add support for non extent hole punching */
3383                 return -ENOTSUPP;
3384         }
3385
3386         if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
3387                 /* TODO: Add support for bigalloc file systems */
3388                 return -ENOTSUPP;
3389         }
3390
3391         return ext4_ext_punch_hole(file, offset, length);
3392 }
3393
3394 /*
3395  * ext4_truncate()
3396  *
3397  * We block out ext4_get_block() block instantiations across the entire
3398  * transaction, and VFS/VM ensures that ext4_truncate() cannot run
3399  * simultaneously on behalf of the same inode.
3400  *
3401  * As we work through the truncate and commmit bits of it to the journal there
3402  * is one core, guiding principle: the file's tree must always be consistent on
3403  * disk.  We must be able to restart the truncate after a crash.
3404  *
3405  * The file's tree may be transiently inconsistent in memory (although it
3406  * probably isn't), but whenever we close off and commit a journal transaction,
3407  * the contents of (the filesystem + the journal) must be consistent and
3408  * restartable.  It's pretty simple, really: bottom up, right to left (although
3409  * left-to-right works OK too).
3410  *
3411  * Note that at recovery time, journal replay occurs *before* the restart of
3412  * truncate against the orphan inode list.
3413  *
3414  * The committed inode has the new, desired i_size (which is the same as
3415  * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
3416  * that this inode's truncate did not complete and it will again call
3417  * ext4_truncate() to have another go.  So there will be instantiated blocks
3418  * to the right of the truncation point in a crashed ext4 filesystem.  But
3419  * that's fine - as long as they are linked from the inode, the post-crash
3420  * ext4_truncate() run will find them and release them.
3421  */
3422 void ext4_truncate(struct inode *inode)
3423 {
3424         trace_ext4_truncate_enter(inode);
3425
3426         if (!ext4_can_truncate(inode))
3427                 return;
3428
3429         ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3430
3431         if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3432                 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
3433
3434         if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3435                 ext4_ext_truncate(inode);
3436         else
3437                 ext4_ind_truncate(inode);
3438
3439         trace_ext4_truncate_exit(inode);
3440 }
3441
3442 /*
3443  * ext4_get_inode_loc returns with an extra refcount against the inode's
3444  * underlying buffer_head on success. If 'in_mem' is true, we have all
3445  * data in memory that is needed to recreate the on-disk version of this
3446  * inode.
3447  */
3448 static int __ext4_get_inode_loc(struct inode *inode,
3449                                 struct ext4_iloc *iloc, int in_mem)
3450 {
3451         struct ext4_group_desc  *gdp;
3452         struct buffer_head      *bh;
3453         struct super_block      *sb = inode->i_sb;
3454         ext4_fsblk_t            block;
3455         int                     inodes_per_block, inode_offset;
3456
3457         iloc->bh = NULL;
3458         if (!ext4_valid_inum(sb, inode->i_ino))
3459                 return -EIO;
3460
3461         iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
3462         gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
3463         if (!gdp)
3464                 return -EIO;
3465
3466         /*
3467          * Figure out the offset within the block group inode table
3468          */
3469         inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
3470         inode_offset = ((inode->i_ino - 1) %
3471                         EXT4_INODES_PER_GROUP(sb));
3472         block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
3473         iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
3474
3475         bh = sb_getblk(sb, block);
3476         if (!bh) {
3477                 EXT4_ERROR_INODE_BLOCK(inode, block,
3478                                        "unable to read itable block");
3479                 return -EIO;
3480         }
3481         if (!buffer_uptodate(bh)) {
3482                 lock_buffer(bh);
3483
3484                 /*
3485                  * If the buffer has the write error flag, we have failed
3486                  * to write out another inode in the same block.  In this
3487                  * case, we don't have to read the block because we may
3488                  * read the old inode data successfully.
3489                  */
3490                 if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
3491                         set_buffer_uptodate(bh);
3492
3493                 if (buffer_uptodate(bh)) {
3494                         /* someone brought it uptodate while we waited */
3495                         unlock_buffer(bh);
3496                         goto has_buffer;
3497                 }
3498
3499                 /*
3500                  * If we have all information of the inode in memory and this
3501                  * is the only valid inode in the block, we need not read the
3502                  * block.
3503                  */
3504                 if (in_mem) {
3505                         struct buffer_head *bitmap_bh;
3506                         int i, start;
3507
3508                         start = inode_offset & ~(inodes_per_block - 1);
3509
3510                         /* Is the inode bitmap in cache? */
3511                         bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
3512                         if (!bitmap_bh)
3513                                 goto make_io;
3514
3515                         /*
3516                          * If the inode bitmap isn't in cache then the
3517                          * optimisation may end up performing two reads instead
3518                          * of one, so skip it.
3519                          */
3520                         if (!buffer_uptodate(bitmap_bh)) {
3521                                 brelse(bitmap_bh);
3522                                 goto make_io;
3523                         }
3524                         for (i = start; i < start + inodes_per_block; i++) {
3525                                 if (i == inode_offset)
3526                                         continue;
3527                                 if (ext4_test_bit(i, bitmap_bh->b_data))
3528                                         break;
3529                         }
3530                         brelse(bitmap_bh);
3531                         if (i == start + inodes_per_block) {
3532                                 /* all other inodes are free, so skip I/O */
3533                                 memset(bh->b_data, 0, bh->b_size);
3534                                 set_buffer_uptodate(bh);
3535                                 unlock_buffer(bh);
3536                                 goto has_buffer;
3537                         }
3538                 }
3539
3540 make_io:
3541                 /*
3542                  * If we need to do any I/O, try to pre-readahead extra
3543                  * blocks from the inode table.
3544                  */
3545                 if (EXT4_SB(sb)->s_inode_readahead_blks) {
3546                         ext4_fsblk_t b, end, table;
3547                         unsigned num;
3548
3549                         table = ext4_inode_table(sb, gdp);
3550                         /* s_inode_readahead_blks is always a power of 2 */
3551                         b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
3552                         if (table > b)
3553                                 b = table;
3554                         end = b + EXT4_SB(sb)->s_inode_readahead_blks;
3555                         num = EXT4_INODES_PER_GROUP(sb);
3556                         if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3557                                        EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
3558                                 num -= ext4_itable_unused_count(sb, gdp);
3559                         table += num / inodes_per_block;
3560                         if (end > table)
3561                                 end = table;
3562                         while (b <= end)
3563                                 sb_breadahead(sb, b++);
3564                 }
3565
3566                 /*
3567                  * There are other valid inodes in the buffer, this inode
3568                  * has in-inode xattrs, or we don't have this inode in memory.
3569                  * Read the block from disk.
3570                  */
3571                 trace_ext4_load_inode(inode);
3572                 get_bh(bh);
3573                 bh->b_end_io = end_buffer_read_sync;
3574                 submit_bh(READ_META, bh);
3575                 wait_on_buffer(bh);
3576                 if (!buffer_uptodate(bh)) {
3577                         EXT4_ERROR_INODE_BLOCK(inode, block,
3578                                                "unable to read itable block");
3579                         brelse(bh);
3580                         return -EIO;
3581                 }
3582         }
3583 has_buffer:
3584         iloc->bh = bh;
3585         return 0;
3586 }
3587
3588 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
3589 {
3590         /* We have all inode data except xattrs in memory here. */
3591         return __ext4_get_inode_loc(inode, iloc,
3592                 !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
3593 }
3594
3595 void ext4_set_inode_flags(struct inode *inode)
3596 {
3597         unsigned int flags = EXT4_I(inode)->i_flags;
3598
3599         inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
3600         if (flags & EXT4_SYNC_FL)
3601                 inode->i_flags |= S_SYNC;
3602         if (flags & EXT4_APPEND_FL)
3603                 inode->i_flags |= S_APPEND;
3604         if (flags & EXT4_IMMUTABLE_FL)
3605                 inode->i_flags |= S_IMMUTABLE;
3606         if (flags & EXT4_NOATIME_FL)
3607                 inode->i_flags |= S_NOATIME;
3608         if (flags & EXT4_DIRSYNC_FL)
3609                 inode->i_flags |= S_DIRSYNC;
3610 }
3611
3612 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
3613 void ext4_get_inode_flags(struct ext4_inode_info *ei)
3614 {
3615         unsigned int vfs_fl;
3616         unsigned long old_fl, new_fl;
3617
3618         do {
3619                 vfs_fl = ei->vfs_inode.i_flags;
3620                 old_fl = ei->i_flags;
3621                 new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
3622                                 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
3623                                 EXT4_DIRSYNC_FL);
3624                 if (vfs_fl & S_SYNC)
3625                         new_fl |= EXT4_SYNC_FL;
3626                 if (vfs_fl & S_APPEND)
3627                         new_fl |= EXT4_APPEND_FL;
3628                 if (vfs_fl & S_IMMUTABLE)
3629                         new_fl |= EXT4_IMMUTABLE_FL;
3630                 if (vfs_fl & S_NOATIME)
3631                         new_fl |= EXT4_NOATIME_FL;
3632                 if (vfs_fl & S_DIRSYNC)
3633                         new_fl |= EXT4_DIRSYNC_FL;
3634         } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
3635 }
3636
3637 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
3638                                   struct ext4_inode_info *ei)
3639 {
3640         blkcnt_t i_blocks ;
3641         struct inode *inode = &(ei->vfs_inode);
3642         struct super_block *sb = inode->i_sb;
3643
3644         if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3645                                 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
3646                 /* we are using combined 48 bit field */
3647                 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
3648                                         le32_to_cpu(raw_inode->i_blocks_lo);
3649                 if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
3650                         /* i_blocks represent file system block size */
3651                         return i_blocks  << (inode->i_blkbits - 9);
3652                 } else {
3653                         return i_blocks;
3654                 }
3655         } else {
3656                 return le32_to_cpu(raw_inode->i_blocks_lo);
3657         }
3658 }
3659
3660 struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3661 {
3662         struct ext4_iloc iloc;
3663         struct ext4_inode *raw_inode;
3664         struct ext4_inode_info *ei;
3665         struct inode *inode;
3666         journal_t *journal = EXT4_SB(sb)->s_journal;
3667         long ret;
3668         int block;
3669
3670         inode = iget_locked(sb, ino);
3671         if (!inode)
3672                 return ERR_PTR(-ENOMEM);
3673         if (!(inode->i_state & I_NEW))
3674                 return inode;
3675
3676         ei = EXT4_I(inode);
3677         iloc.bh = NULL;
3678
3679         ret = __ext4_get_inode_loc(inode, &iloc, 0);
3680         if (ret < 0)
3681                 goto bad_inode;
3682         raw_inode = ext4_raw_inode(&iloc);
3683         inode->i_mode = le16_to_cpu(raw_inode->i_mode);
3684         inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
3685         inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
3686         if (!(test_opt(inode->i_sb, NO_UID32))) {
3687                 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
3688                 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
3689         }
3690         inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
3691
3692         ext4_clear_state_flags(ei);     /* Only relevant on 32-bit archs */
3693         ei->i_dir_start_lookup = 0;
3694         ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
3695         /* We now have enough fields to check if the inode was active or not.
3696          * This is needed because nfsd might try to access dead inodes
3697          * the test is that same one that e2fsck uses
3698          * NeilBrown 1999oct15
3699          */
3700         if (inode->i_nlink == 0) {
3701                 if (inode->i_mode == 0 ||
3702                     !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
3703                         /* this inode is deleted */
3704                         ret = -ESTALE;
3705                         goto bad_inode;
3706                 }
3707                 /* The only unlinked inodes we let through here have
3708                  * valid i_mode and are being read by the orphan
3709                  * recovery code: that's fine, we're about to complete
3710                  * the process of deleting those. */
3711         }
3712         ei->i_flags = le32_to_cpu(raw_inode->i_flags);
3713         inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
3714         ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
3715         if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
3716                 ei->i_file_acl |=
3717                         ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
3718         inode->i_size = ext4_isize(raw_inode);
3719         ei->i_disksize = inode->i_size;
3720 #ifdef CONFIG_QUOTA
3721         ei->i_reserved_quota = 0;
3722 #endif
3723         inode->i_generation = le32_to_cpu(raw_inode->i_generation);
3724         ei->i_block_group = iloc.block_group;
3725         ei->i_last_alloc_group = ~0;
3726         /*
3727          * NOTE! The in-memory inode i_data array is in little-endian order
3728          * even on big-endian machines: we do NOT byteswap the block numbers!
3729          */
3730         for (block = 0; block < EXT4_N_BLOCKS; block++)
3731                 ei->i_data[block] = raw_inode->i_block[block];
3732         INIT_LIST_HEAD(&ei->i_orphan);
3733
3734         /*
3735          * Set transaction id's of transactions that have to be committed
3736          * to finish f[data]sync. We set them to currently running transaction
3737          * as we cannot be sure that the inode or some of its metadata isn't
3738          * part of the transaction - the inode could have been reclaimed and
3739          * now it is reread from disk.
3740          */
3741         if (journal) {
3742                 transaction_t *transaction;
3743                 tid_t tid;
3744
3745                 read_lock(&journal->j_state_lock);
3746                 if (journal->j_running_transaction)
3747                         transaction = journal->j_running_transaction;
3748                 else
3749                         transaction = journal->j_committing_transaction;
3750                 if (transaction)
3751                         tid = transaction->t_tid;
3752                 else
3753                         tid = journal->j_commit_sequence;
3754                 read_unlock(&journal->j_state_lock);
3755                 ei->i_sync_tid = tid;
3756                 ei->i_datasync_tid = tid;
3757         }
3758
3759         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
3760                 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
3761                 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
3762                     EXT4_INODE_SIZE(inode->i_sb)) {
3763                         ret = -EIO;
3764                         goto bad_inode;
3765                 }
3766                 if (ei->i_extra_isize == 0) {
3767                         /* The extra space is currently unused. Use it. */
3768                         ei->i_extra_isize = sizeof(struct ext4_inode) -
3769                                             EXT4_GOOD_OLD_INODE_SIZE;
3770                 } else {
3771                         __le32 *magic = (void *)raw_inode +
3772                                         EXT4_GOOD_OLD_INODE_SIZE +
3773                                         ei->i_extra_isize;
3774                         if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
3775                                 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
3776                 }
3777         } else
3778                 ei->i_extra_isize = 0;
3779
3780         EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
3781         EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
3782         EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
3783         EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
3784
3785         inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
3786         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
3787                 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
3788                         inode->i_version |=
3789                         (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
3790         }
3791
3792         ret = 0;
3793         if (ei->i_file_acl &&
3794             !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
3795                 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
3796                                  ei->i_file_acl);
3797                 ret = -EIO;
3798                 goto bad_inode;
3799         } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
3800                 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
3801                     (S_ISLNK(inode->i_mode) &&
3802                      !ext4_inode_is_fast_symlink(inode)))
3803                         /* Validate extent which is part of inode */
3804                         ret = ext4_ext_check_inode(inode);
3805         } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
3806                    (S_ISLNK(inode->i_mode) &&
3807                     !ext4_inode_is_fast_symlink(inode))) {
3808                 /* Validate block references which are part of inode */
3809                 ret = ext4_ind_check_inode(inode);
3810         }
3811         if (ret)
3812                 goto bad_inode;
3813
3814         if (S_ISREG(inode->i_mode)) {
3815                 inode->i_op = &ext4_file_inode_operations;
3816                 inode->i_fop = &ext4_file_operations;
3817                 ext4_set_aops(inode);
3818         } else if (S_ISDIR(inode->i_mode)) {
3819                 inode->i_op = &ext4_dir_inode_operations;
3820                 inode->i_fop = &ext4_dir_operations;
3821         } else if (S_ISLNK(inode->i_mode)) {
3822                 if (ext4_inode_is_fast_symlink(inode)) {
3823                         inode->i_op = &ext4_fast_symlink_inode_operations;
3824                         nd_terminate_link(ei->i_data, inode->i_size,
3825                                 sizeof(ei->i_data) - 1);
3826                 } else {
3827                         inode->i_op = &ext4_symlink_inode_operations;
3828                         ext4_set_aops(inode);
3829                 }
3830         } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
3831               S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
3832                 inode->i_op = &ext4_special_inode_operations;
3833                 if (raw_inode->i_block[0])
3834                         init_special_inode(inode, inode->i_mode,
3835                            old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
3836                 else
3837                         init_special_inode(inode, inode->i_mode,
3838                            new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
3839         } else {
3840                 ret = -EIO;
3841                 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
3842                 goto bad_inode;
3843         }
3844         brelse(iloc.bh);
3845         ext4_set_inode_flags(inode);
3846         unlock_new_inode(inode);
3847         return inode;
3848
3849 bad_inode:
3850         brelse(iloc.bh);
3851         iget_failed(inode);
3852         return ERR_PTR(ret);
3853 }
3854
3855 static int ext4_inode_blocks_set(handle_t *handle,
3856                                 struct ext4_inode *raw_inode,
3857                                 struct ext4_inode_info *ei)
3858 {
3859         struct inode *inode = &(ei->vfs_inode);
3860         u64 i_blocks = inode->i_blocks;
3861         struct super_block *sb = inode->i_sb;
3862
3863         if (i_blocks <= ~0U) {
3864                 /*
3865                  * i_blocks can be represnted in a 32 bit variable
3866                  * as multiple of 512 bytes
3867                  */
3868                 raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
3869                 raw_inode->i_blocks_high = 0;
3870                 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
3871                 return 0;
3872         }
3873         if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
3874                 return -EFBIG;
3875
3876         if (i_blocks <= 0xffffffffffffULL) {
3877                 /*
3878                  * i_blocks can be represented in a 48 bit variable
3879                  * as multiple of 512 bytes
3880                  */
3881                 raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
3882                 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
3883                 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
3884         } else {
3885                 ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
3886                 /* i_block is stored in file system block size */
3887                 i_blocks = i_blocks >> (inode->i_blkbits - 9);
3888                 raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
3889                 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
3890         }
3891         return 0;
3892 }
3893
3894 /*
3895  * Post the struct inode info into an on-disk inode location in the
3896  * buffer-cache.  This gobbles the caller's reference to the
3897  * buffer_head in the inode location struct.
3898  *
3899  * The caller must have write access to iloc->bh.
3900  */
3901 static int ext4_do_update_inode(handle_t *handle,
3902                                 struct inode *inode,
3903                                 struct ext4_iloc *iloc)
3904 {
3905         struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
3906         struct ext4_inode_info *ei = EXT4_I(inode);
3907         struct buffer_head *bh = iloc->bh;
3908         int err = 0, rc, block;
3909
3910         /* For fields not not tracking in the in-memory inode,
3911          * initialise them to zero for new inodes. */
3912         if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
3913                 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
3914
3915         ext4_get_inode_flags(ei);
3916         raw_inode->i_mode = cpu_to_le16(inode->i_mode);
3917         if (!(test_opt(inode->i_sb, NO_UID32))) {
3918                 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
3919                 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
3920 /*
3921  * Fix up interoperability with old kernels. Otherwise, old inodes get
3922  * re-used with the upper 16 bits of the uid/gid intact
3923  */
3924                 if (!ei->i_dtime) {
3925                         raw_inode->i_uid_high =
3926                                 cpu_to_le16(high_16_bits(inode->i_uid));
3927                         raw_inode->i_gid_high =
3928                                 cpu_to_le16(high_16_bits(inode->i_gid));
3929                 } else {
3930                         raw_inode->i_uid_high = 0;
3931                         raw_inode->i_gid_high = 0;
3932                 }
3933         } else {
3934                 raw_inode->i_uid_low =
3935                         cpu_to_le16(fs_high2lowuid(inode->i_uid));
3936                 raw_inode->i_gid_low =
3937                         cpu_to_le16(fs_high2lowgid(inode->i_gid));
3938                 raw_inode->i_uid_high = 0;
3939                 raw_inode->i_gid_high = 0;
3940         }
3941         raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
3942
3943         EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
3944         EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
3945         EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
3946         EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
3947
3948         if (ext4_inode_blocks_set(handle, raw_inode, ei))
3949                 goto out_brelse;
3950         raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
3951         raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
3952         if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
3953             cpu_to_le32(EXT4_OS_HURD))
3954                 raw_inode->i_file_acl_high =
3955                         cpu_to_le16(ei->i_file_acl >> 32);
3956         raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
3957         ext4_isize_set(raw_inode, ei->i_disksize);
3958         if (ei->i_disksize > 0x7fffffffULL) {
3959                 struct super_block *sb = inode->i_sb;
3960                 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
3961                                 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
3962                                 EXT4_SB(sb)->s_es->s_rev_level ==
3963                                 cpu_to_le32(EXT4_GOOD_OLD_REV)) {
3964                         /* If this is the first large file
3965                          * created, add a flag to the superblock.
3966                          */
3967                         err = ext4_journal_get_write_access(handle,
3968                                         EXT4_SB(sb)->s_sbh);
3969                         if (err)
3970                                 goto out_brelse;
3971                         ext4_update_dynamic_rev(sb);
3972                         EXT4_SET_RO_COMPAT_FEATURE(sb,
3973                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
3974                         sb->s_dirt = 1;
3975                         ext4_handle_sync(handle);
3976                         err = ext4_handle_dirty_metadata(handle, NULL,
3977                                         EXT4_SB(sb)->s_sbh);
3978                 }
3979         }
3980         raw_inode->i_generation = cpu_to_le32(inode->i_generation);
3981         if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
3982                 if (old_valid_dev(inode->i_rdev)) {
3983                         raw_inode->i_block[0] =
3984                                 cpu_to_le32(old_encode_dev(inode->i_rdev));
3985                         raw_inode->i_block[1] = 0;
3986                 } else {
3987                         raw_inode->i_block[0] = 0;
3988                         raw_inode->i_block[1] =
3989                                 cpu_to_le32(new_encode_dev(inode->i_rdev));
3990                         raw_inode->i_block[2] = 0;
3991                 }
3992         } else
3993                 for (block = 0; block < EXT4_N_BLOCKS; block++)
3994                         raw_inode->i_block[block] = ei->i_data[block];
3995
3996         raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
3997         if (ei->i_extra_isize) {
3998                 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
3999                         raw_inode->i_version_hi =
4000                         cpu_to_le32(inode->i_version >> 32);
4001                 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
4002         }
4003
4004         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4005         rc = ext4_handle_dirty_metadata(handle, NULL, bh);
4006         if (!err)
4007                 err = rc;
4008         ext4_clear_inode_state(inode, EXT4_STATE_NEW);
4009
4010         ext4_update_inode_fsync_trans(handle, inode, 0);
4011 out_brelse:
4012         brelse(bh);
4013         ext4_std_error(inode->i_sb, err);
4014         return err;
4015 }
4016
4017 /*
4018  * ext4_write_inode()
4019  *
4020  * We are called from a few places:
4021  *
4022  * - Within generic_file_write() for O_SYNC files.
4023  *   Here, there will be no transaction running. We wait for any running
4024  *   trasnaction to commit.
4025  *
4026  * - Within sys_sync(), kupdate and such.
4027  *   We wait on commit, if tol to.
4028  *
4029  * - Within prune_icache() (PF_MEMALLOC == true)
4030  *   Here we simply return.  We can't afford to block kswapd on the
4031  *   journal commit.
4032  *
4033  * In all cases it is actually safe for us to return without doing anything,
4034  * because the inode has been copied into a raw inode buffer in
4035  * ext4_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
4036  * knfsd.
4037  *
4038  * Note that we are absolutely dependent upon all inode dirtiers doing the
4039  * right thing: they *must* call mark_inode_dirty() after dirtying info in
4040  * which we are interested.
4041  *
4042  * It would be a bug for them to not do this.  The code:
4043  *
4044  *      mark_inode_dirty(inode)
4045  *      stuff();
4046  *      inode->i_size = expr;
4047  *
4048  * is in error because a kswapd-driven write_inode() could occur while
4049  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
4050  * will no longer be on the superblock's dirty inode list.
4051  */
4052 int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
4053 {
4054         int err;
4055
4056         if (current->flags & PF_MEMALLOC)
4057                 return 0;
4058
4059         if (EXT4_SB(inode->i_sb)->s_journal) {
4060                 if (ext4_journal_current_handle()) {
4061                         jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
4062                         dump_stack();
4063                         return -EIO;
4064                 }
4065
4066                 if (wbc->sync_mode != WB_SYNC_ALL)
4067                         return 0;
4068
4069                 err = ext4_force_commit(inode->i_sb);
4070         } else {
4071                 struct ext4_iloc iloc;
4072
4073                 err = __ext4_get_inode_loc(inode, &iloc, 0);
4074                 if (err)
4075                         return err;
4076                 if (wbc->sync_mode == WB_SYNC_ALL)
4077                         sync_dirty_buffer(iloc.bh);
4078                 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
4079                         EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
4080                                          "IO error syncing inode");
4081                         err = -EIO;
4082                 }
4083                 brelse(iloc.bh);
4084         }
4085         return err;
4086 }
4087
4088 /*
4089  * ext4_setattr()
4090  *
4091  * Called from notify_change.
4092  *
4093  * We want to trap VFS attempts to truncate the file as soon as
4094  * possible.  In particular, we want to make sure that when the VFS
4095  * shrinks i_size, we put the inode on the orphan list and modify
4096  * i_disksize immediately, so that during the subsequent flushing of
4097  * dirty pages and freeing of disk blocks, we can guarantee that any
4098  * commit will leave the blocks being flushed in an unused state on
4099  * disk.  (On recovery, the inode will get truncated and the blocks will
4100  * be freed, so we have a strong guarantee that no future commit will
4101  * leave these blocks visible to the user.)
4102  *
4103  * Another thing we have to assure is that if we are in ordered mode
4104  * and inode is still attached to the committing transaction, we must
4105  * we start writeout of all the dirty pages which are being truncated.
4106  * This way we are sure that all the data written in the previous
4107  * transaction are already on disk (truncate waits for pages under
4108  * writeback).
4109  *
4110  * Called with inode->i_mutex down.
4111  */
4112 int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4113 {
4114         struct inode *inode = dentry->d_inode;
4115         int error, rc = 0;
4116         int orphan = 0;
4117         const unsigned int ia_valid = attr->ia_valid;
4118
4119         error = inode_change_ok(inode, attr);
4120         if (error)
4121                 return error;
4122
4123         if (is_quota_modification(inode, attr))
4124                 dquot_initialize(inode);
4125         if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
4126                 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
4127                 handle_t *handle;
4128
4129                 /* (user+group)*(old+new) structure, inode write (sb,
4130                  * inode block, ? - but truncate inode update has it) */
4131                 handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
4132                                         EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
4133                 if (IS_ERR(handle)) {
4134                         error = PTR_ERR(handle);
4135                         goto err_out;
4136                 }
4137                 error = dquot_transfer(inode, attr);
4138                 if (error) {
4139                         ext4_journal_stop(handle);
4140                         return error;
4141                 }
4142                 /* Update corresponding info in inode so that everything is in
4143                  * one transaction */
4144                 if (attr->ia_valid & ATTR_UID)
4145                         inode->i_uid = attr->ia_uid;
4146                 if (attr->ia_valid & ATTR_GID)
4147                         inode->i_gid = attr->ia_gid;
4148                 error = ext4_mark_inode_dirty(handle, inode);
4149                 ext4_journal_stop(handle);
4150         }
4151
4152         if (attr->ia_valid & ATTR_SIZE) {
4153                 inode_dio_wait(inode);
4154
4155                 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4156                         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4157
4158                         if (attr->ia_size > sbi->s_bitmap_maxbytes)
4159                                 return -EFBIG;
4160                 }
4161         }
4162
4163         if (S_ISREG(inode->i_mode) &&
4164             attr->ia_valid & ATTR_SIZE &&
4165             (attr->ia_size < inode->i_size)) {
4166                 handle_t *handle;
4167
4168                 handle = ext4_journal_start(inode, 3);
4169                 if (IS_ERR(handle)) {
4170                         error = PTR_ERR(handle);
4171                         goto err_out;
4172                 }
4173                 if (ext4_handle_valid(handle)) {
4174                         error = ext4_orphan_add(handle, inode);
4175                         orphan = 1;
4176                 }
4177                 EXT4_I(inode)->i_disksize = attr->ia_size;
4178                 rc = ext4_mark_inode_dirty(handle, inode);
4179                 if (!error)
4180                         error = rc;
4181                 ext4_journal_stop(handle);
4182
4183                 if (ext4_should_order_data(inode)) {
4184                         error = ext4_begin_ordered_truncate(inode,
4185                                                             attr->ia_size);
4186                         if (error) {
4187                                 /* Do as much error cleanup as possible */
4188                                 handle = ext4_journal_start(inode, 3);
4189                                 if (IS_ERR(handle)) {
4190                                         ext4_orphan_del(NULL, inode);
4191                                         goto err_out;
4192                                 }
4193                                 ext4_orphan_del(handle, inode);
4194                                 orphan = 0;
4195                                 ext4_journal_stop(handle);
4196                                 goto err_out;
4197                         }
4198                 }
4199         }
4200
4201         if (attr->ia_valid & ATTR_SIZE) {
4202                 if (attr->ia_size != i_size_read(inode)) {
4203                         truncate_setsize(inode, attr->ia_size);
4204                         ext4_truncate(inode);
4205                 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
4206                         ext4_truncate(inode);
4207         }
4208
4209         if (!rc) {
4210                 setattr_copy(inode, attr);
4211                 mark_inode_dirty(inode);
4212         }
4213
4214         /*
4215          * If the call to ext4_truncate failed to get a transaction handle at
4216          * all, we need to clean up the in-core orphan list manually.
4217          */
4218         if (orphan && inode->i_nlink)
4219                 ext4_orphan_del(NULL, inode);
4220
4221         if (!rc && (ia_valid & ATTR_MODE))
4222                 rc = ext4_acl_chmod(inode);
4223
4224 err_out:
4225         ext4_std_error(inode->i_sb, error);
4226         if (!error)
4227                 error = rc;
4228         return error;
4229 }
4230
4231 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4232                  struct kstat *stat)
4233 {
4234         struct inode *inode;
4235         unsigned long delalloc_blocks;
4236
4237         inode = dentry->d_inode;
4238         generic_fillattr(inode, stat);
4239
4240         /*
4241          * We can't update i_blocks if the block allocation is delayed
4242          * otherwise in the case of system crash before the real block
4243          * allocation is done, we will have i_blocks inconsistent with
4244          * on-disk file blocks.
4245          * We always keep i_blocks updated together with real
4246          * allocation. But to not confuse with user, stat
4247          * will return the blocks that include the delayed allocation
4248          * blocks for this file.
4249          */
4250         delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
4251
4252         stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
4253         return 0;
4254 }
4255
4256 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4257 {
4258         if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4259                 return ext4_ind_trans_blocks(inode, nrblocks, chunk);
4260         return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
4261 }
4262
4263 /*
4264  * Account for index blocks, block groups bitmaps and block group
4265  * descriptor blocks if modify datablocks and index blocks
4266  * worse case, the indexs blocks spread over different block groups
4267  *
4268  * If datablocks are discontiguous, they are possible to spread over
4269  * different block groups too. If they are contiuguous, with flexbg,
4270  * they could still across block group boundary.
4271  *
4272  * Also account for superblock, inode, quota and xattr blocks
4273  */
4274 static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4275 {
4276         ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
4277         int gdpblocks;
4278         int idxblocks;
4279         int ret = 0;
4280
4281         /*
4282          * How many index blocks need to touch to modify nrblocks?
4283          * The "Chunk" flag indicating whether the nrblocks is
4284          * physically contiguous on disk
4285          *
4286          * For Direct IO and fallocate, they calls get_block to allocate
4287          * one single extent at a time, so they could set the "Chunk" flag
4288          */
4289         idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
4290
4291         ret = idxblocks;
4292
4293         /*
4294          * Now let's see how many group bitmaps and group descriptors need
4295          * to account
4296          */
4297         groups = idxblocks;
4298         if (chunk)
4299                 groups += 1;
4300         else
4301                 groups += nrblocks;
4302
4303         gdpblocks = groups;
4304         if (groups > ngroups)
4305                 groups = ngroups;
4306         if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
4307                 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
4308
4309         /* bitmaps and block group descriptor blocks */
4310         ret += groups + gdpblocks;
4311
4312         /* Blocks for super block, inode, quota and xattr blocks */
4313         ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
4314
4315         return ret;
4316 }
4317
/*
 * Calculate the total number of credits to reserve so that the
 * modification of a single page fits into a single transaction,
 * which may include multiple chunks of block allocations.
 *
 * This could be called via ext4_write_begin().
 *
 * We need to consider the worst case: one new block per extent.
 */
int ext4_writepage_trans_blocks(struct inode *inode)
{
        int blocks_per_page = ext4_journal_blocks_per_page(inode);
        int credits = ext4_meta_trans_blocks(inode, blocks_per_page, 0);

        if (!ext4_should_journal_data(inode))
                return credits;

        /* Account for data blocks for journalled mode */
        return credits + blocks_per_page;
}
4340
/*
 * Journal credits needed to modify one chunk of data.
 *
 * Used by DIO, fallocate and any other caller of ext4_map_blocks() that
 * maps/allocates a chunk of contiguous disk blocks.
 *
 * Credits for the data blocks themselves are not counted here, since
 * DIO and fallocate do not need to journal data buffers.
 */
int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
{
        /* the blocks are treated as one physically contiguous chunk */
        const int contiguous = 1;

        return ext4_meta_trans_blocks(inode, nrblocks, contiguous);
}
4354
4355 /*
4356  * The caller must have previously called ext4_reserve_inode_write().
4357  * Give this, we know that the caller already has write access to iloc->bh.
4358  */
4359 int ext4_mark_iloc_dirty(handle_t *handle,
4360                          struct inode *inode, struct ext4_iloc *iloc)
4361 {
4362         int err = 0;
4363
4364         if (test_opt(inode->i_sb, I_VERSION))
4365                 inode_inc_iversion(inode);
4366
4367         /* the do_update_inode consumes one bh->b_count */
4368         get_bh(iloc->bh);
4369
4370         /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
4371         err = ext4_do_update_inode(handle, inode, iloc);
4372         put_bh(iloc->bh);
4373         return err;
4374 }
4375
4376 /*
4377  * On success, We end up with an outstanding reference count against
4378  * iloc->bh.  This _must_ be cleaned up later.
4379  */
4380
4381 int
4382 ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
4383                          struct ext4_iloc *iloc)
4384 {
4385         int err;
4386
4387         err = ext4_get_inode_loc(inode, iloc);
4388         if (!err) {
4389                 BUFFER_TRACE(iloc->bh, "get_write_access");
4390                 err = ext4_journal_get_write_access(handle, iloc->bh);
4391                 if (err) {
4392                         brelse(iloc->bh);
4393                         iloc->bh = NULL;
4394                 }
4395         }
4396         ext4_std_error(inode->i_sb, err);
4397         return err;
4398 }
4399
4400 /*
4401  * Expand an inode by new_extra_isize bytes.
4402  * Returns 0 on success or negative error number on failure.
4403  */
4404 static int ext4_expand_extra_isize(struct inode *inode,
4405                                    unsigned int new_extra_isize,
4406                                    struct ext4_iloc iloc,
4407                                    handle_t *handle)
4408 {
4409         struct ext4_inode *raw_inode;
4410         struct ext4_xattr_ibody_header *header;
4411
4412         if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
4413                 return 0;
4414
4415         raw_inode = ext4_raw_inode(&iloc);
4416
4417         header = IHDR(inode, raw_inode);
4418
4419         /* No extended attributes present */
4420         if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
4421             header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
4422                 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
4423                         new_extra_isize);
4424                 EXT4_I(inode)->i_extra_isize = new_extra_isize;
4425                 return 0;
4426         }
4427
4428         /* try to expand with EAs present */
4429         return ext4_expand_extra_isize_ea(inode, new_extra_isize,
4430                                           raw_inode, handle);
4431 }
4432
4433 /*
4434  * What we do here is to mark the in-core inode as clean with respect to inode
4435  * dirtiness (it may still be data-dirty).
4436  * This means that the in-core inode may be reaped by prune_icache
4437  * without having to perform any I/O.  This is a very good thing,
4438  * because *any* task may call prune_icache - even ones which
4439  * have a transaction open against a different journal.
4440  *
4441  * Is this cheating?  Not really.  Sure, we haven't written the
4442  * inode out, but prune_icache isn't a user-visible syncing function.
4443  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
4444  * we start and wait on commits.
4445  *
4446  * Is this efficient/effective?  Well, we're being nice to the system
4447  * by cleaning up our inodes proactively so they can be reaped
4448  * without I/O.  But we are potentially leaving up to five seconds'
4449  * worth of inodes floating about which prune_icache wants us to
4450  * write out.  One way to fix that would be to get prune_icache()
4451  * to do a write_super() to free up some memory.  It has the desired
4452  * effect.
4453  */
4454 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
4455 {
4456         struct ext4_iloc iloc;
4457         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4458         static unsigned int mnt_count;
4459         int err, ret;
4460
4461         might_sleep();
4462         trace_ext4_mark_inode_dirty(inode, _RET_IP_);
4463         err = ext4_reserve_inode_write(handle, inode, &iloc);
4464         if (ext4_handle_valid(handle) &&
4465             EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
4466             !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
4467                 /*
4468                  * We need extra buffer credits since we may write into EA block
4469                  * with this same handle. If journal_extend fails, then it will
4470                  * only result in a minor loss of functionality for that inode.
4471                  * If this is felt to be critical, then e2fsck should be run to
4472                  * force a large enough s_min_extra_isize.
4473                  */
4474                 if ((jbd2_journal_extend(handle,
4475                              EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
4476                         ret = ext4_expand_extra_isize(inode,
4477                                                       sbi->s_want_extra_isize,
4478                                                       iloc, handle);
4479                         if (ret) {
4480                                 ext4_set_inode_state(inode,
4481                                                      EXT4_STATE_NO_EXPAND);
4482                                 if (mnt_count !=
4483                                         le16_to_cpu(sbi->s_es->s_mnt_count)) {
4484                                         ext4_warning(inode->i_sb,
4485                                         "Unable to expand inode %lu. Delete"
4486                                         " some EAs or run e2fsck.",
4487                                         inode->i_ino);
4488                                         mnt_count =
4489                                           le16_to_cpu(sbi->s_es->s_mnt_count);
4490                                 }
4491                         }
4492                 }
4493         }
4494         if (!err)
4495                 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
4496         return err;
4497 }
4498
4499 /*
4500  * ext4_dirty_inode() is called from __mark_inode_dirty()
4501  *
4502  * We're really interested in the case where a file is being extended.
4503  * i_size has been changed by generic_commit_write() and we thus need
4504  * to include the updated inode in the current transaction.
4505  *
4506  * Also, dquot_alloc_block() will always dirty the inode when blocks
4507  * are allocated to the file.
4508  *
4509  * If the inode is marked synchronous, we don't honour that here - doing
4510  * so would cause a commit on atime updates, which we don't bother doing.
4511  * We handle synchronous inodes at the highest possible level.
4512  */
4513 void ext4_dirty_inode(struct inode *inode, int flags)
4514 {
4515         handle_t *handle;
4516
4517         handle = ext4_journal_start(inode, 2);
4518         if (IS_ERR(handle))
4519                 goto out;
4520
4521         ext4_mark_inode_dirty(handle, inode);
4522
4523         ext4_journal_stop(handle);
4524 out:
4525         return;
4526 }
4527
#if 0
/*
 * Tie the buffer_head backing an inode to this transaction so that it
 * is not flushed to disk early.  In contrast to
 * ext4_reserve_inode_write, no bh reference is left behind and no iloc
 * structure is returned, so the caller has to look the iloc up again
 * when it later marks the inode dirty.
 */
static int ext4_pin_inode(handle_t *handle, struct inode *inode)
{
        struct ext4_iloc iloc;
        int err = 0;

        if (!handle)
                goto report;

        err = ext4_get_inode_loc(inode, &iloc);
        if (err)
                goto report;

        BUFFER_TRACE(iloc.bh, "get_write_access");
        err = jbd2_journal_get_write_access(handle, iloc.bh);
        if (!err)
                err = ext4_handle_dirty_metadata(handle, NULL, iloc.bh);
        brelse(iloc.bh);

report:
        ext4_std_error(inode->i_sb, err);
        return err;
}
#endif
4557
4558 int ext4_change_inode_journal_flag(struct inode *inode, int val)
4559 {
4560         journal_t *journal;
4561         handle_t *handle;
4562         int err;
4563
4564         /*
4565          * We have to be very careful here: changing a data block's
4566          * journaling status dynamically is dangerous.  If we write a
4567          * data block to the journal, change the status and then delete
4568          * that block, we risk forgetting to revoke the old log record
4569          * from the journal and so a subsequent replay can corrupt data.
4570          * So, first we make sure that the journal is empty and that
4571          * nobody is changing anything.
4572          */
4573
4574         journal = EXT4_JOURNAL(inode);
4575         if (!journal)
4576                 return 0;
4577         if (is_journal_aborted(journal))
4578                 return -EROFS;
4579
4580         jbd2_journal_lock_updates(journal);
4581         jbd2_journal_flush(journal);
4582
4583         /*
4584          * OK, there are no updates running now, and all cached data is
4585          * synced to disk.  We are now in a completely consistent state
4586          * which doesn't have anything in the journal, and we know that
4587          * no filesystem updates are running, so it is safe to modify
4588          * the inode's in-core data-journaling state flag now.
4589          */
4590
4591         if (val)
4592                 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
4593         else
4594                 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
4595         ext4_set_aops(inode);
4596
4597         jbd2_journal_unlock_updates(journal);
4598
4599         /* Finally we can mark the inode as dirty. */
4600
4601         handle = ext4_journal_start(inode, 1);
4602         if (IS_ERR(handle))
4603                 return PTR_ERR(handle);
4604
4605         err = ext4_mark_inode_dirty(handle, inode);
4606         ext4_handle_sync(handle);
4607         ext4_journal_stop(handle);
4608         ext4_std_error(inode->i_sb, err);
4609
4610         return err;
4611 }
4612
4613 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
4614 {
4615         return !buffer_mapped(bh);
4616 }
4617
4618 int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4619 {
4620         struct page *page = vmf->page;
4621         loff_t size;
4622         unsigned long len;
4623         int ret;
4624         struct file *file = vma->vm_file;
4625         struct inode *inode = file->f_path.dentry->d_inode;
4626         struct address_space *mapping = inode->i_mapping;
4627         handle_t *handle;
4628         get_block_t *get_block;
4629         int retries = 0;
4630
4631         /*
4632          * This check is racy but catches the common case. We rely on
4633          * __block_page_mkwrite() to do a reliable check.
4634          */
4635         vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
4636         /* Delalloc case is easy... */
4637         if (test_opt(inode->i_sb, DELALLOC) &&
4638             !ext4_should_journal_data(inode) &&
4639             !ext4_nonda_switch(inode->i_sb)) {
4640                 do {
4641                         ret = __block_page_mkwrite(vma, vmf,
4642                                                    ext4_da_get_block_prep);
4643                 } while (ret == -ENOSPC &&
4644                        ext4_should_retry_alloc(inode->i_sb, &retries));
4645                 goto out_ret;
4646         }
4647
4648         lock_page(page);
4649         size = i_size_read(inode);
4650         /* Page got truncated from under us? */
4651         if (page->mapping != mapping || page_offset(page) > size) {
4652                 unlock_page(page);
4653                 ret = VM_FAULT_NOPAGE;
4654                 goto out;
4655         }
4656
4657         if (page->index == size >> PAGE_CACHE_SHIFT)
4658                 len = size & ~PAGE_CACHE_MASK;
4659         else
4660                 len = PAGE_CACHE_SIZE;
4661         /*
4662          * Return if we have all the buffers mapped. This avoids the need to do
4663          * journal_start/journal_stop which can block and take a long time
4664          */
4665         if (page_has_buffers(page)) {
4666                 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
4667                                         ext4_bh_unmapped)) {
4668                         /* Wait so that we don't change page under IO */
4669                         wait_on_page_writeback(page);
4670                         ret = VM_FAULT_LOCKED;
4671                         goto out;
4672                 }
4673         }
4674         unlock_page(page);
4675         /* OK, we need to fill the hole... */
4676         if (ext4_should_dioread_nolock(inode))
4677                 get_block = ext4_get_block_write;
4678         else
4679                 get_block = ext4_get_block;
4680 retry_alloc:
4681         handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
4682         if (IS_ERR(handle)) {
4683                 ret = VM_FAULT_SIGBUS;
4684                 goto out;
4685         }
4686         ret = __block_page_mkwrite(vma, vmf, get_block);
4687         if (!ret && ext4_should_journal_data(inode)) {
4688                 if (walk_page_buffers(handle, page_buffers(page), 0,
4689                           PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
4690                         unlock_page(page);
4691                         ret = VM_FAULT_SIGBUS;
4692                         goto out;
4693                 }
4694                 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
4695         }
4696         ext4_journal_stop(handle);
4697         if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
4698                 goto retry_alloc;
4699 out_ret:
4700         ret = block_page_mkwrite_return(ret);
4701 out:
4702         return ret;
4703 }