fs/btrfs/super.c

   1 #include <linux/module.h>
   2 #include <linux/buffer_head.h>
   3 #include <linux/fs.h>
   4 #include <linux/pagemap.h>
   5 #include <linux/highmem.h>
   6 #include <linux/time.h>
   7 #include <linux/init.h>
   8 #include <linux/string.h>
   9 #include <linux/smp_lock.h>
  10 #include <linux/backing-dev.h>
  11 #include <linux/mpage.h>
  12 #include <linux/swap.h>
  13 #include <linux/writeback.h>
  14 #include <linux/statfs.h>
  15 #include "ctree.h"
  16 #include "disk-io.h"
  17 #include "transaction.h"
  18 #include "btrfs_inode.h"
  19 #include "ioctl.h"
  20
  21 void btrfs_fsinfo_release(struct kobject *obj)
  22 {
  23         struct btrfs_fs_info *fsinfo = container_of(obj,
  24                                             struct btrfs_fs_info, kobj);
  25         kfree(fsinfo);
  26 }
  27
  28 struct kobj_type btrfs_fsinfo_ktype = {
  29         .release = btrfs_fsinfo_release,
  30 };
  31
  32 struct btrfs_iget_args {
  33         u64 ino;
  34         struct btrfs_root *root;
  35 };
  36
  37 decl_subsys(btrfs, &btrfs_fsinfo_ktype, NULL);
  38
  39 #define BTRFS_SUPER_MAGIC 0x9123682E
  40
  41 static struct inode_operations btrfs_dir_inode_operations;
  42 static struct inode_operations btrfs_dir_ro_inode_operations;
  43 static struct super_operations btrfs_super_ops;
  44 static struct file_operations btrfs_dir_file_operations;
  45 static struct inode_operations btrfs_file_inode_operations;
  46 static struct address_space_operations btrfs_aops;
  47 static struct file_operations btrfs_file_operations;
  48
  49 static void btrfs_read_locked_inode(struct inode *inode)
  50 {
  51         struct btrfs_path *path;
  52         struct btrfs_inode_item *inode_item;
  53         struct btrfs_root *root = BTRFS_I(inode)->root;
  54         struct btrfs_key location;
  55         int ret;
  56
  57         path = btrfs_alloc_path();
  58         BUG_ON(!path);
  59         btrfs_init_path(path);
  60         mutex_lock(&root->fs_info->fs_mutex);
  61
  62         memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
  63         ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
  64         if (ret) {
  65                 btrfs_free_path(path);
  66                 goto make_bad;
  67         }
  68         inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
  69                                   path->slots[0],
  70                                   struct btrfs_inode_item);
  71
  72         inode->i_mode = btrfs_inode_mode(inode_item);
  73         inode->i_nlink = btrfs_inode_nlink(inode_item);
  74         inode->i_uid = btrfs_inode_uid(inode_item);
  75         inode->i_gid = btrfs_inode_gid(inode_item);
  76         inode->i_size = btrfs_inode_size(inode_item);
  77         inode->i_atime.tv_sec = btrfs_timespec_sec(&inode_item->atime);
  78         inode->i_atime.tv_nsec = btrfs_timespec_nsec(&inode_item->atime);
  79         inode->i_mtime.tv_sec = btrfs_timespec_sec(&inode_item->mtime);
  80         inode->i_mtime.tv_nsec = btrfs_timespec_nsec(&inode_item->mtime);
  81         inode->i_ctime.tv_sec = btrfs_timespec_sec(&inode_item->ctime);
  82         inode->i_ctime.tv_nsec = btrfs_timespec_nsec(&inode_item->ctime);
  83         inode->i_blocks = btrfs_inode_nblocks(inode_item);
  84         inode->i_generation = btrfs_inode_generation(inode_item);
  85
  86         btrfs_free_path(path);
  87         inode_item = NULL;
  88
  89         mutex_unlock(&root->fs_info->fs_mutex);
  90
  91         switch (inode->i_mode & S_IFMT) {
  92 #if 0
  93         default:
  94                 init_special_inode(inode, inode->i_mode,
  95                                    btrfs_inode_rdev(inode_item));
  96                 break;
  97 #endif
  98         case S_IFREG:
  99                 inode->i_mapping->a_ops = &btrfs_aops;
 100                 inode->i_fop = &btrfs_file_operations;
 101                 inode->i_op = &btrfs_file_inode_operations;
 102                 break;
 103         case S_IFDIR:
 104                 inode->i_fop = &btrfs_dir_file_operations;
 105                 if (root == root->fs_info->tree_root)
 106                         inode->i_op = &btrfs_dir_ro_inode_operations;
 107                 else
 108                         inode->i_op = &btrfs_dir_inode_operations;
 109                 break;
 110         case S_IFLNK:
 111                 // inode->i_op = &page_symlink_inode_operations;
 112                 break;
 113         }
 114         return;
 115
 116 make_bad:
 117         btrfs_release_path(root, path);
 118         btrfs_free_path(path);
 119         mutex_unlock(&root->fs_info->fs_mutex);
 120         make_bad_inode(inode);
 121 }
 122
 123 static void fill_inode_item(struct btrfs_inode_item *item,
 124                             struct inode *inode)
 125 {
 126         btrfs_set_inode_uid(item, inode->i_uid);
 127         btrfs_set_inode_gid(item, inode->i_gid);
 128         btrfs_set_inode_size(item, inode->i_size);
 129         btrfs_set_inode_mode(item, inode->i_mode);
 130         btrfs_set_inode_nlink(item, inode->i_nlink);
 131         btrfs_set_timespec_sec(&item->atime, inode->i_atime.tv_sec);
 132         btrfs_set_timespec_nsec(&item->atime, inode->i_atime.tv_nsec);
 133         btrfs_set_timespec_sec(&item->mtime, inode->i_mtime.tv_sec);
 134         btrfs_set_timespec_nsec(&item->mtime, inode->i_mtime.tv_nsec);
 135         btrfs_set_timespec_sec(&item->ctime, inode->i_ctime.tv_sec);
 136         btrfs_set_timespec_nsec(&item->ctime, inode->i_ctime.tv_nsec);
 137         btrfs_set_inode_nblocks(item, inode->i_blocks);
 138         btrfs_set_inode_generation(item, inode->i_generation);
 139 }
 140
 141
 142 static int btrfs_update_inode(struct btrfs_trans_handle *trans,
 143                               struct btrfs_root *root,
 144                               struct inode *inode)
 145 {
 146         struct btrfs_inode_item *inode_item;
 147         struct btrfs_path *path;
 148         int ret;
 149
 150         path = btrfs_alloc_path();
 151         BUG_ON(!path);
 152         btrfs_init_path(path);
 153         ret = btrfs_lookup_inode(trans, root, path,
 154                                  &BTRFS_I(inode)->location, 1);
 155         if (ret) {
 156                 if (ret > 0)
 157                         ret = -ENOENT;
 158                 goto failed;
 159         }
 160
 161         inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
 162                                   path->slots[0],
 163                                   struct btrfs_inode_item);
 164
 165         fill_inode_item(inode_item, inode);
 166         btrfs_mark_buffer_dirty(path->nodes[0]);
 167         ret = 0;
 168 failed:
 169         btrfs_release_path(root, path);
 170         btrfs_free_path(path);
 171         return ret;
 172 }
 173
 174
 175 static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 176                               struct btrfs_root *root,
 177                               struct inode *dir,
 178                               struct dentry *dentry)
 179 {
 180         struct btrfs_path *path;
 181         const char *name = dentry->d_name.name;
 182         int name_len = dentry->d_name.len;
 183         int ret = 0;
 184         u64 objectid;
 185         struct btrfs_dir_item *di;
 186
 187         path = btrfs_alloc_path();
 188         BUG_ON(!path);
 189         btrfs_init_path(path);
 190         di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
 191                                     name, name_len, -1);
 192         if (IS_ERR(di)) {
 193                 ret = PTR_ERR(di);
 194                 goto err;
 195         }
 196         if (!di) {
 197                 ret = -ENOENT;
 198                 goto err;
 199         }
 200         objectid = btrfs_disk_key_objectid(&di->location);
 201         ret = btrfs_delete_one_dir_name(trans, root, path, di);
 202         BUG_ON(ret);
 203         btrfs_release_path(root, path);
 204
 205         di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
 206                                          objectid, name, name_len, -1);
 207         if (IS_ERR(di)) {
 208                 ret = PTR_ERR(di);
 209                 goto err;
 210         }
 211         if (!di) {
 212                 ret = -ENOENT;
 213                 goto err;
 214         }
 215         ret = btrfs_delete_one_dir_name(trans, root, path, di);
 216         BUG_ON(ret);
 217
 218         dentry->d_inode->i_ctime = dir->i_ctime;
 219 err:
 220         btrfs_free_path(path);
 221         if (!ret) {
 222                 dir->i_size -= name_len * 2;
 223                 btrfs_update_inode(trans, root, dir);
 224                 drop_nlink(dentry->d_inode);
 225                 btrfs_update_inode(trans, root, dentry->d_inode);
 226         }
 227         return ret;
 228 }
 229
 230 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 231 {
 232         struct btrfs_root *root;
 233         struct btrfs_trans_handle *trans;
 234         int ret;
 235
 236         root = BTRFS_I(dir)->root;
 237         mutex_lock(&root->fs_info->fs_mutex);
 238         trans = btrfs_start_transaction(root, 1);
 239         ret = btrfs_unlink_trans(trans, root, dir, dentry);
 240         btrfs_end_transaction(trans, root);
 241         mutex_unlock(&root->fs_info->fs_mutex);
 242         return ret;
 243 }
 244
 245 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 246 {
 247         struct inode *inode = dentry->d_inode;
 248         int err;
 249         int ret;
 250         struct btrfs_root *root = BTRFS_I(dir)->root;
 251         struct btrfs_path *path;
 252         struct btrfs_key key;
 253         struct btrfs_trans_handle *trans;
 254         struct btrfs_key found_key;
 255         int found_type;
 256         struct btrfs_leaf *leaf;
 257         char *goodnames = "..";
 258
 259         path = btrfs_alloc_path();
 260         BUG_ON(!path);
 261         btrfs_init_path(path);
 262         mutex_lock(&root->fs_info->fs_mutex);
 263         trans = btrfs_start_transaction(root, 1);
 264         key.objectid = inode->i_ino;
 265         key.offset = (u64)-1;
 266         key.flags = (u32)-1;
 267         while(1) {
 268                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 269                 if (ret < 0) {
 270                         err = ret;
 271                         goto out;
 272                 }
 273                 BUG_ON(ret == 0);
 274                 if (path->slots[0] == 0) {
 275                         err = -ENOENT;
 276                         goto out;
 277                 }
 278                 path->slots[0]--;
 279                 leaf = btrfs_buffer_leaf(path->nodes[0]);
 280                 btrfs_disk_key_to_cpu(&found_key,
 281                                       &leaf->items[path->slots[0]].key);
 282                 found_type = btrfs_key_type(&found_key);
 283                 if (found_key.objectid != inode->i_ino) {
 284                         err = -ENOENT;
 285                         goto out;
 286                 }
 287                 if ((found_type != BTRFS_DIR_ITEM_KEY &&
 288                      found_type != BTRFS_DIR_INDEX_KEY) ||
 289                     (!btrfs_match_dir_item_name(root, path, goodnames, 2) &&
 290                     !btrfs_match_dir_item_name(root, path, goodnames, 1))) {
 291                         err = -ENOTEMPTY;
 292                         goto out;
 293                 }
 294                 ret = btrfs_del_item(trans, root, path);
 295                 BUG_ON(ret);
 296
 297                 if (found_type == BTRFS_DIR_ITEM_KEY && found_key.offset == 1)
 298                         break;
 299                 btrfs_release_path(root, path);
 300         }
 301         ret = 0;
 302         btrfs_release_path(root, path);
 303
 304         /* now the directory is empty */
 305         err = btrfs_unlink_trans(trans, root, dir, dentry);
 306         if (!err) {
 307                 inode->i_size = 0;
 308         }
 309 out:
 310         btrfs_release_path(root, path);
 311         btrfs_free_path(path);
 312         mutex_unlock(&root->fs_info->fs_mutex);
 313         ret = btrfs_end_transaction(trans, root);
 314         if (ret && !err)
 315                 err = ret;
 316         return err;
 317 }
 318
 319 static int btrfs_free_inode(struct btrfs_trans_handle *trans,
 320                             struct btrfs_root *root,
 321                             struct inode *inode)
 322 {
 323         struct btrfs_path *path;
 324         int ret;
 325
 326         clear_inode(inode);
 327
 328         path = btrfs_alloc_path();
 329         BUG_ON(!path);
 330         btrfs_init_path(path);
 331         ret = btrfs_lookup_inode(trans, root, path,
 332                                  &BTRFS_I(inode)->location, -1);
 333         BUG_ON(ret);
 334         ret = btrfs_del_item(trans, root, path);
 335         BUG_ON(ret);
 336         btrfs_free_path(path);
 337         return ret;
 338 }
 339
 340 static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 341                                    struct btrfs_root *root,
 342                                    struct inode *inode)
 343 {
 344         int ret;
 345         struct btrfs_path *path;
 346         struct btrfs_key key;
 347         struct btrfs_disk_key *found_key;
 348         struct btrfs_leaf *leaf;
 349         struct btrfs_file_extent_item *fi = NULL;
 350         u64 extent_start = 0;
 351         u64 extent_num_blocks = 0;
 352         int found_extent;
 353
 354         path = btrfs_alloc_path();
 355         BUG_ON(!path);
 356         /* FIXME, add redo link to tree so we don't leak on crash */
 357         key.objectid = inode->i_ino;
 358         key.offset = (u64)-1;
 359         key.flags = 0;
 360         /*
 361          * use BTRFS_CSUM_ITEM_KEY because it is larger than inline keys
 362          * or extent data
 363          */
 364         btrfs_set_key_type(&key, BTRFS_CSUM_ITEM_KEY);
 365         while(1) {
 366                 btrfs_init_path(path);
 367                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 368                 if (ret < 0) {
 369                         goto error;
 370                 }
 371                 if (ret > 0) {
 372                         BUG_ON(path->slots[0] == 0);
 373                         path->slots[0]--;
 374                 }
 375                 leaf = btrfs_buffer_leaf(path->nodes[0]);
 376                 found_key = &leaf->items[path->slots[0]].key;
 377                 if (btrfs_disk_key_objectid(found_key) != inode->i_ino)
 378                         break;
 379                 if (btrfs_disk_key_type(found_key) != BTRFS_CSUM_ITEM_KEY &&
 380                     btrfs_disk_key_type(found_key) != BTRFS_EXTENT_DATA_KEY)
 381                         break;
 382                 if (btrfs_disk_key_offset(found_key) < inode->i_size)
 383                         break;
 384                 found_extent = 0;
 385                 if (btrfs_disk_key_type(found_key) == BTRFS_EXTENT_DATA_KEY) {
 386                         fi = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
 387                                             path->slots[0],
 388                                             struct btrfs_file_extent_item);
 389                         if (btrfs_file_extent_type(fi) !=
 390                             BTRFS_FILE_EXTENT_INLINE) {
 391                                 extent_start =
 392                                         btrfs_file_extent_disk_blocknr(fi);
 393                                 extent_num_blocks =
 394                                         btrfs_file_extent_disk_num_blocks(fi);
 395                                 /* FIXME blocksize != 4096 */
 396                                 inode->i_blocks -=
 397                                         btrfs_file_extent_num_blocks(fi) << 3;
 398                                 found_extent = 1;
 399                         }
 400                 }
 401                 ret = btrfs_del_item(trans, root, path);
 402                 BUG_ON(ret);
 403                 btrfs_release_path(root, path);
 404                 if (found_extent) {
 405                         ret = btrfs_free_extent(trans, root, extent_start,
 406                                                 extent_num_blocks, 0);
 407                         BUG_ON(ret);
 408                 }
 409         }
 410         ret = 0;
 411 error:
 412         btrfs_release_path(root, path);
 413         btrfs_free_path(path);
 414         return ret;
 415 }
 416
 417 static void btrfs_delete_inode(struct inode *inode)
 418 {
 419         struct btrfs_trans_handle *trans;
 420         struct btrfs_root *root = BTRFS_I(inode)->root;
 421         int ret;
 422
 423         truncate_inode_pages(&inode->i_data, 0);
 424         if (is_bad_inode(inode)) {
 425                 goto no_delete;
 426         }
 427         inode->i_size = 0;
 428         mutex_lock(&root->fs_info->fs_mutex);
 429         trans = btrfs_start_transaction(root, 1);
 430         if (S_ISREG(inode->i_mode)) {
 431                 ret = btrfs_truncate_in_trans(trans, root, inode);
 432                 BUG_ON(ret);
 433         }
 434         btrfs_free_inode(trans, root, inode);
 435         btrfs_end_transaction(trans, root);
 436         mutex_unlock(&root->fs_info->fs_mutex);
 437         return;
 438 no_delete:
 439         clear_inode(inode);
 440 }
 441
 442 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 443                                struct btrfs_key *location)
 444 {
 445         const char *name = dentry->d_name.name;
 446         int namelen = dentry->d_name.len;
 447         struct btrfs_dir_item *di;
 448         struct btrfs_path *path;
 449         struct btrfs_root *root = BTRFS_I(dir)->root;
 450         int ret;
 451
 452         path = btrfs_alloc_path();
 453         BUG_ON(!path);
 454         btrfs_init_path(path);
 455         di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
 456                                     namelen, 0);
 457         if (!di || IS_ERR(di)) {
 458                 location->objectid = 0;
 459                 ret = 0;
 460                 goto out;
 461         }
 462         btrfs_disk_key_to_cpu(location, &di->location);
 463 out:
 464         btrfs_release_path(root, path);
 465         btrfs_free_path(path);
 466         return ret;
 467 }
 468
 469 int fixup_tree_root_location(struct btrfs_root *root,
 470                              struct btrfs_key *location,
 471                              struct btrfs_root **sub_root)
 472 {
 473         struct btrfs_path *path;
 474         struct btrfs_root_item *ri;
 475
 476         if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
 477                 return 0;
 478         if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
 479                 return 0;
 480
 481         path = btrfs_alloc_path();
 482         BUG_ON(!path);
 483         mutex_lock(&root->fs_info->fs_mutex);
 484
 485         *sub_root = btrfs_read_fs_root(root->fs_info, location);
 486         if (IS_ERR(*sub_root))
 487                 return PTR_ERR(*sub_root);
 488
 489         ri = &(*sub_root)->root_item;
 490         location->objectid = btrfs_root_dirid(ri);
 491         location->flags = 0;
 492         btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
 493         location->offset = 0;
 494
 495         btrfs_free_path(path);
 496         mutex_unlock(&root->fs_info->fs_mutex);
 497         return 0;
 498 }
 499
 500 int btrfs_init_locked_inode(struct inode *inode, void *p)
 501 {
 502         struct btrfs_iget_args *args = p;
 503         inode->i_ino = args->ino;
 504         BTRFS_I(inode)->root = args->root;
 505         return 0;
 506 }
 507
 508 int btrfs_find_actor(struct inode *inode, void *opaque)
 509 {
 510         struct btrfs_iget_args *args = opaque;
 511         return (args->ino == inode->i_ino &&
 512                 args->root == BTRFS_I(inode)->root);
 513 }
 514
 515 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
 516                                 struct btrfs_root *root)
 517 {
 518         struct inode *inode;
 519         struct btrfs_iget_args args;
 520         args.ino = objectid;
 521         args.root = root;
 522
 523         inode = iget5_locked(s, objectid, btrfs_find_actor,
 524                              btrfs_init_locked_inode,
 525                              (void *)&args);
 526         return inode;
 527 }
 528
 529 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 530                                    struct nameidata *nd)
 531 {
 532         struct inode * inode;
 533         struct btrfs_inode *bi = BTRFS_I(dir);
 534         struct btrfs_root *root = bi->root;
 535         struct btrfs_root *sub_root = root;
 536         struct btrfs_key location;
 537         int ret;
 538
 539         if (dentry->d_name.len > BTRFS_NAME_LEN)
 540                 return ERR_PTR(-ENAMETOOLONG);
 541         mutex_lock(&root->fs_info->fs_mutex);
 542         ret = btrfs_inode_by_name(dir, dentry, &location);
 543         mutex_unlock(&root->fs_info->fs_mutex);
 544         if (ret < 0)
 545                 return ERR_PTR(ret);
 546         inode = NULL;
 547         if (location.objectid) {
 548                 ret = fixup_tree_root_location(root, &location, &sub_root);
 549                 if (ret < 0)
 550                         return ERR_PTR(ret);
 551                 if (ret > 0)
 552                         return ERR_PTR(-ENOENT);
 553                 inode = btrfs_iget_locked(dir->i_sb, location.objectid,
 554                                           sub_root);
 555                 if (!inode)
 556                         return ERR_PTR(-EACCES);
 557                 if (inode->i_state & I_NEW) {
 558                         if (sub_root != root) {
 559 printk("adding new root for inode %lu root %p (found %p)\n", inode->i_ino, sub_root, BTRFS_I(inode)->root);
 560                                 igrab(inode);
 561                                 sub_root->inode = inode;
 562                         }
 563                         BTRFS_I(inode)->root = sub_root;
 564                         memcpy(&BTRFS_I(inode)->location, &location,
 565                                sizeof(location));
 566                         btrfs_read_locked_inode(inode);
 567                         unlock_new_inode(inode);
 568                 }
 569         }
 570         return d_splice_alias(inode, dentry);
 571 }
 572
 573 static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 574 {
 575         struct inode *inode = filp->f_path.dentry->d_inode;
 576         struct btrfs_root *root = BTRFS_I(inode)->root;
 577         struct btrfs_item *item;
 578         struct btrfs_dir_item *di;
 579         struct btrfs_key key;
 580         struct btrfs_path *path;
 581         int ret;
 582         u32 nritems;
 583         struct btrfs_leaf *leaf;
 584         int slot;
 585         int advance;
 586         unsigned char d_type = DT_UNKNOWN;
 587         int over = 0;
 588         u32 di_cur;
 589         u32 di_total;
 590         u32 di_len;
 591         int key_type = BTRFS_DIR_INDEX_KEY;
 592
 593         /* FIXME, use a real flag for deciding about the key type */
 594         if (root->fs_info->tree_root == root)
 595                 key_type = BTRFS_DIR_ITEM_KEY;
 596         mutex_lock(&root->fs_info->fs_mutex);
 597         key.objectid = inode->i_ino;
 598         key.flags = 0;
 599         btrfs_set_key_type(&key, key_type);
 600         key.offset = filp->f_pos;
 601         path = btrfs_alloc_path();
 602         btrfs_init_path(path);
 603         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 604         if (ret < 0)
 605                 goto err;
 606         advance = 0;
 607         while(1) {
 608                 leaf = btrfs_buffer_leaf(path->nodes[0]);
 609                 nritems = btrfs_header_nritems(&leaf->header);
 610                 slot = path->slots[0];
 611                 if (advance || slot >= nritems) {
 612                         if (slot >= nritems -1) {
 613                                 ret = btrfs_next_leaf(root, path);
 614                                 if (ret)
 615                                         break;
 616                                 leaf = btrfs_buffer_leaf(path->nodes[0]);
 617                                 nritems = btrfs_header_nritems(&leaf->header);
 618                                 slot = path->slots[0];
 619                         } else {
 620                                 slot++;
 621                                 path->slots[0]++;
 622                         }
 623                 }
 624                 advance = 1;
 625                 item = leaf->items + slot;
 626                 if (btrfs_disk_key_objectid(&item->key) != key.objectid)
 627                         break;
 628                 if (btrfs_disk_key_type(&item->key) != key_type)
 629                         break;
 630                 if (btrfs_disk_key_offset(&item->key) < filp->f_pos)
 631                         continue;
 632                 filp->f_pos = btrfs_disk_key_offset(&item->key);
 633                 advance = 1;
 634                 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
 635                 di_cur = 0;
 636                 di_total = btrfs_item_size(leaf->items + slot);
 637                 while(di_cur < di_total) {
 638                         over = filldir(dirent, (const char *)(di + 1),
 639                                        btrfs_dir_name_len(di),
 640                                        btrfs_disk_key_offset(&item->key),
 641                                        btrfs_disk_key_objectid(&di->location),
 642                                        d_type);
 643                         if (over)
 644                                 goto nopos;
 645                         di_len = btrfs_dir_name_len(di) + sizeof(*di);
 646                         di_cur += di_len;
 647                         di = (struct btrfs_dir_item *)((char *)di + di_len);
 648                 }
 649         }
 650         filp->f_pos++;
 651 nopos:
 652         ret = 0;
 653 err:
 654         btrfs_release_path(root, path);
 655         btrfs_free_path(path);
 656         mutex_unlock(&root->fs_info->fs_mutex);
 657         return ret;
 658 }
 659
 660 static void btrfs_put_super (struct super_block * sb)
 661 {
 662         struct btrfs_root *root = btrfs_sb(sb);
 663         int ret;
 664
 665         ret = close_ctree(root);
 666         if (ret) {
 667                 printk("close ctree returns %d\n", ret);
 668         }
 669         sb->s_fs_info = NULL;
 670 }
 671
 672 static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 673 {
 674         struct inode * inode;
 675         struct dentry * root_dentry;
 676         struct btrfs_super_block *disk_super;
 677         struct btrfs_root *tree_root;
 678         struct btrfs_inode *bi;
 679
 680         sb->s_maxbytes = MAX_LFS_FILESIZE;
 681         sb->s_magic = BTRFS_SUPER_MAGIC;
 682         sb->s_op = &btrfs_super_ops;
 683         sb->s_time_gran = 1;
 684
 685         tree_root = open_ctree(sb);
 686
 687         if (!tree_root) {
 688                 printk("btrfs: open_ctree failed\n");
 689                 return -EIO;
 690         }
 691         sb->s_fs_info = tree_root;
 692         disk_super = tree_root->fs_info->disk_super;
 693         printk("read in super total blocks %Lu root %Lu\n",
 694                btrfs_super_total_blocks(disk_super),
 695                btrfs_super_root_dir(disk_super));
 696
 697         inode = btrfs_iget_locked(sb, btrfs_super_root_dir(disk_super),
 698                                   tree_root);
 699         bi = BTRFS_I(inode);
 700         bi->location.objectid = inode->i_ino;
 701         bi->location.offset = 0;
 702         bi->location.flags = 0;
 703         bi->root = tree_root;
 704         btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
 705
 706         if (!inode)
 707                 return -ENOMEM;
 708         if (inode->i_state & I_NEW) {
 709                 btrfs_read_locked_inode(inode);
 710                 unlock_new_inode(inode);
 711         }
 712
 713         root_dentry = d_alloc_root(inode);
 714         if (!root_dentry) {
 715                 iput(inode);
 716                 return -ENOMEM;
 717         }
 718         sb->s_root = root_dentry;
 719
 720         return 0;
 721 }
 722
 723 static int btrfs_write_inode(struct inode *inode, int wait)
 724 {
 725         struct btrfs_root *root = BTRFS_I(inode)->root;
 726         struct btrfs_trans_handle *trans;
 727         int ret = 0;
 728
 729         if (wait) {
 730                 mutex_lock(&root->fs_info->fs_mutex);
 731                 trans = btrfs_start_transaction(root, 1);
 732                 ret = btrfs_commit_transaction(trans, root);
 733                 mutex_unlock(&root->fs_info->fs_mutex);
 734         }
 735         return ret;
 736 }
 737
 738 static void btrfs_dirty_inode(struct inode *inode)
 739 {
 740         struct btrfs_root *root = BTRFS_I(inode)->root;
 741         struct btrfs_trans_handle *trans;
 742
 743         mutex_lock(&root->fs_info->fs_mutex);
 744         trans = btrfs_start_transaction(root, 1);
 745         btrfs_update_inode(trans, root, inode);
 746         btrfs_end_transaction(trans, root);
 747         mutex_unlock(&root->fs_info->fs_mutex);
 748 }
 749
 750 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 751                                      struct btrfs_root *root,
 752                                      u64 objectid, int mode)
 753 {
 754         struct inode *inode;
 755         struct btrfs_inode_item inode_item;
 756         struct btrfs_key *location;
 757         int ret;
 758
 759         inode = new_inode(root->fs_info->sb);
 760         if (!inode)
 761                 return ERR_PTR(-ENOMEM);
 762
 763         BTRFS_I(inode)->root = root;
 764
 765         inode->i_uid = current->fsuid;
 766         inode->i_gid = current->fsgid;
 767         inode->i_mode = mode;
 768         inode->i_ino = objectid;
 769         inode->i_blocks = 0;
 770         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 771         fill_inode_item(&inode_item, inode);
 772         location = &BTRFS_I(inode)->location;
 773         location->objectid = objectid;
 774         location->flags = 0;
 775         location->offset = 0;
 776         btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
 777
 778         ret = btrfs_insert_inode(trans, root, objectid, &inode_item);
 779         BUG_ON(ret);
 780
 781         insert_inode_hash(inode);
 782         return inode;
 783 }
 784
 785 static int btrfs_add_link(struct btrfs_trans_handle *trans,
 786                             struct dentry *dentry, struct inode *inode)
 787 {
 788         int ret;
 789         struct btrfs_key key;
 790         struct btrfs_root *root = BTRFS_I(dentry->d_parent->d_inode)->root;
 791         key.objectid = inode->i_ino;
 792         key.flags = 0;
 793         btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 794         key.offset = 0;
 795
 796         ret = btrfs_insert_dir_item(trans, root,
 797                                     dentry->d_name.name, dentry->d_name.len,
 798                                     dentry->d_parent->d_inode->i_ino,
 799                                     &key, 0);
 800         if (ret == 0) {
 801                 dentry->d_parent->d_inode->i_size += dentry->d_name.len * 2;
 802                 ret = btrfs_update_inode(trans, root,
 803                                          dentry->d_parent->d_inode);
 804         }
 805         return ret;
 806 }
 807
 808 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
 809                             struct dentry *dentry, struct inode *inode)
 810 {
 811         int err = btrfs_add_link(trans, dentry, inode);
 812         if (!err) {
 813                 d_instantiate(dentry, inode);
 814                 return 0;
 815         }
 816         if (err > 0)
 817                 err = -EEXIST;
 818         return err;
 819 }
 820
 821 static int btrfs_create(struct inode *dir, struct dentry *dentry,
 822                         int mode, struct nameidata *nd)
 823 {
 824         struct btrfs_trans_handle *trans;
 825         struct btrfs_root *root = BTRFS_I(dir)->root;
 826         struct inode *inode;
 827         int err;
 828         int drop_inode = 0;
 829         u64 objectid;
 830
 831         mutex_lock(&root->fs_info->fs_mutex);
 832         trans = btrfs_start_transaction(root, 1);
 833
 834         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
 835         if (err) {
 836                 err = -ENOSPC;
 837                 goto out_unlock;
 838         }
 839
 840         inode = btrfs_new_inode(trans, root, objectid, mode);
 841         err = PTR_ERR(inode);
 842         if (IS_ERR(inode))
 843                 goto out_unlock;
 844         // FIXME mark the inode dirty
 845         err = btrfs_add_nondir(trans, dentry, inode);
 846         if (err)
 847                 drop_inode = 1;
 848         else {
 849                 inode->i_mapping->a_ops = &btrfs_aops;
 850                 inode->i_fop = &btrfs_file_operations;
 851                 inode->i_op = &btrfs_file_inode_operations;
 852         }
 853         dir->i_sb->s_dirt = 1;
 854 out_unlock:
 855         btrfs_end_transaction(trans, root);
 856         mutex_unlock(&root->fs_info->fs_mutex);
 857
 858         if (drop_inode) {
 859                 inode_dec_link_count(inode);
 860                 iput(inode);
 861         }
 862         return err;
 863 }
 864
 865 static int btrfs_make_empty_dir(struct btrfs_trans_handle *trans,
 866                                 struct btrfs_root *root,
 867                                 u64 objectid, u64 dirid)
 868 {
 869         int ret;
 870         char buf[2];
 871         struct btrfs_key key;
 872
 873         buf[0] = '.';
 874         buf[1] = '.';
 875
 876         key.objectid = objectid;
 877         key.offset = 0;
 878         key.flags = 0;
 879         btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 880
 881         ret = btrfs_insert_dir_item(trans, root, buf, 1, objectid,
 882                                     &key, 1);
 883         if (ret)
 884                 goto error;
 885         key.objectid = dirid;
 886         ret = btrfs_insert_dir_item(trans, root, buf, 2, objectid,
 887                                     &key, 1);
 888         if (ret)
 889                 goto error;
 890 error:
 891         return ret;
 892 }
 893
 894 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 895 {
 896         struct inode *inode;
 897         struct btrfs_trans_handle *trans;
 898         struct btrfs_root *root = BTRFS_I(dir)->root;
 899         int err = 0;
 900         int drop_on_err = 0;
 901         u64 objectid;
 902
 903         mutex_lock(&root->fs_info->fs_mutex);
 904         trans = btrfs_start_transaction(root, 1);
 905         if (IS_ERR(trans)) {
 906                 err = PTR_ERR(trans);
 907                 goto out_unlock;
 908         }
 909
 910         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
 911         if (err) {
 912                 err = -ENOSPC;
 913                 goto out_unlock;
 914         }
 915
 916         inode = btrfs_new_inode(trans, root, objectid, S_IFDIR | mode);
 917         if (IS_ERR(inode)) {
 918                 err = PTR_ERR(inode);
 919                 goto out_fail;
 920         }
 921         drop_on_err = 1;
 922         inode->i_op = &btrfs_dir_inode_operations;
 923         inode->i_fop = &btrfs_dir_file_operations;
 924
 925         err = btrfs_make_empty_dir(trans, root, inode->i_ino, dir->i_ino);
 926         if (err)
 927                 goto out_fail;
 928
 929         inode->i_size = 6;
 930         err = btrfs_update_inode(trans, root, inode);
 931         if (err)
 932                 goto out_fail;
 933         err = btrfs_add_link(trans, dentry, inode);
 934         if (err)
 935                 goto out_fail;
 936         d_instantiate(dentry, inode);
 937         drop_on_err = 0;
 938
 939 out_fail:
 940         btrfs_end_transaction(trans, root);
 941 out_unlock:
 942         mutex_unlock(&root->fs_info->fs_mutex);
 943         if (drop_on_err)
 944                 iput(inode);
 945         return err;
 946 }
 947
 948 static int btrfs_sync_file(struct file *file,
 949                            struct dentry *dentry, int datasync)
 950 {
 951         struct inode *inode = dentry->d_inode;
 952         struct btrfs_root *root = BTRFS_I(inode)->root;
 953         int ret;
 954         struct btrfs_trans_handle *trans;
 955
 956         mutex_lock(&root->fs_info->fs_mutex);
 957         trans = btrfs_start_transaction(root, 1);
 958         if (!trans) {
 959                 ret = -ENOMEM;
 960                 goto out;
 961         }
 962         ret = btrfs_commit_transaction(trans, root);
 963         mutex_unlock(&root->fs_info->fs_mutex);
 964 out:
 965         return ret > 0 ? EIO : ret;
 966 }
 967
 968 static int btrfs_sync_fs(struct super_block *sb, int wait)
 969 {
 970         struct btrfs_trans_handle *trans;
 971         struct btrfs_root *root;
 972         int ret;
 973         root = btrfs_sb(sb);
 974
 975         sb->s_dirt = 0;
 976         if (!wait) {
 977                 filemap_flush(root->fs_info->btree_inode->i_mapping);
 978                 return 0;
 979         }
 980         filemap_write_and_wait(root->fs_info->btree_inode->i_mapping);
 981         mutex_lock(&root->fs_info->fs_mutex);
 982         trans = btrfs_start_transaction(root, 1);
 983         ret = btrfs_commit_transaction(trans, root);
 984         sb->s_dirt = 0;
 985         BUG_ON(ret);
 986 printk("btrfs sync_fs\n");
 987         mutex_unlock(&root->fs_info->fs_mutex);
 988         return 0;
 989 }
 990
     /*
      * Map file block 'iblock' of 'inode' onto 'result' by looking up the
      * file extent item that covers it.
      *
      * This function takes no lock itself; btrfs_get_block() calls it with
      * fs_mutex held.
      *
      * A non-zero 'create' only triggers a warning; no allocation happens
      * here.
      *
      * Returns 0 or a negative value from btrfs_lookup_file_extent().  A
      * return of 0 does not imply that 'result' was mapped: when no
      * matching extent is found the buffer is left untouched.
      */
 991 static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 992                            struct buffer_head *result, int create)
 993 {
 994         int ret;
 995         int err = 0;
 996         u64 blocknr;
 997         u64 extent_start = 0;
 998         u64 extent_end = 0;
 999         u64 objectid = inode->i_ino;
1000         u32 found_type;
1001         struct btrfs_path *path;
1002         struct btrfs_root *root = BTRFS_I(inode)->root;
1003         struct btrfs_file_extent_item *item;
1004         struct btrfs_leaf *leaf;
1005         struct btrfs_disk_key *found_key;
1006
1007         path = btrfs_alloc_path();
1008         BUG_ON(!path);
1009         btrfs_init_path(path);
             /* NOTE(review): block creation appears unsupported on this path */
1010         if (create) {
1011                 WARN_ON(1);
1012         }
1013
             /* search by byte offset; no transaction handle is passed */
1014         ret = btrfs_lookup_file_extent(NULL, root, path,
1015                                        inode->i_ino,
1016                                        iblock << inode->i_blkbits, 0);
1017         if (ret < 0) {
1018                 err = ret;
1019                 goto out;
1020         }
1021
             /*
              * Non-zero, non-negative result: presumably no exact key match,
              * so step back to the previous item in the leaf (TODO confirm
              * against btrfs_lookup_file_extent).  With no previous slot we
              * give up and return 0.
              */
1022         if (ret != 0) {
1023                 if (path->slots[0] == 0) {
1024                         btrfs_release_path(root, path);
1025                         goto out;
1026                 }
1027                 path->slots[0]--;
1028         }
1029
1030         item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
1031                               struct btrfs_file_extent_item);
1032         leaf = btrfs_buffer_leaf(path->nodes[0]);
             /* block number for the extent's first file block; only used for REG */
1033         blocknr = btrfs_file_extent_disk_blocknr(item);
1034         blocknr += btrfs_file_extent_offset(item);
1035
1036         /* are we inside the extent that was found? */
1037         found_key = &leaf->items[path->slots[0]].key;
1038         found_type = btrfs_disk_key_type(found_key);
             /* the item must be an extent-data item of this very inode */
1039         if (btrfs_disk_key_objectid(found_key) != objectid ||
1040             found_type != BTRFS_EXTENT_DATA_KEY) {
1041                 extent_end = 0;
1042                 extent_start = 0;
1043                 btrfs_release_path(root, path);
1044                 goto out;
1045         }
             /* found_type is reused: from here on it is the extent type */
1046         found_type = btrfs_file_extent_type(item);
1047         extent_start = btrfs_disk_key_offset(&leaf->items[path->slots[0]].key);
1048         if (found_type == BTRFS_FILE_EXTENT_REG) {
                     /* convert the byte offset from the key into a block index */
1049                 extent_start = extent_start >> inode->i_blkbits;
1050                 extent_end = extent_start + btrfs_file_extent_num_blocks(item);
1051                 if (iblock >= extent_start && iblock < extent_end) {
1052                         err = 0;
1053                         btrfs_map_bh_to_logical(root, result, blocknr +
1054                                                 iblock - extent_start);
1055                         goto out;
1056                 }
1057         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
1058                 char *ptr;
1059                 char *map;
1060                 u32 size;
1061                 size = btrfs_file_extent_inline_len(leaf->items +
1062                                                     path->slots[0]);
1063                 extent_end = (extent_start + size) >> inode->i_blkbits;
1064                 extent_start >>= inode->i_blkbits;
                     /* note: the upper bound is inclusive here, unlike the REG case */
1065                 if (iblock < extent_start || iblock > extent_end) {
1066                         goto out;
1067                 }
                     /*
                      * Copy the inline data to the start of the page, zero
                      * the remainder, and mark the buffer up to date.  The
                      * buffer is then mapped with logical block 0.
                      */
1068                 ptr = btrfs_file_extent_inline_start(item);
1069                 map = kmap(result->b_page);
1070                 memcpy(map, ptr, size);
1071                 memset(map + size, 0, PAGE_CACHE_SIZE - size);
1072                 flush_dcache_page(result->b_page);
1073                 kunmap(result->b_page);
1074                 set_buffer_uptodate(result);
1075                 SetPageChecked(result->b_page);
1076                 btrfs_map_bh_to_logical(root, result, 0);
1077         }
1078 out:
             /* some paths above have already released the path once */
1079         btrfs_release_path(root, path);
1080         btrfs_free_path(path);
1081         return err;
1082 }
1083
1084 static int btrfs_get_block(struct inode *inode, sector_t iblock,
1085                            struct buffer_head *result, int create)
1086 {
1087         int err;
1088         struct btrfs_root *root = BTRFS_I(inode)->root;
1089         mutex_lock(&root->fs_info->fs_mutex);
1090         err = btrfs_get_block_lock(inode, iblock, result, create);
1091         mutex_unlock(&root->fs_info->fs_mutex);
1092         return err;
1093 }
1094
1095 static int btrfs_prepare_write(struct file *file, struct page *page,
1096                                unsigned from, unsigned to)
1097 {
1098         return nobh_prepare_write(page, from, to, btrfs_get_block);
1099 }
1100
/*
 * ->write_super handler: performs a waiting btrfs_sync_fs() on the
 * superblock.  The return value of btrfs_sync_fs() is not used.
 */
static void btrfs_write_super(struct super_block *sb)
{
        (void)btrfs_sync_fs(sb, 1);
}
1105
1106 static int btrfs_readpage(struct file *file, struct page *page)
1107 {
1108         return mpage_readpage(page, btrfs_get_block);
1109 }
1110
1111 /*
      * Write out the dirty buffers of a locked page.
      *
      * The page must be locked on entry (enforced by BUG_ON) and is
      * unlocked before returning.  Returns 0, or the error from
      * btrfs_get_block() when mapping a dirty buffer fails; in that case
      * the buffers that were already mapped are still submitted.
      *
1112  * While block_write_full_page is writing back the dirty buffers under
1113  * the page lock, whoever dirtied the buffers may decide to clean them
1114  * again at any time.  We handle that by only looking at the buffer
1115  * state inside lock_buffer().
1116  *
1117  * If block_write_full_page() is called for regular writeback
1118  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1119  * locked buffer.   This only can happen if someone has written the buffer
1120  * directly, with submit_bh().  At the address_space level PageWriteback
1121  * prevents this contention from occurring.
1122  */
1123 static int __btrfs_write_full_page(struct inode *inode, struct page *page,
1124                                    struct writeback_control *wbc)
1125 {
1126         int err;
1127         sector_t block;
1128         sector_t last_block;
1129         struct buffer_head *bh, *head;
1130         const unsigned blocksize = 1 << inode->i_blkbits;
1131         int nr_underway = 0;
1132
1133         BUG_ON(!PageLocked(page));
1134
             /* index of the last block that lies inside i_size */
1135         last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1136
             /* a page without buffers gets new ones flagged dirty and uptodate */
1137         if (!page_has_buffers(page)) {
1138                 create_empty_buffers(page, blocksize,
1139                                         (1 << BH_Dirty)|(1 << BH_Uptodate));
1140         }
1141
1142         /*
1143          * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1144          * here, and the (potentially unmapped) buffers may become dirty at
1145          * any time.  If a buffer becomes dirty here after we've inspected it
1146          * then we just miss that fact, and the page stays dirty.
1147          *
1148          * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1149          * handle that here by just cleaning them.
1150          */
1151
             /* file block number of the first buffer in this page */
1152         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1153         head = page_buffers(page);
1154         bh = head;
1155
1156         /*
1157          * Get all the dirty buffers mapped to disk addresses and
1158          * handle any aliases from the underlying blockdev's mapping.
1159          */
1160         do {
1161                 if (block > last_block) {
1162                         /*
1163                          * mapped buffers outside i_size will occur, because
1164                          * this page can be outside i_size when there is a
1165                          * truncate in progress.
1166                          */
1167                         /*
1168                          * The buffer was zeroed by block_write_full_page()
1169                          */
1170                         clear_buffer_dirty(bh);
1171                         set_buffer_uptodate(bh);
1172                 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1173                         WARN_ON(bh->b_size != blocksize);
                             /* look up an existing mapping only (create == 0) */
1174                         err = btrfs_get_block(inode, block, bh, 0);
1175                         if (err)
1176                                 goto recover;
1177                         if (buffer_new(bh)) {
1178                                 /* blockdev mappings never come here */
1179                                 clear_buffer_new(bh);
1180                                 unmap_underlying_metadata(bh->b_bdev,
1181                                                         bh->b_blocknr);
1182                         }
1183                 }
1184                 bh = bh->b_this_page;
1185                 block++;
1186         } while (bh != head);
1187
             /*
              * Second pass: lock each mapped buffer and tag the dirty ones
              * for asynchronous write.  'continue' jumps to the loop
              * condition, which is what advances bh.
              */
1188         do {
1189                 if (!buffer_mapped(bh))
1190                         continue;
1191                 /*
1192                  * If it's a fully non-blocking write attempt and we cannot
1193                  * lock the buffer then redirty the page.  Note that this can
1194                  * potentially cause a busy-wait loop from pdflush and kswapd
1195                  * activity, but those code paths have their own higher-level
1196                  * throttling.
1197                  */
1198                 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1199                         lock_buffer(bh);
1200                 } else if (test_set_buffer_locked(bh)) {
1201                         redirty_page_for_writepage(wbc, page);
1202                         continue;
1203                 }
                     /*
                      * Buffers whose b_blocknr is 0 are never submitted.
                      * NOTE(review): presumably these are the buffers mapped
                      * to logical block 0 for inline extents - confirm.
                      */
1204                 if (test_clear_buffer_dirty(bh) && bh->b_blocknr != 0) {
1205                         mark_buffer_async_write(bh);
1206                 } else {
1207                         unlock_buffer(bh);
1208                 }
1209         } while ((bh = bh->b_this_page) != head);
1210
1211         /*
1212          * The page and its buffers are protected by PageWriteback(), so we can
1213          * drop the bh refcounts early.
1214          */
1215         BUG_ON(PageWriteback(page));
1216         set_page_writeback(page);
1217
             /* third pass: submit everything that was tagged above */
1218         do {
1219                 struct buffer_head *next = bh->b_this_page;
1220                 if (buffer_async_write(bh)) {
1221                         submit_bh(WRITE, bh);
1222                         nr_underway++;
1223                 }
1224                 bh = next;
1225         } while (bh != head);
1226         unlock_page(page);
1227
1228         err = 0;
     /* also reached from 'recover', with err holding the mapping error */
1229 done:
1230         if (nr_underway == 0) {
1231                 /*
1232                  * The page was marked dirty, but the buffers were
1233                  * clean.  Someone wrote them back by hand with
1234                  * ll_rw_block/submit_bh.  A rare case.
1235                  */
1236                 int uptodate = 1;
1237                 do {
1238                         if (!buffer_uptodate(bh)) {
1239                                 uptodate = 0;
1240                                 break;
1241                         }
1242                         bh = bh->b_this_page;
1243                 } while (bh != head);
1244                 if (uptodate)
1245                         SetPageUptodate(page);
                     /* nothing was submitted, so writeback is ended here */
1246                 end_page_writeback(page);
1247                 /*
1248                  * The page and buffer_heads can be released at any time from
1249                  * here on.
1250                  */
1251                 wbc->pages_skipped++;   /* We didn't write this page */
1252         }
1253         return err;
1254
1255 recover:
1256         /*
1257          * ENOSPC, or some other error.  We may already have added some
1258          * blocks to the file, so we need to write these out to avoid
1259          * exposing stale data.
1260          * The page is currently locked and not marked for writeback
1261          */
1262         bh = head;
1263         /* Recovery: lock and submit the mapped buffers */
1264         do {
1265                 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1266                         lock_buffer(bh);
1267                         mark_buffer_async_write(bh);
1268                 } else {
1269                         /*
1270                          * The buffer may have been set dirty during
1271                          * attachment to a dirty page.
1272                          */
1273                         clear_buffer_dirty(bh);
1274                 }
1275         } while ((bh = bh->b_this_page) != head);
1276         SetPageError(page);
1277         BUG_ON(PageWriteback(page));
1278         set_page_writeback(page);
1279         do {
1280                 struct buffer_head *next = bh->b_this_page;
1281                 if (buffer_async_write(bh)) {
1282                         clear_buffer_dirty(bh);
1283                         submit_bh(WRITE, bh);
1284                         nr_underway++;
1285                 }
1286                 bh = next;
1287         } while (bh != head);
1288         unlock_page(page);
1289         goto done;
1290 }
1291
1292 /*
1293  * The generic ->writepage function for buffer-backed address_spaces
1294  */
1295 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
1296 {
1297         struct inode * const inode = page->mapping->host;
1298         loff_t i_size = i_size_read(inode);
1299         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
1300         unsigned offset;
1301         void *kaddr;
1302
1303         /* Is the page fully inside i_size? */
1304         if (page->index < end_index)
1305                 return __btrfs_write_full_page(inode, page, wbc);
1306
1307         /* Is the page fully outside i_size? (truncate in progress) */
1308         offset = i_size & (PAGE_CACHE_SIZE-1);
1309         if (page->index >= end_index+1 || !offset) {
1310                 /*
1311                  * The page may have dirty, unmapped buffers.  For example,
1312                  * they may have been added in ext3_writepage().  Make them
1313                  * freeable here, so the page does not leak.
1314                  */
1315                 block_invalidatepage(page, 0);
1316                 unlock_page(page);
1317                 return 0; /* don't care */
1318         }
1319
1320         /*
1321          * The page straddles i_size.  It must be zeroed out on each and every
1322          * writepage invokation because it may be mmapped.  "A file is mapped
1323          * in multiples of the page size.  For a file that is not a multiple of
1324          * the  page size, the remaining memory is zeroed when mapped, and
1325          * writes to that region are not written out to the file."
1326          */
1327         kaddr = kmap_atomic(page, KM_USER0);
1328         memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
1329         flush_dcache_page(page);
1330         kunmap_atomic(kaddr, KM_USER0);
1331         return __btrfs_write_full_page(inode, page, wbc);
1332 }
1333
1334 static void btrfs_truncate(struct inode *inode)
1335 {
1336         struct btrfs_root *root = BTRFS_I(inode)->root;
1337         int ret;
1338         struct btrfs_trans_handle *trans;
1339
1340         if (!S_ISREG(inode->i_mode))
1341                 return;
1342         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1343                 return;
1344
1345         nobh_truncate_page(inode->i_mapping, inode->i_size);
1346
1347         /* FIXME, add redo link to tree so we don't leak on crash */
1348         mutex_lock(&root->fs_info->fs_mutex);
1349         trans = btrfs_start_transaction(root, 1);
1350         ret = btrfs_truncate_in_trans(trans, root, inode);
1351         BUG_ON(ret);
1352         ret = btrfs_end_transaction(trans, root);
1353         BUG_ON(ret);
1354         mutex_unlock(&root->fs_info->fs_mutex);
1355         mark_inode_dirty(inode);
1356 }
1357
1358 /*
1359  * Make sure any changes to nobh_commit_write() are reflected in
1360  * nobh_truncate_page(), since it doesn't call commit_write().
1361  */
1362 static int btrfs_commit_write(struct file *file, struct page *page,
1363                               unsigned from, unsigned to)
1364 {
1365         struct inode *inode = page->mapping->host;
1366         struct buffer_head *bh;
1367         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1368
1369         SetPageUptodate(page);
1370         bh = page_buffers(page);
1371         if (buffer_mapped(bh) && bh->b_blocknr != 0) {
1372                 set_page_dirty(page);
1373         }
1374         if (pos > inode->i_size) {
1375                 i_size_write(inode, pos);
1376                 mark_inode_dirty(inode);
1377         }
1378         return 0;
1379 }
1380
1381 static int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes,
1382                                 struct page **prepared_pages,
1383                                 const char __user * buf)
1384 {
1385         long page_fault = 0;
1386         int i;
1387         int offset = pos & (PAGE_CACHE_SIZE - 1);
1388
1389         for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
1390                 size_t count = min_t(size_t,
1391                                      PAGE_CACHE_SIZE - offset, write_bytes);
1392                 struct page *page = prepared_pages[i];
1393                 fault_in_pages_readable(buf, count);
1394
1395                 /* Copy data from userspace to the current page */
1396                 kmap(page);
1397                 page_fault = __copy_from_user(page_address(page) + offset,
1398                                               buf, count);
1399                 /* Flush processor's dcache for this page */
1400                 flush_dcache_page(page);
1401                 kunmap(page);
1402                 buf += count;
1403                 write_bytes -= count;
1404
1405                 if (page_fault)
1406                         break;
1407         }
1408         return page_fault ? -EFAULT : 0;
1409 }
1410
1411 static void btrfs_drop_pages(struct page **pages, size_t num_pages)
1412 {
1413         size_t i;
1414         for (i = 0; i < num_pages; i++) {
1415                 if (!pages[i])
1416                         break;
1417                 unlock_page(pages[i]);
1418                 mark_page_accessed(pages[i]);
1419                 page_cache_release(pages[i]);
1420         }
1421 }
1422 static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
1423                                    struct btrfs_root *root,
1424                                    struct file *file,
1425                                    struct page **pages,
1426                                    size_t num_pages,
1427                                    loff_t pos,
1428                                    size_t write_bytes)
1429 {
1430         int i;
1431         int offset;
1432         int err = 0;
1433         int ret;
1434         int this_write;
1435         struct inode *inode = file->f_path.dentry->d_inode;
1436         struct buffer_head *bh;
1437         struct btrfs_file_extent_item *ei;
1438
1439         for (i = 0; i < num_pages; i++) {
1440                 offset = pos & (PAGE_CACHE_SIZE -1);
1441                 this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
1442                 /* FIXME, one block at a time */
1443
1444                 mutex_lock(&root->fs_info->fs_mutex);
1445                 trans = btrfs_start_transaction(root, 1);
1446
1447                 bh = page_buffers(pages[i]);
1448                 if (buffer_mapped(bh) && bh->b_blocknr == 0) {
1449                         struct btrfs_key key;
1450                         struct btrfs_path *path;
1451                         char *ptr;
1452                         u32 datasize;
1453
1454                         path = btrfs_alloc_path();
1455                         BUG_ON(!path);
1456                         key.objectid = inode->i_ino;
1457                         key.offset = pages[i]->index << PAGE_CACHE_SHIFT;
1458                         key.flags = 0;
1459                         btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
1460                         BUG_ON(write_bytes >= PAGE_CACHE_SIZE);
1461                         datasize = offset +
1462                                 btrfs_file_extent_calc_inline_size(write_bytes);
1463                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1464                                                       datasize);
1465                         BUG_ON(ret);
1466                         ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
1467                                path->slots[0], struct btrfs_file_extent_item);
1468                         btrfs_set_file_extent_generation(ei, trans->transid);
1469                         btrfs_set_file_extent_type(ei,
1470                                                    BTRFS_FILE_EXTENT_INLINE);
1471                         ptr = btrfs_file_extent_inline_start(ei);
1472                         memcpy(ptr, bh->b_data, offset + write_bytes);
1473                         mark_buffer_dirty(path->nodes[0]);
1474                         btrfs_free_path(path);
1475                 } else {
1476                         btrfs_csum_file_block(trans, root, inode->i_ino,
1477                                       pages[i]->index << PAGE_CACHE_SHIFT,
1478                                       kmap(pages[i]), PAGE_CACHE_SIZE);
1479                         kunmap(pages[i]);
1480                 }
1481                 SetPageChecked(pages[i]);
1482                 ret = btrfs_end_transaction(trans, root);
1483                 BUG_ON(ret);
1484                 mutex_unlock(&root->fs_info->fs_mutex);
1485
1486                 ret = btrfs_commit_write(file, pages[i], offset,
1487                                          offset + this_write);
1488                 pos += this_write;
1489                 if (ret) {
1490                         err = ret;
1491                         goto failed;
1492                 }
1493                 WARN_ON(this_write > write_bytes);
1494                 write_bytes -= this_write;
1495         }
1496 failed:
1497         return err;
1498 }
1499
1500 static int drop_extents(struct btrfs_trans_handle *trans,
1501                           struct btrfs_root *root,
1502                           struct inode *inode,
1503                           u64 start, u64 end)
1504 {
1505         int ret;
1506         struct btrfs_key key;
1507         struct btrfs_leaf *leaf;
1508         int slot;
1509         struct btrfs_file_extent_item *extent;
1510         u64 extent_end = 0;
1511         int keep;
1512         struct btrfs_file_extent_item old;
1513         struct btrfs_path *path;
1514         u64 search_start = start;
1515         int bookend;
1516         int found_type;
1517         int found_extent;
1518         int found_inline;
1519
1520         path = btrfs_alloc_path();
1521         if (!path)
1522                 return -ENOMEM;
1523         while(1) {
1524                 btrfs_release_path(root, path);
1525                 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
1526                                                search_start, -1);
1527                 if (ret < 0)
1528                         goto out;
1529                 if (ret > 0) {
1530                         if (path->slots[0] == 0) {
1531                                 ret = 0;
1532                                 goto out;
1533                         }
1534                         path->slots[0]--;
1535                 }
1536                 keep = 0;
1537                 bookend = 0;
1538                 found_extent = 0;
1539                 found_inline = 0;
1540                 extent = NULL;
1541                 leaf = btrfs_buffer_leaf(path->nodes[0]);
1542                 slot = path->slots[0];
1543                 btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
1544                 if (key.offset >= end || key.objectid != inode->i_ino) {
1545                         ret = 0;
1546                         goto out;
1547                 }
1548                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) {
1549                         ret = 0;
1550                         goto out;
1551                 }
1552                 extent = btrfs_item_ptr(leaf, slot,
1553                                         struct btrfs_file_extent_item);
1554                 found_type = btrfs_file_extent_type(extent);
1555                 if (found_type == BTRFS_FILE_EXTENT_REG) {
1556                         extent_end = key.offset +
1557                                 (btrfs_file_extent_num_blocks(extent) <<
1558                                  inode->i_blkbits);
1559                         found_extent = 1;
1560                 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
1561                         found_inline = 1;
1562                         extent_end = key.offset +
1563                              btrfs_file_extent_inline_len(leaf->items + slot);
1564                 }
1565
1566                 if (!found_extent && !found_inline) {
1567                         ret = 0;
1568                         goto out;
1569                 }
1570
1571                 if (search_start >= extent_end) {
1572                         ret = 0;
1573                         goto out;
1574                 }
1575
1576                 search_start = extent_end;
1577
1578                 if (end < extent_end && end >= key.offset) {
1579                         if (found_extent) {
1580                                 memcpy(&old, extent, sizeof(old));
1581                                 ret = btrfs_inc_extent_ref(trans, root,
1582                                       btrfs_file_extent_disk_blocknr(&old),
1583                                       btrfs_file_extent_disk_num_blocks(&old));
1584                                 BUG_ON(ret);
1585                         }
1586                         WARN_ON(found_inline);
1587                         bookend = 1;
1588                 }
1589
1590                 if (start > key.offset) {
1591                         u64 new_num;
1592                         u64 old_num;
1593                         /* truncate existing extent */
1594                         keep = 1;
1595                         WARN_ON(start & (root->blocksize - 1));
1596                         if (found_extent) {
1597                                 new_num = (start - key.offset) >>
1598                                         inode->i_blkbits;
1599                                 old_num = btrfs_file_extent_num_blocks(extent);
1600                                 inode->i_blocks -= (old_num - new_num) << 3;
1601                                 btrfs_set_file_extent_num_blocks(extent,
1602                                                                  new_num);
1603                                 mark_buffer_dirty(path->nodes[0]);
1604                         } else {
1605                                 WARN_ON(1);
1606                                 /*
1607                                 ret = btrfs_truncate_item(trans, root, path,
1608                                                           start - key.offset);
1609                                 BUG_ON(ret);
1610                                 */
1611                         }
1612                 }
1613                 if (!keep) {
1614                         u64 disk_blocknr = 0;
1615                         u64 disk_num_blocks = 0;
1616                         u64 extent_num_blocks = 0;
1617                         if (found_extent) {
1618                                 disk_blocknr =
1619                                       btrfs_file_extent_disk_blocknr(extent);
1620                                 disk_num_blocks =
1621                                       btrfs_file_extent_disk_num_blocks(extent);
1622                                 extent_num_blocks =
1623                                       btrfs_file_extent_num_blocks(extent);
1624                         }
1625                         ret = btrfs_del_item(trans, root, path);
1626                         BUG_ON(ret);
1627                         btrfs_release_path(root, path);
1628                         if (found_extent) {
1629                                 inode->i_blocks -=
1630                                 btrfs_file_extent_num_blocks(extent) << 3;
1631                                 ret = btrfs_free_extent(trans, root,
1632                                                         disk_blocknr,
1633                                                         disk_num_blocks, 0);
1634                         }
1635
1636                         BUG_ON(ret);
1637                         if (!bookend && search_start >= end) {
1638                                 ret = 0;
1639                                 goto out;
1640                         }
1641                         if (!bookend)
1642                                 continue;
1643                 }
1644                 if (bookend && found_extent) {
1645                         /* create bookend */
1646                         struct btrfs_key ins;
1647                         ins.objectid = inode->i_ino;
1648                         ins.offset = end;
1649                         ins.flags = 0;
1650                         btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
1651
1652                         btrfs_release_path(root, path);
1653                         ret = btrfs_insert_empty_item(trans, root, path, &ins,
1654                                                       sizeof(*extent));
1655                         BUG_ON(ret);
1656                         extent = btrfs_item_ptr(
1657                                     btrfs_buffer_leaf(path->nodes[0]),
1658                                     path->slots[0],
1659                                     struct btrfs_file_extent_item);
1660                         btrfs_set_file_extent_disk_blocknr(extent,
1661                                     btrfs_file_extent_disk_blocknr(&old));
1662                         btrfs_set_file_extent_disk_num_blocks(extent,
1663                                     btrfs_file_extent_disk_num_blocks(&old));
1664
1665                         btrfs_set_file_extent_offset(extent,
1666                                     btrfs_file_extent_offset(&old) +
1667                                     ((end - key.offset) >> inode->i_blkbits));
1668                         WARN_ON(btrfs_file_extent_num_blocks(&old) <
1669                                 (end - key.offset) >> inode->i_blkbits);
1670                         btrfs_set_file_extent_num_blocks(extent,
1671                                     btrfs_file_extent_num_blocks(&old) -
1672                                     ((end - key.offset) >> inode->i_blkbits));
1673
1674                         btrfs_set_file_extent_type(extent,
1675                                                    BTRFS_FILE_EXTENT_REG);
1676                         btrfs_set_file_extent_generation(extent,
1677                                     btrfs_file_extent_generation(&old));
1678                         btrfs_mark_buffer_dirty(path->nodes[0]);
1679                         inode->i_blocks +=
1680                                 btrfs_file_extent_num_blocks(extent) << 3;
1681                         ret = 0;
1682                         goto out;
1683                 }
1684         }
1685 out:
1686         btrfs_free_path(path);
1687         return ret;
1688 }
1689
1690 static int prepare_pages(struct btrfs_root *root,
1691                          struct file *file,
1692                          struct page **pages,
1693                          size_t num_pages,
1694                          loff_t pos,
1695                          unsigned long first_index,
1696                          unsigned long last_index,
1697                          size_t write_bytes,
1698                          u64 alloc_extent_start)
1699 {
1700         int i;
1701         unsigned long index = pos >> PAGE_CACHE_SHIFT;
1702         struct inode *inode = file->f_path.dentry->d_inode;
1703         int offset;
1704         int err = 0;
1705         int this_write;
1706         struct buffer_head *bh;
1707         struct buffer_head *head;
1708         loff_t isize = i_size_read(inode);
1709
1710         memset(pages, 0, num_pages * sizeof(struct page *));
1711
1712         for (i = 0; i < num_pages; i++) {
1713                 pages[i] = grab_cache_page(inode->i_mapping, index + i);
1714                 if (!pages[i]) {
1715                         err = -ENOMEM;
1716                         goto failed_release;
1717                 }
1718                 offset = pos & (PAGE_CACHE_SIZE -1);
1719                 this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
1720                 create_empty_buffers(pages[i], root->fs_info->sb->s_blocksize,
1721                                      (1 << BH_Uptodate));
1722                 head = page_buffers(pages[i]);
1723                 bh = head;
1724                 do {
1725                         err = btrfs_map_bh_to_logical(root, bh,
1726                                                       alloc_extent_start);
1727                         BUG_ON(err);
1728                         if (err)
1729                                 goto failed_truncate;
1730                         bh = bh->b_this_page;
1731                         if (alloc_extent_start)
1732                                 alloc_extent_start++;
1733                 } while (bh != head);
1734                 pos += this_write;
1735                 WARN_ON(this_write > write_bytes);
1736                 write_bytes -= this_write;
1737         }
1738         return 0;
1739
1740 failed_release:
1741         btrfs_drop_pages(pages, num_pages);
1742         return err;
1743
1744 failed_truncate:
1745         btrfs_drop_pages(pages, num_pages);
1746         if (pos > isize)
1747                 vmtruncate(inode, isize);
1748         return err;
1749 }
1750
1751 static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1752                                 size_t count, loff_t *ppos)
1753 {
1754         loff_t pos;
1755         size_t num_written = 0;
1756         int err = 0;
1757         int ret = 0;
1758         struct inode *inode = file->f_path.dentry->d_inode;
1759         struct btrfs_root *root = BTRFS_I(inode)->root;
1760         struct page *pages[8];
1761         struct page *pinned[2] = { NULL, NULL };
1762         unsigned long first_index;
1763         unsigned long last_index;
1764         u64 start_pos;
1765         u64 num_blocks;
1766         u64 alloc_extent_start;
1767         struct btrfs_trans_handle *trans;
1768         struct btrfs_key ins;
1769
1770         if (file->f_flags & O_DIRECT)
1771                 return -EINVAL;
1772         pos = *ppos;
1773         vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1774         current->backing_dev_info = inode->i_mapping->backing_dev_info;
1775         err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1776         if (err)
1777                 goto out;
1778         if (count == 0)
1779                 goto out;
1780         err = remove_suid(file->f_path.dentry);
1781         if (err)
1782                 goto out;
1783         file_update_time(file);
1784
1785         start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
1786         num_blocks = (count + pos - start_pos + root->blocksize - 1) >>
1787                         inode->i_blkbits;
1788
1789         mutex_lock(&inode->i_mutex);
1790         first_index = pos >> PAGE_CACHE_SHIFT;
1791         last_index = (pos + count) >> PAGE_CACHE_SHIFT;
1792
1793         if ((first_index << PAGE_CACHE_SHIFT) < inode->i_size &&
1794             (pos & (PAGE_CACHE_SIZE - 1))) {
1795                 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
1796                 if (!PageUptodate(pinned[0])) {
1797                         ret = mpage_readpage(pinned[0], btrfs_get_block);
1798                         BUG_ON(ret);
1799                 } else {
1800                         unlock_page(pinned[0]);
1801                 }
1802         }
1803         if (first_index != last_index &&
1804             (last_index << PAGE_CACHE_SHIFT) < inode->i_size &&
1805             (count & (PAGE_CACHE_SIZE - 1))) {
1806                 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
1807                 if (!PageUptodate(pinned[1])) {
1808                         ret = mpage_readpage(pinned[1], btrfs_get_block);
1809                         BUG_ON(ret);
1810                 } else {
1811                         unlock_page(pinned[1]);
1812                 }
1813         }
1814
1815         mutex_lock(&root->fs_info->fs_mutex);
1816         trans = btrfs_start_transaction(root, 1);
1817         if (!trans) {
1818                 err = -ENOMEM;
1819                 mutex_unlock(&root->fs_info->fs_mutex);
1820                 goto out_unlock;
1821         }
1822         /* FIXME blocksize != 4096 */
1823         inode->i_blocks += num_blocks << 3;
1824         if (start_pos < inode->i_size) {
1825                 /* FIXME blocksize != pagesize */
1826                 ret = drop_extents(trans, root, inode,
1827                                    start_pos,
1828                                    (pos + count + root->blocksize -1) &
1829                                    ~((u64)root->blocksize - 1));
1830                 BUG_ON(ret);
1831         }
1832         if (inode->i_size >= PAGE_CACHE_SIZE || pos + count < inode->i_size ||
1833             pos + count - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
1834                 ret = btrfs_alloc_extent(trans, root, inode->i_ino,
1835                                          num_blocks, 1, (u64)-1, &ins);
1836                 BUG_ON(ret);
1837                 ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
1838                                        start_pos, ins.objectid, ins.offset);
1839                 BUG_ON(ret);
1840         } else {
1841                 ins.offset = 0;
1842                 ins.objectid = 0;
1843         }
1844         BUG_ON(ret);
1845         alloc_extent_start = ins.objectid;
1846         ret = btrfs_end_transaction(trans, root);
1847         mutex_unlock(&root->fs_info->fs_mutex);
1848
1849         while(count > 0) {
1850                 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1851                 size_t write_bytes = min(count, PAGE_CACHE_SIZE - offset);
1852                 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
1853                                         PAGE_CACHE_SHIFT;
1854
1855                 memset(pages, 0, sizeof(pages));
1856                 ret = prepare_pages(root, file, pages, num_pages,
1857                                     pos, first_index, last_index,
1858                                     write_bytes, alloc_extent_start);
1859                 BUG_ON(ret);
1860
1861                 /* FIXME blocks != pagesize */
1862                 if (alloc_extent_start)
1863                         alloc_extent_start += num_pages;
1864                 ret = btrfs_copy_from_user(pos, num_pages,
1865                                            write_bytes, pages, buf);
1866                 BUG_ON(ret);
1867
1868                 ret = dirty_and_release_pages(NULL, root, file, pages,
1869                                               num_pages, pos, write_bytes);
1870                 BUG_ON(ret);
1871                 btrfs_drop_pages(pages, num_pages);
1872
1873                 buf += write_bytes;
1874                 count -= write_bytes;
1875                 pos += write_bytes;
1876                 num_written += write_bytes;
1877
1878                 balance_dirty_pages_ratelimited(inode->i_mapping);
1879                 cond_resched();
1880         }
1881 out_unlock:
1882         mutex_unlock(&inode->i_mutex);
1883 out:
1884         if (pinned[0])
1885                 page_cache_release(pinned[0]);
1886         if (pinned[1])
1887                 page_cache_release(pinned[1]);
1888         *ppos = pos;
1889         current->backing_dev_info = NULL;
1890         mark_inode_dirty(inode);
1891         return num_written ? num_written : err;
1892 }
1893
1894 static int btrfs_read_actor(read_descriptor_t *desc, struct page *page,
1895                         unsigned long offset, unsigned long size)
1896 {
1897         char *kaddr;
1898         unsigned long left, count = desc->count;
1899         struct inode *inode = page->mapping->host;
1900
1901         if (size > count)
1902                 size = count;
1903
1904         if (!PageChecked(page)) {
1905                 /* FIXME, do it per block */
1906                 struct btrfs_root *root = BTRFS_I(inode)->root;
1907
1908                 int ret = btrfs_csum_verify_file_block(root,
1909                                   page->mapping->host->i_ino,
1910                                   page->index << PAGE_CACHE_SHIFT,
1911                                   kmap(page), PAGE_CACHE_SIZE);
1912                 if (ret) {
1913                         printk("failed to verify ino %lu page %lu\n",
1914                                page->mapping->host->i_ino,
1915                                page->index);
1916                         memset(page_address(page), 0, PAGE_CACHE_SIZE);
1917                 }
1918                 SetPageChecked(page);
1919                 kunmap(page);
1920         }
1921         /*
1922          * Faults on the destination of a read are common, so do it before
1923          * taking the kmap.
1924          */
1925         if (!fault_in_pages_writeable(desc->arg.buf, size)) {
1926                 kaddr = kmap_atomic(page, KM_USER0);
1927                 left = __copy_to_user_inatomic(desc->arg.buf,
1928                                                 kaddr + offset, size);
1929                 kunmap_atomic(kaddr, KM_USER0);
1930                 if (left == 0)
1931                         goto success;
1932         }
1933
1934         /* Do it the slow way */
1935         kaddr = kmap(page);
1936         left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
1937         kunmap(page);
1938
1939         if (left) {
1940                 size -= left;
1941                 desc->error = -EFAULT;
1942         }
1943 success:
1944         desc->count = count - size;
1945         desc->written += size;
1946         desc->arg.buf += size;
1947         return size;
1948 }
1949
1950 /**
1951  * btrfs_file_aio_read - filesystem read routine
1952  * @iocb:       kernel I/O control block
1953  * @iov:        io vector request
1954  * @nr_segs:    number of segments in the iovec
1955  * @pos:        current file position
1956  */
1957 static ssize_t btrfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1958                                    unsigned long nr_segs, loff_t pos)
1959 {
1960         struct file *filp = iocb->ki_filp;
1961         ssize_t retval;
1962         unsigned long seg;
1963         size_t count;
1964         loff_t *ppos = &iocb->ki_pos;
1965
1966         count = 0;
1967         for (seg = 0; seg < nr_segs; seg++) {
1968                 const struct iovec *iv = &iov[seg];
1969
1970                 /*
1971                  * If any segment has a negative length, or the cumulative
1972                  * length ever wraps negative then return -EINVAL.
1973                  */
1974                 count += iv->iov_len;
1975                 if (unlikely((ssize_t)(count|iv->iov_len) < 0))
1976                         return -EINVAL;
1977                 if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
1978                         continue;
1979                 if (seg == 0)
1980                         return -EFAULT;
1981                 nr_segs = seg;
1982                 count -= iv->iov_len;   /* This segment is no good */
1983                 break;
1984         }
1985         retval = 0;
1986         if (count) {
1987                 for (seg = 0; seg < nr_segs; seg++) {
1988                         read_descriptor_t desc;
1989
1990                         desc.written = 0;
1991                         desc.arg.buf = iov[seg].iov_base;
1992                         desc.count = iov[seg].iov_len;
1993                         if (desc.count == 0)
1994                                 continue;
1995                         desc.error = 0;
1996                         do_generic_file_read(filp, ppos, &desc,
1997                                              btrfs_read_actor);
1998                         retval += desc.written;
1999                         if (desc.error) {
2000                                 retval = retval ?: desc.error;
2001                                 break;
2002                         }
2003                 }
2004         }
2005         return retval;
2006 }
2007
2008 static int create_subvol(struct btrfs_root *root, char *name, int namelen)
2009 {
2010         struct btrfs_trans_handle *trans;
2011         struct btrfs_key key;
2012         struct btrfs_root_item root_item;
2013         struct btrfs_inode_item *inode_item;
2014         struct buffer_head *subvol;
2015         struct btrfs_leaf *leaf;
2016         struct btrfs_root *new_root;
2017         struct inode *inode;
2018         int ret;
2019         u64 objectid;
2020         u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
2021
2022         mutex_lock(&root->fs_info->fs_mutex);
2023         trans = btrfs_start_transaction(root, 1);
2024         BUG_ON(!trans);
2025
2026         subvol = btrfs_alloc_free_block(trans, root);
2027         if (subvol == NULL)
2028                 return -ENOSPC;
2029         leaf = btrfs_buffer_leaf(subvol);
2030         btrfs_set_header_nritems(&leaf->header, 0);
2031         btrfs_set_header_level(&leaf->header, 0);
2032         btrfs_set_header_blocknr(&leaf->header, bh_blocknr(subvol));
2033         btrfs_set_header_generation(&leaf->header, trans->transid);
2034         btrfs_set_header_owner(&leaf->header, root->root_key.objectid);
2035         memcpy(leaf->header.fsid, root->fs_info->disk_super->fsid,
2036                sizeof(leaf->header.fsid));
2037         mark_buffer_dirty(subvol);
2038
2039         inode_item = &root_item.inode;
2040         memset(inode_item, 0, sizeof(*inode_item));
2041         btrfs_set_inode_generation(inode_item, 1);
2042         btrfs_set_inode_size(inode_item, 3);
2043         btrfs_set_inode_nlink(inode_item, 1);
2044         btrfs_set_inode_nblocks(inode_item, 1);
2045         btrfs_set_inode_mode(inode_item, S_IFDIR | 0755);
2046
2047         btrfs_set_root_blocknr(&root_item, bh_blocknr(subvol));
2048         btrfs_set_root_refs(&root_item, 1);
2049         brelse(subvol);
2050         subvol = NULL;
2051
2052         ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
2053                                        0, &objectid);
2054         BUG_ON(ret);
2055
2056         btrfs_set_root_dirid(&root_item, new_dirid);
2057
2058         key.objectid = objectid;
2059         key.offset = 1;
2060         key.flags = 0;
2061         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
2062         ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
2063                                 &root_item);
2064         BUG_ON(ret);
2065
2066         /*
2067          * insert the directory item
2068          */
2069         key.offset = (u64)-1;
2070         ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
2071                                     name, namelen,
2072                                     root->fs_info->sb->s_root->d_inode->i_ino,
2073                                     &key, 0);
2074         BUG_ON(ret);
2075
2076         ret = btrfs_commit_transaction(trans, root);
2077         BUG_ON(ret);
2078
2079         new_root = btrfs_read_fs_root(root->fs_info, &key);
2080         BUG_ON(!new_root);
2081
2082         trans = btrfs_start_transaction(new_root, 1);
2083         BUG_ON(!trans);
2084
2085         inode = btrfs_new_inode(trans, new_root, new_dirid, S_IFDIR | 0700);
2086         inode->i_op = &btrfs_dir_inode_operations;
2087         inode->i_fop = &btrfs_dir_file_operations;
2088
2089         ret = btrfs_make_empty_dir(trans, new_root, new_dirid, new_dirid);
2090         BUG_ON(ret);
2091
2092         inode->i_nlink = 1;
2093         inode->i_size = 6;
2094         ret = btrfs_update_inode(trans, new_root, inode);
2095         BUG_ON(ret);
2096
2097         ret = btrfs_commit_transaction(trans, new_root);
2098         BUG_ON(ret);
2099
2100         iput(inode);
2101
2102         mutex_unlock(&root->fs_info->fs_mutex);
2103         return 0;
2104 }
2105
2106 static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
2107 {
2108         struct btrfs_trans_handle *trans;
2109         struct btrfs_key key;
2110         struct btrfs_root_item new_root_item;
2111         int ret;
2112         u64 objectid;
2113
2114         if (!root->ref_cows)
2115                 return -EINVAL;
2116
2117         mutex_lock(&root->fs_info->fs_mutex);
2118         trans = btrfs_start_transaction(root, 1);
2119         BUG_ON(!trans);
2120
2121         ret = btrfs_update_inode(trans, root, root->inode);
2122         BUG_ON(ret);
2123
2124         ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
2125                                        0, &objectid);
2126         BUG_ON(ret);
2127
2128         memcpy(&new_root_item, &root->root_item,
2129                sizeof(new_root_item));
2130
2131         key.objectid = objectid;
2132         key.offset = 1;
2133         key.flags = 0;
2134         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
2135         btrfs_set_root_blocknr(&new_root_item, bh_blocknr(root->node));
2136
2137         ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
2138                                 &new_root_item);
2139         BUG_ON(ret);
2140
2141         /*
2142          * insert the directory item
2143          */
2144         key.offset = (u64)-1;
2145         ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
2146                                     name, namelen,
2147                                     root->fs_info->sb->s_root->d_inode->i_ino,
2148                                     &key, 0);
2149
2150         BUG_ON(ret);
2151
2152         ret = btrfs_inc_root_ref(trans, root);
2153         BUG_ON(ret);
2154
2155         ret = btrfs_commit_transaction(trans, root);
2156         BUG_ON(ret);
2157         mutex_unlock(&root->fs_info->fs_mutex);
2158         return 0;
2159 }
2160
2161 static int add_disk(struct btrfs_root *root, char *name, int namelen)
2162 {
2163         struct block_device *bdev;
2164         struct btrfs_path *path;
2165         struct super_block *sb = root->fs_info->sb;
2166         struct btrfs_root *dev_root = root->fs_info->dev_root;
2167         struct btrfs_trans_handle *trans;
2168         struct btrfs_device_item *dev_item;
2169         struct btrfs_key key;
2170         u16 item_size;
2171         u64 num_blocks;
2172         u64 new_blocks;
2173         u64 device_id;
2174         int ret;
2175
2176 printk("adding disk %s\n", name);
2177         path = btrfs_alloc_path();
2178         if (!path)
2179                 return -ENOMEM;
2180         num_blocks = btrfs_super_total_blocks(root->fs_info->disk_super);
2181         bdev = open_bdev_excl(name, O_RDWR, sb);
2182         if (IS_ERR(bdev)) {
2183                 ret = PTR_ERR(bdev);
2184 printk("open bdev excl failed ret %d\n", ret);
2185                 goto out_nolock;
2186         }
2187         set_blocksize(bdev, sb->s_blocksize);
2188         new_blocks = bdev->bd_inode->i_size >> sb->s_blocksize_bits;
2189         key.objectid = num_blocks;
2190         key.offset = new_blocks;
2191         key.flags = 0;
2192         btrfs_set_key_type(&key, BTRFS_DEV_ITEM_KEY);
2193
2194         mutex_lock(&dev_root->fs_info->fs_mutex);
2195         trans = btrfs_start_transaction(dev_root, 1);
2196         item_size = sizeof(*dev_item) + namelen;
2197 printk("insert empty on %Lu %Lu %u size %d\n", num_blocks, new_blocks, key.flags, item_size);
2198         ret = btrfs_insert_empty_item(trans, dev_root, path, &key, item_size);
2199         if (ret) {
2200 printk("insert failed %d\n", ret);
2201                 close_bdev_excl(bdev);
2202                 if (ret > 0)
2203                         ret = -EEXIST;
2204                 goto out;
2205         }
2206         dev_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
2207                                   path->slots[0], struct btrfs_device_item);
2208         btrfs_set_device_pathlen(dev_item, namelen);
2209         memcpy(dev_item + 1, name, namelen);
2210
2211         device_id = btrfs_super_last_device_id(root->fs_info->disk_super) + 1;
2212         btrfs_set_super_last_device_id(root->fs_info->disk_super, device_id);
2213         btrfs_set_device_id(dev_item, device_id);
2214         mark_buffer_dirty(path->nodes[0]);
2215
2216         ret = btrfs_insert_dev_radix(root, bdev, device_id, num_blocks,
2217                                      new_blocks);
2218
2219         if (!ret) {
2220                 btrfs_set_super_total_blocks(root->fs_info->disk_super,
2221                                              num_blocks + new_blocks);
2222                 i_size_write(root->fs_info->btree_inode,
2223                              (num_blocks + new_blocks) <<
2224                              root->fs_info->btree_inode->i_blkbits);
2225         }
2226
2227 out:
2228         ret = btrfs_commit_transaction(trans, dev_root);
2229         BUG_ON(ret);
2230         mutex_unlock(&root->fs_info->fs_mutex);
2231 out_nolock:
2232         btrfs_free_path(path);
2233
2234         return ret;
2235 }
2236
2237 static int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
2238                        cmd, unsigned long arg)
2239 {
2240         struct btrfs_root *root = BTRFS_I(inode)->root;
2241         struct btrfs_ioctl_vol_args vol_args;
2242         int ret = 0;
2243         struct btrfs_dir_item *di;
2244         int namelen;
2245         struct btrfs_path *path;
2246         u64 root_dirid;
2247
2248         switch (cmd) {
2249         case BTRFS_IOC_SNAP_CREATE:
2250                 if (copy_from_user(&vol_args,
2251                                    (struct btrfs_ioctl_vol_args __user *)arg,
2252                                    sizeof(vol_args)))
2253                         return -EFAULT;
2254                 namelen = strlen(vol_args.name);
2255                 if (namelen > BTRFS_VOL_NAME_MAX)
2256                         return -EINVAL;
2257                 path = btrfs_alloc_path();
2258                 if (!path)
2259                         return -ENOMEM;
2260                 root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
2261                 mutex_lock(&root->fs_info->fs_mutex);
2262                 di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
2263                                     path, root_dirid,
2264                                     vol_args.name, namelen, 0);
2265                 mutex_unlock(&root->fs_info->fs_mutex);
2266                 btrfs_free_path(path);
2267                 if (di && !IS_ERR(di))
2268                         return -EEXIST;
2269
2270                 if (root == root->fs_info->tree_root)
2271                         ret = create_subvol(root, vol_args.name, namelen);
2272                 else
2273                         ret = create_snapshot(root, vol_args.name, namelen);
2274                 WARN_ON(ret);
2275                 break;
2276         case BTRFS_IOC_ADD_DISK:
2277                 if (copy_from_user(&vol_args,
2278                                    (struct btrfs_ioctl_vol_args __user *)arg,
2279                                    sizeof(vol_args)))
2280                         return -EFAULT;
2281                 namelen = strlen(vol_args.name);
2282                 if (namelen > BTRFS_VOL_NAME_MAX)
2283                         return -EINVAL;
2284                 vol_args.name[namelen] = '\0';
2285                 ret = add_disk(root, vol_args.name, namelen);
2286                 break;
2287         default:
2288                 return -ENOTTY;
2289         }
2290         return ret;
2291 }
2292
2293 static struct kmem_cache *btrfs_inode_cachep; /* slab cache for struct btrfs_inode; see btrfs_alloc_inode() */
2294 struct kmem_cache *btrfs_trans_handle_cachep; /* created in init_inodecache() for struct btrfs_trans_handle */
2295 struct kmem_cache *btrfs_transaction_cachep; /* created in init_inodecache() for struct btrfs_transaction */
2296 struct kmem_cache *btrfs_bit_radix_cachep; /* created in init_inodecache() with 256-byte objects */
2297 struct kmem_cache *btrfs_path_cachep; /* NOTE(review): init_inodecache() sizes this by struct btrfs_transaction, presumably meant for struct btrfs_path - confirm */
2298
2299 /*
2300  * Called inside transaction, so use GFP_NOFS
2301  */
2302 static struct inode *btrfs_alloc_inode(struct super_block *sb)
2303 {
2304         struct btrfs_inode *ei;
2305
2306         ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
2307         if (!ei)
2308                 return NULL;
2309         return &ei->vfs_inode;
2310 }
2311
2312 static void btrfs_destroy_inode(struct inode *inode)
2313 {
2314         WARN_ON(!list_empty(&inode->i_dentry));
2315         WARN_ON(inode->i_data.nrpages);
2316
2317         kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
2318 }
2319
2320 static void init_once(void * foo, struct kmem_cache * cachep,
2321                       unsigned long flags)
2322 {
2323         struct btrfs_inode *ei = (struct btrfs_inode *) foo;
2324
2325         if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
2326             SLAB_CTOR_CONSTRUCTOR) {
2327                 inode_init_once(&ei->vfs_inode);
2328         }
2329 }
2330
2331 static int init_inodecache(void)
2332 {
2333         btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
2334                                              sizeof(struct btrfs_inode),
2335                                              0, (SLAB_RECLAIM_ACCOUNT|
2336                                                 SLAB_MEM_SPREAD),
2337                                              init_once, NULL);
2338         btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
2339                                              sizeof(struct btrfs_trans_handle),
2340                                              0, (SLAB_RECLAIM_ACCOUNT|
2341                                                 SLAB_MEM_SPREAD),
2342                                              NULL, NULL);
2343         btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
2344                                              sizeof(struct btrfs_transaction),
2345                                              0, (SLAB_RECLAIM_ACCOUNT|
2346                                                 SLAB_MEM_SPREAD),
2347                                              NULL, NULL);
2348         btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
2349                                              sizeof(struct btrfs_transaction),
2350                                              0, (SLAB_RECLAIM_ACCOUNT|
2351                                                 SLAB_MEM_SPREAD),
2352                                              NULL, NULL);
2353         btrfs_bit_radix_cachep = kmem_cache_create("btrfs_radix",
2354                                              256,
2355                                              0, (SLAB_RECLAIM_ACCOUNT|
2356                                                 SLAB_MEM_SPREAD |
2357                                                 SLAB_DESTROY_BY_RCU),
2358                                              NULL, NULL);
2359         if (btrfs_inode_cachep == NULL || btrfs_trans_handle_cachep == NULL ||
2360             btrfs_transaction_cachep == NULL || btrfs_bit_radix_cachep == NULL)
2361                 return -ENOMEM;
2362         return 0;
2363 }
2364
2365 static void destroy_inodecache(void)
2366 {
2367         kmem_cache_destroy(btrfs_inode_cachep);
2368         kmem_cache_destroy(btrfs_trans_handle_cachep);
2369         kmem_cache_destroy(btrfs_transaction_cachep);
2370         kmem_cache_destroy(btrfs_bit_radix_cachep);
2371         kmem_cache_destroy(btrfs_path_cachep);
2372 }
2373
2374 static int btrfs_get_sb(struct file_system_type *fs_type,
2375         int flags, const char *dev_name, void *data, struct vfsmount *mnt)
2376 {
2377         return get_sb_bdev(fs_type, flags, dev_name, data,
2378                            btrfs_fill_super, mnt);
2379 }
2380
2381
2382 static int btrfs_getattr(struct vfsmount *mnt,
2383                          struct dentry *dentry, struct kstat *stat)
2384 {
2385         struct inode *inode = dentry->d_inode;
2386         generic_fillattr(inode, stat);
2387         stat->blksize = 256 * 1024;
2388         return 0;
2389 }
2390
2391 static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
2392 {
2393         struct btrfs_root *root = btrfs_sb(dentry->d_sb);
2394         struct btrfs_super_block *disk_super = root->fs_info->disk_super;
2395
2396         buf->f_namelen = BTRFS_NAME_LEN;
2397         buf->f_blocks = btrfs_super_total_blocks(disk_super);
2398         buf->f_bfree = buf->f_blocks - btrfs_super_blocks_used(disk_super);
2399         buf->f_bavail = buf->f_bfree;
2400         buf->f_bsize = dentry->d_sb->s_blocksize;
2401         buf->f_type = BTRFS_SUPER_MAGIC;
2402         return 0;
2403 }
2404
2405 static struct file_system_type btrfs_fs_type = {
2406         .owner          = THIS_MODULE,
2407         .name           = "btrfs",
2408         .get_sb         = btrfs_get_sb,
2409         .kill_sb        = kill_block_super,
2410         .fs_flags       = FS_REQUIRES_DEV,
2411 };
2412
2413 static struct super_operations btrfs_super_ops = {
2414         .delete_inode   = btrfs_delete_inode,
2415         .put_super      = btrfs_put_super,
2416         .read_inode     = btrfs_read_locked_inode,
2417         .write_super    = btrfs_write_super,
2418         .sync_fs        = btrfs_sync_fs,
2419         .write_inode    = btrfs_write_inode,
2420         .dirty_inode    = btrfs_dirty_inode,
2421         .alloc_inode    = btrfs_alloc_inode,
2422         .destroy_inode  = btrfs_destroy_inode,
2423         .statfs         = btrfs_statfs,
2424 };
2425
2426 static struct inode_operations btrfs_dir_inode_operations = {
2427         .lookup         = btrfs_lookup,
2428         .create         = btrfs_create,
2429         .unlink         = btrfs_unlink,
2430         .mkdir          = btrfs_mkdir,
2431         .rmdir          = btrfs_rmdir,
2432 };
2433
2434 static struct inode_operations btrfs_dir_ro_inode_operations = {
2435         .lookup         = btrfs_lookup,
2436 };
2437
2438 static struct file_operations btrfs_dir_file_operations = {
2439         .llseek         = generic_file_llseek,
2440         .read           = generic_read_dir,
2441         .readdir        = btrfs_readdir,
2442         .ioctl          = btrfs_ioctl,
2443 };
2444
2445 static struct address_space_operations btrfs_aops = {
2446         .readpage       = btrfs_readpage,
2447         .writepage      = btrfs_writepage,
2448         .sync_page      = block_sync_page,
2449         .prepare_write  = btrfs_prepare_write,
2450         .commit_write   = btrfs_commit_write,
2451 };
2452
2453 static struct inode_operations btrfs_file_inode_operations = {
2454         .truncate       = btrfs_truncate,
2455         .getattr        = btrfs_getattr,
2456 };
2457
2458 static struct file_operations btrfs_file_operations = {
2459         .llseek         = generic_file_llseek,
2460         .read           = do_sync_read,
2461         .aio_read       = btrfs_file_aio_read,
2462         .write          = btrfs_file_write,
2463         .mmap           = generic_file_mmap,
2464         .open           = generic_file_open,
2465         .ioctl          = btrfs_ioctl,
2466         .fsync          = btrfs_sync_file,
2467 };
2468
2469 static int __init init_btrfs_fs(void)
2470 {
2471         int err;
2472         printk("btrfs loaded!\n");
2473         err = init_inodecache();
2474         if (err)
2475                 return err;
2476         kset_set_kset_s(&btrfs_subsys, fs_subsys);
2477         err = subsystem_register(&btrfs_subsys);
2478         if (err)
2479                 goto out;
2480         return register_filesystem(&btrfs_fs_type);
2481 out:
2482         destroy_inodecache();
2483         return err;
2484 }
2485
2486 static void __exit exit_btrfs_fs(void)
2487 {
2488         destroy_inodecache();
2489         unregister_filesystem(&btrfs_fs_type);
2490         subsystem_unregister(&btrfs_subsys);
2491         printk("btrfs unloaded\n");
2492 }
2493
2494 module_init(init_btrfs_fs)
2495 module_exit(exit_btrfs_fs)
2496
2497 MODULE_LICENSE("GPL");