Btrfs: rework qgroup accounting
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 2cf905877aaf4cab8b24d27a9bd1fc6a309d0103..09b8cc83965cc8263837ff94593d7599a8cb5ed4 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -32,6 +32,7 @@
 #include "ulist.h"
 #include "backref.h"
 #include "extent_io.h"
+#include "qgroup.h"
 
 /* TODO XXX FIXME
  *  - subvol delete -> delete when ref goes to 0? delete limits also?
@@ -84,8 +85,8 @@ struct btrfs_qgroup {
        /*
         * temp variables for accounting operations
         */
-       u64 tag;
-       u64 refcnt;
+       u64 old_refcnt;
+       u64 new_refcnt;
 };
 
 /*
@@ -98,6 +99,9 @@ struct btrfs_qgroup_list {
        struct btrfs_qgroup *member;
 };
 
+#define ptr_to_u64(x) ((u64)(uintptr_t)x)
+#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x)
+
 static int
 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
                   int init_flags);
@@ -1174,33 +1178,198 @@ out:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        return ret;
 }
+static int comp_oper(struct btrfs_qgroup_operation *oper1,
+                    struct btrfs_qgroup_operation *oper2)
+{
+       if (oper1->bytenr < oper2->bytenr)
+               return -1;
+       if (oper1->bytenr > oper2->bytenr)
+               return 1;
+       if (oper1->seq < oper2->seq)
+               return -1;
+       if (oper1->seq > oper2->seq)
+               return 1;
+       if (oper1->ref_root < oper2->ref_root)
+               return -1;
+       if (oper1->ref_root > oper2->ref_root)
+               return 1;
+       if (oper1->type < oper2->type)
+               return -1;
+       if (oper1->type > oper2->type)
+               return 1;
+       return 0;
+}
+
+static int insert_qgroup_oper(struct btrfs_fs_info *fs_info,
+                             struct btrfs_qgroup_operation *oper)
+{
+       struct rb_node **p;
+       struct rb_node *parent = NULL;
+       struct btrfs_qgroup_operation *cur;
+       int cmp;
+
+       spin_lock(&fs_info->qgroup_op_lock);
+       p = &fs_info->qgroup_op_tree.rb_node;
+       while (*p) {
+               parent = *p;
+               cur = rb_entry(parent, struct btrfs_qgroup_operation, n);
+               cmp = comp_oper(cur, oper);
+               if (cmp < 0) {
+                       p = &(*p)->rb_right;
+               } else if (cmp) {
+                       p = &(*p)->rb_left;
+               } else {
+                       spin_unlock(&fs_info->qgroup_op_lock);
+                       return -EEXIST;
+               }
+       }
+       rb_link_node(&oper->n, parent, p);
+       rb_insert_color(&oper->n, &fs_info->qgroup_op_tree);
+       spin_unlock(&fs_info->qgroup_op_lock);
+       return 0;
+}
 
 /*
- * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts
- * the modification into a list that's later used by btrfs_end_transaction to
- * pass the recorded modifications on to btrfs_qgroup_account_ref.
+ * Record a quota operation for processing later on.
+ * @trans: the transaction we are adding the delayed op to.
+ * @fs_info: the fs_info for this fs.
+ * @ref_root: the root of the reference we are acting on.
+ * @bytenr: the bytenr we are acting on.
+ * @num_bytes: the number of bytes in the reference.
+ * @type: the type of operation this is.
+ * @mod_seq: do we need to get a sequence number for looking up roots.
+ *
+ * We just add it to our trans qgroup_ref_list and carry on and process these
+ * operations in order at some later point.  If the reference root isn't a fs
+ * root then we don't bother with doing anything.
+ *
+ * MUST BE HOLDING THE REF LOCK.
  */
 int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
-                           struct btrfs_delayed_ref_node *node,
-                           struct btrfs_delayed_extent_op *extent_op)
+                           struct btrfs_fs_info *fs_info, u64 ref_root,
+                           u64 bytenr, u64 num_bytes,
+                           enum btrfs_qgroup_operation_type type, int mod_seq)
 {
-       struct qgroup_update *u;
+       struct btrfs_qgroup_operation *oper;
+       int ret;
 
-       BUG_ON(!trans->delayed_ref_elem.seq);
-       u = kmalloc(sizeof(*u), GFP_NOFS);
-       if (!u)
+       if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+               return 0;
+
+       oper = kmalloc(sizeof(*oper), GFP_NOFS);
+       if (!oper)
                return -ENOMEM;
 
-       u->node = node;
-       u->extent_op = extent_op;
-       list_add_tail(&u->list, &trans->qgroup_ref_list);
+       oper->ref_root = ref_root;
+       oper->bytenr = bytenr;
+       oper->num_bytes = num_bytes;
+       oper->type = type;
+       oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
+       INIT_LIST_HEAD(&oper->elem.list);
+       oper->elem.seq = 0;
+       ret = insert_qgroup_oper(fs_info, oper);
+       if (ret) {
+               /* Shouldn't happen so have an assert for developers */
+               ASSERT(0);
+               kfree(oper);
+               return ret;
+       }
+       list_add_tail(&oper->list, &trans->qgroup_ref_list);
+
+       if (mod_seq)
+               btrfs_get_tree_mod_seq(fs_info, &oper->elem);
 
        return 0;
 }
 
-static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info,
-                                   struct ulist *roots, struct ulist *tmp,
-                                   u64 seq)
+/*
+ * The easy accounting: if we are adding/removing the only ref for an extent
+ * then this qgroup and all of the parent qgroups get their reference and
+ * exclusive counts adjusted.
+ */
+static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
+                                 struct btrfs_qgroup_operation *oper)
+{
+       struct btrfs_qgroup *qgroup;
+       struct ulist *tmp;
+       struct btrfs_qgroup_list *glist;
+       struct ulist_node *unode;
+       struct ulist_iterator uiter;
+       int sign = 0;
+       int ret = 0;
+
+       tmp = ulist_alloc(GFP_NOFS);
+       if (!tmp)
+               return -ENOMEM;
+
+       spin_lock(&fs_info->qgroup_lock);
+       if (!fs_info->quota_root)
+               goto out;
+       qgroup = find_qgroup_rb(fs_info, oper->ref_root);
+       if (!qgroup)
+               goto out;
+       switch (oper->type) {
+       case BTRFS_QGROUP_OPER_ADD_EXCL:
+               sign = 1;
+               break;
+       case BTRFS_QGROUP_OPER_SUB_EXCL:
+               sign = -1;
+               break;
+       default:
+               ASSERT(0);
+       }
+       qgroup->rfer += sign * oper->num_bytes;
+       qgroup->rfer_cmpr += sign * oper->num_bytes;
+
+       WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
+       qgroup->excl += sign * oper->num_bytes;
+       qgroup->excl_cmpr += sign * oper->num_bytes;
+
+       qgroup_dirty(fs_info, qgroup);
+
+       /* Get all of the parent groups that contain this qgroup */
+       list_for_each_entry(glist, &qgroup->groups, next_group) {
+               ret = ulist_add(tmp, glist->group->qgroupid,
+                               ptr_to_u64(glist->group), GFP_ATOMIC);
+               if (ret < 0)
+                       goto out;
+       }
+
+       /* Iterate all of the parents and adjust their reference counts */
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(tmp, &uiter))) {
+               qgroup = u64_to_ptr(unode->aux);
+               qgroup->rfer += sign * oper->num_bytes;
+               qgroup->rfer_cmpr += sign * oper->num_bytes;
+               qgroup->excl += sign * oper->num_bytes;
+               if (sign < 0)
+                       WARN_ON(qgroup->excl < oper->num_bytes);
+               qgroup->excl_cmpr += sign * oper->num_bytes;
+               qgroup_dirty(fs_info, qgroup);
+
+               /* Add any parents of the parents */
+               list_for_each_entry(glist, &qgroup->groups, next_group) {
+                       ret = ulist_add(tmp, glist->group->qgroupid,
+                                       ptr_to_u64(glist->group), GFP_ATOMIC);
+                       if (ret < 0)
+                               goto out;
+               }
+       }
+       ret = 0;
+out:
+       spin_unlock(&fs_info->qgroup_lock);
+       ulist_free(tmp);
+       return ret;
+}
+
+/*
+ * Walk all of the roots that pointed to our bytenr and adjust their refcnts
+ * properly.
+ */
+static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
+                                 u64 root_to_skip, struct ulist *tmp,
+                                 struct ulist *roots, struct ulist *qgroups,
+                                 u64 seq, int *old_roots, int rescan)
 {
        struct ulist_node *unode;
        struct ulist_iterator uiter;
@@ -1211,256 +1380,549 @@ static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info,
 
        ULIST_ITER_INIT(&uiter);
        while ((unode = ulist_next(roots, &uiter))) {
+               /* We don't count our current root here */
+               if (unode->val == root_to_skip)
+                       continue;
                qg = find_qgroup_rb(fs_info, unode->val);
                if (!qg)
                        continue;
+               /*
+                * We could have a pending removal of this same ref so we may
+                * not have actually found our ref root when doing
+                * btrfs_find_all_roots, so we need to keep track of how many
+                * old roots we find in case we removed ours and added a
+                * different one at the same time.  I don't think this could
+                * happen in practice but that sort of thinking leads to pain
+                * and suffering and to the dark side.
+                */
+               (*old_roots)++;
 
                ulist_reinit(tmp);
-                                               /* XXX id not needed */
-               ret = ulist_add(tmp, qg->qgroupid,
-                               (u64)(uintptr_t)qg, GFP_ATOMIC);
+               ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
+                               GFP_ATOMIC);
+               if (ret < 0)
+                       return ret;
+               ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC);
                if (ret < 0)
                        return ret;
                ULIST_ITER_INIT(&tmp_uiter);
                while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
                        struct btrfs_qgroup_list *glist;
 
-                       qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
-                       if (qg->refcnt < seq)
-                               qg->refcnt = seq + 1;
+                       qg = u64_to_ptr(tmp_unode->aux);
+                       /*
+                        * We use this sequence number to keep from having to
+                        * run the whole list and 0 out the refcnt every time.
+                        * We basically use seq as the known 0 count and
+                        * then add 1 every time we see a qgroup.  This is how we
+                        * get how many of the roots actually point up to the
+                        * upper level qgroups in order to determine exclusive
+                        * counts.
+                        *
+                        * For rescan we want to set old_refcnt to seq so our
+                        * exclusive calculations end up correct.
+                        */
+                       if (rescan)
+                               qg->old_refcnt = seq;
+                       else if (qg->old_refcnt < seq)
+                               qg->old_refcnt = seq + 1;
                        else
-                               ++qg->refcnt;
+                               qg->old_refcnt++;
 
+                       if (qg->new_refcnt < seq)
+                               qg->new_refcnt = seq + 1;
+                       else
+                               qg->new_refcnt++;
                        list_for_each_entry(glist, &qg->groups, next_group) {
+                               ret = ulist_add(qgroups, glist->group->qgroupid,
+                                               ptr_to_u64(glist->group),
+                                               GFP_ATOMIC);
+                               if (ret < 0)
+                                       return ret;
                                ret = ulist_add(tmp, glist->group->qgroupid,
-                                               (u64)(uintptr_t)glist->group,
+                                               ptr_to_u64(glist->group),
                                                GFP_ATOMIC);
                                if (ret < 0)
                                        return ret;
                        }
                }
        }
+       return 0;
+}
 
+/*
+ * We need to walk forward in our operation tree and account for any roots that
+ * were deleted after we made this operation.
+ */
+static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info,
+                                      struct btrfs_qgroup_operation *oper,
+                                      struct ulist *tmp,
+                                      struct ulist *qgroups, u64 seq,
+                                      int *old_roots)
+{
+       struct ulist_node *unode;
+       struct ulist_iterator uiter;
+       struct btrfs_qgroup *qg;
+       struct btrfs_qgroup_operation *tmp_oper;
+       struct rb_node *n;
+       int ret;
+
+       ulist_reinit(tmp);
+
+       /*
+        * We only walk forward in the tree since we're only interested in
+        * removals that happened _after_ our operation.
+        */
+       spin_lock(&fs_info->qgroup_op_lock);
+       n = rb_next(&oper->n);
+       spin_unlock(&fs_info->qgroup_op_lock);
+       if (!n)
+               return 0;
+       tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
+       while (tmp_oper->bytenr == oper->bytenr) {
+               /*
+                * If it's not a removal we don't care, additions work out
+                * properly with our refcnt tracking.
+                */
+               if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED &&
+                   tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL)
+                       goto next;
+               qg = find_qgroup_rb(fs_info, tmp_oper->ref_root);
+               if (!qg)
+                       goto next;
+               ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
+                               GFP_ATOMIC);
+               if (ret) {
+                       if (ret < 0)
+                               return ret;
+                       /*
+                        * We only want to increase old_roots if this qgroup is
+                        * not already in the list of qgroups.  If it is already
+                        * there then that means it must have been re-added or
+                        * the delete will be discarded because we had an
+                        * existing ref that we haven't looked up yet.  In this
+                        * case we don't want to increase old_roots.  So if ret
+                        * == 1 then we know that this is the first time we've
+                        * seen this qgroup and we can bump the old_roots.
+                        */
+                       (*old_roots)++;
+                       ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg),
+                                       GFP_ATOMIC);
+                       if (ret < 0)
+                               return ret;
+               }
+next:
+               spin_lock(&fs_info->qgroup_op_lock);
+               n = rb_next(&tmp_oper->n);
+               spin_unlock(&fs_info->qgroup_op_lock);
+               if (!n)
+                       break;
+               tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
+       }
+
+       /* Ok now process the qgroups we found */
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(tmp, &uiter))) {
+               struct btrfs_qgroup_list *glist;
+
+               qg = u64_to_ptr(unode->aux);
+               if (qg->old_refcnt < seq)
+                       qg->old_refcnt = seq + 1;
+               else
+                       qg->old_refcnt++;
+               if (qg->new_refcnt < seq)
+                       qg->new_refcnt = seq + 1;
+               else
+                       qg->new_refcnt++;
+               list_for_each_entry(glist, &qg->groups, next_group) {
+                       ret = ulist_add(qgroups, glist->group->qgroupid,
+                                       ptr_to_u64(glist->group), GFP_ATOMIC);
+                       if (ret < 0)
+                               return ret;
+                       ret = ulist_add(tmp, glist->group->qgroupid,
+                                       ptr_to_u64(glist->group), GFP_ATOMIC);
+                       if (ret < 0)
+                               return ret;
+               }
+       }
        return 0;
 }
 
-static int qgroup_account_ref_step2(struct btrfs_fs_info *fs_info,
-                                   struct ulist *roots, struct ulist *tmp,
-                                   u64 seq, int sgn, u64 num_bytes,
-                                   struct btrfs_qgroup *qgroup)
+/* Add refcnt for the newly added reference. */
+static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
+                                 struct btrfs_qgroup_operation *oper,
+                                 struct btrfs_qgroup *qgroup,
+                                 struct ulist *tmp, struct ulist *qgroups,
+                                 u64 seq)
 {
        struct ulist_node *unode;
        struct ulist_iterator uiter;
        struct btrfs_qgroup *qg;
-       struct btrfs_qgroup_list *glist;
        int ret;
 
        ulist_reinit(tmp);
-       ret = ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
+       ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup),
+                       GFP_ATOMIC);
+       if (ret < 0)
+               return ret;
+       ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup),
+                       GFP_ATOMIC);
        if (ret < 0)
                return ret;
-
        ULIST_ITER_INIT(&uiter);
        while ((unode = ulist_next(tmp, &uiter))) {
-               qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
-               if (qg->refcnt < seq) {
-                       /* not visited by step 1 */
-                       qg->rfer += sgn * num_bytes;
-                       qg->rfer_cmpr += sgn * num_bytes;
-                       if (roots->nnodes == 0) {
-                               qg->excl += sgn * num_bytes;
-                               qg->excl_cmpr += sgn * num_bytes;
-                       }
-                       qgroup_dirty(fs_info, qg);
-               }
-               WARN_ON(qg->tag >= seq);
-               qg->tag = seq;
+               struct btrfs_qgroup_list *glist;
 
+               qg = u64_to_ptr(unode->aux);
+               if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
+                       if (qg->new_refcnt < seq)
+                               qg->new_refcnt = seq + 1;
+                       else
+                               qg->new_refcnt++;
+               } else {
+                       if (qg->old_refcnt < seq)
+                               qg->old_refcnt = seq + 1;
+                       else
+                               qg->old_refcnt++;
+               }
                list_for_each_entry(glist, &qg->groups, next_group) {
                        ret = ulist_add(tmp, glist->group->qgroupid,
-                                       (uintptr_t)glist->group, GFP_ATOMIC);
+                                       ptr_to_u64(glist->group), GFP_ATOMIC);
+                       if (ret < 0)
+                               return ret;
+                       ret = ulist_add(qgroups, glist->group->qgroupid,
+                                       ptr_to_u64(glist->group), GFP_ATOMIC);
                        if (ret < 0)
                                return ret;
                }
        }
-
        return 0;
 }
 
-static int qgroup_account_ref_step3(struct btrfs_fs_info *fs_info,
-                                   struct ulist *roots, struct ulist *tmp,
-                                   u64 seq, int sgn, u64 num_bytes)
+/*
+ * This adjusts the counters for all referenced qgroups if need be.
+ */
+static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
+                                 u64 root_to_skip, u64 num_bytes,
+                                 struct ulist *qgroups, u64 seq,
+                                 int old_roots, int new_roots, int rescan)
 {
        struct ulist_node *unode;
        struct ulist_iterator uiter;
        struct btrfs_qgroup *qg;
-       struct ulist_node *tmp_unode;
-       struct ulist_iterator tmp_uiter;
-       int ret;
+       u64 cur_new_count, cur_old_count;
 
        ULIST_ITER_INIT(&uiter);
-       while ((unode = ulist_next(roots, &uiter))) {
-               qg = find_qgroup_rb(fs_info, unode->val);
-               if (!qg)
-                       continue;
+       while ((unode = ulist_next(qgroups, &uiter))) {
+               bool dirty = false;
 
-               ulist_reinit(tmp);
-               ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC);
-               if (ret < 0)
-                       return ret;
+               qg = u64_to_ptr(unode->aux);
+               /*
+                * Wasn't referenced before but is now, add to the reference
+                * counters.
+                */
+               if (qg->old_refcnt <= seq && qg->new_refcnt > seq) {
+                       qg->rfer += num_bytes;
+                       qg->rfer_cmpr += num_bytes;
+                       dirty = true;
+               }
 
-               ULIST_ITER_INIT(&tmp_uiter);
-               while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
-                       struct btrfs_qgroup_list *glist;
+               /*
+                * Was referenced before but isn't now, subtract from the
+                * reference counters.
+                */
+               if (qg->old_refcnt > seq && qg->new_refcnt <= seq) {
+                       qg->rfer -= num_bytes;
+                       qg->rfer_cmpr -= num_bytes;
+                       dirty = true;
+               }
 
-                       qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
-                       if (qg->tag == seq)
-                               continue;
+               if (qg->old_refcnt < seq)
+                       cur_old_count = 0;
+               else
+                       cur_old_count = qg->old_refcnt - seq;
+               if (qg->new_refcnt < seq)
+                       cur_new_count = 0;
+               else
+                       cur_new_count = qg->new_refcnt - seq;
 
-                       if (qg->refcnt - seq == roots->nnodes) {
-                               qg->excl -= sgn * num_bytes;
-                               qg->excl_cmpr -= sgn * num_bytes;
-                               qgroup_dirty(fs_info, qg);
-                       }
+               /*
+                * If our refcount was the same as the roots previously but our
+                * new count isn't the same as the number of roots now then we
+                * went from having an exclusive reference on this range to not.
+                */
+               if (old_roots && cur_old_count == old_roots &&
+                   (cur_new_count != new_roots || new_roots == 0)) {
+                       WARN_ON(cur_new_count != new_roots && new_roots == 0);
+                       qg->excl -= num_bytes;
+                       qg->excl_cmpr -= num_bytes;
+                       dirty = true;
+               }
 
-                       list_for_each_entry(glist, &qg->groups, next_group) {
-                               ret = ulist_add(tmp, glist->group->qgroupid,
-                                               (uintptr_t)glist->group,
-                                               GFP_ATOMIC);
-                               if (ret < 0)
-                                       return ret;
-                       }
+               /*
+                * If we didn't reference all the roots before but now we do we
+                * have an exclusive reference to this range.
+                */
+               if ((!old_roots || (old_roots && cur_old_count != old_roots))
+                   && cur_new_count == new_roots) {
+                       qg->excl += num_bytes;
+                       qg->excl_cmpr += num_bytes;
+                       dirty = true;
                }
-       }
 
+               if (dirty)
+                       qgroup_dirty(fs_info, qg);
+       }
        return 0;
 }
 
 /*
- * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
- * from the fs. First, all roots referencing the extent are searched, and
- * then the space is accounted accordingly to the different roots. The
- * accounting algorithm works in 3 steps documented inline.
+ * If we removed a data extent and there were other references for that bytenr
+ * then we need to look up all of the referenced roots to check whether we
+ * still reference this bytenr.  If we do, we can just discard this operation.
  */
-int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
-                            struct btrfs_fs_info *fs_info,
-                            struct btrfs_delayed_ref_node *node,
-                            struct btrfs_delayed_extent_op *extent_op)
+static int check_existing_refs(struct btrfs_trans_handle *trans,
+                              struct btrfs_fs_info *fs_info,
+                              struct btrfs_qgroup_operation *oper)
 {
-       struct btrfs_root *quota_root;
-       u64 ref_root;
-       struct btrfs_qgroup *qgroup;
        struct ulist *roots = NULL;
-       u64 seq;
+       struct ulist_node *unode;
+       struct ulist_iterator uiter;
        int ret = 0;
-       int sgn;
 
-       if (!fs_info->quota_enabled)
-               return 0;
-
-       BUG_ON(!fs_info->quota_root);
+       ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
+                                  oper->elem.seq, &roots);
+       if (ret < 0)
+               return ret;
+       ret = 0;
 
-       if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
-           node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
-               struct btrfs_delayed_tree_ref *ref;
-               ref = btrfs_delayed_node_to_tree_ref(node);
-               ref_root = ref->root;
-       } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
-                  node->type == BTRFS_SHARED_DATA_REF_KEY) {
-               struct btrfs_delayed_data_ref *ref;
-               ref = btrfs_delayed_node_to_data_ref(node);
-               ref_root = ref->root;
-       } else {
-               BUG();
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(roots, &uiter))) {
+               if (unode->val == oper->ref_root) {
+                       ret = 1;
+                       break;
+               }
        }
+       ulist_free(roots);
+       btrfs_put_tree_mod_seq(fs_info, &oper->elem);
 
-       if (!is_fstree(ref_root)) {
-               /*
-                * non-fs-trees are not being accounted
-                */
-               return 0;
-       }
+       return ret;
+}
 
-       switch (node->action) {
-       case BTRFS_ADD_DELAYED_REF:
-       case BTRFS_ADD_DELAYED_EXTENT:
-               sgn = 1;
-               seq = btrfs_tree_mod_seq_prev(node->seq);
-               break;
-       case BTRFS_DROP_DELAYED_REF:
-               sgn = -1;
-               seq = node->seq;
-               break;
-       case BTRFS_UPDATE_DELAYED_HEAD:
-               return 0;
-       default:
-               BUG();
-       }
+/*
+ * If we share a reference across multiple roots then we may need to adjust
+ * various qgroups referenced and exclusive counters.  The basic premise is this
+ *
+ * 1) We have seq to represent a 0 count.  Instead of looping through all of the
+ * qgroups and resetting their refcount to 0 we just constantly bump this
+ * sequence number to act as the base reference count.  This means that if
+ * anybody is equal to or below this sequence they were never referenced.  We
+ * jack this sequence up by the number of roots we found each time in order to
+ * make sure we don't have any overlap.
+ *
+ * 2) We first search all the roots that reference the area _except_ the root
+ * we're acting on currently.  This makes up the old_refcnt of all the qgroups
+ * before.
+ *
+ * 3) We walk all of the qgroups referenced by the root we are currently acting
+ * on, and will either adjust old_refcnt in the case of a removal or the
+ * new_refcnt in the case of an addition.
+ *
+ * 4) Finally we walk all the qgroups that are referenced by this range
+ * including the root we are acting on currently.  We will adjust the counters
+ * based on the number of roots we had and will have after this operation.
+ *
+ * Take this example as an illustration
+ *
+ *                     [qgroup 1/0]
+ *                  /         |          \
+ *             [qg 0/0]   [qg 0/1]     [qg 0/2]
+ *                \          |            /
+ *               [        extent           ]
+ *
+ * Say we are adding a reference that is covered by qg 0/0.  The first step
+ * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with
+ * old_roots being 2.  Because it is adding new_roots will be 1.  We then go
+ * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's
+ * new_refcnt, bringing it to 3.  We then walk through all of the qgroups, we
+ * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a
+ * reference and thus must add the size to the referenced bytes.  Everything
+ * else is the same so nothing else changes.
+ */
+static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
+                                   struct btrfs_fs_info *fs_info,
+                                   struct btrfs_qgroup_operation *oper)
+{
+       struct ulist *roots = NULL;
+       struct ulist *qgroups, *tmp;
+       struct btrfs_qgroup *qgroup;
+       struct seq_list elem = {};
+       u64 seq;
+       int old_roots = 0;
+       int new_roots = 0;
+       int ret = 0;
 
-       mutex_lock(&fs_info->qgroup_rescan_lock);
-       if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
-               if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) {
-                       mutex_unlock(&fs_info->qgroup_rescan_lock);
+       if (oper->elem.seq) {
+               ret = check_existing_refs(trans, fs_info, oper);
+               if (ret < 0)
+                       return ret;
+               if (ret)
                        return 0;
-               }
        }
-       mutex_unlock(&fs_info->qgroup_rescan_lock);
 
-       /*
-        * the delayed ref sequence number we pass depends on the direction of
-        * the operation. for add operations, we pass
-        * tree_mod_log_prev_seq(node->seq) to skip
-        * the delayed ref's current sequence number, because we need the state
-        * of the tree before the add operation. for delete operations, we pass
-        * (node->seq) to include the delayed ref's current sequence number,
-        * because we need the state of the tree after the delete operation.
-        */
-       ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots);
-       if (ret < 0)
-               return ret;
-
-       spin_lock(&fs_info->qgroup_lock);
+       qgroups = ulist_alloc(GFP_NOFS);
+       if (!qgroups)
+               return -ENOMEM;
 
-       quota_root = fs_info->quota_root;
-       if (!quota_root)
-               goto unlock;
+       tmp = ulist_alloc(GFP_NOFS);
+       if (!tmp) {
+               ulist_free(qgroups);
+               return -ENOMEM;
+       }
 
-       qgroup = find_qgroup_rb(fs_info, ref_root);
+       btrfs_get_tree_mod_seq(fs_info, &elem);
+       ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
+                                  &roots);
+       btrfs_put_tree_mod_seq(fs_info, &elem);
+       if (ret < 0) {
+               ulist_free(qgroups);
+               ulist_free(tmp);
+               return ret;
+       }
+       spin_lock(&fs_info->qgroup_lock);
+       qgroup = find_qgroup_rb(fs_info, oper->ref_root);
        if (!qgroup)
-               goto unlock;
+               goto out;
+       seq = fs_info->qgroup_seq;
 
        /*
-        * step 1: for each old ref, visit all nodes once and inc refcnt
+        * So roots is the list of all the roots currently pointing at the
+        * bytenr, including the ref we are adding if we are adding, or not if
+        * we are removing a ref.  So we pass in the ref_root to skip that root
+        * in our calculations.  We set old_refcnt and new_refcnt because who the
+        * hell knows what everything looked like before, and it doesn't matter
+        * except...
         */
-       ulist_reinit(fs_info->qgroup_ulist);
-       seq = fs_info->qgroup_seq;
-       fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
+       ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups,
+                                    seq, &old_roots, 0);
+       if (ret < 0)
+               goto out;
 
-       ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist,
-                                      seq);
-       if (ret)
-               goto unlock;
+       /*
+        * Now adjust the refcounts of the qgroups that care about this
+        * reference, either the old_count in the case of removal or new_count
+        * in the case of an addition.
+        */
+       ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups,
+                                    seq);
+       if (ret < 0)
+               goto out;
 
        /*
-        * step 2: walk from the new root
+        * ...in the case of removals.  If we had a removal before we got around
+        * to processing this operation then we need to find that guy and count
+        * his references as if they really existed so we don't end up screwing
+        * up the exclusive counts.  Then whenever we go to process the delete
+        * everything will be grand and we can account for whatever exclusive
+        * changes need to be made there.  We also have to pass in old_roots so
+        * we have an accurate count of the roots as it pertains to this
+        * operations view of the world.
         */
-       ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist,
-                                      seq, sgn, node->num_bytes, qgroup);
-       if (ret)
-               goto unlock;
+       ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq,
+                                         &old_roots);
+       if (ret < 0)
+               goto out;
 
        /*
-        * step 3: walk again from old refs
+        * If we are adding our root then we need to adjust the number of roots
+        * up; otherwise old_roots is already the number of roots we want.
         */
-       ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist,
-                                      seq, sgn, node->num_bytes);
-       if (ret)
-               goto unlock;
+       if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
+               new_roots = old_roots + 1;
+       } else {
+               new_roots = old_roots;
+               old_roots++;
+       }
+       fs_info->qgroup_seq += old_roots + 1;
 
-unlock:
+
+       /*
+        * And now the magic happens, bless Arne for having a pretty elegant
+        * solution for this.
+        */
+       qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes,
+                              qgroups, seq, old_roots, new_roots, 0);
+out:
        spin_unlock(&fs_info->qgroup_lock);
+       ulist_free(qgroups);
        ulist_free(roots);
+       ulist_free(tmp);
+       return ret;
+}
+
+/*
+ * btrfs_qgroup_account is called for every operation that was recorded while
+ * running delayed refs.  Exclusive operations adjust the counters directly;
+ * for shared operations all roots referencing the extent are searched and the
+ * space is accounted across the affected qgroups.
+ */
+static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
+                               struct btrfs_fs_info *fs_info,
+                               struct btrfs_qgroup_operation *oper)
+{
+       int ret = 0;
+
+       if (!fs_info->quota_enabled)
+               return 0;
+
+       BUG_ON(!fs_info->quota_root);
+
+       mutex_lock(&fs_info->qgroup_rescan_lock);
+       if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+               if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) {
+                       mutex_unlock(&fs_info->qgroup_rescan_lock);
+                       return 0;
+               }
+       }
+       mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+       ASSERT(is_fstree(oper->ref_root));
+
+       switch (oper->type) {
+       case BTRFS_QGROUP_OPER_ADD_EXCL:
+       case BTRFS_QGROUP_OPER_SUB_EXCL:
+               ret = qgroup_excl_accounting(fs_info, oper);
+               break;
+       case BTRFS_QGROUP_OPER_ADD_SHARED:
+       case BTRFS_QGROUP_OPER_SUB_SHARED:
+               ret = qgroup_shared_accounting(trans, fs_info, oper);
+               break;
+       default:
+               ASSERT(0);
+       }
+       return ret;
+}
 
+/*
+ * Needs to be called every time we run delayed refs, even if there was an
+ * error, in order to clean up outstanding operations.
+ */
+int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
+                                   struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_qgroup_operation *oper;
+       int ret = 0;
+
+       while (!list_empty(&trans->qgroup_ref_list)) {
+               oper = list_first_entry(&trans->qgroup_ref_list,
+                                       struct btrfs_qgroup_operation, list);
+               list_del_init(&oper->list);
+               if (!ret || !trans->aborted)
+                       ret = btrfs_qgroup_account(trans, fs_info, oper);
+               spin_lock(&fs_info->qgroup_op_lock);
+               rb_erase(&oper->n, &fs_info->qgroup_op_tree);
+               spin_unlock(&fs_info->qgroup_op_lock);
+               btrfs_put_tree_mod_seq(fs_info, &oper->elem);
+               kfree(oper);
+       }
        return ret;
 }
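The whole patch hinges on the seq-based refcount trick described in the comments above: fs_info->qgroup_seq acts as a moving zero point, so the per-extent accounting never has to walk every qgroup and reset its counters. As a standalone illustration (a sketch only, not part of the diff; the toy_* names and the u64 typedef are invented for the example), the bookkeeping boils down to:

/* Sketch only -- not part of the patch; toy_* names are illustrative. */
typedef unsigned long long u64;

struct toy_qgroup {
	u64 old_refcnt;		/* roots seen before the operation */
	u64 new_refcnt;		/* roots seen after the operation */
};

/* Count one root against the moving zero point 'seq'. */
static void toy_bump(u64 *refcnt, u64 seq)
{
	if (*refcnt < seq)
		*refcnt = seq + 1;	/* first root seen this round */
	else
		(*refcnt)++;		/* another root seen this round */
}

/* Counts at or below seq mean "not referenced in this round". */
static int toy_gained_ref(const struct toy_qgroup *qg, u64 seq)
{
	return qg->old_refcnt <= seq && qg->new_refcnt > seq;
}

static int toy_lost_ref(const struct toy_qgroup *qg, u64 seq)
{
	return qg->old_refcnt > seq && qg->new_refcnt <= seq;
}

These are exactly the comparisons qgroup_adjust_counters() makes before moving num_bytes into or out of the referenced counters, and comparing refcnt - seq against the number of roots decides the exclusive counters.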
 
@@ -1629,8 +2091,16 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
                srcgroup = find_qgroup_rb(fs_info, srcid);
                if (!srcgroup)
                        goto unlock;
-               dstgroup->rfer = srcgroup->rfer - level_size;
-               dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size;
+
+               /*
+                * We call inherit after we clone the root in order to make sure
+                * our counts don't go crazy, so at this point the only
+                * difference between the two roots should be the root node.
+                */
+               dstgroup->rfer = srcgroup->rfer;
+               dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
+               dstgroup->excl = level_size;
+               dstgroup->excl_cmpr = level_size;
                srcgroup->excl = level_size;
                srcgroup->excl_cmpr = level_size;
                qgroup_dirty(fs_info, dstgroup);
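To make the new inheritance concrete (the numbers below are only an illustration, not taken from the patch): if the source subvolume's qgroup showed rfer = 1 GiB before the snapshot, both the source and the destination qgroups still report rfer = 1 GiB afterwards, and each is left with excl = level_size (a single tree node, e.g. 16 KiB with a 16 KiB nodesize), because the cloned root node is the only block the two trees do not share. The removed lines instead subtracted level_size from the destination's referenced count, which no longer matches how the clone is accounted.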
@@ -1734,7 +2204,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
                struct btrfs_qgroup *qg;
                struct btrfs_qgroup_list *glist;
 
-               qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+               qg = u64_to_ptr(unode->aux);
 
                if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
                    qg->reserved + (s64)qg->rfer + num_bytes >
@@ -1766,7 +2236,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
        while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
                struct btrfs_qgroup *qg;
 
-               qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+               qg = u64_to_ptr(unode->aux);
 
                qg->reserved += num_bytes;
        }
@@ -1812,7 +2282,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
                struct btrfs_qgroup *qg;
                struct btrfs_qgroup_list *glist;
 
-               qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+               qg = u64_to_ptr(unode->aux);
 
                qg->reserved -= num_bytes;
 
@@ -1848,15 +2318,15 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
  */
 static int
 qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
-                  struct btrfs_trans_handle *trans, struct ulist *tmp,
-                  struct extent_buffer *scratch_leaf)
+                  struct btrfs_trans_handle *trans, struct ulist *qgroups,
+                  struct ulist *tmp, struct extent_buffer *scratch_leaf)
 {
        struct btrfs_key found;
        struct ulist *roots = NULL;
-       struct ulist_node *unode;
-       struct ulist_iterator uiter;
        struct seq_list tree_mod_seq_elem = {};
+       u64 num_bytes;
        u64 seq;
+       int new_roots;
        int slot;
        int ret;
 
@@ -1897,8 +2367,6 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
        mutex_unlock(&fs_info->qgroup_rescan_lock);
 
        for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
-               u64 num_bytes;
-
                btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
                if (found.type != BTRFS_EXTENT_ITEM_KEY &&
                    found.type != BTRFS_METADATA_ITEM_KEY)
@@ -1908,76 +2376,34 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
                else
                        num_bytes = found.offset;
 
-               ret = btrfs_find_all_roots(trans, fs_info, found.objectid,
-                                          tree_mod_seq_elem.seq, &roots);
+               ulist_reinit(qgroups);
+               ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
+                                          &roots);
                if (ret < 0)
                        goto out;
                spin_lock(&fs_info->qgroup_lock);
                seq = fs_info->qgroup_seq;
                fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
 
-               ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq);
-               if (ret) {
+               new_roots = 0;
+               ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups,
+                                            seq, &new_roots, 1);
+               if (ret < 0) {
                        spin_unlock(&fs_info->qgroup_lock);
                        ulist_free(roots);
                        goto out;
                }
 
-               /*
-                * step2 of btrfs_qgroup_account_ref works from a single root,
-                * we're doing all at once here.
-                */
-               ulist_reinit(tmp);
-               ULIST_ITER_INIT(&uiter);
-               while ((unode = ulist_next(roots, &uiter))) {
-                       struct btrfs_qgroup *qg;
-
-                       qg = find_qgroup_rb(fs_info, unode->val);
-                       if (!qg)
-                               continue;
-
-                       ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg,
-                                       GFP_ATOMIC);
-                       if (ret < 0) {
-                               spin_unlock(&fs_info->qgroup_lock);
-                               ulist_free(roots);
-                               goto out;
-                       }
-               }
-
-               /* this loop is similar to step 2 of btrfs_qgroup_account_ref */
-               ULIST_ITER_INIT(&uiter);
-               while ((unode = ulist_next(tmp, &uiter))) {
-                       struct btrfs_qgroup *qg;
-                       struct btrfs_qgroup_list *glist;
-
-                       qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux;
-                       qg->rfer += num_bytes;
-                       qg->rfer_cmpr += num_bytes;
-                       WARN_ON(qg->tag >= seq);
-                       if (qg->refcnt - seq == roots->nnodes) {
-                               qg->excl += num_bytes;
-                               qg->excl_cmpr += num_bytes;
-                       }
-                       qgroup_dirty(fs_info, qg);
-
-                       list_for_each_entry(glist, &qg->groups, next_group) {
-                               ret = ulist_add(tmp, glist->group->qgroupid,
-                                               (uintptr_t)glist->group,
-                                               GFP_ATOMIC);
-                               if (ret < 0) {
-                                       spin_unlock(&fs_info->qgroup_lock);
-                                       ulist_free(roots);
-                                       goto out;
-                               }
-                       }
+               ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups,
+                                            seq, 0, new_roots, 1);
+               if (ret < 0) {
+                       spin_unlock(&fs_info->qgroup_lock);
+                       ulist_free(roots);
+                       goto out;
                }
-
                spin_unlock(&fs_info->qgroup_lock);
                ulist_free(roots);
-               ret = 0;
        }
-
 out:
        btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
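A worked example of the rescan path above (values invented for illustration): for an extent of num_bytes referenced by three subvolumes, qgroup_calc_old_refcnt() is called with rescan set, so every qgroup's old_refcnt is pinned to seq (treated as zero) while new_refcnt is bumped once per root, and the root count lands in new_roots (here 3). qgroup_adjust_counters() then runs with old_roots = 0, so each qgroup that was reached adds num_bytes to its referenced bytes, and only the qgroups reached by all three roots (cur_new_count == new_roots) also add num_bytes to their exclusive bytes.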
 
@@ -1990,13 +2416,16 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
                                                     qgroup_rescan_work);
        struct btrfs_path *path;
        struct btrfs_trans_handle *trans = NULL;
-       struct ulist *tmp = NULL;
+       struct ulist *tmp = NULL, *qgroups = NULL;
        struct extent_buffer *scratch_leaf = NULL;
        int err = -ENOMEM;
 
        path = btrfs_alloc_path();
        if (!path)
                goto out;
+       qgroups = ulist_alloc(GFP_NOFS);
+       if (!qgroups)
+               goto out;
        tmp = ulist_alloc(GFP_NOFS);
        if (!tmp)
                goto out;
@@ -2015,7 +2444,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
                        err = -EINTR;
                } else {
                        err = qgroup_rescan_leaf(fs_info, path, trans,
-                                                tmp, scratch_leaf);
+                                                qgroups, tmp, scratch_leaf);
                }
                if (err > 0)
                        btrfs_commit_transaction(trans, fs_info->fs_root);
@@ -2025,7 +2454,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 
 out:
        kfree(scratch_leaf);
-       ulist_free(tmp);
+       ulist_free(qgroups);
+       ulist_free(tmp);
        btrfs_free_path(path);
 
        mutex_lock(&fs_info->qgroup_rescan_lock);
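For orientation, here is a sketch of how the two new entry points are meant to be driven, assumed from the signatures introduced above. The helper below is invented for illustration, is not part of the commit, and would only build inside fs/btrfs (it needs ctree.h, transaction.h and qgroup.h):

/* Illustrative only: record a shared-ref addition, then settle all ops. */
static int example_record_and_settle(struct btrfs_trans_handle *trans,
				     struct btrfs_fs_info *fs_info,
				     u64 ref_root, u64 bytenr, u64 num_bytes)
{
	int ret;

	/*
	 * Queue the operation while delayed refs are being processed.  Per
	 * the comment on btrfs_qgroup_record_ref(), the caller must be
	 * holding the ref lock at this point.
	 */
	ret = btrfs_qgroup_record_ref(trans, fs_info, ref_root, bytenr,
				      num_bytes, BTRFS_QGROUP_OPER_ADD_SHARED,
				      0 /* mod_seq */);
	if (ret)
		return ret;

	/* ... run the remaining delayed refs for the transaction ... */

	/*
	 * Settle every queued operation; this also removes each one from
	 * the rb-tree and drops its tree_mod_seq reference.
	 */
	return btrfs_delayed_qgroup_accounting(trans, fs_info);
}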