Merge branch 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2
author     Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 26 May 2011 17:55:15 +0000 (10:55 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 26 May 2011 17:55:15 +0000 (10:55 -0700)
* 'linux-next' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2: (28 commits)
  Ocfs2: Teach local-mounted ocfs2 to handle unwritten_extents correctly.
  ocfs2/dlm: Do not migrate resource to a node that is leaving the domain
  ocfs2/dlm: Add new dlm message DLM_BEGIN_EXIT_DOMAIN_MSG
  Ocfs2/move_extents: Set several trivial constraints for threshold.
  Ocfs2/move_extents: Let defrag handle partial extent moving.
  Ocfs2/move_extents: move/defrag extents within a certain range.
  Ocfs2/move_extents: helper to calculate the defraging length in one run.
  Ocfs2/move_extents: move entire/partial extent.
  Ocfs2/move_extents: helpers to update the group descriptor and global bitmap inode.
  Ocfs2/move_extents: helper to probe a proper region to move in an alloc group.
  Ocfs2/move_extents: helper to validate and adjust moving goal.
  Ocfs2/move_extents: find the victim alloc group, where the given #blk fits.
  Ocfs2/move_extents: defrag a range of extent.
  Ocfs2/move_extents: move a range of extent.
  Ocfs2/move_extents: lock allocators and reserve metadata blocks and data clusters for extents moving.
  Ocfs2/move_extents: Add basic framework and source files for extent moving.
  Ocfs2/move_extents: Adding new ioctl code 'OCFS2_IOC_MOVE_EXT' to ocfs2.
  Ocfs2/refcounttree: Publicize couple of funcs from refcounttree.c
  Ocfs2: Add a new code 'OCFS2_INFO_FREEFRAG' for o2info ioctl.
  Ocfs2: Add a new code 'OCFS2_INFO_FREEINODE' for o2info ioctl.
  ...

23 files changed:
Documentation/ABI/obsolete/o2cb [deleted file]
Documentation/ABI/removed/o2cb [new file with mode: 0644]
Documentation/feature-removal-schedule.txt
Documentation/filesystems/ocfs2.txt
fs/ocfs2/Makefile
fs/ocfs2/alloc.c
fs/ocfs2/alloc.h
fs/ocfs2/cluster/sys.c
fs/ocfs2/dlm/dlmcommon.h
fs/ocfs2/dlm/dlmdebug.c
fs/ocfs2/dlm/dlmdomain.c
fs/ocfs2/dlm/dlmmaster.c
fs/ocfs2/dlm/dlmrecovery.c
fs/ocfs2/dlmfs/dlmfs.c
fs/ocfs2/file.c
fs/ocfs2/ioctl.c
fs/ocfs2/move_extents.c [new file with mode: 0644]
fs/ocfs2/move_extents.h [new file with mode: 0644]
fs/ocfs2/ocfs2_ioctl.h
fs/ocfs2/ocfs2_trace.h
fs/ocfs2/refcounttree.c
fs/ocfs2/refcounttree.h
fs/ocfs2/super.c

diff --git a/Documentation/ABI/obsolete/o2cb b/Documentation/ABI/obsolete/o2cb
deleted file mode 100644 (file)
index 9c49d8e..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-What:          /sys/o2cb symlink
-Date:          Dec 2005
-KernelVersion: 2.6.16
-Contact:       ocfs2-devel@oss.oracle.com
-Description:   This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink will
-               be removed when new versions of ocfs2-tools which know to look
-               in /sys/fs/o2cb are sufficiently prevalent. Don't code new
-               software to look here, it should try /sys/fs/o2cb instead.
-               See Documentation/ABI/stable/o2cb for more information on usage.
-Users:         ocfs2-tools. It's sufficient to mail proposed changes to
-               ocfs2-devel@oss.oracle.com.
diff --git a/Documentation/ABI/removed/o2cb b/Documentation/ABI/removed/o2cb
new file mode 100644 (file)
index 0000000..7f5daa4
--- /dev/null
@@ -0,0 +1,10 @@
+What:          /sys/o2cb symlink
+Date:          May 2011
+KernelVersion: 2.6.40
+Contact:       ocfs2-devel@oss.oracle.com
+Description:   This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink is
+               removed when new versions of ocfs2-tools which know to look
+               in /sys/fs/o2cb are sufficiently prevalent. Don't code new
+               software to look here; it should try /sys/fs/o2cb instead.
+Users:         ocfs2-tools. It's sufficient to mail proposed changes to
+               ocfs2-devel@oss.oracle.com.
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 95788ad2506c57575e1ead9af19f6fc3440e59d5..ff31b1cc50aa3ddbf66f12172ec434456b662283 100644 (file)
@@ -262,16 +262,6 @@ Who:       Michael Buesch <mb@bu3sch.de>
 
 ---------------------------
 
-What:  /sys/o2cb symlink
-When:  January 2010
-Why:   /sys/fs/o2cb is the proper location for this information - /sys/o2cb
-       exists as a symlink for backwards compatibility for old versions of
-       ocfs2-tools. 2 years should be sufficient time to phase in new versions
-       which know to look in /sys/fs/o2cb.
-Who:   ocfs2-devel@oss.oracle.com
-
----------------------------
-
 What:  Ability for non root users to shm_get hugetlb pages based on mlock
        resource limits
 When:  2.6.31
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
index 9ed920a8cd79e92ef871e3fa29201b0777a8ce2a..7618a287aa41f085e8c166393b99ee31672c2a9e 100644 (file)
@@ -46,9 +46,15 @@ errors=panic         Panic and halt the machine if an error occurs.
 intr           (*)     Allow signals to interrupt cluster operations.
 nointr                 Do not allow signals to interrupt cluster
                        operations.
+noatime                        Do not update access time.
+relatime(*)            Update atime if the previous atime is older than
+                       mtime or ctime.
+strictatime            Always update atime, but the minimum update interval
+                       is specified by atime_quantum.
 atime_quantum=60(*)    OCFS2 will not update atime unless this number
                        of seconds has passed since the last update.
-                       Set to zero to always update atime.
+                       Set to zero to always update atime. This option only
+                       works together with strictatime.
 data=ordered   (*)     All data are forced directly out to the main file
                        system prior to its metadata being committed to the
                        journal.
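
For illustration (not part of the patch): with these options, a mount that
always updates atime but batches the updates to the default 60-second quantum
would be invoked as follows, where the device and mount point are placeholders:

	mount -t ocfs2 -o strictatime,atime_quantum=60 /dev/sda1 /mnt/ocfs2

relatime remains the default and needs no explicit option.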
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index d8a0313e99e6af42b8412e91bb44cb2d7bb410b8..f17e58b32989a53f4f0413065bbd5e07d7901d5d 100644 (file)
@@ -30,6 +30,7 @@ ocfs2-objs := \
        namei.o                 \
        refcounttree.o          \
        reservations.o          \
+       move_extents.o          \
        resize.o                \
        slot_map.o              \
        suballoc.o              \
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 48aa9c7401c77aa5487fbc2fb1c472efa6217faa..ed553c60de827e0ebad24e3501e0e00d21c82cfc 100644 (file)
@@ -29,6 +29,7 @@
 #include <linux/highmem.h>
 #include <linux/swap.h>
 #include <linux/quotaops.h>
+#include <linux/blkdev.h>
 
 #include <cluster/masklog.h>
 
@@ -7184,3 +7185,168 @@ out_commit:
 out:
        return ret;
 }
+
+static int ocfs2_trim_extent(struct super_block *sb,
+                            struct ocfs2_group_desc *gd,
+                            u32 start, u32 count)
+{
+       u64 discard, bcount;
+
+       bcount = ocfs2_clusters_to_blocks(sb, count);
+       discard = le64_to_cpu(gd->bg_blkno) +
+                       ocfs2_clusters_to_blocks(sb, start);
+
+       trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
+
+       return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
+}
+
+static int ocfs2_trim_group(struct super_block *sb,
+                           struct ocfs2_group_desc *gd,
+                           u32 start, u32 max, u32 minbits)
+{
+       int ret = 0, count = 0, next;
+       void *bitmap = gd->bg_bitmap;
+
+       if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
+               return 0;
+
+       trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
+                              start, max, minbits);
+
+       while (start < max) {
+               start = ocfs2_find_next_zero_bit(bitmap, max, start);
+               if (start >= max)
+                       break;
+               next = ocfs2_find_next_bit(bitmap, max, start);
+
+               if ((next - start) >= minbits) {
+                       ret = ocfs2_trim_extent(sb, gd,
+                                               start, next - start);
+                       if (ret < 0) {
+                               mlog_errno(ret);
+                               break;
+                       }
+                       count += next - start;
+               }
+               start = next + 1;
+
+               if (fatal_signal_pending(current)) {
+                       count = -ERESTARTSYS;
+                       break;
+               }
+
+               if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
+                       break;
+       }
+
+       if (ret < 0)
+               count = ret;
+
+       return count;
+}
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+       struct ocfs2_super *osb = OCFS2_SB(sb);
+       u64 start, len, trimmed, first_group, last_group, group;
+       int ret, cnt;
+       u32 first_bit, last_bit, minlen;
+       struct buffer_head *main_bm_bh = NULL;
+       struct inode *main_bm_inode = NULL;
+       struct buffer_head *gd_bh = NULL;
+       struct ocfs2_dinode *main_bm;
+       struct ocfs2_group_desc *gd = NULL;
+
+       start = range->start >> osb->s_clustersize_bits;
+       len = range->len >> osb->s_clustersize_bits;
+       minlen = range->minlen >> osb->s_clustersize_bits;
+       trimmed = 0;
+
+       if (!len) {
+               range->len = 0;
+               return 0;
+       }
+
+       if (minlen >= osb->bitmap_cpg)
+               return -EINVAL;
+
+       main_bm_inode = ocfs2_get_system_file_inode(osb,
+                                                   GLOBAL_BITMAP_SYSTEM_INODE,
+                                                   OCFS2_INVALID_SLOT);
+       if (!main_bm_inode) {
+               ret = -EIO;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       mutex_lock(&main_bm_inode->i_mutex);
+
+       ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
+       if (ret < 0) {
+               mlog_errno(ret);
+               goto out_mutex;
+       }
+       main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
+
+       if (start >= le32_to_cpu(main_bm->i_clusters)) {
+               ret = -EINVAL;
+               goto out_unlock;
+       }
+
+       if (start + len > le32_to_cpu(main_bm->i_clusters))
+               len = le32_to_cpu(main_bm->i_clusters) - start;
+
+       trace_ocfs2_trim_fs(start, len, minlen);
+
+       /* Determine first and last group to examine based on start and len */
+       first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+       if (first_group == osb->first_cluster_group_blkno)
+               first_bit = start;
+       else
+               first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
+       last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
+       last_bit = osb->bitmap_cpg;
+
+       for (group = first_group; group <= last_group;) {
+               if (first_bit + len >= osb->bitmap_cpg)
+                       last_bit = osb->bitmap_cpg;
+               else
+                       last_bit = first_bit + len;
+
+               ret = ocfs2_read_group_descriptor(main_bm_inode,
+                                                 main_bm, group,
+                                                 &gd_bh);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       break;
+               }
+
+               gd = (struct ocfs2_group_desc *)gd_bh->b_data;
+               cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
+               brelse(gd_bh);
+               gd_bh = NULL;
+               if (cnt < 0) {
+                       ret = cnt;
+                       mlog_errno(ret);
+                       break;
+               }
+
+               trimmed += cnt;
+               len -= osb->bitmap_cpg - first_bit;
+               first_bit = 0;
+               if (group == osb->first_cluster_group_blkno)
+                       group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+               else
+                       group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+       }
+       range->len = trimmed * sb->s_blocksize;
+out_unlock:
+       ocfs2_inode_unlock(main_bm_inode, 0);
+       brelse(main_bm_bh);
+out_mutex:
+       mutex_unlock(&main_bm_inode->i_mutex);
+       iput(main_bm_inode);
+out:
+       return ret;
+}
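
For illustration (not part of the patch): ocfs2_trim_fs() is reached through
the generic FITRIM ioctl, wired up in ocfs2_ioctl() later in this merge. A
minimal userspace sketch follows; FITRIM and struct fstrim_range come from
<linux/fs.h>, the mount point is a placeholder, and the handler requires
CAP_SYS_ADMIN:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>           /* FITRIM, struct fstrim_range */

int main(void)
{
	struct fstrim_range range = {
		.start  = 0,
		.len    = UINT64_MAX,   /* ocfs2_trim_fs() clamps this to i_clusters */
		.minlen = 1024 * 1024,  /* skip free extents smaller than 1 MiB */
	};
	int fd = open("/mnt/ocfs2", O_RDONLY);  /* hypothetical mount point */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		close(fd);
		return 1;
	}
	/* on success the kernel writes the trimmed byte count back to range.len */
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}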
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 3bd08a03251c32fe904af81cb49305f72dba1e41..ca381c5841273433d279cd6aca8137ec01d03bbd 100644 (file)
@@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
                    struct buffer_head **leaf_bh);
 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
 
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
 /*
  * Helper function to look at the # of clusters in an extent record.
  */
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index bc702dab5d1f912be9e70ee1dd58fb8aaeb441dc..a4b07730b2e1d0abb257a126fce7f3911ae1d434 100644 (file)
@@ -57,7 +57,6 @@ static struct kset *o2cb_kset;
 void o2cb_sys_shutdown(void)
 {
        mlog_sys_shutdown();
-       sysfs_remove_link(NULL, "o2cb");
        kset_unregister(o2cb_kset);
 }
 
@@ -69,14 +68,6 @@ int o2cb_sys_init(void)
        if (!o2cb_kset)
                return -ENOMEM;
 
-       /*
-        * Create this symlink for backwards compatibility with old
-        * versions of ocfs2-tools which look for things in /sys/o2cb.
-        */
-       ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb");
-       if (ret)
-               goto error;
-
        ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
        if (ret)
                goto error;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 4bdf7baee344dd9533bf08bc3969f3b4b00b4ac8..d602abb51b610d525cc437daa05c25d2105fe0c3 100644 (file)
@@ -144,6 +144,7 @@ struct dlm_ctxt
        wait_queue_head_t dlm_join_events;
        unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
        unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+       unsigned long exit_domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
        unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
        struct dlm_recovery_ctxt reco;
        spinlock_t master_lock;
@@ -401,6 +402,18 @@ static inline int dlm_lvb_is_empty(char *lvb)
        return 1;
 }
 
+static inline char *dlm_list_in_text(enum dlm_lockres_list idx)
+{
+       if (idx == DLM_GRANTED_LIST)
+               return "granted";
+       else if (idx == DLM_CONVERTING_LIST)
+               return "converting";
+       else if (idx == DLM_BLOCKED_LIST)
+               return "blocked";
+       else
+               return "unknown";
+}
+
 static inline struct list_head *
 dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
 {
@@ -448,6 +461,7 @@ enum {
        DLM_FINALIZE_RECO_MSG           = 518,
        DLM_QUERY_REGION                = 519,
        DLM_QUERY_NODEINFO              = 520,
+       DLM_BEGIN_EXIT_DOMAIN_MSG       = 521,
 };
 
 struct dlm_reco_node_data
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 04a32be0aeb92f6ec15fb9217e0cca789f895bca..56f82cb912e379e4149ab5454261a9bb6bdadf67 100644 (file)
@@ -756,6 +756,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
                                 buf + out, len - out);
        out += snprintf(buf + out, len - out, "\n");
 
+       /* Exit Domain Map: xx xx xx */
+       out += snprintf(buf + out, len - out, "Exit Domain Map: ");
+       out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES,
+                                buf + out, len - out);
+       out += snprintf(buf + out, len - out, "\n");
+
        /* Live Map: xx xx xx */
        out += snprintf(buf + out, len - out, "Live Map: ");
        out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 3b179d6cbde09be9017eb21fbefcc5df8b217a53..6ed6b95dcf935a6516e935b85a3ca9ffc0b8a9d8 100644 (file)
@@ -132,10 +132,12 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
  * New in version 1.1:
  *     - Message DLM_QUERY_REGION added to support global heartbeat
  *     - Message DLM_QUERY_NODEINFO added to allow online node removes
+ * New in version 1.2:
+ *     - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
  */
 static const struct dlm_protocol_version dlm_protocol = {
        .pv_major = 1,
-       .pv_minor = 1,
+       .pv_minor = 2,
 };
 
 #define DLM_DOMAIN_BACKOFF_MS 200
@@ -449,14 +451,18 @@ redo_bucket:
                        dropped = dlm_empty_lockres(dlm, res);
 
                        spin_lock(&res->spinlock);
-                       __dlm_lockres_calc_usage(dlm, res);
-                       iter = res->hash_node.next;
+                       if (dropped)
+                               __dlm_lockres_calc_usage(dlm, res);
+                       else
+                               iter = res->hash_node.next;
                        spin_unlock(&res->spinlock);
 
                        dlm_lockres_put(res);
 
-                       if (dropped)
+                       if (dropped) {
+                               cond_resched_lock(&dlm->spinlock);
                                goto redo_bucket;
+                       }
                }
                cond_resched_lock(&dlm->spinlock);
                num += n;
@@ -486,6 +492,28 @@ static int dlm_no_joining_node(struct dlm_ctxt *dlm)
        return ret;
 }
 
+static int dlm_begin_exit_domain_handler(struct o2net_msg *msg, u32 len,
+                                        void *data, void **ret_data)
+{
+       struct dlm_ctxt *dlm = data;
+       unsigned int node;
+       struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
+
+       if (!dlm_grab(dlm))
+               return 0;
+
+       node = exit_msg->node_idx;
+       mlog(0, "%s: Node %u sent a begin exit domain message\n", dlm->name, node);
+
+       spin_lock(&dlm->spinlock);
+       set_bit(node, dlm->exit_domain_map);
+       spin_unlock(&dlm->spinlock);
+
+       dlm_put(dlm);
+
+       return 0;
+}
+
 static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
 {
        /* Yikes, a double spinlock! I need domain_lock for the dlm
@@ -542,6 +570,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
 
        spin_lock(&dlm->spinlock);
        clear_bit(node, dlm->domain_map);
+       clear_bit(node, dlm->exit_domain_map);
        __dlm_print_nodes(dlm);
 
        /* notify anything attached to the heartbeat events */
@@ -554,29 +583,56 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
        return 0;
 }
 
-static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
+static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, u32 msg_type,
                                    unsigned int node)
 {
        int status;
        struct dlm_exit_domain leave_msg;
 
-       mlog(0, "Asking node %u if we can leave the domain %s me = %u\n",
-                 node, dlm->name, dlm->node_num);
+       mlog(0, "%s: Sending domain exit message %u to node %u\n", dlm->name,
+            msg_type, node);
 
        memset(&leave_msg, 0, sizeof(leave_msg));
        leave_msg.node_idx = dlm->node_num;
 
-       status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
-                                   &leave_msg, sizeof(leave_msg), node,
-                                   NULL);
+       status = o2net_send_message(msg_type, dlm->key, &leave_msg,
+                                   sizeof(leave_msg), node, NULL);
        if (status < 0)
-               mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
-                    "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node);
-       mlog(0, "status return %d from o2net_send_message\n", status);
+               mlog(ML_ERROR, "Error %d sending domain exit message %u "
+                    "to node %u on domain %s\n", status, msg_type, node,
+                    dlm->name);
 
        return status;
 }
 
+static void dlm_begin_exit_domain(struct dlm_ctxt *dlm)
+{
+       int node = -1;
+
+       /* Support for begin exit domain was added in 1.2 */
+       if (dlm->dlm_locking_proto.pv_major == 1 &&
+           dlm->dlm_locking_proto.pv_minor < 2)
+               return;
+
+       /*
+        * Unlike DLM_EXIT_DOMAIN_MSG, DLM_BEGIN_EXIT_DOMAIN_MSG is purely
+        * informational. Meaning if a node does not receive the message,
+        * so be it.
+        */
+       spin_lock(&dlm->spinlock);
+       while (1) {
+               node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1);
+               if (node >= O2NM_MAX_NODES)
+                       break;
+               if (node == dlm->node_num)
+                       continue;
+
+               spin_unlock(&dlm->spinlock);
+               dlm_send_one_domain_exit(dlm, DLM_BEGIN_EXIT_DOMAIN_MSG, node);
+               spin_lock(&dlm->spinlock);
+       }
+       spin_unlock(&dlm->spinlock);
+}
 
 static void dlm_leave_domain(struct dlm_ctxt *dlm)
 {
@@ -602,7 +658,8 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
 
                clear_node = 1;
 
-               status = dlm_send_one_domain_exit(dlm, node);
+               status = dlm_send_one_domain_exit(dlm, DLM_EXIT_DOMAIN_MSG,
+                                                 node);
                if (status < 0 &&
                    status != -ENOPROTOOPT &&
                    status != -ENOTCONN) {
@@ -677,6 +734,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
 
        if (leave) {
                mlog(0, "shutting down domain %s\n", dlm->name);
+               dlm_begin_exit_domain(dlm);
 
                /* We changed dlm state, notify the thread */
                dlm_kick_thread(dlm, NULL);
@@ -909,6 +967,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
                 * leftover join state. */
                BUG_ON(dlm->joining_node != assert->node_idx);
                set_bit(assert->node_idx, dlm->domain_map);
+               clear_bit(assert->node_idx, dlm->exit_domain_map);
                __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
 
                printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
@@ -1793,6 +1852,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
        if (status)
                goto bail;
 
+       status = o2net_register_handler(DLM_BEGIN_EXIT_DOMAIN_MSG, dlm->key,
+                                       sizeof(struct dlm_exit_domain),
+                                       dlm_begin_exit_domain_handler,
+                                       dlm, NULL, &dlm->dlm_domain_handlers);
+       if (status)
+               goto bail;
+
 bail:
        if (status)
                dlm_unregister_domain_handlers(dlm);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 84d166328cf7448f36cfcb0a54f08eefda50e18d..11eefb8c12e98fb418f41c31a3b0a32201be3ca1 100644 (file)
@@ -2339,65 +2339,55 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
        dlm_lockres_put(res);
 }
 
-/* Checks whether the lockres can be migrated. Returns 0 if yes, < 0
- * if not. If 0, numlocks is set to the number of locks in the lockres.
+/*
+ * A migrateable resource is one that:
+ * 1. is locally mastered, and
+ * 2. has zero local locks, and
+ * 3. has one or more non-local locks, or one or more references.
+ * Returns 1 if yes, 0 if not.
  */
 static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
-                                     struct dlm_lock_resource *res,
-                                     int *numlocks,
-                                     int *hasrefs)
+                                     struct dlm_lock_resource *res)
 {
-       int ret;
-       int i;
-       int count = 0;
+       enum dlm_lockres_list idx;
+       int nonlocal = 0, node_ref;
        struct list_head *queue;
        struct dlm_lock *lock;
+       u64 cookie;
 
        assert_spin_locked(&res->spinlock);
 
-       *numlocks = 0;
-       *hasrefs = 0;
-
-       ret = -EINVAL;
-       if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
-               mlog(0, "cannot migrate lockres with unknown owner!\n");
-               goto leave;
-       }
-
-       if (res->owner != dlm->node_num) {
-               mlog(0, "cannot migrate lockres this node doesn't own!\n");
-               goto leave;
-       }
+       if (res->owner != dlm->node_num)
+               return 0;
 
-       ret = 0;
-       queue = &res->granted;
-       for (i = 0; i < 3; i++) {
+       for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
+               queue = dlm_list_idx_to_ptr(res, idx);
                list_for_each_entry(lock, queue, list) {
-                       ++count;
-                       if (lock->ml.node == dlm->node_num) {
-                               mlog(0, "found a lock owned by this node still "
-                                    "on the %s queue!  will not migrate this "
-                                    "lockres\n", (i == 0 ? "granted" :
-                                                  (i == 1 ? "converting" :
-                                                   "blocked")));
-                               ret = -ENOTEMPTY;
-                               goto leave;
+                       if (lock->ml.node != dlm->node_num) {
+                               nonlocal++;
+                               continue;
                        }
+                       cookie = be64_to_cpu(lock->ml.cookie);
+                       mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
+                            "%s list\n", dlm->name, res->lockname.len,
+                            res->lockname.name,
+                            dlm_get_lock_cookie_node(cookie),
+                            dlm_get_lock_cookie_seq(cookie),
+                            dlm_list_in_text(idx));
+                       return 0;
                }
-               queue++;
        }
 
-       *numlocks = count;
-
-       count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
-       if (count < O2NM_MAX_NODES)
-               *hasrefs = 1;
+       if (!nonlocal) {
+               node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+               if (node_ref >= O2NM_MAX_NODES)
+                       return 0;
+       }
 
-       mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name,
-            res->lockname.len, res->lockname.name, *numlocks, *hasrefs);
+       mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
+            res->lockname.name);
 
-leave:
-       return ret;
+       return 1;
 }
 
 /*
@@ -2406,8 +2396,7 @@ leave:
 
 
 static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
-                              struct dlm_lock_resource *res,
-                              u8 target)
+                              struct dlm_lock_resource *res, u8 target)
 {
        struct dlm_master_list_entry *mle = NULL;
        struct dlm_master_list_entry *oldmle = NULL;
@@ -2416,37 +2405,20 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
        const char *name;
        unsigned int namelen;
        int mle_added = 0;
-       int numlocks, hasrefs;
        int wake = 0;
 
        if (!dlm_grab(dlm))
                return -EINVAL;
 
+       BUG_ON(target == O2NM_MAX_NODES);
+
        name = res->lockname.name;
        namelen = res->lockname.len;
 
-       mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target);
-
-       /*
-        * ensure this lockres is a proper candidate for migration
-        */
-       spin_lock(&res->spinlock);
-       ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
-       if (ret < 0) {
-               spin_unlock(&res->spinlock);
-               goto leave;
-       }
-       spin_unlock(&res->spinlock);
-
-       /* no work to do */
-       if (numlocks == 0 && !hasrefs)
-               goto leave;
-
-       /*
-        * preallocate up front
-        * if this fails, abort
-        */
+       mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name,
+            target);
 
+       /* preallocate up front. if this fails, abort */
        ret = -ENOMEM;
        mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
        if (!mres) {
@@ -2461,36 +2433,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
        }
        ret = 0;
 
-       /*
-        * find a node to migrate the lockres to
-        */
-
-       spin_lock(&dlm->spinlock);
-       /* pick a new node */
-       if (!test_bit(target, dlm->domain_map) ||
-           target >= O2NM_MAX_NODES) {
-               target = dlm_pick_migration_target(dlm, res);
-       }
-       mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name,
-            namelen, name, target);
-
-       if (target >= O2NM_MAX_NODES ||
-           !test_bit(target, dlm->domain_map)) {
-               /* target chosen is not alive */
-               ret = -EINVAL;
-       }
-
-       if (ret) {
-               spin_unlock(&dlm->spinlock);
-               goto fail;
-       }
-
-       mlog(0, "continuing with target = %u\n", target);
-
        /*
         * clear any existing master requests and
         * add the migration mle to the list
         */
+       spin_lock(&dlm->spinlock);
        spin_lock(&dlm->master_lock);
        ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
                                    namelen, target, dlm->node_num);
@@ -2531,6 +2478,7 @@ fail:
                        dlm_put_mle(mle);
                } else if (mle) {
                        kmem_cache_free(dlm_mle_cache, mle);
+                       mle = NULL;
                }
                goto leave;
        }
@@ -2652,69 +2600,52 @@ leave:
        if (wake)
                wake_up(&res->wq);
 
-       /* TODO: cleanup */
        if (mres)
                free_page((unsigned long)mres);
 
        dlm_put(dlm);
 
-       mlog(0, "returning %d\n", ret);
+       mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen,
+            name, target, ret);
        return ret;
 }
 
 #define DLM_MIGRATION_RETRY_MS  100
 
-/* Should be called only after beginning the domain leave process.
+/*
+ * Should be called only after beginning the domain leave process.
  * There should not be any remaining locks on nonlocal lock resources,
  * and there should be no local locks left on locally mastered resources.
  *
  * Called with the dlm spinlock held, may drop it to do migration, but
  * will re-acquire before exit.
  *
- * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */
+ * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
+ */
 int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 {
        int ret;
        int lock_dropped = 0;
-       int numlocks, hasrefs;
+       u8 target = O2NM_MAX_NODES;
+
+       assert_spin_locked(&dlm->spinlock);
 
        spin_lock(&res->spinlock);
-       if (res->owner != dlm->node_num) {
-               if (!__dlm_lockres_unused(res)) {
-                       mlog(ML_ERROR, "%s:%.*s: this node is not master, "
-                            "trying to free this but locks remain\n",
-                            dlm->name, res->lockname.len, res->lockname.name);
-               }
-               spin_unlock(&res->spinlock);
-               goto leave;
-       }
+       if (dlm_is_lockres_migrateable(dlm, res))
+               target = dlm_pick_migration_target(dlm, res);
+       spin_unlock(&res->spinlock);
 
-       /* No need to migrate a lockres having no locks */
-       ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs);
-       if (ret >= 0 && numlocks == 0 && !hasrefs) {
-               spin_unlock(&res->spinlock);
+       if (target == O2NM_MAX_NODES)
                goto leave;
-       }
-       spin_unlock(&res->spinlock);
 
        /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
        spin_unlock(&dlm->spinlock);
        lock_dropped = 1;
-       while (1) {
-               ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES);
-               if (ret >= 0)
-                       break;
-               if (ret == -ENOTEMPTY) {
-                       mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
-                               res->lockname.len, res->lockname.name);
-                       BUG();
-               }
-
-               mlog(0, "lockres %.*s: migrate failed, "
-                    "retrying\n", res->lockname.len,
-                    res->lockname.name);
-               msleep(DLM_MIGRATION_RETRY_MS);
-       }
+       ret = dlm_migrate_lockres(dlm, res, target);
+       if (ret)
+               mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n",
+                    dlm->name, res->lockname.len, res->lockname.name,
+                    target, ret);
        spin_lock(&dlm->spinlock);
 leave:
        return lock_dropped;
@@ -2898,61 +2829,55 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
        }
 }
 
-/* for now this is not too intelligent.  we will
- * need stats to make this do the right thing.
- * this just finds the first lock on one of the
- * queues and uses that node as the target. */
+/*
+ * Pick a node to migrate the lock resource to. This function selects a
+ * potential target based first on the locks and then on refmap. It skips
+ * nodes that are in the process of exiting the domain.
+ */
 static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
                                    struct dlm_lock_resource *res)
 {
-       int i;
+       enum dlm_lockres_list idx;
        struct list_head *queue = &res->granted;
        struct dlm_lock *lock;
-       int nodenum;
+       int noderef;
+       u8 nodenum = O2NM_MAX_NODES;
 
        assert_spin_locked(&dlm->spinlock);
+       assert_spin_locked(&res->spinlock);
 
-       spin_lock(&res->spinlock);
-       for (i=0; i<3; i++) {
+       /* Go through all the locks */
+       for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
+               queue = dlm_list_idx_to_ptr(res, idx);
                list_for_each_entry(lock, queue, list) {
-                       /* up to the caller to make sure this node
-                        * is alive */
-                       if (lock->ml.node != dlm->node_num) {
-                               spin_unlock(&res->spinlock);
-                               return lock->ml.node;
-                       }
+                       if (lock->ml.node == dlm->node_num)
+                               continue;
+                       if (test_bit(lock->ml.node, dlm->exit_domain_map))
+                               continue;
+                       nodenum = lock->ml.node;
+                       goto bail;
                }
-               queue++;
-       }
-
-       nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
-       if (nodenum < O2NM_MAX_NODES) {
-               spin_unlock(&res->spinlock);
-               return nodenum;
        }
-       spin_unlock(&res->spinlock);
-       mlog(0, "have not found a suitable target yet! checking domain map\n");
 
-       /* ok now we're getting desperate.  pick anyone alive. */
-       nodenum = -1;
+       /* Go thru the refmap */
+       noderef = -1;
        while (1) {
-               nodenum = find_next_bit(dlm->domain_map,
-                                       O2NM_MAX_NODES, nodenum+1);
-               mlog(0, "found %d in domain map\n", nodenum);
-               if (nodenum >= O2NM_MAX_NODES)
+               noderef = find_next_bit(res->refmap, O2NM_MAX_NODES,
+                                       noderef + 1);
+               if (noderef >= O2NM_MAX_NODES)
                        break;
-               if (nodenum != dlm->node_num) {
-                       mlog(0, "picking %d\n", nodenum);
-                       return nodenum;
-               }
+               if (noderef == dlm->node_num)
+                       continue;
+               if (test_bit(noderef, dlm->exit_domain_map))
+                       continue;
+               nodenum = noderef;
+               goto bail;
        }
 
-       mlog(0, "giving up.  no master to migrate to\n");
-       return DLM_LOCK_RES_OWNER_UNKNOWN;
+bail:
+       return nodenum;
 }
 
-
-
 /* this is called by the new master once all lockres
  * data has been received */
 static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f1beb6fc254d1720ae10b10a0a7ea904703c7d0d..7efab6d28a21b4ee6a8376559d70f739a4e1da90 100644 (file)
@@ -2393,6 +2393,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
 
        mlog(0, "node %u being removed from domain map!\n", idx);
        clear_bit(idx, dlm->domain_map);
+       clear_bit(idx, dlm->exit_domain_map);
        /* wake up migration waiters if a node goes down.
         * perhaps later we can genericize this for other waiters. */
        wake_up(&dlm->migration_wq);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 8c5c0eddc365a060d278d49dac042226fafa0515..b420767970492082b357cf67f8735894dd7e103f 100644 (file)
@@ -88,7 +88,7 @@ struct workqueue_struct *user_dlm_worker;
  *               signifies a bast fired on the lock.
  */
 #define DLMFS_CAPABILITIES "bast stackglue"
-extern int param_set_dlmfs_capabilities(const char *val,
+static int param_set_dlmfs_capabilities(const char *val,
                                        struct kernel_param *kp)
 {
        printk(KERN_ERR "%s: readonly parameter\n", kp->name);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 89659d6dc2067276f2cebcea63a9bcc34a0a2486..b1e35a392ca5279d7aa7d645aa75fd8c101c6d05 100644 (file)
@@ -2670,6 +2670,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
        .flock          = ocfs2_flock,
        .splice_read    = ocfs2_file_splice_read,
        .splice_write   = ocfs2_file_splice_write,
+       .fallocate      = ocfs2_fallocate,
 };
 
 const struct file_operations ocfs2_dops_no_plocks = {
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 8f13c5989eaeeecd543be983cec37176c1c54ece..bc91072b72196fd335c4b7cbc02ba08cb67254e6 100644 (file)
 #include "ioctl.h"
 #include "resize.h"
 #include "refcounttree.h"
+#include "sysfile.h"
+#include "dir.h"
+#include "buffer_head_io.h"
+#include "suballoc.h"
+#include "move_extents.h"
 
 #include <linux/ext2_fs.h>
 
  * be -EFAULT.  The error will be returned from the ioctl(2) call.  It's
  * just a best-effort to tell userspace that this request caused the error.
  */
-static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
+static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
                                        struct ocfs2_info_request __user *req)
 {
        kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
        (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags));
 }
 
-#define o2info_set_request_error(a, b) \
-               __o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
-
-static inline void __o2info_set_request_filled(struct ocfs2_info_request *req)
+static inline void o2info_set_request_filled(struct ocfs2_info_request *req)
 {
        req->ir_flags |= OCFS2_INFO_FL_FILLED;
 }
 
-#define o2info_set_request_filled(a) \
-               __o2info_set_request_filled((struct ocfs2_info_request *)&(a))
-
-static inline void __o2info_clear_request_filled(struct ocfs2_info_request *req)
+static inline void o2info_clear_request_filled(struct ocfs2_info_request *req)
 {
        req->ir_flags &= ~OCFS2_INFO_FL_FILLED;
 }
 
-#define o2info_clear_request_filled(a) \
-               __o2info_clear_request_filled((struct ocfs2_info_request *)&(a))
+static inline int o2info_coherent(struct ocfs2_info_request *req)
+{
+       return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT));
+}
 
 static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
 {
@@ -153,7 +154,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
 
        oib.ib_blocksize = inode->i_sb->s_blocksize;
 
-       o2info_set_request_filled(oib);
+       o2info_set_request_filled(&oib.ib_req);
 
        if (o2info_to_user(oib, req))
                goto bail;
@@ -161,7 +162,7 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
        status = 0;
 bail:
        if (status)
-               o2info_set_request_error(oib, req);
+               o2info_set_request_error(&oib.ib_req, req);
 
        return status;
 }
@@ -178,7 +179,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
 
        oic.ic_clustersize = osb->s_clustersize;
 
-       o2info_set_request_filled(oic);
+       o2info_set_request_filled(&oic.ic_req);
 
        if (o2info_to_user(oic, req))
                goto bail;
@@ -186,7 +187,7 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
        status = 0;
 bail:
        if (status)
-               o2info_set_request_error(oic, req);
+               o2info_set_request_error(&oic.ic_req, req);
 
        return status;
 }
@@ -203,7 +204,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
 
        oim.im_max_slots = osb->max_slots;
 
-       o2info_set_request_filled(oim);
+       o2info_set_request_filled(&oim.im_req);
 
        if (o2info_to_user(oim, req))
                goto bail;
@@ -211,7 +212,7 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
        status = 0;
 bail:
        if (status)
-               o2info_set_request_error(oim, req);
+               o2info_set_request_error(&oim.im_req, req);
 
        return status;
 }
@@ -228,7 +229,7 @@ int ocfs2_info_handle_label(struct inode *inode,
 
        memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
 
-       o2info_set_request_filled(oil);
+       o2info_set_request_filled(&oil.il_req);
 
        if (o2info_to_user(oil, req))
                goto bail;
@@ -236,7 +237,7 @@ int ocfs2_info_handle_label(struct inode *inode,
        status = 0;
 bail:
        if (status)
-               o2info_set_request_error(oil, req);
+               o2info_set_request_error(&oil.il_req, req);
 
        return status;
 }
@@ -253,7 +254,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
 
        memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
 
-       o2info_set_request_filled(oiu);
+       o2info_set_request_filled(&oiu.iu_req);
 
        if (o2info_to_user(oiu, req))
                goto bail;
@@ -261,7 +262,7 @@ int ocfs2_info_handle_uuid(struct inode *inode,
        status = 0;
 bail:
        if (status)
-               o2info_set_request_error(oiu, req);
+               o2info_set_request_error(&oiu.iu_req, req);
 
        return status;
 }
@@ -280,7 +281,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
        oif.if_incompat_features = osb->s_feature_incompat;
        oif.if_ro_compat_features = osb->s_feature_ro_compat;
 
-       o2info_set_request_filled(oif);
+       o2info_set_request_filled(&oif.if_req);
 
        if (o2info_to_user(oif, req))
                goto bail;
@@ -288,7 +289,7 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
        status = 0;
 bail:
        if (status)
-               o2info_set_request_error(oif, req);
+               o2info_set_request_error(&oif.if_req, req);
 
        return status;
 }
@@ -305,7 +306,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
 
        oij.ij_journal_size = osb->journal->j_inode->i_size;
 
-       o2info_set_request_filled(oij);
+       o2info_set_request_filled(&oij.ij_req);
 
        if (o2info_to_user(oij, req))
                goto bail;
@@ -313,7 +314,408 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
        status = 0;
 bail:
        if (status)
-               o2info_set_request_error(oij, req);
+               o2info_set_request_error(&oij.ij_req, req);
+
+       return status;
+}
+
+int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
+                               struct inode *inode_alloc, u64 blkno,
+                               struct ocfs2_info_freeinode *fi, u32 slot)
+{
+       int status = 0, unlock = 0;
+
+       struct buffer_head *bh = NULL;
+       struct ocfs2_dinode *dinode_alloc = NULL;
+
+       if (inode_alloc)
+               mutex_lock(&inode_alloc->i_mutex);
+
+       if (o2info_coherent(&fi->ifi_req)) {
+               status = ocfs2_inode_lock(inode_alloc, &bh, 0);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+               unlock = 1;
+       } else {
+               status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+       }
+
+       dinode_alloc = (struct ocfs2_dinode *)bh->b_data;
+
+       fi->ifi_stat[slot].lfi_total =
+               le32_to_cpu(dinode_alloc->id1.bitmap1.i_total);
+       fi->ifi_stat[slot].lfi_free =
+               le32_to_cpu(dinode_alloc->id1.bitmap1.i_total) -
+               le32_to_cpu(dinode_alloc->id1.bitmap1.i_used);
+
+bail:
+       if (unlock)
+               ocfs2_inode_unlock(inode_alloc, 0);
+
+       if (inode_alloc)
+               mutex_unlock(&inode_alloc->i_mutex);
+
+       brelse(bh);
+
+       return status;
+}
+
+int ocfs2_info_handle_freeinode(struct inode *inode,
+                               struct ocfs2_info_request __user *req)
+{
+       u32 i;
+       u64 blkno = -1;
+       char namebuf[40];
+       int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE;
+       struct ocfs2_info_freeinode *oifi = NULL;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct inode *inode_alloc = NULL;
+
+       oifi = kzalloc(sizeof(struct ocfs2_info_freeinode), GFP_KERNEL);
+       if (!oifi) {
+               status = -ENOMEM;
+               mlog_errno(status);
+               goto bail;
+       }
+
+       if (o2info_from_user(*oifi, req))
+               goto bail;
+
+       oifi->ifi_slotnum = osb->max_slots;
+
+       for (i = 0; i < oifi->ifi_slotnum; i++) {
+               if (o2info_coherent(&oifi->ifi_req)) {
+                       inode_alloc = ocfs2_get_system_file_inode(osb, type, i);
+                       if (!inode_alloc) {
+                               mlog(ML_ERROR, "unable to get alloc inode in "
+                                   "slot %u\n", i);
+                               status = -EIO;
+                               goto bail;
+                       }
+               } else {
+                       ocfs2_sprintf_system_inode_name(namebuf,
+                                                       sizeof(namebuf),
+                                                       type, i);
+                       status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
+                                                           namebuf,
+                                                           strlen(namebuf),
+                                                           &blkno);
+                       if (status < 0) {
+                               status = -ENOENT;
+                               goto bail;
+                       }
+               }
+
+               status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
+               if (status < 0)
+                       goto bail;
+
+               iput(inode_alloc);
+               inode_alloc = NULL;
+       }
+
+       o2info_set_request_filled(&oifi->ifi_req);
+
+       if (o2info_to_user(*oifi, req))
+               goto bail;
+
+       status = 0;
+bail:
+       if (status)
+               o2info_set_request_error(&oifi->ifi_req, req);
+
+       kfree(oifi);
+
+       return status;
+}
+
+static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist,
+                                  unsigned int chunksize)
+{
+       int index;
+
+       index = __ilog2_u32(chunksize);
+       if (index >= OCFS2_INFO_MAX_HIST)
+               index = OCFS2_INFO_MAX_HIST - 1;
+
+       hist->fc_chunks[index]++;
+       hist->fc_clusters[index] += chunksize;
+}
+
+static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats,
+                              unsigned int chunksize)
+{
+       if (chunksize > stats->ffs_max)
+               stats->ffs_max = chunksize;
+
+       if (chunksize < stats->ffs_min)
+               stats->ffs_min = chunksize;
+
+       stats->ffs_avg += chunksize;
+       stats->ffs_free_chunks_real++;
+}
+
+void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg,
+                          unsigned int chunksize)
+{
+       o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize);
+       o2ffg_update_stats(&(ffg->iff_ffs), chunksize);
+}
+
+int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb,
+                                  struct inode *gb_inode,
+                                  struct ocfs2_dinode *gb_dinode,
+                                  struct ocfs2_chain_rec *rec,
+                                  struct ocfs2_info_freefrag *ffg,
+                                  u32 chunks_in_group)
+{
+       int status = 0, used;
+       u64 blkno;
+
+       struct buffer_head *bh = NULL;
+       struct ocfs2_group_desc *bg = NULL;
+
+       unsigned int max_bits, num_clusters;
+       unsigned int offset = 0, cluster, chunk;
+       unsigned int chunk_free, last_chunksize = 0;
+
+       if (!le32_to_cpu(rec->c_free))
+               goto bail;
+
+       do {
+               if (!bg)
+                       blkno = le64_to_cpu(rec->c_blkno);
+               else
+                       blkno = le64_to_cpu(bg->bg_next_group);
+
+               if (bh) {
+                       brelse(bh);
+                       bh = NULL;
+               }
+
+               if (o2info_coherent(&ffg->iff_req))
+                       status = ocfs2_read_group_descriptor(gb_inode,
+                                                            gb_dinode,
+                                                            blkno, &bh);
+               else
+                       status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+
+               if (status < 0) {
+                       mlog(ML_ERROR, "Can't read the group descriptor # "
+                            "%llu from device.", (unsigned long long)blkno);
+                       status = -EIO;
+                       goto bail;
+               }
+
+               bg = (struct ocfs2_group_desc *)bh->b_data;
+
+               if (!le16_to_cpu(bg->bg_free_bits_count))
+                       continue;
+
+               max_bits = le16_to_cpu(bg->bg_bits);
+               offset = 0;
+
+               for (chunk = 0; chunk < chunks_in_group; chunk++) {
+                       /*
+                        * the last chunk may not be a full one.
+                        */
+                       if ((offset + ffg->iff_chunksize) > max_bits)
+                               num_clusters = max_bits - offset;
+                       else
+                               num_clusters = ffg->iff_chunksize;
+
+                       chunk_free = 0;
+                       for (cluster = 0; cluster < num_clusters; cluster++) {
+                               used = ocfs2_test_bit(offset,
+                                               (unsigned long *)bg->bg_bitmap);
+                               /*
+                                * - chunk_free counts free clusters in #N chunk.
+                                * - last_chunksize records the size (in clusters)
+                                *   of the last real free chunk being counted.
+                                */
+                               if (!used) {
+                                       last_chunksize++;
+                                       chunk_free++;
+                               }
+
+                               if (used && last_chunksize) {
+                                       ocfs2_info_update_ffg(ffg,
+                                                             last_chunksize);
+                                       last_chunksize = 0;
+                               }
+
+                               offset++;
+                       }
+
+                       if (chunk_free == ffg->iff_chunksize)
+                               ffg->iff_ffs.ffs_free_chunks++;
+               }
+
+               /*
+                * need to update the info for last free chunk.
+                */
+               if (last_chunksize)
+                       ocfs2_info_update_ffg(ffg, last_chunksize);
+
+       } while (le64_to_cpu(bg->bg_next_group));
+
+bail:
+       brelse(bh);
+
+       return status;
+}
+
+int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
+                                   struct inode *gb_inode, u64 blkno,
+                                   struct ocfs2_info_freefrag *ffg)
+{
+       u32 chunks_in_group;
+       int status = 0, unlock = 0, i;
+
+       struct buffer_head *bh = NULL;
+       struct ocfs2_chain_list *cl = NULL;
+       struct ocfs2_chain_rec *rec = NULL;
+       struct ocfs2_dinode *gb_dinode = NULL;
+
+       if (gb_inode)
+               mutex_lock(&gb_inode->i_mutex);
+
+       if (o2info_coherent(&ffg->iff_req)) {
+               status = ocfs2_inode_lock(gb_inode, &bh, 0);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+               unlock = 1;
+       } else {
+               status = ocfs2_read_blocks_sync(osb, blkno, 1, &bh);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+       }
+
+       gb_dinode = (struct ocfs2_dinode *)bh->b_data;
+       cl = &(gb_dinode->id2.i_chain);
+
+       /*
+        * The chunk size (in clusters) from userspace must not
+        * exceed the number of clusters in a group.
+        */
+       if (ffg->iff_chunksize > le16_to_cpu(cl->cl_cpg)) {
+               status = -EINVAL;
+               goto bail;
+       }
+
+       memset(&ffg->iff_ffs, 0, sizeof(struct ocfs2_info_freefrag_stats));
+
+       ffg->iff_ffs.ffs_min = ~0U;
+       ffg->iff_ffs.ffs_clusters =
+                       le32_to_cpu(gb_dinode->id1.bitmap1.i_total);
+       ffg->iff_ffs.ffs_free_clusters = ffg->iff_ffs.ffs_clusters -
+                       le32_to_cpu(gb_dinode->id1.bitmap1.i_used);
+
+       chunks_in_group = le16_to_cpu(cl->cl_cpg) / ffg->iff_chunksize + 1;
+
+       for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
+               rec = &(cl->cl_recs[i]);
+               status = ocfs2_info_freefrag_scan_chain(osb, gb_inode,
+                                                       gb_dinode,
+                                                       rec, ffg,
+                                                       chunks_in_group);
+               if (status)
+                       goto bail;
+       }
+
+       if (ffg->iff_ffs.ffs_free_chunks_real)
+               ffg->iff_ffs.ffs_avg = (ffg->iff_ffs.ffs_avg /
+                                       ffg->iff_ffs.ffs_free_chunks_real);
+bail:
+       if (unlock)
+               ocfs2_inode_unlock(gb_inode, 0);
+
+       if (gb_inode)
+               mutex_unlock(&gb_inode->i_mutex);
+
+       if (gb_inode)
+               iput(gb_inode);
+
+       brelse(bh);
+
+       return status;
+}
+
+int ocfs2_info_handle_freefrag(struct inode *inode,
+                              struct ocfs2_info_request __user *req)
+{
+       u64 blkno = -1;
+       char namebuf[40];
+       int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE;
+
+       struct ocfs2_info_freefrag *oiff;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct inode *gb_inode = NULL;
+
+       oiff = kzalloc(sizeof(struct ocfs2_info_freefrag), GFP_KERNEL);
+       if (!oiff) {
+               status = -ENOMEM;
+               mlog_errno(status);
+               goto bail;
+       }
+
+       if (o2info_from_user(*oiff, req))
+               goto bail;
+       /*
+        * The chunk size from userspace must be a nonzero power of 2.
+        */
+       if ((oiff->iff_chunksize & (oiff->iff_chunksize - 1)) ||
+           (!oiff->iff_chunksize)) {
+               status = -EINVAL;
+               goto bail;
+       }
+
+       if (o2info_coherent(&oiff->iff_req)) {
+               gb_inode = ocfs2_get_system_file_inode(osb, type,
+                                                      OCFS2_INVALID_SLOT);
+               if (!gb_inode) {
+                       mlog(ML_ERROR, "unable to get global_bitmap inode\n");
+                       status = -EIO;
+                       goto bail;
+               }
+       } else {
+               ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type,
+                                               OCFS2_INVALID_SLOT);
+               status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
+                                                   namebuf,
+                                                   strlen(namebuf),
+                                                   &blkno);
+               if (status < 0) {
+                       status = -ENOENT;
+                       goto bail;
+               }
+       }
+
+       status = ocfs2_info_freefrag_scan_bitmap(osb, gb_inode, blkno, oiff);
+       if (status < 0)
+               goto bail;
+
+       o2info_set_request_filled(&oiff->iff_req);
+
+       if (o2info_to_user(*oiff, req)) {
+               status = -EFAULT;
+               goto bail;
+       }
+
+       status = 0;
+bail:
+       if (status)
+               o2info_set_request_error(&oiff->iff_req, req);
+
+       kfree(oiff);
 
        return status;
 }
@@ -327,7 +729,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
        if (o2info_from_user(oir, req))
                goto bail;
 
-       o2info_clear_request_filled(oir);
+       o2info_clear_request_filled(&oir);
 
        if (o2info_to_user(oir, req))
                goto bail;
@@ -335,7 +737,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
        status = 0;
 bail:
        if (status)
-               o2info_set_request_error(oir, req);
+               o2info_set_request_error(&oir, req);
 
        return status;
 }
@@ -389,6 +791,14 @@ int ocfs2_info_handle_request(struct inode *inode,
                if (oir.ir_size == sizeof(struct ocfs2_info_journal_size))
                        status = ocfs2_info_handle_journal_size(inode, req);
                break;
+       case OCFS2_INFO_FREEINODE:
+               if (oir.ir_size == sizeof(struct ocfs2_info_freeinode))
+                       status = ocfs2_info_handle_freeinode(inode, req);
+               break;
+       case OCFS2_INFO_FREEFRAG:
+               if (oir.ir_size == sizeof(struct ocfs2_info_freefrag))
+                       status = ocfs2_info_handle_freefrag(inode, req);
+               break;
        default:
                status = ocfs2_info_handle_unknown(inode, req);
                break;
@@ -542,6 +952,31 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                        return -EFAULT;
 
                return ocfs2_info_handle(inode, &info, 0);
+       case FITRIM:
+       {
+               struct super_block *sb = inode->i_sb;
+               struct fstrim_range range;
+               int ret = 0;
+
+               if (!capable(CAP_SYS_ADMIN))
+                       return -EPERM;
+
+               if (copy_from_user(&range, (struct fstrim_range *)arg,
+                   sizeof(range)))
+                       return -EFAULT;
+
+               ret = ocfs2_trim_fs(sb, &range);
+               if (ret < 0)
+                       return ret;
+
+               if (copy_to_user((struct fstrim_range *)arg, &range,
+                   sizeof(range)))
+                       return -EFAULT;
+
+               return 0;
+       }
+       case OCFS2_IOC_MOVE_EXT:
+               return ocfs2_ioctl_move_extents(filp, (void __user *)arg);
        default:
                return -ENOTTY;
        }
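
The FITRIM case simply bridges to the generic discard interface: userspace hands in a byte range plus a minimum extent length, and the kernel reports back how many bytes it discarded. A minimal userspace sketch, not part of this patch (FITRIM and struct fstrim_range come from linux/fs.h):

    /* Sketch: trim all free space on a mounted ocfs2 volume. */
    #include <fcntl.h>
    #include <linux/fs.h>          /* FITRIM, struct fstrim_range */
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    int trim_fs(const char *mountpoint)
    {
            struct fstrim_range range = {
                    .start = 0,
                    .len = (__u64)-1,       /* whole filesystem */
                    .minlen = 0,            /* any discardable size */
            };
            int fd = open(mountpoint, O_RDONLY);

            if (fd < 0)
                    return -1;
            if (ioctl(fd, FITRIM, &range) < 0) {    /* needs CAP_SYS_ADMIN */
                    close(fd);
                    return -1;
            }
            /* on return, range.len holds the number of bytes trimmed */
            printf("trimmed %llu bytes\n", (unsigned long long)range.len);
            close(fd);
            return 0;
    }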
@@ -569,6 +1004,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        case OCFS2_IOC_GROUP_EXTEND:
        case OCFS2_IOC_GROUP_ADD:
        case OCFS2_IOC_GROUP_ADD64:
+       case FITRIM:
                break;
        case OCFS2_IOC_REFLINK:
                if (copy_from_user(&args, (struct reflink_arguments *)arg,
@@ -584,6 +1020,8 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
                        return -EFAULT;
 
                return ocfs2_info_handle(inode, &info, 1);
+       case OCFS2_IOC_MOVE_EXT:
+               break;
        default:
                return -ENOIOCTLCMD;
        }
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
new file mode 100644 (file)
index 0000000..4c54884
--- /dev/null
@@ -0,0 +1,1153 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * move_extents.c
+ *
+ * Copyright (C) 2011 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <linux/swap.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "ocfs2_ioctl.h"
+
+#include "alloc.h"
+#include "aops.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "inode.h"
+#include "journal.h"
+#include "suballoc.h"
+#include "uptodate.h"
+#include "super.h"
+#include "dir.h"
+#include "buffer_head_io.h"
+#include "sysfile.h"
+#include "suballoc.h"
+#include "refcounttree.h"
+#include "move_extents.h"
+
+struct ocfs2_move_extents_context {
+       struct inode *inode;
+       struct file *file;
+       int auto_defrag;
+       int partial;
+       int credits;
+       u32 new_phys_cpos;
+       u32 clusters_moved;
+       u64 refcount_loc;
+       struct ocfs2_move_extents *range;
+       struct ocfs2_extent_tree et;
+       struct ocfs2_alloc_context *meta_ac;
+       struct ocfs2_alloc_context *data_ac;
+       struct ocfs2_cached_dealloc_ctxt dealloc;
+};
+
+static int __ocfs2_move_extent(handle_t *handle,
+                              struct ocfs2_move_extents_context *context,
+                              u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
+                              int ext_flags)
+{
+       int ret = 0, index;
+       struct inode *inode = context->inode;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct ocfs2_extent_rec *rec, replace_rec;
+       struct ocfs2_path *path = NULL;
+       struct ocfs2_extent_list *el;
+       u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
+       u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
+
+       ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos,
+                                              p_cpos, new_p_cpos, len);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       memset(&replace_rec, 0, sizeof(replace_rec));
+       replace_rec.e_cpos = cpu_to_le32(cpos);
+       replace_rec.e_leaf_clusters = cpu_to_le16(len);
+       replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
+                                                                  new_p_cpos));
+
+       path = ocfs2_new_path_from_et(&context->et);
+       if (!path) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       el = path_leaf_el(path);
+
+       index = ocfs2_search_extent_list(el, cpos);
+       if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+               ocfs2_error(inode->i_sb,
+                           "Inode %llu has an extent at cpos %u which can no "
+                           "longer be found.\n",
+                           (unsigned long long)ino, cpos);
+               ret = -EROFS;
+               goto out;
+       }
+
+       rec = &el->l_recs[index];
+
+       BUG_ON(ext_flags != rec->e_flags);
+       /*
+        * After moving/defragging to the new location, the extent is
+        * no longer refcounted.
+        */
+       replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
+
+       ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+                                     context->et.et_root_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       ret = ocfs2_split_extent(handle, &context->et, path, index,
+                                &replace_rec, context->meta_ac,
+                                &context->dealloc);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       ocfs2_journal_dirty(handle, context->et.et_root_bh);
+
+       context->new_phys_cpos = new_p_cpos;
+
+       /*
+        * Release the old clusters: refcounted extents go through the
+        * refcount tree, everything else through the truncate log.
+        */
+       if (old_blkno) {
+               if (ext_flags & OCFS2_EXT_REFCOUNTED)
+                       ret = ocfs2_decrease_refcount(inode, handle,
+                                       ocfs2_blocks_to_clusters(osb->sb,
+                                                                old_blkno),
+                                       len, context->meta_ac,
+                                       &context->dealloc, 1);
+               else
+                       ret = ocfs2_truncate_log_append(osb, handle,
+                                                       old_blkno, len);
+       }
+
+out:
+       return ret;
+}
+
+/*
+ * Lock the allocators and reserve the appropriate number of bits for
+ * metadata blocks and data clusters.
+ *
+ * In some cases we don't need to reserve clusters; callers indicate that
+ * by passing a NULL data_ac.
+ */
+static int ocfs2_lock_allocators_move_extents(struct inode *inode,
+                                       struct ocfs2_extent_tree *et,
+                                       u32 clusters_to_move,
+                                       u32 extents_to_split,
+                                       struct ocfs2_alloc_context **meta_ac,
+                                       struct ocfs2_alloc_context **data_ac,
+                                       int extra_blocks,
+                                       int *credits)
+{
+       int ret, num_free_extents;
+       unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+       num_free_extents = ocfs2_num_free_extents(osb, et);
+       if (num_free_extents < 0) {
+               ret = num_free_extents;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       if (!num_free_extents ||
+           (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
+               extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
+
+       ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       if (data_ac) {
+               ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+
+       *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el,
+                                             clusters_to_move + 2);
+
+       mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
+            extra_blocks, clusters_to_move, *credits);
+out:
+       if (ret) {
+               if (*meta_ac) {
+                       ocfs2_free_alloc_context(*meta_ac);
+                       *meta_ac = NULL;
+               }
+       }
+
+       return ret;
+}
+
+/*
+ * Use a single journal handle to guarantee data consistency no matter
+ * where a crash happens.
+ *
+ * XXX: defrag may end up moving only part of the requested extent when
+ * the allocator cannot find enough contiguous clusters.
+ */
+static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
+                              u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
+{
+       int ret, credits = 0, extra_blocks = 0, partial = context->partial;
+       handle_t *handle;
+       struct inode *inode = context->inode;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct inode *tl_inode = osb->osb_tl_inode;
+       struct ocfs2_refcount_tree *ref_tree = NULL;
+       u32 new_phys_cpos, new_len;
+       u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+
+       if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
+
+               BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+                        OCFS2_HAS_REFCOUNT_FL));
+
+               BUG_ON(!context->refcount_loc);
+
+               ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
+                                              &ref_tree, NULL);
+               if (ret) {
+                       mlog_errno(ret);
+                       return ret;
+               }
+
+               ret = ocfs2_prepare_refcount_change_for_del(inode,
+                                                       context->refcount_loc,
+                                                       phys_blkno,
+                                                       *len,
+                                                       &credits,
+                                                       &extra_blocks);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+
+       ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
+                                                &context->meta_ac,
+                                                &context->data_ac,
+                                                extra_blocks, &credits);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       /*
+        * Should we be using the allocation reservation strategy here?
+        *
+        * if (context->data_ac)
+        *      context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
+        */
+
+       mutex_lock(&tl_inode->i_mutex);
+
+       if (ocfs2_truncate_log_needs_flush(osb)) {
+               ret = __ocfs2_flush_truncate_log(osb);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out_unlock_mutex;
+               }
+       }
+
+       handle = ocfs2_start_trans(osb, credits);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               mlog_errno(ret);
+               goto out_unlock_mutex;
+       }
+
+       ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
+                                    &new_phys_cpos, &new_len);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       /*
+        * Allowing partial extent moving is a trade-off: it makes the
+        * defragmentation as a whole less likely to fail, but it may also
+        * leave the fs even more fragmented afterwards.  Let userspace
+        * make the call here.
+        */
+       if (new_len != *len) {
+               mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
+               if (!partial) {
+                       context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
+                       ret = -ENOSPC;
+                       goto out_commit;
+               }
+       }
+
+       mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
+            phys_cpos, new_phys_cpos);
+
+       ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
+                                 new_phys_cpos, ext_flags);
+       if (ret)
+               mlog_errno(ret);
+
+       if (partial && (new_len != *len))
+               *len = new_len;
+
+       /*
+        * Here we should write the new page out first if we are
+        * in write-back mode.
+        */
+       ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
+       if (ret)
+               mlog_errno(ret);
+
+out_commit:
+       ocfs2_commit_trans(osb, handle);
+
+out_unlock_mutex:
+       mutex_unlock(&tl_inode->i_mutex);
+
+       if (context->data_ac) {
+               ocfs2_free_alloc_context(context->data_ac);
+               context->data_ac = NULL;
+       }
+
+       if (context->meta_ac) {
+               ocfs2_free_alloc_context(context->meta_ac);
+               context->meta_ac = NULL;
+       }
+
+out:
+       if (ref_tree)
+               ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
+       return ret;
+}
+
+/*
+ * Find the victim alloc group, i.e. the group that vict_blkno falls in.
+ */
+static int ocfs2_find_victim_alloc_group(struct inode *inode,
+                                        u64 vict_blkno,
+                                        int type, int slot,
+                                        int *vict_bit,
+                                        struct buffer_head **ret_bh)
+{
+       int ret, i, blocks_per_unit = 1;
+       u64 blkno;
+       char namebuf[40];
+
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
+       struct ocfs2_chain_list *cl;
+       struct ocfs2_chain_rec *rec;
+       struct ocfs2_dinode *ac_dinode;
+       struct ocfs2_group_desc *bg;
+
+       ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
+       ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
+                                        strlen(namebuf), &blkno);
+       if (ret) {
+               ret = -ENOENT;
+               goto out;
+       }
+
+       ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
+       cl = &(ac_dinode->id2.i_chain);
+       rec = &(cl->cl_recs[0]);
+
+       if (type == GLOBAL_BITMAP_SYSTEM_INODE)
+               blocks_per_unit <<= (osb->s_clustersize_bits -
+                                               inode->i_sb->s_blocksize_bits);
+       /*
+        * Fail if 'vict_blkno' lies outside the valid range.
+        */
+       if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
+           (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) *
+                               blocks_per_unit))) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
+
+               rec = &(cl->cl_recs[i]);
+
+               bg = NULL;
+
+               do {
+                       if (!bg)
+                               blkno = le64_to_cpu(rec->c_blkno);
+                       else
+                               blkno = le64_to_cpu(bg->bg_next_group);
+
+                       if (gd_bh) {
+                               brelse(gd_bh);
+                               gd_bh = NULL;
+                       }
+
+                       ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
+                       if (ret) {
+                               mlog_errno(ret);
+                               goto out;
+                       }
+
+                       bg = (struct ocfs2_group_desc *)gd_bh->b_data;
+
+                       if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
+                                               le16_to_cpu(bg->bg_bits))) {
+
+                               *ret_bh = gd_bh;
+                               *vict_bit = (vict_blkno - blkno) /
+                                                       blocks_per_unit;
+                               mlog(0, "find the victim group: #%llu, "
+                                    "total_bits: %u, vict_bit: %u\n",
+                                    blkno, le16_to_cpu(bg->bg_bits),
+                                    *vict_bit);
+                               goto out;
+                       }
+
+               } while (le64_to_cpu(bg->bg_next_group));
+       }
+
+       ret = -EINVAL;
+out:
+       brelse(ac_bh);
+
+       /*
+        * The caller is responsible for releasing gd_bh.
+        */
+       return ret;
+}
+
+/*
+ * Helper to validate and adjust the moving goal.
+ */
+static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
+                                              struct ocfs2_move_extents *range)
+{
+       int ret, goal_bit = 0;
+
+       struct buffer_head *gd_bh = NULL;
+       struct ocfs2_group_desc *bg;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       int c_to_b = 1 << (osb->s_clustersize_bits -
+                                       inode->i_sb->s_blocksize_bits);
+
+       /*
+        * Validate that the goal sits within the global_bitmap, and get
+        * the victim group descriptor back.
+        */
+       ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
+                                           GLOBAL_BITMAP_SYSTEM_INODE,
+                                           OCFS2_INVALID_SLOT,
+                                           &goal_bit, &gd_bh);
+       if (ret)
+               goto out;
+
+       bg = (struct ocfs2_group_desc *)gd_bh->b_data;
+
+       /*
+        * Align the goal to a cluster boundary.
+        */
+       if (range->me_goal % c_to_b)
+               range->me_goal = range->me_goal / c_to_b * c_to_b;
+
+       /*
+        * The moving goal is not allowed to start at a group descriptor
+        * block (block #0 of the group); compromise to the next cluster.
+        */
+       if (range->me_goal == le64_to_cpu(bg->bg_blkno))
+               range->me_goal += c_to_b;
+
+       /*
+        * The movement must not cross allocation group boundaries.
+        */
+       if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
+                                                               range->me_len) {
+               ret = -EINVAL;
+               goto out;
+       }
+       /*
+        * More exact validations/adjustments will be performed later,
+        * while moving each extent range.
+        */
+       mlog(0, "extents get ready to be moved to #%llu block\n",
+            range->me_goal);
+
+out:
+       brelse(gd_bh);
+
+       return ret;
+}
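
To make the alignment and collision handling above concrete, a worked example with assumed geometry (not from the patch):

    /*
     * Assumed geometry: 4K blocks and 1M clusters, so
     * c_to_b = 1 << (20 - 12) = 256 blocks per cluster.
     *
     *   me_goal = 1000: 1000 % 256 = 232, so the goal is rounded down
     *                   to 1000 / 256 * 256 = 768, its cluster start.
     *   me_goal = 768 with bg_blkno = 768: the goal collides with the
     *                   group descriptor block, so it is pushed to
     *                   768 + 256 = 1024, the next cluster.
     */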
+
+static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
+                                   int *goal_bit, u32 move_len, u32 max_hop,
+                                   u32 *phys_cpos)
+{
+       int i, used, last_free_bits = 0, base_bit = *goal_bit;
+       struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+       u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+                                                le64_to_cpu(gd->bg_blkno));
+
+       for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {
+
+               used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
+               if (used) {
+                       /*
+                        * Give up: even hopping up to 'max_hop' bits away
+                        * from the goal found no suitable free chunk.
+                        */
+                       if ((i - base_bit) > max_hop) {
+                               *phys_cpos = 0;
+                               break;
+                       }
+
+                       if (last_free_bits)
+                               last_free_bits = 0;
+
+                       continue;
+               } else
+                       last_free_bits++;
+
+               if (last_free_bits == move_len) {
+                       *goal_bit = i;
+                       *phys_cpos = base_cpos + i;
+                       break;
+               }
+       }
+
+       mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
+}
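
The probe above is a bounded run-length scan. A self-contained toy model of the same loop, not from the patch, over a plain byte-per-bit array; like the kernel code, it reports the bit at which a free run of move_len completes, and gives up once the scan drifts more than max_hop bits past the goal:

    /* bitmap[i] nonzero means bit i is in use. */
    static int probe_free_run(const unsigned char *bitmap, int nbits,
                              int goal_bit, int move_len, int max_hop)
    {
            int i, run = 0;

            for (i = goal_bit; i < nbits; i++) {
                    if (bitmap[i]) {
                            /* the run is broken; too far from the goal? */
                            if (i - goal_bit > max_hop)
                                    return -1;
                            run = 0;
                            continue;
                    }
                    if (++run == move_len)
                            return i;       /* run of move_len ends here */
            }
            return -1;
    }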
+
+static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
+                                      handle_t *handle,
+                                      struct buffer_head *di_bh,
+                                      u32 num_bits,
+                                      u16 chain)
+{
+       int ret;
+       u32 tmp_used;
+       struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+       struct ocfs2_chain_list *cl =
+                               (struct ocfs2_chain_list *) &di->id2.i_chain;
+
+       ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret < 0) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
+       di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
+       le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
+       ocfs2_journal_dirty(handle, di_bh);
+
+out:
+       return ret;
+}
+
+static inline int ocfs2_block_group_set_bits(handle_t *handle,
+                                            struct inode *alloc_inode,
+                                            struct ocfs2_group_desc *bg,
+                                            struct buffer_head *group_bh,
+                                            unsigned int bit_off,
+                                            unsigned int num_bits)
+{
+       int status;
+       void *bitmap = bg->bg_bitmap;
+       int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
+
+       /* All callers get the descriptor via
+        * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+       BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
+       BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
+
+       mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
+            num_bits);
+
+       if (ocfs2_is_cluster_bitmap(alloc_inode))
+               journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
+
+       status = ocfs2_journal_access_gd(handle,
+                                        INODE_CACHE(alloc_inode),
+                                        group_bh,
+                                        journal_type);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
+       if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+               ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+                           " count %u but claims %u are freed. num_bits %d",
+                           (unsigned long long)le64_to_cpu(bg->bg_blkno),
+                           le16_to_cpu(bg->bg_bits),
+                           le16_to_cpu(bg->bg_free_bits_count), num_bits);
+               return -EROFS;
+       }
+       while (num_bits--)
+               ocfs2_set_bit(bit_off++, bitmap);
+
+       ocfs2_journal_dirty(handle, group_bh);
+
+bail:
+       return status;
+}
+
+static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
+                            u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
+                            u32 len, int ext_flags)
+{
+       int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
+       handle_t *handle;
+       struct inode *inode = context->inode;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct inode *tl_inode = osb->osb_tl_inode;
+       struct inode *gb_inode = NULL;
+       struct buffer_head *gb_bh = NULL;
+       struct buffer_head *gd_bh = NULL;
+       struct ocfs2_group_desc *gd;
+       struct ocfs2_refcount_tree *ref_tree = NULL;
+       u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
+                                                   context->range->me_threshold);
+       u64 phys_blkno, new_phys_blkno;
+
+       phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+
+       if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
+
+               BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
+                        OCFS2_HAS_REFCOUNT_FL));
+
+               BUG_ON(!context->refcount_loc);
+
+               ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
+                                              &ref_tree, NULL);
+               if (ret) {
+                       mlog_errno(ret);
+                       return ret;
+               }
+
+               ret = ocfs2_prepare_refcount_change_for_del(inode,
+                                                       context->refcount_loc,
+                                                       phys_blkno,
+                                                       len,
+                                                       &credits,
+                                                       &extra_blocks);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+
+       ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
+                                                &context->meta_ac,
+                                                NULL, extra_blocks, &credits);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       /*
+        * Count 2 extra credits, for updating the global_bitmap inode
+        * and the group descriptor.
+        */
+       credits += OCFS2_INODE_UPDATE_CREDITS + 1;
+
+       /*
+        * ocfs2_move_extent() did not reserve any clusters in
+        * ocfs2_lock_allocators_move_extents(), but we still need to
+        * lock the global_bitmap here.
+        */
+       gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
+                                              OCFS2_INVALID_SLOT);
+       if (!gb_inode) {
+               mlog(ML_ERROR, "unable to get global_bitmap inode\n");
+               ret = -EIO;
+               goto out;
+       }
+
+       mutex_lock(&gb_inode->i_mutex);
+
+       ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_unlock_gb_mutex;
+       }
+
+       mutex_lock(&tl_inode->i_mutex);
+
+       handle = ocfs2_start_trans(osb, credits);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               mlog_errno(ret);
+               goto out_unlock_tl_inode;
+       }
+
+       new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
+       ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
+                                           GLOBAL_BITMAP_SYSTEM_INODE,
+                                           OCFS2_INVALID_SLOT,
+                                           &goal_bit, &gd_bh);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       /*
+        * Probe the victim cluster group to find a region that fits the
+        * wanted movement; it even performs a best-effort attempt by
+        * compromising to a threshold around the goal.
+        */
+       ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
+                               new_phys_cpos);
+       if (!*new_phys_cpos) {
+               ret = -ENOSPC;
+               goto out_commit;
+       }
+
+       ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
+                                 *new_phys_cpos, ext_flags);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       gd = (struct ocfs2_group_desc *)gd_bh->b_data;
+       ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
+                                              le16_to_cpu(gd->bg_chain));
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
+                                        goal_bit, len);
+       if (ret)
+               mlog_errno(ret);
+
+       /*
+        * Here we should write the new page out first if we are
+        * in write-back mode.
+        */
+       ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
+       if (ret)
+               mlog_errno(ret);
+
+out_commit:
+       ocfs2_commit_trans(osb, handle);
+       brelse(gd_bh);
+
+out_unlock_tl_inode:
+       mutex_unlock(&tl_inode->i_mutex);
+
+       ocfs2_inode_unlock(gb_inode, 1);
+out_unlock_gb_mutex:
+       mutex_unlock(&gb_inode->i_mutex);
+       brelse(gb_bh);
+       iput(gb_inode);
+
+out:
+       if (context->meta_ac) {
+               ocfs2_free_alloc_context(context->meta_ac);
+               context->meta_ac = NULL;
+       }
+
+       if (ref_tree)
+               ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
+       return ret;
+}
+
+/*
+ * Helper to calculate how much to defrag in one run, according to the
+ * threshold.
+ */
+static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
+                                        u32 threshold, int *skip)
+{
+       if ((*alloc_size + *len_defraged) < threshold) {
+               /*
+                * Keep defragmenting until we reach the threshold.
+                */
+               *len_defraged += *alloc_size;
+       } else if (*len_defraged == 0) {
+               /*
+                * XXX: skip a large extent: nothing has been defragged
+                * yet and this extent alone already meets the threshold.
+                */
+               *skip = 1;
+       } else {
+               /*
+                * Split this extent so that, coalesced with the former
+                * pieces, it exactly reaches the threshold.
+                *
+                * That completes one defragmentation cycle of size
+                * 'threshold'; resetting 'len_defraged' forces a new
+                * cycle to begin.
+                */
+               *alloc_size = threshold - *len_defraged;
+               *len_defraged = 0;
+       }
+}
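
A worked trace of the three branches, with assumed numbers:

    /*
     * Assume threshold = 8 clusters and extents of 3, 4 and 6 clusters
     * arriving in order (numbers assumed, not from the patch):
     *
     *   call 1: alloc_size=3, len_defraged=0 -> 3+0 < 8, len_defraged = 3
     *   call 2: alloc_size=4, len_defraged=3 -> 4+3 < 8, len_defraged = 7
     *   call 3: alloc_size=6, len_defraged=7 -> 6+7 >= 8, split:
     *           alloc_size is trimmed to 8-7 = 1 and len_defraged resets
     *           to 0; the remaining 5 clusters seed the next cycle.
     *
     * A lone 9-cluster extent (len_defraged == 0, 9 >= 8) sets *skip
     * instead: a single extent at or above the threshold is already
     * contiguous enough to leave alone.
     */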
+
+static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
+                               struct ocfs2_move_extents_context *context)
+{
+       int ret = 0, flags, do_defrag, skip = 0;
+       u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
+       u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;
+
+       struct inode *inode = context->inode;
+       struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+       struct ocfs2_move_extents *range = context->range;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+       if ((inode->i_size == 0) || (range->me_len == 0))
+               return 0;
+
+       if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+               return 0;
+
+       context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
+
+       ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
+       ocfs2_init_dealloc_ctxt(&context->dealloc);
+
+       /*
+        * TO-DO XXX:
+        *
+        * - xattr extents.
+        */
+
+       do_defrag = context->auto_defrag;
+
+       /*
+        * Extent moving happens in units of clusters.  For simplicity,
+        * we may ignore the two partial clusters that 'byte_start' and
+        * 'byte_start + len' fall within.
+        */
+       move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
+       len_to_move = (range->me_start + range->me_len) >>
+                                               osb->s_clustersize_bits;
+       if (len_to_move >= move_start)
+               len_to_move -= move_start;
+       else
+               len_to_move = 0;
+
+       if (do_defrag) {
+               defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
+               if (defrag_thresh <= 1)
+                       goto done;
+       } else
+               new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+                                                        range->me_goal);
+
+       mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
+            "thresh: %u\n",
+            (unsigned long long)OCFS2_I(inode)->ip_blkno,
+            (unsigned long long)range->me_start,
+            (unsigned long long)range->me_len,
+            move_start, len_to_move, defrag_thresh);
+
+       cpos = move_start;
+       while (len_to_move) {
+               ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
+                                        &flags);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               if (alloc_size > len_to_move)
+                       alloc_size = len_to_move;
+
+               /*
+                * How to deal with a hole:
+                *
+                * - skip the hole, of course
+                * - force a new defragmentation cycle
+                */
+               if (!phys_cpos) {
+                       if (do_defrag)
+                               len_defraged = 0;
+
+                       goto next;
+               }
+
+               if (do_defrag) {
+                       ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
+                                                    defrag_thresh, &skip);
+                       /*
+                        * skip large extents
+                        */
+                       if (skip) {
+                               skip = 0;
+                               goto next;
+                       }
+
+                       mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
+                            "alloc_size: %u, len_defraged: %u\n",
+                            cpos, phys_cpos, alloc_size, len_defraged);
+
+                       ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
+                                                 &alloc_size, flags);
+               } else {
+                       ret = ocfs2_move_extent(context, cpos, phys_cpos,
+                                               &new_phys_cpos, alloc_size,
+                                               flags);
+
+                       new_phys_cpos += alloc_size;
+               }
+
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               context->clusters_moved += alloc_size;
+next:
+               cpos += alloc_size;
+               len_to_move -= alloc_size;
+       }
+
+done:
+       range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
+
+out:
+       range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
+                                                     context->clusters_moved);
+       range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
+                                                      context->new_phys_cpos);
+
+       ocfs2_schedule_truncate_log_flush(osb, 1);
+       ocfs2_run_deallocs(osb, &context->dealloc);
+
+       return ret;
+}
+
+static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
+{
+       int status;
+       handle_t *handle;
+       struct inode *inode = context->inode;
+       struct ocfs2_dinode *di;
+       struct buffer_head *di_bh = NULL;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+       if (!inode)
+               return -ENOENT;
+
+       if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+               return -EROFS;
+
+       mutex_lock(&inode->i_mutex);
+
+       /*
+        * This prevents concurrent writes from other nodes
+        */
+       status = ocfs2_rw_lock(inode, 1);
+       if (status) {
+               mlog_errno(status);
+               goto out;
+       }
+
+       status = ocfs2_inode_lock(inode, &di_bh, 1);
+       if (status) {
+               mlog_errno(status);
+               goto out_rw_unlock;
+       }
+
+       /*
+        * Remember that ip_xattr_sem also needs to be held if necessary.
+        */
+       down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+       status = __ocfs2_move_extents_range(di_bh, context);
+
+       up_write(&OCFS2_I(inode)->ip_alloc_sem);
+       if (status) {
+               mlog_errno(status);
+               goto out_inode_unlock;
+       }
+
+       /*
+        * We update ctime for these changes
+        */
+       handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+       if (IS_ERR(handle)) {
+               status = PTR_ERR(handle);
+               mlog_errno(status);
+               goto out_inode_unlock;
+       }
+
+       status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+                                        OCFS2_JOURNAL_ACCESS_WRITE);
+       if (status) {
+               mlog_errno(status);
+               goto out_commit;
+       }
+
+       di = (struct ocfs2_dinode *)di_bh->b_data;
+       inode->i_ctime = CURRENT_TIME;
+       di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+       di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+       ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+       ocfs2_commit_trans(osb, handle);
+
+out_inode_unlock:
+       brelse(di_bh);
+       ocfs2_inode_unlock(inode, 1);
+out_rw_unlock:
+       ocfs2_rw_unlock(inode, 1);
+out:
+       mutex_unlock(&inode->i_mutex);
+
+       return status;
+}
+
+int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
+{
+       int status;
+
+       struct inode *inode = filp->f_path.dentry->d_inode;
+       struct ocfs2_move_extents range = { 0 };
+       struct ocfs2_move_extents_context *context = NULL;
+
+       status = mnt_want_write(filp->f_path.mnt);
+       if (status)
+               return status;
+
+       if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
+               goto out;
+
+       if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
+               status = -EPERM;
+               goto out;
+       }
+
+       context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
+       if (!context) {
+               status = -ENOMEM;
+               mlog_errno(status);
+               goto out;
+       }
+
+       context->inode = inode;
+       context->file = filp;
+
+       if (argp) {
+               if (copy_from_user(&range, (struct ocfs2_move_extents *)argp,
+                                  sizeof(range))) {
+                       status = -EFAULT;
+                       goto out;
+               }
+       } else {
+               status = -EINVAL;
+               goto out;
+       }
+
+       if (range.me_start > i_size_read(inode))
+               goto out;
+
+       if (range.me_start + range.me_len > i_size_read(inode))
+               range.me_len = i_size_read(inode) - range.me_start;
+
+       context->range = &range;
+
+       if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
+               context->auto_defrag = 1;
+               /*
+                * The default threshold for defragmentation is 1M, since
+                * our maximum cluster size is 1M as well.  Any thoughts?
+                */
+               if (!range.me_threshold)
+                       range.me_threshold = 1024 * 1024;
+
+               if (range.me_threshold > i_size_read(inode))
+                       range.me_threshold = i_size_read(inode);
+
+               if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
+                       context->partial = 1;
+       } else {
+               /*
+                * A first, best-effort attempt to validate and adjust the
+                * goal (a physical address, in blocks).  It cannot guarantee
+                * that the later operation always succeeds, since the
+                * global_bitmap may change a bit over time.
+                */
+
+               status = ocfs2_validate_and_adjust_move_goal(inode, &range);
+               if (status)
+                       goto out;
+       }
+
+       status = ocfs2_move_extents(context);
+       if (status)
+               mlog_errno(status);
+out:
+       /*
+        * The movement/defragmentation may end up only partially completed,
+        * which is why the finished length and new_offset are reported back
+        * to userspace even if a failure happens somewhere.
+        */
+       if (argp) {
+               if (copy_to_user((struct ocfs2_move_extents *)argp, &range,
+                               sizeof(range)))
+                       status = -EFAULT;
+       }
+
+       kfree(context);
+
+       mnt_drop_write(filp->f_path.mnt);
+
+       return status;
+}
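
Putting the entry point together with the ABI added to ocfs2_ioctl.h below, a hedged userspace sketch of driving the ioctl in auto-defrag mode; the include path and helper function are assumed, while the struct, flags and ioctl number come from this patch:

    /* Sketch: defragment the first 64MB of a file on ocfs2. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include "ocfs2_ioctl.h"        /* assumed location of the uapi header */

    static int defrag_head(const char *path)
    {
            struct ocfs2_move_extents range;
            int fd = open(path, O_RDWR);    /* FMODE_WRITE is required */

            if (fd < 0)
                    return -1;

            memset(&range, 0, sizeof(range));
            range.me_start = 0;                     /* bytes */
            range.me_len = 64ULL << 20;             /* bytes */
            range.me_threshold = 1 << 20;           /* 0 defaults to 1M anyway */
            range.me_flags = OCFS2_MOVE_EXT_FL_AUTO_DEFRAG |
                             OCFS2_MOVE_EXT_FL_PART_DEFRAG;

            if (ioctl(fd, OCFS2_IOC_MOVE_EXT, &range) < 0)
                    perror("OCFS2_IOC_MOVE_EXT");

            /* the out fields are valid even on partial failure */
            printf("moved %llu bytes, %s\n",
                   (unsigned long long)range.me_moved_len,
                   (range.me_flags & OCFS2_MOVE_EXT_FL_COMPLETE) ?
                            "complete" : "partial");
            close(fd);
            return 0;
    }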
diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h
new file mode 100644 (file)
index 0000000..4e143e8
--- /dev/null
@@ -0,0 +1,22 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * move_extents.h
+ *
+ * Copyright (C) 2011 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef OCFS2_MOVE_EXTENTS_H
+#define OCFS2_MOVE_EXTENTS_H
+
+int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp);
+
+#endif /* OCFS2_MOVE_EXTENTS_H */
index b46f39bf7438d5048dd5637d5762b4d6c44990e4..5b27ff1fa577d95533c0b594349b23f8cc62bd62 100644 (file)
@@ -142,6 +142,38 @@ struct ocfs2_info_journal_size {
        __u64 ij_journal_size;
 };
 
+struct ocfs2_info_freeinode {
+       struct ocfs2_info_request ifi_req;
+       struct ocfs2_info_local_freeinode {
+               __u64 lfi_total;
+               __u64 lfi_free;
+       } ifi_stat[OCFS2_MAX_SLOTS];
+       __u32 ifi_slotnum; /* out */
+       __u32 ifi_pad;
+};
+
+#define OCFS2_INFO_MAX_HIST     (32)
+
+struct ocfs2_info_freefrag {
+       struct ocfs2_info_request iff_req;
+       struct ocfs2_info_freefrag_stats { /* (out) */
+               struct ocfs2_info_free_chunk_list {
+                       __u32 fc_chunks[OCFS2_INFO_MAX_HIST];
+                       __u32 fc_clusters[OCFS2_INFO_MAX_HIST];
+               } ffs_fc_hist;
+               __u32 ffs_clusters;
+               __u32 ffs_free_clusters;
+               __u32 ffs_free_chunks;
+               __u32 ffs_free_chunks_real;
+               __u32 ffs_min; /* Minimum free chunksize in clusters */
+               __u32 ffs_max;
+               __u32 ffs_avg;
+               __u32 ffs_pad;
+       } iff_ffs;
+       __u32 iff_chunksize; /* chunk size in clusters (in) */
+       __u32 iff_pad;
+};
+
 /* Codes for ocfs2_info_request */
 enum ocfs2_info_type {
        OCFS2_INFO_CLUSTERSIZE = 1,
@@ -151,6 +183,8 @@ enum ocfs2_info_type {
        OCFS2_INFO_UUID,
        OCFS2_INFO_FS_FEATURES,
        OCFS2_INFO_JOURNAL_SIZE,
+       OCFS2_INFO_FREEINODE,
+       OCFS2_INFO_FREEFRAG,
        OCFS2_INFO_NUM_TYPES
 };
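
For the two new codes, requests ride the existing batched OCFS2_IOC_INFO mechanism. A hedged sketch of a free-fragmentation query, not from the patch; the ocfs2_info_request header fields (ir_magic, ir_code, ir_size), OCFS2_INFO_MAGIC and struct ocfs2_info are assumed from the existing definitions earlier in this header:

    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>

    /* fd is an open descriptor on the ocfs2 mount. */
    static int query_freefrag(int fd, __u32 chunksize)
    {
            struct ocfs2_info_freefrag ffg;
            struct ocfs2_info info;

            memset(&ffg, 0, sizeof(ffg));
            memset(&info, 0, sizeof(info));

            ffg.iff_req.ir_magic = OCFS2_INFO_MAGIC;
            ffg.iff_req.ir_code = OCFS2_INFO_FREEFRAG;
            ffg.iff_req.ir_size = sizeof(ffg);
            ffg.iff_chunksize = chunksize;  /* non-zero power of two */

            info.oi_requests = (__u64)(unsigned long)&ffg;
            info.oi_count = 1;

            if (ioctl(fd, OCFS2_IOC_INFO, &info) < 0)
                    return -1;

            printf("free: %u clusters in %u chunks, largest %u clusters\n",
                   ffg.iff_ffs.ffs_free_clusters,
                   ffg.iff_ffs.ffs_free_chunks,
                   ffg.iff_ffs.ffs_max);
            return 0;
    }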
 
@@ -171,4 +205,38 @@ enum ocfs2_info_type {
 
 #define OCFS2_IOC_INFO         _IOR('o', 5, struct ocfs2_info)
 
+struct ocfs2_move_extents {
+/* All values are in bytes */
+       /* in */
+       __u64 me_start;         /* Virtual start in the file to move */
+       __u64 me_len;           /* Length of the extents to be moved */
+       __u64 me_goal;          /* Physical offset of the goal,
+                                  in block units */
+       __u64 me_threshold;     /* Maximum distance from goal or threshold
+                                  for auto defragmentation */
+       __u64 me_flags;         /* Flags for the operation:
+                                * - auto defragmentation.
+                                * - refcount,xattr cases.
+                                */
+       /* out */
+       __u64 me_moved_len;     /* Moved/defraged length */
+       __u64 me_new_offset;    /* Resulting physical location */
+       __u32 me_reserved[2];   /* Reserved for future use */
+};
+
+#define OCFS2_MOVE_EXT_FL_AUTO_DEFRAG  (0x00000001)    /* Kernel claims new
+                                                          clusters as the goal
+                                                          location for the
+                                                          extents being moved */
+#define OCFS2_MOVE_EXT_FL_PART_DEFRAG  (0x00000002)    /* Allow partial extent
+                                                          moving: makes the
+                                                          movement less likely
+                                                          to fail, but may make
+                                                          the fs even more
+                                                          fragmented */
+#define OCFS2_MOVE_EXT_FL_COMPLETE     (0x00000004)    /* The move or
+                                                          defragmentation
+                                                          completed fully */
+
+#define OCFS2_IOC_MOVE_EXT     _IOW('o', 6, struct ocfs2_move_extents)
+
 #endif /* OCFS2_IOCTL_H */
index a1dae5bb54acda9d6e852d9d66e24c3d4cde8926..3b481f490633af2f483afd1817fe8ad538908b6a 100644 (file)
@@ -688,6 +688,31 @@ TRACE_EVENT(ocfs2_cache_block_dealloc,
                  __entry->blkno, __entry->bit)
 );
 
+TRACE_EVENT(ocfs2_trim_extent,
+       TP_PROTO(struct super_block *sb, unsigned long long blk,
+                unsigned long long count),
+       TP_ARGS(sb, blk, count),
+       TP_STRUCT__entry(
+               __field(int, dev_major)
+               __field(int, dev_minor)
+               __field(unsigned long long, blk)
+               __field(__u64,  count)
+       ),
+       TP_fast_assign(
+               __entry->dev_major = MAJOR(sb->s_dev);
+               __entry->dev_minor = MINOR(sb->s_dev);
+               __entry->blk = blk;
+               __entry->count = count;
+       ),
+       TP_printk("%d %d %llu %llu",
+                 __entry->dev_major, __entry->dev_minor,
+                 __entry->blk, __entry->count)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
+
 /* End of trace events for fs/ocfs2/alloc.c. */
 
 /* Trace events for fs/ocfs2/localalloc.c. */
index 3c7606cff1ab4f3c7df789f14ef89dfdff70063e..ebfd3825f12a367b3c2786507146913a8191e56f 100644 (file)
@@ -66,7 +66,7 @@ struct ocfs2_cow_context {
                            u32 *num_clusters,
                            unsigned int *extent_flags);
        int (*cow_duplicate_clusters)(handle_t *handle,
-                                     struct ocfs2_cow_context *context,
+                                     struct file *file,
                                      u32 cpos, u32 old_cluster,
                                      u32 new_cluster, u32 new_len);
 };
@@ -2921,20 +2921,21 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
        return 0;
 }
 
-static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
-                                           struct ocfs2_cow_context *context,
-                                           u32 cpos, u32 old_cluster,
-                                           u32 new_cluster, u32 new_len)
+int ocfs2_duplicate_clusters_by_page(handle_t *handle,
+                                    struct file *file,
+                                    u32 cpos, u32 old_cluster,
+                                    u32 new_cluster, u32 new_len)
 {
        int ret = 0, partial;
-       struct ocfs2_caching_info *ci = context->data_et.et_ci;
+       struct inode *inode = file->f_path.dentry->d_inode;
+       struct ocfs2_caching_info *ci = INODE_CACHE(inode);
        struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
        u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
        struct page *page;
        pgoff_t page_index;
        unsigned int from, to, readahead_pages;
        loff_t offset, end, map_end;
-       struct address_space *mapping = context->inode->i_mapping;
+       struct address_space *mapping = inode->i_mapping;
 
        trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
                                               new_cluster, new_len);
@@ -2948,8 +2949,8 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
         * We only duplicate pages until we reach the page contains i_size - 1.
         * So trim 'end' to i_size.
         */
-       if (end > i_size_read(context->inode))
-               end = i_size_read(context->inode);
+       if (end > i_size_read(inode))
+               end = i_size_read(inode);
 
        while (offset < end) {
                page_index = offset >> PAGE_CACHE_SHIFT;
@@ -2972,10 +2973,9 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
                if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
                        BUG_ON(PageDirty(page));
 
-               if (PageReadahead(page) && context->file) {
+               if (PageReadahead(page)) {
                        page_cache_async_readahead(mapping,
-                                                  &context->file->f_ra,
-                                                  context->file,
+                                                  &file->f_ra, file,
                                                   page, page_index,
                                                   readahead_pages);
                }
@@ -2999,8 +2999,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
                        }
                }
 
-               ocfs2_map_and_dirty_page(context->inode,
-                                        handle, from, to,
+               ocfs2_map_and_dirty_page(inode, handle, from, to,
                                         page, 0, &new_block);
                mark_page_accessed(page);
 unlock:
@@ -3015,14 +3014,15 @@ unlock:
        return ret;
 }
 
-static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
-                                          struct ocfs2_cow_context *context,
-                                          u32 cpos, u32 old_cluster,
-                                          u32 new_cluster, u32 new_len)
+int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
+                                   struct file *file,
+                                   u32 cpos, u32 old_cluster,
+                                   u32 new_cluster, u32 new_len)
 {
        int ret = 0;
-       struct super_block *sb = context->inode->i_sb;
-       struct ocfs2_caching_info *ci = context->data_et.et_ci;
+       struct inode *inode = file->f_path.dentry->d_inode;
+       struct super_block *sb = inode->i_sb;
+       struct ocfs2_caching_info *ci = INODE_CACHE(inode);
        int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
        u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
        u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
@@ -3145,8 +3145,8 @@ static int ocfs2_replace_clusters(handle_t *handle,
 
        /*If the old clusters is unwritten, no need to duplicate. */
        if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
-               ret = context->cow_duplicate_clusters(handle, context, cpos,
-                                                     old, new, len);
+               ret = context->cow_duplicate_clusters(handle, context->file,
+                                                     cpos, old, new, len);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -3162,22 +3162,22 @@ out:
        return ret;
 }
 
-static int ocfs2_cow_sync_writeback(struct super_block *sb,
-                                   struct ocfs2_cow_context *context,
-                                   u32 cpos, u32 num_clusters)
+int ocfs2_cow_sync_writeback(struct super_block *sb,
+                            struct inode *inode,
+                            u32 cpos, u32 num_clusters)
 {
        int ret = 0;
        loff_t offset, end, map_end;
        pgoff_t page_index;
        struct page *page;
 
-       if (ocfs2_should_order_data(context->inode))
+       if (ocfs2_should_order_data(inode))
                return 0;
 
        offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
        end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
 
-       ret = filemap_fdatawrite_range(context->inode->i_mapping,
+       ret = filemap_fdatawrite_range(inode->i_mapping,
                                       offset, end - 1);
        if (ret < 0) {
                mlog_errno(ret);
@@ -3190,7 +3190,7 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb,
                if (map_end > end)
                        map_end = end;
 
-               page = find_or_create_page(context->inode->i_mapping,
+               page = find_or_create_page(inode->i_mapping,
                                           page_index, GFP_NOFS);
                BUG_ON(!page);
 
@@ -3349,7 +3349,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
         * in write-back mode.
         */
        if (context->get_clusters == ocfs2_di_get_clusters) {
-               ret = ocfs2_cow_sync_writeback(sb, context, cpos,
+               ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos,
                                               orig_num_clusters);
                if (ret)
                        mlog_errno(ret);
index c8ce46f7d8e30ee842cc8966a8c034aefae3b98b..7754608c83a47b1b44425c8f9c5e13a2adc65675 100644 (file)
@@ -84,6 +84,17 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
                             struct buffer_head *ref_root_bh,
                             u32 cpos, u32 write_len,
                             struct ocfs2_post_refcount *post);
+int ocfs2_duplicate_clusters_by_page(handle_t *handle,
+                                    struct file *file,
+                                    u32 cpos, u32 old_cluster,
+                                    u32 new_cluster, u32 new_len);
+int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
+                                   struct file *file,
+                                   u32 cpos, u32 old_cluster,
+                                   u32 new_cluster, u32 new_len);
+int ocfs2_cow_sync_writeback(struct super_block *sb,
+                            struct inode *inode,
+                            u32 cpos, u32 num_clusters);
 int ocfs2_add_refcount_flag(struct inode *inode,
                            struct ocfs2_extent_tree *data_et,
                            struct ocfs2_caching_info *ref_ci,
index 4129fb671d7164d6e612025145adcd0b70046eaf..cdbaf5e97308f3564af7820c575c591e3cfcbf95 100644 (file)
@@ -1567,7 +1567,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
        if (osb->preferred_slot != OCFS2_INVALID_SLOT)
                seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
 
-       if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
+       if (!(mnt->mnt_flags & MNT_NOATIME) && !(mnt->mnt_flags & MNT_RELATIME))
                seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
 
        if (osb->osb_commit_interval)