1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
26 #include <linux/types.h>
27 #include <linux/slab.h>
28 #include <linux/highmem.h>
29 #include <linux/smp_lock.h>
30 #include <linux/kthread.h>
32 #include <cluster/heartbeat.h>
33 #include <cluster/nodemanager.h>
34 #include <cluster/tcp.h>
36 #include <dlm/dlmapi.h>
38 #define MLOG_MASK_PREFIX ML_VOTE
39 #include <cluster/masklog.h>
45 #include "extent_map.h"
46 #include "heartbeat.h"
52 #include "buffer_head_io.h"
/* o2net message types used in this file: VOTE is sent when broadcasting a
 * vote request, RESPONSE is sent back after a vote has been processed.
 * One o2net handler is registered for each type. */
54 #define OCFS2_MESSAGE_TYPE_VOTE (0x1)
55 #define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
/* NOTE(review): the struct headers and several members belonging to these
 * lines are not present in this listing.  Judging from their uses further
 * down, h_response_id and h_node_num are accessed through
 * struct ocfs2_msg_hdr and v_hdr through struct ocfs2_vote_msg --
 * confirm against the full source. */
58 __be32 h_response_id
; /* used to lookup message handle on sending
63 __be32 h_node_num
; /* node sending this particular message. */
68 struct ocfs2_msg_hdr v_hdr
;
72 /* Responses are given these values to maintain backwards
73 * compatibility with older ocfs2 versions */
74 #define OCFS2_RESPONSE_OK (0)
75 #define OCFS2_RESPONSE_BUSY (-16)
76 #define OCFS2_RESPONSE_BAD_MSG (-22)
/* Reply to a vote request; r_hdr embeds the common message header.
 * NOTE(review): this listing drops lines here -- code further down also
 * reads and writes an r_response member that is not shown. */
78 struct ocfs2_response_msg
80 struct ocfs2_msg_hdr r_hdr
;
84 struct ocfs2_vote_work
{
85 struct list_head w_list
;
86 struct ocfs2_vote_msg w_msg
;
/*
 * Request types carried in the h_request field of a vote message.
 *
 * OCFS2_VOTE_REQ_INVALID and OCFS2_VOTE_REQ_LAST are sentinels that
 * bracket the real request types; ocfs2_is_valid_vote_request() relies
 * on every real type sitting strictly between them.
 */
enum ocfs2_vote_request {
	OCFS2_VOTE_REQ_INVALID = 0,
	OCFS2_VOTE_REQ_MOUNT,
	OCFS2_VOTE_REQ_UMOUNT,
	OCFS2_VOTE_REQ_LAST
};

/*
 * Return non-zero when @request is a real request type, i.e. strictly
 * greater than OCFS2_VOTE_REQ_INVALID and strictly less than
 * OCFS2_VOTE_REQ_LAST; return 0 otherwise.
 */
static inline int ocfs2_is_valid_vote_request(int request)
{
	return OCFS2_VOTE_REQ_INVALID < request &&
	       request < OCFS2_VOTE_REQ_LAST;
}
/*
 * Hook invoked for a received response message.
 *
 * @priv: opaque pointer supplied together with the callback
 * @resp: the response message being handled
 */
typedef void (*ocfs2_net_response_callback)(void *priv,
					    struct ocfs2_response_msg *resp);
/* Bundles a response callback (rc_cb) for use by a wait context.
 * NOTE(review): a line is missing from this listing -- the response
 * handler further down also reads an rc_priv member and passes it as the
 * first argument of rc_cb. */
104 struct ocfs2_net_response_cb
{
105 ocfs2_net_response_callback rc_cb
;
/* State for one broadcast that is waiting for responses.
 *
 * n_list links the context onto osb->net_response_list, n_event is the
 * wait queue the broadcaster sleeps on, n_node_map holds the nodes a
 * request was sent to that have not yet been marked as responded, and
 * n_callback is an optional per-response hook.
 * NOTE(review): a line is missing from this listing -- code below also
 * uses an n_response_id member. */
109 struct ocfs2_net_wait_ctxt
{
110 struct list_head n_list
;
112 wait_queue_head_t n_event
;
113 struct ocfs2_node_map n_node_map
;
114 int n_response
; /* an aggregate response. 0 if
115 * all nodes are go, < 0 on any
116 * negative response from any
117 * node or network error. */
118 struct ocfs2_net_response_cb
*n_callback
;
121 static void ocfs2_process_mount_request(struct ocfs2_super
*osb
,
122 unsigned int node_num
)
124 mlog(0, "MOUNT vote from node %u\n", node_num
);
125 /* The other node only sends us this message when he has an EX
126 * on the superblock, so our recovery threads (if having been
127 * launched) are waiting on it.*/
128 ocfs2_recovery_map_clear(osb
, node_num
);
129 ocfs2_node_map_set_bit(osb
, &osb
->mounted_map
, node_num
);
131 /* We clear the umount map here because a node may have been
132 * previously mounted, safely unmounted but never stopped
133 * heartbeating - in which case we'd have a stale entry. */
134 ocfs2_node_map_clear_bit(osb
, &osb
->umount_map
, node_num
);
137 static void ocfs2_process_umount_request(struct ocfs2_super
*osb
,
138 unsigned int node_num
)
140 mlog(0, "UMOUNT vote from node %u\n", node_num
);
141 ocfs2_node_map_clear_bit(osb
, &osb
->mounted_map
, node_num
);
142 ocfs2_node_map_set_bit(osb
, &osb
->umount_map
, node_num
);
/*
 * Process one vote message and answer it.
 *
 * Decodes h_request, h_blkno and h_node_num from the big-endian header.
 * A request type rejected by ocfs2_is_valid_vote_request() is logged and
 * answered with OCFS2_RESPONSE_BAD_MSG; otherwise the response is
 * OCFS2_RESPONSE_OK and UMOUNT/MOUNT requests are passed to their
 * handlers.  The response message is built on the stack, echoing the
 * request's response id, blkno and generation, stamped with our own node
 * number, and sent with o2net_send_message().  A send status other than
 * -ETIMEDOUT / -ENOTCONN is logged as an error (the first part of that
 * condition is not shown).
 *
 * NOTE(review): this listing omits lines -- the blkno declaration, the
 * statements around the case labels and most o2net_send_message()
 * arguments; confirm details against the full source.
 */
145 static void ocfs2_process_vote(struct ocfs2_super
*osb
,
146 struct ocfs2_vote_msg
*msg
)
148 int net_status
, vote_response
;
149 unsigned int node_num
;
151 enum ocfs2_vote_request request
;
152 struct ocfs2_msg_hdr
*hdr
= &msg
->v_hdr
;
153 struct ocfs2_response_msg response
;
155 /* decode the network mumbo jumbo into local variables. */
156 request
= be32_to_cpu(hdr
->h_request
);
157 blkno
= be64_to_cpu(hdr
->h_blkno
);
158 node_num
= be32_to_cpu(hdr
->h_node_num
);
160 mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
161 request
, (unsigned long long)blkno
, node_num
);
163 if (!ocfs2_is_valid_vote_request(request
)) {
164 mlog(ML_ERROR
, "Invalid vote request %d from node %u\n",
166 vote_response
= OCFS2_RESPONSE_BAD_MSG
;
170 vote_response
= OCFS2_RESPONSE_OK
;
173 case OCFS2_VOTE_REQ_UMOUNT
:
174 ocfs2_process_umount_request(osb
, node_num
);
176 case OCFS2_VOTE_REQ_MOUNT
:
177 ocfs2_process_mount_request(osb
, node_num
);
180 /* avoids a gcc warning */
185 /* Response structure is small so we just put it on the stack
186 * and stuff it inline. */
187 memset(&response
, 0, sizeof(struct ocfs2_response_msg
));
188 response
.r_hdr
.h_response_id
= hdr
->h_response_id
;
189 response
.r_hdr
.h_blkno
= hdr
->h_blkno
;
190 response
.r_hdr
.h_generation
= hdr
->h_generation
;
191 response
.r_hdr
.h_node_num
= cpu_to_be32(osb
->node_num
);
192 response
.r_response
= cpu_to_be32(vote_response
);
194 net_status
= o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE
,
197 sizeof(struct ocfs2_response_msg
),
200 /* We still want to error print for ENOPROTOOPT here. The
201 * sending node shouldn't have unregistered his net handler
202 * without sending an unmount vote 1st */
204 && net_status
!= -ETIMEDOUT
205 && net_status
!= -ENOTCONN
)
206 mlog(ML_ERROR
, "message to node %u fails with error %d!\n",
207 node_num
, net_status
);
/*
 * One pass of work for the vote thread.
 *
 * Under osb->vote_task_lock it first copies vote_wake_sequence into
 * vote_work_sequence, then takes lock resources off blocked_lock_list
 * and hands each to ocfs2_process_blocked_lock(), and finally, while
 * vote_count is non-zero, takes work items off vote_list and hands each
 * message to ocfs2_process_vote().  The spinlock is released around each
 * processing call and re-taken afterwards.
 *
 * NOTE(review): this listing omits lines -- the loop header for the
 * blocked-lock pass and whatever is done with a work item after it has
 * been processed are not shown.
 */
210 static void ocfs2_vote_thread_do_work(struct ocfs2_super
*osb
)
212 unsigned long processed
;
213 struct ocfs2_lock_res
*lockres
;
214 struct ocfs2_vote_work
*work
;
218 spin_lock(&osb
->vote_task_lock
);
219 /* grab this early so we know to try again if a state change and
220 * wake happens part-way through our work */
221 osb
->vote_work_sequence
= osb
->vote_wake_sequence
;
223 processed
= osb
->blocked_lock_count
;
225 BUG_ON(list_empty(&osb
->blocked_lock_list
));
227 lockres
= list_entry(osb
->blocked_lock_list
.next
,
228 struct ocfs2_lock_res
, l_blocked_list
);
229 list_del_init(&lockres
->l_blocked_list
);
230 osb
->blocked_lock_count
--;
231 spin_unlock(&osb
->vote_task_lock
);
236 ocfs2_process_blocked_lock(osb
, lockres
);
238 spin_lock(&osb
->vote_task_lock
);
241 while (osb
->vote_count
) {
242 BUG_ON(list_empty(&osb
->vote_list
));
243 work
= list_entry(osb
->vote_list
.next
,
244 struct ocfs2_vote_work
, w_list
);
245 list_del(&work
->w_list
);
247 spin_unlock(&osb
->vote_task_lock
);
249 ocfs2_process_vote(osb
, &work
->w_msg
);
252 spin_lock(&osb
->vote_task_lock
);
254 spin_unlock(&osb
->vote_task_lock
);
/* Tests, under osb->vote_task_lock, whether both blocked_lock_list and
 * vote_list are empty.
 * NOTE(review): the lines that record and return the result are missing
 * from this listing. */
259 static int ocfs2_vote_thread_lists_empty(struct ocfs2_super
*osb
)
263 spin_lock(&osb
->vote_task_lock
);
264 if (list_empty(&osb
->blocked_lock_list
) &&
265 list_empty(&osb
->vote_list
))
268 spin_unlock(&osb
->vote_task_lock
);
/* Tests, under osb->vote_task_lock, whether vote_work_sequence differs
 * from vote_wake_sequence.  Used as part of the vote thread's wait
 * condition.
 * NOTE(review): the lines that record and return the result are missing
 * from this listing. */
272 static int ocfs2_vote_thread_should_wake(struct ocfs2_super
*osb
)
276 spin_lock(&osb
->vote_task_lock
);
277 if (osb
->vote_work_sequence
!= osb
->vote_wake_sequence
)
279 spin_unlock(&osb
->vote_task_lock
);
/*
 * Main function of the vote thread; @arg is the struct ocfs2_super.
 *
 * Loops until kthread_should_stop() is true and
 * ocfs2_vote_thread_lists_empty() reports no remaining work.  Each
 * iteration sleeps interruptibly on osb->vote_event until
 * ocfs2_vote_thread_should_wake() or kthread_should_stop() holds, then
 * runs ocfs2_vote_thread_do_work().  osb->vote_task is set to NULL once
 * the loop ends.
 *
 * NOTE(review): the return statement and the end of the in-body comment
 * are missing from this listing.
 */
284 int ocfs2_vote_thread(void *arg
)
287 struct ocfs2_super
*osb
= arg
;
289 /* only quit once we've been asked to stop and there is no more
291 while (!(kthread_should_stop() &&
292 ocfs2_vote_thread_lists_empty(osb
))) {
294 wait_event_interruptible(osb
->vote_event
,
295 ocfs2_vote_thread_should_wake(osb
) ||
296 kthread_should_stop());
298 mlog(0, "vote_thread: awoken\n");
300 ocfs2_vote_thread_do_work(osb
);
303 osb
->vote_task
= NULL
;
/*
 * Allocate a zeroed wait context (kzalloc, GFP_NOFS) and initialise its
 * list node, wait queue and node map; n_response_id is set to
 * @response_id and n_callback to NULL.
 *
 * NOTE(review): the allocation-failure handling and the return statement
 * are missing from this listing.
 */
307 static struct ocfs2_net_wait_ctxt
*ocfs2_new_net_wait_ctxt(unsigned int response_id
)
309 struct ocfs2_net_wait_ctxt
*w
;
311 w
= kzalloc(sizeof(*w
), GFP_NOFS
);
317 INIT_LIST_HEAD(&w
->n_list
);
318 init_waitqueue_head(&w
->n_event
);
319 ocfs2_node_map_init(&w
->n_node_map
);
320 w
->n_response_id
= response_id
;
321 w
->n_callback
= NULL
;
/* Pre-increments osb->net_response_ids under osb->net_response_lock and
 * stores the new value in ret.
 * NOTE(review): the declaration of ret and the return statement are
 * missing from this listing; presumably ret is what gets returned. */
326 static unsigned int ocfs2_new_response_id(struct ocfs2_super
*osb
)
330 spin_lock(&osb
->net_response_lock
);
331 ret
= ++osb
->net_response_ids
;
332 spin_unlock(&osb
->net_response_lock
);
337 static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super
*osb
,
338 struct ocfs2_net_wait_ctxt
*w
)
340 spin_lock(&osb
->net_response_lock
);
341 list_del(&w
->n_list
);
342 spin_unlock(&osb
->net_response_lock
);
345 static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super
*osb
,
346 struct ocfs2_net_wait_ctxt
*w
)
348 spin_lock(&osb
->net_response_lock
);
349 list_add_tail(&w
->n_list
,
350 &osb
->net_response_list
);
351 spin_unlock(&osb
->net_response_lock
);
/*
 * Clear node_num from @w's node map and, once the map is empty, wake
 * anyone sleeping on w->n_event.  The caller must already hold
 * osb->net_response_lock (asserted below).
 *
 * NOTE(review): the line declaring the node_num parameter is missing
 * from this listing.
 */
354 static void __ocfs2_mark_node_responded(struct ocfs2_super
*osb
,
355 struct ocfs2_net_wait_ctxt
*w
,
358 assert_spin_locked(&osb
->net_response_lock
);
360 ocfs2_node_map_clear_bit(osb
, &w
->n_node_map
, node_num
);
361 if (ocfs2_node_map_is_empty(osb
, &w
->n_node_map
))
362 wake_up(&w
->n_event
);
/* Walks osb->net_response_list under osb->net_response_lock and marks
 * node_num as responded in every wait context found there.
 * NOTE(review): the node_num parameter line and the declaration of the
 * list cursor p are missing from this listing. */
365 /* Intended to be called from the node down callback, we fake remove
366 * the node from all our response contexts */
367 void ocfs2_remove_node_from_vote_queues(struct ocfs2_super
*osb
,
371 struct ocfs2_net_wait_ctxt
*w
= NULL
;
373 spin_lock(&osb
->net_response_lock
);
375 list_for_each(p
, &osb
->net_response_list
) {
376 w
= list_entry(p
, struct ocfs2_net_wait_ctxt
, n_list
);
378 __ocfs2_mark_node_responded(osb
, w
, node_num
);
381 spin_unlock(&osb
->net_response_lock
);
/*
 * Send @request to every node in osb->mounted_map except ourselves and
 * wait until all of them have been marked as responded.
 *
 * A wait context for @response_id is created, given @callback and
 * queued before anything is sent.  For each target node the node's bit
 * is set in the context's node map and the request is sent as an
 * OCFS2_MESSAGE_TYPE_VOTE message.  After the send loop the caller
 * sleeps on w->n_event until the node map is empty, the context is
 * dequeued and the aggregate w->n_response is stored through *response.
 *
 * NOTE(review): this listing omits lines -- the declaration of the
 * response parameter, the remaining o2net_send_message() arguments, the
 * error handling inside the send loop (only its log messages are
 * visible) and the cleanup/return paths.  The second dequeue call near
 * the end presumably belongs to an error path; confirm against the full
 * source.
 */
384 static int ocfs2_broadcast_vote(struct ocfs2_super
*osb
,
385 struct ocfs2_vote_msg
*request
,
386 unsigned int response_id
,
388 struct ocfs2_net_response_cb
*callback
)
390 int status
, i
, remote_err
;
391 struct ocfs2_net_wait_ctxt
*w
= NULL
;
396 w
= ocfs2_new_net_wait_ctxt(response_id
);
402 w
->n_callback
= callback
;
404 /* we're pretty much ready to go at this point, and this fills
405 * in n_response which we need anyway... */
406 ocfs2_queue_net_wait_ctxt(osb
, w
);
408 i
= ocfs2_node_map_iterate(osb
, &osb
->mounted_map
, 0);
410 while (i
!= O2NM_INVALID_NODE_NUM
) {
411 if (i
!= osb
->node_num
) {
412 mlog(0, "trying to send request to node %i\n", i
);
413 ocfs2_node_map_set_bit(osb
, &w
->n_node_map
, i
);
416 status
= o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE
,
422 if (status
== -ETIMEDOUT
) {
423 mlog(0, "remote node %d timed out!\n", i
);
427 if (remote_err
< 0) {
429 mlog(0, "remote error %d on node %d!\n",
440 i
= ocfs2_node_map_iterate(osb
, &osb
->mounted_map
, i
);
441 mlog(0, "next is %d, i am %d\n", i
, osb
->node_num
);
443 mlog(0, "done sending, now waiting on responses...\n");
445 wait_event(w
->n_event
, ocfs2_node_map_is_empty(osb
, &w
->n_node_map
));
447 ocfs2_dequeue_net_wait_ctxt(osb
, w
);
450 *response
= w
->n_response
;
455 ocfs2_dequeue_net_wait_ctxt(osb
, w
);
/*
 * Allocate a zeroed vote message (kzalloc, GFP_NOFS) and fill in its
 * header: our node number, the request @type, blkno and @generation,
 * all converted to big-endian.  BUGs if @type is not a valid vote
 * request.
 *
 * NOTE(review): the blkno parameter line, the allocation-failure
 * handling and the return statement are missing from this listing.
 */
463 static struct ocfs2_vote_msg
* ocfs2_new_vote_request(struct ocfs2_super
*osb
,
465 unsigned int generation
,
466 enum ocfs2_vote_request type
)
468 struct ocfs2_vote_msg
*request
;
469 struct ocfs2_msg_hdr
*hdr
;
471 BUG_ON(!ocfs2_is_valid_vote_request(type
));
473 request
= kzalloc(sizeof(*request
), GFP_NOFS
);
477 hdr
= &request
->v_hdr
;
478 hdr
->h_node_num
= cpu_to_be32(osb
->node_num
);
479 hdr
->h_request
= cpu_to_be32(type
);
480 hdr
->h_blkno
= cpu_to_be64(blkno
);
481 hdr
->h_generation
= cpu_to_be32(generation
);
/* Obtains a fresh response id, stores it (big-endian) in the request
 * header and hands the request to ocfs2_broadcast_vote().  The local
 * response value starts out as -EBUSY.
 * NOTE(review): the rest of the broadcast call and the handling of its
 * result are missing from this listing. */
487 /* Complete the buildup of a new vote request and process the
488 * broadcast return value. */
489 static int ocfs2_do_request_vote(struct ocfs2_super
*osb
,
490 struct ocfs2_vote_msg
*request
,
491 struct ocfs2_net_response_cb
*callback
)
493 int status
, response
= -EBUSY
;
494 unsigned int response_id
;
495 struct ocfs2_msg_hdr
*hdr
;
497 response_id
= ocfs2_new_response_id(osb
);
499 hdr
= &request
->v_hdr
;
500 hdr
->h_response_id
= cpu_to_be32(response_id
);
502 status
= ocfs2_broadcast_vote(osb
, request
, response_id
, &response
,
/*
 * Broadcast a MOUNT vote.
 *
 * Builds an OCFS2_VOTE_REQ_MOUNT request (blkno 0, generation 0) and
 * keeps calling ocfs2_do_request_vote() while status is -EAGAIN.  Unless
 * the OCFS2_MOUNT_NOINTR mount option is set, a pending signal sets
 * status to -ERESTARTSYS.
 *
 * NOTE(review): this listing omits lines -- the status declaration and
 * its initial value, the arguments and consequence of the
 * ocfs2_node_map_is_only() check, and the cleanup/return path.
 */
515 int ocfs2_request_mount_vote(struct ocfs2_super
*osb
)
518 struct ocfs2_vote_msg
*request
= NULL
;
520 request
= ocfs2_new_vote_request(osb
, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT
);
527 while (status
== -EAGAIN
) {
528 if (!(osb
->s_mount_opt
& OCFS2_MOUNT_NOINTR
) &&
529 signal_pending(current
)) {
530 status
= -ERESTARTSYS
;
534 if (ocfs2_node_map_is_only(osb
, &osb
->mounted_map
,
540 status
= ocfs2_do_request_vote(osb
, request
, NULL
);
/*
 * Broadcast an UMOUNT vote.
 *
 * Builds an OCFS2_VOTE_REQ_UMOUNT request (blkno 0, generation 0) and
 * keeps calling ocfs2_do_request_vote() while status is -EAGAIN.  Unlike
 * the mount vote, pending signals are deliberately not checked.
 *
 * NOTE(review): this listing omits lines -- the status declaration and
 * its initial value, the arguments and consequence of the
 * ocfs2_node_map_is_only() check, and the cleanup/return path.
 */
548 int ocfs2_request_umount_vote(struct ocfs2_super
*osb
)
551 struct ocfs2_vote_msg
*request
= NULL
;
553 request
= ocfs2_new_vote_request(osb
, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT
);
560 while (status
== -EAGAIN
) {
561 /* Do not check signals on this vote... We really want
562 * this one to go all the way through. */
564 if (ocfs2_node_map_is_only(osb
, &osb
->mounted_map
,
570 status
= ocfs2_do_request_vote(osb
, request
, NULL
);
/* Linear search of osb->net_response_list for the wait context whose
 * n_response_id equals response_id.  The caller visible in this file
 * takes osb->net_response_lock before calling.
 * NOTE(review): the response_id parameter line, the declaration of the
 * list cursor p and the return statements are missing from this
 * listing. */
578 /* TODO: This should eventually be a hash table! */
579 static struct ocfs2_net_wait_ctxt
* __ocfs2_find_net_wait_ctxt(struct ocfs2_super
*osb
,
583 struct ocfs2_net_wait_ctxt
*w
= NULL
;
585 list_for_each(p
, &osb
->net_response_list
) {
586 w
= list_entry(p
, struct ocfs2_net_wait_ctxt
, n_list
);
587 if (response_id
== w
->n_response_id
)
/* NOTE(review): only the signature and the OCFS2_RESPONSE_OK and
 * OCFS2_RESPONSE_BUSY case labels survive in this listing; the values
 * returned for each case are not shown -- confirm against the full
 * source. */
595 /* Translate response codes into local node errno values */
596 static inline int ocfs2_translate_response(int response
)
601 case OCFS2_RESPONSE_OK
:
605 case OCFS2_RESPONSE_BUSY
:
/*
 * o2net handler registered for OCFS2_MESSAGE_TYPE_RESPONSE; @data is the
 * struct ocfs2_super passed at registration.
 *
 * Decodes the response id and sending node from the message header,
 * translates r_response with ocfs2_translate_response(), and looks up
 * the matching wait context under osb->net_response_lock.  The first
 * non-zero response status is recorded in w->n_response.  The context's
 * callback is invoked with the lock dropped, and the sending node is
 * then marked as responded with the lock held again.
 *
 * NOTE(review): this listing omits lines -- the len parameter, the
 * response_status declaration and the assignment of the translated
 * value, the conditions guarding the "request not found!" log and the
 * callback invocation, and the return statement.
 */
616 static int ocfs2_handle_response_message(struct o2net_msg
*msg
,
618 void *data
, void **ret_data
)
620 unsigned int response_id
, node_num
;
622 struct ocfs2_super
*osb
= data
;
623 struct ocfs2_response_msg
*resp
;
624 struct ocfs2_net_wait_ctxt
* w
;
625 struct ocfs2_net_response_cb
*resp_cb
;
627 resp
= (struct ocfs2_response_msg
*) msg
->buf
;
629 response_id
= be32_to_cpu(resp
->r_hdr
.h_response_id
);
630 node_num
= be32_to_cpu(resp
->r_hdr
.h_node_num
);
632 ocfs2_translate_response(be32_to_cpu(resp
->r_response
));
634 mlog(0, "received response message:\n");
635 mlog(0, "h_response_id = %u\n", response_id
);
636 mlog(0, "h_request = %u\n", be32_to_cpu(resp
->r_hdr
.h_request
));
637 mlog(0, "h_blkno = %llu\n",
638 (unsigned long long)be64_to_cpu(resp
->r_hdr
.h_blkno
));
639 mlog(0, "h_generation = %u\n", be32_to_cpu(resp
->r_hdr
.h_generation
));
640 mlog(0, "h_node_num = %u\n", node_num
);
641 mlog(0, "r_response = %d\n", response_status
);
643 spin_lock(&osb
->net_response_lock
);
644 w
= __ocfs2_find_net_wait_ctxt(osb
, response_id
);
646 mlog(0, "request not found!\n");
649 resp_cb
= w
->n_callback
;
651 if (response_status
&& (!w
->n_response
)) {
652 /* we only really need one negative response so don't
654 w
->n_response
= response_status
;
658 spin_unlock(&osb
->net_response_lock
);
660 resp_cb
->rc_cb(resp_cb
->rc_priv
, resp
);
662 spin_lock(&osb
->net_response_lock
);
665 __ocfs2_mark_node_responded(osb
, w
, node_num
);
667 spin_unlock(&osb
->net_response_lock
);
/*
 * o2net handler registered for OCFS2_MESSAGE_TYPE_VOTE; @data is the
 * struct ocfs2_super passed at registration.
 *
 * Allocates a struct ocfs2_vote_work (kmalloc, GFP_NOFS), copies the
 * vote message out of msg->buf into it, appends it to osb->vote_list
 * under osb->vote_task_lock and kicks the vote thread, which does the
 * actual processing.
 *
 * NOTE(review): this listing omits lines -- the len parameter, the
 * allocation-failure handling, one statement inside the locked section
 * and the return statement.
 */
672 static int ocfs2_handle_vote_message(struct o2net_msg
*msg
,
674 void *data
, void **ret_data
)
677 struct ocfs2_super
*osb
= data
;
678 struct ocfs2_vote_work
*work
;
680 work
= kmalloc(sizeof(struct ocfs2_vote_work
), GFP_NOFS
);
687 INIT_LIST_HEAD(&work
->w_list
);
688 memcpy(&work
->w_msg
, msg
->buf
, sizeof(struct ocfs2_vote_msg
));
690 mlog(0, "scheduling vote request:\n");
691 mlog(0, "h_response_id = %u\n",
692 be32_to_cpu(work
->w_msg
.v_hdr
.h_response_id
));
693 mlog(0, "h_request = %u\n", be32_to_cpu(work
->w_msg
.v_hdr
.h_request
));
694 mlog(0, "h_blkno = %llu\n",
695 (unsigned long long)be64_to_cpu(work
->w_msg
.v_hdr
.h_blkno
));
696 mlog(0, "h_generation = %u\n",
697 be32_to_cpu(work
->w_msg
.v_hdr
.h_generation
));
698 mlog(0, "h_node_num = %u\n",
699 be32_to_cpu(work
->w_msg
.v_hdr
.h_node_num
));
701 spin_lock(&osb
->vote_task_lock
);
702 list_add_tail(&work
->w_list
, &osb
->vote_list
);
704 spin_unlock(&osb
->vote_task_lock
);
706 ocfs2_kick_vote_thread(osb
);
/* Unregisters every handler on osb->osb_net_handlers and logs an error
 * if osb->net_response_list is still non-empty afterwards.
 * NOTE(review): lines before and after the visible statements are
 * missing from this listing. */
713 void ocfs2_unregister_net_handlers(struct ocfs2_super
*osb
)
718 o2net_unregister_handler_list(&osb
->osb_net_handlers
);
720 if (!list_empty(&osb
->net_response_list
))
721 mlog(ML_ERROR
, "net response list not empty!\n");
/*
 * Register the o2net handlers for this file's two message types:
 * ocfs2_handle_response_message() for OCFS2_MESSAGE_TYPE_RESPONSE and
 * ocfs2_handle_vote_message() for OCFS2_MESSAGE_TYPE_VOTE.  Both are
 * registered with osb as their data pointer and recorded on
 * osb->osb_net_handlers.
 *
 * NOTE(review): this listing omits lines -- what happens when
 * ocfs2_mount_local() is true, the status checks after each
 * registration, the condition under which
 * ocfs2_unregister_net_handlers() is called (presumably on failure) and
 * the return statement.
 */
726 int ocfs2_register_net_handlers(struct ocfs2_super
*osb
)
730 if (ocfs2_mount_local(osb
))
733 status
= o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE
,
735 sizeof(struct ocfs2_response_msg
),
736 ocfs2_handle_response_message
,
737 osb
, NULL
, &osb
->osb_net_handlers
);
743 status
= o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE
,
745 sizeof(struct ocfs2_vote_msg
),
746 ocfs2_handle_vote_message
,
747 osb
, NULL
, &osb
->osb_net_handlers
);
754 ocfs2_unregister_net_handlers(osb
);