staging/lustre/ptlrpc: Fix Multiple Assignments
[deliverable/linux.git] / drivers / staging / lustre / lustre / ptlrpc / import.c
CommitLineData
d7e09d03
PT
1/*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26/*
27 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
1dc563a6 30 * Copyright (c) 2011, 2015, Intel Corporation.
d7e09d03
PT
31 */
32/*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 *
36 * lustre/ptlrpc/import.c
37 *
38 * Author: Mike Shaver <shaver@clusterfs.com>
39 */
40
41#define DEBUG_SUBSYSTEM S_RPC
42
e27db149
GKH
43#include "../include/obd_support.h"
44#include "../include/lustre_ha.h"
45#include "../include/lustre_net.h"
46#include "../include/lustre_import.h"
47#include "../include/lustre_export.h"
48#include "../include/obd.h"
49#include "../include/obd_cksum.h"
50#include "../include/obd_class.h"
d7e09d03
PT
51
52#include "ptlrpc_internal.h"
53
54struct ptlrpc_connect_async_args {
55 __u64 pcaa_peer_committed;
56 int pcaa_initial_connect;
57};
58
59/**
60 * Updates import \a imp current state to provided \a state value
61 * Helper function. Must be called under imp_lock.
62 */
63static void __import_set_state(struct obd_import *imp,
64 enum lustre_imp_state state)
65{
502cb58e
AS
66 switch (state) {
67 case LUSTRE_IMP_CLOSED:
68 case LUSTRE_IMP_NEW:
69 case LUSTRE_IMP_DISCON:
70 case LUSTRE_IMP_CONNECTING:
71 break;
72 case LUSTRE_IMP_REPLAY_WAIT:
73 imp->imp_replay_state = LUSTRE_IMP_REPLAY_LOCKS;
74 break;
75 default:
76 imp->imp_replay_state = LUSTRE_IMP_REPLAY;
77 }
78
d7e09d03
PT
79 imp->imp_state = state;
80 imp->imp_state_hist[imp->imp_state_hist_idx].ish_state = state;
81 imp->imp_state_hist[imp->imp_state_hist_idx].ish_time =
74e489aa 82 ktime_get_real_seconds();
d7e09d03
PT
83 imp->imp_state_hist_idx = (imp->imp_state_hist_idx + 1) %
84 IMP_STATE_HIST_LEN;
85}
86
/* A CLOSED import should remain so. */
#define IMPORT_SET_STATE_NOLOCK(imp, state)				      \
do {									      \
	if (imp->imp_state != LUSTRE_IMP_CLOSED) {			      \
		CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", \
		       imp, obd2cli_tgt(imp->imp_obd),			      \
		       ptlrpc_import_state_name(imp->imp_state),	      \
		       ptlrpc_import_state_name(state));		      \
		__import_set_state(imp, state);				      \
	}								      \
} while (0)

/* Same as IMPORT_SET_STATE_NOLOCK but takes imp_lock itself. */
#define IMPORT_SET_STATE(imp, state)					      \
do {									      \
	spin_lock(&imp->imp_lock);					      \
	IMPORT_SET_STATE_NOLOCK(imp, state);				      \
	spin_unlock(&imp->imp_lock);					      \
} while (0)
d7e09d03 105
d7e09d03
PT
106static int ptlrpc_connect_interpret(const struct lu_env *env,
107 struct ptlrpc_request *request,
aff9d8e8 108 void *data, int rc);
d7e09d03
PT
109int ptlrpc_import_recovery_state_machine(struct obd_import *imp);
110
111/* Only this function is allowed to change the import state when it is
112 * CLOSED. I would rather refcount the import and free it after
113 * disconnection like we do with exports. To do that, the client_obd
114 * will need to save the peer info somewhere other than in the import,
dadfcdab
OD
115 * though.
116 */
d7e09d03
PT
117int ptlrpc_init_import(struct obd_import *imp)
118{
119 spin_lock(&imp->imp_lock);
120
121 imp->imp_generation++;
d0bfef31 122 imp->imp_state = LUSTRE_IMP_NEW;
d7e09d03
PT
123
124 spin_unlock(&imp->imp_lock);
125
126 return 0;
127}
128EXPORT_SYMBOL(ptlrpc_init_import);
129
#define UUID_STR "_UUID"
/* Strip an optional \a prefix and a trailing "_UUID" suffix from \a uuid,
 * returning the interesting middle portion via \a uuid_start / \a uuid_len.
 * The string itself is never modified; only the pointer/length are adjusted.
 */
static void deuuidify(char *uuid, const char *prefix, char **uuid_start,
		      int *uuid_len)
{
	/* Skip the prefix only when it actually matches the front of uuid. */
	*uuid_start = !prefix || strncmp(uuid, prefix, strlen(prefix))
		? uuid : uuid + strlen(prefix);

	*uuid_len = strlen(*uuid_start);

	if (*uuid_len < strlen(UUID_STR))
		return;

	/* Drop a trailing "_UUID" from the reported length, if present. */
	if (!strncmp(*uuid_start + *uuid_len - strlen(UUID_STR),
		     UUID_STR, strlen(UUID_STR)))
		*uuid_len -= strlen(UUID_STR);
}
d7e09d03
PT
146
147/**
148 * Returns true if import was FULL, false if import was already not
149 * connected.
150 * @imp - import to be disconnected
151 * @conn_cnt - connection count (epoch) of the request that timed out
152 * and caused the disconnection. In some cases, multiple
153 * inflight requests can fail to a single target (e.g. OST
154 * bulk requests) and if one has already caused a reconnection
155 * (increasing the import->conn_cnt) the older failure should
156 * not also cause a reconnection. If zero it forces a reconnect.
157 */
158int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt)
159{
160 int rc = 0;
161
162 spin_lock(&imp->imp_lock);
163
164 if (imp->imp_state == LUSTRE_IMP_FULL &&
165 (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) {
166 char *target_start;
167 int target_len;
168
169 deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
170 &target_start, &target_len);
171
172 if (imp->imp_replayable) {
2d00bd17
JP
173 LCONSOLE_WARN("%s: Connection to %.*s (at %s) was lost; in progress operations using this service will wait for recovery to complete\n",
174 imp->imp_obd->obd_name, target_len, target_start,
175 libcfs_nid2str(imp->imp_connection->c_peer.nid));
d7e09d03 176 } else {
2d00bd17
JP
177 LCONSOLE_ERROR_MSG(0x166, "%s: Connection to %.*s (at %s) was lost; in progress operations using this service will fail\n",
178 imp->imp_obd->obd_name,
179 target_len, target_start,
180 libcfs_nid2str(imp->imp_connection->c_peer.nid));
d7e09d03 181 }
d7e09d03
PT
182 IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
183 spin_unlock(&imp->imp_lock);
184
185 if (obd_dump_on_timeout)
186 libcfs_debug_dumplog();
187
188 obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON);
189 rc = 1;
190 } else {
191 spin_unlock(&imp->imp_lock);
192 CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n",
193 imp->imp_client->cli_name, imp,
194 (imp->imp_state == LUSTRE_IMP_FULL &&
195 imp->imp_conn_cnt > conn_cnt) ?
196 "reconnected" : "not connected", imp->imp_conn_cnt,
197 conn_cnt, ptlrpc_import_state_name(imp->imp_state));
198 }
199
200 return rc;
201}
202
441fda84
ML
203/*
204 * This acts as a barrier; all existing requests are rejected, and
205 * no new requests will be accepted until the import is valid again.
206 */
207void ptlrpc_deactivate_import(struct obd_import *imp)
d7e09d03 208{
d7e09d03 209 CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd));
441fda84
ML
210
211 spin_lock(&imp->imp_lock);
d7e09d03
PT
212 imp->imp_invalid = 1;
213 imp->imp_generation++;
214 spin_unlock(&imp->imp_lock);
215
216 ptlrpc_abort_inflight(imp);
217 obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE);
d7e09d03 218}
d7e09d03
PT
219EXPORT_SYMBOL(ptlrpc_deactivate_import);
220
221static unsigned int
219e6de6 222ptlrpc_inflight_deadline(struct ptlrpc_request *req, time64_t now)
d7e09d03
PT
223{
224 long dl;
225
226 if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) ||
227 (req->rq_phase == RQ_PHASE_BULK) ||
228 (req->rq_phase == RQ_PHASE_NEW)))
229 return 0;
230
231 if (req->rq_timedout)
232 return 0;
233
234 if (req->rq_phase == RQ_PHASE_NEW)
235 dl = req->rq_sent;
236 else
237 dl = req->rq_deadline;
238
239 if (dl <= now)
240 return 0;
241
242 return dl - now;
243}
244
245static unsigned int ptlrpc_inflight_timeout(struct obd_import *imp)
246{
219e6de6 247 time64_t now = ktime_get_real_seconds();
d7e09d03
PT
248 struct list_head *tmp, *n;
249 struct ptlrpc_request *req;
250 unsigned int timeout = 0;
251
252 spin_lock(&imp->imp_lock);
253 list_for_each_safe(tmp, n, &imp->imp_sending_list) {
254 req = list_entry(tmp, struct ptlrpc_request, rq_list);
255 timeout = max(ptlrpc_inflight_deadline(req, now), timeout);
256 }
257 spin_unlock(&imp->imp_lock);
258 return timeout;
259}
260
261/**
262 * This function will invalidate the import, if necessary, then block
263 * for all the RPC completions, and finally notify the obd to
264 * invalidate its state (ie cancel locks, clear pending requests,
265 * etc).
266 */
267void ptlrpc_invalidate_import(struct obd_import *imp)
268{
269 struct list_head *tmp, *n;
270 struct ptlrpc_request *req;
271 struct l_wait_info lwi;
272 unsigned int timeout;
273 int rc;
274
275 atomic_inc(&imp->imp_inval_count);
276
277 if (!imp->imp_invalid || imp->imp_obd->obd_no_recov)
278 ptlrpc_deactivate_import(imp);
279
cca8fca1 280 CFS_FAIL_TIMEOUT(OBD_FAIL_MGS_CONNECT_NET, 3 * cfs_fail_val / 2);
d7e09d03
PT
281 LASSERT(imp->imp_invalid);
282
283 /* Wait forever until inflight == 0. We really can't do it another
284 * way because in some cases we need to wait for very long reply
285 * unlink. We can't do anything before that because there is really
dadfcdab
OD
286 * no guarantee that some rdma transfer is not in progress right now.
287 */
d7e09d03
PT
288 do {
289 /* Calculate max timeout for waiting on rpcs to error
290 * out. Use obd_timeout if calculated value is smaller
dadfcdab
OD
291 * than it.
292 */
d7e09d03
PT
293 if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
294 timeout = ptlrpc_inflight_timeout(imp);
295 timeout += timeout / 3;
296
297 if (timeout == 0)
298 timeout = obd_timeout;
299 } else {
300 /* decrease the interval to increase race condition */
301 timeout = 1;
302 }
303
1d8cb70c
GD
304 CDEBUG(D_RPCTRACE,
305 "Sleeping %d sec for inflight to error out\n",
d7e09d03
PT
306 timeout);
307
308 /* Wait for all requests to error out and call completion
309 * callbacks. Cap it at obd_timeout -- these should all
dadfcdab
OD
310 * have been locally cancelled by ptlrpc_abort_inflight.
311 */
d7e09d03
PT
312 lwi = LWI_TIMEOUT_INTERVAL(
313 cfs_timeout_cap(cfs_time_seconds(timeout)),
314 (timeout > 1)?cfs_time_seconds(1):cfs_time_seconds(1)/2,
315 NULL, NULL);
316 rc = l_wait_event(imp->imp_recovery_waitq,
317 (atomic_read(&imp->imp_inflight) == 0),
318 &lwi);
319 if (rc) {
320 const char *cli_tgt = obd2cli_tgt(imp->imp_obd);
321
322 CERROR("%s: rc = %d waiting for callback (%d != 0)\n",
323 cli_tgt, rc,
324 atomic_read(&imp->imp_inflight));
325
326 spin_lock(&imp->imp_lock);
327 if (atomic_read(&imp->imp_inflight) == 0) {
328 int count = atomic_read(&imp->imp_unregistering);
329
330 /* We know that "unregistering" rpcs only can
331 * survive in sending or delaying lists (they
332 * maybe waiting for long reply unlink in
333 * sluggish nets). Let's check this. If there
334 * is no inflight and unregistering != 0, this
dadfcdab
OD
335 * is bug.
336 */
2d00bd17
JP
337 LASSERTF(count == 0, "Some RPCs are still unregistering: %d\n",
338 count);
d7e09d03
PT
339
340 /* Let's save one loop as soon as inflight have
341 * dropped to zero. No new inflights possible at
dadfcdab
OD
342 * this point.
343 */
d7e09d03
PT
344 rc = 0;
345 } else {
346 list_for_each_safe(tmp, n,
30c0aa39 347 &imp->imp_sending_list) {
d7e09d03 348 req = list_entry(tmp,
30c0aa39
OD
349 struct ptlrpc_request,
350 rq_list);
d7e09d03
PT
351 DEBUG_REQ(D_ERROR, req,
352 "still on sending list");
353 }
354 list_for_each_safe(tmp, n,
30c0aa39 355 &imp->imp_delayed_list) {
d7e09d03 356 req = list_entry(tmp,
30c0aa39
OD
357 struct ptlrpc_request,
358 rq_list);
d7e09d03
PT
359 DEBUG_REQ(D_ERROR, req,
360 "still on delayed list");
361 }
362
2d00bd17
JP
363 CERROR("%s: RPCs in \"%s\" phase found (%d). Network is sluggish? Waiting them to error out.\n",
364 cli_tgt,
d7e09d03
PT
365 ptlrpc_phase2str(RQ_PHASE_UNREGISTERING),
366 atomic_read(&imp->
2d00bd17 367 imp_unregistering));
d7e09d03
PT
368 }
369 spin_unlock(&imp->imp_lock);
d0bfef31 370 }
d7e09d03
PT
371 } while (rc != 0);
372
373 /*
374 * Let's additionally check that no new rpcs added to import in
375 * "invalidate" state.
376 */
377 LASSERT(atomic_read(&imp->imp_inflight) == 0);
378 obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE);
379 sptlrpc_import_flush_all_ctx(imp);
380
381 atomic_dec(&imp->imp_inval_count);
382 wake_up_all(&imp->imp_recovery_waitq);
383}
384EXPORT_SYMBOL(ptlrpc_invalidate_import);
385
386/* unset imp_invalid */
387void ptlrpc_activate_import(struct obd_import *imp)
388{
389 struct obd_device *obd = imp->imp_obd;
390
391 spin_lock(&imp->imp_lock);
0b291b9a
HZ
392 if (imp->imp_deactive != 0) {
393 spin_unlock(&imp->imp_lock);
394 return;
395 }
396
d7e09d03 397 imp->imp_invalid = 0;
d7e09d03
PT
398 spin_unlock(&imp->imp_lock);
399 obd_import_event(obd, imp, IMP_EVENT_ACTIVE);
400}
401EXPORT_SYMBOL(ptlrpc_activate_import);
402
cca8fca1
AS
403static void ptlrpc_pinger_force(struct obd_import *imp)
404{
405 CDEBUG(D_HA, "%s: waking up pinger s:%s\n", obd2cli_tgt(imp->imp_obd),
406 ptlrpc_import_state_name(imp->imp_state));
407
408 spin_lock(&imp->imp_lock);
409 imp->imp_force_verify = 1;
410 spin_unlock(&imp->imp_lock);
411
412 if (imp->imp_state != LUSTRE_IMP_CONNECTING)
413 ptlrpc_pinger_wake_up();
414}
415
d7e09d03
PT
416void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt)
417{
d7e09d03
PT
418 LASSERT(!imp->imp_dlm_fake);
419
420 if (ptlrpc_set_import_discon(imp, conn_cnt)) {
421 if (!imp->imp_replayable) {
2d00bd17 422 CDEBUG(D_HA, "import %s@%s for %s not replayable, auto-deactivating\n",
d7e09d03
PT
423 obd2cli_tgt(imp->imp_obd),
424 imp->imp_connection->c_remote_uuid.uuid,
425 imp->imp_obd->obd_name);
426 ptlrpc_deactivate_import(imp);
427 }
428
cca8fca1 429 ptlrpc_pinger_force(imp);
d7e09d03 430 }
d7e09d03
PT
431}
432EXPORT_SYMBOL(ptlrpc_fail_import);
433
434int ptlrpc_reconnect_import(struct obd_import *imp)
435{
cca8fca1
AS
436 struct l_wait_info lwi;
437 int secs = cfs_time_seconds(obd_timeout);
438 int rc;
439
440 ptlrpc_pinger_force(imp);
441
442 CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n",
443 obd2cli_tgt(imp->imp_obd), secs);
444
445 lwi = LWI_TIMEOUT(secs, NULL, NULL);
446 rc = l_wait_event(imp->imp_recovery_waitq,
447 !ptlrpc_import_in_recovery(imp), &lwi);
448 CDEBUG(D_HA, "%s: recovery finished s:%s\n", obd2cli_tgt(imp->imp_obd),
449 ptlrpc_import_state_name(imp->imp_state));
450 return rc;
d7e09d03
PT
451}
452EXPORT_SYMBOL(ptlrpc_reconnect_import);
453
454/**
455 * Connection on import \a imp is changed to another one (if more than one is
456 * present). We typically chose connection that we have not tried to connect to
457 * the longest
458 */
459static int import_select_connection(struct obd_import *imp)
460{
461 struct obd_import_conn *imp_conn = NULL, *conn;
462 struct obd_export *dlmexp;
463 char *target_start;
464 int target_len, tried_all = 1;
d7e09d03
PT
465
466 spin_lock(&imp->imp_lock);
467
468 if (list_empty(&imp->imp_conn_list)) {
469 CERROR("%s: no connections available\n",
470 imp->imp_obd->obd_name);
471 spin_unlock(&imp->imp_lock);
0a3bdb00 472 return -EINVAL;
d7e09d03
PT
473 }
474
475 list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
b0f5aad5 476 CDEBUG(D_HA, "%s: connect to NID %s last attempt %llu\n",
d7e09d03
PT
477 imp->imp_obd->obd_name,
478 libcfs_nid2str(conn->oic_conn->c_peer.nid),
479 conn->oic_last_attempt);
480
481 /* If we have not tried this connection since
dadfcdab
OD
482 * the last successful attempt, go with this one
483 */
d7e09d03
PT
484 if ((conn->oic_last_attempt == 0) ||
485 cfs_time_beforeq_64(conn->oic_last_attempt,
30c0aa39 486 imp->imp_last_success_conn)) {
d7e09d03
PT
487 imp_conn = conn;
488 tried_all = 0;
489 break;
490 }
491
492 /* If all of the connections have already been tried
dadfcdab
OD
493 * since the last successful connection; just choose the
494 * least recently used
495 */
d7e09d03
PT
496 if (!imp_conn)
497 imp_conn = conn;
498 else if (cfs_time_before_64(conn->oic_last_attempt,
499 imp_conn->oic_last_attempt))
500 imp_conn = conn;
501 }
502
503 /* if not found, simply choose the current one */
504 if (!imp_conn || imp->imp_force_reconnect) {
505 LASSERT(imp->imp_conn_current);
506 imp_conn = imp->imp_conn_current;
507 tried_all = 0;
508 }
509 LASSERT(imp_conn->oic_conn);
510
511 /* If we've tried everything, and we're back to the beginning of the
dadfcdab
OD
512 * list, increase our timeout and try again. It will be reset when
513 * we do finally connect. (FIXME: really we should wait for all network
514 * state associated with the last connection attempt to drain before
515 * trying to reconnect on it.)
516 */
d7e09d03
PT
517 if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
518 struct adaptive_timeout *at = &imp->imp_at.iat_net_latency;
50ffcb7e 519
d7e09d03
PT
520 if (at_get(at) < CONNECTION_SWITCH_MAX) {
521 at_measured(at, at_get(at) + CONNECTION_SWITCH_INC);
522 if (at_get(at) > CONNECTION_SWITCH_MAX)
523 at_reset(at, CONNECTION_SWITCH_MAX);
524 }
525 LASSERT(imp_conn->oic_last_attempt);
2d00bd17
JP
526 CDEBUG(D_HA, "%s: tried all connections, increasing latency to %ds\n",
527 imp->imp_obd->obd_name, at_get(at));
d7e09d03
PT
528 }
529
530 imp_conn->oic_last_attempt = cfs_time_current_64();
531
532 /* switch connection, don't mind if it's same as the current one */
a5cb8880 533 ptlrpc_connection_put(imp->imp_connection);
d7e09d03
PT
534 imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
535
d0bfef31 536 dlmexp = class_conn2export(&imp->imp_dlm_handle);
a5cb8880 537 ptlrpc_connection_put(dlmexp->exp_connection);
d7e09d03
PT
538 dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
539 class_export_put(dlmexp);
540
541 if (imp->imp_conn_current != imp_conn) {
542 if (imp->imp_conn_current) {
543 deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
544 &target_start, &target_len);
545
2d00bd17 546 CDEBUG(D_HA, "%s: Connection changing to %.*s (at %s)\n",
d7e09d03
PT
547 imp->imp_obd->obd_name,
548 target_len, target_start,
549 libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
550 }
551
552 imp->imp_conn_current = imp_conn;
553 }
554
555 CDEBUG(D_HA, "%s: import %p using connection %s/%s\n",
556 imp->imp_obd->obd_name, imp, imp_conn->oic_uuid.uuid,
557 libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
558
559 spin_unlock(&imp->imp_lock);
560
0a3bdb00 561 return 0;
d7e09d03
PT
562}
563
564/*
565 * must be called under imp_lock
566 */
567static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno)
568{
569 struct ptlrpc_request *req;
570 struct list_head *tmp;
571
63d42578 572 /* The requests in committed_list always have smaller transnos than
dadfcdab
OD
573 * the requests in replay_list
574 */
63d42578
HZ
575 if (!list_empty(&imp->imp_committed_list)) {
576 tmp = imp->imp_committed_list.next;
577 req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
578 *transno = req->rq_transno;
579 if (req->rq_transno == 0) {
580 DEBUG_REQ(D_ERROR, req,
581 "zero transno in committed_list");
582 LBUG();
583 }
584 return 1;
d7e09d03 585 }
63d42578
HZ
586 if (!list_empty(&imp->imp_replay_list)) {
587 tmp = imp->imp_replay_list.next;
588 req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
589 *transno = req->rq_transno;
590 if (req->rq_transno == 0) {
591 DEBUG_REQ(D_ERROR, req, "zero transno in replay_list");
592 LBUG();
593 }
594 return 1;
595 }
596 return 0;
d7e09d03
PT
597}
598
599/**
600 * Attempt to (re)connect import \a imp. This includes all preparations,
601 * initializing CONNECT RPC request and passing it to ptlrpcd for
602 * actual sending.
603 * Returns 0 on success or error code.
604 */
605int ptlrpc_connect_import(struct obd_import *imp)
606{
607 struct obd_device *obd = imp->imp_obd;
608 int initial_connect = 0;
609 int set_transno = 0;
610 __u64 committed_before_reconnect = 0;
611 struct ptlrpc_request *request;
612 char *bufs[] = { NULL,
613 obd2cli_tgt(imp->imp_obd),
614 obd->obd_uuid.uuid,
615 (char *)&imp->imp_dlm_handle,
616 (char *)&imp->imp_connect_data };
617 struct ptlrpc_connect_async_args *aa;
618 int rc;
d7e09d03
PT
619
620 spin_lock(&imp->imp_lock);
621 if (imp->imp_state == LUSTRE_IMP_CLOSED) {
622 spin_unlock(&imp->imp_lock);
623 CERROR("can't connect to a closed import\n");
0a3bdb00 624 return -EINVAL;
d7e09d03
PT
625 } else if (imp->imp_state == LUSTRE_IMP_FULL) {
626 spin_unlock(&imp->imp_lock);
627 CERROR("already connected\n");
0a3bdb00 628 return 0;
d7e09d03
PT
629 } else if (imp->imp_state == LUSTRE_IMP_CONNECTING) {
630 spin_unlock(&imp->imp_lock);
631 CERROR("already connecting\n");
0a3bdb00 632 return -EALREADY;
d7e09d03
PT
633 }
634
635 IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING);
636
637 imp->imp_conn_cnt++;
638 imp->imp_resend_replay = 0;
639
640 if (!lustre_handle_is_used(&imp->imp_remote_handle))
641 initial_connect = 1;
642 else
643 committed_before_reconnect = imp->imp_peer_committed_transno;
644
645 set_transno = ptlrpc_first_transno(imp,
646 &imp->imp_connect_data.ocd_transno);
647 spin_unlock(&imp->imp_lock);
648
649 rc = import_select_connection(imp);
650 if (rc)
a9b3e8f3 651 goto out;
d7e09d03 652
5bcfab13 653 rc = sptlrpc_import_sec_adapt(imp, NULL, NULL);
d7e09d03 654 if (rc)
a9b3e8f3 655 goto out;
d7e09d03
PT
656
657 /* Reset connect flags to the originally requested flags, in case
dadfcdab
OD
658 * the server is updated on-the-fly we will get the new features.
659 */
d7e09d03
PT
660 imp->imp_connect_data.ocd_connect_flags = imp->imp_connect_flags_orig;
661 /* Reset ocd_version each time so the server knows the exact versions */
662 imp->imp_connect_data.ocd_version = LUSTRE_VERSION_CODE;
663 imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
664 imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;
665
666 rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd,
667 &obd->obd_uuid, &imp->imp_connect_data, NULL);
668 if (rc)
a9b3e8f3 669 goto out;
d7e09d03
PT
670
671 request = ptlrpc_request_alloc(imp, &RQF_MDS_CONNECT);
8b382089 672 if (!request) {
a9b3e8f3
JL
673 rc = -ENOMEM;
674 goto out;
675 }
d7e09d03
PT
676
677 rc = ptlrpc_request_bufs_pack(request, LUSTRE_OBD_VERSION,
678 imp->imp_connect_op, bufs, NULL);
679 if (rc) {
680 ptlrpc_request_free(request);
a9b3e8f3 681 goto out;
d7e09d03
PT
682 }
683
684 /* Report the rpc service time to the server so that it knows how long
dadfcdab
OD
685 * to wait for clients to join recovery
686 */
d7e09d03
PT
687 lustre_msg_set_service_time(request->rq_reqmsg,
688 at_timeout2est(request->rq_timeout));
689
690 /* The amount of time we give the server to process the connect req.
691 * import_select_connection will increase the net latency on
692 * repeated reconnect attempts to cover slow networks.
693 * We override/ignore the server rpc completion estimate here,
dadfcdab
OD
694 * which may be large if this is a reconnect attempt
695 */
d7e09d03
PT
696 request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
697 lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout);
698
699 lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_NEXT_VER);
700
3d2b8f57
NC
701 request->rq_no_resend = 1;
702 request->rq_no_delay = 1;
d7e09d03
PT
703 request->rq_send_state = LUSTRE_IMP_CONNECTING;
704 /* Allow a slightly larger reply for future growth compatibility */
705 req_capsule_set_size(&request->rq_pill, &RMF_CONNECT_DATA, RCL_SERVER,
706 sizeof(struct obd_connect_data)+16*sizeof(__u64));
707 ptlrpc_request_set_replen(request);
708 request->rq_interpret_reply = ptlrpc_connect_interpret;
709
3949015e 710 CLASSERT(sizeof(*aa) <= sizeof(request->rq_async_args));
d7e09d03 711 aa = ptlrpc_req_async_args(request);
ec83e611 712 memset(aa, 0, sizeof(*aa));
d7e09d03
PT
713
714 aa->pcaa_peer_committed = committed_before_reconnect;
715 aa->pcaa_initial_connect = initial_connect;
716
717 if (aa->pcaa_initial_connect) {
718 spin_lock(&imp->imp_lock);
719 imp->imp_replayable = 1;
720 spin_unlock(&imp->imp_lock);
721 lustre_msg_add_op_flags(request->rq_reqmsg,
722 MSG_CONNECT_INITIAL);
723 }
724
725 if (set_transno)
726 lustre_msg_add_op_flags(request->rq_reqmsg,
727 MSG_CONNECT_TRANSNO);
728
729 DEBUG_REQ(D_RPCTRACE, request, "(re)connect request (timeout %d)",
730 request->rq_timeout);
c5c4c6fa 731 ptlrpcd_add_req(request);
d7e09d03
PT
732 rc = 0;
733out:
c5c4c6fa 734 if (rc != 0)
d7e09d03 735 IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
d7e09d03 736
0a3bdb00 737 return rc;
d7e09d03
PT
738}
739EXPORT_SYMBOL(ptlrpc_connect_import);
740
741static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp)
742{
743 int force_verify;
744
745 spin_lock(&imp->imp_lock);
746 force_verify = imp->imp_force_verify != 0;
747 spin_unlock(&imp->imp_lock);
748
749 if (force_verify)
750 ptlrpc_pinger_wake_up();
751}
752
/* A connect failure of -EBUSY/-EAGAIN means the export is busy; retry the
 * same target instead of selecting a new connection.
 */
static int ptlrpc_busy_reconnect(int rc)
{
	return (rc == -EBUSY) || (rc == -EAGAIN);
}
757
758/**
759 * interpret_reply callback for connect RPCs.
760 * Looks into returned status of connect operation and decides
761 * what to do with the import - i.e enter recovery, promote it to
762 * full state for normal operations of disconnect it due to an error.
763 */
764static int ptlrpc_connect_interpret(const struct lu_env *env,
765 struct ptlrpc_request *request,
766 void *data, int rc)
767{
768 struct ptlrpc_connect_async_args *aa = data;
769 struct obd_import *imp = request->rq_import;
770 struct client_obd *cli = &imp->imp_obd->u.cli;
771 struct lustre_handle old_hdl;
772 __u64 old_connect_flags;
773 int msg_flags;
774 struct obd_connect_data *ocd;
775 struct obd_export *exp;
776 int ret;
d7e09d03
PT
777
778 spin_lock(&imp->imp_lock);
779 if (imp->imp_state == LUSTRE_IMP_CLOSED) {
780 imp->imp_connect_tried = 1;
781 spin_unlock(&imp->imp_lock);
0a3bdb00 782 return 0;
d7e09d03
PT
783 }
784
785 if (rc) {
786 /* if this reconnect to busy export - not need select new target
dadfcdab
OD
787 * for connecting
788 */
d7e09d03
PT
789 imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc);
790 spin_unlock(&imp->imp_lock);
791 ptlrpc_maybe_ping_import_soon(imp);
a9b3e8f3 792 goto out;
d7e09d03
PT
793 }
794 spin_unlock(&imp->imp_lock);
795
796 LASSERT(imp->imp_conn_current);
797
798 msg_flags = lustre_msg_get_op_flags(request->rq_repmsg);
799
800 ret = req_capsule_get_size(&request->rq_pill, &RMF_CONNECT_DATA,
801 RCL_SERVER);
802 /* server replied obd_connect_data is always bigger */
803 ocd = req_capsule_server_sized_get(&request->rq_pill,
804 &RMF_CONNECT_DATA, ret);
805
8b382089 806 if (!ocd) {
d7e09d03
PT
807 CERROR("%s: no connect data from server\n",
808 imp->imp_obd->obd_name);
809 rc = -EPROTO;
a9b3e8f3 810 goto out;
d7e09d03
PT
811 }
812
813 spin_lock(&imp->imp_lock);
814
815 /* All imports are pingable */
816 imp->imp_pingable = 1;
817 imp->imp_force_reconnect = 0;
818 imp->imp_force_verify = 0;
819
820 imp->imp_connect_data = *ocd;
821
822 CDEBUG(D_HA, "%s: connect to target with instance %u\n",
823 imp->imp_obd->obd_name, ocd->ocd_instance);
824 exp = class_conn2export(&imp->imp_dlm_handle);
825
826 spin_unlock(&imp->imp_lock);
827
828 /* check that server granted subset of flags we asked for. */
829 if ((ocd->ocd_connect_flags & imp->imp_connect_flags_orig) !=
830 ocd->ocd_connect_flags) {
55f5a824 831 CERROR("%s: Server didn't granted asked subset of flags: asked=%#llx grranted=%#llx\n",
1d8cb70c 832 imp->imp_obd->obd_name, imp->imp_connect_flags_orig,
d7e09d03 833 ocd->ocd_connect_flags);
a9b3e8f3
JL
834 rc = -EPROTO;
835 goto out;
d7e09d03
PT
836 }
837
838 if (!exp) {
839 /* This could happen if export is cleaned during the
dadfcdab
OD
840 * connect attempt
841 */
d7e09d03
PT
842 CERROR("%s: missing export after connect\n",
843 imp->imp_obd->obd_name);
a9b3e8f3
JL
844 rc = -ENODEV;
845 goto out;
d7e09d03
PT
846 }
847 old_connect_flags = exp_connect_flags(exp);
848 exp->exp_connect_data = *ocd;
849 imp->imp_obd->obd_self_export->exp_connect_data = *ocd;
850 class_export_put(exp);
851
852 obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD);
853
854 if (aa->pcaa_initial_connect) {
855 spin_lock(&imp->imp_lock);
856 if (msg_flags & MSG_CONNECT_REPLAYABLE) {
857 imp->imp_replayable = 1;
858 spin_unlock(&imp->imp_lock);
859 CDEBUG(D_HA, "connected to replayable target: %s\n",
860 obd2cli_tgt(imp->imp_obd));
861 } else {
862 imp->imp_replayable = 0;
863 spin_unlock(&imp->imp_lock);
864 }
865
866 /* if applies, adjust the imp->imp_msg_magic here
dadfcdab
OD
867 * according to reply flags
868 */
d7e09d03
PT
869
870 imp->imp_remote_handle =
871 *lustre_msg_get_handle(request->rq_repmsg);
872
873 /* Initial connects are allowed for clients with non-random
874 * uuids when servers are in recovery. Simply signal the
dadfcdab
OD
875 * servers replay is complete and wait in REPLAY_WAIT.
876 */
d7e09d03
PT
877 if (msg_flags & MSG_CONNECT_RECOVERING) {
878 CDEBUG(D_HA, "connect to %s during recovery\n",
879 obd2cli_tgt(imp->imp_obd));
880 IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
881 } else {
882 IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
883 ptlrpc_activate_import(imp);
884 }
885
a9b3e8f3
JL
886 rc = 0;
887 goto finish;
d7e09d03
PT
888 }
889
890 /* Determine what recovery state to move the import to. */
2b241d31 891 if (msg_flags & MSG_CONNECT_RECONNECT) {
d7e09d03
PT
892 memset(&old_hdl, 0, sizeof(old_hdl));
893 if (!memcmp(&old_hdl, lustre_msg_get_handle(request->rq_repmsg),
3949015e 894 sizeof(old_hdl))) {
55f5a824 895 LCONSOLE_WARN("Reconnect to %s (at @%s) failed due bad handle %#llx\n",
d7e09d03
PT
896 obd2cli_tgt(imp->imp_obd),
897 imp->imp_connection->c_remote_uuid.uuid,
898 imp->imp_dlm_handle.cookie);
a9b3e8f3
JL
899 rc = -ENOTCONN;
900 goto out;
d7e09d03
PT
901 }
902
903 if (memcmp(&imp->imp_remote_handle,
904 lustre_msg_get_handle(request->rq_repmsg),
905 sizeof(imp->imp_remote_handle))) {
906 int level = msg_flags & MSG_CONNECT_RECOVERING ?
907 D_HA : D_WARNING;
908
909 /* Bug 16611/14775: if server handle have changed,
910 * that means some sort of disconnection happened.
911 * If the server is not in recovery, that also means it
912 * already erased all of our state because of previous
913 * eviction. If it is in recovery - we are safe to
914 * participate since we can reestablish all of our state
dadfcdab
OD
915 * with server again
916 */
2b241d31 917 if ((msg_flags & MSG_CONNECT_RECOVERING)) {
b533ff4b 918 CDEBUG(level, "%s@%s changed server handle from %#llx to %#llx but is still in recovery\n",
d7e09d03
PT
919 obd2cli_tgt(imp->imp_obd),
920 imp->imp_connection->c_remote_uuid.uuid,
921 imp->imp_remote_handle.cookie,
922 lustre_msg_get_handle(
923 request->rq_repmsg)->cookie);
924 } else {
2d00bd17 925 LCONSOLE_WARN("Evicted from %s (at %s) after server handle changed from %#llx to %#llx\n",
d7e09d03
PT
926 obd2cli_tgt(imp->imp_obd),
927 imp->imp_connection-> \
928 c_remote_uuid.uuid,
929 imp->imp_remote_handle.cookie,
930 lustre_msg_get_handle(
2d00bd17 931 request->rq_repmsg)->cookie);
d7e09d03
PT
932 }
933
d7e09d03
PT
934 imp->imp_remote_handle =
935 *lustre_msg_get_handle(request->rq_repmsg);
936
2b241d31 937 if (!(msg_flags & MSG_CONNECT_RECOVERING)) {
d7e09d03 938 IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
a9b3e8f3
JL
939 rc = 0;
940 goto finish;
d7e09d03
PT
941 }
942
943 } else {
944 CDEBUG(D_HA, "reconnected to %s@%s after partition\n",
945 obd2cli_tgt(imp->imp_obd),
946 imp->imp_connection->c_remote_uuid.uuid);
947 }
948
949 if (imp->imp_invalid) {
2d00bd17
JP
950 CDEBUG(D_HA, "%s: reconnected but import is invalid; marking evicted\n",
951 imp->imp_obd->obd_name);
d7e09d03 952 IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
2b241d31 953 } else if (msg_flags & MSG_CONNECT_RECOVERING) {
d7e09d03
PT
954 CDEBUG(D_HA, "%s: reconnected to %s during replay\n",
955 imp->imp_obd->obd_name,
956 obd2cli_tgt(imp->imp_obd));
957
958 spin_lock(&imp->imp_lock);
959 imp->imp_resend_replay = 1;
960 spin_unlock(&imp->imp_lock);
961
502cb58e 962 IMPORT_SET_STATE(imp, imp->imp_replay_state);
d7e09d03
PT
963 } else {
964 IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
965 }
2b241d31 966 } else if ((msg_flags & MSG_CONNECT_RECOVERING) && !imp->imp_invalid) {
d7e09d03
PT
967 LASSERT(imp->imp_replayable);
968 imp->imp_remote_handle =
969 *lustre_msg_get_handle(request->rq_repmsg);
970 imp->imp_last_replay_transno = 0;
971 IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
972 } else {
2d00bd17
JP
973 DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags not set: %x)",
974 imp->imp_obd->obd_name, msg_flags);
d7e09d03
PT
975 imp->imp_remote_handle =
976 *lustre_msg_get_handle(request->rq_repmsg);
977 IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
978 }
979
980 /* Sanity checks for a reconnected import. */
981 if (!(imp->imp_replayable) != !(msg_flags & MSG_CONNECT_REPLAYABLE)) {
2d00bd17 982 CERROR("imp_replayable flag does not match server after reconnect. We should LBUG right here.\n");
d7e09d03
PT
983 }
984
985 if (lustre_msg_get_last_committed(request->rq_repmsg) > 0 &&
986 lustre_msg_get_last_committed(request->rq_repmsg) <
987 aa->pcaa_peer_committed) {
2d00bd17 988 CERROR("%s went back in time (transno %lld was previously committed, server now claims %lld)! See https://bugzilla.lustre.org/show_bug.cgi?id=9646\n",
d7e09d03
PT
989 obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed,
990 lustre_msg_get_last_committed(request->rq_repmsg));
991 }
992
993finish:
994 rc = ptlrpc_import_recovery_state_machine(imp);
995 if (rc != 0) {
996 if (rc == -ENOTCONN) {
2d00bd17 997 CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery; invalidating and reconnecting\n",
d7e09d03
PT
998 obd2cli_tgt(imp->imp_obd),
999 imp->imp_connection->c_remote_uuid.uuid);
1000 ptlrpc_connect_import(imp);
1001 imp->imp_connect_tried = 1;
0a3bdb00 1002 return 0;
d7e09d03
PT
1003 }
1004 } else {
35e45816
AD
1005 static bool warned;
1006
d7e09d03
PT
1007 spin_lock(&imp->imp_lock);
1008 list_del(&imp->imp_conn_current->oic_item);
30c0aa39 1009 list_add(&imp->imp_conn_current->oic_item, &imp->imp_conn_list);
d7e09d03
PT
1010 imp->imp_last_success_conn =
1011 imp->imp_conn_current->oic_last_attempt;
1012
1013 spin_unlock(&imp->imp_lock);
1014
f261f48a
FY
1015 if ((imp->imp_connect_flags_orig & OBD_CONNECT_IBITS) &&
1016 !(ocd->ocd_connect_flags & OBD_CONNECT_IBITS)) {
2d00bd17 1017 LCONSOLE_WARN("%s: MDS %s does not support ibits lock, either very old or invalid: requested %llx, replied %llx\n",
f261f48a
FY
1018 imp->imp_obd->obd_name,
1019 imp->imp_connection->c_remote_uuid.uuid,
1020 imp->imp_connect_flags_orig,
1021 ocd->ocd_connect_flags);
a9b3e8f3
JL
1022 rc = -EPROTO;
1023 goto out;
f261f48a 1024 }
d7e09d03 1025
35e45816 1026 if (!warned && (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
d7e09d03
PT
1027 (ocd->ocd_version > LUSTRE_VERSION_CODE +
1028 LUSTRE_VERSION_OFFSET_WARN ||
1029 ocd->ocd_version < LUSTRE_VERSION_CODE -
1030 LUSTRE_VERSION_OFFSET_WARN)) {
1031 /* Sigh, some compilers do not like #ifdef in the middle
dadfcdab
OD
1032 * of macro arguments
1033 */
35e45816
AD
1034 const char *older = "older than client. Consider upgrading server";
1035 const char *newer = "newer than client. Consider recompiling application";
d7e09d03 1036
2d00bd17 1037 LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) is much %s (%s)\n",
d7e09d03
PT
1038 obd2cli_tgt(imp->imp_obd),
1039 OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
1040 OBD_OCD_VERSION_MINOR(ocd->ocd_version),
1041 OBD_OCD_VERSION_PATCH(ocd->ocd_version),
1042 OBD_OCD_VERSION_FIX(ocd->ocd_version),
1043 ocd->ocd_version > LUSTRE_VERSION_CODE ?
1044 newer : older, LUSTRE_VERSION_STRING);
35e45816 1045 warned = true;
d7e09d03
PT
1046 }
1047
1048#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
1049 /* Check if server has LU-1252 fix applied to not always swab
1050 * the IR MNE entries. Do this only once per connection. This
1051 * fixup is version-limited, because we don't want to carry the
1052 * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we
1053 * need interop with unpatched 2.2 servers. For newer servers,
dadfcdab
OD
1054 * the client will do MNE swabbing only as needed. LU-1644
1055 */
d7e09d03
PT
1056 if (unlikely((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
1057 !(ocd->ocd_connect_flags & OBD_CONNECT_MNE_SWAB) &&
1058 OBD_OCD_VERSION_MAJOR(ocd->ocd_version) == 2 &&
1059 OBD_OCD_VERSION_MINOR(ocd->ocd_version) == 2 &&
1060 OBD_OCD_VERSION_PATCH(ocd->ocd_version) < 55 &&
1061 strcmp(imp->imp_obd->obd_type->typ_name,
1062 LUSTRE_MGC_NAME) == 0))
1063 imp->imp_need_mne_swab = 1;
1064 else /* clear if server was upgraded since last connect */
1065 imp->imp_need_mne_swab = 0;
1066#else
1067#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab"
1068#endif
1069
1070 if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) {
1071 /* We sent to the server ocd_cksum_types with bits set
1072 * for algorithms we understand. The server masked off
dadfcdab
OD
1073 * the checksum types it doesn't support
1074 */
d7e09d03
PT
1075 if ((ocd->ocd_cksum_types &
1076 cksum_types_supported_client()) == 0) {
2d00bd17 1077 LCONSOLE_WARN("The negotiation of the checksum algorithm to use with server %s failed (%x/%x), disabling checksums\n",
d7e09d03
PT
1078 obd2cli_tgt(imp->imp_obd),
1079 ocd->ocd_cksum_types,
1080 cksum_types_supported_client());
1081 cli->cl_checksum = 0;
1082 cli->cl_supp_cksum_types = OBD_CKSUM_ADLER;
1083 } else {
1084 cli->cl_supp_cksum_types = ocd->ocd_cksum_types;
1085 }
1086 } else {
1087 /* The server does not support OBD_CONNECT_CKSUM.
dadfcdab
OD
1088 * Enforce ADLER for backward compatibility
1089 */
d7e09d03
PT
1090 cli->cl_supp_cksum_types = OBD_CKSUM_ADLER;
1091 }
b533ff4b 1092 cli->cl_cksum_type = cksum_type_select(cli->cl_supp_cksum_types);
d7e09d03
PT
1093
1094 if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE)
1095 cli->cl_max_pages_per_rpc =
09cbfeaf 1096 min(ocd->ocd_brw_size >> PAGE_SHIFT,
d7e09d03
PT
1097 cli->cl_max_pages_per_rpc);
1098 else if (imp->imp_connect_op == MDS_CONNECT ||
1099 imp->imp_connect_op == MGS_CONNECT)
1100 cli->cl_max_pages_per_rpc = 1;
1101
1102 /* Reset ns_connect_flags only for initial connect. It might be
1103 * changed in while using FS and if we reset it in reconnect
1104 * this leads to losing user settings done before such as
dadfcdab
OD
1105 * disable lru_resize, etc.
1106 */
d7e09d03
PT
1107 if (old_connect_flags != exp_connect_flags(exp) ||
1108 aa->pcaa_initial_connect) {
55f5a824
GKH
1109 CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server flags: %#llx\n",
1110 imp->imp_obd->obd_name, ocd->ocd_connect_flags);
d7e09d03
PT
1111 imp->imp_obd->obd_namespace->ns_connect_flags =
1112 ocd->ocd_connect_flags;
1113 imp->imp_obd->obd_namespace->ns_orig_connect_flags =
1114 ocd->ocd_connect_flags;
1115 }
1116
1117 if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) &&
1118 (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
1119 /* We need a per-message support flag, because
dadfcdab
OD
1120 * a. we don't know if the incoming connect reply
1121 * supports AT or not (in reply_in_callback)
1122 * until we unpack it.
1123 * b. failovered server means export and flags are gone
1124 * (in ptlrpc_send_reply).
1125 * Can only be set when we know AT is supported at
1126 * both ends
1127 */
d7e09d03
PT
1128 imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT;
1129 else
1130 imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT;
1131
1132 if ((ocd->ocd_connect_flags & OBD_CONNECT_FULL20) &&
1133 (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
1134 imp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18;
1135 else
1136 imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18;
1137
1138 LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) &&
1139 (cli->cl_max_pages_per_rpc > 0));
1140 }
1141
1142out:
1143 imp->imp_connect_tried = 1;
1144
1145 if (rc != 0) {
1146 IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
1147 if (rc == -EACCES) {
1148 /*
1149 * Give up trying to reconnect
1150 * EACCES means client has no permission for connection
1151 */
1152 imp->imp_obd->obd_no_recov = 1;
1153 ptlrpc_deactivate_import(imp);
1154 }
1155
1156 if (rc == -EPROTO) {
1157 struct obd_connect_data *ocd;
1158
1159 /* reply message might not be ready */
8b382089 1160 if (!request->rq_repmsg)
0a3bdb00 1161 return -EPROTO;
d7e09d03
PT
1162
1163 ocd = req_capsule_server_get(&request->rq_pill,
1164 &RMF_CONNECT_DATA);
1165 if (ocd &&
1166 (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
1167 (ocd->ocd_version != LUSTRE_VERSION_CODE)) {
532118c0
KM
1168 /*
1169 * Actually servers are only supposed to refuse
1170 * connection from liblustre clients, so we
1171 * should never see this from VFS context
1172 */
2d00bd17
JP
1173 LCONSOLE_ERROR_MSG(0x16a, "Server %s version (%d.%d.%d.%d) refused connection from this client with an incompatible version (%s). Client must be recompiled\n",
1174 obd2cli_tgt(imp->imp_obd),
1175 OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
1176 OBD_OCD_VERSION_MINOR(ocd->ocd_version),
1177 OBD_OCD_VERSION_PATCH(ocd->ocd_version),
1178 OBD_OCD_VERSION_FIX(ocd->ocd_version),
1179 LUSTRE_VERSION_STRING);
d7e09d03
PT
1180 ptlrpc_deactivate_import(imp);
1181 IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED);
1182 }
0a3bdb00 1183 return -EPROTO;
d7e09d03
PT
1184 }
1185
1186 ptlrpc_maybe_ping_import_soon(imp);
1187
1188 CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n",
1189 obd2cli_tgt(imp->imp_obd),
1190 (char *)imp->imp_connection->c_remote_uuid.uuid, rc);
1191 }
1192
1193 wake_up_all(&imp->imp_recovery_waitq);
0a3bdb00 1194 return rc;
d7e09d03
PT
1195}
1196
1197/**
1198 * interpret callback for "completed replay" RPCs.
1199 * \see signal_completed_replay
1200 */
1201static int completed_replay_interpret(const struct lu_env *env,
1202 struct ptlrpc_request *req,
aff9d8e8 1203 void *data, int rc)
d7e09d03 1204{
d7e09d03
PT
1205 atomic_dec(&req->rq_import->imp_replay_inflight);
1206 if (req->rq_status == 0 &&
1207 !req->rq_import->imp_vbr_failed) {
1208 ptlrpc_import_recovery_state_machine(req->rq_import);
1209 } else {
1210 if (req->rq_import->imp_vbr_failed) {
1211 CDEBUG(D_WARNING,
1212 "%s: version recovery fails, reconnecting\n",
1213 req->rq_import->imp_obd->obd_name);
1214 } else {
2d00bd17 1215 CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, reconnecting\n",
d7e09d03
PT
1216 req->rq_import->imp_obd->obd_name,
1217 req->rq_status);
1218 }
1219 ptlrpc_connect_import(req->rq_import);
1220 }
1221
0a3bdb00 1222 return 0;
d7e09d03
PT
1223}
1224
1225/**
1226 * Let server know that we have no requests to replay anymore.
1227 * Achieved by just sending a PING request
1228 */
1229static int signal_completed_replay(struct obd_import *imp)
1230{
1231 struct ptlrpc_request *req;
d7e09d03
PT
1232
1233 if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY)))
0a3bdb00 1234 return 0;
d7e09d03
PT
1235
1236 LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
1237 atomic_inc(&imp->imp_replay_inflight);
1238
1239 req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, LUSTRE_OBD_VERSION,
1240 OBD_PING);
8b382089 1241 if (!req) {
d7e09d03 1242 atomic_dec(&imp->imp_replay_inflight);
0a3bdb00 1243 return -ENOMEM;
d7e09d03
PT
1244 }
1245
1246 ptlrpc_request_set_replen(req);
1247 req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT;
1248 lustre_msg_add_flags(req->rq_reqmsg,
1249 MSG_LOCK_REPLAY_DONE | MSG_REQ_REPLAY_DONE);
1250 if (AT_OFF)
1251 req->rq_timeout *= 3;
1252 req->rq_interpret_reply = completed_replay_interpret;
1253
c5c4c6fa 1254 ptlrpcd_add_req(req);
0a3bdb00 1255 return 0;
d7e09d03
PT
1256}
1257
1258/**
1259 * In kernel code all import invalidation happens in its own
1260 * separate thread, so that whatever application happened to encounter
1261 * a problem could still be killed or otherwise continue
1262 */
1263static int ptlrpc_invalidate_import_thread(void *data)
1264{
1265 struct obd_import *imp = data;
1266
d7e09d03
PT
1267 unshare_fs_struct();
1268
1269 CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n",
1270 imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
1271 imp->imp_connection->c_remote_uuid.uuid);
1272
1273 ptlrpc_invalidate_import(imp);
1274
1275 if (obd_dump_on_eviction) {
1276 CERROR("dump the log upon eviction\n");
1277 libcfs_debug_dumplog();
1278 }
1279
1280 IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
1281 ptlrpc_import_recovery_state_machine(imp);
1282
1283 class_import_put(imp);
0a3bdb00 1284 return 0;
d7e09d03
PT
1285}
1286
1287/**
1288 * This is the state machine for client-side recovery on import.
1289 *
b6da17f3 1290 * Typically we have two possibly paths. If we came to server and it is not
d7e09d03
PT
1291 * in recovery, we just enter IMP_EVICTED state, invalidate our import
1292 * state and reconnect from scratch.
1293 * If we came to server that is in recovery, we enter IMP_REPLAY import state.
1294 * We go through our list of requests to replay and send them to server one by
1295 * one.
1296 * After sending all request from the list we change import state to
1297 * IMP_REPLAY_LOCKS and re-request all the locks we believe we have from server
1298 * and also all the locks we don't yet have and wait for server to grant us.
1299 * After that we send a special "replay completed" request and change import
1300 * state to IMP_REPLAY_WAIT.
1301 * Upon receiving reply to that "replay completed" RPC we enter IMP_RECOVER
1302 * state and resend all requests from sending list.
1303 * After that we promote import to FULL state and send all delayed requests
1304 * and import is fully operational after that.
1305 *
1306 */
1307int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
1308{
1309 int rc = 0;
1310 int inflight;
1311 char *target_start;
1312 int target_len;
1313
d7e09d03
PT
1314 if (imp->imp_state == LUSTRE_IMP_EVICTED) {
1315 deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
1316 &target_start, &target_len);
1317 /* Don't care about MGC eviction */
1318 if (strcmp(imp->imp_obd->obd_type->typ_name,
1319 LUSTRE_MGC_NAME) != 0) {
2d00bd17 1320 LCONSOLE_ERROR_MSG(0x167, "%s: This client was evicted by %.*s; in progress operations using this service will fail.\n",
d7e09d03
PT
1321 imp->imp_obd->obd_name, target_len,
1322 target_start);
1323 }
1324 CDEBUG(D_HA, "evicted from %s@%s; invalidating\n",
1325 obd2cli_tgt(imp->imp_obd),
1326 imp->imp_connection->c_remote_uuid.uuid);
1327 /* reset vbr_failed flag upon eviction */
1328 spin_lock(&imp->imp_lock);
1329 imp->imp_vbr_failed = 0;
1330 spin_unlock(&imp->imp_lock);
1331
1332 {
68b636b6 1333 struct task_struct *task;
d7e09d03 1334 /* bug 17802: XXX client_disconnect_export vs connect request
9c379663 1335 * race. if client is evicted at this time, we start
d7e09d03 1336 * invalidate thread without reference to import and import can
dadfcdab
OD
1337 * be freed at same time.
1338 */
d7e09d03
PT
1339 class_import_get(imp);
1340 task = kthread_run(ptlrpc_invalidate_import_thread, imp,
30c0aa39 1341 "ll_imp_inval");
d7e09d03
PT
1342 if (IS_ERR(task)) {
1343 class_import_put(imp);
1344 CERROR("error starting invalidate thread: %d\n", rc);
1345 rc = PTR_ERR(task);
1346 } else {
1347 rc = 0;
1348 }
0a3bdb00 1349 return rc;
d7e09d03
PT
1350 }
1351 }
1352
1353 if (imp->imp_state == LUSTRE_IMP_REPLAY) {
1354 CDEBUG(D_HA, "replay requested by %s\n",
1355 obd2cli_tgt(imp->imp_obd));
1356 rc = ptlrpc_replay_next(imp, &inflight);
1357 if (inflight == 0 &&
1358 atomic_read(&imp->imp_replay_inflight) == 0) {
1359 IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
1360 rc = ldlm_replay_locks(imp);
1361 if (rc)
a9b3e8f3 1362 goto out;
d7e09d03
PT
1363 }
1364 rc = 0;
1365 }
1366
1367 if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) {
1368 if (atomic_read(&imp->imp_replay_inflight) == 0) {
1369 IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_WAIT);
1370 rc = signal_completed_replay(imp);
1371 if (rc)
a9b3e8f3 1372 goto out;
d7e09d03 1373 }
d7e09d03
PT
1374 }
1375
1376 if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) {
1377 if (atomic_read(&imp->imp_replay_inflight) == 0) {
1378 IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
1379 }
1380 }
1381
1382 if (imp->imp_state == LUSTRE_IMP_RECOVER) {
1383 CDEBUG(D_HA, "reconnected to %s@%s\n",
1384 obd2cli_tgt(imp->imp_obd),
1385 imp->imp_connection->c_remote_uuid.uuid);
1386
1387 rc = ptlrpc_resend(imp);
1388 if (rc)
a9b3e8f3 1389 goto out;
d7e09d03
PT
1390 IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
1391 ptlrpc_activate_import(imp);
1392
1393 deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
1394 &target_start, &target_len);
1395 LCONSOLE_INFO("%s: Connection restored to %.*s (at %s)\n",
1396 imp->imp_obd->obd_name,
1397 target_len, target_start,
1398 libcfs_nid2str(imp->imp_connection->c_peer.nid));
1399 }
1400
1401 if (imp->imp_state == LUSTRE_IMP_FULL) {
1402 wake_up_all(&imp->imp_recovery_waitq);
1403 ptlrpc_wake_delayed(imp);
1404 }
1405
1406out:
0a3bdb00 1407 return rc;
d7e09d03
PT
1408}
1409
/**
 * Disconnect the import \a imp from its server.
 *
 * Waits for any in-progress recovery to settle (bounded by an
 * AT-derived or obd_timeout-derived timeout), sends a DISCONNECT RPC
 * if the import is still FULL, then marks the import DISCON (when
 * \a noclose is set, so it can reconnect later) or CLOSED.
 *
 * \retval 0 on success; -ETIMEDOUT/-ENOTCONN/-ESHUTDOWN from the RPC
 * are deliberately squashed to 0 since the disconnect still took
 * effect locally. Other negative errnos are returned as-is.
 */
int ptlrpc_disconnect_import(struct obd_import *imp, int noclose)
{
	struct ptlrpc_request *req;
	int rq_opc, rc = 0;

	/* Forced cleanup: skip the RPC entirely, just flip the state. */
	if (imp->imp_obd->obd_force)
		goto set_state;

	/* Pick the DISCONNECT opcode matching the original CONNECT. */
	switch (imp->imp_connect_op) {
	case OST_CONNECT:
		rq_opc = OST_DISCONNECT;
		break;
	case MDS_CONNECT:
		rq_opc = MDS_DISCONNECT;
		break;
	case MGS_CONNECT:
		rq_opc = MGS_DISCONNECT;
		break;
	default:
		rc = -EINVAL;
		CERROR("%s: don't know how to disconnect from %s (connect_op %d): rc = %d\n",
		       imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
		       imp->imp_connect_op, rc);
		return rc;
	}

	if (ptlrpc_import_in_recovery(imp)) {
		struct l_wait_info lwi;
		long timeout;

		/* With adaptive timeouts off, fall back to obd_timeout;
		 * halved when the server also enforces a timeout on us.
		 */
		if (AT_OFF) {
			if (imp->imp_server_timeout)
				timeout = cfs_time_seconds(obd_timeout / 2);
			else
				timeout = cfs_time_seconds(obd_timeout);
		} else {
			int idx = import_at_get_index(imp,
				imp->imp_client->cli_request_portal);
			timeout = cfs_time_seconds(
				at_get(&imp->imp_at.iat_service_estimate[idx]));
		}

		/* Interruptible, capped wait for recovery to complete. */
		lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout),
				       back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL);
		rc = l_wait_event(imp->imp_recovery_waitq,
				  !ptlrpc_import_in_recovery(imp), &lwi);
	}

	spin_lock(&imp->imp_lock);
	/* Not FULL anymore: nothing to tell the server; jump to the
	 * state update below WITH imp_lock still held.
	 */
	if (imp->imp_state != LUSTRE_IMP_FULL)
		goto out;
	spin_unlock(&imp->imp_lock);

	req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT,
					LUSTRE_OBD_VERSION, rq_opc);
	if (req) {
		/* We are disconnecting, do not retry a failed DISCONNECT rpc if
		 * it fails. We can get through the above with a down server
		 * if the client doesn't know the server is gone yet.
		 */
		req->rq_no_resend = 1;

		/* We want client umounts to happen quickly, no matter the
		 * server state...
		 */
		req->rq_timeout = min_t(int, req->rq_timeout,
					INITIAL_CONNECT_TIMEOUT);

		IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING);
		req->rq_send_state = LUSTRE_IMP_CONNECTING;
		ptlrpc_request_set_replen(req);
		rc = ptlrpc_queue_wait(req);
		ptlrpc_req_finished(req);
	}

set_state:
	spin_lock(&imp->imp_lock);
out:
	/* Reached with imp_lock held on every path. */
	if (noclose)
		IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
	else
		IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED);
	/* Forget the server handle; a future connect starts fresh. */
	memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle));
	spin_unlock(&imp->imp_lock);

	if (rc == -ETIMEDOUT || rc == -ENOTCONN || rc == -ESHUTDOWN)
		rc = 0;

	return rc;
}
EXPORT_SYMBOL(ptlrpc_disconnect_import);
1501
d7e09d03
PT
1502/* Adaptive Timeout utils */
1503extern unsigned int at_min, at_max, at_history;
1504
1505/* Bin into timeslices using AT_BINS bins.
dadfcdab
OD
1506 * This gives us a max of the last binlimit*AT_BINS secs without the storage,
1507 * but still smoothing out a return to normalcy from a slow response.
1508 * (E.g. remember the maximum latency in each minute of the last 4 minutes.)
1509 */
d7e09d03
PT
1510int at_measured(struct adaptive_timeout *at, unsigned int val)
1511{
1512 unsigned int old = at->at_current;
0ac0478b
AB
1513 time64_t now = ktime_get_real_seconds();
1514 long binlimit = max_t(long, at_history / AT_BINS, 1);
d7e09d03
PT
1515
1516 LASSERT(at);
1517 CDEBUG(D_OTHER, "add %u to %p time=%lu v=%u (%u %u %u %u)\n",
0ac0478b 1518 val, at, (long)(now - at->at_binstart), at->at_current,
d7e09d03
PT
1519 at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]);
1520
1521 if (val == 0)
1522 /* 0's don't count, because we never want our timeout to
dadfcdab
OD
1523 * drop to 0, and because 0 could mean an error
1524 */
d7e09d03
PT
1525 return 0;
1526
1527 spin_lock(&at->at_lock);
1528
1529 if (unlikely(at->at_binstart == 0)) {
1530 /* Special case to remove default from history */
1531 at->at_current = val;
1532 at->at_worst_ever = val;
1533 at->at_worst_time = now;
1534 at->at_hist[0] = val;
1535 at->at_binstart = now;
3949015e 1536 } else if (now - at->at_binstart < binlimit) {
d7e09d03
PT
1537 /* in bin 0 */
1538 at->at_hist[0] = max(val, at->at_hist[0]);
1539 at->at_current = max(val, at->at_current);
1540 } else {
1541 int i, shift;
1542 unsigned int maxv = val;
1543 /* move bins over */
0ac0478b 1544 shift = (u32)(now - at->at_binstart) / binlimit;
d7e09d03 1545 LASSERT(shift > 0);
3949015e 1546 for (i = AT_BINS - 1; i >= 0; i--) {
d7e09d03
PT
1547 if (i >= shift) {
1548 at->at_hist[i] = at->at_hist[i - shift];
1549 maxv = max(maxv, at->at_hist[i]);
1550 } else {
1551 at->at_hist[i] = 0;
1552 }
1553 }
1554 at->at_hist[0] = val;
1555 at->at_current = maxv;
1556 at->at_binstart += shift * binlimit;
1557 }
1558
1559 if (at->at_current > at->at_worst_ever) {
1560 at->at_worst_ever = at->at_current;
1561 at->at_worst_time = now;
1562 }
1563
1564 if (at->at_flags & AT_FLG_NOHIST)
1565 /* Only keep last reported val; keeping the rest of the history
dadfcdab
OD
1566 * for debugfs only
1567 */
d7e09d03
PT
1568 at->at_current = val;
1569
1570 if (at_max > 0)
1571 at->at_current = min(at->at_current, at_max);
1572 at->at_current = max(at->at_current, at_min);
1573
1574 if (at->at_current != old)
2d00bd17
JP
1575 CDEBUG(D_OTHER, "AT %p change: old=%u new=%u delta=%d (val=%u) hist %u %u %u %u\n",
1576 at,
d7e09d03
PT
1577 old, at->at_current, at->at_current - old, val,
1578 at->at_hist[0], at->at_hist[1], at->at_hist[2],
1579 at->at_hist[3]);
1580
1581 /* if we changed, report the old value */
1582 old = (at->at_current != old) ? old : 0;
1583
1584 spin_unlock(&at->at_lock);
1585 return old;
1586}
1587
1588/* Find the imp_at index for a given portal; assign if space available */
1589int import_at_get_index(struct obd_import *imp, int portal)
1590{
1591 struct imp_at *at = &imp->imp_at;
1592 int i;
1593
1594 for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
1595 if (at->iat_portal[i] == portal)
1596 return i;
1597 if (at->iat_portal[i] == 0)
1598 /* unused */
1599 break;
1600 }
1601
1602 /* Not found in list, add it under a lock */
1603 spin_lock(&imp->imp_lock);
1604
1605 /* Check unused under lock */
1606 for (; i < IMP_AT_MAX_PORTALS; i++) {
1607 if (at->iat_portal[i] == portal)
1608 goto out;
1609 if (at->iat_portal[i] == 0)
1610 /* unused */
1611 break;
1612 }
1613
1614 /* Not enough portals? */
1615 LASSERT(i < IMP_AT_MAX_PORTALS);
1616
1617 at->iat_portal[i] = portal;
1618out:
1619 spin_unlock(&imp->imp_lock);
1620 return i;
1621}
This page took 0.648016 seconds and 5 git commands to generate.