/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 */
/** Implementation of client-side PortalRPC interfaces */

#define DEBUG_SUBSYSTEM S_RPC

#include "../include/obd_support.h"
#include "../include/obd_class.h"
#include "../include/lustre_lib.h"
#include "../include/lustre_ha.h"
#include "../include/lustre_import.h"
#include "../include/lustre_req_layout.h"

#include "ptlrpc_internal.h"

static int ptlrpc_send_new_req(struct ptlrpc_request *req);
static int ptlrpcd_check_work(struct ptlrpc_request *req);
/**
 * Initialize passed in client structure \a cl.
 */
void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
			struct ptlrpc_client *cl)
{
	cl->cli_request_portal = req_portal;
	cl->cli_reply_portal = rep_portal;
	cl->cli_name = name;
}
EXPORT_SYMBOL(ptlrpc_init_client);
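/*
 * Usage sketch (illustrative, not part of this file): a client stack
 * typically initializes its ptlrpc_client once at setup time.  The portal
 * pairing and client name below are hypothetical.
 *
 *	static struct ptlrpc_client my_client;
 *
 *	ptlrpc_init_client(OST_REQUEST_PORTAL, OSC_REPLY_PORTAL,
 *			   "my-client", &my_client);
 */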
/**
 * Return PortalRPC connection for remote uuid \a uuid
 */
struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid)
{
	struct ptlrpc_connection *c;
	lnet_nid_t self;
	lnet_process_id_t peer;
	int err;

	/* ptlrpc_uuid_to_peer() initializes its 2nd parameter
	 * before accessing its values. */
	/* coverity[uninit_use_in_call] */
	err = ptlrpc_uuid_to_peer(uuid, &peer, &self);
	if (err != 0) {
		CNETERR("cannot find peer %s!\n", uuid->uuid);
		return NULL;
	}

	c = ptlrpc_connection_get(peer, self, uuid);
	if (c) {
		memcpy(c->c_remote_uuid.uuid,
		       uuid->uuid, sizeof(c->c_remote_uuid.uuid));
	}

	CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c);

	return c;
}
EXPORT_SYMBOL(ptlrpc_uuid_to_connection);
/**
 * Allocate and initialize new bulk descriptor on the sender.
 * Returns pointer to the descriptor or NULL on error.
 */
struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned npages, unsigned max_brw,
					 unsigned type, unsigned portal)
{
	struct ptlrpc_bulk_desc *desc;
	int i;

	desc = kzalloc(offsetof(struct ptlrpc_bulk_desc, bd_iov[npages]),
		       GFP_NOFS);
	if (!desc)
		return NULL;

	spin_lock_init(&desc->bd_lock);
	init_waitqueue_head(&desc->bd_waitq);
	desc->bd_max_iov = npages;
	desc->bd_iov_count = 0;
	desc->bd_portal = portal;
	desc->bd_type = type;
	desc->bd_md_count = 0;
	LASSERT(max_brw > 0);
	desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT);
	/* PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this
	 * node. Negotiated ocd_brw_size will always be <= this number. */
	for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++)
		LNetInvalidateHandle(&desc->bd_mds[i]);

	return desc;
}
/**
 * Prepare bulk descriptor for specified outgoing request \a req that
 * can fit \a npages * pages. \a type is bulk type. \a portal is where
 * the bulk is to be sent. Used on client-side.
 * Returns pointer to newly allocated initialized bulk descriptor or NULL on
 * error.
 */
struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req,
					      unsigned npages, unsigned max_brw,
					      unsigned type, unsigned portal)
{
	struct obd_import *imp = req->rq_import;
	struct ptlrpc_bulk_desc *desc;

	LASSERT(type == BULK_PUT_SINK || type == BULK_GET_SOURCE);
	desc = ptlrpc_new_bulk(npages, max_brw, type, portal);
	if (desc == NULL)
		return NULL;

	desc->bd_import_generation = req->rq_import_generation;
	desc->bd_import = class_import_get(imp);
	desc->bd_req = req;

	desc->bd_cbid.cbid_fn = client_bulk_callback;
	desc->bd_cbid.cbid_arg = desc;

	/* This makes req own desc, and free it when she frees herself */
	req->rq_bulk = desc;

	return desc;
}
EXPORT_SYMBOL(ptlrpc_prep_bulk_imp);
/**
 * Add a page \a page to the bulk descriptor \a desc.
 * Data to transfer in the page starts at offset \a pageoffset and
 * amount of data to transfer from the page is \a len
 */
void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
			     struct page *page, int pageoffset, int len,
			     int pin)
{
	LASSERT(desc->bd_iov_count < desc->bd_max_iov);
	LASSERT(page != NULL);
	LASSERT(pageoffset >= 0);
	LASSERT(len > 0);
	LASSERT(pageoffset + len <= PAGE_CACHE_SIZE);

	desc->bd_nob += len;

	if (pin)
		page_cache_get(page);

	ptlrpc_add_bulk_page(desc, page, pageoffset, len);
}
EXPORT_SYMBOL(__ptlrpc_prep_bulk_page);
/**
 * Uninitialize and free bulk descriptor \a desc.
 * Works on bulk descriptors both from server and client side.
 */
void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc, int unpin)
{
	int i;

	LASSERT(desc != NULL);
	LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */
	LASSERT(desc->bd_md_count == 0);	  /* network hands off */
	LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL));

	sptlrpc_enc_pool_put_pages(desc);

	if (desc->bd_export)
		class_export_put(desc->bd_export);
	else
		class_import_put(desc->bd_import);

	if (unpin) {
		for (i = 0; i < desc->bd_iov_count; i++)
			page_cache_release(desc->bd_iov[i].kiov_page);
	}

	kfree(desc);
}
EXPORT_SYMBOL(__ptlrpc_free_bulk);
/**
 * Set server timelimit for this req, i.e. how long are we willing to wait
 * for reply before timing out this request.
 */
void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
{
	__u32 serv_est;
	int idx;
	struct imp_at *at;

	LASSERT(req->rq_import);

	if (AT_OFF) {
		/* non-AT settings */
		/**
		 * \a imp_server_timeout means this is reverse import and
		 * we send (currently only) ASTs to the client and cannot
		 * afford to wait too long for the reply, otherwise the
		 * other client (because of which we are sending this
		 * request) would timeout waiting for us
		 */
		req->rq_timeout = req->rq_import->imp_server_timeout ?
				  obd_timeout / 2 : obd_timeout;
	} else {
		at = &req->rq_import->imp_at;
		idx = import_at_get_index(req->rq_import,
					  req->rq_request_portal);
		serv_est = at_get(&at->iat_service_estimate[idx]);
		req->rq_timeout = at_est2timeout(serv_est);
	}
	/* We could get even fancier here, using history to predict increased
	   loading... */

	/* Let the server know what this RPC timeout is by putting it in the
	   reqmsg */
	lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout);
}
EXPORT_SYMBOL(ptlrpc_at_set_req_timeout);
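/*
 * Worked example (assuming at_est2timeout() implements the "AT service
 * time x 125% + 5s" rule cited below in after_reply()): with a service
 * estimate of 20s the adaptive request timeout becomes
 * 20 + 20/4 + 5 = 30s.
 */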
/* Adjust max service estimate based on server value */
static void ptlrpc_at_adj_service(struct ptlrpc_request *req,
				  unsigned int serv_est)
{
	int idx;
	unsigned int oldse;
	struct imp_at *at;

	LASSERT(req->rq_import);
	at = &req->rq_import->imp_at;

	idx = import_at_get_index(req->rq_import, req->rq_request_portal);
	/* max service estimates are tracked on the server side,
	   so just keep minimal history here */
	oldse = at_measured(&at->iat_service_estimate[idx], serv_est);
	if (oldse != 0)
		CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d has changed from %d to %d\n",
		       req->rq_import->imp_obd->obd_name,
		       req->rq_request_portal,
		       oldse, at_get(&at->iat_service_estimate[idx]));
}
/* Expected network latency per remote node (secs) */
int ptlrpc_at_get_net_latency(struct ptlrpc_request *req)
{
	return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency);
}
/* Adjust expected network latency */
static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req,
				      unsigned int service_time)
{
	unsigned int nl, oldnl;
	struct imp_at *at;
	time_t now = get_seconds();

	LASSERT(req->rq_import);

	if (service_time > now - req->rq_sent + 3) {
		/* bz16408, however, this can also happen if early reply
		 * is lost and client RPC is expired and resent, early reply
		 * or reply of original RPC can still fit in reply buffer
		 * of resent RPC, now client is measuring time from the
		 * resent time, but server sent back service time of original
		 * RPC.
		 */
		CDEBUG((lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) ?
		       D_ADAPTTO : D_WARNING,
		       "Reported service time %u > total measured time "
		       CFS_DURATION_T"\n", service_time,
		       cfs_time_sub(now, req->rq_sent));
		return;
	}

	/* Network latency is total time less server processing time */
	nl = max_t(int, now - req->rq_sent -
			service_time, 0) + 1; /* st rounding */
	at = &req->rq_import->imp_at;

	oldnl = at_measured(&at->iat_net_latency, nl);
	if (oldnl != 0)
		CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) has changed from %d to %d\n",
		       req->rq_import->imp_obd->obd_name,
		       obd_uuid2str(
			       &req->rq_import->imp_connection->c_remote_uuid),
		       oldnl, at_get(&at->iat_net_latency));
}
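/*
 * Worked example: a request sent at t=100s whose reply arrives at t=110s
 * with a reported service time of 7s yields a measured network latency of
 * max(110 - 100 - 7, 0) + 1 = 4s, which is then fed into the running
 * iat_net_latency estimate.
 */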
static int unpack_reply(struct ptlrpc_request *req)
{
	int rc;

	if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) {
		rc = ptlrpc_unpack_rep_msg(req, req->rq_replen);
		if (rc) {
			DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc);
			return -EPROTO;
		}
	}

	rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF);
	if (rc) {
		DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc);
		return -EPROTO;
	}
	return 0;
}
/**
 * Handle an early reply message, called with the rq_lock held.
 * If anything goes wrong just ignore it - same as if it never happened
 */
static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req)
{
	struct ptlrpc_request *early_req;
	time_t olddl;
	int rc;

	req->rq_early = 0;
	spin_unlock(&req->rq_lock);

	rc = sptlrpc_cli_unwrap_early_reply(req, &early_req);
	if (rc) {
		spin_lock(&req->rq_lock);
		return rc;
	}

	rc = unpack_reply(early_req);
	if (rc == 0) {
		/* Expecting to increase the service time estimate here */
		ptlrpc_at_adj_service(req,
			lustre_msg_get_timeout(early_req->rq_repmsg));
		ptlrpc_at_adj_net_latency(req,
			lustre_msg_get_service_time(early_req->rq_repmsg));
	}

	sptlrpc_cli_finish_early_reply(early_req);

	if (rc != 0) {
		spin_lock(&req->rq_lock);
		return rc;
	}

	/* Adjust the local timeout for this req */
	ptlrpc_at_set_req_timeout(req);

	spin_lock(&req->rq_lock);
	olddl = req->rq_deadline;
	/* server assumes it now has rq_timeout from when it sent the
	 * early reply, so client should give it at least that long. */
	req->rq_deadline = get_seconds() + req->rq_timeout +
			   ptlrpc_at_get_net_latency(req);

	DEBUG_REQ(D_ADAPTTO, req,
		  "Early reply #%d, new deadline in " CFS_DURATION_T "s (" CFS_DURATION_T "s)",
		  req->rq_early_count,
		  cfs_time_sub(req->rq_deadline, get_seconds()),
		  cfs_time_sub(req->rq_deadline, olddl));

	return rc;
}
static struct kmem_cache *request_cache;

int ptlrpc_request_cache_init(void)
{
	request_cache = kmem_cache_create("ptlrpc_cache",
					  sizeof(struct ptlrpc_request),
					  0, SLAB_HWCACHE_ALIGN, NULL);
	return request_cache == NULL ? -ENOMEM : 0;
}

void ptlrpc_request_cache_fini(void)
{
	kmem_cache_destroy(request_cache);
}

struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags)
{
	struct ptlrpc_request *req;

	OBD_SLAB_ALLOC_PTR_GFP(req, request_cache, flags);
	return req;
}

void ptlrpc_request_cache_free(struct ptlrpc_request *req)
{
	OBD_SLAB_FREE_PTR(req, request_cache);
}
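/*
 * A minimal sketch of the intended lifecycle (the caller shown here is
 * hypothetical; in practice the ptlrpc module init/exit paths pair these
 * calls):
 *
 *	if (ptlrpc_request_cache_init() != 0)
 *		return -ENOMEM;
 *	...
 *	ptlrpc_request_cache_fini();
 */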
/**
 * Wind down request pool \a pool.
 * Frees all requests from the pool too
 */
void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool)
{
	struct list_head *l, *tmp;
	struct ptlrpc_request *req;

	LASSERT(pool != NULL);

	spin_lock(&pool->prp_lock);
	list_for_each_safe(l, tmp, &pool->prp_req_list) {
		req = list_entry(l, struct ptlrpc_request, rq_list);
		list_del(&req->rq_list);
		LASSERT(req->rq_reqbuf);
		LASSERT(req->rq_reqbuf_len == pool->prp_rq_size);
		kvfree(req->rq_reqbuf);
		ptlrpc_request_cache_free(req);
	}
	spin_unlock(&pool->prp_lock);
	kfree(pool);
}
EXPORT_SYMBOL(ptlrpc_free_rq_pool);
/**
 * Allocates, initializes and adds \a num_rq requests to the pool \a pool
 */
void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq)
{
	int i;
	int size = 1;

	while (size < pool->prp_rq_size)
		size <<= 1;

	LASSERTF(list_empty(&pool->prp_req_list) ||
		 size == pool->prp_rq_size,
		 "Trying to change pool size with nonempty pool from %d to %d bytes\n",
		 pool->prp_rq_size, size);

	spin_lock(&pool->prp_lock);
	pool->prp_rq_size = size;
	for (i = 0; i < num_rq; i++) {
		struct ptlrpc_request *req;
		struct lustre_msg *msg;

		spin_unlock(&pool->prp_lock);
		req = ptlrpc_request_cache_alloc(GFP_NOFS);
		if (!req)
			return;
		msg = libcfs_kvzalloc(size, GFP_NOFS);
		if (!msg) {
			ptlrpc_request_cache_free(req);
			return;
		}
		req->rq_reqbuf = msg;
		req->rq_reqbuf_len = size;
		req->rq_pool = pool;
		spin_lock(&pool->prp_lock);
		list_add_tail(&req->rq_list, &pool->prp_req_list);
	}
	spin_unlock(&pool->prp_lock);
}
EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool);
/**
 * Create and initialize new request pool with given attributes:
 * \a num_rq - initial number of requests to create for the pool
 * \a msgsize - maximum message size possible for requests in this pool
 * \a populate_pool - function to be called when more requests need to be added
 *		      to the pool
 * Returns pointer to newly created pool or NULL on error.
 */
struct ptlrpc_request_pool *
ptlrpc_init_rq_pool(int num_rq, int msgsize,
		    void (*populate_pool)(struct ptlrpc_request_pool *, int))
{
	struct ptlrpc_request_pool *pool;

	pool = kzalloc(sizeof(struct ptlrpc_request_pool), GFP_NOFS);
	if (!pool)
		return NULL;

	/* Request next power of two for the allocation, because internally
	   kernel would do exactly this */

	spin_lock_init(&pool->prp_lock);
	INIT_LIST_HEAD(&pool->prp_req_list);
	pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD;
	pool->prp_populate = populate_pool;

	populate_pool(pool, num_rq);

	if (list_empty(&pool->prp_req_list)) {
		/* have not allocated a single request for the pool */
		kfree(pool);
		pool = NULL;
	}
	return pool;
}
EXPORT_SYMBOL(ptlrpc_init_rq_pool);
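/*
 * Usage sketch (the pool parameters are hypothetical):
 * ptlrpc_add_rqs_to_pool() above has exactly the signature expected for
 * \a populate_pool, so a caller can create a self-refilling pool like
 * this:
 *
 *	pool = ptlrpc_init_rq_pool(4, OST_MAXREQSIZE,
 *				   ptlrpc_add_rqs_to_pool);
 */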
/**
 * Fetches one request from pool \a pool
 */
static struct ptlrpc_request *
ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool)
{
	struct ptlrpc_request *request;
	struct lustre_msg *reqbuf;

	if (!pool)
		return NULL;

	spin_lock(&pool->prp_lock);

	/* See if we have anything in a pool, and bail out if nothing,
	 * in writeout path, where this matters, this is safe to do, because
	 * nothing is lost in this case, and when some in-flight requests
	 * complete, this code will be called again. */
	if (unlikely(list_empty(&pool->prp_req_list))) {
		spin_unlock(&pool->prp_lock);
		return NULL;
	}

	request = list_entry(pool->prp_req_list.next, struct ptlrpc_request,
			     rq_list);
	list_del_init(&request->rq_list);
	spin_unlock(&pool->prp_lock);

	LASSERT(request->rq_reqbuf);
	LASSERT(request->rq_pool);

	reqbuf = request->rq_reqbuf;
	memset(request, 0, sizeof(*request));
	request->rq_reqbuf = reqbuf;
	request->rq_reqbuf_len = pool->prp_rq_size;
	request->rq_pool = pool;

	return request;
}
/**
 * Returns freed \a request to pool.
 */
static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request)
{
	struct ptlrpc_request_pool *pool = request->rq_pool;

	spin_lock(&pool->prp_lock);
	LASSERT(list_empty(&request->rq_list));
	LASSERT(!request->rq_receiving_reply);
	list_add_tail(&request->rq_list, &pool->prp_req_list);
	spin_unlock(&pool->prp_lock);
}
static int __ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
				      __u32 version, int opcode,
				      int count, __u32 *lengths, char **bufs,
				      struct ptlrpc_cli_ctx *ctx)
{
	struct obd_import *imp = request->rq_import;
	int rc;

	if (unlikely(ctx)) {
		request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx);
	} else {
		rc = sptlrpc_req_get_ctx(request);
		if (rc)
			goto out_free;
	}

	sptlrpc_req_set_flavor(request, opcode);

	rc = lustre_pack_request(request, imp->imp_msg_magic, count,
				 lengths, bufs);
	if (rc) {
		LASSERT(!request->rq_pool);
		goto out_ctx;
	}

	lustre_msg_add_version(request->rq_reqmsg, version);
	request->rq_send_state = LUSTRE_IMP_FULL;
	request->rq_type = PTL_RPC_MSG_REQUEST;
	request->rq_export = NULL;

	request->rq_req_cbid.cbid_fn = request_out_callback;
	request->rq_req_cbid.cbid_arg = request;

	request->rq_reply_cbid.cbid_fn = reply_in_callback;
	request->rq_reply_cbid.cbid_arg = request;

	request->rq_reply_deadline = 0;
	request->rq_phase = RQ_PHASE_NEW;
	request->rq_next_phase = RQ_PHASE_UNDEFINED;

	request->rq_request_portal = imp->imp_client->cli_request_portal;
	request->rq_reply_portal = imp->imp_client->cli_reply_portal;

	ptlrpc_at_set_req_timeout(request);

	spin_lock_init(&request->rq_lock);
	INIT_LIST_HEAD(&request->rq_list);
	INIT_LIST_HEAD(&request->rq_timed_list);
	INIT_LIST_HEAD(&request->rq_replay_list);
	INIT_LIST_HEAD(&request->rq_ctx_chain);
	INIT_LIST_HEAD(&request->rq_set_chain);
	INIT_LIST_HEAD(&request->rq_history_list);
	INIT_LIST_HEAD(&request->rq_exp_list);
	init_waitqueue_head(&request->rq_reply_waitq);
	init_waitqueue_head(&request->rq_set_waitq);
	request->rq_xid = ptlrpc_next_xid();
	atomic_set(&request->rq_refcount, 1);

	lustre_msg_set_opc(request->rq_reqmsg, opcode);

	return 0;
out_ctx:
	sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1);
out_free:
	class_import_put(imp);
	return rc;
}
int ptlrpc_request_bufs_pack(struct ptlrpc_request *request,
			     __u32 version, int opcode, char **bufs,
			     struct ptlrpc_cli_ctx *ctx)
{
	int count;

	count = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT);
	return __ptlrpc_request_bufs_pack(request, version, opcode, count,
					  request->rq_pill.rc_area[RCL_CLIENT],
					  bufs, ctx);
}
EXPORT_SYMBOL(ptlrpc_request_bufs_pack);
/**
 * Pack request buffers for network transfer, performing any necessary
 * encryption steps.
 */
int ptlrpc_request_pack(struct ptlrpc_request *request,
			__u32 version, int opcode)
{
	int rc;

	rc = ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL);
	if (rc)
		return rc;

	/* For some old 1.8 clients (< 1.8.7), they will LASSERT the size of
	 * ptlrpc_body sent from server equal to local ptlrpc_body size, so we
	 * have to send old ptlrpc_body to keep interoperability with these
	 * clients.
	 *
	 * Only three kinds of server->client RPCs so far:
	 *  - LDLM_BL_CALLBACK
	 *  - LDLM_CP_CALLBACK
	 *  - LDLM_GL_CALLBACK
	 *
	 * XXX This should be removed whenever we drop the interoperability
	 *     with these old clients.
	 */
	if (opcode == LDLM_BL_CALLBACK || opcode == LDLM_CP_CALLBACK ||
	    opcode == LDLM_GL_CALLBACK)
		req_capsule_shrink(&request->rq_pill, &RMF_PTLRPC_BODY,
				   sizeof(struct ptlrpc_body_v2), RCL_CLIENT);

	return rc;
}
EXPORT_SYMBOL(ptlrpc_request_pack);
/**
 * Helper function to allocate a new request on import \a imp,
 * possibly reusing an existing request from pool \a pool if provided.
 * Returns allocated request structure with import field filled or
 * NULL on error.
 */
static inline
struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp,
					      struct ptlrpc_request_pool *pool)
{
	struct ptlrpc_request *request = NULL;

	if (pool)
		request = ptlrpc_prep_req_from_pool(pool);

	if (!request)
		request = ptlrpc_request_cache_alloc(GFP_NOFS);

	if (request) {
		LASSERTF((unsigned long)imp > 0x1000, "%p", imp);
		LASSERT(imp != LP_POISON);
		LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p",
			 imp->imp_client);
		LASSERT(imp->imp_client != LP_POISON);

		request->rq_import = class_import_get(imp);
	} else {
		CERROR("request allocation out of memory\n");
	}

	return request;
}
/**
 * Helper function for creating a request.
 * Calls __ptlrpc_request_alloc to allocate new request structure and inits
 * buffer structures according to capsule template \a format.
 * Returns allocated request structure pointer or NULL on error.
 */
static struct ptlrpc_request *
ptlrpc_request_alloc_internal(struct obd_import *imp,
			      struct ptlrpc_request_pool *pool,
			      const struct req_format *format)
{
	struct ptlrpc_request *request;

	request = __ptlrpc_request_alloc(imp, pool);
	if (request == NULL)
		return NULL;

	req_capsule_init(&request->rq_pill, request, RCL_CLIENT);
	req_capsule_set(&request->rq_pill, format);
	return request;
}
/**
 * Allocate new request structure for import \a imp and initialize its
 * buffer structure according to capsule template \a format.
 */
struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp,
					    const struct req_format *format)
{
	return ptlrpc_request_alloc_internal(imp, NULL, format);
}
EXPORT_SYMBOL(ptlrpc_request_alloc);
/**
 * Allocate new request structure for import \a imp from pool \a pool and
 * initialize its buffer structure according to capsule template \a format.
 */
struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp,
						 struct ptlrpc_request_pool *pool,
						 const struct req_format *format)
{
	return ptlrpc_request_alloc_internal(imp, pool, format);
}
EXPORT_SYMBOL(ptlrpc_request_alloc_pool);
/**
 * For requests not from pool, free memory of the request structure.
 * For requests obtained from a pool earlier, return request back to pool.
 */
void ptlrpc_request_free(struct ptlrpc_request *request)
{
	if (request->rq_pool)
		__ptlrpc_free_req_to_pool(request);
	else
		ptlrpc_request_cache_free(request);
}
EXPORT_SYMBOL(ptlrpc_request_free);
/**
 * Allocate new request for operation \a opcode and immediately pack it for
 * network transfer.
 * Only used for simple requests like OBD_PING where the only important
 * part of the request is operation itself.
 * Returns allocated request or NULL on error.
 */
struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp,
						 const struct req_format *format,
						 __u32 version, int opcode)
{
	struct ptlrpc_request *req = ptlrpc_request_alloc(imp, format);
	int rc;

	if (req) {
		rc = ptlrpc_request_pack(req, version, opcode);
		if (rc) {
			ptlrpc_request_free(req);
			req = NULL;
		}
	}
	return req;
}
EXPORT_SYMBOL(ptlrpc_request_alloc_pack);
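/*
 * Illustrative sketch for a simple request such as a ping (the format and
 * version names follow the usual RQF_/LUSTRE_*_VERSION conventions; check
 * the actual caller for the exact constants):
 *
 *	req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
 *					LUSTRE_OBD_VERSION, OBD_PING);
 *	if (req == NULL)
 *		return -ENOMEM;
 */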
/**
 * Prepare request (fetched from pool \a pool if not NULL) on import \a imp
 * for operation \a opcode. Request would contain \a count buffers.
 * Sizes of buffers are described in array \a lengths and buffers themselves
 * are provided by a pointer \a bufs.
 * Returns prepared request structure pointer or NULL on error.
 */
struct ptlrpc_request *
ptlrpc_prep_req_pool(struct obd_import *imp,
		     __u32 version, int opcode,
		     int count, __u32 *lengths, char **bufs,
		     struct ptlrpc_request_pool *pool)
{
	struct ptlrpc_request *request;
	int rc;

	request = __ptlrpc_request_alloc(imp, pool);
	if (!request)
		return NULL;

	rc = __ptlrpc_request_bufs_pack(request, version, opcode, count,
					lengths, bufs, NULL);
	if (rc) {
		ptlrpc_request_free(request);
		request = NULL;
	}
	return request;
}
EXPORT_SYMBOL(ptlrpc_prep_req_pool);
/**
 * Same as ptlrpc_prep_req_pool, but without pool
 */
struct ptlrpc_request *
ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode, int count,
		__u32 *lengths, char **bufs)
{
	return ptlrpc_prep_req_pool(imp, version, opcode, count, lengths, bufs,
				    NULL);
}
EXPORT_SYMBOL(ptlrpc_prep_req);
/**
 * Allocate and initialize new request set structure.
 * Returns a pointer to the newly allocated set structure or NULL on error.
 */
struct ptlrpc_request_set *ptlrpc_prep_set(void)
{
	struct ptlrpc_request_set *set;

	set = kzalloc(sizeof(*set), GFP_NOFS);
	if (!set)
		return NULL;
	atomic_set(&set->set_refcount, 1);
	INIT_LIST_HEAD(&set->set_requests);
	init_waitqueue_head(&set->set_waitq);
	atomic_set(&set->set_new_count, 0);
	atomic_set(&set->set_remaining, 0);
	spin_lock_init(&set->set_new_req_lock);
	INIT_LIST_HEAD(&set->set_new_requests);
	INIT_LIST_HEAD(&set->set_cblist);
	set->set_max_inflight = UINT_MAX;
	set->set_producer = NULL;
	set->set_producer_arg = NULL;
	set->set_rc = 0;

	return set;
}
EXPORT_SYMBOL(ptlrpc_prep_set);
/**
 * Allocate and initialize new request set structure with flow control
 * extension. This extension allows controlling the number of requests
 * in-flight for the whole set. A callback function to generate requests
 * must be provided and the request set will keep the number of requests
 * sent over the wire to \a max.
 * Returns a pointer to the newly allocated set structure or NULL on error.
 */
struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
					     void *arg)
{
	struct ptlrpc_request_set *set;

	set = ptlrpc_prep_set();
	if (!set)
		return NULL;

	set->set_max_inflight = max;
	set->set_producer = func;
	set->set_producer_arg = arg;

	return set;
}
EXPORT_SYMBOL(ptlrpc_prep_fcset);
/**
 * Wind down and free request set structure previously allocated with
 * ptlrpc_prep_set.
 * Ensures that all requests on the set have completed and removes
 * all requests from the request list in a set.
 * If any unsent requests happen to be on the list, pretends that they got
 * an error in flight and calls their completion handler.
 */
void ptlrpc_set_destroy(struct ptlrpc_request_set *set)
{
	struct list_head *tmp;
	struct list_head *next;
	int expected_phase;
	int n = 0;

	/* Requests on the set should either all be completed, or all be new */
	expected_phase = (atomic_read(&set->set_remaining) == 0) ?
			 RQ_PHASE_COMPLETE : RQ_PHASE_NEW;
	list_for_each(tmp, &set->set_requests) {
		struct ptlrpc_request *req =
			list_entry(tmp, struct ptlrpc_request,
				   rq_set_chain);

		LASSERT(req->rq_phase == expected_phase);
		n++;
	}

	LASSERTF(atomic_read(&set->set_remaining) == 0 ||
		 atomic_read(&set->set_remaining) == n, "%d / %d\n",
		 atomic_read(&set->set_remaining), n);

	list_for_each_safe(tmp, next, &set->set_requests) {
		struct ptlrpc_request *req =
			list_entry(tmp, struct ptlrpc_request,
				   rq_set_chain);
		list_del_init(&req->rq_set_chain);

		LASSERT(req->rq_phase == expected_phase);

		if (req->rq_phase == RQ_PHASE_NEW) {
			ptlrpc_req_interpret(NULL, req, -EBADR);
			atomic_dec(&set->set_remaining);
		}

		spin_lock(&req->rq_lock);
		req->rq_set = NULL;
		req->rq_invalid_rqset = 0;
		spin_unlock(&req->rq_lock);

		ptlrpc_req_finished(req);
	}

	LASSERT(atomic_read(&set->set_remaining) == 0);

	ptlrpc_reqset_put(set);
}
EXPORT_SYMBOL(ptlrpc_set_destroy);
/**
 * Add a callback function \a fn to the set.
 * This function would be called when all requests on this set are completed.
 * The function will be passed \a data argument.
 */
int ptlrpc_set_add_cb(struct ptlrpc_request_set *set,
		      set_interpreter_func fn, void *data)
{
	struct ptlrpc_set_cbdata *cbdata;

	cbdata = kzalloc(sizeof(*cbdata), GFP_NOFS);
	if (!cbdata)
		return -ENOMEM;

	cbdata->psc_interpret = fn;
	cbdata->psc_data = data;
	list_add_tail(&cbdata->psc_item, &set->set_cblist);

	return 0;
}
EXPORT_SYMBOL(ptlrpc_set_add_cb);
/**
 * Add a new request to the general purpose request set.
 * Assumes request reference from the caller.
 */
void ptlrpc_set_add_req(struct ptlrpc_request_set *set,
			struct ptlrpc_request *req)
{
	LASSERT(list_empty(&req->rq_set_chain));

	/* The set takes over the caller's request reference */
	list_add_tail(&req->rq_set_chain, &set->set_requests);
	req->rq_set = set;
	atomic_inc(&set->set_remaining);
	req->rq_queued_time = cfs_time_current();

	if (req->rq_reqmsg != NULL)
		lustre_msg_set_jobid(req->rq_reqmsg, NULL);

	if (set->set_producer != NULL)
		/* If the request set has a producer callback, the RPC must be
		 * sent straight away */
		ptlrpc_send_new_req(req);
}
EXPORT_SYMBOL(ptlrpc_set_add_req);
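/*
 * Typical set lifecycle, as a sketch (error handling omitted; the request
 * \a req is assumed to be already allocated and packed):
 *
 *	set = ptlrpc_prep_set();
 *	ptlrpc_set_add_req(set, req);
 *	rc = ptlrpc_set_wait(set);
 *	ptlrpc_set_destroy(set);
 */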
/**
 * Add a request to a request set with a dedicated server thread
 * and wake the thread to do any necessary processing.
 * Currently only used for ptlrpcd.
 */
void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
			    struct ptlrpc_request *req)
{
	struct ptlrpc_request_set *set = pc->pc_set;
	int count, i;

	LASSERT(req->rq_set == NULL);
	LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0);

	spin_lock(&set->set_new_req_lock);
	/*
	 * The set takes over the caller's request reference.
	 */
	req->rq_set = set;
	req->rq_queued_time = cfs_time_current();
	list_add_tail(&req->rq_set_chain, &set->set_new_requests);
	count = atomic_inc_return(&set->set_new_count);
	spin_unlock(&set->set_new_req_lock);

	/* Only need to call wakeup once for the first entry. */
	if (count == 1) {
		wake_up(&set->set_waitq);

		/* XXX: It maybe unnecessary to wakeup all the partners. But to
		 *      guarantee the async RPC can be processed ASAP, we have
		 *      no other better choice. It maybe fixed in future. */
		for (i = 0; i < pc->pc_npartners; i++)
			wake_up(&pc->pc_partners[i]->pc_set->set_waitq);
	}
}
EXPORT_SYMBOL(ptlrpc_set_add_new_req);
/**
 * Based on the current state of the import, determine if the request
 * can be sent, is an error, or should be delayed.
 *
 * Returns true if this request should be delayed. If false, and
 * *status is set, then the request can not be sent and *status is the
 * error code. If false and status is 0, then request can be sent.
 *
 * The imp->imp_lock must be held.
 */
static int ptlrpc_import_delay_req(struct obd_import *imp,
				   struct ptlrpc_request *req, int *status)
{
	int delay = 0;

	LASSERT(status != NULL);
	*status = 0;

	if (req->rq_ctx_init || req->rq_ctx_fini) {
		/* always allow ctx init/fini rpc go through */
	} else if (imp->imp_state == LUSTRE_IMP_NEW) {
		DEBUG_REQ(D_ERROR, req, "Uninitialized import.");
		*status = -EIO;
	} else if (imp->imp_state == LUSTRE_IMP_CLOSED) {
		/* pings may safely race with umount */
		DEBUG_REQ(lustre_msg_get_opc(req->rq_reqmsg) == OBD_PING ?
			  D_HA : D_ERROR, req, "IMP_CLOSED ");
		*status = -EIO;
	} else if (ptlrpc_send_limit_expired(req)) {
		/* probably doesn't need to be a D_ERROR after initial
		 * testing */
		DEBUG_REQ(D_ERROR, req, "send limit expired ");
		*status = -EIO;
	} else if (req->rq_send_state == LUSTRE_IMP_CONNECTING &&
		   imp->imp_state == LUSTRE_IMP_CONNECTING) {
		/* allow CONNECT even if import is invalid */
		if (atomic_read(&imp->imp_inval_count) != 0) {
			DEBUG_REQ(D_ERROR, req, "invalidate in flight");
			*status = -EIO;
		}
	} else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) {
		if (!imp->imp_deactive)
			DEBUG_REQ(D_NET, req, "IMP_INVALID");
		*status = -ESHUTDOWN; /* bz 12940 */
	} else if (req->rq_import_generation != imp->imp_generation) {
		DEBUG_REQ(D_ERROR, req, "req wrong generation:");
		*status = -EIO;
	} else if (req->rq_send_state != imp->imp_state) {
		/* invalidate in progress - any requests should be dropped */
		if (atomic_read(&imp->imp_inval_count) != 0) {
			DEBUG_REQ(D_ERROR, req, "invalidate in flight");
			*status = -EIO;
		} else if (imp->imp_dlm_fake || req->rq_no_delay) {
			*status = -EWOULDBLOCK;
		} else if (req->rq_allow_replay &&
			   (imp->imp_state == LUSTRE_IMP_REPLAY ||
			    imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS ||
			    imp->imp_state == LUSTRE_IMP_REPLAY_WAIT ||
			    imp->imp_state == LUSTRE_IMP_RECOVER)) {
			DEBUG_REQ(D_HA, req, "allow during recovery.\n");
		} else {
			delay = 1;
		}
	}

	return delay;
}
/**
 * Decide if the error message regarding provided request \a req
 * should be printed to the console or not.
 * Makes its decision on request status and other properties.
 * Returns 1 to print error on the system console or 0 if not.
 */
static int ptlrpc_console_allow(struct ptlrpc_request *req)
{
	__u32 opc;
	int err;

	LASSERT(req->rq_reqmsg != NULL);
	opc = lustre_msg_get_opc(req->rq_reqmsg);

	/* Suppress particular reconnect errors which are to be expected. No
	 * errors are suppressed for the initial connection on an import */
	if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) &&
	    (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) {

		/* Suppress timed out reconnect requests */
		if (req->rq_timedout)
			return 0;

		/* Suppress unavailable/again reconnect requests */
		err = lustre_msg_get_status(req->rq_repmsg);
		if (err == -ENODEV || err == -EAGAIN)
			return 0;
	}

	return 1;
}
/**
 * Check request processing status.
 * Returns the status.
 */
static int ptlrpc_check_status(struct ptlrpc_request *req)
{
	int err;

	err = lustre_msg_get_status(req->rq_repmsg);
	if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
		struct obd_import *imp = req->rq_import;
		__u32 opc = lustre_msg_get_opc(req->rq_reqmsg);

		if (ptlrpc_console_allow(req))
			LCONSOLE_ERROR_MSG(0x011, "%s: Communicating with %s, operation %s failed with %d.\n",
					   imp->imp_obd->obd_name,
					   libcfs_nid2str(
						   imp->imp_connection->c_peer.nid),
					   ll_opcode2str(opc), err);
		return err < 0 ? err : -EINVAL;
	}

	if (err < 0) {
		DEBUG_REQ(D_INFO, req, "status is %d", err);
	} else if (err > 0) {
		/* XXX: translate this error from net to host */
		DEBUG_REQ(D_INFO, req, "status is %d", err);
	}

	return err;
}
/**
 * Save pre-versions of objects into request for replay.
 * Versions are obtained from server reply.
 */
static void ptlrpc_save_versions(struct ptlrpc_request *req)
{
	struct lustre_msg *repmsg = req->rq_repmsg;
	struct lustre_msg *reqmsg = req->rq_reqmsg;
	__u64 *versions = lustre_msg_get_versions(repmsg);

	if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)
		return;

	LASSERT(versions);
	lustre_msg_set_versions(reqmsg, versions);
	CDEBUG(D_INFO, "Client save versions [%#llx/%#llx]\n",
	       versions[0], versions[1]);
}
/**
 * Callback function called when client receives RPC reply for \a req.
 * Returns 0 on success or error code.
 * The return value would be assigned to req->rq_status by the caller
 * as request processing status.
 * This function also decides if the request needs to be saved for later
 * replay.
 */
static int after_reply(struct ptlrpc_request *req)
{
	struct obd_import *imp = req->rq_import;
	struct obd_device *obd = req->rq_import->imp_obd;
	int rc;
	struct timeval work_start;
	long timediff;

	LASSERT(obd != NULL);
	/* repbuf must be unlinked */
	LASSERT(!req->rq_receiving_reply && !req->rq_reply_unlink);

	if (req->rq_reply_truncate) {
		if (ptlrpc_no_resend(req)) {
			DEBUG_REQ(D_ERROR, req, "reply buffer overflow, expected: %d, actual size: %d",
				  req->rq_nob_received, req->rq_repbuf_len);
			return -EOVERFLOW;
		}

		sptlrpc_cli_free_repbuf(req);
		/* Pass the required reply buffer size (include
		 * space for early reply).
		 * NB: no need to roundup because alloc_repbuf
		 * will roundup it */
		req->rq_replen = req->rq_nob_received;
		req->rq_nob_received = 0;
		spin_lock(&req->rq_lock);
		req->rq_resend = 1;
		spin_unlock(&req->rq_lock);
		return 0;
	}

	/*
	 * NB Until this point, the whole of the incoming message,
	 * including buflens, status etc is in the sender's byte order.
	 */
	rc = sptlrpc_cli_unwrap_reply(req);
	if (rc) {
		DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc);
		return rc;
	}

	/*
	 * Security layer unwrap might ask to resend this request.
	 */
	if (req->rq_resend)
		return 0;

	rc = unpack_reply(req);
	if (rc)
		return rc;

	/* retry indefinitely on EINPROGRESS */
	if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS &&
	    ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) {
		time_t now = get_seconds();

		DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS");
		spin_lock(&req->rq_lock);
		req->rq_resend = 1;
		spin_unlock(&req->rq_lock);
		req->rq_nr_resend++;

		/* allocate new xid to avoid reply reconstruction */
		if (!req->rq_bulk) {
			/* new xid is already allocated for bulk in
			 * ptlrpc_check_set() */
			req->rq_xid = ptlrpc_next_xid();
			DEBUG_REQ(D_RPCTRACE, req, "Allocating new xid for resend on EINPROGRESS");
		}

		/* Readjust the timeout for current conditions */
		ptlrpc_at_set_req_timeout(req);
		/* delay resend to give a chance to the server to get ready.
		 * The delay is increased by 1s on every resend and is capped
		 * to the current request timeout (i.e. obd_timeout if AT is
		 * off, or AT service time x 125% + 5s, see at_est2timeout) */
		if (req->rq_nr_resend > req->rq_timeout)
			req->rq_sent = now + req->rq_timeout;
		else
			req->rq_sent = now + req->rq_nr_resend;

		return 0;
	}

	do_gettimeofday(&work_start);
	timediff = cfs_timeval_sub(&work_start, &req->rq_arrival_time, NULL);
	if (obd->obd_svc_stats != NULL) {
		lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR,
				    timediff);
		ptlrpc_lprocfs_rpc_sent(req, timediff);
	}

	if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY &&
	    lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) {
		DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)",
			  lustre_msg_get_type(req->rq_repmsg));
		return -EPROTO;
	}

	if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING)
		CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val);
	ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg));
	ptlrpc_at_adj_net_latency(req,
				  lustre_msg_get_service_time(req->rq_repmsg));

	rc = ptlrpc_check_status(req);
	imp->imp_connect_error = rc;

	if (rc) {
		/*
		 * Either we've been evicted, or the server has failed for
		 * some reason. Try to reconnect, and if that fails, punt to
		 * the upcall.
		 */
		if (ll_rpc_recoverable_error(rc)) {
			if (req->rq_send_state != LUSTRE_IMP_FULL ||
			    imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) {
				return rc;
			}
			ptlrpc_request_handle_notconn(req);
			return rc;
		}
	} else {
		/*
		 * Let's look if server sent slv. Do it only for RPC with
		 * rc == 0.
		 */
		ldlm_cli_update_pool(req);
	}

	/*
	 * Store transno in reqmsg for replay.
	 */
	if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) {
		req->rq_transno = lustre_msg_get_transno(req->rq_repmsg);
		lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno);
	}

	if (imp->imp_replayable) {
		spin_lock(&imp->imp_lock);
		/*
		 * No point in adding already-committed requests to the replay
		 * list, we will just remove them immediately. b=9829
		 */
		if (req->rq_transno != 0 &&
		    (req->rq_transno >
		     lustre_msg_get_last_committed(req->rq_repmsg) ||
		     req->rq_replay)) {
			/** version recovery */
			ptlrpc_save_versions(req);
			ptlrpc_retain_replayable_request(req, imp);
		} else if (req->rq_commit_cb != NULL &&
			   list_empty(&req->rq_replay_list)) {
			/* NB: don't call rq_commit_cb if it's already on
			 * rq_replay_list, ptlrpc_free_committed() will call
			 * it later, see LU-3618 for details */
			spin_unlock(&imp->imp_lock);
			req->rq_commit_cb(req);
			spin_lock(&imp->imp_lock);
		}

		/*
		 * Replay-enabled imports return commit-status information.
		 */
		if (lustre_msg_get_last_committed(req->rq_repmsg)) {
			imp->imp_peer_committed_transno =
				lustre_msg_get_last_committed(req->rq_repmsg);
		}

		ptlrpc_free_committed(imp);

		if (!list_empty(&imp->imp_replay_list)) {
			struct ptlrpc_request *last;

			last = list_entry(imp->imp_replay_list.prev,
					  struct ptlrpc_request,
					  rq_replay_list);
			/*
			 * Requests with rq_replay stay on the list even if no
			 * commit is expected.
			 */
			if (last->rq_transno > imp->imp_peer_committed_transno)
				ptlrpc_pinger_commit_expected(imp);
		}

		spin_unlock(&imp->imp_lock);
	}

	return rc;
}
/**
 * Helper function to send request \a req over the network for the first
 * time. Also adjusts request phase.
 * Returns 0 on success or error code.
 */
static int ptlrpc_send_new_req(struct ptlrpc_request *req)
{
	struct obd_import *imp = req->rq_import;
	int rc;

	LASSERT(req->rq_phase == RQ_PHASE_NEW);
	if (req->rq_sent && (req->rq_sent > get_seconds()) &&
	    (!req->rq_generation_set ||
	     req->rq_import_generation == imp->imp_generation))
		return 0;

	ptlrpc_rqphase_move(req, RQ_PHASE_RPC);

	spin_lock(&imp->imp_lock);

	if (!req->rq_generation_set)
		req->rq_import_generation = imp->imp_generation;

	if (ptlrpc_import_delay_req(imp, req, &rc)) {
		spin_lock(&req->rq_lock);
		req->rq_waiting = 1;
		spin_unlock(&req->rq_lock);

		DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: (%s != %s)",
			  lustre_msg_get_status(req->rq_reqmsg),
			  ptlrpc_import_state_name(req->rq_send_state),
			  ptlrpc_import_state_name(imp->imp_state));
		LASSERT(list_empty(&req->rq_list));
		list_add_tail(&req->rq_list, &imp->imp_delayed_list);
		atomic_inc(&req->rq_import->imp_inflight);
		spin_unlock(&imp->imp_lock);
		return 0;
	}

	if (rc != 0) {
		spin_unlock(&imp->imp_lock);
		req->rq_status = rc;
		ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
		return rc;
	}

	LASSERT(list_empty(&req->rq_list));
	list_add_tail(&req->rq_list, &imp->imp_sending_list);
	atomic_inc(&req->rq_import->imp_inflight);
	spin_unlock(&imp->imp_lock);

	lustre_msg_set_status(req->rq_reqmsg, current_pid());

	rc = sptlrpc_req_refresh_ctx(req, -1);
	if (rc) {
		if (req->rq_err) {
			req->rq_status = rc;
			return 1;
		}
		spin_lock(&req->rq_lock);
		req->rq_wait_ctx = 1;
		spin_unlock(&req->rq_lock);
		return 0;
	}

	CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n",
	       current_comm(),
	       imp->imp_obd->obd_uuid.uuid,
	       lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
	       libcfs_nid2str(imp->imp_connection->c_peer.nid),
	       lustre_msg_get_opc(req->rq_reqmsg));

	rc = ptl_send_rpc(req, 0);
	if (rc) {
		DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc);
		spin_lock(&req->rq_lock);
		req->rq_net_err = 1;
		spin_unlock(&req->rq_lock);
		return rc;
	}
	return 0;
}
static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set)
{
	int remaining, rc;

	LASSERT(set->set_producer != NULL);

	remaining = atomic_read(&set->set_remaining);

	/* populate the ->set_requests list with requests until we
	 * reach the maximum number of RPCs in flight for this set */
	while (atomic_read(&set->set_remaining) < set->set_max_inflight) {
		rc = set->set_producer(set, set->set_producer_arg);
		if (rc == -ENOENT) {
			/* no more RPC to produce */
			set->set_producer = NULL;
			set->set_producer_arg = NULL;
			return 0;
		}
	}

	return (atomic_read(&set->set_remaining) - remaining);
}
/**
 * This sends any unsent RPCs in \a set and returns 1 if all are sent
 * and no more replies are expected.
 * (it is possible to get fewer replies than requests sent e.g. due to
 * timed out requests or requests that we had trouble sending out)
 *
 * NOTE: This function contains a potential schedule point (cond_resched()).
 */
int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
{
	struct list_head *tmp, *next;
	struct list_head comp_reqs;
	int force_timer_recalc = 0;

	if (atomic_read(&set->set_remaining) == 0)
		return 1;

	INIT_LIST_HEAD(&comp_reqs);
	list_for_each_safe(tmp, next, &set->set_requests) {
		struct ptlrpc_request *req =
			list_entry(tmp, struct ptlrpc_request,
				   rq_set_chain);
		struct obd_import *imp = req->rq_import;
		int unregistered = 0;
		int rc = 0;

		/* This schedule point is mainly for the ptlrpcd caller of this
		 * function. Most ptlrpc sets are not long-lived and unbounded
		 * in length, but at the least the set used by the ptlrpcd is.
		 * Since the processing time is unbounded, we need to insert an
		 * explicit schedule point to make the thread well-behaved.
		 */
		cond_resched();

		if (req->rq_phase == RQ_PHASE_NEW &&
		    ptlrpc_send_new_req(req)) {
			force_timer_recalc = 1;
		}

		/* delayed send - skip */
		if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent)
			continue;

		/* delayed resend - skip */
		if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend &&
		    req->rq_sent > get_seconds())
			continue;

		if (!(req->rq_phase == RQ_PHASE_RPC ||
		      req->rq_phase == RQ_PHASE_BULK ||
		      req->rq_phase == RQ_PHASE_INTERPRET ||
		      req->rq_phase == RQ_PHASE_UNREGISTERING ||
		      req->rq_phase == RQ_PHASE_COMPLETE)) {
			DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase);
			LBUG();
		}

		if (req->rq_phase == RQ_PHASE_UNREGISTERING) {
			LASSERT(req->rq_next_phase != req->rq_phase);
			LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED);

			/*
			 * Skip processing until reply is unlinked. We
			 * can't return to pool before that and we can't
			 * call interpret before that. We need to make
			 * sure that all rdma transfers finished and will
			 * not corrupt any data.
			 */
			if (ptlrpc_client_recv_or_unlink(req) ||
			    ptlrpc_client_bulk_active(req))
				continue;

			/*
			 * Turn fail_loc off to prevent it from looping
			 * forever.
			 */
			if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) {
				OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK,
						     OBD_FAIL_ONCE);
			}
			if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) {
				OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK,
						     OBD_FAIL_ONCE);
			}

			/*
			 * Move to next phase if reply was successfully
			 * unlinked.
			 */
			ptlrpc_rqphase_move(req, req->rq_next_phase);
		}

		if (req->rq_phase == RQ_PHASE_COMPLETE) {
			list_move_tail(&req->rq_set_chain, &comp_reqs);
			continue;
		}

		if (req->rq_phase == RQ_PHASE_INTERPRET)
			goto interpret;

		/*
		 * Note that this also will start async reply unlink.
		 */
		if (req->rq_net_err && !req->rq_timedout) {
			ptlrpc_expire_one_request(req, 1);

			/*
			 * Check if we still need to wait for unlink.
			 */
			if (ptlrpc_client_recv_or_unlink(req) ||
			    ptlrpc_client_bulk_active(req))
				continue;
			/* If there is no need to resend, fail it now. */
			if (req->rq_no_resend) {
				if (req->rq_status == 0)
					req->rq_status = -EIO;
				ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
				goto interpret;
			} else {
				continue;
			}
		}

		if (req->rq_err) {
			spin_lock(&req->rq_lock);
			req->rq_replied = 0;
			spin_unlock(&req->rq_lock);
			if (req->rq_status == 0)
				req->rq_status = -EIO;
			ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
			goto interpret;
		}

		/* ptlrpc_set_wait->l_wait_event sets lwi_allow_intr
		 * so it sets rq_intr regardless of individual rpc
		 * timeouts. The synchronous IO waiting path sets
		 * rq_intr irrespective of whether ptlrpcd
		 * has seen a timeout. Our policy is to only interpret
		 * interrupted rpcs after they have timed out, so we
		 * need to enforce that here.
		 */

		if (req->rq_intr && (req->rq_timedout || req->rq_waiting ||
				     req->rq_wait_ctx)) {
			req->rq_status = -EINTR;
			ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
			goto interpret;
		}

		if (req->rq_phase == RQ_PHASE_RPC) {
			if (req->rq_timedout || req->rq_resend ||
			    req->rq_waiting || req->rq_wait_ctx) {
				int status;

				if (!ptlrpc_unregister_reply(req, 1))
					continue;

				spin_lock(&imp->imp_lock);
				if (ptlrpc_import_delay_req(imp, req,
							    &status)) {
					/* put on delay list - only if we wait
					 * recovery finished - before send */
					list_del_init(&req->rq_list);
					list_add_tail(&req->rq_list,
						      &imp->imp_delayed_list);
					spin_unlock(&imp->imp_lock);
					continue;
				}

				if (status != 0) {
					req->rq_status = status;
					ptlrpc_rqphase_move(req,
							    RQ_PHASE_INTERPRET);
					spin_unlock(&imp->imp_lock);
					goto interpret;
				}
				if (ptlrpc_no_resend(req) &&
				    !req->rq_wait_ctx) {
					req->rq_status = -ENOTCONN;
					ptlrpc_rqphase_move(req,
							    RQ_PHASE_INTERPRET);
					spin_unlock(&imp->imp_lock);
					goto interpret;
				}

				list_del_init(&req->rq_list);
				list_add_tail(&req->rq_list,
					      &imp->imp_sending_list);

				spin_unlock(&imp->imp_lock);

				spin_lock(&req->rq_lock);
				req->rq_waiting = 0;
				spin_unlock(&req->rq_lock);

				if (req->rq_timedout || req->rq_resend) {
					/* This is re-sending anyways,
					 * let's mark req as resend. */
					spin_lock(&req->rq_lock);
					req->rq_resend = 1;
					spin_unlock(&req->rq_lock);
					if (req->rq_bulk) {
						__u64 old_xid;

						if (!ptlrpc_unregister_bulk(req, 1))
							continue;

						/* ensure previous bulk fails */
						old_xid = req->rq_xid;
						req->rq_xid = ptlrpc_next_xid();
						CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n",
						       old_xid, req->rq_xid);
					}
				}
				/*
				 * rq_wait_ctx is only touched by ptlrpcd,
				 * so no lock is needed here.
				 */
				status = sptlrpc_req_refresh_ctx(req, -1);
				if (status) {
					if (req->rq_err) {
						req->rq_status = status;
						spin_lock(&req->rq_lock);
						req->rq_wait_ctx = 0;
						spin_unlock(&req->rq_lock);
						force_timer_recalc = 1;
					} else {
						spin_lock(&req->rq_lock);
						req->rq_wait_ctx = 1;
						spin_unlock(&req->rq_lock);
					}

					continue;
				} else {
					spin_lock(&req->rq_lock);
					req->rq_wait_ctx = 0;
					spin_unlock(&req->rq_lock);
				}

				rc = ptl_send_rpc(req, 0);
				if (rc) {
					DEBUG_REQ(D_HA, req,
						  "send failed: rc = %d", rc);
					force_timer_recalc = 1;
					spin_lock(&req->rq_lock);
					req->rq_net_err = 1;
					spin_unlock(&req->rq_lock);
					continue;
				}
				/* need to reset the timeout */
				force_timer_recalc = 1;
			}

			spin_lock(&req->rq_lock);

			if (ptlrpc_client_early(req)) {
				ptlrpc_at_recv_early_reply(req);
				spin_unlock(&req->rq_lock);
				continue;
			}

			/* Still waiting for a reply? */
			if (ptlrpc_client_recv(req)) {
				spin_unlock(&req->rq_lock);
				continue;
			}

			/* Did we actually receive a reply? */
			if (!ptlrpc_client_replied(req)) {
				spin_unlock(&req->rq_lock);
				continue;
			}

			spin_unlock(&req->rq_lock);

			/* unlink from net because we are going to
			 * swab in-place of reply buffer */
			unregistered = ptlrpc_unregister_reply(req, 1);
			if (!unregistered)
				continue;

			req->rq_status = after_reply(req);
			if (req->rq_resend)
				continue;

			/* If there is no bulk associated with this request,
			 * then we're done and should let the interpreter
			 * process the reply. Similarly if the RPC returned
			 * an error, and therefore the bulk will never arrive.
			 */
			if (req->rq_bulk == NULL || req->rq_status < 0) {
				ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
				goto interpret;
			}

			ptlrpc_rqphase_move(req, RQ_PHASE_BULK);
		}

		LASSERT(req->rq_phase == RQ_PHASE_BULK);
		if (ptlrpc_client_bulk_active(req))
			continue;

		if (req->rq_bulk->bd_failure) {
			/* The RPC reply arrived OK, but the bulk screwed
			 * up! Dead weird since the server told us the RPC
			 * was good after getting the REPLY for her GET or
			 * the ACK for her PUT. */
			DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
			req->rq_status = -EIO;
		}

		ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);

interpret:
		LASSERT(req->rq_phase == RQ_PHASE_INTERPRET);

		/* This moves to "unregistering" phase we need to wait for
		 * reply unlink. */
		if (!unregistered && !ptlrpc_unregister_reply(req, 1)) {
			/* start async bulk unlink too */
			ptlrpc_unregister_bulk(req, 1);
			continue;
		}

		if (!ptlrpc_unregister_bulk(req, 1))
			continue;

		/* When calling interpret receiving already should be
		 * finished. */
		LASSERT(!req->rq_receiving_reply);

		ptlrpc_req_interpret(env, req, req->rq_status);

		if (ptlrpcd_check_work(req)) {
			atomic_dec(&set->set_remaining);
			continue;
		}
		ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE);

		CDEBUG(req->rq_reqmsg != NULL ? D_RPCTRACE : 0,
		       "Completed RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n",
		       current_comm(), imp->imp_obd->obd_uuid.uuid,
		       lustre_msg_get_status(req->rq_reqmsg), req->rq_xid,
		       libcfs_nid2str(imp->imp_connection->c_peer.nid),
		       lustre_msg_get_opc(req->rq_reqmsg));

		spin_lock(&imp->imp_lock);
		/* Request already may be not on sending or delaying list. This
		 * may happen in the case of marking it erroneous for the case
		 * ptlrpc_import_delay_req(req, status) find it impossible to
		 * allow sending this rpc and returns *status != 0. */
		if (!list_empty(&req->rq_list)) {
			list_del_init(&req->rq_list);
			atomic_dec(&imp->imp_inflight);
		}
		spin_unlock(&imp->imp_lock);

		atomic_dec(&set->set_remaining);
		wake_up_all(&imp->imp_recovery_waitq);

		if (set->set_producer) {
			/* produce a new request if possible */
			if (ptlrpc_set_producer(set) > 0)
				force_timer_recalc = 1;

			/* free the request that has just been completed
			 * in order not to pollute set->set_requests */
			list_del_init(&req->rq_set_chain);
			spin_lock(&req->rq_lock);
			req->rq_set = NULL;
			req->rq_invalid_rqset = 0;
			spin_unlock(&req->rq_lock);

			/* record rq_status to compute the final status later */
			if (req->rq_status != 0)
				set->set_rc = req->rq_status;
			ptlrpc_req_finished(req);
		} else {
			list_move_tail(&req->rq_set_chain, &comp_reqs);
		}
	}

	/* move completed request at the head of list so it's easier for
	 * caller to find them */
	list_splice(&comp_reqs, &set->set_requests);

	/* If we hit an error, we want to recover promptly. */
	return atomic_read(&set->set_remaining) == 0 || force_timer_recalc;
}
EXPORT_SYMBOL(ptlrpc_check_set);
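/*
 * The request phases traversed above form a small state machine:
 *
 *	NEW -> RPC -> BULK -> INTERPRET -> COMPLETE
 *
 * with UNREGISTERING interposed whenever reply/bulk buffers must be
 * unlinked from LNet before the next phase can safely run.
 */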
/**
 * Time out request \a req. If \a async_unlink is set, that means do not
 * wait until LNet actually confirms network buffer unlinking.
 * Return 1 if we should give up further retrying attempts or 0 otherwise.
 */
int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink)
{
	struct obd_import *imp = req->rq_import;
	int rc = 0;

	spin_lock(&req->rq_lock);
	req->rq_timedout = 1;
	spin_unlock(&req->rq_lock);

	DEBUG_REQ(D_WARNING, req, "Request sent has %s: [sent "CFS_DURATION_T
		  "/real "CFS_DURATION_T"]",
		  req->rq_net_err ? "failed due to network error" :
		  ((req->rq_real_sent == 0 ||
		    time_before((unsigned long)req->rq_real_sent,
				(unsigned long)req->rq_sent) ||
		    cfs_time_aftereq(req->rq_real_sent, req->rq_deadline)) ?
		   "timed out for sent delay" : "timed out for slow reply"),
		  req->rq_sent, req->rq_real_sent);

	if (imp != NULL && obd_debug_peer_on_timeout)
		LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer);

	ptlrpc_unregister_reply(req, async_unlink);
	ptlrpc_unregister_bulk(req, async_unlink);

	if (obd_dump_on_timeout)
		libcfs_debug_dumplog();

	if (imp == NULL) {
		DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?");
		return 1;
	}

	atomic_inc(&imp->imp_timeouts);

	/* The DLM server doesn't want recovery run on its imports. */
	if (imp->imp_dlm_fake)
		return 1;

	/* If this request is for recovery or other primordial tasks,
	 * then error it out here. */
	if (req->rq_ctx_init || req->rq_ctx_fini ||
	    req->rq_send_state != LUSTRE_IMP_FULL ||
	    imp->imp_obd->obd_no_recov) {
		DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)",
			  ptlrpc_import_state_name(req->rq_send_state),
			  ptlrpc_import_state_name(imp->imp_state));
		spin_lock(&req->rq_lock);
		req->rq_status = -ETIMEDOUT;
		req->rq_err = 1;
		spin_unlock(&req->rq_lock);
		return 1;
	}

	/* if a request can't be resent we can't wait for an answer after
	   the timeout */
	if (ptlrpc_no_resend(req)) {
		DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:");
		rc = 1;
	}

	ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg));

	return rc;
}
/**
 * Time out all uncompleted requests in request set pointed by \a data
 * Callback used when waiting on sets with l_wait_event.
 * Always returns 1.
 */
int ptlrpc_expired_set(void *data)
{
	struct ptlrpc_request_set *set = data;
	struct list_head *tmp;
	time_t now = get_seconds();

	LASSERT(set != NULL);

	/*
	 * A timeout expired. See which reqs it applies to...
	 */
	list_for_each(tmp, &set->set_requests) {
		struct ptlrpc_request *req =
			list_entry(tmp, struct ptlrpc_request,
				   rq_set_chain);

		/* don't expire request waiting for context */
		if (req->rq_wait_ctx)
			continue;

		/* Request in-flight? */
		if (!((req->rq_phase == RQ_PHASE_RPC &&
		       !req->rq_waiting && !req->rq_resend) ||
		      (req->rq_phase == RQ_PHASE_BULK)))
			continue;

		if (req->rq_timedout ||	    /* already dealt with */
		    req->rq_deadline > now) /* not expired */
			continue;

		/* Deal with this guy. Do it asynchronously to not block
		 * ptlrpcd thread. */
		ptlrpc_expire_one_request(req, 1);
	}

	/*
	 * When waiting for a whole set, we always break out of the
	 * sleep so we can recalculate the timeout, or enable interrupts
	 * if everyone's timed out.
	 */
	return 1;
}
EXPORT_SYMBOL(ptlrpc_expired_set);
/**
 * Sets rq_intr flag in \a req under spinlock.
 */
void ptlrpc_mark_interrupted(struct ptlrpc_request *req)
{
	spin_lock(&req->rq_lock);
	req->rq_intr = 1;
	spin_unlock(&req->rq_lock);
}
EXPORT_SYMBOL(ptlrpc_mark_interrupted);
/**
 * Interrupts (sets interrupted flag) all uncompleted requests in
 * a set \a data. Callback for l_wait_event for interruptible waits.
 */
void ptlrpc_interrupted_set(void *data)
{
	struct ptlrpc_request_set *set = data;
	struct list_head *tmp;

	LASSERT(set != NULL);
	CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set);

	list_for_each(tmp, &set->set_requests) {
		struct ptlrpc_request *req =
			list_entry(tmp, struct ptlrpc_request,
				   rq_set_chain);

		if (req->rq_phase != RQ_PHASE_RPC &&
		    req->rq_phase != RQ_PHASE_UNREGISTERING)
			continue;

		ptlrpc_mark_interrupted(req);
	}
}
EXPORT_SYMBOL(ptlrpc_interrupted_set);
/**
 * Get the smallest timeout in the set; this does NOT set a timeout.
 */
int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set)
{
	struct list_head *tmp;
	time_t now = get_seconds();
	int timeout = 0;
	struct ptlrpc_request *req;
	int deadline;

	list_for_each(tmp, &set->set_requests) {
		req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);

		/*
		 * Request in-flight?
		 */
		if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) ||
		      (req->rq_phase == RQ_PHASE_BULK) ||
		      (req->rq_phase == RQ_PHASE_NEW)))
			continue;

		/*
		 * Already timed out.
		 */
		if (req->rq_timedout)
			continue;

		/*
		 * Waiting for ctx.
		 */
		if (req->rq_wait_ctx)
			continue;

		if (req->rq_phase == RQ_PHASE_NEW)
			deadline = req->rq_sent;
		else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend)
			deadline = req->rq_sent;
		else
			deadline = req->rq_sent + req->rq_timeout;

		if (deadline <= now)	/* actually expired already */
			timeout = 1;	/* ASAP */
		else if (timeout == 0 || timeout > deadline - now)
			timeout = deadline - now;
	}

	return timeout;
}
EXPORT_SYMBOL(ptlrpc_set_next_timeout);
/**
 * Send all unsent requests from the set and then wait until all
 * requests in the set complete (either get a reply, timeout, get an
 * error or otherwise be interrupted).
 * Returns 0 on success or error code otherwise.
 */
int ptlrpc_set_wait(struct ptlrpc_request_set *set)
{
	struct list_head *tmp;
	struct ptlrpc_request *req;
	struct l_wait_info lwi;
	int rc, timeout;

	if (set->set_producer)
		(void)ptlrpc_set_producer(set);
	else
		list_for_each(tmp, &set->set_requests) {
			req = list_entry(tmp, struct ptlrpc_request,
					 rq_set_chain);
			if (req->rq_phase == RQ_PHASE_NEW)
				(void)ptlrpc_send_new_req(req);
		}

	if (list_empty(&set->set_requests))
		return 0;

	do {
		timeout = ptlrpc_set_next_timeout(set);

		/* wait until all complete, interrupted, or an in-flight
		 * req times out */
		CDEBUG(D_RPCTRACE, "set %p going to sleep for %d seconds\n",
		       set, timeout);

		if (timeout == 0 && !cfs_signal_pending())
			/*
			 * No requests are in-flight (either timed out
			 * or delayed), so we can allow interrupts.
			 * We still want to block for a limited time,
			 * so we allow interrupts during the timeout.
			 */
			lwi = LWI_TIMEOUT_INTR_ALL(cfs_time_seconds(1),
						   ptlrpc_expired_set,
						   ptlrpc_interrupted_set,
						   set);
		else
			/*
			 * At least one request is in flight, so no
			 * interrupts are allowed. Wait until all
			 * complete, or an in-flight req times out.
			 */
			lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1),
					  ptlrpc_expired_set, set);

		rc = l_wait_event(set->set_waitq,
				  ptlrpc_check_set(NULL, set), &lwi);

		/* LU-769 - if we ignored the signal because it was already
		 * pending when we started, we need to handle it now or we risk
		 * it being ignored forever */
		if (rc == -ETIMEDOUT && !lwi.lwi_allow_intr &&
		    cfs_signal_pending()) {
			sigset_t blocked_sigs =
				cfs_block_sigsinv(LUSTRE_FATAL_SIGS);

			/* In fact we only interrupt for the "fatal" signals
			 * like SIGINT or SIGKILL. We still ignore less
			 * important signals since ptlrpc set is not easily
			 * reentrant from userspace again */
			if (cfs_signal_pending())
				ptlrpc_interrupted_set(set);
			cfs_restore_sigs(blocked_sigs);
		}

		LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT);

		/* -EINTR => all requests have been flagged rq_intr so next
		 * check completes.
		 * -ETIMEDOUT => someone timed out.  When all reqs have
		 * timed out, signals are enabled allowing completion with
		 * EINTR.
		 * I don't really care if we go once more round the loop in
		 * the error cases -eeb. */
		if (rc == 0 && atomic_read(&set->set_remaining) == 0) {
			list_for_each(tmp, &set->set_requests) {
				req = list_entry(tmp, struct ptlrpc_request,
						 rq_set_chain);
				spin_lock(&req->rq_lock);
				req->rq_invalid_rqset = 1;
				spin_unlock(&req->rq_lock);
			}
		}
	} while (rc != 0 || atomic_read(&set->set_remaining) != 0);

	LASSERT(atomic_read(&set->set_remaining) == 0);

	rc = set->set_rc; /* rq_status of already freed requests if any */
	list_for_each(tmp, &set->set_requests) {
		req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);

		LASSERT(req->rq_phase == RQ_PHASE_COMPLETE);
		if (req->rq_status != 0)
			rc = req->rq_status;
	}

	if (set->set_interpret != NULL) {
		int (*interpreter)(struct ptlrpc_request_set *set,
				   void *, int) =
			set->set_interpret;
		rc = interpreter(set, set->set_arg, rc);
	} else {
		struct ptlrpc_set_cbdata *cbdata, *n;
		int err;

		list_for_each_entry_safe(cbdata, n,
					 &set->set_cblist, psc_item) {
			list_del_init(&cbdata->psc_item);
			err = cbdata->psc_interpret(set, cbdata->psc_data, rc);
			if (err && !rc)
				rc = err;
			kfree(cbdata);
		}
	}

	return rc;
}
EXPORT_SYMBOL(ptlrpc_set_wait);

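/*
 * Usage sketch (illustrative only, not part of the original file): a
 * caller typically builds a set, adds fully prepared requests, and waits
 * for the whole batch; "my_req" below is a hypothetical, already
 * initialized ptlrpc_request.
 *
 *	struct ptlrpc_request_set *set = ptlrpc_prep_set();
 *	int rc;
 *
 *	if (set == NULL)
 *		return -ENOMEM;
 *	ptlrpc_set_add_req(set, my_req);
 *	rc = ptlrpc_set_wait(set);
 *	ptlrpc_set_destroy(set);
 *	return rc;
 *
 * ptlrpc_queue_wait() below is exactly this pattern for a single request.
 */
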
/**
 * Helper function for request freeing.
 * Called when request count reached zero and request needs to be freed.
 * Removes request from all sorts of sending/replay lists it might be on,
 * frees network buffers if any are present.
 * If \a locked is set, that means caller is already holding import imp_lock
 * and so we no longer need to reobtain it (for certain list manipulations)
 */
static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
{
	if (request == NULL)
		return;
	LASSERTF(!request->rq_receiving_reply, "req %p\n", request);
	LASSERTF(request->rq_rqbd == NULL, "req %p\n", request);/* client-side */
	LASSERTF(list_empty(&request->rq_list), "req %p\n", request);
	LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request);
	LASSERTF(list_empty(&request->rq_exp_list), "req %p\n", request);
	LASSERTF(!request->rq_replay, "req %p\n", request);

	req_capsule_fini(&request->rq_pill);

	/* We must take it off the imp_replay_list first.  Otherwise, we'll set
	 * request->rq_reqmsg to NULL while osc_close is dereferencing it. */
	if (request->rq_import != NULL) {
		if (!locked)
			spin_lock(&request->rq_import->imp_lock);
		list_del_init(&request->rq_replay_list);
		if (!locked)
			spin_unlock(&request->rq_import->imp_lock);
	}
	LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request);

	if (atomic_read(&request->rq_refcount) != 0) {
		DEBUG_REQ(D_ERROR, request,
			  "freeing request with nonzero refcount");
		LBUG();
	}

	if (request->rq_repbuf != NULL)
		sptlrpc_cli_free_repbuf(request);
	if (request->rq_export != NULL) {
		class_export_put(request->rq_export);
		request->rq_export = NULL;
	}
	if (request->rq_import != NULL) {
		class_import_put(request->rq_import);
		request->rq_import = NULL;
	}
	if (request->rq_bulk != NULL)
		ptlrpc_free_bulk_pin(request->rq_bulk);

	if (request->rq_reqbuf != NULL || request->rq_clrbuf != NULL)
		sptlrpc_cli_free_reqbuf(request);

	if (request->rq_cli_ctx)
		sptlrpc_req_put_ctx(request, !locked);

	if (request->rq_pool)
		__ptlrpc_free_req_to_pool(request);
	else
		ptlrpc_request_cache_free(request);
}

static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked);

/**
 * Drop one request reference. Must be called with import imp_lock held.
 * When reference count drops to zero, request is freed.
 */
void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request)
{
	assert_spin_locked(&request->rq_import->imp_lock);
	(void)__ptlrpc_req_finished(request, 1);
}
EXPORT_SYMBOL(ptlrpc_req_finished_with_imp_lock);

/**
 * Drops one reference count for request \a request.
 * \a locked set indicates that caller holds import imp_lock.
 * Frees the request when reference count reaches zero.
 */
static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked)
{
	if (request == NULL)
		return 1;

	if (request == LP_POISON ||
	    request->rq_reqmsg == LP_POISON) {
		CERROR("dereferencing freed request (bug 575)\n");
		LBUG();
		return 1;
	}

	DEBUG_REQ(D_INFO, request, "refcount now %u",
		  atomic_read(&request->rq_refcount) - 1);

	if (atomic_dec_and_test(&request->rq_refcount)) {
		__ptlrpc_free_req(request, locked);
		return 1;
	}

	return 0;
}

/**
 * Drops one reference count for a request.
 */
void ptlrpc_req_finished(struct ptlrpc_request *request)
{
	__ptlrpc_req_finished(request, 0);
}
EXPORT_SYMBOL(ptlrpc_req_finished);

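/*
 * Reference-counting sketch (illustrative only): every reference taken
 * with ptlrpc_request_addref() must be balanced by ptlrpc_req_finished();
 * the request is freed when the last reference drops. "req" here is a
 * hypothetical request pointer.
 *
 *	ptlrpc_request_addref(req);
 *	... use req while holding the extra reference ...
 *	ptlrpc_req_finished(req);
 */
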
/**
 * Returns xid of a \a request
 */
__u64 ptlrpc_req_xid(struct ptlrpc_request *request)
{
	return request->rq_xid;
}
EXPORT_SYMBOL(ptlrpc_req_xid);

/**
 * Disengage the client's reply buffer from the network
 * NB does _NOT_ unregister any client-side bulk.
 * IDEMPOTENT, but _not_ safe against concurrent callers.
 * The request owner (i.e. the thread doing the I/O) must call...
 * Returns 0 on success or 1 if unregistering cannot be made.
 */
int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async)
{
	int rc;
	wait_queue_head_t *wq;
	struct l_wait_info lwi;

	/*
	 * Might sleep.
	 */
	LASSERT(!in_interrupt());

	/*
	 * Let's setup deadline for reply unlink.
	 */
	if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) &&
	    async && request->rq_reply_deadline == 0)
		request->rq_reply_deadline = get_seconds()+LONG_UNLINK;

	/*
	 * Nothing left to do.
	 */
	if (!ptlrpc_client_recv_or_unlink(request))
		return 1;

	LNetMDUnlink(request->rq_reply_md_h);

	/*
	 * Let's check it once again.
	 */
	if (!ptlrpc_client_recv_or_unlink(request))
		return 1;

	/*
	 * Move to "Unregistering" phase as reply was not unlinked yet.
	 */
	ptlrpc_rqphase_move(request, RQ_PHASE_UNREGISTERING);

	/*
	 * Do not wait for unlink to finish.
	 */
	if (async)
		return 0;

	/*
	 * We have to l_wait_event() whatever the result, to give liblustre
	 * a chance to run reply_in_callback(), and to make sure we've
	 * unlinked before returning a req to the pool.
	 */
	if (request->rq_set != NULL)
		wq = &request->rq_set->set_waitq;
	else
		wq = &request->rq_reply_waitq;

	for (;;) {
		/* Network access will complete in finite time but the HUGE
		 * timeout lets us CWARN for visibility of sluggish NALs */
		lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK),
					   cfs_time_seconds(1), NULL, NULL);
		rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request),
				  &lwi);
		if (rc == 0) {
			ptlrpc_rqphase_move(request, request->rq_next_phase);
			return 1;
		}

		LASSERT(rc == -ETIMEDOUT);
		DEBUG_REQ(D_WARNING, request,
			  "Unexpectedly long timeout rvcng=%d unlnk=%d/%d",
			  request->rq_receiving_reply,
			  request->rq_req_unlink, request->rq_reply_unlink);
	}
}
EXPORT_SYMBOL(ptlrpc_unregister_reply);

static void ptlrpc_free_request(struct ptlrpc_request *req)
{
	spin_lock(&req->rq_lock);
	req->rq_replay = 0;
	spin_unlock(&req->rq_lock);

	if (req->rq_commit_cb != NULL)
		req->rq_commit_cb(req);
	list_del_init(&req->rq_replay_list);

	__ptlrpc_req_finished(req, 1);
}

/**
 * the request is committed and dropped from the replay list of its import
 */
void ptlrpc_request_committed(struct ptlrpc_request *req, int force)
{
	struct obd_import *imp = req->rq_import;

	spin_lock(&imp->imp_lock);
	if (list_empty(&req->rq_replay_list)) {
		spin_unlock(&imp->imp_lock);
		return;
	}

	if (force || req->rq_transno <= imp->imp_peer_committed_transno)
		ptlrpc_free_request(req);

	spin_unlock(&imp->imp_lock);
}
EXPORT_SYMBOL(ptlrpc_request_committed);

/**
 * Iterates through replay_list on import and prunes
 * all requests that have a transno smaller than last_committed for the
 * import and don't have rq_replay set.
 * Since requests are sorted in transno order, stops when meeting the first
 * transno bigger than last_committed.
 * caller must hold imp->imp_lock
 */
void ptlrpc_free_committed(struct obd_import *imp)
{
	struct ptlrpc_request *req, *saved;
	struct ptlrpc_request *last_req = NULL; /* temporary fire escape */
	bool skip_committed_list = true;

	LASSERT(imp != NULL);
	assert_spin_locked(&imp->imp_lock);

	if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked &&
	    imp->imp_generation == imp->imp_last_generation_checked) {
		CDEBUG(D_INFO, "%s: skip recheck: last_committed %llu\n",
		       imp->imp_obd->obd_name, imp->imp_peer_committed_transno);
		return;
	}
	CDEBUG(D_RPCTRACE, "%s: committing for last_committed %llu gen %d\n",
	       imp->imp_obd->obd_name, imp->imp_peer_committed_transno,
	       imp->imp_generation);

	if (imp->imp_generation != imp->imp_last_generation_checked)
		skip_committed_list = false;

	imp->imp_last_transno_checked = imp->imp_peer_committed_transno;
	imp->imp_last_generation_checked = imp->imp_generation;

	list_for_each_entry_safe(req, saved, &imp->imp_replay_list,
				 rq_replay_list) {
		/* XXX ok to remove when 1357 resolved - rread 05/29/03 */
		LASSERT(req != last_req);
		last_req = req;

		if (req->rq_transno == 0) {
			DEBUG_REQ(D_EMERG, req, "zero transno during replay");
			LBUG();
		}
		if (req->rq_import_generation < imp->imp_generation) {
			DEBUG_REQ(D_RPCTRACE, req, "free request with old gen");
			goto free_req;
		}

		/* not yet committed */
		if (req->rq_transno > imp->imp_peer_committed_transno) {
			DEBUG_REQ(D_RPCTRACE, req, "stopping search");
			break;
		}

		if (req->rq_replay) {
			DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)");
			list_move_tail(&req->rq_replay_list,
				       &imp->imp_committed_list);
			continue;
		}

		DEBUG_REQ(D_INFO, req, "commit (last_committed %llu)",
			  imp->imp_peer_committed_transno);
free_req:
		ptlrpc_free_request(req);
	}
	if (skip_committed_list)
		return;

	list_for_each_entry_safe(req, saved, &imp->imp_committed_list,
				 rq_replay_list) {
		LASSERT(req->rq_transno != 0);
		if (req->rq_import_generation < imp->imp_generation) {
			DEBUG_REQ(D_RPCTRACE, req, "free stale open request");
			ptlrpc_free_request(req);
		}
	}
}

void ptlrpc_cleanup_client(struct obd_import *imp)
{
}
EXPORT_SYMBOL(ptlrpc_cleanup_client);

/**
 * Schedule previously sent request for resend.
 * For bulk requests we assign new xid (to avoid problems with
 * lost replies and therefore several transfers landing into same buffer
 * from different sending attempts).
 */
void ptlrpc_resend_req(struct ptlrpc_request *req)
{
	DEBUG_REQ(D_HA, req, "going to resend");
	spin_lock(&req->rq_lock);

	/* Request got reply but is still linked to the import list.
	   Let ptlrpc_check_set() process it. */
	if (ptlrpc_client_replied(req)) {
		spin_unlock(&req->rq_lock);
		DEBUG_REQ(D_HA, req, "it has reply, so skip it");
		return;
	}

	lustre_msg_set_handle(req->rq_reqmsg, &(struct lustre_handle){ 0 });
	req->rq_status = -EAGAIN;

	req->rq_resend = 1;
	req->rq_net_err = 0;
	req->rq_timedout = 0;
	if (req->rq_bulk) {
		__u64 old_xid = req->rq_xid;

		/* ensure previous bulk fails */
		req->rq_xid = ptlrpc_next_xid();
		CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n",
		       old_xid, req->rq_xid);
	}
	ptlrpc_client_wake_req(req);
	spin_unlock(&req->rq_lock);
}
EXPORT_SYMBOL(ptlrpc_resend_req);

/* XXX: this function and rq_status are currently unused */
void ptlrpc_restart_req(struct ptlrpc_request *req)
{
	DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request");
	req->rq_status = -ERESTARTSYS;

	spin_lock(&req->rq_lock);
	req->rq_restart = 1;
	req->rq_timedout = 0;
	ptlrpc_client_wake_req(req);
	spin_unlock(&req->rq_lock);
}
EXPORT_SYMBOL(ptlrpc_restart_req);

/**
 * Grab additional reference on a request \a req
 */
struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req)
{
	atomic_inc(&req->rq_refcount);
	return req;
}
EXPORT_SYMBOL(ptlrpc_request_addref);

/**
 * Add a request to import replay_list.
 * Must be called under imp_lock
 */
void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
				      struct obd_import *imp)
{
	struct list_head *tmp;

	assert_spin_locked(&imp->imp_lock);

	if (req->rq_transno == 0) {
		DEBUG_REQ(D_EMERG, req, "saving request with zero transno");
		LBUG();
	}

	/* clear this for new requests that were resent as well
	   as resent replayed requests. */
	lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT);

	/* don't re-add requests that have been replayed */
	if (!list_empty(&req->rq_replay_list))
		return;

	lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY);

	LASSERT(imp->imp_replayable);
	/* Balanced in ptlrpc_free_committed, usually. */
	ptlrpc_request_addref(req);
	list_for_each_prev(tmp, &imp->imp_replay_list) {
		struct ptlrpc_request *iter =
			list_entry(tmp, struct ptlrpc_request,
				   rq_replay_list);

		/* We may have duplicate transnos if we create and then
		 * open a file, or for closes retained to match creating
		 * opens, so use req->rq_xid as a secondary key.
		 * (See bugs 684, 685, and 428.)
		 * XXX no longer needed, but all opens need transnos!
		 */
		if (iter->rq_transno > req->rq_transno)
			continue;

		if (iter->rq_transno == req->rq_transno) {
			LASSERT(iter->rq_xid != req->rq_xid);
			if (iter->rq_xid > req->rq_xid)
				continue;
		}

		list_add(&req->rq_replay_list, &iter->rq_replay_list);
		return;
	}

	list_add(&req->rq_replay_list, &imp->imp_replay_list);
}
EXPORT_SYMBOL(ptlrpc_retain_replayable_request);

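/*
 * Worked example of the ordering kept above (illustrative): with entries
 * keyed as (transno, xid), a request (5, 12) is inserted after (5, 10)
 * but before (6, 3), so the replay list stays sorted by transno and, for
 * equal transnos, by xid.
 */
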
/**
 * Send request and wait until it completes.
 * Returns request processing status.
 */
int ptlrpc_queue_wait(struct ptlrpc_request *req)
{
	struct ptlrpc_request_set *set;
	int rc;

	LASSERT(req->rq_set == NULL);
	LASSERT(!req->rq_receiving_reply);

	set = ptlrpc_prep_set();
	if (set == NULL) {
		CERROR("Unable to allocate ptlrpc set.");
		return -ENOMEM;
	}

	/* for distributed debugging */
	lustre_msg_set_status(req->rq_reqmsg, current_pid());

	/* add a ref for the set (see comment in ptlrpc_set_add_req) */
	ptlrpc_request_addref(req);
	ptlrpc_set_add_req(set, req);
	rc = ptlrpc_set_wait(set);
	ptlrpc_set_destroy(set);

	return rc;
}
EXPORT_SYMBOL(ptlrpc_queue_wait);

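/*
 * Note: ptlrpc_queue_wait() is the synchronous convenience wrapper over
 * the set API: it wraps a single request in a one-shot set, waits via
 * ptlrpc_set_wait(), and returns the request processing status.
 */
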
struct ptlrpc_replay_async_args {
	int praa_old_state;
	int praa_old_status;
};

/**
 * Callback used for replayed requests reply processing.
 * In case of successful reply calls registered request replay callback.
 * In case of error restart replay process.
 */
static int ptlrpc_replay_interpret(const struct lu_env *env,
				   struct ptlrpc_request *req,
				   void *data, int rc)
{
	struct ptlrpc_replay_async_args *aa = data;
	struct obd_import *imp = req->rq_import;

	atomic_dec(&imp->imp_replay_inflight);

	if (!ptlrpc_client_replied(req)) {
		CERROR("request replay timed out, restarting recovery\n");
		rc = -ETIMEDOUT;
		goto out;
	}

	if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR &&
	    (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN ||
	     lustre_msg_get_status(req->rq_repmsg) == -ENODEV)) {
		rc = lustre_msg_get_status(req->rq_repmsg);
		goto out;
	}

	/** VBR: check version failure */
	if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) {
		/** replay failed due to version mismatch */
		DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n");
		spin_lock(&imp->imp_lock);
		imp->imp_vbr_failed = 1;
		imp->imp_no_lock_replay = 1;
		spin_unlock(&imp->imp_lock);
		lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);
	} else {
		/** The transno had better not change over replay. */
		LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) ==
			 lustre_msg_get_transno(req->rq_repmsg) ||
			 lustre_msg_get_transno(req->rq_repmsg) == 0,
			 "%#llx/%#llx\n",
			 lustre_msg_get_transno(req->rq_reqmsg),
			 lustre_msg_get_transno(req->rq_repmsg));
	}

	spin_lock(&imp->imp_lock);
	/** if replays by version then a gap occurred on server, no trust to locks */
	if (lustre_msg_get_flags(req->rq_repmsg) & MSG_VERSION_REPLAY)
		imp->imp_no_lock_replay = 1;
	imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg);
	spin_unlock(&imp->imp_lock);
	LASSERT(imp->imp_last_replay_transno);

	/* transaction number shouldn't be bigger than the latest replayed */
	if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) {
		DEBUG_REQ(D_ERROR, req,
			  "Reported transno %llu is bigger than the replayed one: %llu",
			  req->rq_transno,
			  lustre_msg_get_transno(req->rq_reqmsg));
		LBUG();
	}

	DEBUG_REQ(D_HA, req, "got rep");

	/* let the callback do fixups, possibly including in the request */
	if (req->rq_replay_cb)
		req->rq_replay_cb(req);

	if (ptlrpc_client_replied(req) &&
	    lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status)
		DEBUG_REQ(D_ERROR, req, "status %d, old was %d",
			  lustre_msg_get_status(req->rq_repmsg),
			  aa->praa_old_status);
	else
		/* Put it back for re-replay. */
		lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status);

	/*
	 * Errors while replay can set transno to 0, but
	 * imp_last_replay_transno shouldn't be set to 0 anyway
	 */
	if (req->rq_transno == 0)
		CERROR("Transno is 0 during replay!\n");

	/* continue with recovery */
	rc = ptlrpc_import_recovery_state_machine(imp);
out:
	req->rq_send_state = aa->praa_old_state;

	if (rc != 0)
		/* this replay failed, so restart recovery */
		ptlrpc_connect_import(imp);

	return rc;
}

/**
 * Prepares and queues request for replay.
 * Adds it to ptlrpcd queue for actual sending.
 * Returns 0 on success.
 */
int ptlrpc_replay_req(struct ptlrpc_request *req)
{
	struct ptlrpc_replay_async_args *aa;

	LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY);

	LASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
	aa = ptlrpc_req_async_args(req);
	memset(aa, 0, sizeof(*aa));

	/* Prepare request to be resent with ptlrpcd */
	aa->praa_old_state = req->rq_send_state;
	req->rq_send_state = LUSTRE_IMP_REPLAY;
	req->rq_phase = RQ_PHASE_NEW;
	req->rq_next_phase = RQ_PHASE_UNDEFINED;
	if (req->rq_repmsg)
		aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg);
	req->rq_status = 0;
	req->rq_interpret_reply = ptlrpc_replay_interpret;
	/* Readjust the timeout for current conditions */
	ptlrpc_at_set_req_timeout(req);

	/* Tell server the net_latency, so the server can calculate how long
	 * it should wait for next replay */
	lustre_msg_set_service_time(req->rq_reqmsg,
				    ptlrpc_at_get_net_latency(req));
	DEBUG_REQ(D_HA, req, "REPLAY");

	atomic_inc(&req->rq_import->imp_replay_inflight);
	ptlrpc_request_addref(req); /* ptlrpcd needs a ref */

	ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
	return 0;
}
EXPORT_SYMBOL(ptlrpc_replay_req);

/**
 * Aborts all in-flight requests on import \a imp sending and delayed lists
 */
void ptlrpc_abort_inflight(struct obd_import *imp)
{
	struct list_head *tmp, *n;

	/* Make sure that no new requests get processed for this import.
	 * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing
	 * this flag and then putting requests on sending_list or delayed_list.
	 */
	spin_lock(&imp->imp_lock);

	/* XXX locking?  Maybe we should remove each request with the list
	 * locked?  Also, how do we know if the requests on the list are
	 * being freed at this time?
	 */
	list_for_each_safe(tmp, n, &imp->imp_sending_list) {
		struct ptlrpc_request *req =
			list_entry(tmp, struct ptlrpc_request, rq_list);

		DEBUG_REQ(D_RPCTRACE, req, "inflight");

		spin_lock(&req->rq_lock);
		if (req->rq_import_generation < imp->imp_generation) {
			req->rq_err = 1;
			req->rq_status = -EIO;
			ptlrpc_client_wake_req(req);
		}
		spin_unlock(&req->rq_lock);
	}

	list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
		struct ptlrpc_request *req =
			list_entry(tmp, struct ptlrpc_request, rq_list);

		DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req");

		spin_lock(&req->rq_lock);
		if (req->rq_import_generation < imp->imp_generation) {
			req->rq_err = 1;
			req->rq_status = -EIO;
			ptlrpc_client_wake_req(req);
		}
		spin_unlock(&req->rq_lock);
	}

	/* Last chance to free reqs left on the replay list, but we
	 * will still leak reqs that haven't committed. */
	if (imp->imp_replayable)
		ptlrpc_free_committed(imp);

	spin_unlock(&imp->imp_lock);
}
EXPORT_SYMBOL(ptlrpc_abort_inflight);

/**
 * Abort all uncompleted requests in request set \a set
 */
void ptlrpc_abort_set(struct ptlrpc_request_set *set)
{
	struct list_head *tmp, *pos;

	LASSERT(set != NULL);

	list_for_each_safe(pos, tmp, &set->set_requests) {
		struct ptlrpc_request *req =
			list_entry(pos, struct ptlrpc_request,
				   rq_set_chain);

		spin_lock(&req->rq_lock);
		if (req->rq_phase != RQ_PHASE_RPC) {
			spin_unlock(&req->rq_lock);
			continue;
		}

		req->rq_err = 1;
		req->rq_status = -EINTR;
		ptlrpc_client_wake_req(req);
		spin_unlock(&req->rq_lock);
	}
}

static __u64 ptlrpc_last_xid;
static spinlock_t ptlrpc_last_xid_lock;

/**
 * Initialize the XID for the node.  This is common among all requests on
 * this node, and only requires the property that it is monotonically
 * increasing.  It does not need to be sequential.  Since this is also used
 * as the RDMA match bits, it is important that a single client NOT have
 * the same match bits for two different in-flight requests, hence we do
 * NOT want to have an XID per target or similar.
 *
 * To avoid an unlikely collision between match bits after a client reboot
 * (which would deliver old data into the wrong RDMA buffer) initialize
 * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s.
 * If the time is clearly incorrect, we instead use a 62-bit random number.
 * In the worst case the random number will overflow 1M RPCs per second in
 * 9133 years, or permutations thereof.
 */
#define YEAR_2004 (1ULL << 30)
void ptlrpc_init_xid(void)
{
	time_t now = get_seconds();

	spin_lock_init(&ptlrpc_last_xid_lock);
	if (now < YEAR_2004) {
		cfs_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid));
		ptlrpc_last_xid >>= 2;
		ptlrpc_last_xid |= (1ULL << 61);
	} else {
		ptlrpc_last_xid = (__u64)now << 20;
	}

	/* Always need to be aligned to a power-of-two for multi-bulk BRW */
	CLASSERT((PTLRPC_BULK_OPS_COUNT & (PTLRPC_BULK_OPS_COUNT - 1)) == 0);
	ptlrpc_last_xid &= PTLRPC_BULK_OPS_MASK;
}

/**
 * Increase xid and returns resulting new value to the caller.
 *
 * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting
 * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC
 * itself uses the last bulk xid needed, so the server can determine the
 * number of bulk transfers from the RPC XID and a bitmask.  The starting
 * xid must align to a power-of-two value.
 *
 * This is assumed to be true due to the initial ptlrpc_last_xid
 * value also being initialized to a power-of-two value. LU-1431
 */
__u64 ptlrpc_next_xid(void)
{
	__u64 next;

	spin_lock(&ptlrpc_last_xid_lock);
	next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
	ptlrpc_last_xid = next;
	spin_unlock(&ptlrpc_last_xid_lock);

	return next;
}
EXPORT_SYMBOL(ptlrpc_next_xid);

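/*
 * Illustrative sketch (assumed helper, not part of this file): because
 * every xid returned above is PTLRPC_BULK_OPS_COUNT-aligned and a BRW RPC
 * carries the xid of the last bulk transfer it used, a server could
 * recover the number of transfers from the low bits of the RPC xid:
 *
 *	static inline int brw_bulk_count(__u64 rpc_xid)
 *	{
 *		return (int)(rpc_xid & (PTLRPC_BULK_OPS_COUNT - 1)) + 1;
 *	}
 */
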
/**
 * Get a glimpse at what the next xid value might have been.
 * Returns a possible next xid.
 */
__u64 ptlrpc_sample_next_xid(void)
{
#if BITS_PER_LONG == 32
	/* need to avoid possible word tearing on 32-bit systems */
	__u64 next;

	spin_lock(&ptlrpc_last_xid_lock);
	next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
	spin_unlock(&ptlrpc_last_xid_lock);

	return next;
#else
	/* No need to lock, since returned value is racy anyway */
	return ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
#endif
}
EXPORT_SYMBOL(ptlrpc_sample_next_xid);

/**
 * Functions for operating ptlrpc workers.
 *
 * A ptlrpc work is a function which will be running inside ptlrpc context.
 * The callback shouldn't sleep, otherwise it will block that ptlrpcd thread.
 *
 * 1. after a work is created, it can be used many times, that is:
 *         handler = ptlrpcd_alloc_work();
 *         ptlrpcd_queue_work();
 *
 *    queue it again when necessary:
 *         ptlrpcd_queue_work();
 *         ptlrpcd_destroy_work();
 * 2. ptlrpcd_queue_work() can be called by multiple processes meanwhile, but
 *    it will only be queued once at any time. Also, as its name implies, there
 *    may be a delay before the work actually runs on a ptlrpcd thread.
 */
struct ptlrpc_work_async_args {
	int (*cb)(const struct lu_env *, void *);
	void *cbdata;
};

static void ptlrpcd_add_work_req(struct ptlrpc_request *req)
{
	/* re-initialize the req */
	req->rq_timeout = obd_timeout;
	req->rq_sent = get_seconds();
	req->rq_deadline = req->rq_sent + req->rq_timeout;
	req->rq_reply_deadline = req->rq_deadline;
	req->rq_phase = RQ_PHASE_INTERPRET;
	req->rq_next_phase = RQ_PHASE_COMPLETE;
	req->rq_xid = ptlrpc_next_xid();
	req->rq_import_generation = req->rq_import->imp_generation;

	ptlrpcd_add_req(req, PDL_POLICY_ROUND, -1);
}

static int work_interpreter(const struct lu_env *env,
			    struct ptlrpc_request *req, void *data, int rc)
{
	struct ptlrpc_work_async_args *arg = data;

	LASSERT(ptlrpcd_check_work(req));
	LASSERT(arg->cb != NULL);

	rc = arg->cb(env, arg->cbdata);

	list_del_init(&req->rq_set_chain);
	req->rq_set = NULL;

	if (atomic_dec_return(&req->rq_refcount) > 1) {
		atomic_set(&req->rq_refcount, 2);
		ptlrpcd_add_work_req(req);
	}
	return rc;
}

static int worker_format;

static int ptlrpcd_check_work(struct ptlrpc_request *req)
{
	return req->rq_pill.rc_fmt == (void *)&worker_format;
}

/**
 * Create a work for ptlrpc.
 */
void *ptlrpcd_alloc_work(struct obd_import *imp,
			 int (*cb)(const struct lu_env *, void *), void *cbdata)
{
	struct ptlrpc_request *req = NULL;
	struct ptlrpc_work_async_args *args;

	might_sleep();

	if (cb == NULL)
		return ERR_PTR(-EINVAL);

	/* copy some code from deprecated fakereq. */
	req = ptlrpc_request_cache_alloc(GFP_NOFS);
	if (req == NULL) {
		CERROR("ptlrpc: run out of memory!\n");
		return ERR_PTR(-ENOMEM);
	}

	req->rq_send_state = LUSTRE_IMP_FULL;
	req->rq_type = PTL_RPC_MSG_REQUEST;
	req->rq_import = class_import_get(imp);
	req->rq_export = NULL;
	req->rq_interpret_reply = work_interpreter;
	/* don't want reply */
	req->rq_receiving_reply = 0;
	req->rq_req_unlink = req->rq_reply_unlink = 0;
	req->rq_no_delay = req->rq_no_resend = 1;
	req->rq_pill.rc_fmt = (void *)&worker_format;

	spin_lock_init(&req->rq_lock);
	INIT_LIST_HEAD(&req->rq_list);
	INIT_LIST_HEAD(&req->rq_replay_list);
	INIT_LIST_HEAD(&req->rq_set_chain);
	INIT_LIST_HEAD(&req->rq_history_list);
	INIT_LIST_HEAD(&req->rq_exp_list);
	init_waitqueue_head(&req->rq_reply_waitq);
	init_waitqueue_head(&req->rq_set_waitq);
	atomic_set(&req->rq_refcount, 1);

	CLASSERT(sizeof(*args) <= sizeof(req->rq_async_args));
	args = ptlrpc_req_async_args(req);
	args->cb = cb;
	args->cbdata = cbdata;

	return req;
}
EXPORT_SYMBOL(ptlrpcd_alloc_work);

void ptlrpcd_destroy_work(void *handler)
{
	struct ptlrpc_request *req = handler;

	if (req)
		ptlrpc_req_finished(req);
}
EXPORT_SYMBOL(ptlrpcd_destroy_work);

int ptlrpcd_queue_work(void *handler)
{
	struct ptlrpc_request *req = handler;

	/*
	 * Check if the req is already being queued.
	 *
	 * Here comes a trick: there is no reliable way in ptlrpc to check
	 * whether a req is being processed, so the refcount of the req is
	 * used for this purpose. This is okay because the caller should
	 * treat this req as opaque data. - Jinshan
	 */
	LASSERT(atomic_read(&req->rq_refcount) > 0);
	if (atomic_inc_return(&req->rq_refcount) == 2)
		ptlrpcd_add_work_req(req);
	return 0;
}
EXPORT_SYMBOL(ptlrpcd_queue_work);

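/*
 * Usage sketch for the worker API above (illustrative; "my_cb",
 * "my_data" and "do_quick_work" are hypothetical). The callback runs in
 * ptlrpcd context and must not sleep:
 *
 *	static int my_cb(const struct lu_env *env, void *data)
 *	{
 *		return do_quick_work(data);
 *	}
 *
 *	void *work = ptlrpcd_alloc_work(imp, my_cb, my_data);
 *
 *	if (!IS_ERR(work)) {
 *		ptlrpcd_queue_work(work);
 *		...
 *		ptlrpcd_destroy_work(work);
 *	}
 */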