net/ceph/mon_client.c

   1 #include <linux/ceph/ceph_debug.h>
   2
   3 #include <linux/module.h>
   4 #include <linux/types.h>
   5 #include <linux/slab.h>
   6 #include <linux/random.h>
   7 #include <linux/sched.h>
   8
   9 #include <linux/ceph/mon_client.h>
  10 #include <linux/ceph/libceph.h>
  11 #include <linux/ceph/debugfs.h>
  12 #include <linux/ceph/decode.h>
  13 #include <linux/ceph/auth.h>
  14
  15 /*
  16  * Interact with Ceph monitor cluster.  Handle requests for new map
  17  * versions, and periodically resend as needed.  Also implement
  18  * statfs() and umount().
  19  *
  20  * A small cluster of Ceph "monitors" are responsible for managing critical
  21  * cluster configuration and state information.  An odd number (e.g., 3, 5)
  22  * of cmon daemons use a modified version of the Paxos part-time parliament
  23  * algorithm to manage the MDS map (mds cluster membership), OSD map, and
  24  * list of clients who have mounted the file system.
  25  *
  26  * We maintain an open, active session with a monitor at all times in order to
  27  * receive timely MDSMap updates.  We periodically send a keepalive byte on the
  28  * TCP socket to ensure we detect a failure.  If the connection does break, we
  29  * randomly hunt for a new monitor.  Once the connection is reestablished, we
  30  * resend any outstanding requests.
  31  */
  32
  33 static const struct ceph_connection_operations mon_con_ops;
  34
  35 static int __validate_auth(struct ceph_mon_client *monc);
  36
  37 /*
  38  * Decode a monmap blob (e.g., during mount).
  39  */
  40 struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
  41 {
  42         struct ceph_monmap *m = NULL;
  43         int i, err = -EINVAL;
  44         struct ceph_fsid fsid;
  45         u32 epoch, num_mon;
  46         u16 version;
  47         u32 len;
  48
  49         ceph_decode_32_safe(&p, end, len, bad);
  50         ceph_decode_need(&p, end, len, bad);
  51
  52         dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
  53
  54         ceph_decode_16_safe(&p, end, version, bad);
  55
  56         ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
  57         ceph_decode_copy(&p, &fsid, sizeof(fsid));
  58         epoch = ceph_decode_32(&p);
  59
  60         num_mon = ceph_decode_32(&p);
  61         ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
  62
  63         if (num_mon >= CEPH_MAX_MON)
  64                 goto bad;
  65         m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
  66         if (m == NULL)
  67                 return ERR_PTR(-ENOMEM);
  68         m->fsid = fsid;
  69         m->epoch = epoch;
  70         m->num_mon = num_mon;
  71         ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
  72         for (i = 0; i < num_mon; i++)
  73                 ceph_decode_addr(&m->mon_inst[i].addr);
  74
  75         dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
  76              m->num_mon);
  77         for (i = 0; i < m->num_mon; i++)
  78                 dout("monmap_decode  mon%d is %s\n", i,
  79                      ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
  80         return m;
  81
  82 bad:
  83         dout("monmap_decode failed with %d\n", err);
  84         kfree(m);
  85         return ERR_PTR(err);
  86 }
  87
  88 /*
  89  * return true if *addr is included in the monmap.
  90  */
  91 int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
  92 {
  93         int i;
  94
  95         for (i = 0; i < m->num_mon; i++)
  96                 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
  97                         return 1;
  98         return 0;
  99 }
 100
 101 /*
 102  * Send an auth request.
 103  */
 104 static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
 105 {
 106         monc->pending_auth = 1;
 107         monc->m_auth->front.iov_len = len;
 108         monc->m_auth->hdr.front_len = cpu_to_le32(len);
 109         ceph_msg_revoke(monc->m_auth);
 110         ceph_msg_get(monc->m_auth);  /* keep our ref */
 111         ceph_con_send(&monc->con, monc->m_auth);
 112 }
 113
/*
 * Close monitor session, if any.
 *
 * Revokes the preallocated auth and subscribe messages (both the outgoing
 * requests and the buffers reserved for their replies), closes the
 * connection, and resets the authentication state.  The callers visible
 * in this file invoke it with monc->mutex held.
 */
static void __close_session(struct ceph_mon_client *monc)
{
	dout("__close_session closing mon%d\n", monc->cur_mon);
	ceph_msg_revoke(monc->m_auth);
	ceph_msg_revoke_incoming(monc->m_auth_reply);
	ceph_msg_revoke(monc->m_subscribe);
	ceph_msg_revoke_incoming(monc->m_subscribe_ack);
	ceph_con_close(&monc->con);

	/* no auth exchange can be in flight once the connection is closed */
	monc->pending_auth = 0;
	ceph_auth_reset(monc->auth);
}
 129
 130 /*
 131  * Pick a new monitor at random and set cur_mon.  If we are repicking
 132  * (i.e. cur_mon is already set), be sure to pick a different one.
 133  */
 134 static void pick_new_mon(struct ceph_mon_client *monc)
 135 {
 136         int old_mon = monc->cur_mon;
 137
 138         BUG_ON(monc->monmap->num_mon < 1);
 139
 140         if (monc->monmap->num_mon == 1) {
 141                 monc->cur_mon = 0;
 142         } else {
 143                 int max = monc->monmap->num_mon;
 144                 int o = -1;
 145                 int n;
 146
 147                 if (monc->cur_mon >= 0) {
 148                         if (monc->cur_mon < monc->monmap->num_mon)
 149                                 o = monc->cur_mon;
 150                         if (o >= 0)
 151                                 max--;
 152                 }
 153
 154                 n = prandom_u32() % max;
 155                 if (o >= 0 && n >= o)
 156                         n++;
 157
 158                 monc->cur_mon = n;
 159         }
 160
 161         dout("%s mon%d -> mon%d out of %d mons\n", __func__, old_mon,
 162              monc->cur_mon, monc->monmap->num_mon);
 163 }
 164
/*
 * Open a session with a new monitor.
 *
 * Picks a monitor, enters hunting mode, opens the connection to it and
 * kicks off the authentication handshake.  If we have been connected
 * before, the hunt interval multiplier is backed off (up to
 * CEPH_MONC_HUNT_MAX_MULT).  The callers visible in this file invoke it
 * with monc->mutex held.
 */
static void __open_session(struct ceph_mon_client *monc)
{
	int ret;

	pick_new_mon(monc);

	monc->hunting = true;
	if (monc->had_a_connection) {
		/* back off successive hunts, capped at the maximum multiplier */
		monc->hunt_mult *= CEPH_MONC_HUNT_BACKOFF;
		if (monc->hunt_mult > CEPH_MONC_HUNT_MAX_MULT)
			monc->hunt_mult = CEPH_MONC_HUNT_MAX_MULT;
	}

	/* a new session has no subscription yet: force a renewal */
	monc->sub_renew_after = jiffies; /* i.e., expired */
	monc->sub_renew_sent = 0;

	dout("%s opening mon%d\n", __func__, monc->cur_mon);
	ceph_con_open(&monc->con, CEPH_ENTITY_TYPE_MON, monc->cur_mon,
		      &monc->monmap->mon_inst[monc->cur_mon].addr);

	/*
	 * send an initial keepalive to ensure our timestamp is valid
	 * by the time we are in an OPENED state
	 */
	ceph_con_keepalive(&monc->con);

	/* initiate authentication handshake */
	ret = ceph_auth_build_hello(monc->auth,
				    monc->m_auth->front.iov_base,
				    monc->m_auth->front_alloc_len);
	/* ret is used as the encoded length below; anything <= 0 is fatal */
	BUG_ON(ret <= 0);
	__send_prepared_auth_request(monc, ret);
}
 201
/*
 * Tear down the current monitor session and open a new one with a freshly
 * picked monitor.  The "session lost" message is logged only when we were
 * not already hunting, so repeated hunt attempts stay quiet.
 */
static void reopen_session(struct ceph_mon_client *monc)
{
	if (!monc->hunting)
		pr_info("mon%d %s session lost, hunting for new mon\n",
		    monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr.in_addr));

	__close_session(monc);
	__open_session(monc);
}
 211
 212 /*
 213  * Reschedule delayed work timer.
 214  */
 215 static void __schedule_delayed(struct ceph_mon_client *monc)
 216 {
 217         unsigned long delay;
 218
 219         if (monc->hunting)
 220                 delay = CEPH_MONC_HUNT_INTERVAL * monc->hunt_mult;
 221         else
 222                 delay = CEPH_MONC_PING_INTERVAL;
 223
 224         dout("__schedule_delayed after %lu\n", delay);
 225         mod_delayed_work(system_wq, &monc->delayed_work,
 226                          round_jiffies_relative(delay));
 227 }
 228
/*
 * Names of the subscribable maps, indexed by CEPH_SUB_*.  In this file
 * they are encoded into subscribe requests and used in debug output.
 */
const char *ceph_sub_str[] = {
	[CEPH_SUB_MDSMAP] = "mdsmap",
	[CEPH_SUB_MONMAP] = "monmap",
	[CEPH_SUB_OSDMAP] = "osdmap",
};
 234
/*
 * Send subscribe request for one or more maps, according to
 * monc->subs.
 *
 * Encodes the number of wanted subscriptions followed by one
 * (name, item) pair per wanted entry into the preallocated m_subscribe
 * message and queues it on the monitor connection.  Requires a current
 * monitor (cur_mon >= 0) and at least one wanted subscription.
 */
static void __send_subscribe(struct ceph_mon_client *monc)
{
	struct ceph_msg *msg = monc->m_subscribe;
	void *p = msg->front.iov_base;
	void *const end = p + msg->front_alloc_len;
	int num = 0;
	int i;

	dout("%s sent %lu\n", __func__, monc->sub_renew_sent);

	BUG_ON(monc->cur_mon < 0);

	/*
	 * Record when the first not-yet-acked request went out; a resend
	 * before the ack arrives does not move the timestamp.
	 */
	if (!monc->sub_renew_sent)
		monc->sub_renew_sent = jiffies | 1; /* never 0 */

	msg->hdr.version = cpu_to_le16(2);

	/* first pass: count the entries so the count can be encoded up front */
	for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
		if (monc->subs[i].want)
			num++;
	}
	BUG_ON(num < 1); /* monmap sub is always there */
	ceph_encode_32(&p, num);
	/* second pass: encode each wanted subscription */
	for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
		const char *s = ceph_sub_str[i];

		if (!monc->subs[i].want)
			continue;

		dout("%s %s start %llu flags 0x%x\n", __func__, s,
		     le64_to_cpu(monc->subs[i].item.start),
		     monc->subs[i].item.flags);
		ceph_encode_string(&p, end, s, strlen(s));
		memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item));
		p += sizeof(monc->subs[i].item);
	}

	/*
	 * Sanity check on the encoded size.
	 * NOTE(review): the constants appear to assume the 96-byte front
	 * buffer allocated in ceph_monc_init() and 19 bytes per entry
	 * (presumably a 4-byte length, a 6-character name and a 9-byte
	 * item) -- confirm before changing either the buffer size or the
	 * subscription names.
	 */
	BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19));
	msg->front.iov_len = p - msg->front.iov_base;
	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
	ceph_msg_revoke(msg);
	ceph_con_send(&monc->con, ceph_msg_get(msg));
}
 282
 283 static void handle_subscribe_ack(struct ceph_mon_client *monc,
 284                                  struct ceph_msg *msg)
 285 {
 286         unsigned int seconds;
 287         struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
 288
 289         if (msg->front.iov_len < sizeof(*h))
 290                 goto bad;
 291         seconds = le32_to_cpu(h->duration);
 292
 293         mutex_lock(&monc->mutex);
 294         if (monc->sub_renew_sent) {
 295                 monc->sub_renew_after = monc->sub_renew_sent +
 296                                             (seconds >> 1) * HZ - 1;
 297                 dout("%s sent %lu duration %d renew after %lu\n", __func__,
 298                      monc->sub_renew_sent, seconds, monc->sub_renew_after);
 299                 monc->sub_renew_sent = 0;
 300         } else {
 301                 dout("%s sent %lu renew after %lu, ignoring\n", __func__,
 302                      monc->sub_renew_sent, monc->sub_renew_after);
 303         }
 304         mutex_unlock(&monc->mutex);
 305         return;
 306 bad:
 307         pr_err("got corrupt subscribe-ack msg\n");
 308         ceph_msg_dump(msg);
 309 }
 310
 311 /*
 312  * Register interest in a map
 313  *
 314  * @sub: one of CEPH_SUB_*
 315  * @epoch: X for "every map since X", or 0 for "just the latest"
 316  */
 317 static bool __ceph_monc_want_map(struct ceph_mon_client *monc, int sub,
 318                                  u32 epoch, bool continuous)
 319 {
 320         __le64 start = cpu_to_le64(epoch);
 321         u8 flags = !continuous ? CEPH_SUBSCRIBE_ONETIME : 0;
 322
 323         dout("%s %s epoch %u continuous %d\n", __func__, ceph_sub_str[sub],
 324              epoch, continuous);
 325
 326         if (monc->subs[sub].want &&
 327             monc->subs[sub].item.start == start &&
 328             monc->subs[sub].item.flags == flags)
 329                 return false;
 330
 331         monc->subs[sub].item.start = start;
 332         monc->subs[sub].item.flags = flags;
 333         monc->subs[sub].want = true;
 334
 335         return true;
 336 }
 337
 338 bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
 339                         bool continuous)
 340 {
 341         bool need_request;
 342
 343         mutex_lock(&monc->mutex);
 344         need_request = __ceph_monc_want_map(monc, sub, epoch, continuous);
 345         mutex_unlock(&monc->mutex);
 346
 347         return need_request;
 348 }
 349 EXPORT_SYMBOL(ceph_monc_want_map);
 350
 351 /*
 352  * Keep track of which maps we have
 353  *
 354  * @sub: one of CEPH_SUB_*
 355  */
 356 static void __ceph_monc_got_map(struct ceph_mon_client *monc, int sub,
 357                                 u32 epoch)
 358 {
 359         dout("%s %s epoch %u\n", __func__, ceph_sub_str[sub], epoch);
 360
 361         if (monc->subs[sub].want) {
 362                 if (monc->subs[sub].item.flags & CEPH_SUBSCRIBE_ONETIME)
 363                         monc->subs[sub].want = false;
 364                 else
 365                         monc->subs[sub].item.start = cpu_to_le64(epoch + 1);
 366         }
 367
 368         monc->subs[sub].have = epoch;
 369 }
 370
/*
 * Locked wrapper around __ceph_monc_got_map(): records, under
 * monc->mutex, that the map identified by @sub was received at @epoch.
 */
void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch)
{
	mutex_lock(&monc->mutex);
	__ceph_monc_got_map(monc, sub, epoch);
	mutex_unlock(&monc->mutex);
}
 377 EXPORT_SYMBOL(ceph_monc_got_map);
 378
 379 /*
 380  * Register interest in the next osdmap
 381  */
 382 void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
 383 {
 384         dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have);
 385         mutex_lock(&monc->mutex);
 386         if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP,
 387                                  monc->subs[CEPH_SUB_OSDMAP].have + 1, false))
 388                 __send_subscribe(monc);
 389         mutex_unlock(&monc->mutex);
 390 }
 391 EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
 392
/*
 * Wait for an osdmap with a given epoch.
 *
 * @epoch: epoch to wait for
 * @timeout: in jiffies, 0 means "wait forever"
 *
 * Returns 0 once the osdmap epoch we have is >= @epoch, -ETIMEDOUT if
 * @timeout jiffies have elapsed since the call started, or the negative
 * value returned by wait_event_interruptible_timeout().
 */
int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
			  unsigned long timeout)
{
	unsigned long started = jiffies;
	long ret;

	mutex_lock(&monc->mutex);
	while (monc->subs[CEPH_SUB_OSDMAP].have < epoch) {
		/* the mutex is not held while sleeping */
		mutex_unlock(&monc->mutex);

		/* overall deadline is measured from the start of the call */
		if (timeout && time_after_eq(jiffies, started + timeout))
			return -ETIMEDOUT;

		ret = wait_event_interruptible_timeout(monc->client->auth_wq,
				     monc->subs[CEPH_SUB_OSDMAP].have >= epoch,
				     ceph_timeout_jiffies(timeout));
		if (ret < 0)
			return ret;

		/* re-check the epoch under the mutex */
		mutex_lock(&monc->mutex);
	}

	mutex_unlock(&monc->mutex);
	return 0;
}
 424 EXPORT_SYMBOL(ceph_monc_wait_osdmap);
 425
/*
 * Open a session with a random monitor.  Request monmap and osdmap,
 * which are waited upon in __ceph_open_session().
 *
 * The monmap subscription is continuous, the osdmap one is one-time.
 * Also arms the delayed work timer.  Always returns 0.
 */
int ceph_monc_open_session(struct ceph_mon_client *monc)
{
	mutex_lock(&monc->mutex);
	__ceph_monc_want_map(monc, CEPH_SUB_MONMAP, 0, true);
	__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, 0, false);
	__open_session(monc);
	__schedule_delayed(monc);
	mutex_unlock(&monc->mutex);
	return 0;
}
 440 EXPORT_SYMBOL(ceph_monc_open_session);
 441
 442 static void ceph_monc_handle_map(struct ceph_mon_client *monc,
 443                                  struct ceph_msg *msg)
 444 {
 445         struct ceph_client *client = monc->client;
 446         struct ceph_monmap *monmap = NULL, *old = monc->monmap;
 447         void *p, *end;
 448
 449         mutex_lock(&monc->mutex);
 450
 451         dout("handle_monmap\n");
 452         p = msg->front.iov_base;
 453         end = p + msg->front.iov_len;
 454
 455         monmap = ceph_monmap_decode(p, end);
 456         if (IS_ERR(monmap)) {
 457                 pr_err("problem decoding monmap, %d\n",
 458                        (int)PTR_ERR(monmap));
 459                 goto out;
 460         }
 461
 462         if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
 463                 kfree(monmap);
 464                 goto out;
 465         }
 466
 467         client->monc.monmap = monmap;
 468         kfree(old);
 469
 470         __ceph_monc_got_map(monc, CEPH_SUB_MONMAP, monc->monmap->epoch);
 471         client->have_fsid = true;
 472
 473 out:
 474         mutex_unlock(&monc->mutex);
 475         wake_up_all(&client->auth_wq);
 476 }
 477
/*
 * generic requests (currently statfs, mon_get_version)
 *
 * Outstanding requests live in monc->generic_request_tree, keyed by tid.
 * NOTE(review): this macro presumably generates the
 * insert_/erase_/lookup_generic_request() helpers used below -- confirm
 * against the DEFINE_RB_FUNCS definition.
 */
DEFINE_RB_FUNCS(generic_request, struct ceph_mon_generic_request, tid, node)
 482
 483 static void release_generic_request(struct kref *kref)
 484 {
 485         struct ceph_mon_generic_request *req =
 486                 container_of(kref, struct ceph_mon_generic_request, kref);
 487
 488         if (req->reply)
 489                 ceph_msg_put(req->reply);
 490         if (req->request)
 491                 ceph_msg_put(req->request);
 492
 493         kfree(req);
 494 }
 495
/*
 * Drop one reference on a generic request; release_generic_request() is
 * the release callback handed to kref_put().
 */
static void put_generic_request(struct ceph_mon_generic_request *req)
{
	kref_put(&req->kref, release_generic_request);
}
 500
/*
 * Take an additional reference on a generic request; balance with
 * put_generic_request().
 */
static void get_generic_request(struct ceph_mon_generic_request *req)
{
	kref_get(&req->kref);
}
 505
 506 static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
 507                                          struct ceph_msg_header *hdr,
 508                                          int *skip)
 509 {
 510         struct ceph_mon_client *monc = con->private;
 511         struct ceph_mon_generic_request *req;
 512         u64 tid = le64_to_cpu(hdr->tid);
 513         struct ceph_msg *m;
 514
 515         mutex_lock(&monc->mutex);
 516         req = lookup_generic_request(&monc->generic_request_tree, tid);
 517         if (!req) {
 518                 dout("get_generic_reply %lld dne\n", tid);
 519                 *skip = 1;
 520                 m = NULL;
 521         } else {
 522                 dout("get_generic_reply %lld got %p\n", tid, req->reply);
 523                 *skip = 0;
 524                 m = ceph_msg_get(req->reply);
 525                 /*
 526                  * we don't need to track the connection reading into
 527                  * this reply because we only have one open connection
 528                  * at a time, ever.
 529                  */
 530         }
 531         mutex_unlock(&monc->mutex);
 532         return m;
 533 }
 534
/*
 * Register a generic request, send it, and wait for its completion.
 *
 * @tid: tid to use, or 0 to allocate the next one from monc->last_tid
 *
 * Must be called with monc->mutex held.  The mutex is dropped while
 * waiting and re-taken before returning, so it is held again on return.
 * The request is removed from the tree before returning.
 *
 * Returns the error from the interruptible wait if it was interrupted,
 * otherwise req->result.
 */
static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
				struct ceph_mon_generic_request *req)
{
	int err;

	/* register request */
	req->tid = tid != 0 ? tid : ++monc->last_tid;
	req->request->hdr.tid = cpu_to_le64(req->tid);
	insert_generic_request(&monc->generic_request_tree, req);
	ceph_con_send(&monc->con, ceph_msg_get(req->request));
	/* do not hold the mutex while sleeping; reply handlers take it */
	mutex_unlock(&monc->mutex);

	err = wait_for_completion_interruptible(&req->completion);

	mutex_lock(&monc->mutex);
	erase_generic_request(&monc->generic_request_tree, req);

	if (!err)
		err = req->result;
	return err;
}
 556
 557 static int do_generic_request(struct ceph_mon_client *monc,
 558                               struct ceph_mon_generic_request *req)
 559 {
 560         int err;
 561
 562         mutex_lock(&monc->mutex);
 563         err = __do_generic_request(monc, 0, req);
 564         mutex_unlock(&monc->mutex);
 565
 566         return err;
 567 }
 568
 569 /*
 570  * statfs
 571  */
 572 static void handle_statfs_reply(struct ceph_mon_client *monc,
 573                                 struct ceph_msg *msg)
 574 {
 575         struct ceph_mon_generic_request *req;
 576         struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
 577         u64 tid = le64_to_cpu(msg->hdr.tid);
 578
 579         if (msg->front.iov_len != sizeof(*reply))
 580                 goto bad;
 581         dout("handle_statfs_reply %p tid %llu\n", msg, tid);
 582
 583         mutex_lock(&monc->mutex);
 584         req = lookup_generic_request(&monc->generic_request_tree, tid);
 585         if (req) {
 586                 *(struct ceph_statfs *)req->buf = reply->st;
 587                 req->result = 0;
 588                 get_generic_request(req);
 589         }
 590         mutex_unlock(&monc->mutex);
 591         if (req) {
 592                 complete_all(&req->completion);
 593                 put_generic_request(req);
 594         }
 595         return;
 596
 597 bad:
 598         pr_err("corrupt statfs reply, tid %llu\n", tid);
 599         ceph_msg_dump(msg);
 600 }
 601
 602 /*
 603  * Do a synchronous statfs().
 604  */
 605 int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 606 {
 607         struct ceph_mon_generic_request *req;
 608         struct ceph_mon_statfs *h;
 609         int err;
 610
 611         req = kzalloc(sizeof(*req), GFP_NOFS);
 612         if (!req)
 613                 return -ENOMEM;
 614
 615         kref_init(&req->kref);
 616         RB_CLEAR_NODE(&req->node);
 617         req->buf = buf;
 618         init_completion(&req->completion);
 619
 620         err = -ENOMEM;
 621         req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
 622                                     true);
 623         if (!req->request)
 624                 goto out;
 625         req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS,
 626                                   true);
 627         if (!req->reply)
 628                 goto out;
 629
 630         /* fill out request */
 631         h = req->request->front.iov_base;
 632         h->monhdr.have_version = 0;
 633         h->monhdr.session_mon = cpu_to_le16(-1);
 634         h->monhdr.session_mon_tid = 0;
 635         h->fsid = monc->monmap->fsid;
 636
 637         err = do_generic_request(monc, req);
 638
 639 out:
 640         put_generic_request(req);
 641         return err;
 642 }
 643 EXPORT_SYMBOL(ceph_monc_do_statfs);
 644
 645 static void handle_get_version_reply(struct ceph_mon_client *monc,
 646                                      struct ceph_msg *msg)
 647 {
 648         struct ceph_mon_generic_request *req;
 649         u64 tid = le64_to_cpu(msg->hdr.tid);
 650         void *p = msg->front.iov_base;
 651         void *end = p + msg->front_alloc_len;
 652         u64 handle;
 653
 654         dout("%s %p tid %llu\n", __func__, msg, tid);
 655
 656         ceph_decode_need(&p, end, 2*sizeof(u64), bad);
 657         handle = ceph_decode_64(&p);
 658         if (tid != 0 && tid != handle)
 659                 goto bad;
 660
 661         mutex_lock(&monc->mutex);
 662         req = lookup_generic_request(&monc->generic_request_tree, handle);
 663         if (req) {
 664                 *(u64 *)req->buf = ceph_decode_64(&p);
 665                 req->result = 0;
 666                 get_generic_request(req);
 667         }
 668         mutex_unlock(&monc->mutex);
 669         if (req) {
 670                 complete_all(&req->completion);
 671                 put_generic_request(req);
 672         }
 673
 674         return;
 675 bad:
 676         pr_err("corrupt mon_get_version reply, tid %llu\n", tid);
 677         ceph_msg_dump(msg);
 678 }
 679
 680 /*
 681  * Send MMonGetVersion and wait for the reply.
 682  *
 683  * @what: one of "mdsmap", "osdmap" or "monmap"
 684  */
 685 int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
 686                              u64 *newest)
 687 {
 688         struct ceph_mon_generic_request *req;
 689         void *p, *end;
 690         u64 tid;
 691         int err;
 692
 693         req = kzalloc(sizeof(*req), GFP_NOFS);
 694         if (!req)
 695                 return -ENOMEM;
 696
 697         kref_init(&req->kref);
 698         RB_CLEAR_NODE(&req->node);
 699         req->buf = newest;
 700         init_completion(&req->completion);
 701
 702         req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
 703                                     sizeof(u64) + sizeof(u32) + strlen(what),
 704                                     GFP_NOFS, true);
 705         if (!req->request) {
 706                 err = -ENOMEM;
 707                 goto out;
 708         }
 709
 710         req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024,
 711                                   GFP_NOFS, true);
 712         if (!req->reply) {
 713                 err = -ENOMEM;
 714                 goto out;
 715         }
 716
 717         p = req->request->front.iov_base;
 718         end = p + req->request->front_alloc_len;
 719
 720         /* fill out request */
 721         mutex_lock(&monc->mutex);
 722         tid = ++monc->last_tid;
 723         ceph_encode_64(&p, tid); /* handle */
 724         ceph_encode_string(&p, end, what, strlen(what));
 725
 726         err = __do_generic_request(monc, tid, req);
 727
 728         mutex_unlock(&monc->mutex);
 729 out:
 730         put_generic_request(req);
 731         return err;
 732 }
 733 EXPORT_SYMBOL(ceph_monc_do_get_version);
 734
 735 /*
 736  * Resend pending generic requests.
 737  */
 738 static void __resend_generic_request(struct ceph_mon_client *monc)
 739 {
 740         struct ceph_mon_generic_request *req;
 741         struct rb_node *p;
 742
 743         for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
 744                 req = rb_entry(p, struct ceph_mon_generic_request, node);
 745                 ceph_msg_revoke(req->request);
 746                 ceph_msg_revoke_incoming(req->reply);
 747                 ceph_con_send(&monc->con, ceph_msg_get(req->request));
 748         }
 749 }
 750
/*
 * Delayed work.  If we haven't mounted yet, retry.  Otherwise,
 * renew/retry subscription as needed (in case it is timing out, or we
 * got an ENOMEM).  And keep the monitor connection alive.
 *
 * Runs with monc->mutex held for its whole body and always re-arms
 * itself via __schedule_delayed().
 */
static void delayed_work(struct work_struct *work)
{
	struct ceph_mon_client *monc =
		container_of(work, struct ceph_mon_client, delayed_work.work);

	dout("monc delayed_work\n");
	mutex_lock(&monc->mutex);
	if (monc->hunting) {
		/* still no session: give up on this monitor, try another */
		dout("%s continuing hunt\n", __func__);
		reopen_session(monc);
	} else {
		int is_auth = ceph_auth_is_authenticated(monc->auth);
		if (ceph_con_keepalive_expired(&monc->con,
					       CEPH_MONC_PING_TIMEOUT)) {
			dout("monc keepalive timeout\n");
			/* the session is being replaced; skip sub renewal */
			is_auth = 0;
			reopen_session(monc);
		}

		/* reopen_session() above sets hunting, so this re-check matters */
		if (!monc->hunting) {
			ceph_con_keepalive(&monc->con);
			__validate_auth(monc);
		}

		if (is_auth) {
			unsigned long now = jiffies;

			dout("%s renew subs? now %lu renew after %lu\n",
			     __func__, now, monc->sub_renew_after);
			if (time_after_eq(now, monc->sub_renew_after))
				__send_subscribe(monc);
		}
	}
	__schedule_delayed(monc);
	mutex_unlock(&monc->mutex);
}
 792
 793 /*
 794  * On startup, we build a temporary monmap populated with the IPs
 795  * provided by mount(2).
 796  */
 797 static int build_initial_monmap(struct ceph_mon_client *monc)
 798 {
 799         struct ceph_options *opt = monc->client->options;
 800         struct ceph_entity_addr *mon_addr = opt->mon_addr;
 801         int num_mon = opt->num_mon;
 802         int i;
 803
 804         /* build initial monmap */
 805         monc->monmap = kzalloc(sizeof(*monc->monmap) +
 806                                num_mon*sizeof(monc->monmap->mon_inst[0]),
 807                                GFP_KERNEL);
 808         if (!monc->monmap)
 809                 return -ENOMEM;
 810         for (i = 0; i < num_mon; i++) {
 811                 monc->monmap->mon_inst[i].addr = mon_addr[i];
 812                 monc->monmap->mon_inst[i].addr.nonce = 0;
 813                 monc->monmap->mon_inst[i].name.type =
 814                         CEPH_ENTITY_TYPE_MON;
 815                 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
 816         }
 817         monc->monmap->num_mon = num_mon;
 818         return 0;
 819 }
 820
/*
 * Initialize a monitor client for client @cl.
 *
 * Zeroes *monc, builds the initial monmap from the client's options,
 * sets up authentication, preallocates the subscribe/auth messages and
 * their reply buffers, and initializes the connection, delayed work and
 * generic request tree.  No session is opened here.
 *
 * Returns 0 on success.  On failure everything acquired so far is
 * released and a negative errno is returned: the error from
 * build_initial_monmap() or ceph_auth_init(), or -ENOMEM if a message
 * allocation fails.
 */
int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
{
	int err = 0;

	dout("init\n");
	memset(monc, 0, sizeof(*monc));
	monc->client = cl;
	monc->monmap = NULL;
	mutex_init(&monc->mutex);

	err = build_initial_monmap(monc);
	if (err)
		goto out;

	/* authentication */
	monc->auth = ceph_auth_init(cl->options->name,
				    cl->options->key);
	if (IS_ERR(monc->auth)) {
		err = PTR_ERR(monc->auth);
		goto out_monmap;
	}
	monc->auth->want_keys =
		CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
		CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;

	/* msgs */
	err = -ENOMEM;	/* the only failure mode for the allocations below */
	monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
				     sizeof(struct ceph_mon_subscribe_ack),
				     GFP_NOFS, true);
	if (!monc->m_subscribe_ack)
		goto out_auth;

	/* the front size here is relied upon by the BUG_ON in __send_subscribe() */
	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS,
					 true);
	if (!monc->m_subscribe)
		goto out_subscribe_ack;

	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS,
					  true);
	if (!monc->m_auth_reply)
		goto out_subscribe;

	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true);
	monc->pending_auth = 0;
	if (!monc->m_auth)
		goto out_auth_reply;

	/* connection */
	ceph_con_init(&monc->con, monc, &mon_con_ops,
		      &monc->client->msgr);

	monc->cur_mon = -1;	/* no monitor picked yet */
	monc->had_a_connection = false;
	monc->hunt_mult = 1;

	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
	monc->generic_request_tree = RB_ROOT;
	monc->last_tid = 0;

	return 0;

	/* unwind in reverse order of acquisition */
out_auth_reply:
	ceph_msg_put(monc->m_auth_reply);
out_subscribe:
	ceph_msg_put(monc->m_subscribe);
out_subscribe_ack:
	ceph_msg_put(monc->m_subscribe_ack);
out_auth:
	ceph_auth_destroy(monc->auth);
out_monmap:
	kfree(monc->monmap);
out:
	return err;
}
 896 EXPORT_SYMBOL(ceph_monc_init);
 897
 898 void ceph_monc_stop(struct ceph_mon_client *monc)
 899 {
 900         dout("stop\n");
 901         cancel_delayed_work_sync(&monc->delayed_work);
 902
 903         mutex_lock(&monc->mutex);
 904         __close_session(monc);
 905         monc->cur_mon = -1;
 906         mutex_unlock(&monc->mutex);
 907
 908         /*
 909          * flush msgr queue before we destroy ourselves to ensure that:
 910          *  - any work that references our embedded con is finished.
 911          *  - any osd_client or other work that may reference an authorizer
 912          *    finishes before we shut down the auth subsystem.
 913          */
 914         ceph_msgr_flush();
 915
 916         ceph_auth_destroy(monc->auth);
 917
 918         ceph_msg_put(monc->m_auth);
 919         ceph_msg_put(monc->m_auth_reply);
 920         ceph_msg_put(monc->m_subscribe);
 921         ceph_msg_put(monc->m_subscribe_ack);
 922
 923         kfree(monc->monmap);
 924 }
 925 EXPORT_SYMBOL(ceph_monc_stop);
 926
 927 static void finish_hunting(struct ceph_mon_client *monc)
 928 {
 929         if (monc->hunting) {
 930                 dout("%s found mon%d\n", __func__, monc->cur_mon);
 931                 monc->hunting = false;
 932                 monc->had_a_connection = true;
 933                 monc->hunt_mult /= 2; /* reduce by 50% */
 934                 if (monc->hunt_mult < 1)
 935                         monc->hunt_mult = 1;
 936         }
 937 }
 938
 939 static void handle_auth_reply(struct ceph_mon_client *monc,
 940                               struct ceph_msg *msg)
 941 {
 942         int ret;
 943         int was_auth = 0;
 944
 945         mutex_lock(&monc->mutex);
 946         was_auth = ceph_auth_is_authenticated(monc->auth);
 947         monc->pending_auth = 0;
 948         ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
 949                                      msg->front.iov_len,
 950                                      monc->m_auth->front.iov_base,
 951                                      monc->m_auth->front_alloc_len);
 952         if (ret > 0) {
 953                 __send_prepared_auth_request(monc, ret);
 954                 goto out;
 955         }
 956
 957         finish_hunting(monc);
 958
 959         if (ret < 0) {
 960                 monc->client->auth_err = ret;
 961         } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) {
 962                 dout("authenticated, starting session\n");
 963
 964                 monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
 965                 monc->client->msgr.inst.name.num =
 966                                         cpu_to_le64(monc->auth->global_id);
 967
 968                 __send_subscribe(monc);
 969                 __resend_generic_request(monc);
 970
 971                 pr_info("mon%d %s session established\n", monc->cur_mon,
 972                         ceph_pr_addr(&monc->con.peer_addr.in_addr));
 973         }
 974
 975 out:
 976         mutex_unlock(&monc->mutex);
 977         if (monc->client->auth_err < 0)
 978                 wake_up_all(&monc->client->auth_wq);
 979 }
 980
 981 static int __validate_auth(struct ceph_mon_client *monc)
 982 {
 983         int ret;
 984
 985         if (monc->pending_auth)
 986                 return 0;
 987
 988         ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
 989                               monc->m_auth->front_alloc_len);
 990         if (ret <= 0)
 991                 return ret; /* either an error, or no need to authenticate */
 992         __send_prepared_auth_request(monc, ret);
 993         return 0;
 994 }
 995
 996 int ceph_monc_validate_auth(struct ceph_mon_client *monc)
 997 {
 998         int ret;
 999
1000         mutex_lock(&monc->mutex);
1001         ret = __validate_auth(monc);
1002         mutex_unlock(&monc->mutex);
1003         return ret;
1004 }
1005 EXPORT_SYMBOL(ceph_monc_validate_auth);
1006
1007 /*
1008  * handle incoming message
1009  */
1010 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1011 {
1012         struct ceph_mon_client *monc = con->private;
1013         int type = le16_to_cpu(msg->hdr.type);
1014
1015         if (!monc)
1016                 return;
1017
1018         switch (type) {
1019         case CEPH_MSG_AUTH_REPLY:
1020                 handle_auth_reply(monc, msg);
1021                 break;
1022
1023         case CEPH_MSG_MON_SUBSCRIBE_ACK:
1024                 handle_subscribe_ack(monc, msg);
1025                 break;
1026
1027         case CEPH_MSG_STATFS_REPLY:
1028                 handle_statfs_reply(monc, msg);
1029                 break;
1030
1031         case CEPH_MSG_MON_GET_VERSION_REPLY:
1032                 handle_get_version_reply(monc, msg);
1033                 break;
1034
1035         case CEPH_MSG_MON_MAP:
1036                 ceph_monc_handle_map(monc, msg);
1037                 break;
1038
1039         case CEPH_MSG_OSD_MAP:
1040                 ceph_osdc_handle_map(&monc->client->osdc, msg);
1041                 break;
1042
1043         default:
1044                 /* can the chained handler handle it? */
1045                 if (monc->client->extra_mon_dispatch &&
1046                     monc->client->extra_mon_dispatch(monc->client, msg) == 0)
1047                         break;
1048
1049                 pr_err("received unknown message type %d %s\n", type,
1050                        ceph_msg_type_name(type));
1051         }
1052         ceph_msg_put(msg);
1053 }
1054
1055 /*
1056  * Allocate memory for incoming message
1057  */
1058 static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
1059                                       struct ceph_msg_header *hdr,
1060                                       int *skip)
1061 {
1062         struct ceph_mon_client *monc = con->private;
1063         int type = le16_to_cpu(hdr->type);
1064         int front_len = le32_to_cpu(hdr->front_len);
1065         struct ceph_msg *m = NULL;
1066
1067         *skip = 0;
1068
1069         switch (type) {
1070         case CEPH_MSG_MON_SUBSCRIBE_ACK:
1071                 m = ceph_msg_get(monc->m_subscribe_ack);
1072                 break;
1073         case CEPH_MSG_STATFS_REPLY:
1074                 return get_generic_reply(con, hdr, skip);
1075         case CEPH_MSG_AUTH_REPLY:
1076                 m = ceph_msg_get(monc->m_auth_reply);
1077                 break;
1078         case CEPH_MSG_MON_GET_VERSION_REPLY:
1079                 if (le64_to_cpu(hdr->tid) != 0)
1080                         return get_generic_reply(con, hdr, skip);
1081
1082                 /*
1083                  * Older OSDs don't set reply tid even if the orignal
1084                  * request had a non-zero tid.  Workaround this weirdness
1085                  * by falling through to the allocate case.
1086                  */
1087         case CEPH_MSG_MON_MAP:
1088         case CEPH_MSG_MDS_MAP:
1089         case CEPH_MSG_OSD_MAP:
1090                 m = ceph_msg_new(type, front_len, GFP_NOFS, false);
1091                 if (!m)
1092                         return NULL;    /* ENOMEM--return skip == 0 */
1093                 break;
1094         }
1095
1096         if (!m) {
1097                 pr_info("alloc_msg unknown type %d\n", type);
1098                 *skip = 1;
1099         } else if (front_len > m->front_alloc_len) {
1100                 pr_warn("mon_alloc_msg front %d > prealloc %d (%u#%llu)\n",
1101                         front_len, m->front_alloc_len,
1102                         (unsigned int)con->peer_name.type,
1103                         le64_to_cpu(con->peer_name.num));
1104                 ceph_msg_put(m);
1105                 m = ceph_msg_new(type, front_len, GFP_NOFS, false);
1106         }
1107
1108         return m;
1109 }
1110
1111 /*
1112  * If the monitor connection resets, pick a new monitor and resubmit
1113  * any pending requests.
1114  */
1115 static void mon_fault(struct ceph_connection *con)
1116 {
1117         struct ceph_mon_client *monc = con->private;
1118
1119         mutex_lock(&monc->mutex);
1120         dout("%s mon%d\n", __func__, monc->cur_mon);
1121         if (monc->cur_mon >= 0) {
1122                 if (!monc->hunting) {
1123                         dout("%s hunting for new mon\n", __func__);
1124                         reopen_session(monc);
1125                         __schedule_delayed(monc);
1126                 } else {
1127                         dout("%s already hunting\n", __func__);
1128                 }
1129         }
1130         mutex_unlock(&monc->mutex);
1131 }
1132
/*
 * The connection struct is not reference counted here: every reference
 * comes from the messenger workqueue, and that queue is drained before
 * the mon_client is destroyed.  The get operation therefore just hands
 * back the connection it was given.
 */
static struct ceph_connection *con_get(struct ceph_connection *con)
{
        return con;
}
1142
/* Counterpart of con_get(): no reference was taken, so none is dropped. */
static void con_put(struct ceph_connection *con)
{
        /* intentionally empty */
}
1146
1147 static const struct ceph_connection_operations mon_con_ops = {
1148         .get = con_get,
1149         .put = con_put,
1150         .dispatch = dispatch,
1151         .fault = mon_fault,
1152         .alloc_msg = mon_alloc_msg,
1153 };