net/ceph/mon_client.c

   1 #include <linux/ceph/ceph_debug.h>
   2
   3 #include <linux/module.h>
   4 #include <linux/types.h>
   5 #include <linux/slab.h>
   6 #include <linux/random.h>
   7 #include <linux/sched.h>
   8
   9 #include <linux/ceph/mon_client.h>
  10 #include <linux/ceph/libceph.h>
  11 #include <linux/ceph/debugfs.h>
  12 #include <linux/ceph/decode.h>
  13 #include <linux/ceph/auth.h>
  14
  15 /*
  16  * Interact with Ceph monitor cluster.  Handle requests for new map
  17  * versions, and periodically resend as needed.  Also implement
  18  * statfs() and umount().
  19  *
  20  * A small cluster of Ceph "monitors" are responsible for managing critical
  21  * cluster configuration and state information.  An odd number (e.g., 3, 5)
  22  * of cmon daemons use a modified version of the Paxos part-time parliament
  23  * algorithm to manage the MDS map (mds cluster membership), OSD map, and
  24  * list of clients who have mounted the file system.
  25  *
  26  * We maintain an open, active session with a monitor at all times in order to
  27  * receive timely MDSMap updates.  We periodically send a keepalive byte on the
  28  * TCP socket to ensure we detect a failure.  If the connection does break, we
  29  * randomly hunt for a new monitor.  Once the connection is reestablished, we
  30  * resend any outstanding requests.
  31  */
  32
/*
 * Forward declaration of the connection callbacks handed to
 * ceph_con_init() in ceph_monc_init().
 */
static const struct ceph_connection_operations mon_con_ops;

/* Forward declaration; delayed_work() calls this before its definition. */
static int __validate_auth(struct ceph_mon_client *monc);
  36
  37 /*
  38  * Decode a monmap blob (e.g., during mount).
  39  */
  40 struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
  41 {
  42         struct ceph_monmap *m = NULL;
  43         int i, err = -EINVAL;
  44         struct ceph_fsid fsid;
  45         u32 epoch, num_mon;
  46         u16 version;
  47         u32 len;
  48
  49         ceph_decode_32_safe(&p, end, len, bad);
  50         ceph_decode_need(&p, end, len, bad);
  51
  52         dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
  53
  54         ceph_decode_16_safe(&p, end, version, bad);
  55
  56         ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
  57         ceph_decode_copy(&p, &fsid, sizeof(fsid));
  58         epoch = ceph_decode_32(&p);
  59
  60         num_mon = ceph_decode_32(&p);
  61         ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
  62
  63         if (num_mon >= CEPH_MAX_MON)
  64                 goto bad;
  65         m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
  66         if (m == NULL)
  67                 return ERR_PTR(-ENOMEM);
  68         m->fsid = fsid;
  69         m->epoch = epoch;
  70         m->num_mon = num_mon;
  71         ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
  72         for (i = 0; i < num_mon; i++)
  73                 ceph_decode_addr(&m->mon_inst[i].addr);
  74
  75         dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
  76              m->num_mon);
  77         for (i = 0; i < m->num_mon; i++)
  78                 dout("monmap_decode  mon%d is %s\n", i,
  79                      ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
  80         return m;
  81
  82 bad:
  83         dout("monmap_decode failed with %d\n", err);
  84         kfree(m);
  85         return ERR_PTR(err);
  86 }
  87
  88 /*
  89  * return true if *addr is included in the monmap.
  90  */
  91 int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
  92 {
  93         int i;
  94
  95         for (i = 0; i < m->num_mon; i++)
  96                 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
  97                         return 1;
  98         return 0;
  99 }
 100
 101 /*
 102  * Send an auth request.
 103  */
 104 static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
 105 {
 106         monc->pending_auth = 1;
 107         monc->m_auth->front.iov_len = len;
 108         monc->m_auth->hdr.front_len = cpu_to_le32(len);
 109         ceph_msg_revoke(monc->m_auth);
 110         ceph_msg_get(monc->m_auth);  /* keep our ref */
 111         ceph_con_send(&monc->con, monc->m_auth);
 112 }
 113
 114 /*
 115  * Close monitor session, if any.
 116  */
 117 static void __close_session(struct ceph_mon_client *monc)
 118 {
 119         dout("__close_session closing mon%d\n", monc->cur_mon);
 120         ceph_msg_revoke(monc->m_auth);
 121         ceph_msg_revoke_incoming(monc->m_auth_reply);
 122         ceph_msg_revoke(monc->m_subscribe);
 123         ceph_msg_revoke_incoming(monc->m_subscribe_ack);
 124         ceph_con_close(&monc->con);
 125         monc->cur_mon = -1;
 126         monc->pending_auth = 0;
 127         ceph_auth_reset(monc->auth);
 128 }
 129
 130 /*
 131  * Open a session with a (new) monitor.
 132  */
 133 static int __open_session(struct ceph_mon_client *monc)
 134 {
 135         char r;
 136         int ret;
 137
 138         if (monc->cur_mon < 0) {
 139                 get_random_bytes(&r, 1);
 140                 monc->cur_mon = r % monc->monmap->num_mon;
 141                 dout("open_session num=%d r=%d -> mon%d\n",
 142                      monc->monmap->num_mon, r, monc->cur_mon);
 143                 monc->sub_sent = 0;
 144                 monc->sub_renew_after = jiffies;  /* i.e., expired */
 145                 monc->want_next_osdmap = !!monc->want_next_osdmap;
 146
 147                 dout("open_session mon%d opening\n", monc->cur_mon);
 148                 ceph_con_open(&monc->con,
 149                               CEPH_ENTITY_TYPE_MON, monc->cur_mon,
 150                               &monc->monmap->mon_inst[monc->cur_mon].addr);
 151
 152                 /* send an initial keepalive to ensure our timestamp is
 153                  * valid by the time we are in an OPENED state */
 154                 ceph_con_keepalive(&monc->con);
 155
 156                 /* initiatiate authentication handshake */
 157                 ret = ceph_auth_build_hello(monc->auth,
 158                                             monc->m_auth->front.iov_base,
 159                                             monc->m_auth->front_alloc_len);
 160                 __send_prepared_auth_request(monc, ret);
 161         } else {
 162                 dout("open_session mon%d already open\n", monc->cur_mon);
 163         }
 164         return 0;
 165 }
 166
 167 static bool __sub_expired(struct ceph_mon_client *monc)
 168 {
 169         return time_after_eq(jiffies, monc->sub_renew_after);
 170 }
 171
 172 /*
 173  * Reschedule delayed work timer.
 174  */
 175 static void __schedule_delayed(struct ceph_mon_client *monc)
 176 {
 177         struct ceph_options *opt = monc->client->options;
 178         unsigned long delay;
 179
 180         if (monc->cur_mon < 0 || __sub_expired(monc)) {
 181                 delay = 10 * HZ;
 182         } else {
 183                 delay = 20 * HZ;
 184                 if (opt->monc_ping_timeout > 0)
 185                         delay = min(delay, opt->monc_ping_timeout / 3);
 186         }
 187         dout("__schedule_delayed after %lu\n", delay);
 188         schedule_delayed_work(&monc->delayed_work,
 189                               round_jiffies_relative(delay));
 190 }
 191
 192 /*
 193  * Send subscribe request for mdsmap and/or osdmap.
 194  */
 195 static void __send_subscribe(struct ceph_mon_client *monc)
 196 {
 197         dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
 198              (unsigned int)monc->sub_sent, __sub_expired(monc),
 199              monc->want_next_osdmap);
 200         if ((__sub_expired(monc) && !monc->sub_sent) ||
 201             monc->want_next_osdmap == 1) {
 202                 struct ceph_msg *msg = monc->m_subscribe;
 203                 struct ceph_mon_subscribe_item *i;
 204                 void *p, *end;
 205                 int num;
 206
 207                 p = msg->front.iov_base;
 208                 end = p + msg->front_alloc_len;
 209
 210                 num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
 211                 ceph_encode_32(&p, num);
 212
 213                 if (monc->want_next_osdmap) {
 214                         dout("__send_subscribe to 'osdmap' %u\n",
 215                              (unsigned int)monc->have_osdmap);
 216                         ceph_encode_string(&p, end, "osdmap", 6);
 217                         i = p;
 218                         i->have = cpu_to_le64(monc->have_osdmap);
 219                         i->onetime = 1;
 220                         p += sizeof(*i);
 221                         monc->want_next_osdmap = 2;  /* requested */
 222                 }
 223                 if (monc->want_mdsmap) {
 224                         dout("__send_subscribe to 'mdsmap' %u+\n",
 225                              (unsigned int)monc->have_mdsmap);
 226                         ceph_encode_string(&p, end, "mdsmap", 6);
 227                         i = p;
 228                         i->have = cpu_to_le64(monc->have_mdsmap);
 229                         i->onetime = 0;
 230                         p += sizeof(*i);
 231                 }
 232                 ceph_encode_string(&p, end, "monmap", 6);
 233                 i = p;
 234                 i->have = 0;
 235                 i->onetime = 0;
 236                 p += sizeof(*i);
 237
 238                 msg->front.iov_len = p - msg->front.iov_base;
 239                 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
 240                 ceph_msg_revoke(msg);
 241                 ceph_con_send(&monc->con, ceph_msg_get(msg));
 242
 243                 monc->sub_sent = jiffies | 1;  /* never 0 */
 244         }
 245 }
 246
 247 static void handle_subscribe_ack(struct ceph_mon_client *monc,
 248                                  struct ceph_msg *msg)
 249 {
 250         unsigned int seconds;
 251         struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
 252
 253         if (msg->front.iov_len < sizeof(*h))
 254                 goto bad;
 255         seconds = le32_to_cpu(h->duration);
 256
 257         mutex_lock(&monc->mutex);
 258         dout("handle_subscribe_ack after %d seconds\n", seconds);
 259         monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
 260         monc->sub_sent = 0;
 261         mutex_unlock(&monc->mutex);
 262         return;
 263 bad:
 264         pr_err("got corrupt subscribe-ack msg\n");
 265         ceph_msg_dump(msg);
 266 }
 267
 268 /*
 269  * Keep track of which maps we have
 270  */
 271 int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
 272 {
 273         mutex_lock(&monc->mutex);
 274         monc->have_mdsmap = got;
 275         mutex_unlock(&monc->mutex);
 276         return 0;
 277 }
 278 EXPORT_SYMBOL(ceph_monc_got_mdsmap);
 279
 280 int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
 281 {
 282         mutex_lock(&monc->mutex);
 283         monc->have_osdmap = got;
 284         monc->want_next_osdmap = 0;
 285         mutex_unlock(&monc->mutex);
 286         return 0;
 287 }
 288
 289 /*
 290  * Register interest in the next osdmap
 291  */
 292 void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
 293 {
 294         dout("request_next_osdmap have %u\n", monc->have_osdmap);
 295         mutex_lock(&monc->mutex);
 296         if (!monc->want_next_osdmap)
 297                 monc->want_next_osdmap = 1;
 298         if (monc->want_next_osdmap < 2)
 299                 __send_subscribe(monc);
 300         mutex_unlock(&monc->mutex);
 301 }
 302 EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
 303
 304 /*
 305  * Wait for an osdmap with a given epoch.
 306  *
 307  * @epoch: epoch to wait for
 308  * @timeout: in jiffies, 0 means "wait forever"
 309  */
 310 int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
 311                           unsigned long timeout)
 312 {
 313         unsigned long started = jiffies;
 314         long ret;
 315
 316         mutex_lock(&monc->mutex);
 317         while (monc->have_osdmap < epoch) {
 318                 mutex_unlock(&monc->mutex);
 319
 320                 if (timeout && time_after_eq(jiffies, started + timeout))
 321                         return -ETIMEDOUT;
 322
 323                 ret = wait_event_interruptible_timeout(monc->client->auth_wq,
 324                                                 monc->have_osdmap >= epoch,
 325                                                 ceph_timeout_jiffies(timeout));
 326                 if (ret < 0)
 327                         return ret;
 328
 329                 mutex_lock(&monc->mutex);
 330         }
 331
 332         mutex_unlock(&monc->mutex);
 333         return 0;
 334 }
 335 EXPORT_SYMBOL(ceph_monc_wait_osdmap);
 336
 337 /*
 338  *
 339  */
 340 int ceph_monc_open_session(struct ceph_mon_client *monc)
 341 {
 342         mutex_lock(&monc->mutex);
 343         __open_session(monc);
 344         __schedule_delayed(monc);
 345         mutex_unlock(&monc->mutex);
 346         return 0;
 347 }
 348 EXPORT_SYMBOL(ceph_monc_open_session);
 349
 350 static void ceph_monc_handle_map(struct ceph_mon_client *monc,
 351                                  struct ceph_msg *msg)
 352 {
 353         struct ceph_client *client = monc->client;
 354         struct ceph_monmap *monmap = NULL, *old = monc->monmap;
 355         void *p, *end;
 356
 357         mutex_lock(&monc->mutex);
 358
 359         dout("handle_monmap\n");
 360         p = msg->front.iov_base;
 361         end = p + msg->front.iov_len;
 362
 363         monmap = ceph_monmap_decode(p, end);
 364         if (IS_ERR(monmap)) {
 365                 pr_err("problem decoding monmap, %d\n",
 366                        (int)PTR_ERR(monmap));
 367                 goto out;
 368         }
 369
 370         if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
 371                 kfree(monmap);
 372                 goto out;
 373         }
 374
 375         client->monc.monmap = monmap;
 376         kfree(old);
 377
 378         client->have_fsid = true;
 379
 380 out:
 381         mutex_unlock(&monc->mutex);
 382         wake_up_all(&client->auth_wq);
 383 }
 384
 385 /*
 386  * generic requests (currently statfs, mon_get_version)
 387  */
 388 static struct ceph_mon_generic_request *__lookup_generic_req(
 389         struct ceph_mon_client *monc, u64 tid)
 390 {
 391         struct ceph_mon_generic_request *req;
 392         struct rb_node *n = monc->generic_request_tree.rb_node;
 393
 394         while (n) {
 395                 req = rb_entry(n, struct ceph_mon_generic_request, node);
 396                 if (tid < req->tid)
 397                         n = n->rb_left;
 398                 else if (tid > req->tid)
 399                         n = n->rb_right;
 400                 else
 401                         return req;
 402         }
 403         return NULL;
 404 }
 405
 406 static void __insert_generic_request(struct ceph_mon_client *monc,
 407                             struct ceph_mon_generic_request *new)
 408 {
 409         struct rb_node **p = &monc->generic_request_tree.rb_node;
 410         struct rb_node *parent = NULL;
 411         struct ceph_mon_generic_request *req = NULL;
 412
 413         while (*p) {
 414                 parent = *p;
 415                 req = rb_entry(parent, struct ceph_mon_generic_request, node);
 416                 if (new->tid < req->tid)
 417                         p = &(*p)->rb_left;
 418                 else if (new->tid > req->tid)
 419                         p = &(*p)->rb_right;
 420                 else
 421                         BUG();
 422         }
 423
 424         rb_link_node(&new->node, parent, p);
 425         rb_insert_color(&new->node, &monc->generic_request_tree);
 426 }
 427
 428 static void release_generic_request(struct kref *kref)
 429 {
 430         struct ceph_mon_generic_request *req =
 431                 container_of(kref, struct ceph_mon_generic_request, kref);
 432
 433         if (req->reply)
 434                 ceph_msg_put(req->reply);
 435         if (req->request)
 436                 ceph_msg_put(req->request);
 437
 438         kfree(req);
 439 }
 440
 441 static void put_generic_request(struct ceph_mon_generic_request *req)
 442 {
 443         kref_put(&req->kref, release_generic_request);
 444 }
 445
 446 static void get_generic_request(struct ceph_mon_generic_request *req)
 447 {
 448         kref_get(&req->kref);
 449 }
 450
 451 static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
 452                                          struct ceph_msg_header *hdr,
 453                                          int *skip)
 454 {
 455         struct ceph_mon_client *monc = con->private;
 456         struct ceph_mon_generic_request *req;
 457         u64 tid = le64_to_cpu(hdr->tid);
 458         struct ceph_msg *m;
 459
 460         mutex_lock(&monc->mutex);
 461         req = __lookup_generic_req(monc, tid);
 462         if (!req) {
 463                 dout("get_generic_reply %lld dne\n", tid);
 464                 *skip = 1;
 465                 m = NULL;
 466         } else {
 467                 dout("get_generic_reply %lld got %p\n", tid, req->reply);
 468                 *skip = 0;
 469                 m = ceph_msg_get(req->reply);
 470                 /*
 471                  * we don't need to track the connection reading into
 472                  * this reply because we only have one open connection
 473                  * at a time, ever.
 474                  */
 475         }
 476         mutex_unlock(&monc->mutex);
 477         return m;
 478 }
 479
 480 static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
 481                                 struct ceph_mon_generic_request *req)
 482 {
 483         int err;
 484
 485         /* register request */
 486         req->tid = tid != 0 ? tid : ++monc->last_tid;
 487         req->request->hdr.tid = cpu_to_le64(req->tid);
 488         __insert_generic_request(monc, req);
 489         monc->num_generic_requests++;
 490         ceph_con_send(&monc->con, ceph_msg_get(req->request));
 491         mutex_unlock(&monc->mutex);
 492
 493         err = wait_for_completion_interruptible(&req->completion);
 494
 495         mutex_lock(&monc->mutex);
 496         rb_erase(&req->node, &monc->generic_request_tree);
 497         monc->num_generic_requests--;
 498
 499         if (!err)
 500                 err = req->result;
 501         return err;
 502 }
 503
 504 static int do_generic_request(struct ceph_mon_client *monc,
 505                               struct ceph_mon_generic_request *req)
 506 {
 507         int err;
 508
 509         mutex_lock(&monc->mutex);
 510         err = __do_generic_request(monc, 0, req);
 511         mutex_unlock(&monc->mutex);
 512
 513         return err;
 514 }
 515
 516 /*
 517  * statfs
 518  */
 519 static void handle_statfs_reply(struct ceph_mon_client *monc,
 520                                 struct ceph_msg *msg)
 521 {
 522         struct ceph_mon_generic_request *req;
 523         struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
 524         u64 tid = le64_to_cpu(msg->hdr.tid);
 525
 526         if (msg->front.iov_len != sizeof(*reply))
 527                 goto bad;
 528         dout("handle_statfs_reply %p tid %llu\n", msg, tid);
 529
 530         mutex_lock(&monc->mutex);
 531         req = __lookup_generic_req(monc, tid);
 532         if (req) {
 533                 *(struct ceph_statfs *)req->buf = reply->st;
 534                 req->result = 0;
 535                 get_generic_request(req);
 536         }
 537         mutex_unlock(&monc->mutex);
 538         if (req) {
 539                 complete_all(&req->completion);
 540                 put_generic_request(req);
 541         }
 542         return;
 543
 544 bad:
 545         pr_err("corrupt statfs reply, tid %llu\n", tid);
 546         ceph_msg_dump(msg);
 547 }
 548
 549 /*
 550  * Do a synchronous statfs().
 551  */
 552 int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 553 {
 554         struct ceph_mon_generic_request *req;
 555         struct ceph_mon_statfs *h;
 556         int err;
 557
 558         req = kzalloc(sizeof(*req), GFP_NOFS);
 559         if (!req)
 560                 return -ENOMEM;
 561
 562         kref_init(&req->kref);
 563         req->buf = buf;
 564         init_completion(&req->completion);
 565
 566         err = -ENOMEM;
 567         req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
 568                                     true);
 569         if (!req->request)
 570                 goto out;
 571         req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS,
 572                                   true);
 573         if (!req->reply)
 574                 goto out;
 575
 576         /* fill out request */
 577         h = req->request->front.iov_base;
 578         h->monhdr.have_version = 0;
 579         h->monhdr.session_mon = cpu_to_le16(-1);
 580         h->monhdr.session_mon_tid = 0;
 581         h->fsid = monc->monmap->fsid;
 582
 583         err = do_generic_request(monc, req);
 584
 585 out:
 586         put_generic_request(req);
 587         return err;
 588 }
 589 EXPORT_SYMBOL(ceph_monc_do_statfs);
 590
 591 static void handle_get_version_reply(struct ceph_mon_client *monc,
 592                                      struct ceph_msg *msg)
 593 {
 594         struct ceph_mon_generic_request *req;
 595         u64 tid = le64_to_cpu(msg->hdr.tid);
 596         void *p = msg->front.iov_base;
 597         void *end = p + msg->front_alloc_len;
 598         u64 handle;
 599
 600         dout("%s %p tid %llu\n", __func__, msg, tid);
 601
 602         ceph_decode_need(&p, end, 2*sizeof(u64), bad);
 603         handle = ceph_decode_64(&p);
 604         if (tid != 0 && tid != handle)
 605                 goto bad;
 606
 607         mutex_lock(&monc->mutex);
 608         req = __lookup_generic_req(monc, handle);
 609         if (req) {
 610                 *(u64 *)req->buf = ceph_decode_64(&p);
 611                 req->result = 0;
 612                 get_generic_request(req);
 613         }
 614         mutex_unlock(&monc->mutex);
 615         if (req) {
 616                 complete_all(&req->completion);
 617                 put_generic_request(req);
 618         }
 619
 620         return;
 621 bad:
 622         pr_err("corrupt mon_get_version reply, tid %llu\n", tid);
 623         ceph_msg_dump(msg);
 624 }
 625
 626 /*
 627  * Send MMonGetVersion and wait for the reply.
 628  *
 629  * @what: one of "mdsmap", "osdmap" or "monmap"
 630  */
 631 int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
 632                              u64 *newest)
 633 {
 634         struct ceph_mon_generic_request *req;
 635         void *p, *end;
 636         u64 tid;
 637         int err;
 638
 639         req = kzalloc(sizeof(*req), GFP_NOFS);
 640         if (!req)
 641                 return -ENOMEM;
 642
 643         kref_init(&req->kref);
 644         req->buf = newest;
 645         init_completion(&req->completion);
 646
 647         req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
 648                                     sizeof(u64) + sizeof(u32) + strlen(what),
 649                                     GFP_NOFS, true);
 650         if (!req->request) {
 651                 err = -ENOMEM;
 652                 goto out;
 653         }
 654
 655         req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024,
 656                                   GFP_NOFS, true);
 657         if (!req->reply) {
 658                 err = -ENOMEM;
 659                 goto out;
 660         }
 661
 662         p = req->request->front.iov_base;
 663         end = p + req->request->front_alloc_len;
 664
 665         /* fill out request */
 666         mutex_lock(&monc->mutex);
 667         tid = ++monc->last_tid;
 668         ceph_encode_64(&p, tid); /* handle */
 669         ceph_encode_string(&p, end, what, strlen(what));
 670
 671         err = __do_generic_request(monc, tid, req);
 672
 673         mutex_unlock(&monc->mutex);
 674 out:
 675         put_generic_request(req);
 676         return err;
 677 }
 678 EXPORT_SYMBOL(ceph_monc_do_get_version);
 679
 680 /*
 681  * Resend pending generic requests.
 682  */
 683 static void __resend_generic_request(struct ceph_mon_client *monc)
 684 {
 685         struct ceph_mon_generic_request *req;
 686         struct rb_node *p;
 687
 688         for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
 689                 req = rb_entry(p, struct ceph_mon_generic_request, node);
 690                 ceph_msg_revoke(req->request);
 691                 ceph_msg_revoke_incoming(req->reply);
 692                 ceph_con_send(&monc->con, ceph_msg_get(req->request));
 693         }
 694 }
 695
 696 /*
 697  * Delayed work.  If we haven't mounted yet, retry.  Otherwise,
 698  * renew/retry subscription as needed (in case it is timing out, or we
 699  * got an ENOMEM).  And keep the monitor connection alive.
 700  */
 701 static void delayed_work(struct work_struct *work)
 702 {
 703         struct ceph_mon_client *monc =
 704                 container_of(work, struct ceph_mon_client, delayed_work.work);
 705
 706         dout("monc delayed_work\n");
 707         mutex_lock(&monc->mutex);
 708         if (monc->hunting) {
 709                 __close_session(monc);
 710                 __open_session(monc);  /* continue hunting */
 711         } else {
 712                 struct ceph_options *opt = monc->client->options;
 713                 int is_auth = ceph_auth_is_authenticated(monc->auth);
 714                 if (ceph_con_keepalive_expired(&monc->con,
 715                                                opt->monc_ping_timeout)) {
 716                         dout("monc keepalive timeout\n");
 717                         is_auth = 0;
 718                         __close_session(monc);
 719                         monc->hunting = true;
 720                         __open_session(monc);
 721                 }
 722
 723                 if (!monc->hunting) {
 724                         ceph_con_keepalive(&monc->con);
 725                         __validate_auth(monc);
 726                 }
 727
 728                 if (is_auth)
 729                         __send_subscribe(monc);
 730         }
 731         __schedule_delayed(monc);
 732         mutex_unlock(&monc->mutex);
 733 }
 734
 735 /*
 736  * On startup, we build a temporary monmap populated with the IPs
 737  * provided by mount(2).
 738  */
 739 static int build_initial_monmap(struct ceph_mon_client *monc)
 740 {
 741         struct ceph_options *opt = monc->client->options;
 742         struct ceph_entity_addr *mon_addr = opt->mon_addr;
 743         int num_mon = opt->num_mon;
 744         int i;
 745
 746         /* build initial monmap */
 747         monc->monmap = kzalloc(sizeof(*monc->monmap) +
 748                                num_mon*sizeof(monc->monmap->mon_inst[0]),
 749                                GFP_KERNEL);
 750         if (!monc->monmap)
 751                 return -ENOMEM;
 752         for (i = 0; i < num_mon; i++) {
 753                 monc->monmap->mon_inst[i].addr = mon_addr[i];
 754                 monc->monmap->mon_inst[i].addr.nonce = 0;
 755                 monc->monmap->mon_inst[i].name.type =
 756                         CEPH_ENTITY_TYPE_MON;
 757                 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
 758         }
 759         monc->monmap->num_mon = num_mon;
 760         return 0;
 761 }
 762
 763 int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 764 {
 765         int err = 0;
 766
 767         dout("init\n");
 768         memset(monc, 0, sizeof(*monc));
 769         monc->client = cl;
 770         monc->monmap = NULL;
 771         mutex_init(&monc->mutex);
 772
 773         err = build_initial_monmap(monc);
 774         if (err)
 775                 goto out;
 776
 777         /* connection */
 778         /* authentication */
 779         monc->auth = ceph_auth_init(cl->options->name,
 780                                     cl->options->key);
 781         if (IS_ERR(monc->auth)) {
 782                 err = PTR_ERR(monc->auth);
 783                 goto out_monmap;
 784         }
 785         monc->auth->want_keys =
 786                 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
 787                 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
 788
 789         /* msgs */
 790         err = -ENOMEM;
 791         monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
 792                                      sizeof(struct ceph_mon_subscribe_ack),
 793                                      GFP_NOFS, true);
 794         if (!monc->m_subscribe_ack)
 795                 goto out_auth;
 796
 797         monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS,
 798                                          true);
 799         if (!monc->m_subscribe)
 800                 goto out_subscribe_ack;
 801
 802         monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS,
 803                                           true);
 804         if (!monc->m_auth_reply)
 805                 goto out_subscribe;
 806
 807         monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true);
 808         monc->pending_auth = 0;
 809         if (!monc->m_auth)
 810                 goto out_auth_reply;
 811
 812         ceph_con_init(&monc->con, monc, &mon_con_ops,
 813                       &monc->client->msgr);
 814
 815         monc->cur_mon = -1;
 816         monc->hunting = true;
 817         monc->sub_renew_after = jiffies;
 818         monc->sub_sent = 0;
 819
 820         INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
 821         monc->generic_request_tree = RB_ROOT;
 822         monc->num_generic_requests = 0;
 823         monc->last_tid = 0;
 824
 825         monc->have_mdsmap = 0;
 826         monc->have_osdmap = 0;
 827         monc->want_next_osdmap = 1;
 828         return 0;
 829
 830 out_auth_reply:
 831         ceph_msg_put(monc->m_auth_reply);
 832 out_subscribe:
 833         ceph_msg_put(monc->m_subscribe);
 834 out_subscribe_ack:
 835         ceph_msg_put(monc->m_subscribe_ack);
 836 out_auth:
 837         ceph_auth_destroy(monc->auth);
 838 out_monmap:
 839         kfree(monc->monmap);
 840 out:
 841         return err;
 842 }
 843 EXPORT_SYMBOL(ceph_monc_init);
 844
 845 void ceph_monc_stop(struct ceph_mon_client *monc)
 846 {
 847         dout("stop\n");
 848         cancel_delayed_work_sync(&monc->delayed_work);
 849
 850         mutex_lock(&monc->mutex);
 851         __close_session(monc);
 852
 853         mutex_unlock(&monc->mutex);
 854
 855         /*
 856          * flush msgr queue before we destroy ourselves to ensure that:
 857          *  - any work that references our embedded con is finished.
 858          *  - any osd_client or other work that may reference an authorizer
 859          *    finishes before we shut down the auth subsystem.
 860          */
 861         ceph_msgr_flush();
 862
 863         ceph_auth_destroy(monc->auth);
 864
 865         ceph_msg_put(monc->m_auth);
 866         ceph_msg_put(monc->m_auth_reply);
 867         ceph_msg_put(monc->m_subscribe);
 868         ceph_msg_put(monc->m_subscribe_ack);
 869
 870         kfree(monc->monmap);
 871 }
 872 EXPORT_SYMBOL(ceph_monc_stop);
 873
 874 static void finish_hunting(struct ceph_mon_client *monc)
 875 {
 876         if (monc->hunting) {
 877                 dout("%s found mon%d\n", __func__, monc->cur_mon);
 878                 monc->hunting = false;
 879         }
 880 }
 881
 882 static void handle_auth_reply(struct ceph_mon_client *monc,
 883                               struct ceph_msg *msg)
 884 {
 885         int ret;
 886         int was_auth = 0;
 887
 888         mutex_lock(&monc->mutex);
 889         was_auth = ceph_auth_is_authenticated(monc->auth);
 890         monc->pending_auth = 0;
 891         ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
 892                                      msg->front.iov_len,
 893                                      monc->m_auth->front.iov_base,
 894                                      monc->m_auth->front_alloc_len);
 895         if (ret > 0) {
 896                 __send_prepared_auth_request(monc, ret);
 897                 goto out;
 898         }
 899
 900         finish_hunting(monc);
 901
 902         if (ret < 0) {
 903                 monc->client->auth_err = ret;
 904         } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) {
 905                 dout("authenticated, starting session\n");
 906
 907                 monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
 908                 monc->client->msgr.inst.name.num =
 909                                         cpu_to_le64(monc->auth->global_id);
 910
 911                 __send_subscribe(monc);
 912                 __resend_generic_request(monc);
 913
 914                 pr_info("mon%d %s session established\n", monc->cur_mon,
 915                         ceph_pr_addr(&monc->con.peer_addr.in_addr));
 916         }
 917
 918 out:
 919         mutex_unlock(&monc->mutex);
 920         if (monc->client->auth_err < 0)
 921                 wake_up_all(&monc->client->auth_wq);
 922 }
 923
 924 static int __validate_auth(struct ceph_mon_client *monc)
 925 {
 926         int ret;
 927
 928         if (monc->pending_auth)
 929                 return 0;
 930
 931         ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
 932                               monc->m_auth->front_alloc_len);
 933         if (ret <= 0)
 934                 return ret; /* either an error, or no need to authenticate */
 935         __send_prepared_auth_request(monc, ret);
 936         return 0;
 937 }
 938
 939 int ceph_monc_validate_auth(struct ceph_mon_client *monc)
 940 {
 941         int ret;
 942
 943         mutex_lock(&monc->mutex);
 944         ret = __validate_auth(monc);
 945         mutex_unlock(&monc->mutex);
 946         return ret;
 947 }
 948 EXPORT_SYMBOL(ceph_monc_validate_auth);
 949
 950 /*
 951  * handle incoming message
 952  */
 953 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 954 {
 955         struct ceph_mon_client *monc = con->private;
 956         int type = le16_to_cpu(msg->hdr.type);
 957
 958         if (!monc)
 959                 return;
 960
 961         switch (type) {
 962         case CEPH_MSG_AUTH_REPLY:
 963                 handle_auth_reply(monc, msg);
 964                 break;
 965
 966         case CEPH_MSG_MON_SUBSCRIBE_ACK:
 967                 handle_subscribe_ack(monc, msg);
 968                 break;
 969
 970         case CEPH_MSG_STATFS_REPLY:
 971                 handle_statfs_reply(monc, msg);
 972                 break;
 973
 974         case CEPH_MSG_MON_GET_VERSION_REPLY:
 975                 handle_get_version_reply(monc, msg);
 976                 break;
 977
 978         case CEPH_MSG_MON_MAP:
 979                 ceph_monc_handle_map(monc, msg);
 980                 break;
 981
 982         case CEPH_MSG_OSD_MAP:
 983                 ceph_osdc_handle_map(&monc->client->osdc, msg);
 984                 break;
 985
 986         default:
 987                 /* can the chained handler handle it? */
 988                 if (monc->client->extra_mon_dispatch &&
 989                     monc->client->extra_mon_dispatch(monc->client, msg) == 0)
 990                         break;
 991
 992                 pr_err("received unknown message type %d %s\n", type,
 993                        ceph_msg_type_name(type));
 994         }
 995         ceph_msg_put(msg);
 996 }
 997
 998 /*
 999  * Allocate memory for incoming message
1000  */
1001 static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
1002                                       struct ceph_msg_header *hdr,
1003                                       int *skip)
1004 {
1005         struct ceph_mon_client *monc = con->private;
1006         int type = le16_to_cpu(hdr->type);
1007         int front_len = le32_to_cpu(hdr->front_len);
1008         struct ceph_msg *m = NULL;
1009
1010         *skip = 0;
1011
1012         switch (type) {
1013         case CEPH_MSG_MON_SUBSCRIBE_ACK:
1014                 m = ceph_msg_get(monc->m_subscribe_ack);
1015                 break;
1016         case CEPH_MSG_STATFS_REPLY:
1017                 return get_generic_reply(con, hdr, skip);
1018         case CEPH_MSG_AUTH_REPLY:
1019                 m = ceph_msg_get(monc->m_auth_reply);
1020                 break;
1021         case CEPH_MSG_MON_GET_VERSION_REPLY:
1022                 if (le64_to_cpu(hdr->tid) != 0)
1023                         return get_generic_reply(con, hdr, skip);
1024
1025                 /*
1026                  * Older OSDs don't set reply tid even if the orignal
1027                  * request had a non-zero tid.  Workaround this weirdness
1028                  * by falling through to the allocate case.
1029                  */
1030         case CEPH_MSG_MON_MAP:
1031         case CEPH_MSG_MDS_MAP:
1032         case CEPH_MSG_OSD_MAP:
1033                 m = ceph_msg_new(type, front_len, GFP_NOFS, false);
1034                 if (!m)
1035                         return NULL;    /* ENOMEM--return skip == 0 */
1036                 break;
1037         }
1038
1039         if (!m) {
1040                 pr_info("alloc_msg unknown type %d\n", type);
1041                 *skip = 1;
1042         } else if (front_len > m->front_alloc_len) {
1043                 pr_warn("mon_alloc_msg front %d > prealloc %d (%u#%llu)\n",
1044                         front_len, m->front_alloc_len,
1045                         (unsigned int)con->peer_name.type,
1046                         le64_to_cpu(con->peer_name.num));
1047                 ceph_msg_put(m);
1048                 m = ceph_msg_new(type, front_len, GFP_NOFS, false);
1049         }
1050
1051         return m;
1052 }
1053
1054 /*
1055  * If the monitor connection resets, pick a new monitor and resubmit
1056  * any pending requests.
1057  */
1058 static void mon_fault(struct ceph_connection *con)
1059 {
1060         struct ceph_mon_client *monc = con->private;
1061
1062         if (!monc)
1063                 return;
1064
1065         dout("mon_fault\n");
1066         mutex_lock(&monc->mutex);
1067         if (!con->private)
1068                 goto out;
1069
1070         if (!monc->hunting)
1071                 pr_info("mon%d %s session lost, "
1072                         "hunting for new mon\n", monc->cur_mon,
1073                         ceph_pr_addr(&monc->con.peer_addr.in_addr));
1074
1075         __close_session(monc);
1076         if (!monc->hunting) {
1077                 /* start hunting */
1078                 monc->hunting = true;
1079                 __open_session(monc);
1080         } else {
1081                 /* already hunting, let's wait a bit */
1082                 __schedule_delayed(monc);
1083         }
1084 out:
1085         mutex_unlock(&monc->mutex);
1086 }
1087
/*
 * The connection struct needs no reference counting here: every
 * reference comes from the messenger workqueue, and that queue is
 * drained before the mon_client is destroyed.  Both callbacks are
 * therefore no-ops.
 */

/* "Take a reference": hands back the same connection untouched. */
static struct ceph_connection *con_get(struct ceph_connection *con)
{
        return con;
}

/* "Drop a reference": intentionally does nothing. */
static void con_put(struct ceph_connection *con)
{
}
1101
1102 static const struct ceph_connection_operations mon_con_ops = {
1103         .get = con_get,
1104         .put = con_put,
1105         .dispatch = dispatch,
1106         .fault = mon_fault,
1107         .alloc_msg = mon_alloc_msg,
1108 };