libceph: init osd->o_node in create_osd()

[deliverable/linux.git] / net / ceph / osd_client.c
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c

index ca59e66c9787303805519f2bf325cc5d5817ff55..a6dc6acd656637e8e1f2afa01b5f6408b31b0ec4 100644 (file)
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -52,7 +52,7 @@ static int op_has_extent(int op)
                 op == CEPH_OSD_OP_WRITE);
  }
  
-void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
+int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
                         struct ceph_file_layout *layout,
                         u64 snapid,
                         u64 off, u64 *plen, u64 *bno,
@@ -62,12 +62,15 @@ void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
         struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
         u64 orig_len = *plen;
         u64 objoff, objlen;    /* extent in object */
+       int r;
  
         reqhead->snapid = cpu_to_le64(snapid);
  
         /* object extent? */
-       ceph_calc_file_object_mapping(layout, off, plen, bno,
-                                     &objoff, &objlen);
+       r = ceph_calc_file_object_mapping(layout, off, plen, bno,
+                                         &objoff, &objlen);
+       if (r < 0)
+               return r;
         if (*plen < orig_len)
                 dout(" skipping last %llu, final file extent %llu~%llu\n",
                      orig_len - *plen, off, *plen);
@@ -83,7 +86,7 @@ void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
  
         dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
              *bno, objoff, objlen, req->r_num_pages);
-
+       return 0;
  }
  EXPORT_SYMBOL(ceph_calc_raw_layout);
  
@@ -112,20 +115,25 @@ EXPORT_SYMBOL(ceph_calc_raw_layout);
   *
   * fill osd op in request message.
   */
-static void calc_layout(struct ceph_osd_client *osdc,
-                       struct ceph_vino vino,
-                       struct ceph_file_layout *layout,
-                       u64 off, u64 *plen,
-                       struct ceph_osd_request *req,
-                       struct ceph_osd_req_op *op)
+static int calc_layout(struct ceph_osd_client *osdc,
+                      struct ceph_vino vino,
+                      struct ceph_file_layout *layout,
+                      u64 off, u64 *plen,
+                      struct ceph_osd_request *req,
+                      struct ceph_osd_req_op *op)
  {
         u64 bno;
+       int r;
  
-       ceph_calc_raw_layout(osdc, layout, vino.snap, off,
-                            plen, &bno, req, op);
+       r = ceph_calc_raw_layout(osdc, layout, vino.snap, off,
+                                plen, &bno, req, op);
+       if (r < 0)
+               return r;
  
         snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno);
         req->r_oid_len = strlen(req->r_oid);
+
+       return r;
  }
  
  /*
@@ -140,10 +148,9 @@ void ceph_osdc_release_request(struct kref *kref)
         if (req->r_request)
                 ceph_msg_put(req->r_request);
         if (req->r_con_filling_msg) {
-               dout("release_request revoking pages %p from con %p\n",
+               dout("%s revoking pages %p from con %p\n", __func__,
                      req->r_pages, req->r_con_filling_msg);
-               ceph_con_revoke_message(req->r_con_filling_msg,
-                                     req->r_reply);
+               ceph_msg_revoke_incoming(req->r_reply);
                 req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
         }
         if (req->r_reply)
@@ -214,10 +221,13 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
         kref_init(&req->r_kref);
         init_completion(&req->r_completion);
         init_completion(&req->r_safe_completion);
+       rb_init_node(&req->r_node);
         INIT_LIST_HEAD(&req->r_unsafe_item);
         INIT_LIST_HEAD(&req->r_linger_item);
         INIT_LIST_HEAD(&req->r_linger_osd);
         INIT_LIST_HEAD(&req->r_req_lru_item);
+       INIT_LIST_HEAD(&req->r_osd_item);
+
         req->r_flags = flags;
  
         WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
@@ -243,6 +253,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
                 }
                 ceph_pagelist_init(req->r_trail);
         }
+
         /* create request message; allow space for oid */
         msg_size += MAX_OBJ_NAME_SIZE;
         if (snapc)
@@ -256,7 +267,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
                 return NULL;
         }
  
-       msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
         memset(msg->front.iov_base, 0, msg->front.iov_len);
  
         req->r_request = msg;
@@ -454,6 +464,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
  {
         struct ceph_osd_req_op ops[3];
         struct ceph_osd_request *req;
+       int r;
  
         ops[0].op = opcode;
         ops[0].extent.truncate_seq = truncate_seq;
@@ -472,10 +483,12 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
                                          use_mempool,
                                          GFP_NOFS, NULL, NULL);
         if (!req)
-               return NULL;
+               return ERR_PTR(-ENOMEM);
  
         /* calculate max write size */
-       calc_layout(osdc, vino, layout, off, plen, req, ops);
+       r = calc_layout(osdc, vino, layout, off, plen, req, ops);
+       if (r < 0)
+               return ERR_PTR(r);
         req->r_file_layout = *layout;  /* keep a copy */
  
         /* in case it differs from natural (file) alignment that
@@ -568,7 +581,7 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
  
         dout("__kick_osd_requests osd%d\n", osd->o_osd);
         err = __reset_osd(osdc, osd);
-       if (err == -EAGAIN)
+       if (err)
                 return;
  
         list_for_each_entry(req, &osd->o_requests, r_osd_item) {
@@ -595,14 +608,6 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
         }
  }
  
-static void kick_osd_requests(struct ceph_osd_client *osdc,
-                             struct ceph_osd *kickosd)
-{
-       mutex_lock(&osdc->request_mutex);
-       __kick_osd_requests(osdc, kickosd);
-       mutex_unlock(&osdc->request_mutex);
-}
-
  /*
   * If the osd connection drops, we need to resubmit all requests.
   */
@@ -616,7 +621,9 @@ static void osd_reset(struct ceph_connection *con)
         dout("osd_reset osd%d\n", osd->o_osd);
         osdc = osd->o_osdc;
         down_read(&osdc->map_sem);
-       kick_osd_requests(osdc, osd);
+       mutex_lock(&osdc->request_mutex);
+       __kick_osd_requests(osdc, osd);
+       mutex_unlock(&osdc->request_mutex);
         send_queued(osdc);
         up_read(&osdc->map_sem);
  }
@@ -624,7 +631,7 @@ static void osd_reset(struct ceph_connection *con)
  /*
   * Track open sessions with osds.
   */
-static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
+static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
  {
         struct ceph_osd *osd;
  
@@ -634,15 +641,14 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
  
         atomic_set(&osd->o_ref, 1);
         osd->o_osdc = osdc;
+       osd->o_osd = onum;
+       RB_CLEAR_NODE(&osd->o_node);
         INIT_LIST_HEAD(&osd->o_requests);
         INIT_LIST_HEAD(&osd->o_linger_requests);
         INIT_LIST_HEAD(&osd->o_osd_lru);
         osd->o_incarnation = 1;
  
-       ceph_con_init(osdc->client->msgr, &osd->o_con);
-       osd->o_con.private = osd;
-       osd->o_con.ops = &osd_con_ops;
-       osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
+       ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
  
         INIT_LIST_HEAD(&osd->o_keepalive_item);
         return osd;
@@ -688,7 +694,7 @@ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
  
  static void remove_all_osds(struct ceph_osd_client *osdc)
  {
-       dout("__remove_old_osds %p\n", osdc);
+       dout("%s %p\n", __func__, osdc);
         mutex_lock(&osdc->request_mutex);
         while (!RB_EMPTY_ROOT(&osdc->osds)) {
                 struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
@@ -740,6 +746,7 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
         if (list_empty(&osd->o_requests) &&
             list_empty(&osd->o_linger_requests)) {
                 __remove_osd(osdc, osd);
+               ret = -ENODEV;
         } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
                           &osd->o_con.peer_addr,
                           sizeof(osd->o_con.peer_addr)) == 0 &&
@@ -752,7 +759,8 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
                 ret = -EAGAIN;
         } else {
                 ceph_con_close(&osd->o_con);
-               ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
+               ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
+                             &osdc->osdmap->osd_addr[osd->o_osd]);
                 osd->o_incarnation++;
         }
         return ret;
@@ -853,7 +861,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
  
         if (req->r_osd) {
                 /* make sure the original request isn't in flight. */
-               ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+               ceph_msg_revoke(req->r_request);
  
                 list_del_init(&req->r_osd_item);
                 if (list_empty(&req->r_osd->o_requests) &&
@@ -865,9 +873,9 @@ static void __unregister_request(struct ceph_osd_client *osdc,
                         req->r_osd = NULL;
         }
  
+       list_del_init(&req->r_req_lru_item);
         ceph_osdc_put_request(req);
  
-       list_del_init(&req->r_req_lru_item);
         if (osdc->num_requests == 0) {
                 dout(" no requests, canceling timeout\n");
                 __cancel_osd_timeout(osdc);
@@ -880,7 +888,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
  static void __cancel_request(struct ceph_osd_request *req)
  {
         if (req->r_sent && req->r_osd) {
-               ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+               ceph_msg_revoke(req->r_request);
                 req->r_sent = 0;
         }
  }
@@ -890,15 +898,17 @@ static void __register_linger_request(struct ceph_osd_client *osdc,
  {
         dout("__register_linger_request %p\n", req);
         list_add_tail(&req->r_linger_item, &osdc->req_linger);
-       list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests);
+       if (req->r_osd)
+               list_add_tail(&req->r_linger_osd,
+                             &req->r_osd->o_linger_requests);
  }
  
  static void __unregister_linger_request(struct ceph_osd_client *osdc,
                                         struct ceph_osd_request *req)
  {
         dout("__unregister_linger_request %p\n", req);
+       list_del_init(&req->r_linger_item);
         if (req->r_osd) {
-               list_del_init(&req->r_linger_item);
                 list_del_init(&req->r_linger_osd);
  
                 if (list_empty(&req->r_osd->o_requests) &&
@@ -998,18 +1008,18 @@ static int __map_request(struct ceph_osd_client *osdc,
         req->r_osd = __lookup_osd(osdc, o);
         if (!req->r_osd && o >= 0) {
                 err = -ENOMEM;
-               req->r_osd = create_osd(osdc);
+               req->r_osd = create_osd(osdc, o);
                 if (!req->r_osd) {
                         list_move(&req->r_req_lru_item, &osdc->req_notarget);
                         goto out;
                 }
  
                 dout("map_request osd %p is osd%d\n", req->r_osd, o);
-               req->r_osd->o_osd = o;
-               req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
                 __insert_osd(osdc, req->r_osd);
  
-               ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
+               ceph_con_open(&req->r_osd->o_con,
+                             CEPH_ENTITY_TYPE_OSD, o,
+                             &osdc->osdmap->osd_addr[o]);
         }
  
         if (req->r_osd) {
@@ -1077,12 +1087,10 @@ static void handle_timeout(struct work_struct *work)
  {
         struct ceph_osd_client *osdc =
                 container_of(work, struct ceph_osd_client, timeout_work.work);
-       struct ceph_osd_request *req, *last_req = NULL;
+       struct ceph_osd_request *req;
         struct ceph_osd *osd;
-       unsigned long timeout = osdc->client->options->osd_timeout * HZ;
         unsigned long keepalive =
                 osdc->client->options->osd_keepalive_timeout * HZ;
-       unsigned long last_stamp = 0;
         struct list_head slow_osds;
         dout("timeout\n");
         down_read(&osdc->map_sem);
@@ -1091,37 +1099,6 @@ static void handle_timeout(struct work_struct *work)
  
         mutex_lock(&osdc->request_mutex);
  
-       /*
-        * reset osds that appear to be _really_ unresponsive.  this
-        * is a failsafe measure.. we really shouldn't be getting to
-        * this point if the system is working properly.  the monitors
-        * should mark the osd as failed and we should find out about
-        * it from an updated osd map.
-        */
-       while (timeout && !list_empty(&osdc->req_lru)) {
-               req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
-                                r_req_lru_item);
-
-               /* hasn't been long enough since we sent it? */
-               if (time_before(jiffies, req->r_stamp + timeout))
-                       break;
-
-               /* hasn't been long enough since it was acked? */
-               if (req->r_request->ack_stamp == 0 ||
-                   time_before(jiffies, req->r_request->ack_stamp + timeout))
-                       break;
-
-               BUG_ON(req == last_req && req->r_stamp == last_stamp);
-               last_req = req;
-               last_stamp = req->r_stamp;
-
-               osd = req->r_osd;
-               BUG_ON(!osd);
-               pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
-                          req->r_tid, osd->o_osd);
-               __kick_osd_requests(osdc, osd);
-       }
-
         /*
          * ping osds that are a bit slow.  this ensures that if there
          * is a break in the TCP connection we will notice, and reopen
@@ -1304,8 +1281,9 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
  
         dout("kick_requests %s\n", force_resend ? " (force resend)" : "");
         mutex_lock(&osdc->request_mutex);
-       for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+       for (p = rb_first(&osdc->requests); p; ) {
                 req = rb_entry(p, struct ceph_osd_request, r_node);
+               p = rb_next(p);
                 err = __map_request(osdc, req, force_resend);
                 if (err < 0)
                         continue;  /* error */
@@ -1313,10 +1291,23 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
                         dout("%p tid %llu maps to no osd\n", req, req->r_tid);
                         needmap++;  /* request a newer map */
                 } else if (err > 0) {
-                       dout("%p tid %llu requeued on osd%d\n", req, req->r_tid,
-                            req->r_osd ? req->r_osd->o_osd : -1);
-                       if (!req->r_linger)
+                       if (!req->r_linger) {
+                               dout("%p tid %llu requeued on osd%d\n", req,
+                                    req->r_tid,
+                                    req->r_osd ? req->r_osd->o_osd : -1);
                                 req->r_flags |= CEPH_OSD_FLAG_RETRY;
+                       }
+               }
+               if (req->r_linger && list_empty(&req->r_linger_item)) {
+                       /*
+                        * register as a linger so that we will
+                        * re-submit below and get a new tid
+                        */
+                       dout("%p tid %llu restart on osd%d\n",
+                            req, req->r_tid,
+                            req->r_osd ? req->r_osd->o_osd : -1);
+                       __register_linger_request(osdc, req);
+                       __unregister_request(osdc, req);
                 }
         }
  
@@ -1391,7 +1382,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
                              epoch, maplen);
                         newmap = osdmap_apply_incremental(&p, next,
                                                           osdc->osdmap,
-                                                         osdc->client->msgr);
+                                                         &osdc->client->msgr);
                         if (IS_ERR(newmap)) {
                                 err = PTR_ERR(newmap);
                                 goto bad;
@@ -1839,11 +1830,12 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
         if (!osdc->req_mempool)
                 goto out;
  
-       err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
+       err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
+                               OSD_OP_FRONT_LEN, 10, true,
                                 "osd_op");
         if (err < 0)
                 goto out_mempool;
-       err = ceph_msgpool_init(&osdc->msgpool_op_reply,
+       err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
                                 OSD_OPREPLY_FRONT_LEN, 10, true,
                                 "osd_op_reply");
         if (err < 0)
@@ -1902,8 +1894,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
                                     CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
                                     NULL, 0, truncate_seq, truncate_size, NULL,
                                     false, 1, page_align);
-       if (!req)
-               return -ENOMEM;
+       if (IS_ERR(req))
+               return PTR_ERR(req);
  
         /* it may be a short read due to an object boundary */
         req->r_pages = pages;
@@ -1945,8 +1937,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
                                     snapc, do_sync,
                                     truncate_seq, truncate_size, mtime,
                                     nofail, 1, page_align);
-       if (!req)
-               return -ENOMEM;
+       if (IS_ERR(req))
+               return PTR_ERR(req);
  
         /* it may be a short write due to an object boundary */
         req->r_pages = pages;
@@ -2019,15 +2011,15 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
         if (!req) {
                 *skip = 1;
                 m = NULL;
-               pr_info("get_reply unknown tid %llu from osd%d\n", tid,
-                       osd->o_osd);
+               dout("get_reply unknown tid %llu from osd%d\n", tid,
+                    osd->o_osd);
                 goto out;
         }
  
         if (req->r_con_filling_msg) {
-               dout("get_reply revoking msg %p from old con %p\n",
+               dout("%s revoking msg %p from old con %p\n", __func__,
                      req->r_reply, req->r_con_filling_msg);
-               ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
+               ceph_msg_revoke_incoming(req->r_reply);
                 req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
                 req->r_con_filling_msg = NULL;
         }
@@ -2080,6 +2072,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
         int type = le16_to_cpu(hdr->type);
         int front = le32_to_cpu(hdr->front_len);
  
+       *skip = 0;
         switch (type) {
         case CEPH_MSG_OSD_MAP:
         case CEPH_MSG_WATCH_NOTIFY: