drivers/block/drbd/drbd_nl.c

   1 /*
   2    drbd_nl.c
   3
   4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
   5
   6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
   9
  10    drbd is free software; you can redistribute it and/or modify
  11    it under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 2, or (at your option)
  13    any later version.
  14
  15    drbd is distributed in the hope that it will be useful,
  16    but WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18    GNU General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with drbd; see the file COPYING.  If not, write to
  22    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  23
  24  */
  25
  26 #include <linux/module.h>
  27 #include <linux/drbd.h>
  28 #include <linux/in.h>
  29 #include <linux/fs.h>
  30 #include <linux/file.h>
  31 #include <linux/slab.h>
  32 #include <linux/blkpg.h>
  33 #include <linux/cpumask.h>
  34 #include "drbd_int.h"
  35 #include "drbd_protocol.h"
  36 #include "drbd_req.h"
  37 #include "drbd_wrappers.h"
  38 #include <asm/unaligned.h>
  39 #include <linux/drbd_limits.h>
  40 #include <linux/kthread.h>
  41
  42 #include <net/genetlink.h>
  43
/* .doit handlers: one per admin command, each receives the request skb
 * and the genl_info describing the request. */

int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info);

int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_down(struct sk_buff *skb, struct genl_info *info);

int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
/* .dumpit handler: takes a netlink_callback instead of a genl_info. */
int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
  77
  78 #include <linux/drbd_genl_api.h>
  79 #include "drbd_nla.h"
  80 #include <linux/genl_magic_func.h>
  81
/* Used with blkdev_get_by_path() to claim our meta data device(s). */
static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
  84
/* Configuration is strictly serialized, because generic netlink message
 * processing is strictly serialized by the genl_lock().
 * Which means we can use one static global drbd_config_context struct.
 *
 * drbd_adm_prepare() zeroes and fills it at the start of a request;
 * drbd_adm_finish() drops the connection reference and sends reply_skb.
 */
static struct drbd_config_context {
	/* assigned from drbd_genlmsghdr */
	unsigned int minor;
	/* assigned from request attributes, if present;
	 * VOLUME_UNSPECIFIED if the request carried no volume number */
	unsigned int volume;
#define VOLUME_UNSPECIFIED		(-1U)
	/* pointers into the request skb,
	 * limited lifetime! */
	char *resource_name;
	struct nlattr *my_addr;
	struct nlattr *peer_addr;

	/* reply buffer; NULL if drbd_adm_prepare() failed to set one up */
	struct sk_buff *reply_skb;
	/* pointer into reply buffer */
	struct drbd_genlmsghdr *reply_dh;
	/* resolved from attributes, if possible */
	struct drbd_device *device;
	/* a kref is held while this is set; released in drbd_adm_finish() */
	struct drbd_connection *connection;
} adm_ctx;
 109
 110 static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
 111 {
 112         genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
 113         if (genlmsg_reply(skb, info))
 114                 printk(KERN_ERR "drbd: error sending genl reply\n");
 115 }
 116
 117 /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
 118  * reason it could fail was no space in skb, and there are 4k available. */
 119 int drbd_msg_put_info(const char *info)
 120 {
 121         struct sk_buff *skb = adm_ctx.reply_skb;
 122         struct nlattr *nla;
 123         int err = -EMSGSIZE;
 124
 125         if (!info || !info[0])
 126                 return 0;
 127
 128         nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY);
 129         if (!nla)
 130                 return err;
 131
 132         err = nla_put_string(skb, T_info_text, info);
 133         if (err) {
 134                 nla_nest_cancel(skb, nla);
 135                 return err;
 136         } else
 137                 nla_nest_end(skb, nla);
 138         return 0;
 139 }
 140
 141 /* This would be a good candidate for a "pre_doit" hook,
 142  * and per-family private info->pointers.
 143  * But we need to stay compatible with older kernels.
 144  * If it returns successfully, adm_ctx members are valid.
 145  */
 146 #define DRBD_ADM_NEED_MINOR     1
 147 #define DRBD_ADM_NEED_RESOURCE  2
 148 #define DRBD_ADM_NEED_CONNECTION 4
 149 static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info,
 150                 unsigned flags)
 151 {
 152         struct drbd_genlmsghdr *d_in = info->userhdr;
 153         const u8 cmd = info->genlhdr->cmd;
 154         int err;
 155
 156         memset(&adm_ctx, 0, sizeof(adm_ctx));
 157
 158         /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */
 159         if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN))
 160                return -EPERM;
 161
 162         adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
 163         if (!adm_ctx.reply_skb) {
 164                 err = -ENOMEM;
 165                 goto fail;
 166         }
 167
 168         adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb,
 169                                         info, &drbd_genl_family, 0, cmd);
 170         /* put of a few bytes into a fresh skb of >= 4k will always succeed.
 171          * but anyways */
 172         if (!adm_ctx.reply_dh) {
 173                 err = -ENOMEM;
 174                 goto fail;
 175         }
 176
 177         adm_ctx.reply_dh->minor = d_in->minor;
 178         adm_ctx.reply_dh->ret_code = NO_ERROR;
 179
 180         adm_ctx.volume = VOLUME_UNSPECIFIED;
 181         if (info->attrs[DRBD_NLA_CFG_CONTEXT]) {
 182                 struct nlattr *nla;
 183                 /* parse and validate only */
 184                 err = drbd_cfg_context_from_attrs(NULL, info);
 185                 if (err)
 186                         goto fail;
 187
 188                 /* It was present, and valid,
 189                  * copy it over to the reply skb. */
 190                 err = nla_put_nohdr(adm_ctx.reply_skb,
 191                                 info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len,
 192                                 info->attrs[DRBD_NLA_CFG_CONTEXT]);
 193                 if (err)
 194                         goto fail;
 195
 196                 /* and assign stuff to the global adm_ctx */
 197                 nla = nested_attr_tb[__nla_type(T_ctx_volume)];
 198                 if (nla)
 199                         adm_ctx.volume = nla_get_u32(nla);
 200                 nla = nested_attr_tb[__nla_type(T_ctx_resource_name)];
 201                 if (nla)
 202                         adm_ctx.resource_name = nla_data(nla);
 203                 adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)];
 204                 adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)];
 205                 if ((adm_ctx.my_addr &&
 206                      nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.connection->my_addr)) ||
 207                     (adm_ctx.peer_addr &&
 208                      nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.connection->peer_addr))) {
 209                         err = -EINVAL;
 210                         goto fail;
 211                 }
 212         }
 213
 214         adm_ctx.minor = d_in->minor;
 215         adm_ctx.device = minor_to_device(d_in->minor);
 216         adm_ctx.connection = conn_get_by_name(adm_ctx.resource_name);
 217
 218         if (!adm_ctx.device && (flags & DRBD_ADM_NEED_MINOR)) {
 219                 drbd_msg_put_info("unknown minor");
 220                 return ERR_MINOR_INVALID;
 221         }
 222         if (!adm_ctx.connection && (flags & DRBD_ADM_NEED_RESOURCE)) {
 223                 drbd_msg_put_info("unknown resource");
 224                 return ERR_INVALID_REQUEST;
 225         }
 226
 227         if (flags & DRBD_ADM_NEED_CONNECTION) {
 228                 if (adm_ctx.connection && !(flags & DRBD_ADM_NEED_RESOURCE)) {
 229                         drbd_msg_put_info("no resource name expected");
 230                         return ERR_INVALID_REQUEST;
 231                 }
 232                 if (adm_ctx.device) {
 233                         drbd_msg_put_info("no minor number expected");
 234                         return ERR_INVALID_REQUEST;
 235                 }
 236                 if (adm_ctx.my_addr && adm_ctx.peer_addr)
 237                         adm_ctx.connection = conn_get_by_addrs(nla_data(adm_ctx.my_addr),
 238                                                           nla_len(adm_ctx.my_addr),
 239                                                           nla_data(adm_ctx.peer_addr),
 240                                                           nla_len(adm_ctx.peer_addr));
 241                 if (!adm_ctx.connection) {
 242                         drbd_msg_put_info("unknown connection");
 243                         return ERR_INVALID_REQUEST;
 244                 }
 245         }
 246
 247         /* some more paranoia, if the request was over-determined */
 248         if (adm_ctx.device && adm_ctx.connection &&
 249             first_peer_device(adm_ctx.device)->connection != adm_ctx.connection) {
 250                 pr_warning("request: minor=%u, resource=%s; but that minor belongs to connection %s\n",
 251                                 adm_ctx.minor, adm_ctx.resource_name,
 252                                 first_peer_device(adm_ctx.device)->connection->name);
 253                 drbd_msg_put_info("minor exists in different resource");
 254                 return ERR_INVALID_REQUEST;
 255         }
 256         if (adm_ctx.device &&
 257             adm_ctx.volume != VOLUME_UNSPECIFIED &&
 258             adm_ctx.volume != adm_ctx.device->vnr) {
 259                 pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n",
 260                                 adm_ctx.minor, adm_ctx.volume,
 261                                 adm_ctx.device->vnr, first_peer_device(adm_ctx.device)->connection->name);
 262                 drbd_msg_put_info("minor exists as different volume");
 263                 return ERR_INVALID_REQUEST;
 264         }
 265
 266         return NO_ERROR;
 267
 268 fail:
 269         nlmsg_free(adm_ctx.reply_skb);
 270         adm_ctx.reply_skb = NULL;
 271         return err;
 272 }
 273
 274 static int drbd_adm_finish(struct genl_info *info, int retcode)
 275 {
 276         if (adm_ctx.connection) {
 277                 kref_put(&adm_ctx.connection->kref, drbd_destroy_connection);
 278                 adm_ctx.connection = NULL;
 279         }
 280
 281         if (!adm_ctx.reply_skb)
 282                 return -ENOMEM;
 283
 284         adm_ctx.reply_dh->ret_code = retcode;
 285         drbd_adm_send_reply(adm_ctx.reply_skb, info);
 286         return 0;
 287 }
 288
 289 static void setup_khelper_env(struct drbd_connection *connection, char **envp)
 290 {
 291         char *afs;
 292
 293         /* FIXME: A future version will not allow this case. */
 294         if (connection->my_addr_len == 0 || connection->peer_addr_len == 0)
 295                 return;
 296
 297         switch (((struct sockaddr *)&connection->peer_addr)->sa_family) {
 298         case AF_INET6:
 299                 afs = "ipv6";
 300                 snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6",
 301                          &((struct sockaddr_in6 *)&connection->peer_addr)->sin6_addr);
 302                 break;
 303         case AF_INET:
 304                 afs = "ipv4";
 305                 snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
 306                          &((struct sockaddr_in *)&connection->peer_addr)->sin_addr);
 307                 break;
 308         default:
 309                 afs = "ssocks";
 310                 snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
 311                          &((struct sockaddr_in *)&connection->peer_addr)->sin_addr);
 312         }
 313         snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs);
 314 }
 315
 316 int drbd_khelper(struct drbd_device *device, char *cmd)
 317 {
 318         char *envp[] = { "HOME=/",
 319                         "TERM=linux",
 320                         "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
 321                          (char[20]) { }, /* address family */
 322                          (char[60]) { }, /* address */
 323                         NULL };
 324         char mb[12];
 325         char *argv[] = {usermode_helper, cmd, mb, NULL };
 326         struct drbd_connection *connection = first_peer_device(device)->connection;
 327         struct sib_info sib;
 328         int ret;
 329
 330         if (current == connection->worker.task)
 331                 set_bit(CALLBACK_PENDING, &connection->flags);
 332
 333         snprintf(mb, 12, "minor-%d", device_to_minor(device));
 334         setup_khelper_env(connection, envp);
 335
 336         /* The helper may take some time.
 337          * write out any unsynced meta data changes now */
 338         drbd_md_sync(device);
 339
 340         dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
 341         sib.sib_reason = SIB_HELPER_PRE;
 342         sib.helper_name = cmd;
 343         drbd_bcast_event(device, &sib);
 344         ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
 345         if (ret)
 346                 dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
 347                                 usermode_helper, cmd, mb,
 348                                 (ret >> 8) & 0xff, ret);
 349         else
 350                 dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
 351                                 usermode_helper, cmd, mb,
 352                                 (ret >> 8) & 0xff, ret);
 353         sib.sib_reason = SIB_HELPER_POST;
 354         sib.helper_exit_code = ret;
 355         drbd_bcast_event(device, &sib);
 356
 357         if (current == connection->worker.task)
 358                 clear_bit(CALLBACK_PENDING, &connection->flags);
 359
 360         if (ret < 0) /* Ignore any ERRNOs we got. */
 361                 ret = 0;
 362
 363         return ret;
 364 }
 365
 366 static int conn_khelper(struct drbd_connection *connection, char *cmd)
 367 {
 368         char *envp[] = { "HOME=/",
 369                         "TERM=linux",
 370                         "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
 371                          (char[20]) { }, /* address family */
 372                          (char[60]) { }, /* address */
 373                         NULL };
 374         char *argv[] = {usermode_helper, cmd, connection->name, NULL };
 375         int ret;
 376
 377         setup_khelper_env(connection, envp);
 378         conn_md_sync(connection);
 379
 380         conn_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, connection->name);
 381         /* TODO: conn_bcast_event() ?? */
 382
 383         ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
 384         if (ret)
 385                 conn_warn(connection, "helper command: %s %s %s exit code %u (0x%x)\n",
 386                           usermode_helper, cmd, connection->name,
 387                           (ret >> 8) & 0xff, ret);
 388         else
 389                 conn_info(connection, "helper command: %s %s %s exit code %u (0x%x)\n",
 390                           usermode_helper, cmd, connection->name,
 391                           (ret >> 8) & 0xff, ret);
 392         /* TODO: conn_bcast_event() ?? */
 393
 394         if (ret < 0) /* Ignore any ERRNOs we got. */
 395                 ret = 0;
 396
 397         return ret;
 398 }
 399
 400 static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connection)
 401 {
 402         enum drbd_fencing_p fp = FP_NOT_AVAIL;
 403         struct drbd_device *device;
 404         int vnr;
 405
 406         rcu_read_lock();
 407         idr_for_each_entry(&connection->volumes, device, vnr) {
 408                 if (get_ldev_if_state(device, D_CONSISTENT)) {
 409                         fp = max_t(enum drbd_fencing_p, fp,
 410                                    rcu_dereference(device->ldev->disk_conf)->fencing);
 411                         put_ldev(device);
 412                 }
 413         }
 414         rcu_read_unlock();
 415
 416         return fp;
 417 }
 418
 419 bool conn_try_outdate_peer(struct drbd_connection *connection)
 420 {
 421         unsigned int connect_cnt;
 422         union drbd_state mask = { };
 423         union drbd_state val = { };
 424         enum drbd_fencing_p fp;
 425         char *ex_to_string;
 426         int r;
 427
 428         if (connection->cstate >= C_WF_REPORT_PARAMS) {
 429                 conn_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n");
 430                 return false;
 431         }
 432
 433         spin_lock_irq(&connection->req_lock);
 434         connect_cnt = connection->connect_cnt;
 435         spin_unlock_irq(&connection->req_lock);
 436
 437         fp = highest_fencing_policy(connection);
 438         switch (fp) {
 439         case FP_NOT_AVAIL:
 440                 conn_warn(connection, "Not fencing peer, I'm not even Consistent myself.\n");
 441                 goto out;
 442         case FP_DONT_CARE:
 443                 return true;
 444         default: ;
 445         }
 446
 447         r = conn_khelper(connection, "fence-peer");
 448
 449         switch ((r>>8) & 0xff) {
 450         case 3: /* peer is inconsistent */
 451                 ex_to_string = "peer is inconsistent or worse";
 452                 mask.pdsk = D_MASK;
 453                 val.pdsk = D_INCONSISTENT;
 454                 break;
 455         case 4: /* peer got outdated, or was already outdated */
 456                 ex_to_string = "peer was fenced";
 457                 mask.pdsk = D_MASK;
 458                 val.pdsk = D_OUTDATED;
 459                 break;
 460         case 5: /* peer was down */
 461                 if (conn_highest_disk(connection) == D_UP_TO_DATE) {
 462                         /* we will(have) create(d) a new UUID anyways... */
 463                         ex_to_string = "peer is unreachable, assumed to be dead";
 464                         mask.pdsk = D_MASK;
 465                         val.pdsk = D_OUTDATED;
 466                 } else {
 467                         ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
 468                 }
 469                 break;
 470         case 6: /* Peer is primary, voluntarily outdate myself.
 471                  * This is useful when an unconnected R_SECONDARY is asked to
 472                  * become R_PRIMARY, but finds the other peer being active. */
 473                 ex_to_string = "peer is active";
 474                 conn_warn(connection, "Peer is primary, outdating myself.\n");
 475                 mask.disk = D_MASK;
 476                 val.disk = D_OUTDATED;
 477                 break;
 478         case 7:
 479                 if (fp != FP_STONITH)
 480                         conn_err(connection, "fence-peer() = 7 && fencing != Stonith !!!\n");
 481                 ex_to_string = "peer was stonithed";
 482                 mask.pdsk = D_MASK;
 483                 val.pdsk = D_OUTDATED;
 484                 break;
 485         default:
 486                 /* The script is broken ... */
 487                 conn_err(connection, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
 488                 return false; /* Eventually leave IO frozen */
 489         }
 490
 491         conn_info(connection, "fence-peer helper returned %d (%s)\n",
 492                   (r>>8) & 0xff, ex_to_string);
 493
 494  out:
 495
 496         /* Not using
 497            conn_request_state(connection, mask, val, CS_VERBOSE);
 498            here, because we might were able to re-establish the connection in the
 499            meantime. */
 500         spin_lock_irq(&connection->req_lock);
 501         if (connection->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &connection->flags)) {
 502                 if (connection->connect_cnt != connect_cnt)
 503                         /* In case the connection was established and droped
 504                            while the fence-peer handler was running, ignore it */
 505                         conn_info(connection, "Ignoring fence-peer exit code\n");
 506                 else
 507                         _conn_request_state(connection, mask, val, CS_VERBOSE);
 508         }
 509         spin_unlock_irq(&connection->req_lock);
 510
 511         return conn_highest_pdsk(connection) <= D_OUTDATED;
 512 }
 513
 514 static int _try_outdate_peer_async(void *data)
 515 {
 516         struct drbd_connection *connection = (struct drbd_connection *)data;
 517
 518         conn_try_outdate_peer(connection);
 519
 520         kref_put(&connection->kref, drbd_destroy_connection);
 521         return 0;
 522 }
 523
 524 void conn_try_outdate_peer_async(struct drbd_connection *connection)
 525 {
 526         struct task_struct *opa;
 527
 528         kref_get(&connection->kref);
 529         opa = kthread_run(_try_outdate_peer_async, connection, "drbd_async_h");
 530         if (IS_ERR(opa)) {
 531                 conn_err(connection, "out of mem, failed to invoke fence-peer helper\n");
 532                 kref_put(&connection->kref, drbd_destroy_connection);
 533         }
 534 }
 535
 536 enum drbd_state_rv
 537 drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
 538 {
 539         const int max_tries = 4;
 540         enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
 541         struct net_conf *nc;
 542         int try = 0;
 543         int forced = 0;
 544         union drbd_state mask, val;
 545
 546         if (new_role == R_PRIMARY)
 547                 request_ping(first_peer_device(device)->connection); /* Detect a dead peer ASAP */
 548
 549         mutex_lock(device->state_mutex);
 550
 551         mask.i = 0; mask.role = R_MASK;
 552         val.i  = 0; val.role  = new_role;
 553
 554         while (try++ < max_tries) {
 555                 rv = _drbd_request_state(device, mask, val, CS_WAIT_COMPLETE);
 556
 557                 /* in case we first succeeded to outdate,
 558                  * but now suddenly could establish a connection */
 559                 if (rv == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
 560                         val.pdsk = 0;
 561                         mask.pdsk = 0;
 562                         continue;
 563                 }
 564
 565                 if (rv == SS_NO_UP_TO_DATE_DISK && force &&
 566                     (device->state.disk < D_UP_TO_DATE &&
 567                      device->state.disk >= D_INCONSISTENT)) {
 568                         mask.disk = D_MASK;
 569                         val.disk  = D_UP_TO_DATE;
 570                         forced = 1;
 571                         continue;
 572                 }
 573
 574                 if (rv == SS_NO_UP_TO_DATE_DISK &&
 575                     device->state.disk == D_CONSISTENT && mask.pdsk == 0) {
 576                         D_ASSERT(device->state.pdsk == D_UNKNOWN);
 577
 578                         if (conn_try_outdate_peer(first_peer_device(device)->connection)) {
 579                                 val.disk = D_UP_TO_DATE;
 580                                 mask.disk = D_MASK;
 581                         }
 582                         continue;
 583                 }
 584
 585                 if (rv == SS_NOTHING_TO_DO)
 586                         goto out;
 587                 if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
 588                         if (!conn_try_outdate_peer(first_peer_device(device)->connection) && force) {
 589                                 dev_warn(DEV, "Forced into split brain situation!\n");
 590                                 mask.pdsk = D_MASK;
 591                                 val.pdsk  = D_OUTDATED;
 592
 593                         }
 594                         continue;
 595                 }
 596                 if (rv == SS_TWO_PRIMARIES) {
 597                         /* Maybe the peer is detected as dead very soon...
 598                            retry at most once more in this case. */
 599                         int timeo;
 600                         rcu_read_lock();
 601                         nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
 602                         timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1;
 603                         rcu_read_unlock();
 604                         schedule_timeout_interruptible(timeo);
 605                         if (try < max_tries)
 606                                 try = max_tries - 1;
 607                         continue;
 608                 }
 609                 if (rv < SS_SUCCESS) {
 610                         rv = _drbd_request_state(device, mask, val,
 611                                                 CS_VERBOSE + CS_WAIT_COMPLETE);
 612                         if (rv < SS_SUCCESS)
 613                                 goto out;
 614                 }
 615                 break;
 616         }
 617
 618         if (rv < SS_SUCCESS)
 619                 goto out;
 620
 621         if (forced)
 622                 dev_warn(DEV, "Forced to consider local data as UpToDate!\n");
 623
 624         /* Wait until nothing is on the fly :) */
 625         wait_event(device->misc_wait, atomic_read(&device->ap_pending_cnt) == 0);
 626
 627         /* FIXME also wait for all pending P_BARRIER_ACK? */
 628
 629         if (new_role == R_SECONDARY) {
 630                 set_disk_ro(device->vdisk, true);
 631                 if (get_ldev(device)) {
 632                         device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
 633                         put_ldev(device);
 634                 }
 635         } else {
 636                 mutex_lock(&first_peer_device(device)->connection->conf_update);
 637                 nc = first_peer_device(device)->connection->net_conf;
 638                 if (nc)
 639                         nc->discard_my_data = 0; /* without copy; single bit op is atomic */
 640                 mutex_unlock(&first_peer_device(device)->connection->conf_update);
 641
 642                 set_disk_ro(device->vdisk, false);
 643                 if (get_ldev(device)) {
 644                         if (((device->state.conn < C_CONNECTED ||
 645                                device->state.pdsk <= D_FAILED)
 646                               && device->ldev->md.uuid[UI_BITMAP] == 0) || forced)
 647                                 drbd_uuid_new_current(device);
 648
 649                         device->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
 650                         put_ldev(device);
 651                 }
 652         }
 653
 654         /* writeout of activity log covered areas of the bitmap
 655          * to stable storage done in after state change already */
 656
 657         if (device->state.conn >= C_WF_REPORT_PARAMS) {
 658                 /* if this was forced, we should consider sync */
 659                 if (forced)
 660                         drbd_send_uuids(device);
 661                 drbd_send_current_state(device);
 662         }
 663
 664         drbd_md_sync(device);
 665
 666         kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
 667 out:
 668         mutex_unlock(device->state_mutex);
 669         return rv;
 670 }
 671
 672 static const char *from_attrs_err_to_txt(int err)
 673 {
 674         return  err == -ENOMSG ? "required attribute missing" :
 675                 err == -EOPNOTSUPP ? "unknown mandatory attribute" :
 676                 err == -EEXIST ? "can not change invariant setting" :
 677                 "invalid attribute value";
 678 }
 679
 680 int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info)
 681 {
 682         struct set_role_parms parms;
 683         int err;
 684         enum drbd_ret_code retcode;
 685
 686         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
 687         if (!adm_ctx.reply_skb)
 688                 return retcode;
 689         if (retcode != NO_ERROR)
 690                 goto out;
 691
 692         memset(&parms, 0, sizeof(parms));
 693         if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) {
 694                 err = set_role_parms_from_attrs(&parms, info);
 695                 if (err) {
 696                         retcode = ERR_MANDATORY_TAG;
 697                         drbd_msg_put_info(from_attrs_err_to_txt(err));
 698                         goto out;
 699                 }
 700         }
 701
 702         if (info->genlhdr->cmd == DRBD_ADM_PRIMARY)
 703                 retcode = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate);
 704         else
 705                 retcode = drbd_set_role(adm_ctx.device, R_SECONDARY, 0);
 706 out:
 707         drbd_adm_finish(info, retcode);
 708         return 0;
 709 }
 710
 711 /* Initializes the md.*_offset members, so we are able to find
 712  * the on disk meta data.
 713  *
 714  * We currently have two possible layouts:
 715  * external:
 716  *   |----------- md_size_sect ------------------|
 717  *   [ 4k superblock ][ activity log ][  Bitmap  ]
 718  *   | al_offset == 8 |
 719  *   | bm_offset = al_offset + X      |
 720  *  ==> bitmap sectors = md_size_sect - bm_offset
 721  *
 722  * internal:
 723  *            |----------- md_size_sect ------------------|
 724  * [data.....][  Bitmap  ][ activity log ][ 4k superblock ]
 725  *                        | al_offset < 0 |
 726  *            | bm_offset = al_offset - Y |
 727  *  ==> bitmap sectors = Y = al_offset - bm_offset
 728  *
 729  *  Activity log size used to be fixed 32kB,
 730  *  but is about to become configurable.
 731  */
 732 static void drbd_md_set_sector_offsets(struct drbd_device *device,
 733                                        struct drbd_backing_dev *bdev)
 734 {
 735         sector_t md_size_sect = 0;
 736         unsigned int al_size_sect = bdev->md.al_size_4k * 8;
 737
 738         bdev->md.md_offset = drbd_md_ss(bdev);
 739
 740         switch (bdev->md.meta_dev_idx) {
 741         default:
 742                 /* v07 style fixed size indexed meta data */
 743                 bdev->md.md_size_sect = MD_128MB_SECT;
 744                 bdev->md.al_offset = MD_4kB_SECT;
 745                 bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
 746                 break;
 747         case DRBD_MD_INDEX_FLEX_EXT:
 748                 /* just occupy the full device; unit: sectors */
 749                 bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
 750                 bdev->md.al_offset = MD_4kB_SECT;
 751                 bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
 752                 break;
 753         case DRBD_MD_INDEX_INTERNAL:
 754         case DRBD_MD_INDEX_FLEX_INT:
 755                 /* al size is still fixed */
 756                 bdev->md.al_offset = -al_size_sect;
 757                 /* we need (slightly less than) ~ this much bitmap sectors: */
 758                 md_size_sect = drbd_get_capacity(bdev->backing_bdev);
 759                 md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
 760                 md_size_sect = BM_SECT_TO_EXT(md_size_sect);
 761                 md_size_sect = ALIGN(md_size_sect, 8);
 762
 763                 /* plus the "drbd meta data super block",
 764                  * and the activity log; */
 765                 md_size_sect += MD_4kB_SECT + al_size_sect;
 766
 767                 bdev->md.md_size_sect = md_size_sect;
 768                 /* bitmap offset is adjusted by 'super' block size */
 769                 bdev->md.bm_offset   = -md_size_sect + MD_4kB_SECT;
 770                 break;
 771         }
 772 }
 773
 774 /* input size is expected to be in KB */
 775 char *ppsize(char *buf, unsigned long long size)
 776 {
 777         /* Needs 9 bytes at max including trailing NUL:
 778          * -1ULL ==> "16384 EB" */
 779         static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
 780         int base = 0;
 781         while (size >= 10000 && base < sizeof(units)-1) {
 782                 /* shift + round */
 783                 size = (size >> 10) + !!(size & (1<<9));
 784                 base++;
 785         }
 786         sprintf(buf, "%u %cB", (unsigned)size, units[base]);
 787
 788         return buf;
 789 }
 790
 791 /* there is still a theoretical deadlock when called from receiver
 792  * on an D_INCONSISTENT R_PRIMARY:
 793  *  remote READ does inc_ap_bio, receiver would need to receive answer
 794  *  packet from remote to dec_ap_bio again.
 795  *  receiver receive_sizes(), comes here,
 796  *  waits for ap_bio_cnt == 0. -> deadlock.
 797  * but this cannot happen, actually, because:
 798  *  R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable
 799  *  (not connected, or bad/no disk on peer):
 800  *  see drbd_fail_request_early, ap_bio_cnt is zero.
 801  *  R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
 802  *  peer may not initiate a resize.
 803  */
 804 /* Note these are not to be confused with
 805  * drbd_adm_suspend_io/drbd_adm_resume_io,
 806  * which are (sub) state changes triggered by admin (drbdsetup),
 807  * and can be long lived.
 808  * This changes an device->flag, is triggered by drbd internals,
 809  * and should be short-lived. */
 810 void drbd_suspend_io(struct drbd_device *device)
 811 {
 812         set_bit(SUSPEND_IO, &device->flags);
 813         if (drbd_suspended(device))
 814                 return;
 815         wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt));
 816 }
 817
 818 void drbd_resume_io(struct drbd_device *device)
 819 {
 820         clear_bit(SUSPEND_IO, &device->flags);
 821         wake_up(&device->misc_wait);
 822 }
 823
 824 /**
 825  * drbd_determine_dev_size() -  Sets the right device size obeying all constraints
 826  * @device:     DRBD device.
 827  *
 828  * Returns 0 on success, negative return values indicate errors.
 829  * You should call drbd_md_sync() after calling this function.
 830  */
 831 enum determine_dev_size
 832 drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local)
 833 {
 834         sector_t prev_first_sect, prev_size; /* previous meta location */
 835         sector_t la_size_sect, u_size;
 836         struct drbd_md *md = &device->ldev->md;
 837         u32 prev_al_stripe_size_4k;
 838         u32 prev_al_stripes;
 839         sector_t size;
 840         char ppb[10];
 841         void *buffer;
 842
 843         int md_moved, la_size_changed;
 844         enum determine_dev_size rv = DS_UNCHANGED;
 845
 846         /* race:
 847          * application request passes inc_ap_bio,
 848          * but then cannot get an AL-reference.
 849          * this function later may wait on ap_bio_cnt == 0. -> deadlock.
 850          *
 851          * to avoid that:
 852          * Suspend IO right here.
 853          * still lock the act_log to not trigger ASSERTs there.
 854          */
 855         drbd_suspend_io(device);
 856         buffer = drbd_md_get_buffer(device); /* Lock meta-data IO */
 857         if (!buffer) {
 858                 drbd_resume_io(device);
 859                 return DS_ERROR;
 860         }
 861
 862         /* no wait necessary anymore, actually we could assert that */
 863         wait_event(device->al_wait, lc_try_lock(device->act_log));
 864
 865         prev_first_sect = drbd_md_first_sector(device->ldev);
 866         prev_size = device->ldev->md.md_size_sect;
 867         la_size_sect = device->ldev->md.la_size_sect;
 868
 869         if (rs) {
 870                 /* rs is non NULL if we should change the AL layout only */
 871
 872                 prev_al_stripes = md->al_stripes;
 873                 prev_al_stripe_size_4k = md->al_stripe_size_4k;
 874
 875                 md->al_stripes = rs->al_stripes;
 876                 md->al_stripe_size_4k = rs->al_stripe_size / 4;
 877                 md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4;
 878         }
 879
 880         drbd_md_set_sector_offsets(device, device->ldev);
 881
 882         rcu_read_lock();
 883         u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
 884         rcu_read_unlock();
 885         size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED);
 886
 887         if (size < la_size_sect) {
 888                 if (rs && u_size == 0) {
 889                         /* Remove "rs &&" later. This check should always be active, but
 890                            right now the receiver expects the permissive behavior */
 891                         dev_warn(DEV, "Implicit shrink not allowed. "
 892                                  "Use --size=%llus for explicit shrink.\n",
 893                                  (unsigned long long)size);
 894                         rv = DS_ERROR_SHRINK;
 895                 }
 896                 if (u_size > size)
 897                         rv = DS_ERROR_SPACE_MD;
 898                 if (rv != DS_UNCHANGED)
 899                         goto err_out;
 900         }
 901
 902         if (drbd_get_capacity(device->this_bdev) != size ||
 903             drbd_bm_capacity(device) != size) {
 904                 int err;
 905                 err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC));
 906                 if (unlikely(err)) {
 907                         /* currently there is only one error: ENOMEM! */
 908                         size = drbd_bm_capacity(device)>>1;
 909                         if (size == 0) {
 910                                 dev_err(DEV, "OUT OF MEMORY! "
 911                                     "Could not allocate bitmap!\n");
 912                         } else {
 913                                 dev_err(DEV, "BM resizing failed. "
 914                                     "Leaving size unchanged at size = %lu KB\n",
 915                                     (unsigned long)size);
 916                         }
 917                         rv = DS_ERROR;
 918                 }
 919                 /* racy, see comments above. */
 920                 drbd_set_my_capacity(device, size);
 921                 device->ldev->md.la_size_sect = size;
 922                 dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
 923                      (unsigned long long)size>>1);
 924         }
 925         if (rv <= DS_ERROR)
 926                 goto err_out;
 927
 928         la_size_changed = (la_size_sect != device->ldev->md.la_size_sect);
 929
 930         md_moved = prev_first_sect != drbd_md_first_sector(device->ldev)
 931                 || prev_size       != device->ldev->md.md_size_sect;
 932
 933         if (la_size_changed || md_moved || rs) {
 934                 u32 prev_flags;
 935
 936                 drbd_al_shrink(device); /* All extents inactive. */
 937
 938                 prev_flags = md->flags;
 939                 md->flags &= ~MDF_PRIMARY_IND;
 940                 drbd_md_write(device, buffer);
 941
 942                 dev_info(DEV, "Writing the whole bitmap, %s\n",
 943                          la_size_changed && md_moved ? "size changed and md moved" :
 944                          la_size_changed ? "size changed" : "md moved");
 945                 /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
 946                 drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
 947                                "size changed", BM_LOCKED_MASK);
 948                 drbd_initialize_al(device, buffer);
 949
 950                 md->flags = prev_flags;
 951                 drbd_md_write(device, buffer);
 952
 953                 if (rs)
 954                         dev_info(DEV, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n",
 955                                  md->al_stripes, md->al_stripe_size_4k * 4);
 956         }
 957
 958         if (size > la_size_sect)
 959                 rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO;
 960         if (size < la_size_sect)
 961                 rv = DS_SHRUNK;
 962
 963         if (0) {
 964         err_out:
 965                 if (rs) {
 966                         md->al_stripes = prev_al_stripes;
 967                         md->al_stripe_size_4k = prev_al_stripe_size_4k;
 968                         md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k;
 969
 970                         drbd_md_set_sector_offsets(device, device->ldev);
 971                 }
 972         }
 973         lc_unlock(device->act_log);
 974         wake_up(&device->al_wait);
 975         drbd_md_put_buffer(device);
 976         drbd_resume_io(device);
 977
 978         return rv;
 979 }
 980
 981 sector_t
 982 drbd_new_dev_size(struct drbd_device *device, struct drbd_backing_dev *bdev,
 983                   sector_t u_size, int assume_peer_has_space)
 984 {
 985         sector_t p_size = device->p_size;   /* partner's disk size. */
 986         sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */
 987         sector_t m_size; /* my size */
 988         sector_t size = 0;
 989
 990         m_size = drbd_get_max_capacity(bdev);
 991
 992         if (device->state.conn < C_CONNECTED && assume_peer_has_space) {
 993                 dev_warn(DEV, "Resize while not connected was forced by the user!\n");
 994                 p_size = m_size;
 995         }
 996
 997         if (p_size && m_size) {
 998                 size = min_t(sector_t, p_size, m_size);
 999         } else {
1000                 if (la_size_sect) {
1001                         size = la_size_sect;
1002                         if (m_size && m_size < size)
1003                                 size = m_size;
1004                         if (p_size && p_size < size)
1005                                 size = p_size;
1006                 } else {
1007                         if (m_size)
1008                                 size = m_size;
1009                         if (p_size)
1010                                 size = p_size;
1011                 }
1012         }
1013
1014         if (size == 0)
1015                 dev_err(DEV, "Both nodes diskless!\n");
1016
1017         if (u_size) {
1018                 if (u_size > size)
1019                         dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n",
1020                             (unsigned long)u_size>>1, (unsigned long)size>>1);
1021                 else
1022                         size = u_size;
1023         }
1024
1025         return size;
1026 }
1027
1028 /**
1029  * drbd_check_al_size() - Ensures that the AL is of the right size
1030  * @device:     DRBD device.
1031  *
1032  * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
1033  * failed, and 0 on success. You should call drbd_md_sync() after you called
1034  * this function.
1035  */
1036 static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
1037 {
1038         struct lru_cache *n, *t;
1039         struct lc_element *e;
1040         unsigned int in_use;
1041         int i;
1042
1043         if (device->act_log &&
1044             device->act_log->nr_elements == dc->al_extents)
1045                 return 0;
1046
1047         in_use = 0;
1048         t = device->act_log;
1049         n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION,
1050                 dc->al_extents, sizeof(struct lc_element), 0);
1051
1052         if (n == NULL) {
1053                 dev_err(DEV, "Cannot allocate act_log lru!\n");
1054                 return -ENOMEM;
1055         }
1056         spin_lock_irq(&device->al_lock);
1057         if (t) {
1058                 for (i = 0; i < t->nr_elements; i++) {
1059                         e = lc_element_by_index(t, i);
1060                         if (e->refcnt)
1061                                 dev_err(DEV, "refcnt(%d)==%d\n",
1062                                     e->lc_number, e->refcnt);
1063                         in_use += e->refcnt;
1064                 }
1065         }
1066         if (!in_use)
1067                 device->act_log = n;
1068         spin_unlock_irq(&device->al_lock);
1069         if (in_use) {
1070                 dev_err(DEV, "Activity log still in use!\n");
1071                 lc_destroy(n);
1072                 return -EBUSY;
1073         } else {
1074                 if (t)
1075                         lc_destroy(t);
1076         }
1077         drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elemens */
1078         return 0;
1079 }
1080
1081 static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_bio_size)
1082 {
1083         struct request_queue * const q = device->rq_queue;
1084         unsigned int max_hw_sectors = max_bio_size >> 9;
1085         unsigned int max_segments = 0;
1086
1087         if (get_ldev_if_state(device, D_ATTACHING)) {
1088                 struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue;
1089
1090                 max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
1091                 rcu_read_lock();
1092                 max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs;
1093                 rcu_read_unlock();
1094                 put_ldev(device);
1095         }
1096
1097         blk_queue_logical_block_size(q, 512);
1098         blk_queue_max_hw_sectors(q, max_hw_sectors);
1099         /* This is the workaround for "bio would need to, but cannot, be split" */
1100         blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
1101         blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1);
1102
1103         if (get_ldev_if_state(device, D_ATTACHING)) {
1104                 struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue;
1105
1106                 blk_queue_stack_limits(q, b);
1107
1108                 if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
1109                         dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
1110                                  q->backing_dev_info.ra_pages,
1111                                  b->backing_dev_info.ra_pages);
1112                         q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
1113                 }
1114                 put_ldev(device);
1115         }
1116 }
1117
1118 void drbd_reconsider_max_bio_size(struct drbd_device *device)
1119 {
1120         unsigned int now, new, local, peer;
1121
1122         now = queue_max_hw_sectors(device->rq_queue) << 9;
1123         local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */
1124         peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */
1125
1126         if (get_ldev_if_state(device, D_ATTACHING)) {
1127                 local = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9;
1128                 device->local_max_bio_size = local;
1129                 put_ldev(device);
1130         }
1131         local = min(local, DRBD_MAX_BIO_SIZE);
1132
1133         /* We may ignore peer limits if the peer is modern enough.
1134            Because new from 8.3.8 onwards the peer can use multiple
1135            BIOs for a single peer_request */
1136         if (device->state.conn >= C_WF_REPORT_PARAMS) {
1137                 if (first_peer_device(device)->connection->agreed_pro_version < 94)
1138                         peer = min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
1139                         /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
1140                 else if (first_peer_device(device)->connection->agreed_pro_version == 94)
1141                         peer = DRBD_MAX_SIZE_H80_PACKET;
1142                 else if (first_peer_device(device)->connection->agreed_pro_version < 100)
1143                         peer = DRBD_MAX_BIO_SIZE_P95;  /* drbd 8.3.8 onwards, before 8.4.0 */
1144                 else
1145                         peer = DRBD_MAX_BIO_SIZE;
1146         }
1147
1148         new = min(local, peer);
1149
1150         if (device->state.role == R_PRIMARY && new < now)
1151                 dev_err(DEV, "ASSERT FAILED new < now; (%u < %u)\n", new, now);
1152
1153         if (new != now)
1154                 dev_info(DEV, "max BIO size = %u\n", new);
1155
1156         drbd_setup_queue_param(device, new);
1157 }
1158
1159 /* Starts the worker thread */
1160 static void conn_reconfig_start(struct drbd_connection *connection)
1161 {
1162         drbd_thread_start(&connection->worker);
1163         conn_flush_workqueue(connection);
1164 }
1165
1166 /* if still unconfigured, stops worker again. */
1167 static void conn_reconfig_done(struct drbd_connection *connection)
1168 {
1169         bool stop_threads;
1170         spin_lock_irq(&connection->req_lock);
1171         stop_threads = conn_all_vols_unconf(connection) &&
1172                 connection->cstate == C_STANDALONE;
1173         spin_unlock_irq(&connection->req_lock);
1174         if (stop_threads) {
1175                 /* asender is implicitly stopped by receiver
1176                  * in conn_disconnect() */
1177                 drbd_thread_stop(&connection->receiver);
1178                 drbd_thread_stop(&connection->worker);
1179         }
1180 }
1181
1182 /* Make sure IO is suspended before calling this function(). */
1183 static void drbd_suspend_al(struct drbd_device *device)
1184 {
1185         int s = 0;
1186
1187         if (!lc_try_lock(device->act_log)) {
1188                 dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n");
1189                 return;
1190         }
1191
1192         drbd_al_shrink(device);
1193         spin_lock_irq(&first_peer_device(device)->connection->req_lock);
1194         if (device->state.conn < C_CONNECTED)
1195                 s = !test_and_set_bit(AL_SUSPENDED, &device->flags);
1196         spin_unlock_irq(&first_peer_device(device)->connection->req_lock);
1197         lc_unlock(device->act_log);
1198
1199         if (s)
1200                 dev_info(DEV, "Suspended AL updates\n");
1201 }
1202
1203
1204 static bool should_set_defaults(struct genl_info *info)
1205 {
1206         unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags;
1207         return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS);
1208 }
1209
1210 static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
1211 {
1212         /* This is limited by 16 bit "slot" numbers,
1213          * and by available on-disk context storage.
1214          *
1215          * Also (u16)~0 is special (denotes a "free" extent).
1216          *
1217          * One transaction occupies one 4kB on-disk block,
1218          * we have n such blocks in the on disk ring buffer,
1219          * the "current" transaction may fail (n-1),
1220          * and there is 919 slot numbers context information per transaction.
1221          *
1222          * 72 transaction blocks amounts to more than 2**16 context slots,
1223          * so cap there first.
1224          */
1225         const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX;
1226         const unsigned int sufficient_on_disk =
1227                 (max_al_nr + AL_CONTEXT_PER_TRANSACTION -1)
1228                 /AL_CONTEXT_PER_TRANSACTION;
1229
1230         unsigned int al_size_4k = bdev->md.al_size_4k;
1231
1232         if (al_size_4k > sufficient_on_disk)
1233                 return max_al_nr;
1234
1235         return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
1236 }
1237
1238 int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
1239 {
1240         enum drbd_ret_code retcode;
1241         struct drbd_device *device;
1242         struct disk_conf *new_disk_conf, *old_disk_conf;
1243         struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
1244         int err, fifo_size;
1245
1246         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
1247         if (!adm_ctx.reply_skb)
1248                 return retcode;
1249         if (retcode != NO_ERROR)
1250                 goto out;
1251
1252         device = adm_ctx.device;
1253
1254         /* we also need a disk
1255          * to change the options on */
1256         if (!get_ldev(device)) {
1257                 retcode = ERR_NO_DISK;
1258                 goto out;
1259         }
1260
1261         new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
1262         if (!new_disk_conf) {
1263                 retcode = ERR_NOMEM;
1264                 goto fail;
1265         }
1266
1267         mutex_lock(&first_peer_device(device)->connection->conf_update);
1268         old_disk_conf = device->ldev->disk_conf;
1269         *new_disk_conf = *old_disk_conf;
1270         if (should_set_defaults(info))
1271                 set_disk_conf_defaults(new_disk_conf);
1272
1273         err = disk_conf_from_attrs_for_change(new_disk_conf, info);
1274         if (err && err != -ENOMSG) {
1275                 retcode = ERR_MANDATORY_TAG;
1276                 drbd_msg_put_info(from_attrs_err_to_txt(err));
1277                 goto fail_unlock;
1278         }
1279
1280         if (!expect(new_disk_conf->resync_rate >= 1))
1281                 new_disk_conf->resync_rate = 1;
1282
1283         if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
1284                 new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
1285         if (new_disk_conf->al_extents > drbd_al_extents_max(device->ldev))
1286                 new_disk_conf->al_extents = drbd_al_extents_max(device->ldev);
1287
1288         if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
1289                 new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
1290
1291         fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
1292         if (fifo_size != device->rs_plan_s->size) {
1293                 new_plan = fifo_alloc(fifo_size);
1294                 if (!new_plan) {
1295                         dev_err(DEV, "kmalloc of fifo_buffer failed");
1296                         retcode = ERR_NOMEM;
1297                         goto fail_unlock;
1298                 }
1299         }
1300
1301         drbd_suspend_io(device);
1302         wait_event(device->al_wait, lc_try_lock(device->act_log));
1303         drbd_al_shrink(device);
1304         err = drbd_check_al_size(device, new_disk_conf);
1305         lc_unlock(device->act_log);
1306         wake_up(&device->al_wait);
1307         drbd_resume_io(device);
1308
1309         if (err) {
1310                 retcode = ERR_NOMEM;
1311                 goto fail_unlock;
1312         }
1313
1314         write_lock_irq(&global_state_lock);
1315         retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
1316         if (retcode == NO_ERROR) {
1317                 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
1318                 drbd_resync_after_changed(device);
1319         }
1320         write_unlock_irq(&global_state_lock);
1321
1322         if (retcode != NO_ERROR)
1323                 goto fail_unlock;
1324
1325         if (new_plan) {
1326                 old_plan = device->rs_plan_s;
1327                 rcu_assign_pointer(device->rs_plan_s, new_plan);
1328         }
1329
1330         mutex_unlock(&first_peer_device(device)->connection->conf_update);
1331
1332         if (new_disk_conf->al_updates)
1333                 device->ldev->md.flags &= ~MDF_AL_DISABLED;
1334         else
1335                 device->ldev->md.flags |= MDF_AL_DISABLED;
1336
1337         if (new_disk_conf->md_flushes)
1338                 clear_bit(MD_NO_FUA, &device->flags);
1339         else
1340                 set_bit(MD_NO_FUA, &device->flags);
1341
1342         drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush);
1343
1344         drbd_md_sync(device);
1345
1346         if (device->state.conn >= C_CONNECTED)
1347                 drbd_send_sync_param(device);
1348
1349         synchronize_rcu();
1350         kfree(old_disk_conf);
1351         kfree(old_plan);
1352         mod_timer(&device->request_timer, jiffies + HZ);
1353         goto success;
1354
1355 fail_unlock:
1356         mutex_unlock(&first_peer_device(device)->connection->conf_update);
1357  fail:
1358         kfree(new_disk_conf);
1359         kfree(new_plan);
1360 success:
1361         put_ldev(device);
1362  out:
1363         drbd_adm_finish(info, retcode);
1364         return 0;
1365 }
1366
1367 int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1368 {
1369         struct drbd_device *device;
1370         int err;
1371         enum drbd_ret_code retcode;
1372         enum determine_dev_size dd;
1373         sector_t max_possible_sectors;
1374         sector_t min_md_device_sectors;
1375         struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
1376         struct disk_conf *new_disk_conf = NULL;
1377         struct block_device *bdev;
1378         struct lru_cache *resync_lru = NULL;
1379         struct fifo_buffer *new_plan = NULL;
1380         union drbd_state ns, os;
1381         enum drbd_state_rv rv;
1382         struct net_conf *nc;
1383
1384         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
1385         if (!adm_ctx.reply_skb)
1386                 return retcode;
1387         if (retcode != NO_ERROR)
1388                 goto finish;
1389
1390         device = adm_ctx.device;
1391         conn_reconfig_start(first_peer_device(device)->connection);
1392
1393         /* if you want to reconfigure, please tear down first */
1394         if (device->state.disk > D_DISKLESS) {
1395                 retcode = ERR_DISK_CONFIGURED;
1396                 goto fail;
1397         }
1398         /* It may just now have detached because of IO error.  Make sure
1399          * drbd_ldev_destroy is done already, we may end up here very fast,
1400          * e.g. if someone calls attach from the on-io-error handler,
1401          * to realize a "hot spare" feature (not that I'd recommend that) */
1402         wait_event(device->misc_wait, !atomic_read(&device->local_cnt));
1403
1404         /* make sure there is no leftover from previous force-detach attempts */
1405         clear_bit(FORCE_DETACH, &device->flags);
1406         clear_bit(WAS_IO_ERROR, &device->flags);
1407         clear_bit(WAS_READ_ERROR, &device->flags);
1408
1409         /* and no leftover from previously aborted resync or verify, either */
1410         device->rs_total = 0;
1411         device->rs_failed = 0;
1412         atomic_set(&device->rs_pending_cnt, 0);
1413
1414         /* allocation not in the IO path, drbdsetup context */
1415         nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
1416         if (!nbc) {
1417                 retcode = ERR_NOMEM;
1418                 goto fail;
1419         }
1420         spin_lock_init(&nbc->md.uuid_lock);
1421
1422         new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
1423         if (!new_disk_conf) {
1424                 retcode = ERR_NOMEM;
1425                 goto fail;
1426         }
1427         nbc->disk_conf = new_disk_conf;
1428
1429         set_disk_conf_defaults(new_disk_conf);
1430         err = disk_conf_from_attrs(new_disk_conf, info);
1431         if (err) {
1432                 retcode = ERR_MANDATORY_TAG;
1433                 drbd_msg_put_info(from_attrs_err_to_txt(err));
1434                 goto fail;
1435         }
1436
1437         if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
1438                 new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
1439
1440         new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
1441         if (!new_plan) {
1442                 retcode = ERR_NOMEM;
1443                 goto fail;
1444         }
1445
1446         if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
1447                 retcode = ERR_MD_IDX_INVALID;
1448                 goto fail;
1449         }
1450
1451         write_lock_irq(&global_state_lock);
1452         retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
1453         write_unlock_irq(&global_state_lock);
1454         if (retcode != NO_ERROR)
1455                 goto fail;
1456
1457         rcu_read_lock();
1458         nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
1459         if (nc) {
1460                 if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
1461                         rcu_read_unlock();
1462                         retcode = ERR_STONITH_AND_PROT_A;
1463                         goto fail;
1464                 }
1465         }
1466         rcu_read_unlock();
1467
1468         bdev = blkdev_get_by_path(new_disk_conf->backing_dev,
1469                                   FMODE_READ | FMODE_WRITE | FMODE_EXCL, device);
1470         if (IS_ERR(bdev)) {
1471                 dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev,
1472                         PTR_ERR(bdev));
1473                 retcode = ERR_OPEN_DISK;
1474                 goto fail;
1475         }
1476         nbc->backing_bdev = bdev;
1477
1478         /*
1479          * meta_dev_idx >= 0: external fixed size, possibly multiple
1480          * drbd sharing one meta device.  TODO in that case, paranoia
1481          * check that [md_bdev, meta_dev_idx] is not yet used by some
1482          * other drbd minor!  (if you use drbd.conf + drbdadm, that
1483          * should check it for you already; but if you don't, or
1484          * someone fooled it, we need to double check here)
1485          */
1486         bdev = blkdev_get_by_path(new_disk_conf->meta_dev,
1487                                   FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1488                                   (new_disk_conf->meta_dev_idx < 0) ?
1489                                   (void *)device : (void *)drbd_m_holder);
1490         if (IS_ERR(bdev)) {
1491                 dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev,
1492                         PTR_ERR(bdev));
1493                 retcode = ERR_OPEN_MD_DISK;
1494                 goto fail;
1495         }
1496         nbc->md_bdev = bdev;
1497
1498         if ((nbc->backing_bdev == nbc->md_bdev) !=
1499             (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
1500              new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
1501                 retcode = ERR_MD_IDX_INVALID;
1502                 goto fail;
1503         }
1504
1505         resync_lru = lc_create("resync", drbd_bm_ext_cache,
1506                         1, 61, sizeof(struct bm_extent),
1507                         offsetof(struct bm_extent, lce));
1508         if (!resync_lru) {
1509                 retcode = ERR_NOMEM;
1510                 goto fail;
1511         }
1512
1513         /* Read our meta data super block early.
1514          * This also sets other on-disk offsets. */
1515         retcode = drbd_md_read(device, nbc);
1516         if (retcode != NO_ERROR)
1517                 goto fail;
1518
1519         if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
1520                 new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
1521         if (new_disk_conf->al_extents > drbd_al_extents_max(nbc))
1522                 new_disk_conf->al_extents = drbd_al_extents_max(nbc);
1523
1524         if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
1525                 dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
1526                         (unsigned long long) drbd_get_max_capacity(nbc),
1527                         (unsigned long long) new_disk_conf->disk_size);
1528                 retcode = ERR_DISK_TOO_SMALL;
1529                 goto fail;
1530         }
1531
1532         if (new_disk_conf->meta_dev_idx < 0) {
1533                 max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
1534                 /* at least one MB, otherwise it does not make sense */
1535                 min_md_device_sectors = (2<<10);
1536         } else {
1537                 max_possible_sectors = DRBD_MAX_SECTORS;
1538                 min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1);
1539         }
1540
1541         if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
1542                 retcode = ERR_MD_DISK_TOO_SMALL;
1543                 dev_warn(DEV, "refusing attach: md-device too small, "
1544                      "at least %llu sectors needed for this meta-disk type\n",
1545                      (unsigned long long) min_md_device_sectors);
1546                 goto fail;
1547         }
1548
1549         /* Make sure the new disk is big enough
1550          * (we may currently be R_PRIMARY with no local disk...) */
1551         if (drbd_get_max_capacity(nbc) <
1552             drbd_get_capacity(device->this_bdev)) {
1553                 retcode = ERR_DISK_TOO_SMALL;
1554                 goto fail;
1555         }
1556
1557         nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
1558
1559         if (nbc->known_size > max_possible_sectors) {
1560                 dev_warn(DEV, "==> truncating very big lower level device "
1561                         "to currently maximum possible %llu sectors <==\n",
1562                         (unsigned long long) max_possible_sectors);
1563                 if (new_disk_conf->meta_dev_idx >= 0)
1564                         dev_warn(DEV, "==>> using internal or flexible "
1565                                       "meta data may help <<==\n");
1566         }
1567
1568         drbd_suspend_io(device);
1569         /* also wait for the last barrier ack. */
1570         /* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171
1571          * We need a way to either ignore barrier acks for barriers sent before a device
1572          * was attached, or a way to wait for all pending barrier acks to come in.
1573          * As barriers are counted per resource,
1574          * we'd need to suspend io on all devices of a resource.
1575          */
1576         wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device));
1577         /* and for any other previously queued work */
1578         drbd_flush_workqueue(device);
1579
1580         rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE);
1581         retcode = rv;  /* FIXME: Type mismatch. */
1582         drbd_resume_io(device);
1583         if (rv < SS_SUCCESS)
1584                 goto fail;
1585
1586         if (!get_ldev_if_state(device, D_ATTACHING))
1587                 goto force_diskless;
1588
1589         if (!device->bitmap) {
1590                 if (drbd_bm_init(device)) {
1591                         retcode = ERR_NOMEM;
1592                         goto force_diskless_dec;
1593                 }
1594         }
1595
1596         if (device->state.conn < C_CONNECTED &&
1597             device->state.role == R_PRIMARY &&
1598             (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
1599                 dev_err(DEV, "Can only attach to data with current UUID=%016llX\n",
1600                     (unsigned long long)device->ed_uuid);
1601                 retcode = ERR_DATA_NOT_CURRENT;
1602                 goto force_diskless_dec;
1603         }
1604
1605         /* Since we are diskless, fix the activity log first... */
1606         if (drbd_check_al_size(device, new_disk_conf)) {
1607                 retcode = ERR_NOMEM;
1608                 goto force_diskless_dec;
1609         }
1610
1611         /* Prevent shrinking of consistent devices ! */
1612         if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
1613             drbd_new_dev_size(device, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) {
1614                 dev_warn(DEV, "refusing to truncate a consistent device\n");
1615                 retcode = ERR_DISK_TOO_SMALL;
1616                 goto force_diskless_dec;
1617         }
1618
1619         /* Reset the "barriers don't work" bits here, then force meta data to
1620          * be written, to ensure we determine if barriers are supported. */
1621         if (new_disk_conf->md_flushes)
1622                 clear_bit(MD_NO_FUA, &device->flags);
1623         else
1624                 set_bit(MD_NO_FUA, &device->flags);
1625
1626         /* Point of no return reached.
1627          * Devices and memory are no longer released by error cleanup below.
1628          * now device takes over responsibility, and the state engine should
1629          * clean it up somewhere.  */
1630         D_ASSERT(device->ldev == NULL);
1631         device->ldev = nbc;
1632         device->resync = resync_lru;
1633         device->rs_plan_s = new_plan;
1634         nbc = NULL;
1635         resync_lru = NULL;
1636         new_disk_conf = NULL;
1637         new_plan = NULL;
1638
1639         drbd_bump_write_ordering(first_peer_device(device)->connection, WO_bdev_flush);
1640
1641         if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
1642                 set_bit(CRASHED_PRIMARY, &device->flags);
1643         else
1644                 clear_bit(CRASHED_PRIMARY, &device->flags);
1645
1646         if (drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) &&
1647             !(device->state.role == R_PRIMARY &&
1648               first_peer_device(device)->connection->susp_nod))
1649                 set_bit(CRASHED_PRIMARY, &device->flags);
1650
1651         device->send_cnt = 0;
1652         device->recv_cnt = 0;
1653         device->read_cnt = 0;
1654         device->writ_cnt = 0;
1655
1656         drbd_reconsider_max_bio_size(device);
1657
1658         /* If I am currently not R_PRIMARY,
1659          * but meta data primary indicator is set,
1660          * I just now recover from a hard crash,
1661          * and have been R_PRIMARY before that crash.
1662          *
1663          * Now, if I had no connection before that crash
1664          * (have been degraded R_PRIMARY), chances are that
1665          * I won't find my peer now either.
1666          *
1667          * In that case, and _only_ in that case,
1668          * we use the degr-wfc-timeout instead of the default,
1669          * so we can automatically recover from a crash of a
1670          * degraded but active "cluster" after a certain timeout.
1671          */
1672         clear_bit(USE_DEGR_WFC_T, &device->flags);
1673         if (device->state.role != R_PRIMARY &&
1674              drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) &&
1675             !drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND))
1676                 set_bit(USE_DEGR_WFC_T, &device->flags);
1677
1678         dd = drbd_determine_dev_size(device, 0, NULL);
1679         if (dd <= DS_ERROR) {
1680                 retcode = ERR_NOMEM_BITMAP;
1681                 goto force_diskless_dec;
1682         } else if (dd == DS_GREW)
1683                 set_bit(RESYNC_AFTER_NEG, &device->flags);
1684
1685         if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ||
1686             (test_bit(CRASHED_PRIMARY, &device->flags) &&
1687              drbd_md_test_flag(device->ldev, MDF_AL_DISABLED))) {
1688                 dev_info(DEV, "Assuming that all blocks are out of sync "
1689                      "(aka FullSync)\n");
1690                 if (drbd_bitmap_io(device, &drbd_bmio_set_n_write,
1691                         "set_n_write from attaching", BM_LOCKED_MASK)) {
1692                         retcode = ERR_IO_MD_DISK;
1693                         goto force_diskless_dec;
1694                 }
1695         } else {
1696                 if (drbd_bitmap_io(device, &drbd_bm_read,
1697                         "read from attaching", BM_LOCKED_MASK)) {
1698                         retcode = ERR_IO_MD_DISK;
1699                         goto force_diskless_dec;
1700                 }
1701         }
1702
1703         if (_drbd_bm_total_weight(device) == drbd_bm_bits(device))
1704                 drbd_suspend_al(device); /* IO is still suspended here... */
1705
1706         spin_lock_irq(&first_peer_device(device)->connection->req_lock);
1707         os = drbd_read_state(device);
1708         ns = os;
1709         /* If MDF_CONSISTENT is not set go into inconsistent state,
1710            otherwise investigate MDF_WasUpToDate...
1711            If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
1712            otherwise into D_CONSISTENT state.
1713         */
1714         if (drbd_md_test_flag(device->ldev, MDF_CONSISTENT)) {
1715                 if (drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE))
1716                         ns.disk = D_CONSISTENT;
1717                 else
1718                         ns.disk = D_OUTDATED;
1719         } else {
1720                 ns.disk = D_INCONSISTENT;
1721         }
1722
1723         if (drbd_md_test_flag(device->ldev, MDF_PEER_OUT_DATED))
1724                 ns.pdsk = D_OUTDATED;
1725
1726         rcu_read_lock();
1727         if (ns.disk == D_CONSISTENT &&
1728             (ns.pdsk == D_OUTDATED || rcu_dereference(device->ldev->disk_conf)->fencing == FP_DONT_CARE))
1729                 ns.disk = D_UP_TO_DATE;
1730
1731         /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
1732            MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
1733            this point, because drbd_request_state() modifies these
1734            flags. */
1735
1736         if (rcu_dereference(device->ldev->disk_conf)->al_updates)
1737                 device->ldev->md.flags &= ~MDF_AL_DISABLED;
1738         else
1739                 device->ldev->md.flags |= MDF_AL_DISABLED;
1740
1741         rcu_read_unlock();
1742
1743         /* In case we are C_CONNECTED postpone any decision on the new disk
1744            state after the negotiation phase. */
1745         if (device->state.conn == C_CONNECTED) {
1746                 device->new_state_tmp.i = ns.i;
1747                 ns.i = os.i;
1748                 ns.disk = D_NEGOTIATING;
1749
1750                 /* We expect to receive up-to-date UUIDs soon.
1751                    To avoid a race in receive_state, free p_uuid while
1752                    holding req_lock. I.e. atomic with the state change */
1753                 kfree(device->p_uuid);
1754                 device->p_uuid = NULL;
1755         }
1756
1757         rv = _drbd_set_state(device, ns, CS_VERBOSE, NULL);
1758         spin_unlock_irq(&first_peer_device(device)->connection->req_lock);
1759
1760         if (rv < SS_SUCCESS)
1761                 goto force_diskless_dec;
1762
1763         mod_timer(&device->request_timer, jiffies + HZ);
1764
1765         if (device->state.role == R_PRIMARY)
1766                 device->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
1767         else
1768                 device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
1769
1770         drbd_md_mark_dirty(device);
1771         drbd_md_sync(device);
1772
1773         kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
1774         put_ldev(device);
1775         conn_reconfig_done(first_peer_device(device)->connection);
1776         drbd_adm_finish(info, retcode);
1777         return 0;
1778
1779  force_diskless_dec:
1780         put_ldev(device);
1781  force_diskless:
1782         drbd_force_state(device, NS(disk, D_DISKLESS));
1783         drbd_md_sync(device);
1784  fail:
1785         conn_reconfig_done(first_peer_device(device)->connection);
1786         if (nbc) {
1787                 if (nbc->backing_bdev)
1788                         blkdev_put(nbc->backing_bdev,
1789                                    FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1790                 if (nbc->md_bdev)
1791                         blkdev_put(nbc->md_bdev,
1792                                    FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1793                 kfree(nbc);
1794         }
1795         kfree(new_disk_conf);
1796         lc_destroy(resync_lru);
1797         kfree(new_plan);
1798
1799  finish:
1800         drbd_adm_finish(info, retcode);
1801         return 0;
1802 }
1803
1804 static int adm_detach(struct drbd_device *device, int force)
1805 {
1806         enum drbd_state_rv retcode;
1807         int ret;
1808
1809         if (force) {
1810                 set_bit(FORCE_DETACH, &device->flags);
1811                 drbd_force_state(device, NS(disk, D_FAILED));
1812                 retcode = SS_SUCCESS;
1813                 goto out;
1814         }
1815
1816         drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */
1817         drbd_md_get_buffer(device); /* make sure there is no in-flight meta-data IO */
1818         retcode = drbd_request_state(device, NS(disk, D_FAILED));
1819         drbd_md_put_buffer(device);
1820         /* D_FAILED will transition to DISKLESS. */
1821         ret = wait_event_interruptible(device->misc_wait,
1822                         device->state.disk != D_FAILED);
1823         drbd_resume_io(device);
1824         if ((int)retcode == (int)SS_IS_DISKLESS)
1825                 retcode = SS_NOTHING_TO_DO;
1826         if (ret)
1827                 retcode = ERR_INTR;
1828 out:
1829         return retcode;
1830 }
1831
1832 /* Detaching the disk is a process in multiple stages.  First we need to lock
1833  * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
1834  * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
1835  * internal references as well.
1836  * Only then we have finally detached. */
1837 int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info)
1838 {
1839         enum drbd_ret_code retcode;
1840         struct detach_parms parms = { };
1841         int err;
1842
1843         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
1844         if (!adm_ctx.reply_skb)
1845                 return retcode;
1846         if (retcode != NO_ERROR)
1847                 goto out;
1848
1849         if (info->attrs[DRBD_NLA_DETACH_PARMS]) {
1850                 err = detach_parms_from_attrs(&parms, info);
1851                 if (err) {
1852                         retcode = ERR_MANDATORY_TAG;
1853                         drbd_msg_put_info(from_attrs_err_to_txt(err));
1854                         goto out;
1855                 }
1856         }
1857
1858         retcode = adm_detach(adm_ctx.device, parms.force_detach);
1859 out:
1860         drbd_adm_finish(info, retcode);
1861         return 0;
1862 }
1863
1864 static bool conn_resync_running(struct drbd_connection *connection)
1865 {
1866         struct drbd_device *device;
1867         bool rv = false;
1868         int vnr;
1869
1870         rcu_read_lock();
1871         idr_for_each_entry(&connection->volumes, device, vnr) {
1872                 if (device->state.conn == C_SYNC_SOURCE ||
1873                     device->state.conn == C_SYNC_TARGET ||
1874                     device->state.conn == C_PAUSED_SYNC_S ||
1875                     device->state.conn == C_PAUSED_SYNC_T) {
1876                         rv = true;
1877                         break;
1878                 }
1879         }
1880         rcu_read_unlock();
1881
1882         return rv;
1883 }
1884
1885 static bool conn_ov_running(struct drbd_connection *connection)
1886 {
1887         struct drbd_device *device;
1888         bool rv = false;
1889         int vnr;
1890
1891         rcu_read_lock();
1892         idr_for_each_entry(&connection->volumes, device, vnr) {
1893                 if (device->state.conn == C_VERIFY_S ||
1894                     device->state.conn == C_VERIFY_T) {
1895                         rv = true;
1896                         break;
1897                 }
1898         }
1899         rcu_read_unlock();
1900
1901         return rv;
1902 }
1903
1904 static enum drbd_ret_code
1905 _check_net_options(struct drbd_connection *connection, struct net_conf *old_conf, struct net_conf *new_conf)
1906 {
1907         struct drbd_device *device;
1908         int i;
1909
1910         if (old_conf && connection->cstate == C_WF_REPORT_PARAMS && connection->agreed_pro_version < 100) {
1911                 if (new_conf->wire_protocol != old_conf->wire_protocol)
1912                         return ERR_NEED_APV_100;
1913
1914                 if (new_conf->two_primaries != old_conf->two_primaries)
1915                         return ERR_NEED_APV_100;
1916
1917                 if (strcmp(new_conf->integrity_alg, old_conf->integrity_alg))
1918                         return ERR_NEED_APV_100;
1919         }
1920
1921         if (!new_conf->two_primaries &&
1922             conn_highest_role(connection) == R_PRIMARY &&
1923             conn_highest_peer(connection) == R_PRIMARY)
1924                 return ERR_NEED_ALLOW_TWO_PRI;
1925
1926         if (new_conf->two_primaries &&
1927             (new_conf->wire_protocol != DRBD_PROT_C))
1928                 return ERR_NOT_PROTO_C;
1929
1930         idr_for_each_entry(&connection->volumes, device, i) {
1931                 if (get_ldev(device)) {
1932                         enum drbd_fencing_p fp = rcu_dereference(device->ldev->disk_conf)->fencing;
1933                         put_ldev(device);
1934                         if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH)
1935                                 return ERR_STONITH_AND_PROT_A;
1936                 }
1937                 if (device->state.role == R_PRIMARY && new_conf->discard_my_data)
1938                         return ERR_DISCARD_IMPOSSIBLE;
1939         }
1940
1941         if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A)
1942                 return ERR_CONG_NOT_PROTO_A;
1943
1944         return NO_ERROR;
1945 }
1946
1947 static enum drbd_ret_code
1948 check_net_options(struct drbd_connection *connection, struct net_conf *new_conf)
1949 {
1950         static enum drbd_ret_code rv;
1951         struct drbd_device *device;
1952         int i;
1953
1954         rcu_read_lock();
1955         rv = _check_net_options(connection, rcu_dereference(connection->net_conf), new_conf);
1956         rcu_read_unlock();
1957
1958         /* connection->volumes protected by genl_lock() here */
1959         idr_for_each_entry(&connection->volumes, device, i) {
1960                 if (!device->bitmap) {
1961                         if (drbd_bm_init(device))
1962                                 return ERR_NOMEM;
1963                 }
1964         }
1965
1966         return rv;
1967 }
1968
/*
 * Set of hash transforms belonging to one net_conf.  Filled in by
 * alloc_crypto() and released as a whole by free_crypto(); callers in this
 * file start from a zero-initialized instance, so unused slots are NULL.
 */
struct crypto {
        struct crypto_hash *verify_tfm;    /* allocated from net_conf->verify_alg */
        struct crypto_hash *csums_tfm;     /* allocated from net_conf->csums_alg */
        struct crypto_hash *cram_hmac_tfm; /* allocated as "hmac(<cram_hmac_alg>)" */
        struct crypto_hash *integrity_tfm; /* allocated from net_conf->integrity_alg */
};
1975
1976 static int
1977 alloc_hash(struct crypto_hash **tfm, char *tfm_name, int err_alg)
1978 {
1979         if (!tfm_name[0])
1980                 return NO_ERROR;
1981
1982         *tfm = crypto_alloc_hash(tfm_name, 0, CRYPTO_ALG_ASYNC);
1983         if (IS_ERR(*tfm)) {
1984                 *tfm = NULL;
1985                 return err_alg;
1986         }
1987
1988         return NO_ERROR;
1989 }
1990
1991 static enum drbd_ret_code
1992 alloc_crypto(struct crypto *crypto, struct net_conf *new_conf)
1993 {
1994         char hmac_name[CRYPTO_MAX_ALG_NAME];
1995         enum drbd_ret_code rv;
1996
1997         rv = alloc_hash(&crypto->csums_tfm, new_conf->csums_alg,
1998                        ERR_CSUMS_ALG);
1999         if (rv != NO_ERROR)
2000                 return rv;
2001         rv = alloc_hash(&crypto->verify_tfm, new_conf->verify_alg,
2002                        ERR_VERIFY_ALG);
2003         if (rv != NO_ERROR)
2004                 return rv;
2005         rv = alloc_hash(&crypto->integrity_tfm, new_conf->integrity_alg,
2006                        ERR_INTEGRITY_ALG);
2007         if (rv != NO_ERROR)
2008                 return rv;
2009         if (new_conf->cram_hmac_alg[0] != 0) {
2010                 snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
2011                          new_conf->cram_hmac_alg);
2012
2013                 rv = alloc_hash(&crypto->cram_hmac_tfm, hmac_name,
2014                                ERR_AUTH_ALG);
2015         }
2016
2017         return rv;
2018 }
2019
2020 static void free_crypto(struct crypto *crypto)
2021 {
2022         crypto_free_hash(crypto->cram_hmac_tfm);
2023         crypto_free_hash(crypto->integrity_tfm);
2024         crypto_free_hash(crypto->csums_tfm);
2025         crypto_free_hash(crypto->verify_tfm);
2026 }
2027
2028 int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
2029 {
2030         enum drbd_ret_code retcode;
2031         struct drbd_connection *connection;
2032         struct net_conf *old_conf, *new_conf = NULL;
2033         int err;
2034         int ovr; /* online verify running */
2035         int rsr; /* re-sync running */
2036         struct crypto crypto = { };
2037
2038         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION);
2039         if (!adm_ctx.reply_skb)
2040                 return retcode;
2041         if (retcode != NO_ERROR)
2042                 goto out;
2043
2044         connection = adm_ctx.connection;
2045
2046         new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
2047         if (!new_conf) {
2048                 retcode = ERR_NOMEM;
2049                 goto out;
2050         }
2051
2052         conn_reconfig_start(connection);
2053
2054         mutex_lock(&connection->data.mutex);
2055         mutex_lock(&connection->conf_update);
2056         old_conf = connection->net_conf;
2057
2058         if (!old_conf) {
2059                 drbd_msg_put_info("net conf missing, try connect");
2060                 retcode = ERR_INVALID_REQUEST;
2061                 goto fail;
2062         }
2063
2064         *new_conf = *old_conf;
2065         if (should_set_defaults(info))
2066                 set_net_conf_defaults(new_conf);
2067
2068         err = net_conf_from_attrs_for_change(new_conf, info);
2069         if (err && err != -ENOMSG) {
2070                 retcode = ERR_MANDATORY_TAG;
2071                 drbd_msg_put_info(from_attrs_err_to_txt(err));
2072                 goto fail;
2073         }
2074
2075         retcode = check_net_options(connection, new_conf);
2076         if (retcode != NO_ERROR)
2077                 goto fail;
2078
2079         /* re-sync running */
2080         rsr = conn_resync_running(connection);
2081         if (rsr && strcmp(new_conf->csums_alg, old_conf->csums_alg)) {
2082                 retcode = ERR_CSUMS_RESYNC_RUNNING;
2083                 goto fail;
2084         }
2085
2086         /* online verify running */
2087         ovr = conn_ov_running(connection);
2088         if (ovr && strcmp(new_conf->verify_alg, old_conf->verify_alg)) {
2089                 retcode = ERR_VERIFY_RUNNING;
2090                 goto fail;
2091         }
2092
2093         retcode = alloc_crypto(&crypto, new_conf);
2094         if (retcode != NO_ERROR)
2095                 goto fail;
2096
2097         rcu_assign_pointer(connection->net_conf, new_conf);
2098
2099         if (!rsr) {
2100                 crypto_free_hash(connection->csums_tfm);
2101                 connection->csums_tfm = crypto.csums_tfm;
2102                 crypto.csums_tfm = NULL;
2103         }
2104         if (!ovr) {
2105                 crypto_free_hash(connection->verify_tfm);
2106                 connection->verify_tfm = crypto.verify_tfm;
2107                 crypto.verify_tfm = NULL;
2108         }
2109
2110         crypto_free_hash(connection->integrity_tfm);
2111         connection->integrity_tfm = crypto.integrity_tfm;
2112         if (connection->cstate >= C_WF_REPORT_PARAMS && connection->agreed_pro_version >= 100)
2113                 /* Do this without trying to take connection->data.mutex again.  */
2114                 __drbd_send_protocol(connection, P_PROTOCOL_UPDATE);
2115
2116         crypto_free_hash(connection->cram_hmac_tfm);
2117         connection->cram_hmac_tfm = crypto.cram_hmac_tfm;
2118
2119         mutex_unlock(&connection->conf_update);
2120         mutex_unlock(&connection->data.mutex);
2121         synchronize_rcu();
2122         kfree(old_conf);
2123
2124         if (connection->cstate >= C_WF_REPORT_PARAMS)
2125                 drbd_send_sync_param(minor_to_device(conn_lowest_minor(connection)));
2126
2127         goto done;
2128
2129  fail:
2130         mutex_unlock(&connection->conf_update);
2131         mutex_unlock(&connection->data.mutex);
2132         free_crypto(&crypto);
2133         kfree(new_conf);
2134  done:
2135         conn_reconfig_done(connection);
2136  out:
2137         drbd_adm_finish(info, retcode);
2138         return 0;
2139 }
2140
/*
 * Netlink handler: configure the network side of a connection and request
 * the C_UNCONNECTED connection state.
 *
 * Both endpoint addresses must be present in the request and must not match
 * the local / peer address of any connection on drbd_connections.  The
 * connection must not be beyond C_STANDALONE and must not have a net_conf
 * yet.  A new net_conf is built from defaults plus the netlink attributes,
 * validated, and published together with freshly allocated hash transforms
 * and the endpoint addresses.
 *
 * Returns the prepare result if no reply skb is available, otherwise 0
 * after calling drbd_adm_finish() with the resulting code.
 */
int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
{
        struct drbd_device *device;
        struct net_conf *old_conf, *new_conf = NULL;
        struct crypto crypto = { };
        struct drbd_connection *connection;
        enum drbd_ret_code retcode;
        int i;
        int err;

        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);

        if (!adm_ctx.reply_skb)
                return retcode;
        if (retcode != NO_ERROR)
                goto out;
        if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) {
                drbd_msg_put_info("connection endpoint(s) missing");
                retcode = ERR_INVALID_REQUEST;
                goto out;
        }

        /* No need for _rcu here. All reconfiguration is
         * strictly serialized on genl_lock(). We are protected against
         * concurrent reconfiguration/addition/deletion */
        /* Refuse addresses that are already in use by a known connection. */
        list_for_each_entry(connection, &drbd_connections, connections) {
                if (nla_len(adm_ctx.my_addr) == connection->my_addr_len &&
                    !memcmp(nla_data(adm_ctx.my_addr), &connection->my_addr, connection->my_addr_len)) {
                        retcode = ERR_LOCAL_ADDR;
                        goto out;
                }

                if (nla_len(adm_ctx.peer_addr) == connection->peer_addr_len &&
                    !memcmp(nla_data(adm_ctx.peer_addr), &connection->peer_addr, connection->peer_addr_len)) {
                        retcode = ERR_PEER_ADDR;
                        goto out;
                }
        }

        /* From here on error paths use "fail", which also ends the reconfig. */
        connection = adm_ctx.connection;
        conn_reconfig_start(connection);

        if (connection->cstate > C_STANDALONE) {
                retcode = ERR_NET_CONFIGURED;
                goto fail;
        }

        /* allocation not in the IO path, drbdsetup / netlink process context */
        new_conf = kzalloc(sizeof(*new_conf), GFP_KERNEL);
        if (!new_conf) {
                retcode = ERR_NOMEM;
                goto fail;
        }

        set_net_conf_defaults(new_conf);

        /* -ENOMSG is tolerated here; any other parse error is reported. */
        err = net_conf_from_attrs(new_conf, info);
        if (err && err != -ENOMSG) {
                retcode = ERR_MANDATORY_TAG;
                drbd_msg_put_info(from_attrs_err_to_txt(err));
                goto fail;
        }

        retcode = check_net_options(connection, new_conf);
        if (retcode != NO_ERROR)
                goto fail;

        retcode = alloc_crypto(&crypto, new_conf);
        if (retcode != NO_ERROR)
                goto fail;

        /* Force NUL termination of the shared secret. */
        ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;

        conn_flush_workqueue(connection);

        mutex_lock(&connection->conf_update);
        old_conf = connection->net_conf;
        if (old_conf) {
                /* A net_conf is already installed: leave it untouched. */
                retcode = ERR_NET_CONFIGURED;
                mutex_unlock(&connection->conf_update);
                goto fail;
        }
        rcu_assign_pointer(connection->net_conf, new_conf);

        /* Replace the connection's transforms with the new set; the
         * connection owns them from here on. */
        conn_free_crypto(connection);
        connection->cram_hmac_tfm = crypto.cram_hmac_tfm;
        connection->integrity_tfm = crypto.integrity_tfm;
        connection->csums_tfm = crypto.csums_tfm;
        connection->verify_tfm = crypto.verify_tfm;

        /* Record the endpoint addresses taken from the request. */
        connection->my_addr_len = nla_len(adm_ctx.my_addr);
        memcpy(&connection->my_addr, nla_data(adm_ctx.my_addr), connection->my_addr_len);
        connection->peer_addr_len = nla_len(adm_ctx.peer_addr);
        memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len);

        mutex_unlock(&connection->conf_update);

        /* Reset the per-volume send/receive counters. */
        rcu_read_lock();
        idr_for_each_entry(&connection->volumes, device, i) {
                device->send_cnt = 0;
                device->recv_cnt = 0;
        }
        rcu_read_unlock();

        retcode = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);

        conn_reconfig_done(connection);
        drbd_adm_finish(info, retcode);
        return 0;

fail:
        /* Nothing was handed over to the connection: drop our allocations. */
        free_crypto(&crypto);
        kfree(new_conf);

        conn_reconfig_done(connection);
out:
        drbd_adm_finish(info, retcode);
        return 0;
}
2260
2261 static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection, bool force)
2262 {
2263         enum drbd_state_rv rv;
2264
2265         rv = conn_request_state(connection, NS(conn, C_DISCONNECTING),
2266                         force ? CS_HARD : 0);
2267
2268         switch (rv) {
2269         case SS_NOTHING_TO_DO:
2270                 break;
2271         case SS_ALREADY_STANDALONE:
2272                 return SS_SUCCESS;
2273         case SS_PRIMARY_NOP:
2274                 /* Our state checking code wants to see the peer outdated. */
2275                 rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0);
2276
2277                 if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */
2278                         rv = conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_VERBOSE);
2279
2280                 break;
2281         case SS_CW_FAILED_BY_PEER:
2282                 /* The peer probably wants to see us outdated. */
2283                 rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING,
2284                                                         disk, D_OUTDATED), 0);
2285                 if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) {
2286                         rv = conn_request_state(connection, NS(conn, C_DISCONNECTING),
2287                                         CS_HARD);
2288                 }
2289                 break;
2290         default:;
2291                 /* no special handling necessary */
2292         }
2293
2294         if (rv >= SS_SUCCESS) {
2295                 enum drbd_state_rv rv2;
2296                 /* No one else can reconfigure the network while I am here.
2297                  * The state handling only uses drbd_thread_stop_nowait(),
2298                  * we want to really wait here until the receiver is no more.
2299                  */
2300                 drbd_thread_stop(&adm_ctx.connection->receiver);
2301
2302                 /* Race breaker.  This additional state change request may be
2303                  * necessary, if this was a forced disconnect during a receiver
2304                  * restart.  We may have "killed" the receiver thread just
2305                  * after drbdd_init() returned.  Typically, we should be
2306                  * C_STANDALONE already, now, and this becomes a no-op.
2307                  */
2308                 rv2 = conn_request_state(connection, NS(conn, C_STANDALONE),
2309                                 CS_VERBOSE | CS_HARD);
2310                 if (rv2 < SS_SUCCESS)
2311                         conn_err(connection,
2312                                 "unexpected rv2=%d in conn_try_disconnect()\n",
2313                                 rv2);
2314         }
2315         return rv;
2316 }
2317
2318 int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info)
2319 {
2320         struct disconnect_parms parms;
2321         struct drbd_connection *connection;
2322         enum drbd_state_rv rv;
2323         enum drbd_ret_code retcode;
2324         int err;
2325
2326         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION);
2327         if (!adm_ctx.reply_skb)
2328                 return retcode;
2329         if (retcode != NO_ERROR)
2330                 goto fail;
2331
2332         connection = adm_ctx.connection;
2333         memset(&parms, 0, sizeof(parms));
2334         if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) {
2335                 err = disconnect_parms_from_attrs(&parms, info);
2336                 if (err) {
2337                         retcode = ERR_MANDATORY_TAG;
2338                         drbd_msg_put_info(from_attrs_err_to_txt(err));
2339                         goto fail;
2340                 }
2341         }
2342
2343         rv = conn_try_disconnect(connection, parms.force_disconnect);
2344         if (rv < SS_SUCCESS)
2345                 retcode = rv;  /* FIXME: Type mismatch. */
2346         else
2347                 retcode = NO_ERROR;
2348  fail:
2349         drbd_adm_finish(info, retcode);
2350         return 0;
2351 }
2352
2353 void resync_after_online_grow(struct drbd_device *device)
2354 {
2355         int iass; /* I am sync source */
2356
2357         dev_info(DEV, "Resync of new storage after online grow\n");
2358         if (device->state.role != device->state.peer)
2359                 iass = (device->state.role == R_PRIMARY);
2360         else
2361                 iass = test_bit(RESOLVE_CONFLICTS, &first_peer_device(device)->connection->flags);
2362
2363         if (iass)
2364                 drbd_start_resync(device, C_SYNC_SOURCE);
2365         else
2366                 _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
2367 }
2368
2369 int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
2370 {
2371         struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
2372         struct resize_parms rs;
2373         struct drbd_device *device;
2374         enum drbd_ret_code retcode;
2375         enum determine_dev_size dd;
2376         bool change_al_layout = false;
2377         enum dds_flags ddsf;
2378         sector_t u_size;
2379         int err;
2380
2381         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2382         if (!adm_ctx.reply_skb)
2383                 return retcode;
2384         if (retcode != NO_ERROR)
2385                 goto fail;
2386
2387         device = adm_ctx.device;
2388         if (!get_ldev(device)) {
2389                 retcode = ERR_NO_DISK;
2390                 goto fail;
2391         }
2392
2393         memset(&rs, 0, sizeof(struct resize_parms));
2394         rs.al_stripes = device->ldev->md.al_stripes;
2395         rs.al_stripe_size = device->ldev->md.al_stripe_size_4k * 4;
2396         if (info->attrs[DRBD_NLA_RESIZE_PARMS]) {
2397                 err = resize_parms_from_attrs(&rs, info);
2398                 if (err) {
2399                         retcode = ERR_MANDATORY_TAG;
2400                         drbd_msg_put_info(from_attrs_err_to_txt(err));
2401                         goto fail_ldev;
2402                 }
2403         }
2404
2405         if (device->state.conn > C_CONNECTED) {
2406                 retcode = ERR_RESIZE_RESYNC;
2407                 goto fail_ldev;
2408         }
2409
2410         if (device->state.role == R_SECONDARY &&
2411             device->state.peer == R_SECONDARY) {
2412                 retcode = ERR_NO_PRIMARY;
2413                 goto fail_ldev;
2414         }
2415
2416         if (rs.no_resync && first_peer_device(device)->connection->agreed_pro_version < 93) {
2417                 retcode = ERR_NEED_APV_93;
2418                 goto fail_ldev;
2419         }
2420
2421         rcu_read_lock();
2422         u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
2423         rcu_read_unlock();
2424         if (u_size != (sector_t)rs.resize_size) {
2425                 new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
2426                 if (!new_disk_conf) {
2427                         retcode = ERR_NOMEM;
2428                         goto fail_ldev;
2429                 }
2430         }
2431
2432         if (device->ldev->md.al_stripes != rs.al_stripes ||
2433             device->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) {
2434                 u32 al_size_k = rs.al_stripes * rs.al_stripe_size;
2435
2436                 if (al_size_k > (16 * 1024 * 1024)) {
2437                         retcode = ERR_MD_LAYOUT_TOO_BIG;
2438                         goto fail_ldev;
2439                 }
2440
2441                 if (al_size_k < MD_32kB_SECT/2) {
2442                         retcode = ERR_MD_LAYOUT_TOO_SMALL;
2443                         goto fail_ldev;
2444                 }
2445
2446                 if (device->state.conn != C_CONNECTED) {
2447                         retcode = ERR_MD_LAYOUT_CONNECTED;
2448                         goto fail_ldev;
2449                 }
2450
2451                 change_al_layout = true;
2452         }
2453
2454         if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev))
2455                 device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
2456
2457         if (new_disk_conf) {
2458                 mutex_lock(&first_peer_device(device)->connection->conf_update);
2459                 old_disk_conf = device->ldev->disk_conf;
2460                 *new_disk_conf = *old_disk_conf;
2461                 new_disk_conf->disk_size = (sector_t)rs.resize_size;
2462                 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
2463                 mutex_unlock(&first_peer_device(device)->connection->conf_update);
2464                 synchronize_rcu();
2465                 kfree(old_disk_conf);
2466         }
2467
2468         ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
2469         dd = drbd_determine_dev_size(device, ddsf, change_al_layout ? &rs : NULL);
2470         drbd_md_sync(device);
2471         put_ldev(device);
2472         if (dd == DS_ERROR) {
2473                 retcode = ERR_NOMEM_BITMAP;
2474                 goto fail;
2475         } else if (dd == DS_ERROR_SPACE_MD) {
2476                 retcode = ERR_MD_LAYOUT_NO_FIT;
2477                 goto fail;
2478         } else if (dd == DS_ERROR_SHRINK) {
2479                 retcode = ERR_IMPLICIT_SHRINK;
2480                 goto fail;
2481         }
2482
2483         if (device->state.conn == C_CONNECTED) {
2484                 if (dd == DS_GREW)
2485                         set_bit(RESIZE_PENDING, &device->flags);
2486
2487                 drbd_send_uuids(device);
2488                 drbd_send_sizes(device, 1, ddsf);
2489         }
2490
2491  fail:
2492         drbd_adm_finish(info, retcode);
2493         return 0;
2494
2495  fail_ldev:
2496         put_ldev(device);
2497         goto fail;
2498 }
2499
2500 int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info)
2501 {
2502         enum drbd_ret_code retcode;
2503         struct drbd_connection *connection;
2504         struct res_opts res_opts;
2505         int err;
2506
2507         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
2508         if (!adm_ctx.reply_skb)
2509                 return retcode;
2510         if (retcode != NO_ERROR)
2511                 goto fail;
2512         connection = adm_ctx.connection;
2513
2514         res_opts = connection->res_opts;
2515         if (should_set_defaults(info))
2516                 set_res_opts_defaults(&res_opts);
2517
2518         err = res_opts_from_attrs(&res_opts, info);
2519         if (err && err != -ENOMSG) {
2520                 retcode = ERR_MANDATORY_TAG;
2521                 drbd_msg_put_info(from_attrs_err_to_txt(err));
2522                 goto fail;
2523         }
2524
2525         err = set_resource_options(connection, &res_opts);
2526         if (err) {
2527                 retcode = ERR_INVALID_REQUEST;
2528                 if (err == -ENOMEM)
2529                         retcode = ERR_NOMEM;
2530         }
2531
2532 fail:
2533         drbd_adm_finish(info, retcode);
2534         return 0;
2535 }
2536
2537 int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
2538 {
2539         struct drbd_device *device;
2540         int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
2541
2542         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2543         if (!adm_ctx.reply_skb)
2544                 return retcode;
2545         if (retcode != NO_ERROR)
2546                 goto out;
2547
2548         device = adm_ctx.device;
2549
2550         /* If there is still bitmap IO pending, probably because of a previous
2551          * resync just being finished, wait for it before requesting a new resync.
2552          * Also wait for it's after_state_ch(). */
2553         drbd_suspend_io(device);
2554         wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
2555         drbd_flush_workqueue(device);
2556
2557         /* If we happen to be C_STANDALONE R_SECONDARY, just change to
2558          * D_INCONSISTENT, and set all bits in the bitmap.  Otherwise,
2559          * try to start a resync handshake as sync target for full sync.
2560          */
2561         if (device->state.conn == C_STANDALONE && device->state.role == R_SECONDARY) {
2562                 retcode = drbd_request_state(device, NS(disk, D_INCONSISTENT));
2563                 if (retcode >= SS_SUCCESS) {
2564                         if (drbd_bitmap_io(device, &drbd_bmio_set_n_write,
2565                                 "set_n_write from invalidate", BM_LOCKED_MASK))
2566                                 retcode = ERR_IO_MD_DISK;
2567                 }
2568         } else
2569                 retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T));
2570         drbd_resume_io(device);
2571
2572 out:
2573         drbd_adm_finish(info, retcode);
2574         return 0;
2575 }
2576
2577 static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info,
2578                 union drbd_state mask, union drbd_state val)
2579 {
2580         enum drbd_ret_code retcode;
2581
2582         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2583         if (!adm_ctx.reply_skb)
2584                 return retcode;
2585         if (retcode != NO_ERROR)
2586                 goto out;
2587
2588         retcode = drbd_request_state(adm_ctx.device, mask, val);
2589 out:
2590         drbd_adm_finish(info, retcode);
2591         return 0;
2592 }
2593
/*
 * Bitmap IO callback: run drbd_bmio_set_n_write(), then suspend the
 * activity log regardless of its outcome.  Returns the result of
 * drbd_bmio_set_n_write().
 */
static int drbd_bmio_set_susp_al(struct drbd_device *device)
{
	const int bmio_result = drbd_bmio_set_n_write(device);

	drbd_suspend_al(device);
	return bmio_result;
}
2602
2603 int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
2604 {
2605         int retcode; /* drbd_ret_code, drbd_state_rv */
2606         struct drbd_device *device;
2607
2608         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2609         if (!adm_ctx.reply_skb)
2610                 return retcode;
2611         if (retcode != NO_ERROR)
2612                 goto out;
2613
2614         device = adm_ctx.device;
2615
2616         /* If there is still bitmap IO pending, probably because of a previous
2617          * resync just being finished, wait for it before requesting a new resync.
2618          * Also wait for it's after_state_ch(). */
2619         drbd_suspend_io(device);
2620         wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
2621         drbd_flush_workqueue(device);
2622
2623         /* If we happen to be C_STANDALONE R_PRIMARY, just set all bits
2624          * in the bitmap.  Otherwise, try to start a resync handshake
2625          * as sync source for full sync.
2626          */
2627         if (device->state.conn == C_STANDALONE && device->state.role == R_PRIMARY) {
2628                 /* The peer will get a resync upon connect anyways. Just make that
2629                    into a full resync. */
2630                 retcode = drbd_request_state(device, NS(pdsk, D_INCONSISTENT));
2631                 if (retcode >= SS_SUCCESS) {
2632                         if (drbd_bitmap_io(device, &drbd_bmio_set_susp_al,
2633                                 "set_n_write from invalidate_peer",
2634                                 BM_LOCKED_SET_ALLOWED))
2635                                 retcode = ERR_IO_MD_DISK;
2636                 }
2637         } else
2638                 retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S));
2639         drbd_resume_io(device);
2640
2641 out:
2642         drbd_adm_finish(info, retcode);
2643         return 0;
2644 }
2645
2646 int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info)
2647 {
2648         enum drbd_ret_code retcode;
2649
2650         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2651         if (!adm_ctx.reply_skb)
2652                 return retcode;
2653         if (retcode != NO_ERROR)
2654                 goto out;
2655
2656         if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
2657                 retcode = ERR_PAUSE_IS_SET;
2658 out:
2659         drbd_adm_finish(info, retcode);
2660         return 0;
2661 }
2662
2663 int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info)
2664 {
2665         union drbd_dev_state s;
2666         enum drbd_ret_code retcode;
2667
2668         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2669         if (!adm_ctx.reply_skb)
2670                 return retcode;
2671         if (retcode != NO_ERROR)
2672                 goto out;
2673
2674         if (drbd_request_state(adm_ctx.device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
2675                 s = adm_ctx.device->state;
2676                 if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) {
2677                         retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP :
2678                                   s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR;
2679                 } else {
2680                         retcode = ERR_PAUSE_IS_CLEAR;
2681                 }
2682         }
2683
2684 out:
2685         drbd_adm_finish(info, retcode);
2686         return 0;
2687 }
2688
2689 int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info)
2690 {
2691         return drbd_adm_simple_request_state(skb, info, NS(susp, 1));
2692 }
2693
2694 int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
2695 {
2696         struct drbd_device *device;
2697         int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
2698
2699         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2700         if (!adm_ctx.reply_skb)
2701                 return retcode;
2702         if (retcode != NO_ERROR)
2703                 goto out;
2704
2705         device = adm_ctx.device;
2706         if (test_bit(NEW_CUR_UUID, &device->flags)) {
2707                 drbd_uuid_new_current(device);
2708                 clear_bit(NEW_CUR_UUID, &device->flags);
2709         }
2710         drbd_suspend_io(device);
2711         retcode = drbd_request_state(device, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
2712         if (retcode == SS_SUCCESS) {
2713                 if (device->state.conn < C_CONNECTED)
2714                         tl_clear(first_peer_device(device)->connection);
2715                 if (device->state.disk == D_DISKLESS || device->state.disk == D_FAILED)
2716                         tl_restart(first_peer_device(device)->connection, FAIL_FROZEN_DISK_IO);
2717         }
2718         drbd_resume_io(device);
2719
2720 out:
2721         drbd_adm_finish(info, retcode);
2722         return 0;
2723 }
2724
2725 int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info)
2726 {
2727         return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED));
2728 }
2729
2730 static int nla_put_drbd_cfg_context(struct sk_buff *skb, struct drbd_connection *connection, unsigned vnr)
2731 {
2732         struct nlattr *nla;
2733         nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT);
2734         if (!nla)
2735                 goto nla_put_failure;
2736         if (vnr != VOLUME_UNSPECIFIED &&
2737             nla_put_u32(skb, T_ctx_volume, vnr))
2738                 goto nla_put_failure;
2739         if (nla_put_string(skb, T_ctx_resource_name, connection->name))
2740                 goto nla_put_failure;
2741         if (connection->my_addr_len &&
2742             nla_put(skb, T_ctx_my_addr, connection->my_addr_len, &connection->my_addr))
2743                 goto nla_put_failure;
2744         if (connection->peer_addr_len &&
2745             nla_put(skb, T_ctx_peer_addr, connection->peer_addr_len, &connection->peer_addr))
2746                 goto nla_put_failure;
2747         nla_nest_end(skb, nla);
2748         return 0;
2749
2750 nla_put_failure:
2751         if (nla)
2752                 nla_nest_cancel(skb, nla);
2753         return -EMSGSIZE;
2754 }
2755
2756 static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
2757                 const struct sib_info *sib)
2758 {
2759         struct state_info *si = NULL; /* for sizeof(si->member); */
2760         struct nlattr *nla;
2761         int got_ldev;
2762         int err = 0;
2763         int exclude_sensitive;
2764
2765         /* If sib != NULL, this is drbd_bcast_event, which anyone can listen
2766          * to.  So we better exclude_sensitive information.
2767          *
2768          * If sib == NULL, this is drbd_adm_get_status, executed synchronously
2769          * in the context of the requesting user process. Exclude sensitive
2770          * information, unless current has superuser.
2771          *
2772          * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and
2773          * relies on the current implementation of netlink_dump(), which
2774          * executes the dump callback successively from netlink_recvmsg(),
2775          * always in the context of the receiving process */
2776         exclude_sensitive = sib || !capable(CAP_SYS_ADMIN);
2777
2778         got_ldev = get_ldev(device);
2779
2780         /* We need to add connection name and volume number information still.
2781          * Minor number is in drbd_genlmsghdr. */
2782         if (nla_put_drbd_cfg_context(skb, first_peer_device(device)->connection, device->vnr))
2783                 goto nla_put_failure;
2784
2785         if (res_opts_to_skb(skb, &first_peer_device(device)->connection->res_opts, exclude_sensitive))
2786                 goto nla_put_failure;
2787
2788         rcu_read_lock();
2789         if (got_ldev) {
2790                 struct disk_conf *disk_conf;
2791
2792                 disk_conf = rcu_dereference(device->ldev->disk_conf);
2793                 err = disk_conf_to_skb(skb, disk_conf, exclude_sensitive);
2794         }
2795         if (!err) {
2796                 struct net_conf *nc;
2797
2798                 nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
2799                 if (nc)
2800                         err = net_conf_to_skb(skb, nc, exclude_sensitive);
2801         }
2802         rcu_read_unlock();
2803         if (err)
2804                 goto nla_put_failure;
2805
2806         nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO);
2807         if (!nla)
2808                 goto nla_put_failure;
2809         if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) ||
2810             nla_put_u32(skb, T_current_state, device->state.i) ||
2811             nla_put_u64(skb, T_ed_uuid, device->ed_uuid) ||
2812             nla_put_u64(skb, T_capacity, drbd_get_capacity(device->this_bdev)) ||
2813             nla_put_u64(skb, T_send_cnt, device->send_cnt) ||
2814             nla_put_u64(skb, T_recv_cnt, device->recv_cnt) ||
2815             nla_put_u64(skb, T_read_cnt, device->read_cnt) ||
2816             nla_put_u64(skb, T_writ_cnt, device->writ_cnt) ||
2817             nla_put_u64(skb, T_al_writ_cnt, device->al_writ_cnt) ||
2818             nla_put_u64(skb, T_bm_writ_cnt, device->bm_writ_cnt) ||
2819             nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&device->ap_bio_cnt)) ||
2820             nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&device->ap_pending_cnt)) ||
2821             nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&device->rs_pending_cnt)))
2822                 goto nla_put_failure;
2823
2824         if (got_ldev) {
2825                 int err;
2826
2827                 spin_lock_irq(&device->ldev->md.uuid_lock);
2828                 err = nla_put(skb, T_uuids, sizeof(si->uuids), device->ldev->md.uuid);
2829                 spin_unlock_irq(&device->ldev->md.uuid_lock);
2830
2831                 if (err)
2832                         goto nla_put_failure;
2833
2834                 if (nla_put_u32(skb, T_disk_flags, device->ldev->md.flags) ||
2835                     nla_put_u64(skb, T_bits_total, drbd_bm_bits(device)) ||
2836                     nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(device)))
2837                         goto nla_put_failure;
2838                 if (C_SYNC_SOURCE <= device->state.conn &&
2839                     C_PAUSED_SYNC_T >= device->state.conn) {
2840                         if (nla_put_u64(skb, T_bits_rs_total, device->rs_total) ||
2841                             nla_put_u64(skb, T_bits_rs_failed, device->rs_failed))
2842                                 goto nla_put_failure;
2843                 }
2844         }
2845
2846         if (sib) {
2847                 switch(sib->sib_reason) {
2848                 case SIB_SYNC_PROGRESS:
2849                 case SIB_GET_STATUS_REPLY:
2850                         break;
2851                 case SIB_STATE_CHANGE:
2852                         if (nla_put_u32(skb, T_prev_state, sib->os.i) ||
2853                             nla_put_u32(skb, T_new_state, sib->ns.i))
2854                                 goto nla_put_failure;
2855                         break;
2856                 case SIB_HELPER_POST:
2857                         if (nla_put_u32(skb, T_helper_exit_code,
2858                                         sib->helper_exit_code))
2859                                 goto nla_put_failure;
2860                         /* fall through */
2861                 case SIB_HELPER_PRE:
2862                         if (nla_put_string(skb, T_helper, sib->helper_name))
2863                                 goto nla_put_failure;
2864                         break;
2865                 }
2866         }
2867         nla_nest_end(skb, nla);
2868
2869         if (0)
2870 nla_put_failure:
2871                 err = -EMSGSIZE;
2872         if (got_ldev)
2873                 put_ldev(device);
2874         return err;
2875 }
2876
2877 int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info)
2878 {
2879         enum drbd_ret_code retcode;
2880         int err;
2881
2882         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
2883         if (!adm_ctx.reply_skb)
2884                 return retcode;
2885         if (retcode != NO_ERROR)
2886                 goto out;
2887
2888         err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.device, NULL);
2889         if (err) {
2890                 nlmsg_free(adm_ctx.reply_skb);
2891                 return err;
2892         }
2893 out:
2894         drbd_adm_finish(info, retcode);
2895         return 0;
2896 }
2897
2898 static int get_one_status(struct sk_buff *skb, struct netlink_callback *cb)
2899 {
2900         struct drbd_device *device;
2901         struct drbd_genlmsghdr *dh;
2902         struct drbd_connection *pos = (struct drbd_connection *)cb->args[0];
2903         struct drbd_connection *connection = NULL;
2904         struct drbd_connection *tmp;
2905         unsigned volume = cb->args[1];
2906
2907         /* Open coded, deferred, iteration:
2908          * list_for_each_entry_safe(connection, tmp, &drbd_connections, connections) {
2909          *      idr_for_each_entry(&connection->volumes, device, i) {
2910          *        ...
2911          *      }
2912          * }
2913          * where connection is cb->args[0];
2914          * and i is cb->args[1];
2915          *
2916          * cb->args[2] indicates if we shall loop over all resources,
2917          * or just dump all volumes of a single resource.
2918          *
2919          * This may miss entries inserted after this dump started,
2920          * or entries deleted before they are reached.
2921          *
2922          * We need to make sure the device won't disappear while
2923          * we are looking at it, and revalidate our iterators
2924          * on each iteration.
2925          */
2926
2927         /* synchronize with conn_create()/drbd_destroy_connection() */
2928         rcu_read_lock();
2929         /* revalidate iterator position */
2930         list_for_each_entry_rcu(tmp, &drbd_connections, connections) {
2931                 if (pos == NULL) {
2932                         /* first iteration */
2933                         pos = tmp;
2934                         connection = pos;
2935                         break;
2936                 }
2937                 if (tmp == pos) {
2938                         connection = pos;
2939                         break;
2940                 }
2941         }
2942         if (connection) {
2943 next_connection:
2944                 device = idr_get_next(&connection->volumes, &volume);
2945                 if (!device) {
2946                         /* No more volumes to dump on this connection.
2947                          * Advance connection iterator. */
2948                         pos = list_entry_rcu(connection->connections.next,
2949                                              struct drbd_connection, connections);
2950                         /* Did we dump any volume on this connection yet? */
2951                         if (volume != 0) {
2952                                 /* If we reached the end of the list,
2953                                  * or only a single resource dump was requested,
2954                                  * we are done. */
2955                                 if (&pos->connections == &drbd_connections || cb->args[2])
2956                                         goto out;
2957                                 volume = 0;
2958                                 connection = pos;
2959                                 goto next_connection;
2960                         }
2961                 }
2962
2963                 dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
2964                                 cb->nlh->nlmsg_seq, &drbd_genl_family,
2965                                 NLM_F_MULTI, DRBD_ADM_GET_STATUS);
2966                 if (!dh)
2967                         goto out;
2968
2969                 if (!device) {
2970                         /* This is a connection without a single volume.
2971                          * Suprisingly enough, it may have a network
2972                          * configuration. */
2973                         struct net_conf *nc;
2974                         dh->minor = -1U;
2975                         dh->ret_code = NO_ERROR;
2976                         if (nla_put_drbd_cfg_context(skb, connection, VOLUME_UNSPECIFIED))
2977                                 goto cancel;
2978                         nc = rcu_dereference(connection->net_conf);
2979                         if (nc && net_conf_to_skb(skb, nc, 1) != 0)
2980                                 goto cancel;
2981                         goto done;
2982                 }
2983
2984                 D_ASSERT(device->vnr == volume);
2985                 D_ASSERT(first_peer_device(device)->connection == connection);
2986
2987                 dh->minor = device_to_minor(device);
2988                 dh->ret_code = NO_ERROR;
2989
2990                 if (nla_put_status_info(skb, device, NULL)) {
2991 cancel:
2992                         genlmsg_cancel(skb, dh);
2993                         goto out;
2994                 }
2995 done:
2996                 genlmsg_end(skb, dh);
2997         }
2998
2999 out:
3000         rcu_read_unlock();
3001         /* where to start the next iteration */
3002         cb->args[0] = (long)pos;
3003         cb->args[1] = (pos == connection) ? volume + 1 : 0;
3004
3005         /* No more connections/volumes/minors found results in an empty skb.
3006          * Which will terminate the dump. */
3007         return skb->len;
3008 }
3009
3010 /*
3011  * Request status of all resources, or of all volumes within a single resource.
3012  *
3013  * This is a dump, as the answer may not fit in a single reply skb otherwise.
3014  * Which means we cannot use the family->attrbuf or other such members, because
3015  * dump is NOT protected by the genl_lock().  During dump, we only have access
3016  * to the incoming skb, and need to opencode "parsing" of the nlattr payload.
3017  *
3018  * Once things are setup properly, we call into get_one_status().
3019  */
3020 int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb)
3021 {
3022         const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ;
3023         struct nlattr *nla;
3024         const char *resource_name;
3025         struct drbd_connection *connection;
3026         int maxtype;
3027
3028         /* Is this a followup call? */
3029         if (cb->args[0]) {
3030                 /* ... of a single resource dump,
3031                  * and the resource iterator has been advanced already? */
3032                 if (cb->args[2] && cb->args[2] != cb->args[0])
3033                         return 0; /* DONE. */
3034                 goto dump;
3035         }
3036
3037         /* First call (from netlink_dump_start).  We need to figure out
3038          * which resource(s) the user wants us to dump. */
3039         nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen),
3040                         nlmsg_attrlen(cb->nlh, hdrlen),
3041                         DRBD_NLA_CFG_CONTEXT);
3042
3043         /* No explicit context given.  Dump all. */
3044         if (!nla)
3045                 goto dump;
3046         maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
3047         nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name));
3048         if (IS_ERR(nla))
3049                 return PTR_ERR(nla);
3050         /* context given, but no name present? */
3051         if (!nla)
3052                 return -EINVAL;
3053         resource_name = nla_data(nla);
3054         connection = conn_get_by_name(resource_name);
3055
3056         if (!connection)
3057                 return -ENODEV;
3058
3059         kref_put(&connection->kref, drbd_destroy_connection); /* get_one_status() (re)validates connection by itself */
3060
3061         /* prime iterators, and set "filter" mode mark:
3062          * only dump this connection. */
3063         cb->args[0] = (long)connection;
3064         /* cb->args[1] = 0; passed in this way. */
3065         cb->args[2] = (long)connection;
3066
3067 dump:
3068         return get_one_status(skb, cb);
3069 }
3070
3071 int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info)
3072 {
3073         enum drbd_ret_code retcode;
3074         struct timeout_parms tp;
3075         int err;
3076
3077         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
3078         if (!adm_ctx.reply_skb)
3079                 return retcode;
3080         if (retcode != NO_ERROR)
3081                 goto out;
3082
3083         tp.timeout_type =
3084                 adm_ctx.device->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
3085                 test_bit(USE_DEGR_WFC_T, &adm_ctx.device->flags) ? UT_DEGRADED :
3086                 UT_DEFAULT;
3087
3088         err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp);
3089         if (err) {
3090                 nlmsg_free(adm_ctx.reply_skb);
3091                 return err;
3092         }
3093 out:
3094         drbd_adm_finish(info, retcode);
3095         return 0;
3096 }
3097
3098 int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
3099 {
3100         struct drbd_device *device;
3101         enum drbd_ret_code retcode;
3102         struct start_ov_parms parms;
3103
3104         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
3105         if (!adm_ctx.reply_skb)
3106                 return retcode;
3107         if (retcode != NO_ERROR)
3108                 goto out;
3109
3110         device = adm_ctx.device;
3111
3112         /* resume from last known position, if possible */
3113         parms.ov_start_sector = device->ov_start_sector;
3114         parms.ov_stop_sector = ULLONG_MAX;
3115         if (info->attrs[DRBD_NLA_START_OV_PARMS]) {
3116                 int err = start_ov_parms_from_attrs(&parms, info);
3117                 if (err) {
3118                         retcode = ERR_MANDATORY_TAG;
3119                         drbd_msg_put_info(from_attrs_err_to_txt(err));
3120                         goto out;
3121                 }
3122         }
3123         /* w_make_ov_request expects position to be aligned */
3124         device->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1);
3125         device->ov_stop_sector = parms.ov_stop_sector;
3126
3127         /* If there is still bitmap IO pending, e.g. previous resync or verify
3128          * just being finished, wait for it before requesting a new resync. */
3129         drbd_suspend_io(device);
3130         wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
3131         retcode = drbd_request_state(device, NS(conn, C_VERIFY_S));
3132         drbd_resume_io(device);
3133 out:
3134         drbd_adm_finish(info, retcode);
3135         return 0;
3136 }
3137
3138
/*
 * Generate a new current UUID for the requested minor, optionally
 * clearing the bitmap as well (args.clear_bm from the optional
 * DRBD_NLA_NEW_C_UUID_PARMS attribute).
 *
 * Unless the "skip initial sync" conditions below are met, this is
 * refused with ERR_CONNECTED if the device is not C_STANDALONE.
 *
 * Returns the result of drbd_adm_prepare() if no reply skb could be set
 * up; otherwise 0, with the outcome reported through drbd_adm_finish().
 */
int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info)
{
        struct drbd_device *device;
        enum drbd_ret_code retcode;
        int skip_initial_sync = 0;
        int err;
        struct new_c_uuid_parms args;

        retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
        if (!adm_ctx.reply_skb)
                return retcode;
        if (retcode != NO_ERROR)
                goto out_nolock;

        device = adm_ctx.device;
        /* All parameters default to zero if the attribute is absent. */
        memset(&args, 0, sizeof(args));
        if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) {
                err = new_c_uuid_parms_from_attrs(&args, info);
                if (err) {
                        retcode = ERR_MANDATORY_TAG;
                        drbd_msg_put_info(from_attrs_err_to_txt(err));
                        goto out_nolock;
                }
        }

        mutex_lock(device->state_mutex); /* Protects us against serialized state changes. */

        /* Take a reference on the local disk; released at out_dec. */
        if (!get_ldev(device)) {
                retcode = ERR_NO_DISK;
                goto out;
        }

        /* this is "skip initial sync", assume to be clean */
        if (device->state.conn == C_CONNECTED &&
            first_peer_device(device)->connection->agreed_pro_version >= 90 &&
            device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
                dev_info(DEV, "Preparing to skip initial sync\n");
                skip_initial_sync = 1;
        } else if (device->state.conn != C_STANDALONE) {
                /* Any other connection state than StandAlone is refused. */
                retcode = ERR_CONNECTED;
                goto out_dec;
        }

        drbd_uuid_set(device, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */
        drbd_uuid_new_current(device); /* New current, previous to UI_BITMAP */

        if (args.clear_bm) {
                err = drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
                        "clear_n_write from new_c_uuid", BM_LOCKED_MASK);
                if (err) {
                        dev_err(DEV, "Writing bitmap failed with %d\n",err);
                        retcode = ERR_IO_MD_DISK;
                }
                /* NOTE(review): a failed bitmap write only sets retcode;
                 * the skip-initial-sync steps below are still executed.
                 * Confirm this is intended. */
                if (skip_initial_sync) {
                        drbd_send_uuids_skip_initial_sync(device);
                        _drbd_uuid_set(device, UI_BITMAP, 0);
                        drbd_print_uuids(device, "cleared bitmap UUID");
                        /* Set both disk and peer disk to UpToDate,
                         * under the connection's req_lock. */
                        spin_lock_irq(&first_peer_device(device)->connection->req_lock);
                        _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
                                        CS_VERBOSE, NULL);
                        spin_unlock_irq(&first_peer_device(device)->connection->req_lock);
                }
        }

        drbd_md_sync(device);
        /* Unwind in reverse order of acquisition: ldev reference,
         * then state_mutex, then finish the reply. */
out_dec:
        put_ldev(device);
out:
        mutex_unlock(device->state_mutex);
out_nolock:
        drbd_adm_finish(info, retcode);
        return 0;
}
3212
3213 static enum drbd_ret_code
3214 drbd_check_resource_name(const char *name)
3215 {
3216         if (!name || !name[0]) {
3217                 drbd_msg_put_info("resource name missing");
3218                 return ERR_MANDATORY_TAG;
3219         }
3220         /* if we want to use these in sysfs/configfs/debugfs some day,
3221          * we must not allow slashes */
3222         if (strchr(name, '/')) {
3223                 drbd_msg_put_info("invalid resource name");
3224                 return ERR_INVALID_REQUEST;
3225         }
3226         return NO_ERROR;
3227 }
3228
3229 int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
3230 {
3231         enum drbd_ret_code retcode;
3232         struct res_opts res_opts;
3233         int err;
3234
3235         retcode = drbd_adm_prepare(skb, info, 0);
3236         if (!adm_ctx.reply_skb)
3237                 return retcode;
3238         if (retcode != NO_ERROR)
3239                 goto out;
3240
3241         set_res_opts_defaults(&res_opts);
3242         err = res_opts_from_attrs(&res_opts, info);
3243         if (err && err != -ENOMSG) {
3244                 retcode = ERR_MANDATORY_TAG;
3245                 drbd_msg_put_info(from_attrs_err_to_txt(err));
3246                 goto out;
3247         }
3248
3249         retcode = drbd_check_resource_name(adm_ctx.resource_name);
3250         if (retcode != NO_ERROR)
3251                 goto out;
3252
3253         if (adm_ctx.connection) {
3254                 if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) {
3255                         retcode = ERR_INVALID_REQUEST;
3256                         drbd_msg_put_info("resource exists");
3257                 }
3258                 /* else: still NO_ERROR */
3259                 goto out;
3260         }
3261
3262         if (!conn_create(adm_ctx.resource_name, &res_opts))
3263                 retcode = ERR_NOMEM;
3264 out:
3265         drbd_adm_finish(info, retcode);
3266         return 0;
3267 }
3268
3269 int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
3270 {
3271         struct drbd_genlmsghdr *dh = info->userhdr;
3272         enum drbd_ret_code retcode;
3273
3274         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
3275         if (!adm_ctx.reply_skb)
3276                 return retcode;
3277         if (retcode != NO_ERROR)
3278                 goto out;
3279
3280         if (dh->minor > MINORMASK) {
3281                 drbd_msg_put_info("requested minor out of range");
3282                 retcode = ERR_INVALID_REQUEST;
3283                 goto out;
3284         }
3285         if (adm_ctx.volume > DRBD_VOLUME_MAX) {
3286                 drbd_msg_put_info("requested volume id out of range");
3287                 retcode = ERR_INVALID_REQUEST;
3288                 goto out;
3289         }
3290
3291         /* drbd_adm_prepare made sure already
3292          * that first_peer_device(device)->connection and device->vnr match the request. */
3293         if (adm_ctx.device) {
3294                 if (info->nlhdr->nlmsg_flags & NLM_F_EXCL)
3295                         retcode = ERR_MINOR_EXISTS;
3296                 /* else: still NO_ERROR */
3297                 goto out;
3298         }
3299
3300         retcode = drbd_create_minor(adm_ctx.connection, dh->minor, adm_ctx.volume);
3301 out:
3302         drbd_adm_finish(info, retcode);
3303         return 0;
3304 }
3305
3306 static enum drbd_ret_code adm_del_minor(struct drbd_device *device)
3307 {
3308         if (device->state.disk == D_DISKLESS &&
3309             /* no need to be device->state.conn == C_STANDALONE &&
3310              * we may want to delete a minor from a live replication group.
3311              */
3312             device->state.role == R_SECONDARY) {
3313                 _drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS),
3314                                     CS_VERBOSE + CS_WAIT_COMPLETE);
3315                 idr_remove(&first_peer_device(device)->connection->volumes, device->vnr);
3316                 idr_remove(&drbd_devices, device_to_minor(device));
3317                 destroy_workqueue(device->submit.wq);
3318                 del_gendisk(device->vdisk);
3319                 synchronize_rcu();
3320                 kref_put(&device->kref, drbd_destroy_device);
3321                 return NO_ERROR;
3322         } else
3323                 return ERR_MINOR_CONFIGURED;
3324 }
3325
3326 int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info)
3327 {
3328         enum drbd_ret_code retcode;
3329
3330         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
3331         if (!adm_ctx.reply_skb)
3332                 return retcode;
3333         if (retcode != NO_ERROR)
3334                 goto out;
3335
3336         retcode = adm_del_minor(adm_ctx.device);
3337 out:
3338         drbd_adm_finish(info, retcode);
3339         return 0;
3340 }
3341
3342 int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
3343 {
3344         int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
3345         struct drbd_device *device;
3346         unsigned i;
3347
3348         retcode = drbd_adm_prepare(skb, info, 0);
3349         if (!adm_ctx.reply_skb)
3350                 return retcode;
3351         if (retcode != NO_ERROR)
3352                 goto out;
3353
3354         if (!adm_ctx.connection) {
3355                 retcode = ERR_RES_NOT_KNOWN;
3356                 goto out;
3357         }
3358
3359         /* demote */
3360         idr_for_each_entry(&adm_ctx.connection->volumes, device, i) {
3361                 retcode = drbd_set_role(device, R_SECONDARY, 0);
3362                 if (retcode < SS_SUCCESS) {
3363                         drbd_msg_put_info("failed to demote");
3364                         goto out;
3365                 }
3366         }
3367
3368         retcode = conn_try_disconnect(adm_ctx.connection, 0);
3369         if (retcode < SS_SUCCESS) {
3370                 drbd_msg_put_info("failed to disconnect");
3371                 goto out;
3372         }
3373
3374         /* detach */
3375         idr_for_each_entry(&adm_ctx.connection->volumes, device, i) {
3376                 retcode = adm_detach(device, 0);
3377                 if (retcode < SS_SUCCESS || retcode > NO_ERROR) {
3378                         drbd_msg_put_info("failed to detach");
3379                         goto out;
3380                 }
3381         }
3382
3383         /* If we reach this, all volumes (of this connection) are Secondary,
3384          * Disconnected, Diskless, aka Unconfigured. Make sure all threads have
3385          * actually stopped, state handling only does drbd_thread_stop_nowait(). */
3386         drbd_thread_stop(&adm_ctx.connection->worker);
3387
3388         /* Now, nothing can fail anymore */
3389
3390         /* delete volumes */
3391         idr_for_each_entry(&adm_ctx.connection->volumes, device, i) {
3392                 retcode = adm_del_minor(device);
3393                 if (retcode != NO_ERROR) {
3394                         /* "can not happen" */
3395                         drbd_msg_put_info("failed to delete volume");
3396                         goto out;
3397                 }
3398         }
3399
3400         /* delete connection */
3401         if (conn_lowest_minor(adm_ctx.connection) < 0) {
3402                 list_del_rcu(&adm_ctx.connection->connections);
3403                 synchronize_rcu();
3404                 kref_put(&adm_ctx.connection->kref, drbd_destroy_connection);
3405
3406                 retcode = NO_ERROR;
3407         } else {
3408                 /* "can not happen" */
3409                 retcode = ERR_RES_IN_USE;
3410                 drbd_msg_put_info("failed to delete connection");
3411         }
3412         goto out;
3413 out:
3414         drbd_adm_finish(info, retcode);
3415         return 0;
3416 }
3417
3418 int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
3419 {
3420         enum drbd_ret_code retcode;
3421
3422         retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
3423         if (!adm_ctx.reply_skb)
3424                 return retcode;
3425         if (retcode != NO_ERROR)
3426                 goto out;
3427
3428         if (conn_lowest_minor(adm_ctx.connection) < 0) {
3429                 list_del_rcu(&adm_ctx.connection->connections);
3430                 synchronize_rcu();
3431                 kref_put(&adm_ctx.connection->kref, drbd_destroy_connection);
3432
3433                 retcode = NO_ERROR;
3434         } else {
3435                 retcode = ERR_RES_IN_USE;
3436         }
3437
3438         if (retcode == NO_ERROR)
3439                 drbd_thread_stop(&adm_ctx.connection->worker);
3440 out:
3441         drbd_adm_finish(info, retcode);
3442         return 0;
3443 }
3444
3445 void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
3446 {
3447         static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
3448         struct sk_buff *msg;
3449         struct drbd_genlmsghdr *d_out;
3450         unsigned seq;
3451         int err = -ENOMEM;
3452
3453         if (sib->sib_reason == SIB_SYNC_PROGRESS) {
3454                 if (time_after(jiffies, device->rs_last_bcast + HZ))
3455                         device->rs_last_bcast = jiffies;
3456                 else
3457                         return;
3458         }
3459
3460         seq = atomic_inc_return(&drbd_genl_seq);
3461         msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
3462         if (!msg)
3463                 goto failed;
3464
3465         err = -EMSGSIZE;
3466         d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT);
3467         if (!d_out) /* cannot happen, but anyways. */
3468                 goto nla_put_failure;
3469         d_out->minor = device_to_minor(device);
3470         d_out->ret_code = NO_ERROR;
3471
3472         if (nla_put_status_info(msg, device, sib))
3473                 goto nla_put_failure;
3474         genlmsg_end(msg, d_out);
3475         err = drbd_genl_multicast_events(msg, 0);
3476         /* msg has been consumed or freed in netlink_broadcast() */
3477         if (err && err != -ESRCH)
3478                 goto failed;
3479
3480         return;
3481
3482 nla_put_failure:
3483         nlmsg_free(msg);
3484 failed:
3485         dev_err(DEV, "Error %d while broadcasting event. "
3486                         "Event seq:%u sib_reason:%u\n",
3487                         err, seq, sib->sib_reason);
3488 }