block/blk-mq.c

   1 #include <linux/kernel.h>
   2 #include <linux/module.h>
   3 #include <linux/backing-dev.h>
   4 #include <linux/bio.h>
   5 #include <linux/blkdev.h>
   6 #include <linux/mm.h>
   7 #include <linux/init.h>
   8 #include <linux/slab.h>
   9 #include <linux/workqueue.h>
  10 #include <linux/smp.h>
  11 #include <linux/llist.h>
  12 #include <linux/list_sort.h>
  13 #include <linux/cpu.h>
  14 #include <linux/cache.h>
  15 #include <linux/sched/sysctl.h>
  16 #include <linux/delay.h>
  17
  18 #include <trace/events/block.h>
  19
  20 #include <linux/blk-mq.h>
  21 #include "blk.h"
  22 #include "blk-mq.h"
  23 #include "blk-mq-tag.h"
  24
  /*
   * File-scope bookkeeping.
   * NOTE(review): all_q_mutex presumably guards all_q_list; neither is used in
   * this part of the file, so confirm against their users further down.
   */
  static LIST_HEAD(all_q_list);
  static DEFINE_MUTEX(all_q_mutex);

  /* Defined further down; declared here because the allocation path calls it. */
  static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
  29
  30 static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
  31                                            unsigned int cpu)
  32 {
  33         return per_cpu_ptr(q->queue_ctx, cpu);
  34 }
  35
  /*
   * Fetch the software queue context belonging to the CPU we are running on.
   *
   * Software queues are assumed to be per-cpu (they could equally be per-node);
   * that choice is hardcoded for now. Preemption is not a concern because the
   * ctx's are persistent, but it does mean the returned ctx is not guaranteed
   * to match the CPU the caller is currently running on. The get_cpu() issued
   * here is balanced by the put_cpu() in blk_mq_put_ctx().
   */
  static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
  {
          unsigned int cpu = get_cpu();

          return __blk_mq_get_ctx(q, cpu);
  }
  46
  /*
   * Counterpart of blk_mq_get_ctx(): issues the put_cpu() that balances the
   * get_cpu() done there. @ctx itself is not looked at.
   */
  static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
  {
          put_cpu();
  }
  51
  52 /*
  53  * Check if any of the ctx's have pending work in this hardware queue
  54  */
  55 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
  56 {
  57         unsigned int i;
  58
  59         for (i = 0; i < hctx->nr_ctx_map; i++)
  60                 if (hctx->ctx_map[i])
  61                         return true;
  62
  63         return false;
  64 }
  65
  66 /*
  67  * Mark this ctx as having pending work in this hardware queue
  68  */
  69 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
  70                                      struct blk_mq_ctx *ctx)
  71 {
  72         if (!test_bit(ctx->index_hw, hctx->ctx_map))
  73                 set_bit(ctx->index_hw, hctx->ctx_map);
  74 }
  75
  76 static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
  77                                               struct blk_mq_ctx *ctx,
  78                                               gfp_t gfp, bool reserved)
  79 {
  80         struct request *rq;
  81         unsigned int tag;
  82
  83         tag = blk_mq_get_tag(hctx->tags, hctx, &ctx->last_tag, gfp, reserved);
  84         if (tag != BLK_MQ_TAG_FAIL) {
  85                 rq = hctx->tags->rqs[tag];
  86                 rq->tag = tag;
  87                 return rq;
  88         }
  89
  90         return NULL;
  91 }
  92
  93 static int blk_mq_queue_enter(struct request_queue *q)
  94 {
  95         int ret;
  96
  97         __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
  98         smp_wmb();
  99         /* we have problems to freeze the queue if it's initializing */
 100         if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
 101                 return 0;
 102
 103         __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
 104
 105         spin_lock_irq(q->queue_lock);
 106         ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
 107                 !blk_queue_bypass(q) || blk_queue_dying(q),
 108                 *q->queue_lock);
 109         /* inc usage with lock hold to avoid freeze_queue runs here */
 110         if (!ret && !blk_queue_dying(q))
 111                 __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
 112         else if (blk_queue_dying(q))
 113                 ret = -ENODEV;
 114         spin_unlock_irq(q->queue_lock);
 115
 116         return ret;
 117 }
 118
 119 static void blk_mq_queue_exit(struct request_queue *q)
 120 {
 121         __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
 122 }
 123
 124 static void __blk_mq_drain_queue(struct request_queue *q)
 125 {
 126         while (true) {
 127                 s64 count;
 128
 129                 spin_lock_irq(q->queue_lock);
 130                 count = percpu_counter_sum(&q->mq_usage_counter);
 131                 spin_unlock_irq(q->queue_lock);
 132
 133                 if (count == 0)
 134                         break;
 135                 blk_mq_run_queues(q, false);
 136                 msleep(10);
 137         }
 138 }
 139
 140 /*
 141  * Guarantee no request is in use, so we can change any data structure of
 142  * the queue afterward.
 143  */
 144 static void blk_mq_freeze_queue(struct request_queue *q)
 145 {
 146         bool drain;
 147
 148         spin_lock_irq(q->queue_lock);
 149         drain = !q->bypass_depth++;
 150         queue_flag_set(QUEUE_FLAG_BYPASS, q);
 151         spin_unlock_irq(q->queue_lock);
 152
 153         if (drain)
 154                 __blk_mq_drain_queue(q);
 155 }
 156
 /*
  * Non-static entry point for draining @q: waits until the queue's usage
  * counter reaches zero. Does not touch the bypass flag or depth.
  */
 void blk_mq_drain_queue(struct request_queue *q)
 {
         __blk_mq_drain_queue(q);
 }
 161
 162 static void blk_mq_unfreeze_queue(struct request_queue *q)
 163 {
 164         bool wake = false;
 165
 166         spin_lock_irq(q->queue_lock);
 167         if (!--q->bypass_depth) {
 168                 queue_flag_clear(QUEUE_FLAG_BYPASS, q);
 169                 wake = true;
 170         }
 171         WARN_ON_ONCE(q->bypass_depth < 0);
 172         spin_unlock_irq(q->queue_lock);
 173         if (wake)
 174                 wake_up_all(&q->mq_freeze_wq);
 175 }
 176
 177 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 178 {
 179         return blk_mq_has_free_tags(hctx->tags);
 180 }
 181 EXPORT_SYMBOL(blk_mq_can_queue);
 182
 183 static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 184                                struct request *rq, unsigned int rw_flags)
 185 {
 186         if (blk_queue_io_stat(q))
 187                 rw_flags |= REQ_IO_STAT;
 188
 189         INIT_LIST_HEAD(&rq->queuelist);
 190         /* csd/requeue_work/fifo_time is initialized before use */
 191         rq->q = q;
 192         rq->mq_ctx = ctx;
 193         rq->cmd_flags = rw_flags;
 194         rq->cmd_type = 0;
 195         /* do not touch atomic flags, it needs atomic ops against the timer */
 196         rq->cpu = -1;
 197         rq->__data_len = 0;
 198         rq->__sector = (sector_t) -1;
 199         rq->bio = NULL;
 200         rq->biotail = NULL;
 201         INIT_HLIST_NODE(&rq->hash);
 202         RB_CLEAR_NODE(&rq->rb_node);
 203         memset(&rq->flush, 0, max(sizeof(rq->flush), sizeof(rq->elv)));
 204         rq->rq_disk = NULL;
 205         rq->part = NULL;
 206         rq->start_time = jiffies;
 207 #ifdef CONFIG_BLK_CGROUP
 208         rq->rl = NULL;
 209         set_start_time_ns(rq);
 210         rq->io_start_time_ns = 0;
 211 #endif
 212         rq->nr_phys_segments = 0;
 213 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 214         rq->nr_integrity_segments = 0;
 215 #endif
 216         rq->ioprio = 0;
 217         rq->special = NULL;
 218         /* tag was already set */
 219         rq->errors = 0;
 220         memset(rq->__cmd, 0, sizeof(rq->__cmd));
 221         rq->cmd = rq->__cmd;
 222         rq->cmd_len = BLK_MAX_CDB;
 223
 224         rq->extra_len = 0;
 225         rq->sense_len = 0;
 226         rq->resid_len = 0;
 227         rq->sense = NULL;
 228
 229         rq->deadline = 0;
 230         INIT_LIST_HEAD(&rq->timeout_list);
 231         rq->timeout = 0;
 232         rq->retries = 0;
 233         rq->end_io = NULL;
 234         rq->end_io_data = NULL;
 235         rq->next_rq = NULL;
 236
 237         ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
 238 }
 239
 240 static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
 241                                                    int rw, gfp_t gfp,
 242                                                    bool reserved)
 243 {
 244         struct request *rq;
 245
 246         do {
 247                 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
 248                 struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
 249
 250                 rq = __blk_mq_alloc_request(hctx, ctx, gfp & ~__GFP_WAIT,
 251                                                 reserved);
 252                 if (rq) {
 253                         blk_mq_rq_ctx_init(q, ctx, rq, rw);
 254                         break;
 255                 }
 256
 257                 if (gfp & __GFP_WAIT) {
 258                         __blk_mq_run_hw_queue(hctx);
 259                         blk_mq_put_ctx(ctx);
 260                 } else {
 261                         blk_mq_put_ctx(ctx);
 262                         break;
 263                 }
 264
 265                 blk_mq_wait_for_tags(hctx->tags, hctx, reserved);
 266         } while (1);
 267
 268         return rq;
 269 }
 270
 271 struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp)
 272 {
 273         struct request *rq;
 274
 275         if (blk_mq_queue_enter(q))
 276                 return NULL;
 277
 278         rq = blk_mq_alloc_request_pinned(q, rw, gfp, false);
 279         if (rq)
 280                 blk_mq_put_ctx(rq->mq_ctx);
 281         return rq;
 282 }
 283 EXPORT_SYMBOL(blk_mq_alloc_request);
 284
 285 struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
 286                                               gfp_t gfp)
 287 {
 288         struct request *rq;
 289
 290         if (blk_mq_queue_enter(q))
 291                 return NULL;
 292
 293         rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
 294         if (rq)
 295                 blk_mq_put_ctx(rq->mq_ctx);
 296         return rq;
 297 }
 298 EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
 299
 300 static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 301                                   struct blk_mq_ctx *ctx, struct request *rq)
 302 {
 303         const int tag = rq->tag;
 304         struct request_queue *q = rq->q;
 305
 306         clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 307         blk_mq_put_tag(hctx->tags, tag, &ctx->last_tag);
 308         blk_mq_queue_exit(q);
 309 }
 310
 311 void blk_mq_free_request(struct request *rq)
 312 {
 313         struct blk_mq_ctx *ctx = rq->mq_ctx;
 314         struct blk_mq_hw_ctx *hctx;
 315         struct request_queue *q = rq->q;
 316
 317         ctx->rq_completed[rq_is_sync(rq)]++;
 318
 319         hctx = q->mq_ops->map_queue(q, ctx->cpu);
 320         __blk_mq_free_request(hctx, ctx, rq);
 321 }
 322
 323 /*
 324  * Clone all relevant state from a request that has been put on hold in
 325  * the flush state machine into the preallocated flush request that hangs
 326  * off the request queue.
 327  *
 328  * For a driver the flush request should be invisible, that's why we are
 329  * impersonating the original request here.
 330  */
 331 void blk_mq_clone_flush_request(struct request *flush_rq,
 332                 struct request *orig_rq)
 333 {
 334         struct blk_mq_hw_ctx *hctx =
 335                 orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu);
 336
 337         flush_rq->mq_ctx = orig_rq->mq_ctx;
 338         flush_rq->tag = orig_rq->tag;
 339         memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq),
 340                 hctx->cmd_size);
 341 }
 342
 343 inline void __blk_mq_end_io(struct request *rq, int error)
 344 {
 345         blk_account_io_done(rq);
 346
 347         if (rq->end_io) {
 348                 rq->end_io(rq, error);
 349         } else {
 350                 if (unlikely(blk_bidi_rq(rq)))
 351                         blk_mq_free_request(rq->next_rq);
 352                 blk_mq_free_request(rq);
 353         }
 354 }
 355 EXPORT_SYMBOL(__blk_mq_end_io);
 356
 357 void blk_mq_end_io(struct request *rq, int error)
 358 {
 359         if (blk_update_request(rq, error, blk_rq_bytes(rq)))
 360                 BUG();
 361         __blk_mq_end_io(rq, error);
 362 }
 363 EXPORT_SYMBOL(blk_mq_end_io);
 364
 365 static void __blk_mq_complete_request_remote(void *data)
 366 {
 367         struct request *rq = data;
 368
 369         rq->q->softirq_done_fn(rq);
 370 }
 371
 372 void __blk_mq_complete_request(struct request *rq)
 373 {
 374         struct blk_mq_ctx *ctx = rq->mq_ctx;
 375         bool shared = false;
 376         int cpu;
 377
 378         if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
 379                 rq->q->softirq_done_fn(rq);
 380                 return;
 381         }
 382
 383         cpu = get_cpu();
 384         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
 385                 shared = cpus_share_cache(cpu, ctx->cpu);
 386
 387         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
 388                 rq->csd.func = __blk_mq_complete_request_remote;
 389                 rq->csd.info = rq;
 390                 rq->csd.flags = 0;
 391                 smp_call_function_single_async(ctx->cpu, &rq->csd);
 392         } else {
 393                 rq->q->softirq_done_fn(rq);
 394         }
 395         put_cpu();
 396 }
 397
 398 /**
 399  * blk_mq_complete_request - end I/O on a request
 400  * @rq:         the request being processed
 401  *
 402  * Description:
 403  *      Ends all I/O on a request. It does not handle partial completions.
 404  *      The actual completion happens out-of-order, through a IPI handler.
 405  **/
 406 void blk_mq_complete_request(struct request *rq)
 407 {
 408         if (unlikely(blk_should_fake_timeout(rq->q)))
 409                 return;
 410         if (!blk_mark_rq_complete(rq))
 411                 __blk_mq_complete_request(rq);
 412 }
 413 EXPORT_SYMBOL(blk_mq_complete_request);
 414
 415 static void blk_mq_start_request(struct request *rq, bool last)
 416 {
 417         struct request_queue *q = rq->q;
 418
 419         trace_block_rq_issue(q, rq);
 420
 421         rq->resid_len = blk_rq_bytes(rq);
 422         if (unlikely(blk_bidi_rq(rq)))
 423                 rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
 424
 425         /*
 426          * Just mark start time and set the started bit. Due to memory
 427          * ordering, we know we'll see the correct deadline as long as
 428          * REQ_ATOMIC_STARTED is seen.
 429          */
 430         rq->deadline = jiffies + q->rq_timeout;
 431
 432         /*
 433          * Mark us as started and clear complete. Complete might have been
 434          * set if requeue raced with timeout, which then marked it as
 435          * complete. So be sure to clear complete again when we start
 436          * the request, otherwise we'll ignore the completion event.
 437          */
 438         set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 439         clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
 440
 441         if (q->dma_drain_size && blk_rq_bytes(rq)) {
 442                 /*
 443                  * Make sure space for the drain appears.  We know we can do
 444                  * this because max_hw_segments has been adjusted to be one
 445                  * fewer than the device can handle.
 446                  */
 447                 rq->nr_phys_segments++;
 448         }
 449
 450         /*
 451          * Flag the last request in the series so that drivers know when IO
 452          * should be kicked off, if they don't do it on a per-request basis.
 453          *
 454          * Note: the flag isn't the only condition drivers should do kick off.
 455          * If drive is busy, the last request might not have the bit set.
 456          */
 457         if (last)
 458                 rq->cmd_flags |= REQ_END;
 459 }
 460
 461 static void __blk_mq_requeue_request(struct request *rq)
 462 {
 463         struct request_queue *q = rq->q;
 464
 465         trace_block_rq_requeue(q, rq);
 466         clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 467
 468         rq->cmd_flags &= ~REQ_END;
 469
 470         if (q->dma_drain_size && blk_rq_bytes(rq))
 471                 rq->nr_phys_segments--;
 472 }
 473
 474 void blk_mq_requeue_request(struct request *rq)
 475 {
 476         __blk_mq_requeue_request(rq);
 477         blk_clear_rq_complete(rq);
 478
 479         BUG_ON(blk_queued_rq(rq));
 480         blk_mq_insert_request(rq, true, true, false);
 481 }
 482 EXPORT_SYMBOL(blk_mq_requeue_request);
 483
 484 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 485 {
 486         return tags->rqs[tag];
 487 }
 488 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 489
 /*
  * Argument bundle passed to blk_mq_timeout_check() while scanning one
  * hardware queue for expired requests.
  */
 struct blk_mq_timeout_data {
         /* hardware queue being scanned */
         struct blk_mq_hw_ctx *hctx;
         /* forwarded to blk_rq_check_expired() as its "next" argument */
         unsigned long *next;
         /* forwarded to blk_rq_check_expired() as its "next_set" argument */
         unsigned int *next_set;
 };
 495
 496 static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
 497 {
 498         struct blk_mq_timeout_data *data = __data;
 499         struct blk_mq_hw_ctx *hctx = data->hctx;
 500         unsigned int tag;
 501
 502          /* It may not be in flight yet (this is where
 503          * the REQ_ATOMIC_STARTED flag comes in). The requests are
 504          * statically allocated, so we know it's always safe to access the
 505          * memory associated with a bit offset into ->rqs[].
 506          */
 507         tag = 0;
 508         do {
 509                 struct request *rq;
 510
 511                 tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag);
 512                 if (tag >= hctx->tags->nr_tags)
 513                         break;
 514
 515                 rq = blk_mq_tag_to_rq(hctx->tags, tag++);
 516                 if (rq->q != hctx->queue)
 517                         continue;
 518                 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
 519                         continue;
 520
 521                 blk_rq_check_expired(rq, data->next, data->next_set);
 522         } while (1);
 523 }
 524
 525 static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
 526                                         unsigned long *next,
 527                                         unsigned int *next_set)
 528 {
 529         struct blk_mq_timeout_data data = {
 530                 .hctx           = hctx,
 531                 .next           = next,
 532                 .next_set       = next_set,
 533         };
 534
 535         /*
 536          * Ask the tagging code to iterate busy requests, so we can
 537          * check them for timeout.
 538          */
 539         blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
 540 }
 541
 542 static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq)
 543 {
 544         struct request_queue *q = rq->q;
 545
 546         /*
 547          * We know that complete is set at this point. If STARTED isn't set
 548          * anymore, then the request isn't active and the "timeout" should
 549          * just be ignored. This can happen due to the bitflag ordering.
 550          * Timeout first checks if STARTED is set, and if it is, assumes
 551          * the request is active. But if we race with completion, then
 552          * we both flags will get cleared. So check here again, and ignore
 553          * a timeout event with a request that isn't active.
 554          */
 555         if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
 556                 return BLK_EH_NOT_HANDLED;
 557
 558         if (!q->mq_ops->timeout)
 559                 return BLK_EH_RESET_TIMER;
 560
 561         return q->mq_ops->timeout(rq);
 562 }
 563
 564 static void blk_mq_rq_timer(unsigned long data)
 565 {
 566         struct request_queue *q = (struct request_queue *) data;
 567         struct blk_mq_hw_ctx *hctx;
 568         unsigned long next = 0;
 569         int i, next_set = 0;
 570
 571         queue_for_each_hw_ctx(q, hctx, i)
 572                 blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
 573
 574         if (next_set)
 575                 mod_timer(&q->timeout, round_jiffies_up(next));
 576 }
 577
 578 /*
 579  * Reverse check our software queue for entries that we could potentially
 580  * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 581  * too much time checking for merges.
 582  */
 583 static bool blk_mq_attempt_merge(struct request_queue *q,
 584                                  struct blk_mq_ctx *ctx, struct bio *bio)
 585 {
 586         struct request *rq;
 587         int checked = 8;
 588
 589         list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
 590                 int el_ret;
 591
 592                 if (!checked--)
 593                         break;
 594
 595                 if (!blk_rq_merge_ok(rq, bio))
 596                         continue;
 597
 598                 el_ret = blk_try_merge(rq, bio);
 599                 if (el_ret == ELEVATOR_BACK_MERGE) {
 600                         if (bio_attempt_back_merge(q, rq, bio)) {
 601                                 ctx->rq_merged++;
 602                                 return true;
 603                         }
 604                         break;
 605                 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
 606                         if (bio_attempt_front_merge(q, rq, bio)) {
 607                                 ctx->rq_merged++;
 608                                 return true;
 609                         }
 610                         break;
 611                 }
 612         }
 613
 614         return false;
 615 }
 616
 617 /*
 618  * Run this hardware queue, pulling any software queues mapped to it in.
 619  * Note that this function currently has various problems around ordering
 620  * of IO. In particular, we'd like FIFO behaviour on handling existing
 621  * items on the hctx->dispatch list. Ignore that for now.
 622  */
 623 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 624 {
 625         struct request_queue *q = hctx->queue;
 626         struct blk_mq_ctx *ctx;
 627         struct request *rq;
 628         LIST_HEAD(rq_list);
 629         int bit, queued;
 630
 631         WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
 632
 633         if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
 634                 return;
 635
 636         hctx->run++;
 637
 638         /*
 639          * Touch any software queue that has pending entries.
 640          */
 641         for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
 642                 clear_bit(bit, hctx->ctx_map);
 643                 ctx = hctx->ctxs[bit];
 644
 645                 spin_lock(&ctx->lock);
 646                 list_splice_tail_init(&ctx->rq_list, &rq_list);
 647                 spin_unlock(&ctx->lock);
 648         }
 649
 650         /*
 651          * If we have previous entries on our dispatch list, grab them
 652          * and stuff them at the front for more fair dispatch.
 653          */
 654         if (!list_empty_careful(&hctx->dispatch)) {
 655                 spin_lock(&hctx->lock);
 656                 if (!list_empty(&hctx->dispatch))
 657                         list_splice_init(&hctx->dispatch, &rq_list);
 658                 spin_unlock(&hctx->lock);
 659         }
 660
 661         /*
 662          * Delete and return all entries from our dispatch list
 663          */
 664         queued = 0;
 665
 666         /*
 667          * Now process all the entries, sending them to the driver.
 668          */
 669         while (!list_empty(&rq_list)) {
 670                 int ret;
 671
 672                 rq = list_first_entry(&rq_list, struct request, queuelist);
 673                 list_del_init(&rq->queuelist);
 674
 675                 blk_mq_start_request(rq, list_empty(&rq_list));
 676
 677                 ret = q->mq_ops->queue_rq(hctx, rq);
 678                 switch (ret) {
 679                 case BLK_MQ_RQ_QUEUE_OK:
 680                         queued++;
 681                         continue;
 682                 case BLK_MQ_RQ_QUEUE_BUSY:
 683                         list_add(&rq->queuelist, &rq_list);
 684                         __blk_mq_requeue_request(rq);
 685                         break;
 686                 default:
 687                         pr_err("blk-mq: bad return on queue: %d\n", ret);
 688                 case BLK_MQ_RQ_QUEUE_ERROR:
 689                         rq->errors = -EIO;
 690                         blk_mq_end_io(rq, rq->errors);
 691                         break;
 692                 }
 693
 694                 if (ret == BLK_MQ_RQ_QUEUE_BUSY)
 695                         break;
 696         }
 697
 698         if (!queued)
 699                 hctx->dispatched[0]++;
 700         else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
 701                 hctx->dispatched[ilog2(queued) + 1]++;
 702
 703         /*
 704          * Any items that need requeuing? Stuff them into hctx->dispatch,
 705          * that is where we will continue on next queue run.
 706          */
 707         if (!list_empty(&rq_list)) {
 708                 spin_lock(&hctx->lock);
 709                 list_splice(&rq_list, &hctx->dispatch);
 710                 spin_unlock(&hctx->lock);
 711         }
 712 }
 713
 714 /*
 715  * It'd be great if the workqueue API had a way to pass
 716  * in a mask and had some smarts for more clever placement.
 717  * For now we just round-robin here, switching for every
 718  * BLK_MQ_CPU_WORK_BATCH queued items.
 719  */
 720 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 721 {
 722         int cpu = hctx->next_cpu;
 723
 724         if (--hctx->next_cpu_batch <= 0) {
 725                 int next_cpu;
 726
 727                 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
 728                 if (next_cpu >= nr_cpu_ids)
 729                         next_cpu = cpumask_first(hctx->cpumask);
 730
 731                 hctx->next_cpu = next_cpu;
 732                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
 733         }
 734
 735         return cpu;
 736 }
 737
 738 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 739 {
 740         if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
 741                 return;
 742
 743         if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
 744                 __blk_mq_run_hw_queue(hctx);
 745         else if (hctx->queue->nr_hw_queues == 1)
 746                 kblockd_schedule_delayed_work(&hctx->run_work, 0);
 747         else {
 748                 unsigned int cpu;
 749
 750                 cpu = blk_mq_hctx_next_cpu(hctx);
 751                 kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
 752         }
 753 }
 754
 755 void blk_mq_run_queues(struct request_queue *q, bool async)
 756 {
 757         struct blk_mq_hw_ctx *hctx;
 758         int i;
 759
 760         queue_for_each_hw_ctx(q, hctx, i) {
 761                 if ((!blk_mq_hctx_has_pending(hctx) &&
 762                     list_empty_careful(&hctx->dispatch)) ||
 763                     test_bit(BLK_MQ_S_STOPPED, &hctx->state))
 764                         continue;
 765
 766                 preempt_disable();
 767                 blk_mq_run_hw_queue(hctx, async);
 768                 preempt_enable();
 769         }
 770 }
 771 EXPORT_SYMBOL(blk_mq_run_queues);
 772
 773 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 774 {
 775         cancel_delayed_work(&hctx->run_work);
 776         cancel_delayed_work(&hctx->delay_work);
 777         set_bit(BLK_MQ_S_STOPPED, &hctx->state);
 778 }
 779 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
 780
 781 void blk_mq_stop_hw_queues(struct request_queue *q)
 782 {
 783         struct blk_mq_hw_ctx *hctx;
 784         int i;
 785
 786         queue_for_each_hw_ctx(q, hctx, i)
 787                 blk_mq_stop_hw_queue(hctx);
 788 }
 789 EXPORT_SYMBOL(blk_mq_stop_hw_queues);
 790
 791 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
 792 {
 793         clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
 794
 795         preempt_disable();
 796         __blk_mq_run_hw_queue(hctx);
 797         preempt_enable();
 798 }
 799 EXPORT_SYMBOL(blk_mq_start_hw_queue);
 800
 801 void blk_mq_start_hw_queues(struct request_queue *q)
 802 {
 803         struct blk_mq_hw_ctx *hctx;
 804         int i;
 805
 806         queue_for_each_hw_ctx(q, hctx, i)
 807                 blk_mq_start_hw_queue(hctx);
 808 }
 809 EXPORT_SYMBOL(blk_mq_start_hw_queues);
 810
 811
 812 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
 813 {
 814         struct blk_mq_hw_ctx *hctx;
 815         int i;
 816
 817         queue_for_each_hw_ctx(q, hctx, i) {
 818                 if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
 819                         continue;
 820
 821                 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
 822                 preempt_disable();
 823                 blk_mq_run_hw_queue(hctx, async);
 824                 preempt_enable();
 825         }
 826 }
 827 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
 828
 829 static void blk_mq_run_work_fn(struct work_struct *work)
 830 {
 831         struct blk_mq_hw_ctx *hctx;
 832
 833         hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
 834
 835         __blk_mq_run_hw_queue(hctx);
 836 }
 837
 838 static void blk_mq_delay_work_fn(struct work_struct *work)
 839 {
 840         struct blk_mq_hw_ctx *hctx;
 841
 842         hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
 843
 844         if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
 845                 __blk_mq_run_hw_queue(hctx);
 846 }
 847
 848 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 849 {
 850         unsigned long tmo = msecs_to_jiffies(msecs);
 851
 852         if (hctx->queue->nr_hw_queues == 1)
 853                 kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
 854         else {
 855                 unsigned int cpu;
 856
 857                 cpu = blk_mq_hctx_next_cpu(hctx);
 858                 kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
 859         }
 860 }
 861 EXPORT_SYMBOL(blk_mq_delay_queue);
 862
 863 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
 864                                     struct request *rq, bool at_head)
 865 {
 866         struct blk_mq_ctx *ctx = rq->mq_ctx;
 867
 868         trace_block_rq_insert(hctx->queue, rq);
 869
 870         if (at_head)
 871                 list_add(&rq->queuelist, &ctx->rq_list);
 872         else
 873                 list_add_tail(&rq->queuelist, &ctx->rq_list);
 874
 875         blk_mq_hctx_mark_pending(hctx, ctx);
 876
 877         /*
 878          * We do this early, to ensure we are on the right CPU.
 879          */
 880         blk_add_timer(rq);
 881 }
 882
 883 void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
 884                 bool async)
 885 {
 886         struct request_queue *q = rq->q;
 887         struct blk_mq_hw_ctx *hctx;
 888         struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx;
 889
 890         current_ctx = blk_mq_get_ctx(q);
 891         if (!cpu_online(ctx->cpu))
 892                 rq->mq_ctx = ctx = current_ctx;
 893
 894         hctx = q->mq_ops->map_queue(q, ctx->cpu);
 895
 896         if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA) &&
 897             !(rq->cmd_flags & (REQ_FLUSH_SEQ))) {
 898                 blk_insert_flush(rq);
 899         } else {
 900                 spin_lock(&ctx->lock);
 901                 __blk_mq_insert_request(hctx, rq, at_head);
 902                 spin_unlock(&ctx->lock);
 903         }
 904
 905         if (run_queue)
 906                 blk_mq_run_hw_queue(hctx, async);
 907
 908         blk_mq_put_ctx(current_ctx);
 909 }
 910
 911 static void blk_mq_insert_requests(struct request_queue *q,
 912                                      struct blk_mq_ctx *ctx,
 913                                      struct list_head *list,
 914                                      int depth,
 915                                      bool from_schedule)
 916
 917 {
 918         struct blk_mq_hw_ctx *hctx;
 919         struct blk_mq_ctx *current_ctx;
 920
 921         trace_block_unplug(q, depth, !from_schedule);
 922
 923         current_ctx = blk_mq_get_ctx(q);
 924
 925         if (!cpu_online(ctx->cpu))
 926                 ctx = current_ctx;
 927         hctx = q->mq_ops->map_queue(q, ctx->cpu);
 928
 929         /*
 930          * preemption doesn't flush plug list, so it's possible ctx->cpu is
 931          * offline now
 932          */
 933         spin_lock(&ctx->lock);
 934         while (!list_empty(list)) {
 935                 struct request *rq;
 936
 937                 rq = list_first_entry(list, struct request, queuelist);
 938                 list_del_init(&rq->queuelist);
 939                 rq->mq_ctx = ctx;
 940                 __blk_mq_insert_request(hctx, rq, false);
 941         }
 942         spin_unlock(&ctx->lock);
 943
 944         blk_mq_run_hw_queue(hctx, from_schedule);
 945         blk_mq_put_ctx(current_ctx);
 946 }
 947
 948 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
 949 {
 950         struct request *rqa = container_of(a, struct request, queuelist);
 951         struct request *rqb = container_of(b, struct request, queuelist);
 952
 953         return !(rqa->mq_ctx < rqb->mq_ctx ||
 954                  (rqa->mq_ctx == rqb->mq_ctx &&
 955                   blk_rq_pos(rqa) < blk_rq_pos(rqb)));
 956 }
 957
 958 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 959 {
 960         struct blk_mq_ctx *this_ctx;
 961         struct request_queue *this_q;
 962         struct request *rq;
 963         LIST_HEAD(list);
 964         LIST_HEAD(ctx_list);
 965         unsigned int depth;
 966
 967         list_splice_init(&plug->mq_list, &list);
 968
 969         list_sort(NULL, &list, plug_ctx_cmp);
 970
 971         this_q = NULL;
 972         this_ctx = NULL;
 973         depth = 0;
 974
 975         while (!list_empty(&list)) {
 976                 rq = list_entry_rq(list.next);
 977                 list_del_init(&rq->queuelist);
 978                 BUG_ON(!rq->q);
 979                 if (rq->mq_ctx != this_ctx) {
 980                         if (this_ctx) {
 981                                 blk_mq_insert_requests(this_q, this_ctx,
 982                                                         &ctx_list, depth,
 983                                                         from_schedule);
 984                         }
 985
 986                         this_ctx = rq->mq_ctx;
 987                         this_q = rq->q;
 988                         depth = 0;
 989                 }
 990
 991                 depth++;
 992                 list_add_tail(&rq->queuelist, &ctx_list);
 993         }
 994
 995         /*
 996          * If 'this_ctx' is set, we know we have entries to complete
 997          * on 'ctx_list'. Do those.
 998          */
 999         if (this_ctx) {
1000                 blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
1001                                        from_schedule);
1002         }
1003 }
1004
/*
 * Initialise @rq from @bio via init_request_from_bio() and then call
 * blk_account_io_start() on it. Used by the paths in blk_mq_make_request()
 * that perform both steps back to back.
 */
static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
{
        init_request_from_bio(rq, bio);
        blk_account_io_start(rq, 1);
}
1010
1011 static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1012 {
1013         struct blk_mq_hw_ctx *hctx;
1014         struct blk_mq_ctx *ctx;
1015         const int is_sync = rw_is_sync(bio->bi_rw);
1016         const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
1017         int rw = bio_data_dir(bio);
1018         struct request *rq;
1019         unsigned int use_plug, request_count = 0;
1020
1021         /*
1022          * If we have multiple hardware queues, just go directly to
1023          * one of those for sync IO.
1024          */
1025         use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync);
1026
1027         blk_queue_bounce(q, &bio);
1028
1029         if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1030                 bio_endio(bio, -EIO);
1031                 return;
1032         }
1033
1034         if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
1035                 return;
1036
1037         if (blk_mq_queue_enter(q)) {
1038                 bio_endio(bio, -EIO);
1039                 return;
1040         }
1041
1042         ctx = blk_mq_get_ctx(q);
1043         hctx = q->mq_ops->map_queue(q, ctx->cpu);
1044
1045         if (is_sync)
1046                 rw |= REQ_SYNC;
1047         trace_block_getrq(q, bio, rw);
1048         rq = __blk_mq_alloc_request(hctx, ctx, GFP_ATOMIC, false);
1049         if (likely(rq))
1050                 blk_mq_rq_ctx_init(q, ctx, rq, rw);
1051         else {
1052                 blk_mq_put_ctx(ctx);
1053                 trace_block_sleeprq(q, bio, rw);
1054                 rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
1055                                                         false);
1056                 ctx = rq->mq_ctx;
1057                 hctx = q->mq_ops->map_queue(q, ctx->cpu);
1058         }
1059
1060         hctx->queued++;
1061
1062         if (unlikely(is_flush_fua)) {
1063                 blk_mq_bio_to_request(rq, bio);
1064                 blk_insert_flush(rq);
1065                 goto run_queue;
1066         }
1067
1068         /*
1069          * A task plug currently exists. Since this is completely lockless,
1070          * utilize that to temporarily store requests until the task is
1071          * either done or scheduled away.
1072          */
1073         if (use_plug) {
1074                 struct blk_plug *plug = current->plug;
1075
1076                 if (plug) {
1077                         blk_mq_bio_to_request(rq, bio);
1078                         if (list_empty(&plug->mq_list))
1079                                 trace_block_plug(q);
1080                         else if (request_count >= BLK_MAX_REQUEST_COUNT) {
1081                                 blk_flush_plug_list(plug, false);
1082                                 trace_block_plug(q);
1083                         }
1084                         list_add_tail(&rq->queuelist, &plug->mq_list);
1085                         blk_mq_put_ctx(ctx);
1086                         return;
1087                 }
1088         }
1089
1090         if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) {
1091                 init_request_from_bio(rq, bio);
1092
1093                 spin_lock(&ctx->lock);
1094 insert_rq:
1095                 __blk_mq_insert_request(hctx, rq, false);
1096                 spin_unlock(&ctx->lock);
1097                 blk_account_io_start(rq, 1);
1098         } else {
1099                 spin_lock(&ctx->lock);
1100                 if (!blk_mq_attempt_merge(q, ctx, bio)) {
1101                         init_request_from_bio(rq, bio);
1102                         goto insert_rq;
1103                 }
1104
1105                 spin_unlock(&ctx->lock);
1106                 __blk_mq_free_request(hctx, ctx, rq);
1107         }
1108
1109
1110         /*
1111          * For a SYNC request, send it to the hardware immediately. For an
1112          * ASYNC request, just ensure that we run it later on. The latter
1113          * allows for merging opportunities and more efficient dispatching.
1114          */
1115 run_queue:
1116         blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);
1117         blk_mq_put_ctx(ctx);
1118 }
1119
1120 /*
1121  * Default mapping to a software queue, since we use one per CPU.
1122  */
1123 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
1124 {
1125         return q->queue_hw_ctx[q->mq_map[cpu]];
1126 }
1127 EXPORT_SYMBOL(blk_mq_map_queue);
1128
1129 struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *set,
1130                                                    unsigned int hctx_index)
1131 {
1132         return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL,
1133                                 set->numa_node);
1134 }
1135 EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
1136
/*
 * Release a hardware queue context with kfree(); the counterpart of
 * blk_mq_alloc_single_hw_queue(). @hctx_index is not used.
 */
void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
                                 unsigned int hctx_index)
{
        kfree(hctx);
}
EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
1143
1144 static void blk_mq_hctx_notify(void *data, unsigned long action,
1145                                unsigned int cpu)
1146 {
1147         struct blk_mq_hw_ctx *hctx = data;
1148         struct request_queue *q = hctx->queue;
1149         struct blk_mq_ctx *ctx;
1150         LIST_HEAD(tmp);
1151
1152         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1153                 return;
1154
1155         /*
1156          * Move ctx entries to new CPU, if this one is going away.
1157          */
1158         ctx = __blk_mq_get_ctx(q, cpu);
1159
1160         spin_lock(&ctx->lock);
1161         if (!list_empty(&ctx->rq_list)) {
1162                 list_splice_init(&ctx->rq_list, &tmp);
1163                 clear_bit(ctx->index_hw, hctx->ctx_map);
1164         }
1165         spin_unlock(&ctx->lock);
1166
1167         if (list_empty(&tmp))
1168                 return;
1169
1170         ctx = blk_mq_get_ctx(q);
1171         spin_lock(&ctx->lock);
1172
1173         while (!list_empty(&tmp)) {
1174                 struct request *rq;
1175
1176                 rq = list_first_entry(&tmp, struct request, queuelist);
1177                 rq->mq_ctx = ctx;
1178                 list_move_tail(&rq->queuelist, &ctx->rq_list);
1179         }
1180
1181         hctx = q->mq_ops->map_queue(q, ctx->cpu);
1182         blk_mq_hctx_mark_pending(hctx, ctx);
1183
1184         spin_unlock(&ctx->lock);
1185
1186         blk_mq_run_hw_queue(hctx, true);
1187         blk_mq_put_ctx(ctx);
1188 }
1189
1190 static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
1191                 struct blk_mq_tags *tags, unsigned int hctx_idx)
1192 {
1193         struct page *page;
1194
1195         if (tags->rqs && set->ops->exit_request) {
1196                 int i;
1197
1198                 for (i = 0; i < tags->nr_tags; i++) {
1199                         if (!tags->rqs[i])
1200                                 continue;
1201                         set->ops->exit_request(set->driver_data, tags->rqs[i],
1202                                                 hctx_idx, i);
1203                 }
1204         }
1205
1206         while (!list_empty(&tags->page_list)) {
1207                 page = list_first_entry(&tags->page_list, struct page, lru);
1208                 list_del_init(&page->lru);
1209                 __free_pages(page, page->private);
1210         }
1211
1212         kfree(tags->rqs);
1213
1214         blk_mq_free_tags(tags);
1215 }
1216
1217 static size_t order_to_size(unsigned int order)
1218 {
1219         return (size_t)PAGE_SIZE << order;
1220 }
1221
1222 static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
1223                 unsigned int hctx_idx)
1224 {
1225         struct blk_mq_tags *tags;
1226         unsigned int i, j, entries_per_page, max_order = 4;
1227         size_t rq_size, left;
1228
1229         tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
1230                                 set->numa_node);
1231         if (!tags)
1232                 return NULL;
1233
1234         INIT_LIST_HEAD(&tags->page_list);
1235
1236         tags->rqs = kmalloc_node(set->queue_depth * sizeof(struct request *),
1237                                         GFP_KERNEL, set->numa_node);
1238         if (!tags->rqs) {
1239                 blk_mq_free_tags(tags);
1240                 return NULL;
1241         }
1242
1243         /*
1244          * rq_size is the size of the request plus driver payload, rounded
1245          * to the cacheline size
1246          */
1247         rq_size = round_up(sizeof(struct request) + set->cmd_size,
1248                                 cache_line_size());
1249         left = rq_size * set->queue_depth;
1250
1251         for (i = 0; i < set->queue_depth; ) {
1252                 int this_order = max_order;
1253                 struct page *page;
1254                 int to_do;
1255                 void *p;
1256
1257                 while (left < order_to_size(this_order - 1) && this_order)
1258                         this_order--;
1259
1260                 do {
1261                         page = alloc_pages_node(set->numa_node, GFP_KERNEL,
1262                                                 this_order);
1263                         if (page)
1264                                 break;
1265                         if (!this_order--)
1266                                 break;
1267                         if (order_to_size(this_order) < rq_size)
1268                                 break;
1269                 } while (1);
1270
1271                 if (!page)
1272                         goto fail;
1273
1274                 page->private = this_order;
1275                 list_add_tail(&page->lru, &tags->page_list);
1276
1277                 p = page_address(page);
1278                 entries_per_page = order_to_size(this_order) / rq_size;
1279                 to_do = min(entries_per_page, set->queue_depth - i);
1280                 left -= to_do * rq_size;
1281                 for (j = 0; j < to_do; j++) {
1282                         tags->rqs[i] = p;
1283                         if (set->ops->init_request) {
1284                                 if (set->ops->init_request(set->driver_data,
1285                                                 tags->rqs[i], hctx_idx, i,
1286                                                 set->numa_node))
1287                                         goto fail;
1288                         }
1289
1290                         p += rq_size;
1291                         i++;
1292                 }
1293         }
1294
1295         return tags;
1296
1297 fail:
1298         pr_warn("%s: failed to allocate requests\n", __func__);
1299         blk_mq_free_rq_map(set, tags, hctx_idx);
1300         return NULL;
1301 }
1302
1303 static int blk_mq_init_hw_queues(struct request_queue *q,
1304                 struct blk_mq_tag_set *set)
1305 {
1306         struct blk_mq_hw_ctx *hctx;
1307         unsigned int i, j;
1308
1309         /*
1310          * Initialize hardware queues
1311          */
1312         queue_for_each_hw_ctx(q, hctx, i) {
1313                 unsigned int num_maps;
1314                 int node;
1315
1316                 node = hctx->numa_node;
1317                 if (node == NUMA_NO_NODE)
1318                         node = hctx->numa_node = set->numa_node;
1319
1320                 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
1321                 INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
1322                 spin_lock_init(&hctx->lock);
1323                 INIT_LIST_HEAD(&hctx->dispatch);
1324                 hctx->queue = q;
1325                 hctx->queue_num = i;
1326                 hctx->flags = set->flags;
1327                 hctx->cmd_size = set->cmd_size;
1328
1329                 blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
1330                                                 blk_mq_hctx_notify, hctx);
1331                 blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
1332
1333                 hctx->tags = set->tags[i];
1334
1335                 /*
1336                  * Allocate space for all possible cpus to avoid allocation in
1337                  * runtime
1338                  */
1339                 hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
1340                                                 GFP_KERNEL, node);
1341                 if (!hctx->ctxs)
1342                         break;
1343
1344                 num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
1345                 hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
1346                                                 GFP_KERNEL, node);
1347                 if (!hctx->ctx_map)
1348                         break;
1349
1350                 hctx->nr_ctx_map = num_maps;
1351                 hctx->nr_ctx = 0;
1352
1353                 if (set->ops->init_hctx &&
1354                     set->ops->init_hctx(hctx, set->driver_data, i))
1355                         break;
1356         }
1357
1358         if (i == q->nr_hw_queues)
1359                 return 0;
1360
1361         /*
1362          * Init failed
1363          */
1364         queue_for_each_hw_ctx(q, hctx, j) {
1365                 if (i == j)
1366                         break;
1367
1368                 if (set->ops->exit_hctx)
1369                         set->ops->exit_hctx(hctx, j);
1370
1371                 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1372                 kfree(hctx->ctxs);
1373                 kfree(hctx->ctx_map);
1374         }
1375
1376         return 1;
1377 }
1378
1379 static void blk_mq_init_cpu_queues(struct request_queue *q,
1380                                    unsigned int nr_hw_queues)
1381 {
1382         unsigned int i;
1383
1384         for_each_possible_cpu(i) {
1385                 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
1386                 struct blk_mq_hw_ctx *hctx;
1387
1388                 memset(__ctx, 0, sizeof(*__ctx));
1389                 __ctx->cpu = i;
1390                 spin_lock_init(&__ctx->lock);
1391                 INIT_LIST_HEAD(&__ctx->rq_list);
1392                 __ctx->queue = q;
1393
1394                 /* If the cpu isn't online, the cpu is mapped to first hctx */
1395                 if (!cpu_online(i))
1396                         continue;
1397
1398                 hctx = q->mq_ops->map_queue(q, i);
1399                 cpumask_set_cpu(i, hctx->cpumask);
1400                 hctx->nr_ctx++;
1401
1402                 /*
1403                  * Set local node, IFF we have more than one hw queue. If
1404                  * not, we remain on the home node of the device
1405                  */
1406                 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
1407                         hctx->numa_node = cpu_to_node(i);
1408         }
1409 }
1410
1411 static void blk_mq_map_swqueue(struct request_queue *q)
1412 {
1413         unsigned int i;
1414         struct blk_mq_hw_ctx *hctx;
1415         struct blk_mq_ctx *ctx;
1416
1417         queue_for_each_hw_ctx(q, hctx, i) {
1418                 cpumask_clear(hctx->cpumask);
1419                 hctx->nr_ctx = 0;
1420         }
1421
1422         /*
1423          * Map software to hardware queues
1424          */
1425         queue_for_each_ctx(q, ctx, i) {
1426                 /* If the cpu isn't online, the cpu is mapped to first hctx */
1427                 if (!cpu_online(i))
1428                         continue;
1429
1430                 hctx = q->mq_ops->map_queue(q, i);
1431                 cpumask_set_cpu(i, hctx->cpumask);
1432                 ctx->index_hw = hctx->nr_ctx;
1433                 hctx->ctxs[hctx->nr_ctx++] = ctx;
1434         }
1435
1436         queue_for_each_hw_ctx(q, hctx, i) {
1437                 hctx->next_cpu = cpumask_first(hctx->cpumask);
1438                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1439         }
1440 }
1441
1442 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
1443 {
1444         struct blk_mq_hw_ctx **hctxs;
1445         struct blk_mq_ctx *ctx;
1446         struct request_queue *q;
1447         int i;
1448
1449         ctx = alloc_percpu(struct blk_mq_ctx);
1450         if (!ctx)
1451                 return ERR_PTR(-ENOMEM);
1452
1453         hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
1454                         set->numa_node);
1455
1456         if (!hctxs)
1457                 goto err_percpu;
1458
1459         for (i = 0; i < set->nr_hw_queues; i++) {
1460                 hctxs[i] = set->ops->alloc_hctx(set, i);
1461                 if (!hctxs[i])
1462                         goto err_hctxs;
1463
1464                 if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
1465                         goto err_hctxs;
1466
1467                 hctxs[i]->numa_node = NUMA_NO_NODE;
1468                 hctxs[i]->queue_num = i;
1469         }
1470
1471         q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
1472         if (!q)
1473                 goto err_hctxs;
1474
1475         q->mq_map = blk_mq_make_queue_map(set);
1476         if (!q->mq_map)
1477                 goto err_map;
1478
1479         setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
1480         blk_queue_rq_timeout(q, 30000);
1481
1482         q->nr_queues = nr_cpu_ids;
1483         q->nr_hw_queues = set->nr_hw_queues;
1484
1485         q->queue_ctx = ctx;
1486         q->queue_hw_ctx = hctxs;
1487
1488         q->mq_ops = set->ops;
1489         q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
1490
1491         q->sg_reserved_size = INT_MAX;
1492
1493         blk_queue_make_request(q, blk_mq_make_request);
1494         blk_queue_rq_timed_out(q, blk_mq_rq_timed_out);
1495         if (set->timeout)
1496                 blk_queue_rq_timeout(q, set->timeout);
1497
1498         if (set->ops->complete)
1499                 blk_queue_softirq_done(q, set->ops->complete);
1500
1501         blk_mq_init_flush(q);
1502         blk_mq_init_cpu_queues(q, set->nr_hw_queues);
1503
1504         q->flush_rq = kzalloc(round_up(sizeof(struct request) +
1505                                 set->cmd_size, cache_line_size()),
1506                                 GFP_KERNEL);
1507         if (!q->flush_rq)
1508                 goto err_hw;
1509
1510         if (blk_mq_init_hw_queues(q, set))
1511                 goto err_flush_rq;
1512
1513         blk_mq_map_swqueue(q);
1514
1515         mutex_lock(&all_q_mutex);
1516         list_add_tail(&q->all_q_node, &all_q_list);
1517         mutex_unlock(&all_q_mutex);
1518
1519         return q;
1520
1521 err_flush_rq:
1522         kfree(q->flush_rq);
1523 err_hw:
1524         kfree(q->mq_map);
1525 err_map:
1526         blk_cleanup_queue(q);
1527 err_hctxs:
1528         for (i = 0; i < set->nr_hw_queues; i++) {
1529                 if (!hctxs[i])
1530                         break;
1531                 free_cpumask_var(hctxs[i]->cpumask);
1532                 set->ops->free_hctx(hctxs[i], i);
1533         }
1534         kfree(hctxs);
1535 err_percpu:
1536         free_percpu(ctx);
1537         return ERR_PTR(-ENOMEM);
1538 }
1539 EXPORT_SYMBOL(blk_mq_init_queue);
1540
1541 void blk_mq_free_queue(struct request_queue *q)
1542 {
1543         struct blk_mq_hw_ctx *hctx;
1544         int i;
1545
1546         queue_for_each_hw_ctx(q, hctx, i) {
1547                 kfree(hctx->ctx_map);
1548                 kfree(hctx->ctxs);
1549                 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1550                 if (q->mq_ops->exit_hctx)
1551                         q->mq_ops->exit_hctx(hctx, i);
1552                 free_cpumask_var(hctx->cpumask);
1553                 q->mq_ops->free_hctx(hctx, i);
1554         }
1555
1556         free_percpu(q->queue_ctx);
1557         kfree(q->queue_hw_ctx);
1558         kfree(q->mq_map);
1559
1560         q->queue_ctx = NULL;
1561         q->queue_hw_ctx = NULL;
1562         q->mq_map = NULL;
1563
1564         mutex_lock(&all_q_mutex);
1565         list_del_init(&q->all_q_node);
1566         mutex_unlock(&all_q_mutex);
1567 }
1568
/*
 * Recompute the CPU to hardware queue map of @q and rebuild the software to
 * hardware queue mapping, with the queue frozen for the duration. Called
 * for every queue on all_q_list by the CPU hotplug handler.
 */
static void blk_mq_queue_reinit(struct request_queue *q)
{
        blk_mq_freeze_queue(q);

        blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);

        /*
         * Only the mappings are redone here. FIXME: maybe we should also
         * change hctx numa_node according to the new topology (this involves
         * freeing and re-allocating memory; is it worth doing?)
         */

        blk_mq_map_swqueue(q);

        blk_mq_unfreeze_queue(q);
}
1586
1587 static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
1588                                       unsigned long action, void *hcpu)
1589 {
1590         struct request_queue *q;
1591
1592         /*
1593          * Before new mappings are established, hotadded cpu might already
1594          * start handling requests. This doesn't break anything as we map
1595          * offline CPUs to first hardware queue. We will re-init the queue
1596          * below to get optimal settings.
1597          */
1598         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
1599             action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
1600                 return NOTIFY_OK;
1601
1602         mutex_lock(&all_q_mutex);
1603         list_for_each_entry(q, &all_q_list, all_q_node)
1604                 blk_mq_queue_reinit(q);
1605         mutex_unlock(&all_q_mutex);
1606         return NOTIFY_OK;
1607 }
1608
1609 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
1610 {
1611         int i;
1612
1613         if (!set->nr_hw_queues)
1614                 return -EINVAL;
1615         if (!set->queue_depth || set->queue_depth > BLK_MQ_MAX_DEPTH)
1616                 return -EINVAL;
1617         if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
1618                 return -EINVAL;
1619
1620         if (!set->nr_hw_queues ||
1621             !set->ops->queue_rq || !set->ops->map_queue ||
1622             !set->ops->alloc_hctx || !set->ops->free_hctx)
1623                 return -EINVAL;
1624
1625
1626         set->tags = kmalloc_node(set->nr_hw_queues *
1627                                  sizeof(struct blk_mq_tags *),
1628                                  GFP_KERNEL, set->numa_node);
1629         if (!set->tags)
1630                 goto out;
1631
1632         for (i = 0; i < set->nr_hw_queues; i++) {
1633                 set->tags[i] = blk_mq_init_rq_map(set, i);
1634                 if (!set->tags[i])
1635                         goto out_unwind;
1636         }
1637
1638         return 0;
1639
1640 out_unwind:
1641         while (--i >= 0)
1642                 blk_mq_free_rq_map(set, set->tags[i], i);
1643 out:
1644         return -ENOMEM;
1645 }
1646 EXPORT_SYMBOL(blk_mq_alloc_tag_set);
1647
1648 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
1649 {
1650         int i;
1651
1652         for (i = 0; i < set->nr_hw_queues; i++)
1653                 blk_mq_free_rq_map(set, set->tags[i], i);
1654         kfree(set->tags);
1655 }
1656 EXPORT_SYMBOL(blk_mq_free_tag_set);
1657
/*
 * Take all_q_mutex and return with it held. The CPU hotplug handler
 * blk_mq_queue_reinit_notify() needs the same mutex to re-init queues, so
 * it blocks until blk_mq_enable_hotplug() is called.
 */
void blk_mq_disable_hotplug(void)
{
        mutex_lock(&all_q_mutex);
}
1662
/*
 * Release all_q_mutex; the counterpart of blk_mq_disable_hotplug(), which
 * acquired it.
 */
void blk_mq_enable_hotplug(void)
{
        mutex_unlock(&all_q_mutex);
}
1667
/*
 * Subsystem init: run blk_mq_cpu_init() and register
 * blk_mq_queue_reinit_notify() as a hot-CPU notifier with priority -10.
 * Always returns 0.
 */
static int __init blk_mq_init(void)
{
        blk_mq_cpu_init();

        /* Must be called after percpu_counter_hotcpu_callback() */
        hotcpu_notifier(blk_mq_queue_reinit_notify, -10);

        return 0;
}
subsys_initcall(blk_mq_init);