drivers/block/nvme-core.c

   1 /*
   2  * NVM Express device driver
   3  * Copyright (c) 2011-2014, Intel Corporation.
   4  *
   5  * This program is free software; you can redistribute it and/or modify it
   6  * under the terms and conditions of the GNU General Public License,
   7  * version 2, as published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  12  * more details.
  13  *
  14  * You should have received a copy of the GNU General Public License along with
  15  * this program; if not, write to the Free Software Foundation, Inc.,
  16  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
  17  */
  18
  19 #include <linux/nvme.h>
  20 #include <linux/bio.h>
  21 #include <linux/bitops.h>
  22 #include <linux/blkdev.h>
  23 #include <linux/cpu.h>
  24 #include <linux/delay.h>
  25 #include <linux/errno.h>
  26 #include <linux/fs.h>
  27 #include <linux/genhd.h>
  28 #include <linux/idr.h>
  29 #include <linux/init.h>
  30 #include <linux/interrupt.h>
  31 #include <linux/io.h>
  32 #include <linux/kdev_t.h>
  33 #include <linux/kthread.h>
  34 #include <linux/kernel.h>
  35 #include <linux/mm.h>
  36 #include <linux/module.h>
  37 #include <linux/moduleparam.h>
  38 #include <linux/pci.h>
  39 #include <linux/percpu.h>
  40 #include <linux/poison.h>
  41 #include <linux/ptrace.h>
  42 #include <linux/sched.h>
  43 #include <linux/slab.h>
  44 #include <linux/types.h>
  45 #include <scsi/sg.h>
  46 #include <asm-generic/io-64-nonatomic-lo-hi.h>
  47
  48 #define NVME_Q_DEPTH 1024
  49 #define SQ_SIZE(depth)          (depth * sizeof(struct nvme_command))
  50 #define CQ_SIZE(depth)          (depth * sizeof(struct nvme_completion))
  51 #define ADMIN_TIMEOUT   (60 * HZ)
  52
  53 static int nvme_major;
  54 module_param(nvme_major, int, 0);
  55
  56 static int use_threaded_interrupts;
  57 module_param(use_threaded_interrupts, int, 0);
  58
  59 static DEFINE_SPINLOCK(dev_list_lock);
  60 static LIST_HEAD(dev_list);
  61 static struct task_struct *nvme_thread;
  62 static struct workqueue_struct *nvme_workq;
  63
  64 static void nvme_reset_failed_dev(struct work_struct *ws);
  65
  66 struct async_cmd_info {
  67         struct kthread_work work;
  68         struct kthread_worker *worker;
  69         u32 result;
  70         int status;
  71         void *ctx;
  72 };
  73
  74 /*
  75  * An NVM Express queue.  Each device has at least two (one for admin
  76  * commands and one for I/O commands).
  77  */
  78 struct nvme_queue {
  79         struct rcu_head r_head;
  80         struct device *q_dmadev;
  81         struct nvme_dev *dev;
  82         char irqname[24];       /* nvme4294967295-65535\0 */
  83         spinlock_t q_lock;
  84         struct nvme_command *sq_cmds;
  85         volatile struct nvme_completion *cqes;
  86         dma_addr_t sq_dma_addr;
  87         dma_addr_t cq_dma_addr;
  88         wait_queue_head_t sq_full;
  89         wait_queue_t sq_cong_wait;
  90         struct bio_list sq_cong;
  91         u32 __iomem *q_db;
  92         u16 q_depth;
  93         u16 cq_vector;
  94         u16 sq_head;
  95         u16 sq_tail;
  96         u16 cq_head;
  97         u16 qid;
  98         u8 cq_phase;
  99         u8 cqe_seen;
 100         u8 q_suspended;
 101         cpumask_var_t cpu_mask;
 102         struct async_cmd_info cmdinfo;
 103         unsigned long cmdid_data[];
 104 };
 105
 106 /*
 107  * Check we didin't inadvertently grow the command struct
 108  */
 109 static inline void _nvme_check_size(void)
 110 {
 111         BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
 112         BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
 113         BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
 114         BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
 115         BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
 116         BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
 117         BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
 118         BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
 119         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
 120         BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
 121         BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
 122         BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
 123 }
 124
 125 typedef void (*nvme_completion_fn)(struct nvme_dev *, void *,
 126                                                 struct nvme_completion *);
 127
 128 struct nvme_cmd_info {
 129         nvme_completion_fn fn;
 130         void *ctx;
 131         unsigned long timeout;
 132         int aborted;
 133 };
 134
 135 static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
 136 {
 137         return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
 138 }
 139
 140 static unsigned nvme_queue_extra(int depth)
 141 {
 142         return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info));
 143 }
 144
 145 /**
 146  * alloc_cmdid() - Allocate a Command ID
 147  * @nvmeq: The queue that will be used for this command
 148  * @ctx: A pointer that will be passed to the handler
 149  * @handler: The function to call on completion
 150  *
 151  * Allocate a Command ID for a queue.  The data passed in will
 152  * be passed to the completion handler.  This is implemented by using
 153  * the bottom two bits of the ctx pointer to store the handler ID.
 154  * Passing in a pointer that's not 4-byte aligned will cause a BUG.
 155  * We can change this if it becomes a problem.
 156  *
 157  * May be called with local interrupts disabled and the q_lock held,
 158  * or with interrupts enabled and no locks held.
 159  */
 160 static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
 161                                 nvme_completion_fn handler, unsigned timeout)
 162 {
 163         int depth = nvmeq->q_depth - 1;
 164         struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
 165         int cmdid;
 166
 167         do {
 168                 cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
 169                 if (cmdid >= depth)
 170                         return -EBUSY;
 171         } while (test_and_set_bit(cmdid, nvmeq->cmdid_data));
 172
 173         info[cmdid].fn = handler;
 174         info[cmdid].ctx = ctx;
 175         info[cmdid].timeout = jiffies + timeout;
 176         info[cmdid].aborted = 0;
 177         return cmdid;
 178 }
 179
 180 static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
 181                                 nvme_completion_fn handler, unsigned timeout)
 182 {
 183         int cmdid;
 184         wait_event_killable(nvmeq->sq_full,
 185                 (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0);
 186         return (cmdid < 0) ? -EINTR : cmdid;
 187 }
 188
 189 /* Special values must be less than 0x1000 */
 190 #define CMD_CTX_BASE            ((void *)POISON_POINTER_DELTA)
 191 #define CMD_CTX_CANCELLED       (0x30C + CMD_CTX_BASE)
 192 #define CMD_CTX_COMPLETED       (0x310 + CMD_CTX_BASE)
 193 #define CMD_CTX_INVALID         (0x314 + CMD_CTX_BASE)
 194 #define CMD_CTX_FLUSH           (0x318 + CMD_CTX_BASE)
 195 #define CMD_CTX_ABORT           (0x31C + CMD_CTX_BASE)
 196
 197 static void special_completion(struct nvme_dev *dev, void *ctx,
 198                                                 struct nvme_completion *cqe)
 199 {
 200         if (ctx == CMD_CTX_CANCELLED)
 201                 return;
 202         if (ctx == CMD_CTX_FLUSH)
 203                 return;
 204         if (ctx == CMD_CTX_ABORT) {
 205                 ++dev->abort_limit;
 206                 return;
 207         }
 208         if (ctx == CMD_CTX_COMPLETED) {
 209                 dev_warn(&dev->pci_dev->dev,
 210                                 "completed id %d twice on queue %d\n",
 211                                 cqe->command_id, le16_to_cpup(&cqe->sq_id));
 212                 return;
 213         }
 214         if (ctx == CMD_CTX_INVALID) {
 215                 dev_warn(&dev->pci_dev->dev,
 216                                 "invalid id %d completed on queue %d\n",
 217                                 cqe->command_id, le16_to_cpup(&cqe->sq_id));
 218                 return;
 219         }
 220
 221         dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx);
 222 }
 223
 224 static void async_completion(struct nvme_dev *dev, void *ctx,
 225                                                 struct nvme_completion *cqe)
 226 {
 227         struct async_cmd_info *cmdinfo = ctx;
 228         cmdinfo->result = le32_to_cpup(&cqe->result);
 229         cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
 230         queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
 231 }
 232
 233 /*
 234  * Called with local interrupts disabled and the q_lock held.  May not sleep.
 235  */
 236 static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
 237                                                 nvme_completion_fn *fn)
 238 {
 239         void *ctx;
 240         struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
 241
 242         if (cmdid >= nvmeq->q_depth) {
 243                 *fn = special_completion;
 244                 return CMD_CTX_INVALID;
 245         }
 246         if (fn)
 247                 *fn = info[cmdid].fn;
 248         ctx = info[cmdid].ctx;
 249         info[cmdid].fn = special_completion;
 250         info[cmdid].ctx = CMD_CTX_COMPLETED;
 251         clear_bit(cmdid, nvmeq->cmdid_data);
 252         wake_up(&nvmeq->sq_full);
 253         return ctx;
 254 }
 255
 256 static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
 257                                                 nvme_completion_fn *fn)
 258 {
 259         void *ctx;
 260         struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
 261         if (fn)
 262                 *fn = info[cmdid].fn;
 263         ctx = info[cmdid].ctx;
 264         info[cmdid].fn = special_completion;
 265         info[cmdid].ctx = CMD_CTX_CANCELLED;
 266         return ctx;
 267 }
 268
 269 static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid)
 270 {
 271         return rcu_dereference_raw(dev->queues[qid]);
 272 }
 273
 274 static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
 275 {
 276         unsigned queue_id = get_cpu_var(*dev->io_queue);
 277         rcu_read_lock();
 278         return rcu_dereference(dev->queues[queue_id]);
 279 }
 280
 281 static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
 282 {
 283         rcu_read_unlock();
 284         put_cpu_var(nvmeq->dev->io_queue);
 285 }
 286
 287 static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx)
 288                                                         __acquires(RCU)
 289 {
 290         rcu_read_lock();
 291         return rcu_dereference(dev->queues[q_idx]);
 292 }
 293
 294 static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
 295 {
 296         rcu_read_unlock();
 297 }
 298
 299 /**
 300  * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 301  * @nvmeq: The queue to use
 302  * @cmd: The command to send
 303  *
 304  * Safe to use from interrupt context
 305  */
 306 static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
 307 {
 308         unsigned long flags;
 309         u16 tail;
 310         spin_lock_irqsave(&nvmeq->q_lock, flags);
 311         if (nvmeq->q_suspended) {
 312                 spin_unlock_irqrestore(&nvmeq->q_lock, flags);
 313                 return -EBUSY;
 314         }
 315         tail = nvmeq->sq_tail;
 316         memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
 317         if (++tail == nvmeq->q_depth)
 318                 tail = 0;
 319         writel(tail, nvmeq->q_db);
 320         nvmeq->sq_tail = tail;
 321         spin_unlock_irqrestore(&nvmeq->q_lock, flags);
 322
 323         return 0;
 324 }
 325
 326 static __le64 **iod_list(struct nvme_iod *iod)
 327 {
 328         return ((void *)iod) + iod->offset;
 329 }
 330
 331 /*
 332  * Will slightly overestimate the number of pages needed.  This is OK
 333  * as it only leads to a small amount of wasted memory for the lifetime of
 334  * the I/O.
 335  */
 336 static int nvme_npages(unsigned size)
 337 {
 338         unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE);
 339         return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
 340 }
 341
 342 static struct nvme_iod *
 343 nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
 344 {
 345         struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
 346                                 sizeof(__le64 *) * nvme_npages(nbytes) +
 347                                 sizeof(struct scatterlist) * nseg, gfp);
 348
 349         if (iod) {
 350                 iod->offset = offsetof(struct nvme_iod, sg[nseg]);
 351                 iod->npages = -1;
 352                 iod->length = nbytes;
 353                 iod->nents = 0;
 354                 iod->start_time = jiffies;
 355         }
 356
 357         return iod;
 358 }
 359
 360 void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 361 {
 362         const int last_prp = PAGE_SIZE / 8 - 1;
 363         int i;
 364         __le64 **list = iod_list(iod);
 365         dma_addr_t prp_dma = iod->first_dma;
 366
 367         if (iod->npages == 0)
 368                 dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
 369         for (i = 0; i < iod->npages; i++) {
 370                 __le64 *prp_list = list[i];
 371                 dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
 372                 dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
 373                 prp_dma = next_prp_dma;
 374         }
 375         kfree(iod);
 376 }
 377
 378 static void nvme_start_io_acct(struct bio *bio)
 379 {
 380         struct gendisk *disk = bio->bi_bdev->bd_disk;
 381         const int rw = bio_data_dir(bio);
 382         int cpu = part_stat_lock();
 383         part_round_stats(cpu, &disk->part0);
 384         part_stat_inc(cpu, &disk->part0, ios[rw]);
 385         part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio));
 386         part_inc_in_flight(&disk->part0, rw);
 387         part_stat_unlock();
 388 }
 389
 390 static void nvme_end_io_acct(struct bio *bio, unsigned long start_time)
 391 {
 392         struct gendisk *disk = bio->bi_bdev->bd_disk;
 393         const int rw = bio_data_dir(bio);
 394         unsigned long duration = jiffies - start_time;
 395         int cpu = part_stat_lock();
 396         part_stat_add(cpu, &disk->part0, ticks[rw], duration);
 397         part_round_stats(cpu, &disk->part0);
 398         part_dec_in_flight(&disk->part0, rw);
 399         part_stat_unlock();
 400 }
 401
 402 static void bio_completion(struct nvme_dev *dev, void *ctx,
 403                                                 struct nvme_completion *cqe)
 404 {
 405         struct nvme_iod *iod = ctx;
 406         struct bio *bio = iod->private;
 407         u16 status = le16_to_cpup(&cqe->status) >> 1;
 408
 409         if (iod->nents) {
 410                 dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
 411                         bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 412                 nvme_end_io_acct(bio, iod->start_time);
 413         }
 414         nvme_free_iod(dev, iod);
 415         if (status)
 416                 bio_endio(bio, -EIO);
 417         else
 418                 bio_endio(bio, 0);
 419 }
 420
 421 /* length is in bytes.  gfp flags indicates whether we may sleep. */
 422 int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd,
 423                         struct nvme_iod *iod, int total_len, gfp_t gfp)
 424 {
 425         struct dma_pool *pool;
 426         int length = total_len;
 427         struct scatterlist *sg = iod->sg;
 428         int dma_len = sg_dma_len(sg);
 429         u64 dma_addr = sg_dma_address(sg);
 430         int offset = offset_in_page(dma_addr);
 431         __le64 *prp_list;
 432         __le64 **list = iod_list(iod);
 433         dma_addr_t prp_dma;
 434         int nprps, i;
 435
 436         cmd->prp1 = cpu_to_le64(dma_addr);
 437         length -= (PAGE_SIZE - offset);
 438         if (length <= 0)
 439                 return total_len;
 440
 441         dma_len -= (PAGE_SIZE - offset);
 442         if (dma_len) {
 443                 dma_addr += (PAGE_SIZE - offset);
 444         } else {
 445                 sg = sg_next(sg);
 446                 dma_addr = sg_dma_address(sg);
 447                 dma_len = sg_dma_len(sg);
 448         }
 449
 450         if (length <= PAGE_SIZE) {
 451                 cmd->prp2 = cpu_to_le64(dma_addr);
 452                 return total_len;
 453         }
 454
 455         nprps = DIV_ROUND_UP(length, PAGE_SIZE);
 456         if (nprps <= (256 / 8)) {
 457                 pool = dev->prp_small_pool;
 458                 iod->npages = 0;
 459         } else {
 460                 pool = dev->prp_page_pool;
 461                 iod->npages = 1;
 462         }
 463
 464         prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
 465         if (!prp_list) {
 466                 cmd->prp2 = cpu_to_le64(dma_addr);
 467                 iod->npages = -1;
 468                 return (total_len - length) + PAGE_SIZE;
 469         }
 470         list[0] = prp_list;
 471         iod->first_dma = prp_dma;
 472         cmd->prp2 = cpu_to_le64(prp_dma);
 473         i = 0;
 474         for (;;) {
 475                 if (i == PAGE_SIZE / 8) {
 476                         __le64 *old_prp_list = prp_list;
 477                         prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
 478                         if (!prp_list)
 479                                 return total_len - length;
 480                         list[iod->npages++] = prp_list;
 481                         prp_list[0] = old_prp_list[i - 1];
 482                         old_prp_list[i - 1] = cpu_to_le64(prp_dma);
 483                         i = 1;
 484                 }
 485                 prp_list[i++] = cpu_to_le64(dma_addr);
 486                 dma_len -= PAGE_SIZE;
 487                 dma_addr += PAGE_SIZE;
 488                 length -= PAGE_SIZE;
 489                 if (length <= 0)
 490                         break;
 491                 if (dma_len > 0)
 492                         continue;
 493                 BUG_ON(dma_len < 0);
 494                 sg = sg_next(sg);
 495                 dma_addr = sg_dma_address(sg);
 496                 dma_len = sg_dma_len(sg);
 497         }
 498
 499         return total_len;
 500 }
 501
 502 static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq,
 503                                  int len)
 504 {
 505         struct bio *split = bio_split(bio, len >> 9, GFP_ATOMIC, NULL);
 506         if (!split)
 507                 return -ENOMEM;
 508
 509         bio_chain(split, bio);
 510
 511         if (bio_list_empty(&nvmeq->sq_cong))
 512                 add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
 513         bio_list_add(&nvmeq->sq_cong, split);
 514         bio_list_add(&nvmeq->sq_cong, bio);
 515
 516         return 0;
 517 }
 518
 519 /* NVMe scatterlists require no holes in the virtual address */
 520 #define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2)   ((vec2)->bv_offset || \
 521                         (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))
 522
 523 static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod,
 524                 struct bio *bio, enum dma_data_direction dma_dir, int psegs)
 525 {
 526         struct bio_vec bvec, bvprv;
 527         struct bvec_iter iter;
 528         struct scatterlist *sg = NULL;
 529         int length = 0, nsegs = 0, split_len = bio->bi_iter.bi_size;
 530         int first = 1;
 531
 532         if (nvmeq->dev->stripe_size)
 533                 split_len = nvmeq->dev->stripe_size -
 534                         ((bio->bi_iter.bi_sector << 9) &
 535                          (nvmeq->dev->stripe_size - 1));
 536
 537         sg_init_table(iod->sg, psegs);
 538         bio_for_each_segment(bvec, bio, iter) {
 539                 if (!first && BIOVEC_PHYS_MERGEABLE(&bvprv, &bvec)) {
 540                         sg->length += bvec.bv_len;
 541                 } else {
 542                         if (!first && BIOVEC_NOT_VIRT_MERGEABLE(&bvprv, &bvec))
 543                                 return nvme_split_and_submit(bio, nvmeq,
 544                                                              length);
 545
 546                         sg = sg ? sg + 1 : iod->sg;
 547                         sg_set_page(sg, bvec.bv_page,
 548                                     bvec.bv_len, bvec.bv_offset);
 549                         nsegs++;
 550                 }
 551
 552                 if (split_len - length < bvec.bv_len)
 553                         return nvme_split_and_submit(bio, nvmeq, split_len);
 554                 length += bvec.bv_len;
 555                 bvprv = bvec;
 556                 first = 0;
 557         }
 558         iod->nents = nsegs;
 559         sg_mark_end(sg);
 560         if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0)
 561                 return -ENOMEM;
 562
 563         BUG_ON(length != bio->bi_iter.bi_size);
 564         return length;
 565 }
 566
 567 /*
 568  * We reuse the small pool to allocate the 16-byte range here as it is not
 569  * worth having a special pool for these or additional cases to handle freeing
 570  * the iod.
 571  */
 572 static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
 573                 struct bio *bio, struct nvme_iod *iod, int cmdid)
 574 {
 575         struct nvme_dsm_range *range;
 576         struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
 577
 578         range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
 579                                                         &iod->first_dma);
 580         if (!range)
 581                 return -ENOMEM;
 582
 583         iod_list(iod)[0] = (__le64 *)range;
 584         iod->npages = 0;
 585
 586         range->cattr = cpu_to_le32(0);
 587         range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift);
 588         range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));
 589
 590         memset(cmnd, 0, sizeof(*cmnd));
 591         cmnd->dsm.opcode = nvme_cmd_dsm;
 592         cmnd->dsm.command_id = cmdid;
 593         cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
 594         cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
 595         cmnd->dsm.nr = 0;
 596         cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
 597
 598         if (++nvmeq->sq_tail == nvmeq->q_depth)
 599                 nvmeq->sq_tail = 0;
 600         writel(nvmeq->sq_tail, nvmeq->q_db);
 601
 602         return 0;
 603 }
 604
 605 static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
 606                                                                 int cmdid)
 607 {
 608         struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
 609
 610         memset(cmnd, 0, sizeof(*cmnd));
 611         cmnd->common.opcode = nvme_cmd_flush;
 612         cmnd->common.command_id = cmdid;
 613         cmnd->common.nsid = cpu_to_le32(ns->ns_id);
 614
 615         if (++nvmeq->sq_tail == nvmeq->q_depth)
 616                 nvmeq->sq_tail = 0;
 617         writel(nvmeq->sq_tail, nvmeq->q_db);
 618
 619         return 0;
 620 }
 621
 622 int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns)
 623 {
 624         int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH,
 625                                         special_completion, NVME_IO_TIMEOUT);
 626         if (unlikely(cmdid < 0))
 627                 return cmdid;
 628
 629         return nvme_submit_flush(nvmeq, ns, cmdid);
 630 }
 631
 632 /*
 633  * Called with local interrupts disabled and the q_lock held.  May not sleep.
 634  */
 635 static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
 636                                                                 struct bio *bio)
 637 {
 638         struct nvme_command *cmnd;
 639         struct nvme_iod *iod;
 640         enum dma_data_direction dma_dir;
 641         int cmdid, length, result;
 642         u16 control;
 643         u32 dsmgmt;
 644         int psegs = bio_phys_segments(ns->queue, bio);
 645
 646         if ((bio->bi_rw & REQ_FLUSH) && psegs) {
 647                 result = nvme_submit_flush_data(nvmeq, ns);
 648                 if (result)
 649                         return result;
 650         }
 651
 652         result = -ENOMEM;
 653         iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC);
 654         if (!iod)
 655                 goto nomem;
 656         iod->private = bio;
 657
 658         result = -EBUSY;
 659         cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
 660         if (unlikely(cmdid < 0))
 661                 goto free_iod;
 662
 663         if (bio->bi_rw & REQ_DISCARD) {
 664                 result = nvme_submit_discard(nvmeq, ns, bio, iod, cmdid);
 665                 if (result)
 666                         goto free_cmdid;
 667                 return result;
 668         }
 669         if ((bio->bi_rw & REQ_FLUSH) && !psegs)
 670                 return nvme_submit_flush(nvmeq, ns, cmdid);
 671
 672         control = 0;
 673         if (bio->bi_rw & REQ_FUA)
 674                 control |= NVME_RW_FUA;
 675         if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
 676                 control |= NVME_RW_LR;
 677
 678         dsmgmt = 0;
 679         if (bio->bi_rw & REQ_RAHEAD)
 680                 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
 681
 682         cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
 683
 684         memset(cmnd, 0, sizeof(*cmnd));
 685         if (bio_data_dir(bio)) {
 686                 cmnd->rw.opcode = nvme_cmd_write;
 687                 dma_dir = DMA_TO_DEVICE;
 688         } else {
 689                 cmnd->rw.opcode = nvme_cmd_read;
 690                 dma_dir = DMA_FROM_DEVICE;
 691         }
 692
 693         result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs);
 694         if (result <= 0)
 695                 goto free_cmdid;
 696         length = result;
 697
 698         cmnd->rw.command_id = cmdid;
 699         cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
 700         length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length,
 701                                                                 GFP_ATOMIC);
 702         cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));
 703         cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1);
 704         cmnd->rw.control = cpu_to_le16(control);
 705         cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
 706
 707         nvme_start_io_acct(bio);
 708         if (++nvmeq->sq_tail == nvmeq->q_depth)
 709                 nvmeq->sq_tail = 0;
 710         writel(nvmeq->sq_tail, nvmeq->q_db);
 711
 712         return 0;
 713
 714  free_cmdid:
 715         free_cmdid(nvmeq, cmdid, NULL);
 716  free_iod:
 717         nvme_free_iod(nvmeq->dev, iod);
 718  nomem:
 719         return result;
 720 }
 721
 722 static int nvme_process_cq(struct nvme_queue *nvmeq)
 723 {
 724         u16 head, phase;
 725
 726         head = nvmeq->cq_head;
 727         phase = nvmeq->cq_phase;
 728
 729         for (;;) {
 730                 void *ctx;
 731                 nvme_completion_fn fn;
 732                 struct nvme_completion cqe = nvmeq->cqes[head];
 733                 if ((le16_to_cpu(cqe.status) & 1) != phase)
 734                         break;
 735                 nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
 736                 if (++head == nvmeq->q_depth) {
 737                         head = 0;
 738                         phase = !phase;
 739                 }
 740
 741                 ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
 742                 fn(nvmeq->dev, ctx, &cqe);
 743         }
 744
 745         /* If the controller ignores the cq head doorbell and continuously
 746          * writes to the queue, it is theoretically possible to wrap around
 747          * the queue twice and mistakenly return IRQ_NONE.  Linux only
 748          * requires that 0.1% of your interrupts are handled, so this isn't
 749          * a big problem.
 750          */
 751         if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
 752                 return 0;
 753
 754         writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
 755         nvmeq->cq_head = head;
 756         nvmeq->cq_phase = phase;
 757
 758         nvmeq->cqe_seen = 1;
 759         return 1;
 760 }
 761
 762 static void nvme_make_request(struct request_queue *q, struct bio *bio)
 763 {
 764         struct nvme_ns *ns = q->queuedata;
 765         struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
 766         int result = -EBUSY;
 767
 768         if (!nvmeq) {
 769                 put_nvmeq(NULL);
 770                 bio_endio(bio, -EIO);
 771                 return;
 772         }
 773
 774         spin_lock_irq(&nvmeq->q_lock);
 775         if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong))
 776                 result = nvme_submit_bio_queue(nvmeq, ns, bio);
 777         if (unlikely(result)) {
 778                 if (bio_list_empty(&nvmeq->sq_cong))
 779                         add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
 780                 bio_list_add(&nvmeq->sq_cong, bio);
 781         }
 782
 783         nvme_process_cq(nvmeq);
 784         spin_unlock_irq(&nvmeq->q_lock);
 785         put_nvmeq(nvmeq);
 786 }
 787
 788 static irqreturn_t nvme_irq(int irq, void *data)
 789 {
 790         irqreturn_t result;
 791         struct nvme_queue *nvmeq = data;
 792         spin_lock(&nvmeq->q_lock);
 793         nvme_process_cq(nvmeq);
 794         result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
 795         nvmeq->cqe_seen = 0;
 796         spin_unlock(&nvmeq->q_lock);
 797         return result;
 798 }
 799
 800 static irqreturn_t nvme_irq_check(int irq, void *data)
 801 {
 802         struct nvme_queue *nvmeq = data;
 803         struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
 804         if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
 805                 return IRQ_NONE;
 806         return IRQ_WAKE_THREAD;
 807 }
 808
 809 static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid)
 810 {
 811         spin_lock_irq(&nvmeq->q_lock);
 812         cancel_cmdid(nvmeq, cmdid, NULL);
 813         spin_unlock_irq(&nvmeq->q_lock);
 814 }
 815
 816 struct sync_cmd_info {
 817         struct task_struct *task;
 818         u32 result;
 819         int status;
 820 };
 821
 822 static void sync_completion(struct nvme_dev *dev, void *ctx,
 823                                                 struct nvme_completion *cqe)
 824 {
 825         struct sync_cmd_info *cmdinfo = ctx;
 826         cmdinfo->result = le32_to_cpup(&cqe->result);
 827         cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
 828         wake_up_process(cmdinfo->task);
 829 }
 830
 831 /*
 832  * Returns 0 on success.  If the result is negative, it's a Linux error code;
 833  * if the result is positive, it's an NVM Express status code
 834  */
 835 static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx,
 836                                                 struct nvme_command *cmd,
 837                                                 u32 *result, unsigned timeout)
 838 {
 839         int cmdid, ret;
 840         struct sync_cmd_info cmdinfo;
 841         struct nvme_queue *nvmeq;
 842
 843         nvmeq = lock_nvmeq(dev, q_idx);
 844         if (!nvmeq) {
 845                 unlock_nvmeq(nvmeq);
 846                 return -ENODEV;
 847         }
 848
 849         cmdinfo.task = current;
 850         cmdinfo.status = -EINTR;
 851
 852         cmdid = alloc_cmdid(nvmeq, &cmdinfo, sync_completion, timeout);
 853         if (cmdid < 0) {
 854                 unlock_nvmeq(nvmeq);
 855                 return cmdid;
 856         }
 857         cmd->common.command_id = cmdid;
 858
 859         set_current_state(TASK_KILLABLE);
 860         ret = nvme_submit_cmd(nvmeq, cmd);
 861         if (ret) {
 862                 free_cmdid(nvmeq, cmdid, NULL);
 863                 unlock_nvmeq(nvmeq);
 864                 set_current_state(TASK_RUNNING);
 865                 return ret;
 866         }
 867         unlock_nvmeq(nvmeq);
 868         schedule_timeout(timeout);
 869
 870         if (cmdinfo.status == -EINTR) {
 871                 nvmeq = lock_nvmeq(dev, q_idx);
 872                 if (nvmeq)
 873                         nvme_abort_command(nvmeq, cmdid);
 874                 unlock_nvmeq(nvmeq);
 875                 return -EINTR;
 876         }
 877
 878         if (result)
 879                 *result = cmdinfo.result;
 880
 881         return cmdinfo.status;
 882 }
 883
 884 static int nvme_submit_async_cmd(struct nvme_queue *nvmeq,
 885                         struct nvme_command *cmd,
 886                         struct async_cmd_info *cmdinfo, unsigned timeout)
 887 {
 888         int cmdid;
 889
 890         cmdid = alloc_cmdid_killable(nvmeq, cmdinfo, async_completion, timeout);
 891         if (cmdid < 0)
 892                 return cmdid;
 893         cmdinfo->status = -EINTR;
 894         cmd->common.command_id = cmdid;
 895         return nvme_submit_cmd(nvmeq, cmd);
 896 }
 897
 898 int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
 899                                                                 u32 *result)
 900 {
 901         return nvme_submit_sync_cmd(dev, 0, cmd, result, ADMIN_TIMEOUT);
 902 }
 903
 904 int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
 905                                                                 u32 *result)
 906 {
 907         return nvme_submit_sync_cmd(dev, smp_processor_id() + 1, cmd, result,
 908                                                         NVME_IO_TIMEOUT);
 909 }
 910
 911 static int nvme_submit_admin_cmd_async(struct nvme_dev *dev,
 912                 struct nvme_command *cmd, struct async_cmd_info *cmdinfo)
 913 {
 914         return nvme_submit_async_cmd(raw_nvmeq(dev, 0), cmd, cmdinfo,
 915                                                                 ADMIN_TIMEOUT);
 916 }
 917
 918 static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
 919 {
 920         int status;
 921         struct nvme_command c;
 922
 923         memset(&c, 0, sizeof(c));
 924         c.delete_queue.opcode = opcode;
 925         c.delete_queue.qid = cpu_to_le16(id);
 926
 927         status = nvme_submit_admin_cmd(dev, &c, NULL);
 928         if (status)
 929                 return -EIO;
 930         return 0;
 931 }
 932
 933 static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
 934                                                 struct nvme_queue *nvmeq)
 935 {
 936         int status;
 937         struct nvme_command c;
 938         int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
 939
 940         memset(&c, 0, sizeof(c));
 941         c.create_cq.opcode = nvme_admin_create_cq;
 942         c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
 943         c.create_cq.cqid = cpu_to_le16(qid);
 944         c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
 945         c.create_cq.cq_flags = cpu_to_le16(flags);
 946         c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
 947
 948         status = nvme_submit_admin_cmd(dev, &c, NULL);
 949         if (status)
 950                 return -EIO;
 951         return 0;
 952 }
 953
 954 static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
 955                                                 struct nvme_queue *nvmeq)
 956 {
 957         int status;
 958         struct nvme_command c;
 959         int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
 960
 961         memset(&c, 0, sizeof(c));
 962         c.create_sq.opcode = nvme_admin_create_sq;
 963         c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
 964         c.create_sq.sqid = cpu_to_le16(qid);
 965         c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
 966         c.create_sq.sq_flags = cpu_to_le16(flags);
 967         c.create_sq.cqid = cpu_to_le16(qid);
 968
 969         status = nvme_submit_admin_cmd(dev, &c, NULL);
 970         if (status)
 971                 return -EIO;
 972         return 0;
 973 }
 974
 975 static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
 976 {
 977         return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
 978 }
 979
 980 static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
 981 {
 982         return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
 983 }
 984
 985 int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
 986                                                         dma_addr_t dma_addr)
 987 {
 988         struct nvme_command c;
 989
 990         memset(&c, 0, sizeof(c));
 991         c.identify.opcode = nvme_admin_identify;
 992         c.identify.nsid = cpu_to_le32(nsid);
 993         c.identify.prp1 = cpu_to_le64(dma_addr);
 994         c.identify.cns = cpu_to_le32(cns);
 995
 996         return nvme_submit_admin_cmd(dev, &c, NULL);
 997 }
 998
 999 int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
1000                                         dma_addr_t dma_addr, u32 *result)
1001 {
1002         struct nvme_command c;
1003
1004         memset(&c, 0, sizeof(c));
1005         c.features.opcode = nvme_admin_get_features;
1006         c.features.nsid = cpu_to_le32(nsid);
1007         c.features.prp1 = cpu_to_le64(dma_addr);
1008         c.features.fid = cpu_to_le32(fid);
1009
1010         return nvme_submit_admin_cmd(dev, &c, result);
1011 }
1012
1013 int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
1014                                         dma_addr_t dma_addr, u32 *result)
1015 {
1016         struct nvme_command c;
1017
1018         memset(&c, 0, sizeof(c));
1019         c.features.opcode = nvme_admin_set_features;
1020         c.features.prp1 = cpu_to_le64(dma_addr);
1021         c.features.fid = cpu_to_le32(fid);
1022         c.features.dword11 = cpu_to_le32(dword11);
1023
1024         return nvme_submit_admin_cmd(dev, &c, result);
1025 }
1026
1027 /**
1028  * nvme_abort_cmd - Attempt aborting a command
1029  * @cmdid: Command id of a timed out IO
1030  * @queue: The queue with timed out IO
1031  *
1032  * Schedule controller reset if the command was already aborted once before and
1033  * still hasn't been returned to the driver, or if this is the admin queue.
1034  */
1035 static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq)
1036 {
1037         int a_cmdid;
1038         struct nvme_command cmd;
1039         struct nvme_dev *dev = nvmeq->dev;
1040         struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
1041         struct nvme_queue *adminq;
1042
1043         if (!nvmeq->qid || info[cmdid].aborted) {
1044                 if (work_busy(&dev->reset_work))
1045                         return;
1046                 list_del_init(&dev->node);
1047                 dev_warn(&dev->pci_dev->dev,
1048                         "I/O %d QID %d timeout, reset controller\n", cmdid,
1049                                                                 nvmeq->qid);
1050                 PREPARE_WORK(&dev->reset_work, nvme_reset_failed_dev);
1051                 queue_work(nvme_workq, &dev->reset_work);
1052                 return;
1053         }
1054
1055         if (!dev->abort_limit)
1056                 return;
1057
1058         adminq = rcu_dereference(dev->queues[0]);
1059         a_cmdid = alloc_cmdid(adminq, CMD_CTX_ABORT, special_completion,
1060                                                                 ADMIN_TIMEOUT);
1061         if (a_cmdid < 0)
1062                 return;
1063
1064         memset(&cmd, 0, sizeof(cmd));
1065         cmd.abort.opcode = nvme_admin_abort_cmd;
1066         cmd.abort.cid = cmdid;
1067         cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
1068         cmd.abort.command_id = a_cmdid;
1069
1070         --dev->abort_limit;
1071         info[cmdid].aborted = 1;
1072         info[cmdid].timeout = jiffies + ADMIN_TIMEOUT;
1073
1074         dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid,
1075                                                         nvmeq->qid);
1076         nvme_submit_cmd(adminq, &cmd);
1077 }
1078
1079 /**
1080  * nvme_cancel_ios - Cancel outstanding I/Os
1081  * @queue: The queue to cancel I/Os on
1082  * @timeout: True to only cancel I/Os which have timed out
1083  */
1084 static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
1085 {
1086         int depth = nvmeq->q_depth - 1;
1087         struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
1088         unsigned long now = jiffies;
1089         int cmdid;
1090
1091         for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
1092                 void *ctx;
1093                 nvme_completion_fn fn;
1094                 static struct nvme_completion cqe = {
1095                         .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
1096                 };
1097
1098                 if (timeout && !time_after(now, info[cmdid].timeout))
1099                         continue;
1100                 if (info[cmdid].ctx == CMD_CTX_CANCELLED)
1101                         continue;
1102                 if (timeout && nvmeq->dev->initialized) {
1103                         nvme_abort_cmd(cmdid, nvmeq);
1104                         continue;
1105                 }
1106                 dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid,
1107                                                                 nvmeq->qid);
1108                 ctx = cancel_cmdid(nvmeq, cmdid, &fn);
1109                 fn(nvmeq->dev, ctx, &cqe);
1110         }
1111 }
1112
1113 static void nvme_free_queue(struct rcu_head *r)
1114 {
1115         struct nvme_queue *nvmeq = container_of(r, struct nvme_queue, r_head);
1116
1117         spin_lock_irq(&nvmeq->q_lock);
1118         while (bio_list_peek(&nvmeq->sq_cong)) {
1119                 struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
1120                 bio_endio(bio, -EIO);
1121         }
1122         spin_unlock_irq(&nvmeq->q_lock);
1123
1124         dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
1125                                 (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
1126         dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
1127                                         nvmeq->sq_cmds, nvmeq->sq_dma_addr);
1128         if (nvmeq->qid)
1129                 free_cpumask_var(nvmeq->cpu_mask);
1130         kfree(nvmeq);
1131 }
1132
1133 static void nvme_free_queues(struct nvme_dev *dev, int lowest)
1134 {
1135         int i;
1136
1137         for (i = dev->queue_count - 1; i >= lowest; i--) {
1138                 struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
1139                 rcu_assign_pointer(dev->queues[i], NULL);
1140                 call_rcu(&nvmeq->r_head, nvme_free_queue);
1141                 dev->queue_count--;
1142         }
1143 }
1144
1145 /**
1146  * nvme_suspend_queue - put queue into suspended state
1147  * @nvmeq - queue to suspend
1148  *
1149  * Returns 1 if already suspended, 0 otherwise.
1150  */
1151 static int nvme_suspend_queue(struct nvme_queue *nvmeq)
1152 {
1153         int vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;
1154
1155         spin_lock_irq(&nvmeq->q_lock);
1156         if (nvmeq->q_suspended) {
1157                 spin_unlock_irq(&nvmeq->q_lock);
1158                 return 1;
1159         }
1160         nvmeq->q_suspended = 1;
1161         nvmeq->dev->online_queues--;
1162         spin_unlock_irq(&nvmeq->q_lock);
1163
1164         irq_set_affinity_hint(vector, NULL);
1165         free_irq(vector, nvmeq);
1166
1167         return 0;
1168 }
1169
1170 static void nvme_clear_queue(struct nvme_queue *nvmeq)
1171 {
1172         spin_lock_irq(&nvmeq->q_lock);
1173         nvme_process_cq(nvmeq);
1174         nvme_cancel_ios(nvmeq, false);
1175         spin_unlock_irq(&nvmeq->q_lock);
1176 }
1177
1178 static void nvme_disable_queue(struct nvme_dev *dev, int qid)
1179 {
1180         struct nvme_queue *nvmeq = raw_nvmeq(dev, qid);
1181
1182         if (!nvmeq)
1183                 return;
1184         if (nvme_suspend_queue(nvmeq))
1185                 return;
1186
1187         /* Don't tell the adapter to delete the admin queue.
1188          * Don't tell a removed adapter to delete IO queues. */
1189         if (qid && readl(&dev->bar->csts) != -1) {
1190                 adapter_delete_sq(dev, qid);
1191                 adapter_delete_cq(dev, qid);
1192         }
1193         nvme_clear_queue(nvmeq);
1194 }
1195
1196 static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
1197                                                         int depth, int vector)
1198 {
1199         struct device *dmadev = &dev->pci_dev->dev;
1200         unsigned extra = nvme_queue_extra(depth);
1201         struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
1202         if (!nvmeq)
1203                 return NULL;
1204
1205         nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
1206                                         &nvmeq->cq_dma_addr, GFP_KERNEL);
1207         if (!nvmeq->cqes)
1208                 goto free_nvmeq;
1209         memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));
1210
1211         nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
1212                                         &nvmeq->sq_dma_addr, GFP_KERNEL);
1213         if (!nvmeq->sq_cmds)
1214                 goto free_cqdma;
1215
1216         if (qid && !zalloc_cpumask_var(&nvmeq->cpu_mask, GFP_KERNEL))
1217                 goto free_sqdma;
1218
1219         nvmeq->q_dmadev = dmadev;
1220         nvmeq->dev = dev;
1221         snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
1222                         dev->instance, qid);
1223         spin_lock_init(&nvmeq->q_lock);
1224         nvmeq->cq_head = 0;
1225         nvmeq->cq_phase = 1;
1226         init_waitqueue_head(&nvmeq->sq_full);
1227         init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
1228         bio_list_init(&nvmeq->sq_cong);
1229         nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
1230         nvmeq->q_depth = depth;
1231         nvmeq->cq_vector = vector;
1232         nvmeq->qid = qid;
1233         nvmeq->q_suspended = 1;
1234         dev->queue_count++;
1235         rcu_assign_pointer(dev->queues[qid], nvmeq);
1236
1237         return nvmeq;
1238
1239  free_sqdma:
1240         dma_free_coherent(dmadev, SQ_SIZE(depth), (void *)nvmeq->sq_cmds,
1241                                                         nvmeq->sq_dma_addr);
1242  free_cqdma:
1243         dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
1244                                                         nvmeq->cq_dma_addr);
1245  free_nvmeq:
1246         kfree(nvmeq);
1247         return NULL;
1248 }
1249
1250 static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
1251                                                         const char *name)
1252 {
1253         if (use_threaded_interrupts)
1254                 return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
1255                                         nvme_irq_check, nvme_irq, IRQF_SHARED,
1256                                         name, nvmeq);
1257         return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
1258                                 IRQF_SHARED, name, nvmeq);
1259 }
1260
1261 static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
1262 {
1263         struct nvme_dev *dev = nvmeq->dev;
1264         unsigned extra = nvme_queue_extra(nvmeq->q_depth);
1265
1266         nvmeq->sq_tail = 0;
1267         nvmeq->cq_head = 0;
1268         nvmeq->cq_phase = 1;
1269         nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
1270         memset(nvmeq->cmdid_data, 0, extra);
1271         memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
1272         nvme_cancel_ios(nvmeq, false);
1273         nvmeq->q_suspended = 0;
1274         dev->online_queues++;
1275 }
1276
1277 static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
1278 {
1279         struct nvme_dev *dev = nvmeq->dev;
1280         int result;
1281
1282         result = adapter_alloc_cq(dev, qid, nvmeq);
1283         if (result < 0)
1284                 return result;
1285
1286         result = adapter_alloc_sq(dev, qid, nvmeq);
1287         if (result < 0)
1288                 goto release_cq;
1289
1290         result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
1291         if (result < 0)
1292                 goto release_sq;
1293
1294         spin_lock_irq(&nvmeq->q_lock);
1295         nvme_init_queue(nvmeq, qid);
1296         spin_unlock_irq(&nvmeq->q_lock);
1297
1298         return result;
1299
1300  release_sq:
1301         adapter_delete_sq(dev, qid);
1302  release_cq:
1303         adapter_delete_cq(dev, qid);
1304         return result;
1305 }
1306
1307 static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
1308 {
1309         unsigned long timeout;
1310         u32 bit = enabled ? NVME_CSTS_RDY : 0;
1311
1312         timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
1313
1314         while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) {
1315                 msleep(100);
1316                 if (fatal_signal_pending(current))
1317                         return -EINTR;
1318                 if (time_after(jiffies, timeout)) {
1319                         dev_err(&dev->pci_dev->dev,
1320                                 "Device not ready; aborting initialisation\n");
1321                         return -ENODEV;
1322                 }
1323         }
1324
1325         return 0;
1326 }
1327
1328 /*
1329  * If the device has been passed off to us in an enabled state, just clear
1330  * the enabled bit.  The spec says we should set the 'shutdown notification
1331  * bits', but doing so may cause the device to complete commands to the
1332  * admin queue ... and we don't know what memory that might be pointing at!
1333  */
1334 static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap)
1335 {
1336         u32 cc = readl(&dev->bar->cc);
1337
1338         if (cc & NVME_CC_ENABLE)
1339                 writel(cc & ~NVME_CC_ENABLE, &dev->bar->cc);
1340         return nvme_wait_ready(dev, cap, false);
1341 }
1342
1343 static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap)
1344 {
1345         return nvme_wait_ready(dev, cap, true);
1346 }
1347
1348 static int nvme_shutdown_ctrl(struct nvme_dev *dev)
1349 {
1350         unsigned long timeout;
1351         u32 cc;
1352
1353         cc = (readl(&dev->bar->cc) & ~NVME_CC_SHN_MASK) | NVME_CC_SHN_NORMAL;
1354         writel(cc, &dev->bar->cc);
1355
1356         timeout = 2 * HZ + jiffies;
1357         while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) !=
1358                                                         NVME_CSTS_SHST_CMPLT) {
1359                 msleep(100);
1360                 if (fatal_signal_pending(current))
1361                         return -EINTR;
1362                 if (time_after(jiffies, timeout)) {
1363                         dev_err(&dev->pci_dev->dev,
1364                                 "Device shutdown incomplete; abort shutdown\n");
1365                         return -ENODEV;
1366                 }
1367         }
1368
1369         return 0;
1370 }
1371
1372 static int nvme_configure_admin_queue(struct nvme_dev *dev)
1373 {
1374         int result;
1375         u32 aqa;
1376         u64 cap = readq(&dev->bar->cap);
1377         struct nvme_queue *nvmeq;
1378
1379         result = nvme_disable_ctrl(dev, cap);
1380         if (result < 0)
1381                 return result;
1382
1383         nvmeq = raw_nvmeq(dev, 0);
1384         if (!nvmeq) {
1385                 nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
1386                 if (!nvmeq)
1387                         return -ENOMEM;
1388         }
1389
1390         aqa = nvmeq->q_depth - 1;
1391         aqa |= aqa << 16;
1392
1393         dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
1394         dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
1395         dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
1396         dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
1397
1398         writel(aqa, &dev->bar->aqa);
1399         writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
1400         writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
1401         writel(dev->ctrl_config, &dev->bar->cc);
1402
1403         result = nvme_enable_ctrl(dev, cap);
1404         if (result)
1405                 return result;
1406
1407         result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
1408         if (result)
1409                 return result;
1410
1411         spin_lock_irq(&nvmeq->q_lock);
1412         nvme_init_queue(nvmeq, 0);
1413         spin_unlock_irq(&nvmeq->q_lock);
1414         return result;
1415 }
1416
1417 struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
1418                                 unsigned long addr, unsigned length)
1419 {
1420         int i, err, count, nents, offset;
1421         struct scatterlist *sg;
1422         struct page **pages;
1423         struct nvme_iod *iod;
1424
1425         if (addr & 3)
1426                 return ERR_PTR(-EINVAL);
1427         if (!length || length > INT_MAX - PAGE_SIZE)
1428                 return ERR_PTR(-EINVAL);
1429
1430         offset = offset_in_page(addr);
1431         count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
1432         pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
1433         if (!pages)
1434                 return ERR_PTR(-ENOMEM);
1435
1436         err = get_user_pages_fast(addr, count, 1, pages);
1437         if (err < count) {
1438                 count = err;
1439                 err = -EFAULT;
1440                 goto put_pages;
1441         }
1442
1443         iod = nvme_alloc_iod(count, length, GFP_KERNEL);
1444         sg = iod->sg;
1445         sg_init_table(sg, count);
1446         for (i = 0; i < count; i++) {
1447                 sg_set_page(&sg[i], pages[i],
1448                             min_t(unsigned, length, PAGE_SIZE - offset),
1449                             offset);
1450                 length -= (PAGE_SIZE - offset);
1451                 offset = 0;
1452         }
1453         sg_mark_end(&sg[i - 1]);
1454         iod->nents = count;
1455
1456         err = -ENOMEM;
1457         nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
1458                                 write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
1459         if (!nents)
1460                 goto free_iod;
1461
1462         kfree(pages);
1463         return iod;
1464
1465  free_iod:
1466         kfree(iod);
1467  put_pages:
1468         for (i = 0; i < count; i++)
1469                 put_page(pages[i]);
1470         kfree(pages);
1471         return ERR_PTR(err);
1472 }
1473
1474 void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
1475                         struct nvme_iod *iod)
1476 {
1477         int i;
1478
1479         dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
1480                                 write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
1481
1482         for (i = 0; i < iod->nents; i++)
1483                 put_page(sg_page(&iod->sg[i]));
1484 }
1485
1486 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
1487 {
1488         struct nvme_dev *dev = ns->dev;
1489         struct nvme_user_io io;
1490         struct nvme_command c;
1491         unsigned length, meta_len;
1492         int status, i;
1493         struct nvme_iod *iod, *meta_iod = NULL;
1494         dma_addr_t meta_dma_addr;
1495         void *meta, *uninitialized_var(meta_mem);
1496
1497         if (copy_from_user(&io, uio, sizeof(io)))
1498                 return -EFAULT;
1499         length = (io.nblocks + 1) << ns->lba_shift;
1500         meta_len = (io.nblocks + 1) * ns->ms;
1501
1502         if (meta_len && ((io.metadata & 3) || !io.metadata))
1503                 return -EINVAL;
1504
1505         switch (io.opcode) {
1506         case nvme_cmd_write:
1507         case nvme_cmd_read:
1508         case nvme_cmd_compare:
1509                 iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length);
1510                 break;
1511         default:
1512                 return -EINVAL;
1513         }
1514
1515         if (IS_ERR(iod))
1516                 return PTR_ERR(iod);
1517
1518         memset(&c, 0, sizeof(c));
1519         c.rw.opcode = io.opcode;
1520         c.rw.flags = io.flags;
1521         c.rw.nsid = cpu_to_le32(ns->ns_id);
1522         c.rw.slba = cpu_to_le64(io.slba);
1523         c.rw.length = cpu_to_le16(io.nblocks);
1524         c.rw.control = cpu_to_le16(io.control);
1525         c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
1526         c.rw.reftag = cpu_to_le32(io.reftag);
1527         c.rw.apptag = cpu_to_le16(io.apptag);
1528         c.rw.appmask = cpu_to_le16(io.appmask);
1529
1530         if (meta_len) {
1531                 meta_iod = nvme_map_user_pages(dev, io.opcode & 1, io.metadata,
1532                                                                 meta_len);
1533                 if (IS_ERR(meta_iod)) {
1534                         status = PTR_ERR(meta_iod);
1535                         meta_iod = NULL;
1536                         goto unmap;
1537                 }
1538
1539                 meta_mem = dma_alloc_coherent(&dev->pci_dev->dev, meta_len,
1540                                                 &meta_dma_addr, GFP_KERNEL);
1541                 if (!meta_mem) {
1542                         status = -ENOMEM;
1543                         goto unmap;
1544                 }
1545
1546                 if (io.opcode & 1) {
1547                         int meta_offset = 0;
1548
1549                         for (i = 0; i < meta_iod->nents; i++) {
1550                                 meta = kmap_atomic(sg_page(&meta_iod->sg[i])) +
1551                                                 meta_iod->sg[i].offset;
1552                                 memcpy(meta_mem + meta_offset, meta,
1553                                                 meta_iod->sg[i].length);
1554                                 kunmap_atomic(meta);
1555                                 meta_offset += meta_iod->sg[i].length;
1556                         }
1557                 }
1558
1559                 c.rw.metadata = cpu_to_le64(meta_dma_addr);
1560         }
1561
1562         length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);
1563
1564         if (length != (io.nblocks + 1) << ns->lba_shift)
1565                 status = -ENOMEM;
1566         else
1567                 status = nvme_submit_io_cmd(dev, &c, NULL);
1568
1569         if (meta_len) {
1570                 if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) {
1571                         int meta_offset = 0;
1572
1573                         for (i = 0; i < meta_iod->nents; i++) {
1574                                 meta = kmap_atomic(sg_page(&meta_iod->sg[i])) +
1575                                                 meta_iod->sg[i].offset;
1576                                 memcpy(meta, meta_mem + meta_offset,
1577                                                 meta_iod->sg[i].length);
1578                                 kunmap_atomic(meta);
1579                                 meta_offset += meta_iod->sg[i].length;
1580                         }
1581                 }
1582
1583                 dma_free_coherent(&dev->pci_dev->dev, meta_len, meta_mem,
1584                                                                 meta_dma_addr);
1585         }
1586
1587  unmap:
1588         nvme_unmap_user_pages(dev, io.opcode & 1, iod);
1589         nvme_free_iod(dev, iod);
1590
1591         if (meta_iod) {
1592                 nvme_unmap_user_pages(dev, io.opcode & 1, meta_iod);
1593                 nvme_free_iod(dev, meta_iod);
1594         }
1595
1596         return status;
1597 }
1598
1599 static int nvme_user_admin_cmd(struct nvme_dev *dev,
1600                                         struct nvme_admin_cmd __user *ucmd)
1601 {
1602         struct nvme_admin_cmd cmd;
1603         struct nvme_command c;
1604         int status, length;
1605         struct nvme_iod *uninitialized_var(iod);
1606         unsigned timeout;
1607
1608         if (!capable(CAP_SYS_ADMIN))
1609                 return -EACCES;
1610         if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
1611                 return -EFAULT;
1612
1613         memset(&c, 0, sizeof(c));
1614         c.common.opcode = cmd.opcode;
1615         c.common.flags = cmd.flags;
1616         c.common.nsid = cpu_to_le32(cmd.nsid);
1617         c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
1618         c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1619         c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
1620         c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
1621         c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
1622         c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
1623         c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
1624         c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
1625
1626         length = cmd.data_len;
1627         if (cmd.data_len) {
1628                 iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr,
1629                                                                 length);
1630                 if (IS_ERR(iod))
1631                         return PTR_ERR(iod);
1632                 length = nvme_setup_prps(dev, &c.common, iod, length,
1633                                                                 GFP_KERNEL);
1634         }
1635
1636         timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
1637                                                                 ADMIN_TIMEOUT;
1638         if (length != cmd.data_len)
1639                 status = -ENOMEM;
1640         else
1641                 status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout);
1642
1643         if (cmd.data_len) {
1644                 nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
1645                 nvme_free_iod(dev, iod);
1646         }
1647
1648         if ((status >= 0) && copy_to_user(&ucmd->result, &cmd.result,
1649                                                         sizeof(cmd.result)))
1650                 status = -EFAULT;
1651
1652         return status;
1653 }
1654
1655 static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
1656                                                         unsigned long arg)
1657 {
1658         struct nvme_ns *ns = bdev->bd_disk->private_data;
1659
1660         switch (cmd) {
1661         case NVME_IOCTL_ID:
1662                 force_successful_syscall_return();
1663                 return ns->ns_id;
1664         case NVME_IOCTL_ADMIN_CMD:
1665                 return nvme_user_admin_cmd(ns->dev, (void __user *)arg);
1666         case NVME_IOCTL_SUBMIT_IO:
1667                 return nvme_submit_io(ns, (void __user *)arg);
1668         case SG_GET_VERSION_NUM:
1669                 return nvme_sg_get_version_num((void __user *)arg);
1670         case SG_IO:
1671                 return nvme_sg_io(ns, (void __user *)arg);
1672         default:
1673                 return -ENOTTY;
1674         }
1675 }
1676
1677 #ifdef CONFIG_COMPAT
1678 static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
1679                                         unsigned int cmd, unsigned long arg)
1680 {
1681         struct nvme_ns *ns = bdev->bd_disk->private_data;
1682
1683         switch (cmd) {
1684         case SG_IO:
1685                 return nvme_sg_io32(ns, arg);
1686         }
1687         return nvme_ioctl(bdev, mode, cmd, arg);
1688 }
1689 #else
1690 #define nvme_compat_ioctl       NULL
1691 #endif
1692
1693 static int nvme_open(struct block_device *bdev, fmode_t mode)
1694 {
1695         struct nvme_ns *ns = bdev->bd_disk->private_data;
1696         struct nvme_dev *dev = ns->dev;
1697
1698         kref_get(&dev->kref);
1699         return 0;
1700 }
1701
1702 static void nvme_free_dev(struct kref *kref);
1703
1704 static void nvme_release(struct gendisk *disk, fmode_t mode)
1705 {
1706         struct nvme_ns *ns = disk->private_data;
1707         struct nvme_dev *dev = ns->dev;
1708
1709         kref_put(&dev->kref, nvme_free_dev);
1710 }
1711
1712 static const struct block_device_operations nvme_fops = {
1713         .owner          = THIS_MODULE,
1714         .ioctl          = nvme_ioctl,
1715         .compat_ioctl   = nvme_compat_ioctl,
1716         .open           = nvme_open,
1717         .release        = nvme_release,
1718 };
1719
1720 static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
1721 {
1722         while (bio_list_peek(&nvmeq->sq_cong)) {
1723                 struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
1724                 struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
1725
1726                 if (bio_list_empty(&nvmeq->sq_cong))
1727                         remove_wait_queue(&nvmeq->sq_full,
1728                                                         &nvmeq->sq_cong_wait);
1729                 if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
1730                         if (bio_list_empty(&nvmeq->sq_cong))
1731                                 add_wait_queue(&nvmeq->sq_full,
1732                                                         &nvmeq->sq_cong_wait);
1733                         bio_list_add_head(&nvmeq->sq_cong, bio);
1734                         break;
1735                 }
1736         }
1737 }
1738
1739 static int nvme_kthread(void *data)
1740 {
1741         struct nvme_dev *dev, *next;
1742
1743         while (!kthread_should_stop()) {
1744                 set_current_state(TASK_INTERRUPTIBLE);
1745                 spin_lock(&dev_list_lock);
1746                 list_for_each_entry_safe(dev, next, &dev_list, node) {
1747                         int i;
1748                         if (readl(&dev->bar->csts) & NVME_CSTS_CFS &&
1749                                                         dev->initialized) {
1750                                 if (work_busy(&dev->reset_work))
1751                                         continue;
1752                                 list_del_init(&dev->node);
1753                                 dev_warn(&dev->pci_dev->dev,
1754                                         "Failed status, reset controller\n");
1755                                 PREPARE_WORK(&dev->reset_work,
1756                                                         nvme_reset_failed_dev);
1757                                 queue_work(nvme_workq, &dev->reset_work);
1758                                 continue;
1759                         }
1760                         rcu_read_lock();
1761                         for (i = 0; i < dev->queue_count; i++) {
1762                                 struct nvme_queue *nvmeq =
1763                                                 rcu_dereference(dev->queues[i]);
1764                                 if (!nvmeq)
1765                                         continue;
1766                                 spin_lock_irq(&nvmeq->q_lock);
1767                                 if (nvmeq->q_suspended)
1768                                         goto unlock;
1769                                 nvme_process_cq(nvmeq);
1770                                 nvme_cancel_ios(nvmeq, true);
1771                                 nvme_resubmit_bios(nvmeq);
1772  unlock:
1773                                 spin_unlock_irq(&nvmeq->q_lock);
1774                         }
1775                         rcu_read_unlock();
1776                 }
1777                 spin_unlock(&dev_list_lock);
1778                 schedule_timeout(round_jiffies_relative(HZ));
1779         }
1780         return 0;
1781 }
1782
1783 static void nvme_config_discard(struct nvme_ns *ns)
1784 {
1785         u32 logical_block_size = queue_logical_block_size(ns->queue);
1786         ns->queue->limits.discard_zeroes_data = 0;
1787         ns->queue->limits.discard_alignment = logical_block_size;
1788         ns->queue->limits.discard_granularity = logical_block_size;
1789         ns->queue->limits.max_discard_sectors = 0xffffffff;
1790         queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
1791 }
1792
1793 static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
1794                         struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
1795 {
1796         struct nvme_ns *ns;
1797         struct gendisk *disk;
1798         int lbaf;
1799
1800         if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
1801                 return NULL;
1802
1803         ns = kzalloc(sizeof(*ns), GFP_KERNEL);
1804         if (!ns)
1805                 return NULL;
1806         ns->queue = blk_alloc_queue(GFP_KERNEL);
1807         if (!ns->queue)
1808                 goto out_free_ns;
1809         ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
1810         queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
1811         queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
1812         blk_queue_make_request(ns->queue, nvme_make_request);
1813         ns->dev = dev;
1814         ns->queue->queuedata = ns;
1815
1816         disk = alloc_disk(0);
1817         if (!disk)
1818                 goto out_free_queue;
1819         ns->ns_id = nsid;
1820         ns->disk = disk;
1821         lbaf = id->flbas & 0xf;
1822         ns->lba_shift = id->lbaf[lbaf].ds;
1823         ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
1824         blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
1825         if (dev->max_hw_sectors)
1826                 blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
1827
1828         disk->major = nvme_major;
1829         disk->first_minor = 0;
1830         disk->fops = &nvme_fops;
1831         disk->private_data = ns;
1832         disk->queue = ns->queue;
1833         disk->driverfs_dev = &dev->pci_dev->dev;
1834         disk->flags = GENHD_FL_EXT_DEVT;
1835         sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
1836         set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
1837
1838         if (dev->oncs & NVME_CTRL_ONCS_DSM)
1839                 nvme_config_discard(ns);
1840
1841         return ns;
1842
1843  out_free_queue:
1844         blk_cleanup_queue(ns->queue);
1845  out_free_ns:
1846         kfree(ns);
1847         return NULL;
1848 }
1849
1850 static int nvme_find_closest_node(int node)
1851 {
1852         int n, val, min_val = INT_MAX, best_node = node;
1853
1854         for_each_online_node(n) {
1855                 if (n == node)
1856                         continue;
1857                 val = node_distance(node, n);
1858                 if (val < min_val) {
1859                         min_val = val;
1860                         best_node = n;
1861                 }
1862         }
1863         return best_node;
1864 }
1865
1866 static void nvme_set_queue_cpus(cpumask_t *qmask, struct nvme_queue *nvmeq,
1867                                                                 int count)
1868 {
1869         int cpu;
1870         for_each_cpu(cpu, qmask) {
1871                 if (cpumask_weight(nvmeq->cpu_mask) >= count)
1872                         break;
1873                 if (!cpumask_test_and_set_cpu(cpu, nvmeq->cpu_mask))
1874                         *per_cpu_ptr(nvmeq->dev->io_queue, cpu) = nvmeq->qid;
1875         }
1876 }
1877
1878 static void nvme_add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus,
1879         const cpumask_t *new_mask, struct nvme_queue *nvmeq, int cpus_per_queue)
1880 {
1881         int next_cpu;
1882         for_each_cpu(next_cpu, new_mask) {
1883                 cpumask_or(mask, mask, get_cpu_mask(next_cpu));
1884                 cpumask_or(mask, mask, topology_thread_cpumask(next_cpu));
1885                 cpumask_and(mask, mask, unassigned_cpus);
1886                 nvme_set_queue_cpus(mask, nvmeq, cpus_per_queue);
1887         }
1888 }
1889
1890 static void nvme_create_io_queues(struct nvme_dev *dev)
1891 {
1892         unsigned i, max;
1893
1894         max = min(dev->max_qid, num_online_cpus());
1895         for (i = dev->queue_count; i <= max; i++)
1896                 if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1))
1897                         break;
1898
1899         max = min(dev->queue_count - 1, num_online_cpus());
1900         for (i = dev->online_queues; i <= max; i++)
1901                 if (nvme_create_queue(raw_nvmeq(dev, i), i))
1902                         break;
1903 }
1904
1905 /*
1906  * If there are fewer queues than online cpus, this will try to optimally
1907  * assign a queue to multiple cpus by grouping cpus that are "close" together:
1908  * thread siblings, core, socket, closest node, then whatever else is
1909  * available.
1910  */
1911 static void nvme_assign_io_queues(struct nvme_dev *dev)
1912 {
1913         unsigned cpu, cpus_per_queue, queues, remainder, i;
1914         cpumask_var_t unassigned_cpus;
1915
1916         nvme_create_io_queues(dev);
1917
1918         queues = min(dev->online_queues - 1, num_online_cpus());
1919         if (!queues)
1920                 return;
1921
1922         cpus_per_queue = num_online_cpus() / queues;
1923         remainder = queues - (num_online_cpus() - queues * cpus_per_queue);
1924
1925         if (!alloc_cpumask_var(&unassigned_cpus, GFP_KERNEL))
1926                 return;
1927
1928         cpumask_copy(unassigned_cpus, cpu_online_mask);
1929         cpu = cpumask_first(unassigned_cpus);
1930         for (i = 1; i <= queues; i++) {
1931                 struct nvme_queue *nvmeq = lock_nvmeq(dev, i);
1932                 cpumask_t mask;
1933
1934                 cpumask_clear(nvmeq->cpu_mask);
1935                 if (!cpumask_weight(unassigned_cpus)) {
1936                         unlock_nvmeq(nvmeq);
1937                         break;
1938                 }
1939
1940                 mask = *get_cpu_mask(cpu);
1941                 nvme_set_queue_cpus(&mask, nvmeq, cpus_per_queue);
1942                 if (cpus_weight(mask) < cpus_per_queue)
1943                         nvme_add_cpus(&mask, unassigned_cpus,
1944                                 topology_thread_cpumask(cpu),
1945                                 nvmeq, cpus_per_queue);
1946                 if (cpus_weight(mask) < cpus_per_queue)
1947                         nvme_add_cpus(&mask, unassigned_cpus,
1948                                 topology_core_cpumask(cpu),
1949                                 nvmeq, cpus_per_queue);
1950                 if (cpus_weight(mask) < cpus_per_queue)
1951                         nvme_add_cpus(&mask, unassigned_cpus,
1952                                 cpumask_of_node(cpu_to_node(cpu)),
1953                                 nvmeq, cpus_per_queue);
1954                 if (cpus_weight(mask) < cpus_per_queue)
1955                         nvme_add_cpus(&mask, unassigned_cpus,
1956                                 cpumask_of_node(
1957                                         nvme_find_closest_node(
1958                                                 cpu_to_node(cpu))),
1959                                 nvmeq, cpus_per_queue);
1960                 if (cpus_weight(mask) < cpus_per_queue)
1961                         nvme_add_cpus(&mask, unassigned_cpus,
1962                                 unassigned_cpus,
1963                                 nvmeq, cpus_per_queue);
1964
1965                 WARN(cpumask_weight(nvmeq->cpu_mask) != cpus_per_queue,
1966                         "nvme%d qid:%d mis-matched queue-to-cpu assignment\n",
1967                         dev->instance, i);
1968
1969                 irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
1970                                                         nvmeq->cpu_mask);
1971                 cpumask_andnot(unassigned_cpus, unassigned_cpus,
1972                                                 nvmeq->cpu_mask);
1973                 cpu = cpumask_next(cpu, unassigned_cpus);
1974                 if (remainder && !--remainder)
1975                         cpus_per_queue++;
1976                 unlock_nvmeq(nvmeq);
1977         }
1978         WARN(cpumask_weight(unassigned_cpus), "nvme%d unassigned online cpus\n",
1979                                                                 dev->instance);
1980         i = 0;
1981         cpumask_andnot(unassigned_cpus, cpu_possible_mask, cpu_online_mask);
1982         for_each_cpu(cpu, unassigned_cpus)
1983                 *per_cpu_ptr(dev->io_queue, cpu) = (i++ % queues) + 1;
1984         free_cpumask_var(unassigned_cpus);
1985 }
1986
1987 static int set_queue_count(struct nvme_dev *dev, int count)
1988 {
1989         int status;
1990         u32 result;
1991         u32 q_count = (count - 1) | ((count - 1) << 16);
1992
1993         status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
1994                                                                 &result);
1995         if (status)
1996                 return status < 0 ? -EIO : -EBUSY;
1997         return min(result & 0xffff, result >> 16) + 1;
1998 }
1999
2000 static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
2001 {
2002         return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
2003 }
2004
2005 static int nvme_setup_io_queues(struct nvme_dev *dev)
2006 {
2007         struct nvme_queue *adminq = raw_nvmeq(dev, 0);
2008         struct pci_dev *pdev = dev->pci_dev;
2009         int result, i, vecs, nr_io_queues, size;
2010
2011         nr_io_queues = num_possible_cpus();
2012         result = set_queue_count(dev, nr_io_queues);
2013         if (result < 0)
2014                 return result;
2015         if (result < nr_io_queues)
2016                 nr_io_queues = result;
2017
2018         size = db_bar_size(dev, nr_io_queues);
2019         if (size > 8192) {
2020                 iounmap(dev->bar);
2021                 do {
2022                         dev->bar = ioremap(pci_resource_start(pdev, 0), size);
2023                         if (dev->bar)
2024                                 break;
2025                         if (!--nr_io_queues)
2026                                 return -ENOMEM;
2027                         size = db_bar_size(dev, nr_io_queues);
2028                 } while (1);
2029                 dev->dbs = ((void __iomem *)dev->bar) + 4096;
2030                 adminq->q_db = dev->dbs;
2031         }
2032
2033         /* Deregister the admin queue's interrupt */
2034         free_irq(dev->entry[0].vector, adminq);
2035
2036         vecs = nr_io_queues;
2037         for (i = 0; i < vecs; i++)
2038                 dev->entry[i].entry = i;
2039         for (;;) {
2040                 result = pci_enable_msix(pdev, dev->entry, vecs);
2041                 if (result <= 0)
2042                         break;
2043                 vecs = result;
2044         }
2045
2046         if (result < 0) {
2047                 vecs = nr_io_queues;
2048                 if (vecs > 32)
2049                         vecs = 32;
2050                 for (;;) {
2051                         result = pci_enable_msi_block(pdev, vecs);
2052                         if (result == 0) {
2053                                 for (i = 0; i < vecs; i++)
2054                                         dev->entry[i].vector = i + pdev->irq;
2055                                 break;
2056                         } else if (result < 0) {
2057                                 vecs = 1;
2058                                 break;
2059                         }
2060                         vecs = result;
2061                 }
2062         }
2063
2064         /*
2065          * Should investigate if there's a performance win from allocating
2066          * more queues than interrupt vectors; it might allow the submission
2067          * path to scale better, even if the receive path is limited by the
2068          * number of interrupts.
2069          */
2070         nr_io_queues = vecs;
2071         dev->max_qid = nr_io_queues;
2072
2073         result = queue_request_irq(dev, adminq, adminq->irqname);
2074         if (result) {
2075                 adminq->q_suspended = 1;
2076                 goto free_queues;
2077         }
2078
2079         /* Free previously allocated queues that are no longer usable */
2080         nvme_free_queues(dev, nr_io_queues + 1);
2081         nvme_assign_io_queues(dev);
2082
2083         return 0;
2084
2085  free_queues:
2086         nvme_free_queues(dev, 1);
2087         return result;
2088 }
2089
2090 /*
2091  * Return: error value if an error occurred setting up the queues or calling
2092  * Identify Device.  0 if these succeeded, even if adding some of the
2093  * namespaces failed.  At the moment, these failures are silent.  TBD which
2094  * failures should be reported.
2095  */
2096 static int nvme_dev_add(struct nvme_dev *dev)
2097 {
2098         struct pci_dev *pdev = dev->pci_dev;
2099         int res;
2100         unsigned nn, i;
2101         struct nvme_ns *ns;
2102         struct nvme_id_ctrl *ctrl;
2103         struct nvme_id_ns *id_ns;
2104         void *mem;
2105         dma_addr_t dma_addr;
2106         int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
2107
2108         mem = dma_alloc_coherent(&pdev->dev, 8192, &dma_addr, GFP_KERNEL);
2109         if (!mem)
2110                 return -ENOMEM;
2111
2112         res = nvme_identify(dev, 0, 1, dma_addr);
2113         if (res) {
2114                 res = -EIO;
2115                 goto out;
2116         }
2117
2118         ctrl = mem;
2119         nn = le32_to_cpup(&ctrl->nn);
2120         dev->oncs = le16_to_cpup(&ctrl->oncs);
2121         dev->abort_limit = ctrl->acl + 1;
2122         memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
2123         memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
2124         memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
2125         if (ctrl->mdts)
2126                 dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
2127         if ((pdev->vendor == PCI_VENDOR_ID_INTEL) &&
2128                         (pdev->device == 0x0953) && ctrl->vs[3])
2129                 dev->stripe_size = 1 << (ctrl->vs[3] + shift);
2130
2131         id_ns = mem;
2132         for (i = 1; i <= nn; i++) {
2133                 res = nvme_identify(dev, i, 0, dma_addr);
2134                 if (res)
2135                         continue;
2136
2137                 if (id_ns->ncap == 0)
2138                         continue;
2139
2140                 res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i,
2141                                                         dma_addr + 4096, NULL);
2142                 if (res)
2143                         memset(mem + 4096, 0, 4096);
2144
2145                 ns = nvme_alloc_ns(dev, i, mem, mem + 4096);
2146                 if (ns)
2147                         list_add_tail(&ns->list, &dev->namespaces);
2148         }
2149         list_for_each_entry(ns, &dev->namespaces, list)
2150                 add_disk(ns->disk);
2151         res = 0;
2152
2153  out:
2154         dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr);
2155         return res;
2156 }
2157
2158 static int nvme_dev_map(struct nvme_dev *dev)
2159 {
2160         u64 cap;
2161         int bars, result = -ENOMEM;
2162         struct pci_dev *pdev = dev->pci_dev;
2163
2164         if (pci_enable_device_mem(pdev))
2165                 return result;
2166
2167         dev->entry[0].vector = pdev->irq;
2168         pci_set_master(pdev);
2169         bars = pci_select_bars(pdev, IORESOURCE_MEM);
2170         if (pci_request_selected_regions(pdev, bars, "nvme"))
2171                 goto disable_pci;
2172
2173         if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) &&
2174             dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)))
2175                 goto disable;
2176
2177         dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
2178         if (!dev->bar)
2179                 goto disable;
2180         if (readl(&dev->bar->csts) == -1) {
2181                 result = -ENODEV;
2182                 goto unmap;
2183         }
2184         cap = readq(&dev->bar->cap);
2185         dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
2186         dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
2187         dev->dbs = ((void __iomem *)dev->bar) + 4096;
2188
2189         return 0;
2190
2191  unmap:
2192         iounmap(dev->bar);
2193         dev->bar = NULL;
2194  disable:
2195         pci_release_regions(pdev);
2196  disable_pci:
2197         pci_disable_device(pdev);
2198         return result;
2199 }
2200
2201 static void nvme_dev_unmap(struct nvme_dev *dev)
2202 {
2203         if (dev->pci_dev->msi_enabled)
2204                 pci_disable_msi(dev->pci_dev);
2205         else if (dev->pci_dev->msix_enabled)
2206                 pci_disable_msix(dev->pci_dev);
2207
2208         if (dev->bar) {
2209                 iounmap(dev->bar);
2210                 dev->bar = NULL;
2211                 pci_release_regions(dev->pci_dev);
2212         }
2213
2214         if (pci_is_enabled(dev->pci_dev))
2215                 pci_disable_device(dev->pci_dev);
2216 }
2217
2218 struct nvme_delq_ctx {
2219         struct task_struct *waiter;
2220         struct kthread_worker *worker;
2221         atomic_t refcount;
2222 };
2223
2224 static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev)
2225 {
2226         dq->waiter = current;
2227         mb();
2228
2229         for (;;) {
2230                 set_current_state(TASK_KILLABLE);
2231                 if (!atomic_read(&dq->refcount))
2232                         break;
2233                 if (!schedule_timeout(ADMIN_TIMEOUT) ||
2234                                         fatal_signal_pending(current)) {
2235                         set_current_state(TASK_RUNNING);
2236
2237                         nvme_disable_ctrl(dev, readq(&dev->bar->cap));
2238                         nvme_disable_queue(dev, 0);
2239
2240                         send_sig(SIGKILL, dq->worker->task, 1);
2241                         flush_kthread_worker(dq->worker);
2242                         return;
2243                 }
2244         }
2245         set_current_state(TASK_RUNNING);
2246 }
2247
2248 static void nvme_put_dq(struct nvme_delq_ctx *dq)
2249 {
2250         atomic_dec(&dq->refcount);
2251         if (dq->waiter)
2252                 wake_up_process(dq->waiter);
2253 }
2254
2255 static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq)
2256 {
2257         atomic_inc(&dq->refcount);
2258         return dq;
2259 }
2260
2261 static void nvme_del_queue_end(struct nvme_queue *nvmeq)
2262 {
2263         struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx;
2264
2265         nvme_clear_queue(nvmeq);
2266         nvme_put_dq(dq);
2267 }
2268
2269 static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode,
2270                                                 kthread_work_func_t fn)
2271 {
2272         struct nvme_command c;
2273
2274         memset(&c, 0, sizeof(c));
2275         c.delete_queue.opcode = opcode;
2276         c.delete_queue.qid = cpu_to_le16(nvmeq->qid);
2277
2278         init_kthread_work(&nvmeq->cmdinfo.work, fn);
2279         return nvme_submit_admin_cmd_async(nvmeq->dev, &c, &nvmeq->cmdinfo);
2280 }
2281
2282 static void nvme_del_cq_work_handler(struct kthread_work *work)
2283 {
2284         struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
2285                                                         cmdinfo.work);
2286         nvme_del_queue_end(nvmeq);
2287 }
2288
2289 static int nvme_delete_cq(struct nvme_queue *nvmeq)
2290 {
2291         return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq,
2292                                                 nvme_del_cq_work_handler);
2293 }
2294
2295 static void nvme_del_sq_work_handler(struct kthread_work *work)
2296 {
2297         struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
2298                                                         cmdinfo.work);
2299         int status = nvmeq->cmdinfo.status;
2300
2301         if (!status)
2302                 status = nvme_delete_cq(nvmeq);
2303         if (status)
2304                 nvme_del_queue_end(nvmeq);
2305 }
2306
2307 static int nvme_delete_sq(struct nvme_queue *nvmeq)
2308 {
2309         return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq,
2310                                                 nvme_del_sq_work_handler);
2311 }
2312
2313 static void nvme_del_queue_start(struct kthread_work *work)
2314 {
2315         struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
2316                                                         cmdinfo.work);
2317         allow_signal(SIGKILL);
2318         if (nvme_delete_sq(nvmeq))
2319                 nvme_del_queue_end(nvmeq);
2320 }
2321
2322 static void nvme_disable_io_queues(struct nvme_dev *dev)
2323 {
2324         int i;
2325         DEFINE_KTHREAD_WORKER_ONSTACK(worker);
2326         struct nvme_delq_ctx dq;
2327         struct task_struct *kworker_task = kthread_run(kthread_worker_fn,
2328                                         &worker, "nvme%d", dev->instance);
2329
2330         if (IS_ERR(kworker_task)) {
2331                 dev_err(&dev->pci_dev->dev,
2332                         "Failed to create queue del task\n");
2333                 for (i = dev->queue_count - 1; i > 0; i--)
2334                         nvme_disable_queue(dev, i);
2335                 return;
2336         }
2337
2338         dq.waiter = NULL;
2339         atomic_set(&dq.refcount, 0);
2340         dq.worker = &worker;
2341         for (i = dev->queue_count - 1; i > 0; i--) {
2342                 struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
2343
2344                 if (nvme_suspend_queue(nvmeq))
2345                         continue;
2346                 nvmeq->cmdinfo.ctx = nvme_get_dq(&dq);
2347                 nvmeq->cmdinfo.worker = dq.worker;
2348                 init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start);
2349                 queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work);
2350         }
2351         nvme_wait_dq(&dq, dev);
2352         kthread_stop(kworker_task);
2353 }
2354
2355 static void nvme_dev_shutdown(struct nvme_dev *dev)
2356 {
2357         int i;
2358
2359         dev->initialized = 0;
2360
2361         spin_lock(&dev_list_lock);
2362         list_del_init(&dev->node);
2363         spin_unlock(&dev_list_lock);
2364
2365         if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) {
2366                 for (i = dev->queue_count - 1; i >= 0; i--) {
2367                         struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
2368                         nvme_suspend_queue(nvmeq);
2369                         nvme_clear_queue(nvmeq);
2370                 }
2371         } else {
2372                 nvme_disable_io_queues(dev);
2373                 nvme_shutdown_ctrl(dev);
2374                 nvme_disable_queue(dev, 0);
2375         }
2376         nvme_dev_unmap(dev);
2377 }
2378
2379 static void nvme_dev_remove(struct nvme_dev *dev)
2380 {
2381         struct nvme_ns *ns;
2382
2383         list_for_each_entry(ns, &dev->namespaces, list) {
2384                 if (ns->disk->flags & GENHD_FL_UP)
2385                         del_gendisk(ns->disk);
2386                 if (!blk_queue_dying(ns->queue))
2387                         blk_cleanup_queue(ns->queue);
2388         }
2389 }
2390
2391 static int nvme_setup_prp_pools(struct nvme_dev *dev)
2392 {
2393         struct device *dmadev = &dev->pci_dev->dev;
2394         dev->prp_page_pool = dma_pool_create("prp list page", dmadev,
2395                                                 PAGE_SIZE, PAGE_SIZE, 0);
2396         if (!dev->prp_page_pool)
2397                 return -ENOMEM;
2398
2399         /* Optimisation for I/Os between 4k and 128k */
2400         dev->prp_small_pool = dma_pool_create("prp list 256", dmadev,
2401                                                 256, 256, 0);
2402         if (!dev->prp_small_pool) {
2403                 dma_pool_destroy(dev->prp_page_pool);
2404                 return -ENOMEM;
2405         }
2406         return 0;
2407 }
2408
2409 static void nvme_release_prp_pools(struct nvme_dev *dev)
2410 {
2411         dma_pool_destroy(dev->prp_page_pool);
2412         dma_pool_destroy(dev->prp_small_pool);
2413 }
2414
2415 static DEFINE_IDA(nvme_instance_ida);
2416
2417 static int nvme_set_instance(struct nvme_dev *dev)
2418 {
2419         int instance, error;
2420
2421         do {
2422                 if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
2423                         return -ENODEV;
2424
2425                 spin_lock(&dev_list_lock);
2426                 error = ida_get_new(&nvme_instance_ida, &instance);
2427                 spin_unlock(&dev_list_lock);
2428         } while (error == -EAGAIN);
2429
2430         if (error)
2431                 return -ENODEV;
2432
2433         dev->instance = instance;
2434         return 0;
2435 }
2436
2437 static void nvme_release_instance(struct nvme_dev *dev)
2438 {
2439         spin_lock(&dev_list_lock);
2440         ida_remove(&nvme_instance_ida, dev->instance);
2441         spin_unlock(&dev_list_lock);
2442 }
2443
2444 static void nvme_free_namespaces(struct nvme_dev *dev)
2445 {
2446         struct nvme_ns *ns, *next;
2447
2448         list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
2449                 list_del(&ns->list);
2450                 put_disk(ns->disk);
2451                 kfree(ns);
2452         }
2453 }
2454
2455 static void nvme_free_dev(struct kref *kref)
2456 {
2457         struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
2458
2459         nvme_free_namespaces(dev);
2460         free_percpu(dev->io_queue);
2461         kfree(dev->queues);
2462         kfree(dev->entry);
2463         kfree(dev);
2464 }
2465
2466 static int nvme_dev_open(struct inode *inode, struct file *f)
2467 {
2468         struct nvme_dev *dev = container_of(f->private_data, struct nvme_dev,
2469                                                                 miscdev);
2470         kref_get(&dev->kref);
2471         f->private_data = dev;
2472         return 0;
2473 }
2474
2475 static int nvme_dev_release(struct inode *inode, struct file *f)
2476 {
2477         struct nvme_dev *dev = f->private_data;
2478         kref_put(&dev->kref, nvme_free_dev);
2479         return 0;
2480 }
2481
2482 static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
2483 {
2484         struct nvme_dev *dev = f->private_data;
2485         switch (cmd) {
2486         case NVME_IOCTL_ADMIN_CMD:
2487                 return nvme_user_admin_cmd(dev, (void __user *)arg);
2488         default:
2489                 return -ENOTTY;
2490         }
2491 }
2492
2493 static const struct file_operations nvme_dev_fops = {
2494         .owner          = THIS_MODULE,
2495         .open           = nvme_dev_open,
2496         .release        = nvme_dev_release,
2497         .unlocked_ioctl = nvme_dev_ioctl,
2498         .compat_ioctl   = nvme_dev_ioctl,
2499 };
2500
2501 static int nvme_dev_start(struct nvme_dev *dev)
2502 {
2503         int result;
2504
2505         result = nvme_dev_map(dev);
2506         if (result)
2507                 return result;
2508
2509         result = nvme_configure_admin_queue(dev);
2510         if (result)
2511                 goto unmap;
2512
2513         spin_lock(&dev_list_lock);
2514         list_add(&dev->node, &dev_list);
2515         spin_unlock(&dev_list_lock);
2516
2517         result = nvme_setup_io_queues(dev);
2518         if (result && result != -EBUSY)
2519                 goto disable;
2520
2521         return result;
2522
2523  disable:
2524         nvme_disable_queue(dev, 0);
2525         spin_lock(&dev_list_lock);
2526         list_del_init(&dev->node);
2527         spin_unlock(&dev_list_lock);
2528  unmap:
2529         nvme_dev_unmap(dev);
2530         return result;
2531 }
2532
2533 static int nvme_remove_dead_ctrl(void *arg)
2534 {
2535         struct nvme_dev *dev = (struct nvme_dev *)arg;
2536         struct pci_dev *pdev = dev->pci_dev;
2537
2538         if (pci_get_drvdata(pdev))
2539                 pci_stop_and_remove_bus_device(pdev);
2540         kref_put(&dev->kref, nvme_free_dev);
2541         return 0;
2542 }
2543
2544 static void nvme_remove_disks(struct work_struct *ws)
2545 {
2546         struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
2547
2548         nvme_dev_remove(dev);
2549         nvme_free_queues(dev, 1);
2550 }
2551
2552 static int nvme_dev_resume(struct nvme_dev *dev)
2553 {
2554         int ret;
2555
2556         ret = nvme_dev_start(dev);
2557         if (ret && ret != -EBUSY)
2558                 return ret;
2559         if (ret == -EBUSY) {
2560                 spin_lock(&dev_list_lock);
2561                 PREPARE_WORK(&dev->reset_work, nvme_remove_disks);
2562                 queue_work(nvme_workq, &dev->reset_work);
2563                 spin_unlock(&dev_list_lock);
2564         }
2565         dev->initialized = 1;
2566         return 0;
2567 }
2568
2569 static void nvme_dev_reset(struct nvme_dev *dev)
2570 {
2571         nvme_dev_shutdown(dev);
2572         if (nvme_dev_resume(dev)) {
2573                 dev_err(&dev->pci_dev->dev, "Device failed to resume\n");
2574                 kref_get(&dev->kref);
2575                 if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
2576                                                         dev->instance))) {
2577                         dev_err(&dev->pci_dev->dev,
2578                                 "Failed to start controller remove task\n");
2579                         kref_put(&dev->kref, nvme_free_dev);
2580                 }
2581         }
2582 }
2583
2584 static void nvme_reset_failed_dev(struct work_struct *ws)
2585 {
2586         struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
2587         nvme_dev_reset(dev);
2588 }
2589
2590 static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
2591 {
2592         int result = -ENOMEM;
2593         struct nvme_dev *dev;
2594
2595         dev = kzalloc(sizeof(*dev), GFP_KERNEL);
2596         if (!dev)
2597                 return -ENOMEM;
2598         dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
2599                                                                 GFP_KERNEL);
2600         if (!dev->entry)
2601                 goto free;
2602         dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *),
2603                                                                 GFP_KERNEL);
2604         if (!dev->queues)
2605                 goto free;
2606         dev->io_queue = alloc_percpu(unsigned short);
2607         if (!dev->io_queue)
2608                 goto free;
2609
2610         INIT_LIST_HEAD(&dev->namespaces);
2611         INIT_WORK(&dev->reset_work, nvme_reset_failed_dev);
2612         dev->pci_dev = pdev;
2613         pci_set_drvdata(pdev, dev);
2614         result = nvme_set_instance(dev);
2615         if (result)
2616                 goto free;
2617
2618         result = nvme_setup_prp_pools(dev);
2619         if (result)
2620                 goto release;
2621
2622         kref_init(&dev->kref);
2623         result = nvme_dev_start(dev);
2624         if (result) {
2625                 if (result == -EBUSY)
2626                         goto create_cdev;
2627                 goto release_pools;
2628         }
2629
2630         result = nvme_dev_add(dev);
2631         if (result)
2632                 goto shutdown;
2633
2634  create_cdev:
2635         scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance);
2636         dev->miscdev.minor = MISC_DYNAMIC_MINOR;
2637         dev->miscdev.parent = &pdev->dev;
2638         dev->miscdev.name = dev->name;
2639         dev->miscdev.fops = &nvme_dev_fops;
2640         result = misc_register(&dev->miscdev);
2641         if (result)
2642                 goto remove;
2643
2644         dev->initialized = 1;
2645         return 0;
2646
2647  remove:
2648         nvme_dev_remove(dev);
2649         nvme_free_namespaces(dev);
2650  shutdown:
2651         nvme_dev_shutdown(dev);
2652  release_pools:
2653         nvme_free_queues(dev, 0);
2654         nvme_release_prp_pools(dev);
2655  release:
2656         nvme_release_instance(dev);
2657  free:
2658         free_percpu(dev->io_queue);
2659         kfree(dev->queues);
2660         kfree(dev->entry);
2661         kfree(dev);
2662         return result;
2663 }
2664
2665 static void nvme_shutdown(struct pci_dev *pdev)
2666 {
2667         struct nvme_dev *dev = pci_get_drvdata(pdev);
2668         nvme_dev_shutdown(dev);
2669 }
2670
2671 static void nvme_remove(struct pci_dev *pdev)
2672 {
2673         struct nvme_dev *dev = pci_get_drvdata(pdev);
2674
2675         spin_lock(&dev_list_lock);
2676         list_del_init(&dev->node);
2677         spin_unlock(&dev_list_lock);
2678
2679         pci_set_drvdata(pdev, NULL);
2680         flush_work(&dev->reset_work);
2681         misc_deregister(&dev->miscdev);
2682         nvme_dev_remove(dev);
2683         nvme_dev_shutdown(dev);
2684         nvme_free_queues(dev, 0);
2685         rcu_barrier();
2686         nvme_release_instance(dev);
2687         nvme_release_prp_pools(dev);
2688         kref_put(&dev->kref, nvme_free_dev);
2689 }
2690
2691 /* These functions are yet to be implemented */
2692 #define nvme_error_detected NULL
2693 #define nvme_dump_registers NULL
2694 #define nvme_link_reset NULL
2695 #define nvme_slot_reset NULL
2696 #define nvme_error_resume NULL
2697
2698 #ifdef CONFIG_PM_SLEEP
2699 static int nvme_suspend(struct device *dev)
2700 {
2701         struct pci_dev *pdev = to_pci_dev(dev);
2702         struct nvme_dev *ndev = pci_get_drvdata(pdev);
2703
2704         nvme_dev_shutdown(ndev);
2705         return 0;
2706 }
2707
2708 static int nvme_resume(struct device *dev)
2709 {
2710         struct pci_dev *pdev = to_pci_dev(dev);
2711         struct nvme_dev *ndev = pci_get_drvdata(pdev);
2712
2713         if (nvme_dev_resume(ndev) && !work_busy(&ndev->reset_work)) {
2714                 PREPARE_WORK(&ndev->reset_work, nvme_reset_failed_dev);
2715                 queue_work(nvme_workq, &ndev->reset_work);
2716         }
2717         return 0;
2718 }
2719 #endif
2720
2721 static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);
2722
2723 static const struct pci_error_handlers nvme_err_handler = {
2724         .error_detected = nvme_error_detected,
2725         .mmio_enabled   = nvme_dump_registers,
2726         .link_reset     = nvme_link_reset,
2727         .slot_reset     = nvme_slot_reset,
2728         .resume         = nvme_error_resume,
2729 };
2730
2731 /* Move to pci_ids.h later */
2732 #define PCI_CLASS_STORAGE_EXPRESS       0x010802
2733
2734 static const struct pci_device_id nvme_id_table[] = {
2735         { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
2736         { 0, }
2737 };
2738 MODULE_DEVICE_TABLE(pci, nvme_id_table);
2739
2740 static struct pci_driver nvme_driver = {
2741         .name           = "nvme",
2742         .id_table       = nvme_id_table,
2743         .probe          = nvme_probe,
2744         .remove         = nvme_remove,
2745         .shutdown       = nvme_shutdown,
2746         .driver         = {
2747                 .pm     = &nvme_dev_pm_ops,
2748         },
2749         .err_handler    = &nvme_err_handler,
2750 };
2751
2752 static int __init nvme_init(void)
2753 {
2754         int result;
2755
2756         nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
2757         if (IS_ERR(nvme_thread))
2758                 return PTR_ERR(nvme_thread);
2759
2760         result = -ENOMEM;
2761         nvme_workq = create_singlethread_workqueue("nvme");
2762         if (!nvme_workq)
2763                 goto kill_kthread;
2764
2765         result = register_blkdev(nvme_major, "nvme");
2766         if (result < 0)
2767                 goto kill_workq;
2768         else if (result > 0)
2769                 nvme_major = result;
2770
2771         result = pci_register_driver(&nvme_driver);
2772         if (result)
2773                 goto unregister_blkdev;
2774         return 0;
2775
2776  unregister_blkdev:
2777         unregister_blkdev(nvme_major, "nvme");
2778  kill_workq:
2779         destroy_workqueue(nvme_workq);
2780  kill_kthread:
2781         kthread_stop(nvme_thread);
2782         return result;
2783 }
2784
2785 static void __exit nvme_exit(void)
2786 {
2787         pci_unregister_driver(&nvme_driver);
2788         unregister_blkdev(nvme_major, "nvme");
2789         destroy_workqueue(nvme_workq);
2790         kthread_stop(nvme_thread);
2791 }
2792
2793 MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
2794 MODULE_LICENSE("GPL");
2795 MODULE_VERSION("0.9");
2796 module_init(nvme_init);
2797 module_exit(nvme_exit);