/*
 * NVM Express device driver
 * Copyright (c) 2011, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <linux/nvme.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/version.h>

#define NVME_Q_DEPTH		1024
#define SQ_SIZE(depth)		((depth) * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		((depth) * sizeof(struct nvme_completion))
#define NVME_MINORS		64

static int nvme_major;
module_param(nvme_major, int, 0);
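
/*
 * Left at zero (the default), nvme_major makes the register_blkdev() call
 * in nvme_init() pick a free major number dynamically; a non-zero value
 * requests that specific major instead.
 */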

/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
	struct list_head node;
	struct nvme_queue **queues;
	u32 __iomem *dbs;
	struct pci_dev *pci_dev;
	int instance;
	int queue_count;
	u32 ctrl_config;
	struct msix_entry *entry;
	struct nvme_bar __iomem *bar;
	struct list_head namespaces;
};

/*
 * An NVM Express namespace is equivalent to a SCSI LUN
 */
struct nvme_ns {
	struct list_head list;

	struct nvme_dev *dev;
	struct request_queue *queue;
	struct gendisk *disk;

	int ns_id;
	int lba_shift;
};

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;
	spinlock_t q_lock;
	struct nvme_command *sq_cmds;
	volatile struct nvme_completion *cqes;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	wait_queue_head_t sq_full;
	struct bio_list sq_cong;
	u32 __iomem *q_db;
	u16 q_depth;
	u16 cq_vector;
	u16 sq_head;
	u16 sq_tail;
	u16 cq_head;
	u16 cq_phase;
	/* Allocation bitmap followed by one context word per command ID */
	unsigned long cmdid_data[];
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
}

/**
 * alloc_cmdid - Allocate a Command ID
 * @param nvmeq The queue that will be used for this command
 * @param ctx A pointer that will be passed to the handler
 * @param handler The ID of the handler to call
 *
 * Allocate a Command ID for a queue.  The data passed in will
 * be passed to the completion handler.  This is implemented by using
 * the bottom two bits of the ctx pointer to store the handler ID.
 * Passing in a pointer that's not 4-byte aligned will cause a BUG.
 * We can change this if it becomes a problem.
 */
static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler)
{
	int depth = nvmeq->q_depth;
	unsigned long data = (unsigned long)ctx | handler;
	int cmdid;

	BUG_ON((unsigned long)ctx & 3);

	do {
		cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
		if (cmdid >= depth)
			return -EBUSY;
	} while (test_and_set_bit(cmdid, nvmeq->cmdid_data));

	nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(depth)] = data;
	return cmdid;
}
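
/*
 * Layout example: for a queue of depth 1024, cmdid_data starts with
 * BITS_TO_LONGS(1024) longs of allocation bitmap followed by 1024 longs of
 * per-command context.  A 4-byte-aligned ctx of 0x...940 tagged with
 * bio_completion_id (1) is stored as 0x...941; nvme_process_cq() later
 * recovers the pieces with (data & ~3UL) and (data & 3).
 */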

static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
								int handler)
{
	int cmdid;
	wait_event_killable(nvmeq->sq_full,
		(cmdid = alloc_cmdid(nvmeq, ctx, handler)) >= 0);
	return (cmdid < 0) ? -EINTR : cmdid;
}

/* If you need more than four handlers, you'll need to change how
 * alloc_cmdid and nvme_process_cq work
 */
enum {
	sync_completion_id = 0,
	bio_completion_id,
};

static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid)
{
	unsigned long data;

	data = nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(nvmeq->q_depth)];
	clear_bit(cmdid, nvmeq->cmdid_data);
	wake_up(&nvmeq->sq_full);
	return data;
}

static struct nvme_queue *get_nvmeq(struct nvme_ns *ns)
{
	int qid, cpu = get_cpu();
	if (cpu < ns->dev->queue_count)
		qid = cpu + 1;
	else
		qid = (cpu % rounddown_pow_of_two(ns->dev->queue_count)) + 1;
	return ns->dev->queues[qid];
}
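
/*
 * Per-CPU queue mapping: CPUs below the queue count submit to I/O queue
 * (cpu + 1), since queue 0 is always the admin queue; CPUs beyond it fold
 * back with a modulo, e.g. CPU 9 with eight I/O queues maps to
 * (9 % 8) + 1 = queue 2.
 */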

static void put_nvmeq(struct nvme_queue *nvmeq)
{
	put_cpu();
}

/**
 * nvme_submit_cmd: Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 *
 * Safe to use from interrupt context
 */
static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
	unsigned long flags;
	u16 tail;
	/* XXX: Need to check tail isn't going to overrun head */
	spin_lock_irqsave(&nvmeq->q_lock, flags);
	tail = nvmeq->sq_tail;
	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
	if (++tail == nvmeq->q_depth)
		tail = 0;
	/* The SQ tail doorbell takes the new tail, one past the last entry */
	writel(tail, nvmeq->q_db);
	nvmeq->sq_tail = tail;
	spin_unlock_irqrestore(&nvmeq->q_lock, flags);

	return 0;
}
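
/*
 * The submission ring is driver-produced, device-consumed: the driver owns
 * sq_tail and advertises it through the doorbell, while the device reports
 * how far it has read via the sq_head field of each completion entry (see
 * nvme_process_cq).  The overrun check the XXX above asks for would compare
 * the incremented tail against that reported head.
 */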

struct nvme_req_info {
	struct bio *bio;
	int nents;
	struct scatterlist sg[0];
};

/* XXX: use a mempool */
static struct nvme_req_info *alloc_info(unsigned nseg, gfp_t gfp)
{
	return kmalloc(sizeof(struct nvme_req_info) +
			sizeof(struct scatterlist) * nseg, gfp);
}

static void free_info(struct nvme_req_info *info)
{
	kfree(info);
}

static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	struct nvme_req_info *info = ctx;
	struct bio *bio = info->bio;
	/* Bit 0 of the status field is the phase tag; shift it away */
	u16 status = le16_to_cpup(&cqe->status) >> 1;

	dma_unmap_sg(nvmeq->q_dmadev, info->sg, info->nents,
			bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
	free_info(info);
	bio_endio(bio, status ? -EIO : 0);
}

static int nvme_map_bio(struct device *dev, struct nvme_req_info *info,
		struct bio *bio, enum dma_data_direction dma_dir, int psegs)
{
	struct bio_vec *bvec;
	struct scatterlist *sg = info->sg;
	int i, nsegs = 0;

	sg_init_table(sg, psegs);
	bio_for_each_segment(bvec, bio, i) {
		sg_set_page(sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
		sg++;
		nsegs++;
		/* XXX: handle non-mergable here */
	}
	info->nents = nsegs;

	return dma_map_sg(dev, info->sg, info->nents, dma_dir);
}

static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
								struct bio *bio)
{
	struct nvme_rw_command *cmnd;
	struct nvme_req_info *info;
	enum dma_data_direction dma_dir;
	int cmdid;
	u16 control;
	u32 dsmgmt;
	unsigned long flags;
	int psegs = bio_phys_segments(ns->queue, bio);

	info = alloc_info(psegs, GFP_NOIO);
	if (!info)
		goto congestion;
	info->bio = bio;

	cmdid = alloc_cmdid(nvmeq, info, bio_completion_id);
	if (unlikely(cmdid < 0))
		goto free_info;

	control = 0;
	if (bio->bi_rw & REQ_FUA)
		control |= NVME_RW_FUA;
	if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
		control |= NVME_RW_LR;

	dsmgmt = 0;
	if (bio->bi_rw & REQ_RAHEAD)
		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;

	spin_lock_irqsave(&nvmeq->q_lock, flags);
	cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail].rw;

	/* Clear the entry so stale prp2/metadata fields can't leak through */
	memset(cmnd, 0, sizeof(*cmnd));
	if (bio_data_dir(bio)) {
		cmnd->opcode = nvme_cmd_write;
		dma_dir = DMA_TO_DEVICE;
	} else {
		cmnd->opcode = nvme_cmd_read;
		dma_dir = DMA_FROM_DEVICE;
	}

	nvme_map_bio(nvmeq->q_dmadev, info, bio, dma_dir, psegs);

	cmnd->command_id = cmdid;
	cmnd->nsid = cpu_to_le32(ns->ns_id);
	cmnd->prp1 = cpu_to_le64(sg_phys(info->sg));
	/* XXX: Support more than one PRP */
	cmnd->slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9));
	cmnd->length = cpu_to_le16((bio->bi_size >> ns->lba_shift) - 1);
	cmnd->control = cpu_to_le16(control);
	cmnd->dsmgmt = cpu_to_le32(dsmgmt);

	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	/* Ring the doorbell with the advanced tail */
	writel(nvmeq->sq_tail, nvmeq->q_db);

	spin_unlock_irqrestore(&nvmeq->q_lock, flags);

	return 0;

 free_info:
	free_info(info);
 congestion:
	return -EBUSY;
}
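
/*
 * Worked example of the LBA conversion above for a 4096-byte-block
 * namespace (lba_shift == 12): a bio at 512-byte sector 80 with bi_size
 * 8192 yields slba = 80 >> (12 - 9) = 10 and length = (8192 >> 12) - 1 = 1,
 * i.e. two 4K blocks starting at device LBA 10 (NVMe block counts are
 * zero-based).
 */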

/*
 * NB: return value of non-zero would mean that we were a stacking driver.
 * make_request must always succeed.
 */
static int nvme_make_request(struct request_queue *q, struct bio *bio)
{
	struct nvme_ns *ns = q->queuedata;
	struct nvme_queue *nvmeq = get_nvmeq(ns);

	if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
		blk_set_queue_congested(q, rw_is_sync(bio->bi_rw));
		bio_list_add(&nvmeq->sq_cong, bio);
	}
	put_nvmeq(nvmeq);

	return 0;
}

struct sync_cmd_info {
	struct task_struct *task;
	u32 result;
	int status;
};

static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	struct sync_cmd_info *cmdinfo = ctx;
	cmdinfo->result = le32_to_cpup(&cqe->result);
	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
	wake_up_process(cmdinfo->task);
}

typedef void (*completion_fn)(struct nvme_queue *, void *,
						struct nvme_completion *);

static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq)
{
	u16 head, phase;

	static const completion_fn completions[4] = {
		[sync_completion_id] = sync_completion,
		[bio_completion_id]  = bio_completion,
	};

	head = nvmeq->cq_head;
	phase = nvmeq->cq_phase;

	for (;;) {
		unsigned long data;
		void *ptr;
		unsigned char handler;
		struct nvme_completion cqe = nvmeq->cqes[head];
		if ((le16_to_cpu(cqe.status) & 1) != phase)
			break;
		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
		if (++head == nvmeq->q_depth) {
			head = 0;
			phase = !phase;
		}

		data = free_cmdid(nvmeq, cqe.command_id);
		handler = data & 3;
		ptr = (void *)(data & ~3UL);
		completions[handler](nvmeq, ptr, &cqe);
	}

	/* If the controller ignores the cq head doorbell and continuously
	 * writes to the queue, it is theoretically possible to wrap around
	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
	 * requires that 0.1% of your interrupts are handled, so this isn't
	 * a big problem.
	 */
	if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
		return IRQ_NONE;

	/* The CQ head doorbell is the second doorbell of the queue's pair */
	writel(head, nvmeq->q_db + 1);
	nvmeq->cq_head = head;
	nvmeq->cq_phase = phase;

	return IRQ_HANDLED;
}
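
/*
 * The phase tag (bit 0 of each completion's status field) is what lets the
 * loop above detect new entries without any head pointer from the device:
 * the controller writes entries with phase 1 on its first pass through the
 * ring, 0 on the second, and so on.  An entry is fresh only while its phase
 * bit matches nvmeq->cq_phase, which is why cq_phase is flipped each time
 * head wraps and why it starts at 1 in nvme_alloc_queue().
 */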

static irqreturn_t nvme_irq(int irq, void *data)
{
	return nvme_process_cq(data);
}

/*
 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code
 */
static int nvme_submit_sync_cmd(struct nvme_queue *q, struct nvme_command *cmd,
								u32 *result)
{
	int cmdid;
	struct sync_cmd_info cmdinfo;

	cmdinfo.task = current;
	cmdinfo.status = -EINTR;

	cmdid = alloc_cmdid_killable(q, &cmdinfo, sync_completion_id);
	if (cmdid < 0)
		return cmdid;
	cmd->common.command_id = cmdid;

	set_current_state(TASK_UNINTERRUPTIBLE);
	nvme_submit_cmd(q, cmd);
	schedule();

	if (result)
		*result = cmdinfo.result;

	return cmdinfo.status;
}
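
/*
 * The synchronous path parks the caller in TASK_UNINTERRUPTIBLE and relies
 * on sync_completion() calling wake_up_process() from interrupt context
 * once the controller posts the completion.  cmdinfo lives on the caller's
 * stack, which is safe because the task cannot leave this function before
 * being woken.
 */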

static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
								u32 *result)
{
	return nvme_submit_sync_cmd(dev->queues[0], cmd, result);
}

static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	int status;
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;

	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;

	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

static void nvme_free_queue(struct nvme_dev *dev, int qid)
{
	struct nvme_queue *nvmeq = dev->queues[qid];

	free_irq(dev->entry[nvmeq->cq_vector].vector, nvmeq);

	/* Don't tell the adapter to delete the admin queue */
	if (qid) {
		adapter_delete_sq(dev, qid);
		adapter_delete_cq(dev, qid);
	}

	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
}

static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
							int depth, int vector)
{
	struct device *dmadev = &dev->pci_dev->dev;
	/* Room for the cmdid bitmap plus one context word per command */
	unsigned extra = (depth + BITS_TO_LONGS(depth)) * sizeof(long);
	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
	if (!nvmeq)
		return NULL;

	nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
					&nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));

	nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
					&nvmeq->sq_dma_addr, GFP_KERNEL);
	if (!nvmeq->sq_cmds)
		goto free_cqdma;

	nvmeq->q_dmadev = dmadev;
	spin_lock_init(&nvmeq->q_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	init_waitqueue_head(&nvmeq->sq_full);
	bio_list_init(&nvmeq->sq_cong);
	nvmeq->q_db = &dev->dbs[qid * 2];
	nvmeq->q_depth = depth;
	nvmeq->cq_vector = vector;

	return nvmeq;

 free_cqdma:
	/* q_depth isn't set yet on this path; free with the alloc'd size */
	dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
							nvmeq->cq_dma_addr);
 free_nvmeq:
	kfree(nvmeq);
	return NULL;
}

static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
							const char *name)
{
	return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
				IRQF_DISABLED | IRQF_SHARED, name, nvmeq);
}

static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev,
					int qid, int cq_size, int vector)
{
	int result;
	struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector);
	if (!nvmeq)
		return NULL;

	result = adapter_alloc_cq(dev, qid, nvmeq);
	if (result < 0)
		goto free_nvmeq;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		goto release_cq;

	result = queue_request_irq(dev, nvmeq, "nvme");
	if (result < 0)
		goto release_sq;

	return nvmeq;

 release_sq:
	adapter_delete_sq(dev, qid);
 release_cq:
	adapter_delete_cq(dev, qid);
 free_nvmeq:
	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
	return NULL;
}

static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	struct nvme_queue *nvmeq;

	/* The doorbell registers start 4096 bytes into the BAR */
	dev->dbs = ((void __iomem *)dev->bar) + 4096;

	nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
	if (!nvmeq)
		return -ENOMEM;

	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
	dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
	dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;

	writel(aqa, &dev->bar->aqa);
	writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
	writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
	writel(dev->ctrl_config, &dev->bar->cc);

	/* Poll until the controller reports ready */
	while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
	}

	result = queue_request_irq(dev, nvmeq, "nvme admin");
	dev->queues[0] = nvmeq;
	return result;
}
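
/*
 * AQA packs two zero-based queue sizes into one register: bits 27:16 hold
 * the admin CQ size and bits 11:0 the admin SQ size.  For the depth of 64
 * used above, aqa = 63 | (63 << 16) = 0x003f003f.
 */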

static int nvme_identify(struct nvme_ns *ns, void __user *addr, int cns)
{
	struct nvme_dev *dev = ns->dev;
	int status;
	int err;
	struct nvme_command c;
	void *page;
	dma_addr_t dma_addr;

	page = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr,
								GFP_KERNEL);

	memset(&c, 0, sizeof(c));
	c.identify.opcode = nvme_admin_identify;
	c.identify.nsid = cns ? 0 : cpu_to_le32(ns->ns_id);
	c.identify.prp1 = cpu_to_le64(dma_addr);
	c.identify.cns = cpu_to_le32(cns);

	status = nvme_submit_admin_cmd(dev, &c, NULL);

	if (status)
		err = -EIO;
	else if (copy_to_user(addr, page, 4096))
		err = -EFAULT;
	else
		err = 0;

	dma_free_coherent(&dev->pci_dev->dev, 4096, page, dma_addr);

	return err;
}

static int nvme_get_range_type(struct nvme_ns *ns, void __user *addr)
{
	struct nvme_dev *dev = ns->dev;
	int status;
	int err;
	struct nvme_command c;
	void *page;
	dma_addr_t dma_addr;

	page = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr,
								GFP_KERNEL);

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_get_features;
	c.features.nsid = cpu_to_le32(ns->ns_id);
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE);

	status = nvme_submit_admin_cmd(dev, &c, NULL);

	/* XXX: Assuming first range for now */
	if (status)
		err = -EIO;
	else if (copy_to_user(addr, page, 64))
		err = -EFAULT;
	else
		err = 0;

	dma_free_coherent(&dev->pci_dev->dev, 4096, page, dma_addr);

	return err;
}

static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
							unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;

	switch (cmd) {
	case NVME_IOCTL_IDENTIFY_NS:
		return nvme_identify(ns, (void __user *)arg, 0);
	case NVME_IOCTL_IDENTIFY_CTRL:
		return nvme_identify(ns, (void __user *)arg, 1);
	case NVME_IOCTL_GET_RANGE_TYPE:
		return nvme_get_range_type(ns, (void __user *)arg);
	default:
		return -ENOTTY;
	}
}
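
/*
 * Illustrative (hypothetical) userspace call of the ioctls above; the
 * handler copies a full 4096-byte Identify page, so the destination buffer
 * must be at least that large:
 *
 *	struct nvme_id_ctrl ctrl;
 *	int fd = open("/dev/nvme0n1", O_RDONLY);
 *	if (ioctl(fd, NVME_IOCTL_IDENTIFY_CTRL, &ctrl) == 0)
 *		printf("model: %.40s\n", ctrl.mn);
 */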

static const struct block_device_operations nvme_fops = {
	.owner		= THIS_MODULE,
	.ioctl		= nvme_ioctl,
};

static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int index,
			struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
{
	struct nvme_ns *ns;
	struct gendisk *disk;
	int lbaf;

	if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
		return NULL;

	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
	if (!ns)
		return NULL;
	ns->queue = blk_alloc_queue(GFP_KERNEL);
	if (!ns->queue)
		goto out_free_ns;
	ns->queue->queue_flags = QUEUE_FLAG_DEFAULT | QUEUE_FLAG_NOMERGES |
				QUEUE_FLAG_NONROT | QUEUE_FLAG_DISCARD;
	blk_queue_make_request(ns->queue, nvme_make_request);
	ns->dev = dev;
	ns->queue->queuedata = ns;

	disk = alloc_disk(NVME_MINORS);
	if (!disk)
		goto out_free_queue;
	ns->ns_id = index;
	ns->disk = disk;
	lbaf = id->flbas & 0xf;
	ns->lba_shift = id->lbaf[lbaf].ds;

	disk->major = nvme_major;
	disk->minors = NVME_MINORS;
	disk->first_minor = NVME_MINORS * index;
	disk->fops = &nvme_fops;
	disk->private_data = ns;
	disk->queue = ns->queue;
	sprintf(disk->disk_name, "nvme%dn%d", dev->instance, index);
	set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));

	return ns;

 out_free_queue:
	blk_cleanup_queue(ns->queue);
 out_free_ns:
	kfree(ns);
	return NULL;
}
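
/*
 * Naming/minor layout example: with NVME_MINORS of 64, namespace index 3 on
 * controller instance 0 becomes /dev/nvme0n3 with first_minor 192, leaving
 * the following 63 minor numbers for its partitions.
 */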

static void nvme_ns_free(struct nvme_ns *ns)
{
	put_disk(ns->disk);
	blk_cleanup_queue(ns->queue);
	kfree(ns);
}

static int set_queue_count(struct nvme_dev *dev, int count)
{
	int status;
	u32 result;
	struct nvme_command c;
	u32 q_count = (count - 1) | ((count - 1) << 16);

	memset(&c, 0, sizeof(c));
	/* Number of Queues is a Set Features command; dword11 carries the
	 * requested (zero-based) SQ and CQ counts.
	 */
	c.features.opcode = nvme_admin_set_features;
	c.features.fid = cpu_to_le32(NVME_FEAT_NUM_QUEUES);
	c.features.dword11 = cpu_to_le32(q_count);

	status = nvme_submit_admin_cmd(dev, &c, &result);
	if (status)
		return -EIO;
	return min(result & 0xffff, result >> 16) + 1;
}
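
/*
 * Worked example: asking for 4 queues sends q_count = 3 | (3 << 16) =
 * 0x00030003 (both fields are zero-based).  If the controller answers
 * 0x00070003 it has granted 4 SQs and 8 CQs, so min(3, 7) + 1 = 4 usable
 * I/O queues are reported back.
 */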

static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
{
	int result, cpu, i, nr_queues;

	nr_queues = num_online_cpus();
	result = set_queue_count(dev, nr_queues);
	if (result < 0)
		return result;
	if (result < nr_queues)
		nr_queues = result;

	/* Deregister the admin queue's interrupt */
	free_irq(dev->entry[0].vector, dev->queues[0]);

	for (i = 0; i < nr_queues; i++)
		dev->entry[i].entry = i;
	for (;;) {
		result = pci_enable_msix(dev->pci_dev, dev->entry, nr_queues);
		if (result == 0) {
			break;
		} else if (result > 0) {
			/* Only this many vectors are available; retry */
			nr_queues = result;
			continue;
		} else {
			/* MSI-X is unusable; fall back to a single queue */
			nr_queues = 1;
			break;
		}
	}

	result = queue_request_irq(dev, dev->queues[0], "nvme admin");
	/* XXX: handle failure here */

	cpu = cpumask_first(cpu_online_mask);
	for (i = 0; i < nr_queues; i++) {
		irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
		cpu = cpumask_next(cpu, cpu_online_mask);
	}

	for (i = 0; i < nr_queues; i++) {
		dev->queues[i + 1] = nvme_create_queue(dev, i + 1,
							NVME_Q_DEPTH, i);
		if (!dev->queues[i + 1])
			return -ENOMEM;
		dev->queue_count++;
	}

	return 0;
}
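
/*
 * pci_enable_msix() returns 0 on success, a negative errno on hard failure,
 * and a positive count when fewer vectors than requested are available; the
 * loop above retries with that smaller count, so e.g. a request for 16
 * vectors on a controller exposing 8 simply converges to 8 I/O queues.
 */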

static void nvme_free_queues(struct nvme_dev *dev)
{
	int i;

	for (i = dev->queue_count - 1; i >= 0; i--)
		nvme_free_queue(dev, i);
}

static int __devinit nvme_dev_add(struct nvme_dev *dev)
{
	int res, nn, i;
	struct nvme_ns *ns, *next;
	void *id;
	dma_addr_t dma_addr;
	struct nvme_command cid, crt;

	res = nvme_setup_io_queues(dev);
	if (res)
		return res;

	/* XXX: Switch to a SG list once prp2 works */
	id = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr,
								GFP_KERNEL);

	memset(&cid, 0, sizeof(cid));
	cid.identify.opcode = nvme_admin_identify;
	cid.identify.nsid = 0;
	cid.identify.prp1 = cpu_to_le64(dma_addr);
	cid.identify.cns = cpu_to_le32(1);

	res = nvme_submit_admin_cmd(dev, &cid, NULL);
	if (res) {
		res = -EIO;
		goto out_free;
	}

	nn = le32_to_cpup(&((struct nvme_id_ctrl *)id)->nn);

	cid.identify.cns = 0;
	memset(&crt, 0, sizeof(crt));
	crt.features.opcode = nvme_admin_get_features;
	crt.features.prp1 = cpu_to_le64(dma_addr + 4096);
	crt.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE);

	for (i = 0; i < nn; i++) {
		cid.identify.nsid = cpu_to_le32(i);
		res = nvme_submit_admin_cmd(dev, &cid, NULL);
		if (res)
			continue;

		if (((struct nvme_id_ns *)id)->ncap == 0)
			continue;

		crt.features.nsid = cpu_to_le32(i);
		res = nvme_submit_admin_cmd(dev, &crt, NULL);
		if (res)
			continue;

		ns = nvme_alloc_ns(dev, i, id, id + 4096);
		if (ns)
			list_add_tail(&ns->list, &dev->namespaces);
	}
	list_for_each_entry(ns, &dev->namespaces, list)
		add_disk(ns->disk);

	/* Free with the same 8192-byte size the buffer was allocated with */
	dma_free_coherent(&dev->pci_dev->dev, 8192, id, dma_addr);
	return 0;

 out_free:
	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
		list_del(&ns->list);
		nvme_ns_free(ns);
	}

	dma_free_coherent(&dev->pci_dev->dev, 8192, id, dma_addr);
	return res;
}

static int nvme_dev_remove(struct nvme_dev *dev)
{
	struct nvme_ns *ns, *next;

	/* TODO: wait all I/O finished or cancel them */

	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
		list_del(&ns->list);
		del_gendisk(ns->disk);
		nvme_ns_free(ns);
	}

	nvme_free_queues(dev);

	return 0;
}

/* XXX: Use an ida or something to let remove / add work correctly */
static void nvme_set_instance(struct nvme_dev *dev)
{
	static int instance;
	dev->instance = instance++;
}

static void nvme_release_instance(struct nvme_dev *dev)
{
}

static int __devinit nvme_probe(struct pci_dev *pdev,
						const struct pci_device_id *id)
{
	int result = -ENOMEM;
	struct nvme_dev *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
								GFP_KERNEL);
	if (!dev->entry)
		goto free;
	dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *),
								GFP_KERNEL);
	if (!dev->queues)
		goto free;

	INIT_LIST_HEAD(&dev->namespaces);
	dev->pci_dev = pdev;
	pci_set_drvdata(pdev, dev);
	dma_set_mask(&dev->pci_dev->dev, DMA_BIT_MASK(64));
	nvme_set_instance(dev);
	dev->entry[0].vector = pdev->irq;

	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
	if (!dev->bar) {
		result = -ENOMEM;
		goto disable;
	}

	result = nvme_configure_admin_queue(dev);
	if (result)
		goto unmap;
	dev->queue_count++;

	result = nvme_dev_add(dev);
	if (result)
		goto delete;
	return 0;

 delete:
	nvme_free_queues(dev);
 unmap:
	iounmap(dev->bar);
 disable:
	pci_disable_msix(pdev);
	nvme_release_instance(dev);
 free:
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
	return result;
}

static void __devexit nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_dev_remove(dev);
	pci_disable_msix(pdev);
	iounmap(dev->bar);
	nvme_release_instance(dev);
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
}

/* These functions are yet to be implemented */
#define nvme_error_detected NULL
#define nvme_dump_registers NULL
#define nvme_link_reset NULL
#define nvme_slot_reset NULL
#define nvme_error_resume NULL
#define nvme_suspend NULL
#define nvme_resume NULL

static struct pci_error_handlers nvme_err_handler = {
	.error_detected	= nvme_error_detected,
	.mmio_enabled	= nvme_dump_registers,
	.link_reset	= nvme_link_reset,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
};

/* Move to pci_ids.h later */
#define PCI_CLASS_STORAGE_EXPRESS	0x010802

static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = {
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.remove		= __devexit_p(nvme_remove),
	.suspend	= nvme_suspend,
	.resume		= nvme_resume,
	.err_handler	= &nvme_err_handler,
};

static int __init nvme_init(void)
{
	int result;

	/* With nvme_major == 0, register_blkdev() allocates a free major */
	result = register_blkdev(nvme_major, "nvme");
	if (result < 0)
		return result;
	else if (result > 0)
		nvme_major = result;

	result = pci_register_driver(&nvme_driver);
	if (result)
		unregister_blkdev(nvme_major, "nvme");
	return result;
}

static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	unregister_blkdev(nvme_major, "nvme");
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("0.1");
module_init(nvme_init);
module_exit(nvme_exit);