2 * This file is provided under a dual BSD/GPLv2 license. When using or
3 * redistributing this file, you may do so under either license.
7 * Copyright(c) 2015 Intel Corporation. All rights reserved.
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of version 2 of the GNU General Public License as
11 * published by the Free Software Foundation.
15 * Copyright(c) 2015 Intel Corporation. All rights reserved.
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
21 * * Redistributions of source code must retain the above copyright
22 * notice, this list of conditions and the following disclaimer.
23 * * Redistributions in binary form must reproduce the above copy
24 * notice, this list of conditions and the following disclaimer in
25 * the documentation and/or other materials provided with the
27 * * Neither the name of Intel Corporation nor the names of its
28 * contributors may be used to endorse or promote products derived
29 * from this software without specific prior written permission.
31 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
32 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
33 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
34 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
35 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
36 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
37 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
38 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
39 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
40 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
41 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * PCIe NTB Perf Linux driver
46 #include <linux/init.h>
47 #include <linux/kernel.h>
48 #include <linux/module.h>
49 #include <linux/kthread.h>
50 #include <linux/time.h>
51 #include <linux/timer.h>
52 #include <linux/dma-mapping.h>
53 #include <linux/pci.h>
54 #include <linux/slab.h>
55 #include <linux/spinlock.h>
56 #include <linux/debugfs.h>
57 #include <linux/dmaengine.h>
58 #include <linux/delay.h>
59 #include <linux/sizes.h>
60 #include <linux/ntb.h>
/* NOTE(review): this chunk is a mangled extraction — the leading integers on
 * each line are the original file's line numbers, not code. Left untouched.
 */
/* Module identity strings consumed by the MODULE_*() macros below. */
62 #define DRIVER_NAME "ntb_perf"
63 #define DRIVER_DESCRIPTION "PCIe NTB Performance Measurement Tool"
65 #define DRIVER_LICENSE "Dual BSD/GPL"
66 #define DRIVER_VERSION "1.0"
67 #define DRIVER_AUTHOR "Dave Jiang <dave.jiang@intel.com>"
/* Link re-poll period in ms (passed to msecs_to_jiffies() in perf_link_work). */
69 #define PERF_LINK_DOWN_TIMEOUT 10
/* Protocol version exchanged via scratchpads; both peers must report this. */
70 #define PERF_VERSION 0xffff0001
/* Hard cap on concurrent measurement kthreads. */
71 #define MAX_THREADS 32
/* Largest single kmalloc'd source buffer (1 MiB). */
72 #define MAX_TEST_SIZE SZ_1M
/* schedule_timeout() interval (jiffies) and retry cap while waiting for a
 * DMA descriptor — see the prep loop in perf_copy().
 */
74 #define DMA_OUT_RESOURCE_TO 50
75 #define DMA_RETRIES 20
/* 4 GiB marker used to yield the CPU periodically in perf_move_data(). */
76 #define SZ_4G (1ULL << 32)
77 #define MAX_SEG_ORDER 20 /* no larger than 1M for kmalloc buffer */
/* Standard module metadata. */
79 MODULE_LICENSE(DRIVER_LICENSE
);
80 MODULE_VERSION(DRIVER_VERSION
);
81 MODULE_AUTHOR(DRIVER_AUTHOR
);
82 MODULE_DESCRIPTION(DRIVER_DESCRIPTION
);
/* Top-level debugfs directory shared by all NTB perf devices; created
 * lazily in perf_debugfs_setup() and removed in perf_remove().
 */
84 static struct dentry
*perf_debugfs_dir
;
/* seg_order: log2 size of each buffer segment copied per perf_copy() call. */
86 static unsigned int seg_order
= 19; /* 512K */
87 module_param(seg_order
, uint
, 0644);
88 MODULE_PARM_DESC(seg_order
, "size order [n^2] of buffer segment for testing");
/* run_order: log2 of the total number of bytes each thread transfers. */
90 static unsigned int run_order
= 32; /* 4G */
91 module_param(run_order
, uint
, 0644);
92 MODULE_PARM_DESC(run_order
, "size order [n^2] of total data to transfer");
/* use_dma: false (default) = CPU memcpy_toio path; true = dmaengine path. */
94 static bool use_dma
; /* default to 0 */
95 module_param(use_dma
, bool, 0644);
96 MODULE_PARM_DESC(use_dma
, "Using DMA engine to measure performance");
/* NOTE(review): the struct declarations (apparently struct perf_mw,
 * struct pthr_ctx and struct perf_ctx, judging by later uses of
 * perf->mw, pctx->srcs, perf->pthr_ctx) are truncated in this chunk —
 * the `struct ... {` headers and several fields are missing. Confirm
 * against the full source before editing.
 */
/* Memory-window description: peer-visible BAR region and its
 * translation-alignment constraints (filled by ntb_mw_get_range()).
 */
99 phys_addr_t phys_addr
;
100 resource_size_t phys_size
;
101 resource_size_t xlat_align
;
102 resource_size_t xlat_align_size
;
/* Per-thread context: the kthread, its owning perf context, and an
 * optional DMA channel when use_dma is set.
 */
113 struct task_struct
*thread
;
114 struct perf_ctx
*perf
;
116 struct dma_chan
*dma_chan
;
/* Rotating set of source buffers (MAX_SRCS defined outside this view). */
119 void *srcs
[MAX_SRCS
];
/* Link-state work items and debugfs handles of the device context. */
127 struct work_struct link_cleanup
;
128 struct delayed_work link_work
;
129 struct dentry
*debugfs_node_dir
;
130 struct dentry
*debugfs_run
;
131 struct dentry
*debugfs_threads
;
134 struct pthr_ctx pthr_ctx
[MAX_THREADS
];
147 static void perf_link_event(void *ctx
)
149 struct perf_ctx
*perf
= ctx
;
151 if (ntb_link_is_up(perf
->ntb
, NULL
, NULL
) == 1)
152 schedule_delayed_work(&perf
->link_work
, 2*HZ
);
154 schedule_work(&perf
->link_cleanup
);
157 static void perf_db_event(void *ctx
, int vec
)
159 struct perf_ctx
*perf
= ctx
;
160 u64 db_bits
, db_mask
;
162 db_mask
= ntb_db_vector_mask(perf
->ntb
, vec
);
163 db_bits
= ntb_db_read(perf
->ntb
);
165 dev_dbg(&perf
->ntb
->dev
, "doorbell vec %d mask %#llx bits %#llx\n",
166 vec
, db_mask
, db_bits
);
169 static const struct ntb_ctx_ops perf_ops
= {
170 .link_event
= perf_link_event
,
171 .db_event
= perf_db_event
,
174 static void perf_copy_callback(void *data
)
176 struct pthr_ctx
*pctx
= data
;
178 atomic_dec(&pctx
->dma_sync
);
/* Copy one segment of `size` bytes from the local source buffer into the
 * peer-visible memory window at `dst`, either by CPU (memcpy_toio) or by
 * submitting a dmaengine memcpy descriptor.
 *
 * NOTE(review): this extraction is missing several interior lines (early
 * returns, the do{} head of the descriptor-prep retry loop, error labels),
 * so the control flow below is incomplete — do not restyle from this view.
 */
181 static ssize_t
perf_copy(struct pthr_ctx
*pctx
, char *dst
,
182 char *src
, size_t size
)
184 struct perf_ctx
*perf
= pctx
->perf
;
185 struct dma_async_tx_descriptor
*txd
;
186 struct dma_chan
*chan
= pctx
->dma_chan
;
187 struct dma_device
*device
;
188 struct dmaengine_unmap_data
*unmap
;
190 size_t src_off
, dst_off
;
191 struct perf_mw
*mw
= &perf
->mw
;
192 u64 vbase
, dst_vaddr
;
/* CPU path: dst points into the ioremapped window, hence the _toio copy. */
197 memcpy_toio(dst
, src
, size
);
202 dev_err(&perf
->ntb
->dev
, "DMA engine does not exist\n");
206 device
= chan
->device
;
/* Page offsets of src/dst; the engine must accept this alignment. */
207 src_off
= (size_t)src
& ~PAGE_MASK
;
208 dst_off
= (size_t)dst
& ~PAGE_MASK
;
210 if (!is_dma_copy_aligned(device
, src_off
, dst_off
, size
))
/* Translate the virtual window offset of dst into a bus/physical address
 * for the DMA engine: dst_phys = window phys base + (dst - window vbase).
 */
213 vbase
= (u64
)(u64
*)mw
->vbase
;
214 dst_vaddr
= (u64
)(u64
*)dst
;
215 dst_phys
= mw
->phys_addr
+ (dst_vaddr
- vbase
);
/* One-entry unmap bundle; GFP_NOWAIT because we may hold no sleepable ctx. */
217 unmap
= dmaengine_get_unmap_data(device
->dev
, 1, GFP_NOWAIT
);
222 unmap
->addr
[0] = dma_map_page(device
->dev
, virt_to_page(src
),
223 src_off
, size
, DMA_TO_DEVICE
);
224 if (dma_mapping_error(device
->dev
, unmap
->addr
[0]))
/* Retry descriptor prep with a short sleep while the engine's ring is full. */
230 txd
= device
->device_prep_dma_memcpy(chan
, dst_phys
,
232 size
, DMA_PREP_INTERRUPT
);
234 set_current_state(TASK_INTERRUPTIBLE
);
235 schedule_timeout(DMA_OUT_RESOURCE_TO
);
237 } while (!txd
&& (++retries
< DMA_RETRIES
));
240 pctx
->dma_prep_err
++;
/* Completion callback decrements dma_sync (see perf_copy_callback). */
244 txd
->callback
= perf_copy_callback
;
245 txd
->callback_param
= pctx
;
246 dma_set_unmap(txd
, unmap
);
248 cookie
= dmaengine_submit(txd
);
249 if (dma_submit_error(cookie
))
/* Count the in-flight descriptor before kicking the channel. */
252 atomic_inc(&pctx
->dma_sync
);
253 dma_async_issue_pending(chan
);
258 dmaengine_unmap_put(unmap
);
260 dmaengine_unmap_put(unmap
);
/* Drive repeated perf_copy() calls until `total` bytes have moved through
 * the window, then time the run and log throughput in MB/s.
 *
 * NOTE(review): interior lines are missing from this extraction (the loop
 * body that advances tmp/copied, the window-wrap reset, kstop capture),
 * so the visible flow is incomplete.
 */
264 static int perf_move_data(struct pthr_ctx
*pctx
, char *dst
, char *src
,
265 u64 buf_size
, u64 win_size
, u64 total
)
267 int chunks
, total_chunks
, i
;
268 int copied_chunks
= 0;
269 u64 copied
= 0, result
;
272 ktime_t kstart
, kstop
, kdiff
;
/* chunks per window pass, and total chunks for the full run. */
274 chunks
= div64_u64(win_size
, buf_size
);
275 total_chunks
= div64_u64(total
, buf_size
);
276 kstart
= ktime_get();
278 for (i
= 0; i
< total_chunks
; i
++) {
279 result
= perf_copy(pctx
, tmp
, src
, buf_size
);
/* Window exhausted: presumably wraps tmp back to dst — lines missing here. */
282 if (copied_chunks
== chunks
) {
288 /* Probably should schedule every 4GB to prevent soft hang. */
289 if (((copied
% SZ_4G
) == 0) && !use_dma
) {
290 set_current_state(TASK_INTERRUPTIBLE
);
296 pr_info("%s: All DMA descriptors submitted\n", current
->comm
);
/* Drain: wait for every outstanding DMA completion callback. */
297 while (atomic_read(&pctx
->dma_sync
) != 0)
302 kdiff
= ktime_sub(kstop
, kstart
);
303 diff_us
= ktime_to_us(kdiff
);
305 pr_info("%s: copied %llu bytes\n", current
->comm
, copied
);
307 pr_info("%s: lasted %llu usecs\n", current
->comm
, diff_us
);
/* bytes / usecs == MB/s (10^6 scale cancels). */
309 perf
= div64_u64(copied
, diff_us
);
311 pr_info("%s: MBytes/s: %llu\n", current
->comm
, perf
);
316 static bool perf_dma_filter_fn(struct dma_chan
*chan
, void *node
)
318 return dev_to_node(&chan
->dev
->device
) == (int)(unsigned long)node
;
/* Measurement kthread body: optionally grab a NUMA-local DMA channel,
 * allocate MAX_SRCS source buffers, rendezvous with sibling threads via
 * perf->tsync, then run perf_move_data() and clean up.
 *
 * NOTE(review): error-path branches and several statements are missing
 * from this extraction; the cleanup sections below are duplicated
 * success/failure paths whose labels are not visible.
 */
321 static int ntb_perf_thread(void *data
)
323 struct pthr_ctx
*pctx
= data
;
324 struct perf_ctx
*perf
= pctx
->perf
;
325 struct pci_dev
*pdev
= perf
->ntb
->pdev
;
326 struct perf_mw
*mw
= &perf
->mw
;
328 u64 win_size
, buf_size
, total
;
331 struct dma_chan
*dma_chan
= NULL
;
333 pr_info("kthread %s starting...\n", current
->comm
);
/* Allocate everything on the NTB device's NUMA node. */
335 node
= dev_to_node(&pdev
->dev
);
337 if (use_dma
&& !pctx
->dma_chan
) {
338 dma_cap_mask_t dma_mask
;
340 dma_cap_zero(dma_mask
);
341 dma_cap_set(DMA_MEMCPY
, dma_mask
);
/* Node id smuggled through the filter's void* cookie. */
342 dma_chan
= dma_request_channel(dma_mask
, perf_dma_filter_fn
,
343 (void *)(unsigned long)node
);
345 pr_warn("%s: cannot acquire DMA channel, quitting\n",
349 pctx
->dma_chan
= dma_chan
;
/* Source buffers rotated per iteration to defeat caching effects. */
352 for (i
= 0; i
< MAX_SRCS
; i
++) {
353 pctx
->srcs
[i
] = kmalloc_node(MAX_TEST_SIZE
, GFP_KERNEL
, node
);
354 if (!pctx
->srcs
[i
]) {
360 win_size
= mw
->phys_size
;
361 buf_size
= 1ULL << seg_order
;
362 total
= 1ULL << run_order
;
364 if (buf_size
> MAX_TEST_SIZE
)
365 buf_size
= MAX_TEST_SIZE
;
367 dst
= (char *)mw
->vbase
;
/* Barrier: wait until every launched thread has reached this point. */
369 atomic_inc(&perf
->tsync
);
370 while (atomic_read(&perf
->tsync
) != perf
->perf_threads
)
373 src
= pctx
->srcs
[pctx
->src_idx
];
/* Mask-wrap works because MAX_SRCS is a power of two (declared elsewhere). */
374 pctx
->src_idx
= (pctx
->src_idx
+ 1) & (MAX_SRCS
- 1);
376 rc
= perf_move_data(pctx
, dst
, src
, buf_size
, win_size
, total
);
378 atomic_dec(&perf
->tsync
);
381 pr_err("%s: failed\n", current
->comm
);
/* Free source buffers (error path). */
386 for (i
= 0; i
< MAX_SRCS
; i
++) {
387 kfree(pctx
->srcs
[i
]);
388 pctx
->srcs
[i
] = NULL
;
/* Free source buffers (exit path). */
394 for (i
= 0; i
< MAX_SRCS
; i
++) {
395 kfree(pctx
->srcs
[i
]);
396 pctx
->srcs
[i
] = NULL
;
400 dma_release_channel(dma_chan
);
401 pctx
->dma_chan
= NULL
;
/* Tear down the inbound memory window: clear the NTB translation and
 * release the coherent buffer backing it.
 *
 * NOTE(review): the !virt_addr early-return guard and the size-field
 * resets appear to be missing from this extraction.
 */
407 static void perf_free_mw(struct perf_ctx
*perf
)
409 struct perf_mw
*mw
= &perf
->mw
;
410 struct pci_dev
*pdev
= perf
->ntb
->pdev
;
/* Stop the peer from translating into the buffer before freeing it. */
415 ntb_mw_clear_trans(perf
->ntb
, 0);
416 dma_free_coherent(&pdev
->dev
, mw
->buf_size
,
417 mw
->virt_addr
, mw
->dma_addr
);
420 mw
->virt_addr
= NULL
;
/* Size the inbound window for `size` bytes: round up to the hardware's
 * translation alignment, allocate a coherent buffer, and (in lines not
 * visible here) program the NTB translation.
 */
423 static int perf_set_mw(struct perf_ctx
*perf
, resource_size_t size
)
425 struct perf_mw
*mw
= &perf
->mw
;
426 size_t xlat_size
, buf_size
;
/* Round to the constraints reported by ntb_mw_get_range(). */
431 xlat_size
= round_up(size
, mw
->xlat_align_size
);
432 buf_size
= round_up(size
, mw
->xlat_align
);
/* Already configured at this size — nothing to redo. */
434 if (mw
->xlat_size
== xlat_size
)
440 mw
->xlat_size
= xlat_size
;
441 mw
->buf_size
= buf_size
;
443 mw
->virt_addr
= dma_alloc_coherent(&perf
->ntb
->pdev
->dev
, buf_size
,
444 &mw
->dma_addr
, GFP_KERNEL
);
445 if (!mw
->virt_addr
) {
/* Delayed-work handler for link bring-up: publish our window size and
 * protocol version to the peer via scratchpads, read back the peer's
 * values, size the local window accordingly, and mark the link up.
 * Reschedules itself while the peer has not yet written a matching
 * PERF_VERSION.
 *
 * NOTE(review): scratchpad index constants (VERSION, MW_SZ_HIGH/LOW) and
 * several error branches are defined/located outside this view.
 */
453 static void perf_link_work(struct work_struct
*work
)
455 struct perf_ctx
*perf
=
456 container_of(work
, struct perf_ctx
, link_work
.work
);
457 struct ntb_dev
*ndev
= perf
->ntb
;
458 struct pci_dev
*pdev
= ndev
->pdev
;
463 dev_dbg(&perf
->ntb
->pdev
->dev
, "%s called\n", __func__
);
/* Tell the peer how big our window is, then our protocol version. */
465 size
= perf
->mw
.phys_size
;
466 ntb_peer_spad_write(ndev
, MW_SZ_HIGH
, upper_32_bits(size
));
467 ntb_peer_spad_write(ndev
, MW_SZ_LOW
, lower_32_bits(size
));
468 ntb_peer_spad_write(ndev
, VERSION
, PERF_VERSION
);
470 /* now read what peer wrote */
471 val
= ntb_spad_read(ndev
, VERSION
);
472 if (val
!= PERF_VERSION
) {
473 dev_dbg(&pdev
->dev
, "Remote version = %#x\n", val
);
/* Reassemble the peer's 64-bit window size from two 32-bit spads. */
477 val
= ntb_spad_read(ndev
, MW_SZ_HIGH
);
478 size
= (u64
)val
<< 32;
480 val
= ntb_spad_read(ndev
, MW_SZ_LOW
);
483 dev_dbg(&pdev
->dev
, "Remote MW size = %#llx\n", size
);
485 rc
= perf_set_mw(perf
, size
);
489 perf
->link_is_up
= true;
/* Version mismatch path: retry while the physical link is still up. */
497 if (ntb_link_is_up(ndev
, NULL
, NULL
) == 1)
498 schedule_delayed_work(&perf
->link_work
,
499 msecs_to_jiffies(PERF_LINK_DOWN_TIMEOUT
));
/* Work handler run when the link drops: if bring-up never completed,
 * cancel the still-pending link_work retry.
 */
502 static void perf_link_cleanup(struct work_struct
*work
)
504 struct perf_ctx
*perf
= container_of(work
,
508 dev_dbg(&perf
->ntb
->pdev
->dev
, "%s called\n", __func__
);
510 if (!perf
->link_is_up
)
511 cancel_delayed_work_sync(&perf
->link_work
);
/* Probe-time window discovery: query window 0's physical range and
 * alignment constraints, then ioremap it write-combined for the CPU
 * copy path. (Local `mw`/`rc` declarations are outside this view.)
 */
514 static int perf_setup_mw(struct ntb_dev
*ntb
, struct perf_ctx
*perf
)
521 rc
= ntb_mw_get_range(ntb
, 0, &mw
->phys_addr
, &mw
->phys_size
,
522 &mw
->xlat_align
, &mw
->xlat_align_size
);
/* WC mapping: throughput matters more than ordering for the test writes. */
526 perf
->mw
.vbase
= ioremap_wc(mw
->phys_addr
, mw
->phys_size
);
/* debugfs "run" read: report perf->run ("is a test in progress") as a
 * decimal line. NOTE(review): the kmalloc NULL-check and kfree(buf)
 * appear to be in lines missing from this extraction.
 */
533 static ssize_t
debugfs_run_read(struct file
*filp
, char __user
*ubuf
,
534 size_t count
, loff_t
*offp
)
536 struct perf_ctx
*perf
= filp
->private_data
;
538 ssize_t ret
, out_offset
;
543 buf
= kmalloc(64, GFP_KERNEL
);
544 out_offset
= snprintf(buf
, 64, "%d\n", perf
->run
);
545 ret
= simple_read_from_buffer(ubuf
, count
, offp
, buf
, out_offset
);
/* debugfs "run" write: toggle the measurement. A write while threads are
 * running stops them; otherwise it clamps the tunables and launches
 * perf->perf_threads kthreads, failing over to stopping them all if one
 * cannot be created.
 *
 * NOTE(review): the user-input parse, several early returns, and the
 * thread-creation error branch are missing from this extraction.
 */
551 static ssize_t
debugfs_run_write(struct file
*filp
, const char __user
*ubuf
,
552 size_t count
, loff_t
*offp
)
554 struct perf_ctx
*perf
= filp
->private_data
;
/* Refuse to start until the spad handshake marked the link up. */
557 if (!perf
->link_is_up
)
560 if (perf
->perf_threads
== 0)
563 if (atomic_read(&perf
->tsync
) == 0)
567 /* lets stop the threads */
569 for (i
= 0; i
< MAX_THREADS
; i
++) {
570 if (perf
->pthr_ctx
[i
].thread
) {
571 kthread_stop(perf
->pthr_ctx
[i
].thread
);
572 perf
->pthr_ctx
[i
].thread
= NULL
;
/* Clamp user-tunable knobs into their supported ranges. */
579 if (perf
->perf_threads
> MAX_THREADS
) {
580 perf
->perf_threads
= MAX_THREADS
;
581 pr_info("Reset total threads to: %u\n", MAX_THREADS
);
584 /* no greater than 1M */
585 if (seg_order
> MAX_SEG_ORDER
) {
586 seg_order
= MAX_SEG_ORDER
;
587 pr_info("Fix seg_order to %u\n", seg_order
);
590 if (run_order
< seg_order
) {
591 run_order
= seg_order
;
592 pr_info("Fix run_order to %u\n", run_order
);
595 node
= dev_to_node(&perf
->ntb
->pdev
->dev
);
596 /* launch kernel thread */
597 for (i
= 0; i
< perf
->perf_threads
; i
++) {
598 struct pthr_ctx
*pctx
;
600 pctx
= &perf
->pthr_ctx
[i
];
601 atomic_set(&pctx
->dma_sync
, 0);
/* Threads pinned to the device's NUMA node. */
604 kthread_create_on_node(ntb_perf_thread
,
606 node
, "ntb_perf %d", i
);
608 wake_up_process(pctx
->thread
);
/* Creation failed: stop whatever already started. */
611 for (i
= 0; i
< MAX_THREADS
; i
++) {
613 kthread_stop(pctx
->thread
);
619 if (perf
->run
== false)
/* File operations backing the debugfs "run" control file.
 * NOTE(review): the .open initializer (presumably simple_open, which
 * stashes the inode's perf_ctx in filp->private_data) is in a line
 * missing from this extraction — confirm against the full source.
 */
628 static const struct file_operations ntb_perf_debugfs_run
= {
629 .owner
= THIS_MODULE
,
631 .read
= debugfs_run_read
,
632 .write
= debugfs_run_write
,
/* Create the debugfs hierarchy for one device:
 *   <debugfs>/ntb_perf/<pci-name>/{run,threads}
 * The top directory is shared across devices and created only once.
 */
635 static int perf_debugfs_setup(struct perf_ctx
*perf
)
637 struct pci_dev
*pdev
= perf
->ntb
->pdev
;
639 if (!debugfs_initialized())
642 if (!perf_debugfs_dir
) {
643 perf_debugfs_dir
= debugfs_create_dir(KBUILD_MODNAME
, NULL
);
644 if (!perf_debugfs_dir
)
648 perf
->debugfs_node_dir
= debugfs_create_dir(pci_name(pdev
),
650 if (!perf
->debugfs_node_dir
)
/* "run": start/stop the test (see debugfs_run_read/write). */
653 perf
->debugfs_run
= debugfs_create_file("run", S_IRUSR
| S_IWUSR
,
654 perf
->debugfs_node_dir
, perf
,
655 &ntb_perf_debugfs_run
);
656 if (!perf
->debugfs_run
)
/* "threads": direct u8 knob for perf->perf_threads. */
659 perf
->debugfs_threads
= debugfs_create_u8("threads", S_IRUSR
| S_IWUSR
,
660 perf
->debugfs_node_dir
,
661 &perf
->perf_threads
);
662 if (!perf
->debugfs_threads
)
/* NTB client probe: allocate the per-device context on the device's NUMA
 * node, discover the memory window, register callbacks, enable the link,
 * and create the debugfs controls.
 *
 * NOTE(review): error-path labels/returns between the visible statements
 * are missing from this extraction; the trailing cancel_* calls are the
 * unwind path for a failed perf_debugfs_setup().
 */
668 static int perf_probe(struct ntb_client
*client
, struct ntb_dev
*ntb
)
670 struct pci_dev
*pdev
= ntb
->pdev
;
671 struct perf_ctx
*perf
;
675 node
= dev_to_node(&pdev
->dev
);
677 perf
= kzalloc_node(sizeof(*perf
), GFP_KERNEL
, node
);
684 perf
->perf_threads
= 1;
685 atomic_set(&perf
->tsync
, 0);
687 spin_lock_init(&perf
->db_lock
);
688 perf_setup_mw(ntb
, perf
);
689 INIT_DELAYED_WORK(&perf
->link_work
, perf_link_work
);
690 INIT_WORK(&perf
->link_cleanup
, perf_link_cleanup
);
/* From here on, link/db events can fire into perf_ops callbacks. */
692 rc
= ntb_set_ctx(ntb
, perf
, &perf_ops
);
696 perf
->link_is_up
= false;
697 ntb_link_enable(ntb
, NTB_SPEED_AUTO
, NTB_WIDTH_AUTO
);
700 rc
= perf_debugfs_setup(perf
);
/* Unwind (error path): stop any link work queued by the callbacks. */
707 cancel_delayed_work_sync(&perf
->link_work
);
708 cancel_work_sync(&perf
->link_cleanup
);
/* NTB client remove: quiesce link work, disable the link, tear down
 * debugfs, and release any per-thread DMA channels.
 *
 * NOTE(review): the ntb_clear_ctx() call, the per-thread channel guard,
 * and the final frees are in lines missing from this extraction.
 */
714 static void perf_remove(struct ntb_client
*client
, struct ntb_dev
*ntb
)
716 struct perf_ctx
*perf
= ntb
->ctx
;
719 dev_dbg(&perf
->ntb
->dev
, "%s called\n", __func__
);
721 cancel_delayed_work_sync(&perf
->link_work
);
722 cancel_work_sync(&perf
->link_cleanup
);
725 ntb_link_disable(ntb
);
/* Removes the whole shared ntb_perf debugfs tree, not just this device. */
727 debugfs_remove_recursive(perf_debugfs_dir
);
728 perf_debugfs_dir
= NULL
;
731 for (i
= 0; i
< MAX_THREADS
; i
++) {
732 struct pthr_ctx
*pctx
= &perf
->pthr_ctx
[i
];
735 dma_release_channel(pctx
->dma_chan
);
/* NTB client registration; module_ntb_client() generates module init/exit.
 * NOTE(review): the .probe = perf_probe initializer line is missing from
 * this extraction — confirm against the full source.
 */
742 static struct ntb_client perf_client
= {
745 .remove
= perf_remove
,
748 module_ntb_client(perf_client
);