/* Intel Ethernet Switch Host Interface Driver
 * Copyright(c) 2013 - 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * The full GNU General Public License is included in this distribution in
 * the file called "COPYING".
 *
 * Contact Information:
 * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
 * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
 */

#include <linux/types.h>
#include <linux/module.h>
#include <linux/if_macvlan.h>
#include <linux/prefetch.h>

#include "fm10k.h"

#define DRV_VERSION	"0.12.2-k"
const char fm10k_driver_version[] = DRV_VERSION;
char fm10k_driver_name[] = "fm10k";
static const char fm10k_driver_string[] =
	"Intel(R) Ethernet Switch Host Interface Driver";
static const char fm10k_copyright[] =
	"Copyright (c) 2013 Intel Corporation.";

MODULE_AUTHOR("Intel Corporation, <linux.nics@intel.com>");
MODULE_DESCRIPTION("Intel(R) Ethernet Switch Host Interface Driver");
MODULE_LICENSE("GPL");
MODULE_VERSION(DRV_VERSION);

/**
 * fm10k_init_module - Driver Registration Routine
 *
 * fm10k_init_module is the first routine called when the driver is
 * loaded.  All it does is register with the PCI subsystem.
 **/
static int __init fm10k_init_module(void)
{
	pr_info("%s - version %s\n", fm10k_driver_string, fm10k_driver_version);
	pr_info("%s\n", fm10k_copyright);

	return fm10k_register_pci_driver();
}
module_init(fm10k_init_module);

/**
 * fm10k_exit_module - Driver Exit Cleanup Routine
 *
 * fm10k_exit_module is called just before the driver is removed
 * from memory.
 **/
static void __exit fm10k_exit_module(void)
{
	fm10k_unregister_pci_driver();
}
module_exit(fm10k_exit_module);

static bool fm10k_alloc_mapped_page(struct fm10k_ring *rx_ring,
				    struct fm10k_rx_buffer *bi)
{
	struct page *page = bi->page;
	dma_addr_t dma;

	/* Only page will be NULL if buffer was consumed */
	if (likely(page))
		return true;

	/* alloc new page for storage */
	page = alloc_page(GFP_ATOMIC | __GFP_COLD);
	if (unlikely(!page)) {
		rx_ring->rx_stats.alloc_failed++;
		return false;
	}

	/* map page for use */
	dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);

	/* if mapping failed free memory back to system since
	 * there isn't much point in holding memory we can't use
	 */
	if (dma_mapping_error(rx_ring->dev, dma)) {
		__free_page(page);
		bi->page = NULL;

		rx_ring->rx_stats.alloc_failed++;
		return false;
	}

	bi->dma = dma;
	bi->page = page;
	bi->page_offset = 0;

	return true;
}

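/* Rx buffers are built out of page halves: fm10k_alloc_mapped_page() only
 * allocates when bi->page is NULL, and a consumed half can be handed back
 * to the ring by the reuse helpers further down, so in the steady state
 * pages are recycled rather than freed and reallocated.
 */
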
/**
 * fm10k_alloc_rx_buffers - Replace used receive buffers
 * @rx_ring: ring to place buffers on
 * @cleaned_count: number of buffers to replace
 **/
void fm10k_alloc_rx_buffers(struct fm10k_ring *rx_ring, u16 cleaned_count)
{
	union fm10k_rx_desc *rx_desc;
	struct fm10k_rx_buffer *bi;
	u16 i = rx_ring->next_to_use;

	/* nothing to do */
	if (!cleaned_count)
		return;

	rx_desc = FM10K_RX_DESC(rx_ring, i);
	bi = &rx_ring->rx_buffer[i];

	/* track i as a negative offset from the end of the ring so that
	 * i reaching zero flags a wrap back to descriptor 0
	 */
	i -= rx_ring->count;

	do {
		if (!fm10k_alloc_mapped_page(rx_ring, bi))
			break;

		/* Refresh the desc even if buffer_addrs didn't change
		 * because each write-back erases this info.
		 */
		rx_desc->q.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);

		rx_desc++;
		bi++;
		i++;
		if (unlikely(!i)) {
			rx_desc = FM10K_RX_DESC(rx_ring, 0);
			bi = rx_ring->rx_buffer;
			i -= rx_ring->count;
		}

		/* clear the hdr_addr for the next_to_use descriptor */
		rx_desc->q.hdr_addr = 0;

		cleaned_count--;
	} while (cleaned_count);

	i += rx_ring->count;

	if (rx_ring->next_to_use != i) {
		/* record the next descriptor to use */
		rx_ring->next_to_use = i;

		/* update next to alloc since we have filled the ring */
		rx_ring->next_to_alloc = i;

		/* Force memory writes to complete before letting h/w
		 * know there are new descriptors to fetch.  (Only
		 * applicable for weak-ordered memory model archs,
		 * such as IA-64).
		 */
		wmb();

		/* notify hardware of new descriptors */
		writel(i, rx_ring->tail);
	}
}

/**
 * fm10k_reuse_rx_page - page flip buffer and store it back on the ring
 * @rx_ring: rx descriptor ring to store buffers on
 * @old_buff: donor buffer to have page reused
 *
 * Synchronizes page for reuse by the interface
 **/
static void fm10k_reuse_rx_page(struct fm10k_ring *rx_ring,
				struct fm10k_rx_buffer *old_buff)
{
	struct fm10k_rx_buffer *new_buff;
	u16 nta = rx_ring->next_to_alloc;

	new_buff = &rx_ring->rx_buffer[nta];

	/* update, and store next to alloc */
	nta++;
	rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;

	/* transfer page from old buffer to new buffer */
	memcpy(new_buff, old_buff, sizeof(struct fm10k_rx_buffer));

	/* sync the buffer for use by the device */
	dma_sync_single_range_for_device(rx_ring->dev, old_buff->dma,
					 old_buff->page_offset,
					 FM10K_RX_BUFSZ,
					 DMA_FROM_DEVICE);
}

static bool fm10k_can_reuse_rx_page(struct fm10k_rx_buffer *rx_buffer,
				    struct page *page,
				    unsigned int truesize)
{
	/* avoid re-using remote pages */
	if (unlikely(page_to_nid(page) != numa_mem_id()))
		return false;

#if (PAGE_SIZE < 8192)
	/* if we are only owner of page we can reuse it */
	if (unlikely(page_count(page) != 1))
		return false;

	/* flip page offset to other buffer */
	rx_buffer->page_offset ^= FM10K_RX_BUFSZ;

	/* since we are the only owner of the page and we need to
	 * increment it, just set the value to 2 in order to avoid
	 * an unnecessary locked operation
	 */
	atomic_set(&page->_count, 2);
#else
	/* move offset up to the next cache line */
	rx_buffer->page_offset += truesize;

	if (rx_buffer->page_offset > (PAGE_SIZE - FM10K_RX_BUFSZ))
		return false;

	/* bump ref count on page before it is given to the stack */
	get_page(page);
#endif

	return true;
}

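/* Worked example for the small-page path above: with 4 KiB pages
 * FM10K_RX_BUFSZ is 2 KiB, so page_offset simply alternates between 0 and
 * 2048 on every reuse.  One half of the page is owned by the stack while
 * the other half is posted to the ring, which is exactly the two
 * references the atomic_set() accounts for.
 */
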
/**
 * fm10k_add_rx_frag - Add contents of Rx buffer to sk_buff
 * @rx_ring: rx descriptor ring to transact packets on
 * @rx_buffer: buffer containing page to add
 * @rx_desc: descriptor containing length of buffer written by hardware
 * @skb: sk_buff to place the data into
 *
 * This function will add the data contained in rx_buffer->page to the skb.
 * This is done either through a direct copy if the data in the buffer is
 * less than the skb header size, otherwise it will just attach the page as
 * a frag to the skb.
 *
 * The function will then update the page offset if necessary and return
 * true if the buffer can be reused by the interface.
 **/
static bool fm10k_add_rx_frag(struct fm10k_ring *rx_ring,
			      struct fm10k_rx_buffer *rx_buffer,
			      union fm10k_rx_desc *rx_desc,
			      struct sk_buff *skb)
{
	struct page *page = rx_buffer->page;
	unsigned int size = le16_to_cpu(rx_desc->w.length);
#if (PAGE_SIZE < 8192)
	unsigned int truesize = FM10K_RX_BUFSZ;
#else
	unsigned int truesize = ALIGN(size, L1_CACHE_BYTES);
#endif

	if ((size <= FM10K_RX_HDR_LEN) && !skb_is_nonlinear(skb)) {
		unsigned char *va = page_address(page) + rx_buffer->page_offset;

		/* copy length is rounded up to size of long to optimize
		 * memcpy performance
		 */
		memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));

		/* we can reuse buffer as-is, just make sure it is local */
		if (likely(page_to_nid(page) == numa_mem_id()))
			return true;

		/* this page cannot be reused so discard it */
		put_page(page);
		return false;
	}

	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
			rx_buffer->page_offset, size, truesize);

	return fm10k_can_reuse_rx_page(rx_buffer, page, truesize);
}

static struct sk_buff *fm10k_fetch_rx_buffer(struct fm10k_ring *rx_ring,
					     union fm10k_rx_desc *rx_desc,
					     struct sk_buff *skb)
{
	struct fm10k_rx_buffer *rx_buffer;
	struct page *page;

	rx_buffer = &rx_ring->rx_buffer[rx_ring->next_to_clean];

	page = rx_buffer->page;
	prefetchw(page);

	if (likely(!skb)) {
		void *page_addr = page_address(page) +
				  rx_buffer->page_offset;

		/* prefetch first cache line of first page */
		prefetch(page_addr);
#if L1_CACHE_BYTES < 128
		prefetch(page_addr + L1_CACHE_BYTES);
#endif

		/* allocate a skb to store the frags */
		skb = netdev_alloc_skb_ip_align(rx_ring->netdev,
						FM10K_RX_HDR_LEN);
		if (unlikely(!skb)) {
			rx_ring->rx_stats.alloc_failed++;
			return NULL;
		}

		/* we will be copying header into skb->data in
		 * pskb_may_pull so it is in our interest to prefetch
		 * it now to avoid a possible cache miss
		 */
		prefetchw(skb->data);
	}

	/* we are reusing so sync this buffer for CPU use */
	dma_sync_single_range_for_cpu(rx_ring->dev,
				      rx_buffer->dma,
				      rx_buffer->page_offset,
				      FM10K_RX_BUFSZ,
				      DMA_FROM_DEVICE);

	/* pull page into skb */
	if (fm10k_add_rx_frag(rx_ring, rx_buffer, rx_desc, skb)) {
		/* hand second half of page back to the ring */
		fm10k_reuse_rx_page(rx_ring, rx_buffer);
	} else {
		/* we are not reusing the buffer so unmap it */
		dma_unmap_page(rx_ring->dev, rx_buffer->dma,
			       PAGE_SIZE, DMA_FROM_DEVICE);
	}

	/* clear contents of rx_buffer */
	rx_buffer->page = NULL;

	return skb;
}

/**
 * fm10k_process_skb_fields - Populate skb header fields from Rx descriptor
 * @rx_ring: rx descriptor ring packet is being transacted on
 * @rx_desc: pointer to the EOP Rx descriptor
 * @skb: pointer to current skb being populated
 *
 * This function checks the ring, descriptor, and packet information in
 * order to populate the hash, checksum, VLAN, timestamp, protocol, and
 * other fields within the skb.
 **/
static unsigned int fm10k_process_skb_fields(struct fm10k_ring *rx_ring,
					     union fm10k_rx_desc *rx_desc,
					     struct sk_buff *skb)
{
	unsigned int len = skb->len;

	FM10K_CB(skb)->fi.w.vlan = rx_desc->w.vlan;

	skb_record_rx_queue(skb, rx_ring->queue_index);

	FM10K_CB(skb)->fi.d.glort = rx_desc->d.glort;

	if (rx_desc->w.vlan) {
		u16 vid = le16_to_cpu(rx_desc->w.vlan);

		if (vid != rx_ring->vid)
			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vid);
	}

	skb->protocol = eth_type_trans(skb, rx_ring->netdev);

	return len;
}

/**
 * fm10k_is_non_eop - process handling of non-EOP buffers
 * @rx_ring: Rx ring being processed
 * @rx_desc: Rx descriptor for current buffer
 *
 * This function updates next to clean.  If the buffer is an EOP buffer
 * this function exits returning false, otherwise it will place the
 * sk_buff in the next buffer to be chained and return true indicating
 * that this is in fact a non-EOP buffer.
 **/
static bool fm10k_is_non_eop(struct fm10k_ring *rx_ring,
			     union fm10k_rx_desc *rx_desc)
{
	u32 ntc = rx_ring->next_to_clean + 1;

	/* fetch, update, and store next to clean */
	ntc = (ntc < rx_ring->count) ? ntc : 0;
	rx_ring->next_to_clean = ntc;

	prefetch(FM10K_RX_DESC(rx_ring, ntc));

	if (likely(fm10k_test_staterr(rx_desc, FM10K_RXD_STATUS_EOP)))
		return false;

	return true;
}

/**
 * fm10k_pull_tail - fm10k specific version of skb_pull_tail
 * @rx_ring: rx descriptor ring packet is being transacted on
 * @rx_desc: pointer to the EOP Rx descriptor
 * @skb: pointer to current skb being adjusted
 *
 * This function is an fm10k specific version of __pskb_pull_tail.  The
 * main difference between this version and the original function is that
 * this function can make several assumptions about the state of things
 * that allow for significant optimizations versus the standard function.
 * As a result we can do things like drop a frag and maintain an accurate
 * truesize for the skb.
 **/
static void fm10k_pull_tail(struct fm10k_ring *rx_ring,
			    union fm10k_rx_desc *rx_desc,
			    struct sk_buff *skb)
{
	struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0];
	unsigned char *va;
	unsigned int pull_len;

	/* it is valid to use page_address instead of kmap since we are
	 * working with pages allocated out of the lowmem pool per
	 * alloc_page(GFP_ATOMIC)
	 */
	va = skb_frag_address(frag);

	/* we need the header to contain the greater of either ETH_HLEN or
	 * 60 bytes if the skb->len is less than 60 for skb_pad.
	 */
	pull_len = eth_get_headlen(va, FM10K_RX_HDR_LEN);

	/* align pull length to size of long to optimize memcpy performance */
	skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long)));

	/* update all of the pointers */
	skb_frag_size_sub(frag, pull_len);
	frag->page_offset += pull_len;
	skb->data_len -= pull_len;
	skb->tail += pull_len;
}

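/* The open-coded tail update above stays in bounds because pull_len is
 * capped at FM10K_RX_HDR_LEN by eth_get_headlen() and the skb linear
 * area was sized to FM10K_RX_HDR_LEN in fm10k_fetch_rx_buffer(), so the
 * aligned copy cannot run past the end of the headroom.
 */
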
/**
 * fm10k_cleanup_headers - Correct corrupted or empty headers
 * @rx_ring: rx descriptor ring packet is being transacted on
 * @rx_desc: pointer to the EOP Rx descriptor
 * @skb: pointer to current skb being fixed
 *
 * Address the case where we are pulling data in on pages only
 * and as such no data is present in the skb header.
 *
 * In addition if skb is not at least 60 bytes we need to pad it so that
 * it is large enough to qualify as a valid Ethernet frame.
 *
 * Returns true if an error was encountered and skb was freed.
 **/
static bool fm10k_cleanup_headers(struct fm10k_ring *rx_ring,
				  union fm10k_rx_desc *rx_desc,
				  struct sk_buff *skb)
{
	if (unlikely((fm10k_test_staterr(rx_desc,
					 FM10K_RXD_STATUS_RXE)))) {
		dev_kfree_skb_any(skb);
		rx_ring->rx_stats.errors++;
		return true;
	}

	/* place header in linear portion of buffer */
	if (skb_is_nonlinear(skb))
		fm10k_pull_tail(rx_ring, rx_desc, skb);

	/* if skb_pad returns an error the skb was freed */
	if (unlikely(skb->len < 60)) {
		int pad_len = 60 - skb->len;

		if (skb_pad(skb, pad_len))
			return true;
		__skb_put(skb, pad_len);
	}

	return false;
}

/**
 * fm10k_receive_skb - helper function to handle rx indications
 * @q_vector: structure containing interrupt and ring information
 * @skb: packet to send up
 **/
static void fm10k_receive_skb(struct fm10k_q_vector *q_vector,
			      struct sk_buff *skb)
{
	napi_gro_receive(&q_vector->napi, skb);
}

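/* fm10k_clean_rx_irq - process completed descriptors on one Rx ring
 *
 * Walks the ring while descriptors have their DD bit set, builds one skb
 * per frame, and hands completed frames to the stack via GRO.  Returns
 * true if fewer than @budget packets were cleaned, i.e. polling may stop.
 */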
static bool fm10k_clean_rx_irq(struct fm10k_q_vector *q_vector,
			       struct fm10k_ring *rx_ring,
			       int budget)
{
	struct sk_buff *skb = rx_ring->skb;
	unsigned int total_bytes = 0, total_packets = 0;
	u16 cleaned_count = fm10k_desc_unused(rx_ring);

	do {
		union fm10k_rx_desc *rx_desc;

		/* return some buffers to hardware, one at a time is too slow */
		if (cleaned_count >= FM10K_RX_BUFFER_WRITE) {
			fm10k_alloc_rx_buffers(rx_ring, cleaned_count);
			cleaned_count = 0;
		}

		rx_desc = FM10K_RX_DESC(rx_ring, rx_ring->next_to_clean);

		if (!fm10k_test_staterr(rx_desc, FM10K_RXD_STATUS_DD))
			break;

		/* This memory barrier is needed to keep us from reading
		 * any other fields out of the rx_desc until we know the
		 * RXD_STATUS_DD bit is set
		 */
		rmb();

		/* retrieve a buffer from the ring */
		skb = fm10k_fetch_rx_buffer(rx_ring, rx_desc, skb);

		/* exit if we failed to retrieve a buffer */
		if (!skb)
			break;

		cleaned_count++;

		/* fetch next buffer in frame if non-eop */
		if (fm10k_is_non_eop(rx_ring, rx_desc))
			continue;

		/* verify the packet layout is correct */
		if (fm10k_cleanup_headers(rx_ring, rx_desc, skb)) {
			skb = NULL;
			continue;
		}

		/* populate checksum, timestamp, VLAN, and protocol */
		total_bytes += fm10k_process_skb_fields(rx_ring, rx_desc, skb);

		fm10k_receive_skb(q_vector, skb);

		/* reset skb pointer */
		skb = NULL;

		/* update budget accounting */
		total_packets++;
	} while (likely(total_packets < budget));

	/* place incomplete frames back on ring for completion */
	rx_ring->skb = skb;

	u64_stats_update_begin(&rx_ring->syncp);
	rx_ring->stats.packets += total_packets;
	rx_ring->stats.bytes += total_bytes;
	u64_stats_update_end(&rx_ring->syncp);
	q_vector->rx.total_packets += total_packets;
	q_vector->rx.total_bytes += total_bytes;

	return total_packets < budget;
}

static bool fm10k_tx_desc_push(struct fm10k_ring *tx_ring,
			       struct fm10k_tx_desc *tx_desc, u16 i,
			       dma_addr_t dma, unsigned int size, u8 desc_flags)
{
	/* set RS and INT for last frame in a cache line */
	if ((++i & (FM10K_TXD_WB_FIFO_SIZE - 1)) == 0)
		desc_flags |= FM10K_TXD_FLAG_RS | FM10K_TXD_FLAG_INT;

	/* record values to descriptor */
	tx_desc->buffer_addr = cpu_to_le64(dma);
	tx_desc->flags = desc_flags;
	tx_desc->buflen = cpu_to_le16(size);

	/* return true if we just wrapped the ring */
	return i == tx_ring->count;
}

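/* fm10k_tx_map - map an skb for DMA and post its descriptors to the ring
 *
 * Maps the linear data and every page fragment of first->skb, splitting
 * buffers larger than FM10K_MAX_DATA_PER_TXD across multiple descriptors,
 * then bumps the tail register.  On a mapping error all buffers mapped so
 * far are unwound and next_to_use is left pointing at the first buffer.
 */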
static void fm10k_tx_map(struct fm10k_ring *tx_ring,
			 struct fm10k_tx_buffer *first)
{
	struct sk_buff *skb = first->skb;
	struct fm10k_tx_buffer *tx_buffer;
	struct fm10k_tx_desc *tx_desc;
	struct skb_frag_struct *frag;
	unsigned char *data;
	dma_addr_t dma;
	unsigned int data_len, size;
	u16 i = tx_ring->next_to_use;
	u8 flags = 0;

	tx_desc = FM10K_TX_DESC(tx_ring, i);

	/* add HW VLAN tag */
	if (vlan_tx_tag_present(skb))
		tx_desc->vlan = cpu_to_le16(vlan_tx_tag_get(skb));
	else
		tx_desc->vlan = 0;

	size = skb_headlen(skb);
	data = skb->data;

	dma = dma_map_single(tx_ring->dev, data, size, DMA_TO_DEVICE);

	data_len = skb->data_len;
	tx_buffer = first;

	for (frag = &skb_shinfo(skb)->frags[0];; frag++) {
		if (dma_mapping_error(tx_ring->dev, dma))
			goto dma_error;

		/* record length, and DMA address */
		dma_unmap_len_set(tx_buffer, len, size);
		dma_unmap_addr_set(tx_buffer, dma, dma);

		while (unlikely(size > FM10K_MAX_DATA_PER_TXD)) {
			if (fm10k_tx_desc_push(tx_ring, tx_desc++, i++, dma,
					       FM10K_MAX_DATA_PER_TXD, flags)) {
				tx_desc = FM10K_TX_DESC(tx_ring, 0);
				i = 0;
			}

			dma += FM10K_MAX_DATA_PER_TXD;
			size -= FM10K_MAX_DATA_PER_TXD;
		}

		if (likely(!data_len))
			break;

		if (fm10k_tx_desc_push(tx_ring, tx_desc++, i++,
				       dma, size, flags)) {
			tx_desc = FM10K_TX_DESC(tx_ring, 0);
			i = 0;
		}

		size = skb_frag_size(frag);
		data_len -= size;

		dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size,
				       DMA_TO_DEVICE);

		tx_buffer = &tx_ring->tx_buffer[i];
	}

	/* write last descriptor with LAST bit set */
	flags |= FM10K_TXD_FLAG_LAST;

	if (fm10k_tx_desc_push(tx_ring, tx_desc, i++, dma, size, flags))
		i = 0;

	/* record bytecount for BQL */
	netdev_tx_sent_queue(txring_txq(tx_ring), first->bytecount);

	/* record SW timestamp if HW timestamp is not available */
	skb_tx_timestamp(first->skb);

	/* Force memory writes to complete before letting h/w know there
	 * are new descriptors to fetch.  (Only applicable for weak-ordered
	 * memory model archs, such as IA-64).
	 *
	 * We also need this memory barrier to make certain all of the
	 * status bits have been updated before next_to_watch is written.
	 */
	wmb();

	/* set next_to_watch value indicating a packet is present */
	first->next_to_watch = tx_desc;

	tx_ring->next_to_use = i;

	/* notify HW of packet */
	writel(i, tx_ring->tail);

	/* we need this if more than one processor can write to our tail
	 * at a time, it synchronizes IO on IA64/Altix systems
	 */
	mmiowb();

	return;

dma_error:
	dev_err(tx_ring->dev, "TX DMA map failed\n");

	/* clear dma mappings for failed tx_buffer map */
	for (;;) {
		tx_buffer = &tx_ring->tx_buffer[i];
		fm10k_unmap_and_free_tx_resource(tx_ring, tx_buffer);
		if (tx_buffer == first)
			break;
		if (i == 0)
			i = tx_ring->count;
		i--;
	}

	tx_ring->next_to_use = i;
}

static int __fm10k_maybe_stop_tx(struct fm10k_ring *tx_ring, u16 size)
{
	netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index);

	smp_mb();

	/* We need to check again in a case another CPU has just
	 * made room available.
	 */
	if (likely(fm10k_desc_unused(tx_ring) < size))
		return -EBUSY;

	/* A reprieve! - use start_queue because it doesn't call schedule */
	netif_start_subqueue(tx_ring->netdev, tx_ring->queue_index);
	++tx_ring->tx_stats.restart_queue;
	return 0;
}

static inline int fm10k_maybe_stop_tx(struct fm10k_ring *tx_ring, u16 size)
{
	if (likely(fm10k_desc_unused(tx_ring) >= size))
		return 0;

	return __fm10k_maybe_stop_tx(tx_ring, size);
}

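/* fm10k_xmit_frame_ring - transmit an skb on a specific Tx ring
 *
 * Counts the descriptors the frame will need (one per
 * FM10K_MAX_DATA_PER_TXD chunk of each buffer), backs off with
 * NETDEV_TX_BUSY when the ring is too full, and otherwise records the
 * frame in its first tx_buffer and maps it onto the ring.
 */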
netdev_tx_t fm10k_xmit_frame_ring(struct sk_buff *skb,
				  struct fm10k_ring *tx_ring)
{
	struct fm10k_tx_buffer *first;
	u32 tx_flags = 0;
#if PAGE_SIZE > FM10K_MAX_DATA_PER_TXD
	unsigned short f;
#endif
	u16 count = TXD_USE_COUNT(skb_headlen(skb));

	/* need: 1 descriptor per page * PAGE_SIZE/FM10K_MAX_DATA_PER_TXD,
	 *       + 1 desc for skb_headlen/FM10K_MAX_DATA_PER_TXD,
	 *       + 2 desc gap to keep tail from touching head
	 * otherwise try next time
	 */
#if PAGE_SIZE > FM10K_MAX_DATA_PER_TXD
	for (f = 0; f < skb_shinfo(skb)->nr_frags; f++)
		count += TXD_USE_COUNT(skb_shinfo(skb)->frags[f].size);
#else
	count += skb_shinfo(skb)->nr_frags;
#endif
	if (fm10k_maybe_stop_tx(tx_ring, count + 3)) {
		tx_ring->tx_stats.tx_busy++;
		return NETDEV_TX_BUSY;
	}

	/* record the location of the first descriptor for this packet */
	first = &tx_ring->tx_buffer[tx_ring->next_to_use];

	first->skb = skb;
	first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN);
	first->gso_segs = 1;

	/* record initial flags and protocol */
	first->tx_flags = tx_flags;

	fm10k_tx_map(tx_ring, first);

	fm10k_maybe_stop_tx(tx_ring, DESC_NEEDED);

	return NETDEV_TX_OK;
}

static u64 fm10k_get_tx_completed(struct fm10k_ring *ring)
{
	return ring->stats.packets;
}

static u64 fm10k_get_tx_pending(struct fm10k_ring *ring)
{
	/* use SW head and tail until we have real hardware */
	u32 head = ring->next_to_clean;
	u32 tail = ring->next_to_use;

	return ((head <= tail) ? tail : tail + ring->count) - head;
}

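/* fm10k_check_tx_hang - detect a stalled Tx queue
 *
 * Returns true only when two consecutive checks observe pending packets
 * with no completions in between; the first such observation merely arms
 * the __FM10K_HANG_CHECK_ARMED bit.
 */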
bool fm10k_check_tx_hang(struct fm10k_ring *tx_ring)
{
	u32 tx_done = fm10k_get_tx_completed(tx_ring);
	u32 tx_done_old = tx_ring->tx_stats.tx_done_old;
	u32 tx_pending = fm10k_get_tx_pending(tx_ring);

	clear_check_for_tx_hang(tx_ring);

	/* Check for a hung queue, but be thorough.  This verifies
	 * that a transmit has been completed since the previous
	 * check AND there is at least one packet pending.  By
	 * requiring this to fail twice we avoid races with
	 * clearing the ARMED bit and conditions where we
	 * run the check_tx_hang logic with a transmit completion
	 * pending but without time to complete it yet.
	 */
	if (!tx_pending || (tx_done_old != tx_done)) {
		/* update completed stats and continue */
		tx_ring->tx_stats.tx_done_old = tx_done;
		/* reset the countdown */
		clear_bit(__FM10K_HANG_CHECK_ARMED, &tx_ring->state);

		return false;
	}

	/* make sure it is true for two checks in a row */
	return test_and_set_bit(__FM10K_HANG_CHECK_ARMED, &tx_ring->state);
}

/**
 * fm10k_tx_timeout_reset - initiate reset due to Tx timeout
 * @interface: driver private struct
 **/
void fm10k_tx_timeout_reset(struct fm10k_intfc *interface)
{
	/* Do the reset outside of interrupt context */
	if (!test_bit(__FM10K_DOWN, &interface->state)) {
		netdev_err(interface->netdev, "Reset interface\n");
		interface->tx_timeout_count++;
		interface->flags |= FM10K_FLAG_RESET_REQUESTED;
		fm10k_service_event_schedule(interface);
	}
}

/**
 * fm10k_clean_tx_irq - Reclaim resources after transmit completes
 * @q_vector: structure containing interrupt and ring information
 * @tx_ring: tx ring to clean
 **/
static bool fm10k_clean_tx_irq(struct fm10k_q_vector *q_vector,
			       struct fm10k_ring *tx_ring)
{
	struct fm10k_intfc *interface = q_vector->interface;
	struct fm10k_tx_buffer *tx_buffer;
	struct fm10k_tx_desc *tx_desc;
	unsigned int total_bytes = 0, total_packets = 0;
	unsigned int budget = q_vector->tx.work_limit;
	unsigned int i = tx_ring->next_to_clean;

	if (test_bit(__FM10K_DOWN, &interface->state))
		return true;

	tx_buffer = &tx_ring->tx_buffer[i];
	tx_desc = FM10K_TX_DESC(tx_ring, i);
	i -= tx_ring->count;

	do {
		struct fm10k_tx_desc *eop_desc = tx_buffer->next_to_watch;

		/* if next_to_watch is not set then there is no work pending */
		if (!eop_desc)
			break;

		/* prevent any other reads prior to eop_desc */
		read_barrier_depends();

		/* if DD is not set pending work has not been completed */
		if (!(eop_desc->flags & FM10K_TXD_FLAG_DONE))
			break;

		/* clear next_to_watch to prevent false hangs */
		tx_buffer->next_to_watch = NULL;

		/* update the statistics for this packet */
		total_bytes += tx_buffer->bytecount;
		total_packets += tx_buffer->gso_segs;

		/* free the skb */
		dev_consume_skb_any(tx_buffer->skb);

		/* unmap skb header data */
		dma_unmap_single(tx_ring->dev,
				 dma_unmap_addr(tx_buffer, dma),
				 dma_unmap_len(tx_buffer, len),
				 DMA_TO_DEVICE);

		/* clear tx_buffer data */
		tx_buffer->skb = NULL;
		dma_unmap_len_set(tx_buffer, len, 0);

		/* unmap remaining buffers */
		while (tx_desc != eop_desc) {
			tx_buffer++;
			tx_desc++;
			i++;
			if (unlikely(!i)) {
				i -= tx_ring->count;
				tx_buffer = tx_ring->tx_buffer;
				tx_desc = FM10K_TX_DESC(tx_ring, 0);
			}

			/* unmap any remaining paged data */
			if (dma_unmap_len(tx_buffer, len)) {
				dma_unmap_page(tx_ring->dev,
					       dma_unmap_addr(tx_buffer, dma),
					       dma_unmap_len(tx_buffer, len),
					       DMA_TO_DEVICE);
				dma_unmap_len_set(tx_buffer, len, 0);
			}
		}

		/* move us one more past the eop_desc for start of next pkt */
		tx_buffer++;
		tx_desc++;
		i++;
		if (unlikely(!i)) {
			i -= tx_ring->count;
			tx_buffer = tx_ring->tx_buffer;
			tx_desc = FM10K_TX_DESC(tx_ring, 0);
		}

		/* issue prefetch for next Tx descriptor */
		prefetch(tx_desc);

		/* update budget accounting */
		budget--;
	} while (likely(budget));

	i += tx_ring->count;
	tx_ring->next_to_clean = i;
	u64_stats_update_begin(&tx_ring->syncp);
	tx_ring->stats.bytes += total_bytes;
	tx_ring->stats.packets += total_packets;
	u64_stats_update_end(&tx_ring->syncp);
	q_vector->tx.total_bytes += total_bytes;
	q_vector->tx.total_packets += total_packets;

	if (check_for_tx_hang(tx_ring) && fm10k_check_tx_hang(tx_ring)) {
		/* schedule immediate reset if we believe we hung */
		struct fm10k_hw *hw = &interface->hw;

		netif_err(interface, drv, tx_ring->netdev,
			  "Detected Tx Unit Hang\n"
			  "  Tx Queue             <%d>\n"
			  "  TDH, TDT             <%x>, <%x>\n"
			  "  next_to_use          <%x>\n"
			  "  next_to_clean        <%x>\n",
			  tx_ring->queue_index,
			  fm10k_read_reg(hw, FM10K_TDH(tx_ring->reg_idx)),
			  fm10k_read_reg(hw, FM10K_TDT(tx_ring->reg_idx)),
			  tx_ring->next_to_use, i);

		netif_stop_subqueue(tx_ring->netdev,
				    tx_ring->queue_index);

		netif_info(interface, probe, tx_ring->netdev,
			   "tx hang %d detected on queue %d, resetting interface\n",
			   interface->tx_timeout_count + 1,
			   tx_ring->queue_index);

		fm10k_tx_timeout_reset(interface);

		/* the netdev is about to reset, no point in enabling stuff */
		return true;
	}

	/* notify netdev of completed buffers */
	netdev_tx_completed_queue(txring_txq(tx_ring),
				  total_packets, total_bytes);

#define TX_WAKE_THRESHOLD min_t(u16, FM10K_MIN_TXD - 1, DESC_NEEDED * 2)
	if (unlikely(total_packets && netif_carrier_ok(tx_ring->netdev) &&
		     (fm10k_desc_unused(tx_ring) >= TX_WAKE_THRESHOLD))) {
		/* Make sure that anybody stopping the queue after this
		 * sees the new next_to_clean.
		 */
		smp_mb();
		if (__netif_subqueue_stopped(tx_ring->netdev,
					     tx_ring->queue_index) &&
		    !test_bit(__FM10K_DOWN, &interface->state)) {
			netif_wake_subqueue(tx_ring->netdev,
					    tx_ring->queue_index);
			++tx_ring->tx_stats.restart_queue;
		}
	}

	return !!budget;
}

/**
 * fm10k_update_itr - update the dynamic ITR value based on packet size
 *
 * Stores a new ITR value based strictly on packet size.  The
 * divisors and thresholds used by this function were determined based
 * on theoretical maximum wire speed and testing data, in order to
 * minimize response time while increasing bulk throughput.
 *
 * @ring_container: Container for rings to have ITR updated
 **/
static void fm10k_update_itr(struct fm10k_ring_container *ring_container)
{
	unsigned int avg_wire_size, packets;

	/* Only update ITR if we are using adaptive setting */
	if (!(ring_container->itr & FM10K_ITR_ADAPTIVE))
		goto clear_counts;

	packets = ring_container->total_packets;
	if (!packets)
		goto clear_counts;

	avg_wire_size = ring_container->total_bytes / packets;

	/* Add 24 bytes to size to account for CRC, preamble, and gap */
	avg_wire_size += 24;

	/* Don't starve jumbo frames */
	if (avg_wire_size > 3000)
		avg_wire_size = 3000;

	/* Give a little boost to mid-size frames */
	if ((avg_wire_size > 300) && (avg_wire_size < 1200))
		avg_wire_size /= 3;
	else
		avg_wire_size /= 2;

	/* write back value and retain adaptive flag */
	ring_container->itr = avg_wire_size | FM10K_ITR_ADAPTIVE;

clear_counts:
	ring_container->total_bytes = 0;
	ring_container->total_packets = 0;
}

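/* Illustrative numbers, assuming the divisors above: a stream of 64-byte
 * frames averages 64 + 24 = 88 bytes on the wire and falls through to
 * the /2 case for an ITR interval of 44, while 1514-byte frames give
 * (1514 + 24) / 2 = 769, so the interrupt rate drops as the average
 * frame size grows.
 */
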
static void fm10k_qv_enable(struct fm10k_q_vector *q_vector)
{
	/* Enable auto-mask and clear the current mask */
	u32 itr = FM10K_ITR_ENABLE;

	/* Update Tx ITR */
	fm10k_update_itr(&q_vector->tx);

	/* Update Rx ITR */
	fm10k_update_itr(&q_vector->rx);

	/* Store Tx itr in timer slot 0 */
	itr |= (q_vector->tx.itr & FM10K_ITR_MAX);

	/* Shift Rx itr to timer slot 1 */
	itr |= (q_vector->rx.itr & FM10K_ITR_MAX) << FM10K_ITR_INTERVAL1_SHIFT;

	/* Write the final value to the ITR register */
	writel(itr, q_vector->itr);
}

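/* fm10k_poll - NAPI polling callback for a q_vector
 *
 * Cleans every Tx ring, splits the Rx budget evenly across the Rx rings,
 * and only leaves polling mode (re-enabling the vector's interrupt via
 * fm10k_qv_enable) once every ring reports it finished under budget.
 */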
static int fm10k_poll(struct napi_struct *napi, int budget)
{
	struct fm10k_q_vector *q_vector =
			       container_of(napi, struct fm10k_q_vector, napi);
	struct fm10k_ring *ring;
	int per_ring_budget;
	bool clean_complete = true;

	fm10k_for_each_ring(ring, q_vector->tx)
		clean_complete &= fm10k_clean_tx_irq(q_vector, ring);

	/* attempt to distribute budget to each queue fairly, but don't
	 * allow the budget to go below 1 because we'll exit polling
	 */
	if (q_vector->rx.count > 1)
		per_ring_budget = max(budget/q_vector->rx.count, 1);
	else
		per_ring_budget = budget;

	fm10k_for_each_ring(ring, q_vector->rx)
		clean_complete &= fm10k_clean_rx_irq(q_vector, ring,
						     per_ring_budget);

	/* If all work not completed, return budget and keep polling */
	if (!clean_complete)
		return budget;

	/* all work done, exit the polling mode */
	napi_complete(napi);

	/* re-enable the q_vector */
	fm10k_qv_enable(q_vector);

	return 0;
}

/**
 * fm10k_set_num_queues: Allocate queues for device, feature dependent
 * @interface: board private structure to initialize
 *
 * This is the top level queue allocation routine.  The order here is very
 * important, starting with the "most" number of features turned on at once,
 * and ending with the smallest set of features.  This way large combinations
 * can be allocated if they're turned on, and smaller combinations are the
 * fallthrough conditions.
 **/
static void fm10k_set_num_queues(struct fm10k_intfc *interface)
{
	/* Start with base case */
	interface->num_rx_queues = 1;
	interface->num_tx_queues = 1;
}

/**
 * fm10k_alloc_q_vector - Allocate memory for a single interrupt vector
 * @interface: board private structure to initialize
 * @v_count: q_vectors allocated on interface, used for ring interleaving
 * @v_idx: index of vector in interface struct
 * @txr_count: total number of Tx rings to allocate
 * @txr_idx: index of first Tx ring to allocate
 * @rxr_count: total number of Rx rings to allocate
 * @rxr_idx: index of first Rx ring to allocate
 *
 * We allocate one q_vector.  If allocation fails we return -ENOMEM.
 **/
static int fm10k_alloc_q_vector(struct fm10k_intfc *interface,
				unsigned int v_count, unsigned int v_idx,
				unsigned int txr_count, unsigned int txr_idx,
				unsigned int rxr_count, unsigned int rxr_idx)
{
	struct fm10k_q_vector *q_vector;
	struct fm10k_ring *ring;
	int ring_count, size;

	ring_count = txr_count + rxr_count;
	size = sizeof(struct fm10k_q_vector) +
	       (sizeof(struct fm10k_ring) * ring_count);

	/* allocate q_vector and rings */
	q_vector = kzalloc(size, GFP_KERNEL);
	if (!q_vector)
		return -ENOMEM;

	/* initialize NAPI */
	netif_napi_add(interface->netdev, &q_vector->napi,
		       fm10k_poll, NAPI_POLL_WEIGHT);

	/* tie q_vector and interface together */
	interface->q_vector[v_idx] = q_vector;
	q_vector->interface = interface;
	q_vector->v_idx = v_idx;

	/* initialize pointer to rings */
	ring = q_vector->ring;

	/* save Tx ring container info */
	q_vector->tx.ring = ring;
	q_vector->tx.work_limit = FM10K_DEFAULT_TX_WORK;
	q_vector->tx.itr = interface->tx_itr;
	q_vector->tx.count = txr_count;

	while (txr_count) {
		/* assign generic ring traits */
		ring->dev = &interface->pdev->dev;
		ring->netdev = interface->netdev;

		/* configure backlink on ring */
		ring->q_vector = q_vector;

		/* apply Tx specific ring traits */
		ring->count = interface->tx_ring_count;
		ring->queue_index = txr_idx;

		/* assign ring to interface */
		interface->tx_ring[txr_idx] = ring;

		/* update count and index */
		txr_count--;
		txr_idx += v_count;

		/* push pointer to next ring */
		ring++;
	}

	/* save Rx ring container info */
	q_vector->rx.ring = ring;
	q_vector->rx.itr = interface->rx_itr;
	q_vector->rx.count = rxr_count;

	while (rxr_count) {
		/* assign generic ring traits */
		ring->dev = &interface->pdev->dev;
		ring->netdev = interface->netdev;

		/* configure backlink on ring */
		ring->q_vector = q_vector;

		/* apply Rx specific ring traits */
		ring->count = interface->rx_ring_count;
		ring->queue_index = rxr_idx;

		/* assign ring to interface */
		interface->rx_ring[rxr_idx] = ring;

		/* update count and index */
		rxr_count--;
		rxr_idx += v_count;

		/* push pointer to next ring */
		ring++;
	}

	return 0;
}

/**
 * fm10k_free_q_vector - Free memory allocated for specific interrupt vector
 * @interface: board private structure to initialize
 * @v_idx: Index of vector to be freed
 *
 * This function frees the memory allocated to the q_vector.  In addition if
 * NAPI is enabled it will delete any references to the NAPI struct prior
 * to freeing the q_vector.
 **/
static void fm10k_free_q_vector(struct fm10k_intfc *interface, int v_idx)
{
	struct fm10k_q_vector *q_vector = interface->q_vector[v_idx];
	struct fm10k_ring *ring;

	fm10k_for_each_ring(ring, q_vector->tx)
		interface->tx_ring[ring->queue_index] = NULL;

	fm10k_for_each_ring(ring, q_vector->rx)
		interface->rx_ring[ring->queue_index] = NULL;

	interface->q_vector[v_idx] = NULL;
	netif_napi_del(&q_vector->napi);
	kfree_rcu(q_vector, rcu);
}

/**
 * fm10k_alloc_q_vectors - Allocate memory for interrupt vectors
 * @interface: board private structure to initialize
 *
 * We allocate one q_vector per queue interrupt.  If allocation fails we
 * return -ENOMEM.
 **/
static int fm10k_alloc_q_vectors(struct fm10k_intfc *interface)
{
	unsigned int q_vectors = interface->num_q_vectors;
	unsigned int rxr_remaining = interface->num_rx_queues;
	unsigned int txr_remaining = interface->num_tx_queues;
	unsigned int rxr_idx = 0, txr_idx = 0, v_idx = 0;
	int err;

	if (q_vectors >= (rxr_remaining + txr_remaining)) {
		for (; rxr_remaining; v_idx++) {
			err = fm10k_alloc_q_vector(interface, q_vectors, v_idx,
						   0, 0, 1, rxr_idx);
			if (err)
				goto err_out;

			/* update counts and index */
			rxr_remaining--;
			rxr_idx++;
		}
	}

	for (; v_idx < q_vectors; v_idx++) {
		int rqpv = DIV_ROUND_UP(rxr_remaining, q_vectors - v_idx);
		int tqpv = DIV_ROUND_UP(txr_remaining, q_vectors - v_idx);

		err = fm10k_alloc_q_vector(interface, q_vectors, v_idx,
					   tqpv, txr_idx,
					   rqpv, rxr_idx);
		if (err)
			goto err_out;

		/* update counts and index */
		rxr_remaining -= rqpv;
		txr_remaining -= tqpv;
		rxr_idx++;
		txr_idx++;
	}

	return 0;

err_out:
	interface->num_tx_queues = 0;
	interface->num_rx_queues = 0;
	interface->num_q_vectors = 0;

	while (v_idx--)
		fm10k_free_q_vector(interface, v_idx);

	return -ENOMEM;
}

/**
 * fm10k_free_q_vectors - Free memory allocated for interrupt vectors
 * @interface: board private structure to initialize
 *
 * This function frees the memory allocated to the q_vectors.  In addition if
 * NAPI is enabled it will delete any references to the NAPI struct prior
 * to freeing the q_vector.
 **/
static void fm10k_free_q_vectors(struct fm10k_intfc *interface)
{
	int v_idx = interface->num_q_vectors;

	interface->num_tx_queues = 0;
	interface->num_rx_queues = 0;
	interface->num_q_vectors = 0;

	while (v_idx--)
		fm10k_free_q_vector(interface, v_idx);
}

/**
 * fm10k_reset_msix_capability - reset MSI-X capability
 * @interface: board private structure to initialize
 *
 * Reset the MSI-X capability back to its starting state
 **/
static void fm10k_reset_msix_capability(struct fm10k_intfc *interface)
{
	pci_disable_msix(interface->pdev);
	kfree(interface->msix_entries);
	interface->msix_entries = NULL;
}

/**
 * fm10k_init_msix_capability - configure MSI-X capability
 * @interface: board private structure to initialize
 *
 * Attempt to configure the interrupts using the best available
 * capabilities of the hardware and the kernel.
 **/
static int fm10k_init_msix_capability(struct fm10k_intfc *interface)
{
	struct fm10k_hw *hw = &interface->hw;
	int v_budget, vector;

	/* It's easy to be greedy for MSI-X vectors, but it really
	 * doesn't do us much good if we have a lot more vectors
	 * than CPU's.  So let's be conservative and only ask for
	 * (roughly) the same number of vectors as there are CPU's.
	 * the default is to use pairs of vectors
	 */
	v_budget = max(interface->num_rx_queues, interface->num_tx_queues);
	v_budget = min_t(u16, v_budget, num_online_cpus());

	/* account for vectors not related to queues */
	v_budget += NON_Q_VECTORS(hw);

	/* At the same time, hardware can only support a maximum of
	 * hw.mac->max_msix_vectors vectors.  With features
	 * such as RSS and VMDq, we can easily surpass the number of Rx and Tx
	 * descriptor queues supported by our device.  Thus, we cap it off in
	 * those rare cases where the cpu count also exceeds our vector limit.
	 */
	v_budget = min_t(int, v_budget, hw->mac.max_msix_vectors);

	/* A failure in MSI-X entry allocation is fatal. */
	interface->msix_entries = kcalloc(v_budget, sizeof(struct msix_entry),
					  GFP_KERNEL);
	if (!interface->msix_entries)
		return -ENOMEM;

	/* populate entry values */
	for (vector = 0; vector < v_budget; vector++)
		interface->msix_entries[vector].entry = vector;

	/* Attempt to enable MSI-X with requested value */
	v_budget = pci_enable_msix_range(interface->pdev,
					 interface->msix_entries,
					 MIN_MSIX_COUNT(hw),
					 v_budget);
	if (v_budget < 0) {
		kfree(interface->msix_entries);
		interface->msix_entries = NULL;
		return -ENOMEM;
	}

	/* record the number of queues available for q_vectors */
	interface->num_q_vectors = v_budget - NON_Q_VECTORS(hw);

	return 0;
}

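/* fm10k_init_reta - populate the RSS redirection table
 *
 * Each 32-bit reta register holds four one-byte queue indices, so the
 * table has FM10K_RETA_SIZE * 4 = 128 entries.  The multiply-and-mask
 * trick below computes entry n as (n * rss_i) >> 7 for n = 0..127,
 * spreading the rss_i queues evenly across the table: the first pass
 * yields entries n and n + 2 in the two halfwords, the second pass
 * entries n + 1 and n + 3.
 */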
static void fm10k_init_reta(struct fm10k_intfc *interface)
{
	u16 i, rss_i = interface->ring_feature[RING_F_RSS].indices;
	u32 reta, base;

	/* If the netdev is initialized we have to maintain table if possible */
	if (interface->netdev->reg_state) {
		for (i = FM10K_RETA_SIZE; i--;) {
			reta = interface->reta[i];
			if ((((reta << 24) >> 24) < rss_i) &&
			    (((reta << 16) >> 24) < rss_i) &&
			    (((reta <<  8) >> 24) < rss_i) &&
			      (((reta)     >> 24) < rss_i))
				continue;
			goto repopulate_reta;
		}

		/* do nothing if all of the elements are in bounds */
		return;
	}

repopulate_reta:
	/* Populate the redirection table 4 entries at a time.  To do this
	 * we are generating the results for n and n+2 and then interleaving
	 * those with the results for n+1 and n+3.
	 */
	for (i = FM10K_RETA_SIZE; i--;) {
		/* first pass generates n and n+2 */
		base = ((i * 0x00040004) + 0x00020000) * rss_i;
		reta = (base & 0x3F803F80) >> 7;

		/* second pass generates n+1 and n+3 */
		base += 0x00010001 * rss_i;
		reta |= (base & 0x3F803F80) << 1;

		interface->reta[i] = reta;
	}
}

/**
 * fm10k_init_queueing_scheme - Determine proper queueing scheme
 * @interface: board private structure to initialize
 *
 * We determine which queueing scheme to use based on...
 * - Hardware queue count (num_*_queues)
 *   - defined by miscellaneous hardware support/features (RSS, etc.)
 **/
int fm10k_init_queueing_scheme(struct fm10k_intfc *interface)
{
	int err;

	/* Number of supported queues */
	fm10k_set_num_queues(interface);

	/* Configure MSI-X capability */
	err = fm10k_init_msix_capability(interface);
	if (err) {
		dev_err(&interface->pdev->dev,
			"Unable to initialize MSI-X capability\n");
		return err;
	}

	/* Allocate memory for queues */
	err = fm10k_alloc_q_vectors(interface);
	if (err)
		return err;

	/* Initialize RSS redirection table */
	fm10k_init_reta(interface);

	return 0;
}

/**
 * fm10k_clear_queueing_scheme - Clear the current queueing scheme settings
 * @interface: board private structure to clear queueing scheme on
 *
 * We go through and clear queueing specific resources and reset the structure
 * to pre-load conditions
 **/
void fm10k_clear_queueing_scheme(struct fm10k_intfc *interface)
{
	fm10k_free_q_vectors(interface);
	fm10k_reset_msix_capability(interface);
}