/*
 *      NET3    Protocol independent device support routines.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 *      Derived from the non IP parts of dev.c 1.0.19
 *              Authors:        Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *      Additional Authors:
 *              Florian la Roche <rzsfl@rz.uni-sb.de>
 *              Alan Cox <gw4pts@gw4pts.ampr.org>
 *              David Hinds <dahinds@users.sourceforge.net>
 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *              Adam Sulmicki <adam@cfar.umd.edu>
 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *      Changes:
 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
 *                                      to 2 if register_netdev gets called
 *                                      before net_dev_init & also removed a
 *                                      few lines of code in the process.
 *              Alan Cox        :       device private ioctl copies fields back.
 *              Alan Cox        :       Transmit queue code does relevant
 *                                      stunts to keep the queue safe.
 *              Alan Cox        :       Fixed double lock.
 *              Alan Cox        :       Fixed promisc NULL pointer trap
 *              ????????        :       Support the full private ioctl range
 *              Alan Cox        :       Moved ioctl permission check into
 *                                      drivers
 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
 *              Alan Cox        :       100 backlog just doesn't cut it when
 *                                      you start doing multicast video 8)
 *              Alan Cox        :       Rewrote net_bh and list manager.
 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
 *              Alan Cox        :       Took out transmit every packet pass
 *                                      Saved a few bytes in the ioctl handler
 *              Alan Cox        :       Network driver sets packet type before
 *                                      calling netif_rx. Saves a function
 *                                      call a packet.
 *              Alan Cox        :       Hashed net_bh()
 *              Richard Kooijman:       Timestamp fixes.
 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
 *              Alan Cox        :       Device lock protection.
 *              Alan Cox        :       Fixed nasty side effect of device close
 *                                      changes.
 *              Rudi Cilibrasi  :       Pass the right thing to
 *                                      set_mac_address()
 *              Dave Miller     :       32bit quantity for the device lock to
 *                                      make it work out on a Sparc.
 *              Bjorn Ekwall    :       Added KERNELD hack.
 *              Alan Cox        :       Cleaned up the backlog initialise.
 *              Craig Metz      :       SIOCGIFCONF fix if space for under
 *                                      1 device.
 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
 *                                      is no device open function.
 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
 *              Cyrus Durgin    :       Cleaned for KMOD
 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
 *                                      A network device unload needs to purge
 *                                      the backlog queue.
 *      Paul Rusty Russell      :       SIOCSIFNAME
 *              Pekka Riikonen  :       Netdev boot-time settings code
 *              Andrew Morton   :       Make unregister_netdevice wait
 *                                      indefinitely on dev->refcnt
 *              J Hadi Salim    :       - Backlog queue sampling
 *                                      - netif_rx() feedback
 */
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>

#include "net-sysfs.h"
/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;       /* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
                                         struct net_device *dev,
                                         struct netdev_notifier_info *info);
/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id;
static DEFINE_HASHTABLE(napi_hash, 8);

static seqcount_t devnet_rename_seq;
static inline void dev_base_seq_inc(struct net *net)
{
        while (++net->dev_base_seq == 0)
                ;
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
        unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        spin_unlock(&sd->input_pkt_queue.lock);
#endif
}
/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
        struct net *net = dev_net(dev);

        ASSERT_RTNL();

        write_lock_bh(&dev_base_lock);
        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        hlist_add_head_rcu(&dev->index_hlist,
                           dev_index_hash(net, dev->ifindex));
        write_unlock_bh(&dev_base_lock);

        dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
        ASSERT_RTNL();

        /* Unlink dev from the device chain */
        write_lock_bh(&dev_base_lock);
        list_del_rcu(&dev->dev_list);
        hlist_del_rcu(&dev->name_hlist);
        hlist_del_rcu(&dev->index_hlist);
        write_unlock_bh(&dev_base_lock);

        dev_base_seq_inc(dev_net(dev));
}

/*
 *      Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *      Device drivers call our routines to queue packets here. We empty the
 *      queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
         "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
         "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
         "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
                if (netdev_lock_type[i] == dev_type)
                        return i;
        /* the last key is used by default */
        return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
        int i;

        i = netdev_lock_pos(dev_type);
        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
                                   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
        int i;

        i = netdev_lock_pos(dev->type);
        lockdep_set_class_and_name(&dev->addr_list_lock,
                                   &netdev_addr_lock_key[i],
                                   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif
/*******************************************************************************

                Protocol management and registration routines

*******************************************************************************/

/*
 *      Add a protocol ID to the list. Now that the input handler is
 *      smarter we can dispense with all the messy stuff that used to be
 *      here.
 *
 *      BEWARE!!! Protocol handlers, mangling input packets,
 *      MUST BE last in hash buckets and checking protocol handlers
 *      MUST start from promiscuous ptype_all chain in net_bh.
 *      It is true now, do not change it.
 *      Explanation follows: if protocol handler, mangling packet, will
 *      be the first on list, it is not able to sense, that packet
 *      is cloned and should be copied-on-write, so that it will
 *      change it and subsequent readers will get broken packet.
 *                                                      --ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
        if (pt->type == htons(ETH_P_ALL))
                return &ptype_all;
        else
                return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}
/**
 *      dev_add_pack - add packet handler
 *      @pt: packet type declaration
 *
 *      Add a protocol handler to the networking stack. The passed &packet_type
 *      is linked into kernel lists and may not be freed until it has been
 *      removed from the kernel lists.
 *
 *      This call does not sleep therefore it can not
 *      guarantee all CPU's that are in middle of receiving packets
 *      will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
        struct list_head *head = ptype_head(pt);

        spin_lock(&ptype_lock);
        list_add_rcu(&pt->list, head);
        spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
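
/*
 * Example (illustrative sketch, not part of this file): a protocol module
 * typically declares a handler once and registers it from its init path.
 * The names my_pt and my_rcv below are hypothetical.
 *
 *      static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *                        struct packet_type *pt, struct net_device *orig_dev);
 *
 *      static struct packet_type my_pt __read_mostly = {
 *              .type = cpu_to_be16(ETH_P_IP),
 *              .func = my_rcv,
 *      };
 *
 *      dev_add_pack(&my_pt);           from module init
 *      dev_remove_pack(&my_pt);        from module exit; may sleep
 */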
/**
 *      __dev_remove_pack        - remove packet handler
 *      @pt: packet type declaration
 *
 *      Remove a protocol handler that was previously added to the kernel
 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *      from the kernel lists and can be freed or reused once this function
 *      returns.
 *
 *      The packet type might still be in use by receivers
 *      and must not be freed until after all the CPU's have gone
 *      through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
        struct list_head *head = ptype_head(pt);
        struct packet_type *pt1;

        spin_lock(&ptype_lock);

        list_for_each_entry(pt1, head, list) {
                if (pt == pt1) {
                        list_del_rcu(&pt->list);
                        goto out;
                }
        }

        pr_warn("dev_remove_pack: %p not found\n", pt);
out:
        spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *      dev_remove_pack  - remove packet handler
 *      @pt: packet type declaration
 *
 *      Remove a protocol handler that was previously added to the kernel
 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *      from the kernel lists and can be freed or reused once this function
 *      returns.
 *
 *      This call sleeps to guarantee that no CPU is looking at the packet
 *      type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
        __dev_remove_pack(pt);

        synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
/**
 *      dev_add_offload - register offload handlers
 *      @po: protocol offload declaration
 *
 *      Add protocol offload handlers to the networking stack. The passed
 *      &proto_offload is linked into kernel lists and may not be freed until
 *      it has been removed from the kernel lists.
 *
 *      This call does not sleep therefore it can not
 *      guarantee all CPU's that are in middle of receiving packets
 *      will see the new offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
        struct list_head *head = &offload_base;

        spin_lock(&offload_lock);
        list_add_rcu(&po->list, head);
        spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);
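
/*
 * Example (sketch with hypothetical names): a protocol that supports
 * GSO/GRO registers its offload callbacks once at boot, e.g.:
 *
 *      static struct packet_offload my_offload __read_mostly = {
 *              .type = cpu_to_be16(ETH_P_IP),
 *              .callbacks = {
 *                      .gso_segment  = my_gso_segment,
 *                      .gro_receive  = my_gro_receive,
 *                      .gro_complete = my_gro_complete,
 *              },
 *      };
 *
 *      dev_add_offload(&my_offload);
 */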
/**
 *      __dev_remove_offload     - remove offload handler
 *      @po: packet offload declaration
 *
 *      Remove a protocol offload handler that was previously added to the
 *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 *      is removed from the kernel lists and can be freed or reused once this
 *      function returns.
 *
 *      The packet type might still be in use by receivers
 *      and must not be freed until after all the CPU's have gone
 *      through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
        struct list_head *head = &offload_base;
        struct packet_offload *po1;

        spin_lock(&offload_lock);

        list_for_each_entry(po1, head, list) {
                if (po == po1) {
                        list_del_rcu(&po->list);
                        goto out;
                }
        }

        pr_warn("dev_remove_offload: %p not found\n", po);
out:
        spin_unlock(&offload_lock);
}

/**
 *      dev_remove_offload       - remove packet offload handler
 *      @po: packet offload declaration
 *
 *      Remove a packet offload handler that was previously added to the kernel
 *      offload handlers by dev_add_offload(). The passed &offload_type is
 *      removed from the kernel lists and can be freed or reused once this
 *      function returns.
 *
 *      This call sleeps to guarantee that no CPU is looking at the packet
 *      type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
        __dev_remove_offload(po);

        synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
/******************************************************************************

                      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
/**
 *      netdev_boot_setup_add   - add new setup entry
 *      @name: name of the device
 *      @map: configured settings for the device
 *
 *      Adds new setup entry to the dev_boot_setup list.  The function
 *      returns 0 on error and 1 on success.  This is a generic routine to
 *      all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
        struct netdev_boot_setup *s;
        int i;

        s = dev_boot_setup;
        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
                        memset(s[i].name, 0, sizeof(s[i].name));
                        strlcpy(s[i].name, name, IFNAMSIZ);
                        memcpy(&s[i].map, map, sizeof(s[i].map));
                        break;
                }
        }

        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}
/**
 *      netdev_boot_setup_check - check boot time settings
 *      @dev: the netdevice
 *
 *      Check boot time settings for the device.
 *      The found settings are set for the device to be used
 *      later in the device probing.
 *      Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
        struct netdev_boot_setup *s = dev_boot_setup;
        int i;

        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
                    !strcmp(dev->name, s[i].name)) {
                        dev->irq        = s[i].map.irq;
                        dev->base_addr  = s[i].map.base_addr;
                        dev->mem_start  = s[i].map.mem_start;
                        dev->mem_end    = s[i].map.mem_end;
                        return 1;
                }
        }
        return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);
/**
 *      netdev_boot_base        - get address from boot time settings
 *      @prefix: prefix for network device
 *      @unit: id for network device
 *
 *      Check boot time settings for the base address of device.
 *      The found settings are set for the device to be used
 *      later in the device probing.
 *      Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
        const struct netdev_boot_setup *s = dev_boot_setup;
        char name[IFNAMSIZ];
        int i;

        sprintf(name, "%s%d", prefix, unit);

        /*
         * If device already registered then return base of 1
         * to indicate not to probe for this interface
         */
        if (__dev_get_by_name(&init_net, name))
                return 1;

        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
                if (!strcmp(name, s[i].name))
                        return s[i].map.base_addr;
        return 0;
}
/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
        int ints[5];
        struct ifmap map;

        str = get_options(str, ARRAY_SIZE(ints), ints);
        if (!str || !*str)
                return 0;

        /* Save settings */
        memset(&map, 0, sizeof(map));
        if (ints[0] > 0)
                map.irq = ints[1];
        if (ints[0] > 1)
                map.base_addr = ints[2];
        if (ints[0] > 2)
                map.mem_start = ints[3];
        if (ints[0] > 3)
                map.mem_end = ints[4];

        /* Add new entry to the list */
        return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
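
/*
 * Example (sketch): given the parsing above, a kernel command line entry of
 * the form
 *
 *      netdev=<irq>,<base_addr>,<mem_start>,<mem_end>,<name>
 *
 * e.g. "netdev=9,0x300,0,0,eth0", stores irq 9 and I/O base 0x300 under the
 * name "eth0", to be picked up later by netdev_boot_setup_check() when the
 * device probes.
 */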
/*******************************************************************************

                            Device Interface Subroutines

*******************************************************************************/

/**
 *      __dev_get_by_name       - find a device by its name
 *      @net: the applicable net namespace
 *      @name: name to find
 *
 *      Find an interface by name. Must be called under RTNL semaphore
 *      or @dev_base_lock. If the name is found a pointer to the device
 *      is returned. If the name is not found then %NULL is returned. The
 *      reference counters are not incremented so the caller must be
 *      careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
        struct net_device *dev;
        struct hlist_head *head = dev_name_hash(net, name);

        hlist_for_each_entry(dev, head, name_hlist)
                if (!strncmp(dev->name, name, IFNAMSIZ))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);
/**
 *      dev_get_by_name_rcu     - find a device by its name
 *      @net: the applicable net namespace
 *      @name: name to find
 *
 *      Find an interface by name.
 *      If the name is found a pointer to the device is returned.
 *      If the name is not found then %NULL is returned.
 *      The reference counters are not incremented so the caller must be
 *      careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
        struct net_device *dev;
        struct hlist_head *head = dev_name_hash(net, name);

        hlist_for_each_entry_rcu(dev, head, name_hlist)
                if (!strncmp(dev->name, name, IFNAMSIZ))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);
/**
 *      dev_get_by_name         - find a device by its name
 *      @net: the applicable net namespace
 *      @name: name to find
 *
 *      Find an interface by name. This can be called from any
 *      context and does its own locking. The returned handle has
 *      the usage count incremented and the caller must use dev_put() to
 *      release it when it is no longer needed. %NULL is returned if no
 *      matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_name_rcu(net, name);
        if (dev)
                dev_hold(dev);
        rcu_read_unlock();
        return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
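
/*
 * Example (sketch): the lookup flavours above differ only in locking and
 * reference rules. A caller that merely peeks at a device can stay under
 * RCU:
 *
 *      rcu_read_lock();
 *      dev = dev_get_by_name_rcu(net, "eth0");
 *      if (dev)
 *              mtu = dev->mtu;         valid only inside the RCU section
 *      rcu_read_unlock();
 *
 * while a caller that keeps the pointer must take a reference:
 *
 *      dev = dev_get_by_name(net, "eth0");
 *      if (dev) {
 *              ...
 *              dev_put(dev);
 *      }
 */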
/**
 *      __dev_get_by_index - find a device by its ifindex
 *      @net: the applicable net namespace
 *      @ifindex: index of device
 *
 *      Search for an interface by index. Returns %NULL if the device
 *      is not found or a pointer to the device. The device has not
 *      had its reference counter increased so the caller must be careful
 *      about locking. The caller must hold either the RTNL semaphore
 *      or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
        struct net_device *dev;
        struct hlist_head *head = dev_index_hash(net, ifindex);

        hlist_for_each_entry(dev, head, index_hlist)
                if (dev->ifindex == ifindex)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);
/**
 *      dev_get_by_index_rcu - find a device by its ifindex
 *      @net: the applicable net namespace
 *      @ifindex: index of device
 *
 *      Search for an interface by index. Returns %NULL if the device
 *      is not found or a pointer to the device. The device has not
 *      had its reference counter increased so the caller must be careful
 *      about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
        struct net_device *dev;
        struct hlist_head *head = dev_index_hash(net, ifindex);

        hlist_for_each_entry_rcu(dev, head, index_hlist)
                if (dev->ifindex == ifindex)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);
/**
 *      dev_get_by_index - find a device by its ifindex
 *      @net: the applicable net namespace
 *      @ifindex: index of device
 *
 *      Search for an interface by index. Returns NULL if the device
 *      is not found or a pointer to the device. The device returned has
 *      had a reference added and the pointer is safe until the user calls
 *      dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_index_rcu(net, ifindex);
        if (dev)
                dev_hold(dev);
        rcu_read_unlock();
        return dev;
}
EXPORT_SYMBOL(dev_get_by_index);
/**
 *      netdev_get_name - get a netdevice name, knowing its ifindex.
 *      @net: network namespace
 *      @name: a pointer to the buffer where the name will be stored.
 *      @ifindex: the ifindex of the interface to get the name from.
 *
 *      The use of raw_seqcount_begin() and cond_resched() before
 *      retrying is required as we want to give the writers a chance
 *      to complete when CONFIG_PREEMPT is not set.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
        struct net_device *dev;
        unsigned int seq;

retry:
        seq = raw_seqcount_begin(&devnet_rename_seq);
        rcu_read_lock();
        dev = dev_get_by_index_rcu(net, ifindex);
        if (!dev) {
                rcu_read_unlock();
                return -ENODEV;
        }

        strcpy(name, dev->name);
        rcu_read_unlock();
        if (read_seqcount_retry(&devnet_rename_seq, seq)) {
                cond_resched();
                goto retry;
        }

        return 0;
}
/**
 *      dev_getbyhwaddr_rcu - find a device by its hardware address
 *      @net: the applicable net namespace
 *      @type: media type of device
 *      @ha: hardware address
 *
 *      Search for an interface by MAC address. Returns NULL if the device
 *      is not found or a pointer to the device.
 *      The caller must hold RCU or RTNL.
 *      The returned device has not had its ref count increased
 *      and the caller must therefore be careful about locking
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
                                       const char *ha)
{
        struct net_device *dev;

        for_each_netdev_rcu(net, dev)
                if (dev->type == type &&
                    !memcmp(dev->dev_addr, ha, dev->addr_len))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
        struct net_device *dev;

        ASSERT_RTNL();
        for_each_netdev(net, dev)
                if (dev->type == type)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
        struct net_device *dev, *ret = NULL;

        rcu_read_lock();
        for_each_netdev_rcu(net, dev)
                if (dev->type == type) {
                        dev_hold(dev);
                        ret = dev;
                        break;
                }
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);
/**
 *      __dev_get_by_flags - find any device with given flags
 *      @net: the applicable net namespace
 *      @if_flags: IFF_* values
 *      @mask: bitmask of bits in if_flags to check
 *
 *      Search for any interface with the given flags. Returns NULL if a device
 *      is not found or a pointer to the device. Must be called inside
 *      rtnl_lock(), and result refcount is unchanged.
 */

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
                                      unsigned short mask)
{
        struct net_device *dev, *ret;

        ASSERT_RTNL();

        ret = NULL;
        for_each_netdev(net, dev) {
                if (((dev->flags ^ if_flags) & mask) == 0) {
                        ret = dev;
                        break;
                }
        }
        return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);
/**
 *      dev_valid_name - check if name is okay for network device
 *      @name: name string
 *
 *      Network device names need to be valid file names to
 *      allow sysfs to work.  We also disallow any kind of
 *      whitespace.
 */
bool dev_valid_name(const char *name)
{
        if (*name == '\0')
                return false;
        if (strlen(name) >= IFNAMSIZ)
                return false;
        if (!strcmp(name, ".") || !strcmp(name, ".."))
                return false;

        while (*name) {
                if (*name == '/' || isspace(*name))
                        return false;
                name++;
        }
        return true;
}
EXPORT_SYMBOL(dev_valid_name);
/**
 *      __dev_alloc_name - allocate a name for a device
 *      @net: network namespace to allocate the device name in
 *      @name: name format string
 *      @buf:  scratch buffer and result name string
 *
 *      Passed a format string - eg "lt%d" it will try and find a suitable
 *      id. It scans list of devices to build up a free map, then chooses
 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 *      while allocating the name and adding the device in order to avoid
 *      duplicates.
 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *      Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
        int i = 0;
        const char *p;
        const int max_netdevices = 8*PAGE_SIZE;
        unsigned long *inuse;
        struct net_device *d;

        p = strnchr(name, IFNAMSIZ-1, '%');
        if (p) {
                /*
                 * Verify the string as this thing may have come from
                 * the user.  There must be either one "%d" and no other "%"
                 * characters.
                 */
                if (p[1] != 'd' || strchr(p + 2, '%'))
                        return -EINVAL;

                /* Use one page as a bit array of possible slots */
                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
                if (!inuse)
                        return -ENOMEM;

                for_each_netdev(net, d) {
                        if (!sscanf(d->name, name, &i))
                                continue;
                        if (i < 0 || i >= max_netdevices)
                                continue;

                        /* avoid cases where sscanf is not exact inverse of printf */
                        snprintf(buf, IFNAMSIZ, name, i);
                        if (!strncmp(buf, d->name, IFNAMSIZ))
                                set_bit(i, inuse);
                }

                i = find_first_zero_bit(inuse, max_netdevices);
                free_page((unsigned long) inuse);
        }

        snprintf(buf, IFNAMSIZ, name, i);
        if (!__dev_get_by_name(net, buf))
                return i;

        /* It is possible to run out of possible slots
         * when the name is long and there isn't enough space left
         * for the digits, or if all bits are used.
         */
        return -ENFILE;
}
/**
 *      dev_alloc_name - allocate a name for a device
 *      @dev: device
 *      @name: name format string
 *
 *      Passed a format string - eg "lt%d" it will try and find a suitable
 *      id. It scans list of devices to build up a free map, then chooses
 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 *      while allocating the name and adding the device in order to avoid
 *      duplicates.
 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *      Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
        char buf[IFNAMSIZ];
        struct net *net;
        int ret;

        BUG_ON(!dev_net(dev));
        net = dev_net(dev);
        ret = __dev_alloc_name(net, name, buf);
        if (ret >= 0)
                strlcpy(dev->name, buf, IFNAMSIZ);
        return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
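
/*
 * Example (sketch): a driver registering a device without a fixed name asks
 * for the next free unit of a pattern before register_netdevice(), e.g.:
 *
 *      if (dev_alloc_name(dev, "eth%d") < 0)
 *              goto fail;      no free slot or invalid pattern
 *
 * With eth0 and eth1 taken, dev->name becomes "eth2".
 */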
static int dev_alloc_name_ns(struct net *net,
                             struct net_device *dev,
                             const char *name)
{
        char buf[IFNAMSIZ];
        int ret;

        ret = __dev_alloc_name(net, name, buf);
        if (ret >= 0)
                strlcpy(dev->name, buf, IFNAMSIZ);
        return ret;
}

static int dev_get_valid_name(struct net *net,
                              struct net_device *dev,
                              const char *name)
{
        BUG_ON(!net);

        if (!dev_valid_name(name))
                return -EINVAL;

        if (strchr(name, '%'))
                return dev_alloc_name_ns(net, dev, name);
        else if (__dev_get_by_name(net, name))
                return -EEXIST;
        else if (dev->name != name)
                strlcpy(dev->name, name, IFNAMSIZ);

        return 0;
}
/**
 *      dev_change_name - change name of a device
 *      @dev: device
 *      @newname: name (or format string) must be at least IFNAMSIZ
 *
 *      Change name of a device, can pass format strings "eth%d"
 *      for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
        unsigned char old_assign_type;
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;
        struct net *net;

        ASSERT_RTNL();
        BUG_ON(!dev_net(dev));

        net = dev_net(dev);
        if (dev->flags & IFF_UP)
                return -EBUSY;

        write_seqcount_begin(&devnet_rename_seq);

        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
                write_seqcount_end(&devnet_rename_seq);
                return 0;
        }

        memcpy(oldname, dev->name, IFNAMSIZ);

        err = dev_get_valid_name(net, dev, newname);
        if (err < 0) {
                write_seqcount_end(&devnet_rename_seq);
                return err;
        }

        if (oldname[0] && !strchr(oldname, '%'))
                netdev_info(dev, "renamed from %s\n", oldname);

        old_assign_type = dev->name_assign_type;
        dev->name_assign_type = NET_NAME_RENAMED;

rollback:
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                memcpy(dev->name, oldname, IFNAMSIZ);
                dev->name_assign_type = old_assign_type;
                write_seqcount_end(&devnet_rename_seq);
                return ret;
        }

        write_seqcount_end(&devnet_rename_seq);

        netdev_adjacent_rename_links(dev, oldname);

        write_lock_bh(&dev_base_lock);
        hlist_del_rcu(&dev->name_hlist);
        write_unlock_bh(&dev_base_lock);

        synchronize_rcu();

        write_lock_bh(&dev_base_lock);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        write_unlock_bh(&dev_base_lock);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                /* err >= 0 after dev_alloc_name() or stores the first errno */
                if (err >= 0) {
                        err = ret;
                        write_seqcount_begin(&devnet_rename_seq);
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        memcpy(oldname, newname, IFNAMSIZ);
                        dev->name_assign_type = old_assign_type;
                        old_assign_type = NET_NAME_RENAMED;
                        goto rollback;
                } else {
                        pr_err("%s: name change rollback failed: %d\n",
                               dev->name, ret);
                }
        }

        return err;
}
/**
 *      dev_set_alias - change ifalias of a device
 *      @dev: device
 *      @alias: name up to IFALIASZ
 *      @len: limit of bytes to copy from info
 *
 *      Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
        char *new_ifalias;

        ASSERT_RTNL();

        if (len >= IFALIASZ)
                return -EINVAL;

        if (!len) {
                kfree(dev->ifalias);
                dev->ifalias = NULL;
                return 0;
        }

        new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
        if (!new_ifalias)
                return -ENOMEM;
        dev->ifalias = new_ifalias;

        strlcpy(dev->ifalias, alias, len+1);
        return 0;
}
/**
 *      netdev_features_change - device changes features
 *      @dev: device to cause notification
 *
 *      Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *      netdev_state_change - device changes state
 *      @dev: device to cause notification
 *
 *      Called to indicate a device has changed state. This function calls
 *      the notifier chains for netdev_chain and sends a NEWLINK message
 *      to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
        if (dev->flags & IFF_UP) {
                struct netdev_notifier_change_info change_info;

                change_info.flags_changed = 0;
                call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
                                              &change_info.info);
                rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
        }
}
EXPORT_SYMBOL(netdev_state_change);

/**
 *      netdev_notify_peers - notify network peers about existence of @dev
 *      @dev: network device
 *
 *      Generate traffic such that interested network peers are aware of
 *      @dev, such as by generating a gratuitous ARP. This may be used when
 *      a device wants to inform the rest of the network about some sort of
 *      reconfiguration such as a failover event or virtual machine
 *      migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
        rtnl_lock();
        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
        rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
static int __dev_open(struct net_device *dev)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        int ret;

        ASSERT_RTNL();

        if (!netif_device_present(dev))
                return -ENODEV;

        /* Block netpoll from trying to do any rx path servicing.
         * If we don't do this there is a chance ndo_poll_controller
         * or ndo_poll may be running while we open the device
         */
        netpoll_poll_disable(dev);

        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
        ret = notifier_to_errno(ret);
        if (ret)
                return ret;

        set_bit(__LINK_STATE_START, &dev->state);

        if (ops->ndo_validate_addr)
                ret = ops->ndo_validate_addr(dev);

        if (!ret && ops->ndo_open)
                ret = ops->ndo_open(dev);

        netpoll_poll_enable(dev);

        if (ret)
                clear_bit(__LINK_STATE_START, &dev->state);
        else {
                dev->flags |= IFF_UP;
                dev_set_rx_mode(dev);
                dev_activate(dev);
                add_device_randomness(dev->dev_addr, dev->addr_len);
        }

        return ret;
}
/**
 *      dev_open        - prepare an interface for use.
 *      @dev: device to open
 *
 *      Takes a device from down to up state. The device's private open
 *      function is invoked and then the multicast lists are loaded. Finally
 *      the device is moved into the up state and a %NETDEV_UP message is
 *      sent to the netdev notifier chain.
 *
 *      Calling this function on an active interface is a nop. On a failure
 *      a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
        int ret;

        if (dev->flags & IFF_UP)
                return 0;

        ret = __dev_open(dev);
        if (ret < 0)
                return ret;

        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
        call_netdevice_notifiers(NETDEV_UP, dev);

        return ret;
}
EXPORT_SYMBOL(dev_open);
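
/*
 * Example (sketch): in-kernel callers bring an interface up under RTNL,
 * mirroring what "ip link set eth0 up" ends up doing:
 *
 *      rtnl_lock();
 *      err = dev_open(dev);
 *      rtnl_unlock();
 *
 * A second call while the device is already IFF_UP simply returns 0.
 */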
static int __dev_close_many(struct list_head *head)
{
        struct net_device *dev;

        ASSERT_RTNL();
        might_sleep();

        list_for_each_entry(dev, head, close_list) {
                /* Temporarily disable netpoll until the interface is down */
                netpoll_poll_disable(dev);

                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

                clear_bit(__LINK_STATE_START, &dev->state);

                /* Synchronize to scheduled poll. We cannot touch poll list, it
                 * can be even on different cpu. So just clear netif_running().
                 *
                 * dev->stop() will invoke napi_disable() on all of its
                 * napi_struct instances on this device.
                 */
                smp_mb__after_atomic(); /* Commit netif_running(). */
        }

        dev_deactivate_many(head);

        list_for_each_entry(dev, head, close_list) {
                const struct net_device_ops *ops = dev->netdev_ops;

                /*
                 *      Call the device specific close. This cannot fail.
                 *      Only if device is UP
                 *
                 *      We allow it to be called even after a DETACH hot-plug
                 *      event.
                 */
                if (ops->ndo_stop)
                        ops->ndo_stop(dev);

                dev->flags &= ~IFF_UP;
                netpoll_poll_enable(dev);
        }

        return 0;
}

static int __dev_close(struct net_device *dev)
{
        int retval;
        LIST_HEAD(single);

        list_add(&dev->close_list, &single);
        retval = __dev_close_many(&single);
        list_del(&single);

        return retval;
}
static int dev_close_many(struct list_head *head)
{
        struct net_device *dev, *tmp;

        /* Remove the devices that don't need to be closed */
        list_for_each_entry_safe(dev, tmp, head, close_list)
                if (!(dev->flags & IFF_UP))
                        list_del_init(&dev->close_list);

        __dev_close_many(head);

        list_for_each_entry_safe(dev, tmp, head, close_list) {
                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
                call_netdevice_notifiers(NETDEV_DOWN, dev);
                list_del_init(&dev->close_list);
        }

        return 0;
}

/**
 *      dev_close - shutdown an interface.
 *      @dev: device to shutdown
 *
 *      This function moves an active device into down state. A
 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *      chain.
 */
int dev_close(struct net_device *dev)
{
        if (dev->flags & IFF_UP) {
                LIST_HEAD(single);

                list_add(&dev->close_list, &single);
                dev_close_many(&single);
                list_del(&single);
        }
        return 0;
}
EXPORT_SYMBOL(dev_close);
/**
 *      dev_disable_lro - disable Large Receive Offload on a device
 *      @dev: device
 *
 *      Disable Large Receive Offload (LRO) on a net device.  Must be
 *      called under RTNL.  This is needed if received packets may be
 *      forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
        /*
         * If we're trying to disable lro on a vlan device
         * use the underlying physical device instead
         */
        if (is_vlan_dev(dev))
                dev = vlan_dev_real_dev(dev);

        /* the same for macvlan devices */
        if (netif_is_macvlan(dev))
                dev = macvlan_dev_real_dev(dev);

        dev->wanted_features &= ~NETIF_F_LRO;
        netdev_update_features(dev);

        if (unlikely(dev->features & NETIF_F_LRO))
                netdev_WARN(dev, "failed to disable LRO!\n");
}
EXPORT_SYMBOL(dev_disable_lro);
static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
                                   struct net_device *dev)
{
        struct netdev_notifier_info info;

        netdev_notifier_info_init(&info, dev);
        return nb->notifier_call(nb, val, &info);
}

static int dev_boot_phase = 1;
/**
 *      register_netdevice_notifier - register a network notifier block
 *      @nb: notifier
 *
 *      Register a notifier to be called when network device events occur.
 *      The notifier passed is linked into the kernel structures and must
 *      not be reused until it has been unregistered. A negative errno code
 *      is returned on a failure.
 *
 *      When registered all registration and up events are replayed
 *      to the new notifier to allow device to have a race free
 *      view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
        struct net_device *dev;
        struct net_device *last;
        struct net *net;
        int err;

        rtnl_lock();
        err = raw_notifier_chain_register(&netdev_chain, nb);
        if (err)
                goto unlock;
        if (dev_boot_phase)
                goto unlock;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
                        err = notifier_to_errno(err);
                        if (err)
                                goto rollback;

                        if (!(dev->flags & IFF_UP))
                                continue;

                        call_netdevice_notifier(nb, NETDEV_UP, dev);
                }
        }

unlock:
        rtnl_unlock();
        return err;

rollback:
        last = dev;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        if (dev == last)
                                goto outroll;

                        if (dev->flags & IFF_UP) {
                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
                                                        dev);
                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
                        }
                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
                }
        }

outroll:
        raw_notifier_chain_unregister(&netdev_chain, nb);
        goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
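
/*
 * Example (sketch with hypothetical names): a typical subscriber watches
 * for devices coming and going:
 *
 *      static int my_netdev_event(struct notifier_block *nb,
 *                                 unsigned long event, void *ptr)
 *      {
 *              struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *              switch (event) {
 *              case NETDEV_UP:
 *                      pr_info("%s is up\n", dev->name);
 *                      break;
 *              case NETDEV_GOING_DOWN:
 *                      pr_info("%s is going down\n", dev->name);
 *                      break;
 *              }
 *              return NOTIFY_DONE;
 *      }
 *
 *      static struct notifier_block my_nb = {
 *              .notifier_call = my_netdev_event,
 *      };
 *
 *      register_netdevice_notifier(&my_nb);
 *
 * Note the replay described above: NETDEV_REGISTER and NETDEV_UP are
 * delivered for already-present devices at registration time.
 */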
/**
 *      unregister_netdevice_notifier - unregister a network notifier block
 *      @nb: notifier
 *
 *      Unregister a notifier previously registered by
 *      register_netdevice_notifier(). The notifier is unlinked from the
 *      kernel structures and may then be reused. A negative errno code
 *      is returned on a failure.
 *
 *      After unregistering unregister and down device events are synthesized
 *      for all devices on the device list to the removed notifier to remove
 *      the need for special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
        struct net_device *dev;
        struct net *net;
        int err;

        rtnl_lock();
        err = raw_notifier_chain_unregister(&netdev_chain, nb);
        if (err)
                goto unlock;

        for_each_net(net) {
                for_each_netdev(net, dev) {
                        if (dev->flags & IFF_UP) {
                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
                                                        dev);
                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
                        }
                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
                }
        }
unlock:
        rtnl_unlock();
        return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
/**
 *      call_netdevice_notifiers_info - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *      @info: notifier information data
 *
 *      Call all network notifier blocks.  Parameters and return value
 *      are as for raw_notifier_call_chain().
 */

static int call_netdevice_notifiers_info(unsigned long val,
                                         struct net_device *dev,
                                         struct netdev_notifier_info *info)
{
        ASSERT_RTNL();
        netdev_notifier_info_init(info, dev);
        return raw_notifier_call_chain(&netdev_chain, val, info);
}

/**
 *      call_netdevice_notifiers - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *
 *      Call all network notifier blocks.  Parameters and return value
 *      are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
        struct netdev_notifier_info info;

        return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 */
static atomic_t netstamp_needed_deferred;
#endif

void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);

        if (deferred) {
                while (--deferred)
                        static_key_slow_dec(&netstamp_needed);
                return;
        }
#endif
        static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
        if (in_interrupt()) {
                atomic_inc(&netstamp_needed_deferred);
                return;
        }
#endif
        static_key_slow_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
        skb->tstamp.tv64 = 0;
        if (static_key_false(&netstamp_needed))
                __net_timestamp(skb);
}

#define net_timestamp_check(COND, SKB)                  \
        if (static_key_false(&netstamp_needed)) {       \
                if ((COND) && !(SKB)->tstamp.tv64)      \
                        __net_timestamp(SKB);           \
        }
bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
{
        unsigned int len;

        if (!(dev->flags & IFF_UP))
                return false;

        len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
        if (skb->len <= len)
                return true;

        /* if TSO is enabled, we don't care about the length as the packet
         * could be forwarded without being segmented before
         */
        if (skb_is_gso(skb))
                return true;

        return false;
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
        if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
                if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
                        atomic_long_inc(&dev->rx_dropped);
                        kfree_skb(skb);
                        return NET_RX_DROP;
                }
        }

        if (unlikely(!is_skb_forwardable(dev, skb))) {
                atomic_long_inc(&dev->rx_dropped);
                kfree_skb(skb);
                return NET_RX_DROP;
        }

        skb_scrub_packet(skb, true);
        skb->protocol = eth_type_trans(skb, dev);

        return 0;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *      NET_RX_SUCCESS  (no congestion)
 *      NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
        return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
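
/*
 * Example (sketch with hypothetical helpers): a pair device such as veth
 * forwards from one endpoint's transmit path into its peer's receive path
 * roughly like:
 *
 *      static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *      {
 *              struct net_device *peer = my_get_peer(dev);
 *
 *              if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *                      dev->stats.tx_dropped++;
 *              return NETDEV_TX_OK;
 *      }
 *
 * Either way the skb is consumed: forwarded or freed.
 */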
static inline int deliver_skb(struct sk_buff *skb,
                              struct packet_type *pt_prev,
                              struct net_device *orig_dev)
{
        if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
                return -ENOMEM;
        atomic_inc(&skb->users);
        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{
        if (!ptype->af_packet_priv || !skb->sk)
                return false;

        if (ptype->id_match)
                return ptype->id_match(ptype, skb->sk);
        else if ((struct sock *)ptype->af_packet_priv == skb->sk)
                return true;

        return false;
}
/*
 *      Support routine. Sends outgoing frames to any network
 *      taps currently in use.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
        struct packet_type *ptype;
        struct sk_buff *skb2 = NULL;
        struct packet_type *pt_prev = NULL;

        rcu_read_lock();
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                /* Never send packets back to the socket
                 * they originated from - MvS (miquels@drinkel.ow.org)
                 */
                if ((ptype->dev == dev || !ptype->dev) &&
                    (!skb_loop_sk(ptype, skb))) {
                        if (pt_prev) {
                                deliver_skb(skb2, pt_prev, skb->dev);
                                pt_prev = ptype;
                                continue;
                        }

                        skb2 = skb_clone(skb, GFP_ATOMIC);
                        if (!skb2)
                                break;

                        net_timestamp_set(skb2);

                        /* skb->nh should be correctly
                         * set by sender, so that the second statement is
                         * just protection against buggy protocols.
                         */
                        skb_reset_mac_header(skb2);

                        if (skb_network_header(skb2) < skb2->data ||
                            skb_network_header(skb2) > skb_tail_pointer(skb2)) {
                                net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
                                                     ntohs(skb2->protocol),
                                                     dev->name);
                                skb_reset_network_header(skb2);
                        }

                        skb2->transport_header = skb2->network_header;
                        skb2->pkt_type = PACKET_OUTGOING;
                        pt_prev = ptype;
                }
        }
        if (pt_prev)
                pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
        rcu_read_unlock();
}
/**
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify the tc mapping remains valid and if
 * not NULL the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case TC0
 * is invalid nothing can be done so disable priority mappings. It is
 * expected that drivers will fix this mapping if they can before
 * calling netif_set_real_num_tx_queues.
 */
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
{
        int i;
        struct netdev_tc_txq *tc = &dev->tc_to_txq[0];

        /* If TC0 is invalidated disable TC mapping */
        if (tc->offset + tc->count > txq) {
                pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
                dev->num_tc = 0;
                return;
        }

        /* Invalidated prio to tc mappings set to TC0 */
        for (i = 1; i < TC_BITMASK + 1; i++) {
                int q = netdev_get_prio_tc_map(dev, i);

                tc = &dev->tc_to_txq[q];
                if (tc->offset + tc->count > txq) {
                        pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
                                i, q);
                        netdev_set_prio_tc_map(dev, i, 0);
                }
        }
}
#ifdef CONFIG_XPS
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P)             \
        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))

static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
                                        int cpu, u16 index)
{
        struct xps_map *map = NULL;
        int pos;

        if (dev_maps)
                map = xmap_dereference(dev_maps->cpu_map[cpu]);

        for (pos = 0; map && pos < map->len; pos++) {
                if (map->queues[pos] == index) {
                        if (map->len > 1) {
                                map->queues[pos] = map->queues[--map->len];
                        } else {
                                RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
                                kfree_rcu(map, rcu);
                                map = NULL;
                        }
                        break;
                }
        }

        return map;
}
static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
        struct xps_dev_maps *dev_maps;
        int cpu, i;
        bool active = false;

        mutex_lock(&xps_map_mutex);
        dev_maps = xmap_dereference(dev->xps_maps);

        if (!dev_maps)
                goto out_no_maps;

        for_each_possible_cpu(cpu) {
                for (i = index; i < dev->num_tx_queues; i++) {
                        if (!remove_xps_queue(dev_maps, cpu, i))
                                break;
                }
                if (i == dev->num_tx_queues)
                        active = true;
        }

        if (!active) {
                RCU_INIT_POINTER(dev->xps_maps, NULL);
                kfree_rcu(dev_maps, rcu);
        }

        for (i = index; i < dev->num_tx_queues; i++)
                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
                                             NUMA_NO_NODE);

out_no_maps:
        mutex_unlock(&xps_map_mutex);
}
static struct xps_map *expand_xps_map(struct xps_map *map,
                                      int cpu, u16 index)
{
        struct xps_map *new_map;
        int alloc_len = XPS_MIN_MAP_ALLOC;
        int i, pos;

        for (pos = 0; map && pos < map->len; pos++) {
                if (map->queues[pos] != index)
                        continue;
                return map;
        }

        /* Need to add queue to this CPU's existing map */
        if (map) {
                if (pos < map->alloc_len)
                        return map;

                alloc_len = map->alloc_len * 2;
        }

        /* Need to allocate new map to store queue on this CPU's map */
        new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
                               cpu_to_node(cpu));
        if (!new_map)
                return NULL;

        for (i = 0; i < pos; i++)
                new_map->queues[i] = map->queues[i];
        new_map->alloc_len = alloc_len;
        new_map->len = pos;

        return new_map;
}
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
                        u16 index)
{
        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
        struct xps_map *map, *new_map;
        int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
        int cpu, numa_node_id = -2;
        bool active = false;

        mutex_lock(&xps_map_mutex);

        dev_maps = xmap_dereference(dev->xps_maps);

        /* allocate memory for queue storage */
        for_each_online_cpu(cpu) {
                if (!cpumask_test_cpu(cpu, mask))
                        continue;

                if (!new_dev_maps)
                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
                if (!new_dev_maps) {
                        mutex_unlock(&xps_map_mutex);
                        return -ENOMEM;
                }

                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
                                 NULL;

                map = expand_xps_map(map, cpu, index);
                if (!map)
                        goto error;

                RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
        }

        if (!new_dev_maps)
                goto out_no_new_maps;

        for_each_possible_cpu(cpu) {
                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
                        /* add queue to CPU maps */
                        int pos = 0;

                        map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
                        while ((pos < map->len) && (map->queues[pos] != index))
                                pos++;

                        if (pos == map->len)
                                map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
                        if (numa_node_id == -2)
                                numa_node_id = cpu_to_node(cpu);
                        else if (numa_node_id != cpu_to_node(cpu))
                                numa_node_id = -1;
#endif
                } else if (dev_maps) {
                        /* fill in the new device map from the old device map */
                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
                        RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
                }

        }

        rcu_assign_pointer(dev->xps_maps, new_dev_maps);

        /* Cleanup old maps */
        if (dev_maps) {
                for_each_possible_cpu(cpu) {
                        new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
                        if (map && map != new_map)
                                kfree_rcu(map, rcu);
                }

                kfree_rcu(dev_maps, rcu);
        }

        dev_maps = new_dev_maps;
        active = true;

out_no_new_maps:
        /* update Tx queue numa node */
        netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
                                     (numa_node_id >= 0) ? numa_node_id :
                                     NUMA_NO_NODE);

        if (!dev_maps)
                goto out_no_maps;

        /* removes queue from unused CPUs */
        for_each_possible_cpu(cpu) {
                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
                        continue;

                if (remove_xps_queue(dev_maps, cpu, index))
                        active = true;
        }

        /* free map if not active */
        if (!active) {
                RCU_INIT_POINTER(dev->xps_maps, NULL);
                kfree_rcu(dev_maps, rcu);
        }

out_no_maps:
        mutex_unlock(&xps_map_mutex);

        return 0;
error:
        /* remove any maps that we added */
        for_each_possible_cpu(cpu) {
                new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
                                 NULL;
                if (new_map && new_map != map)
                        kfree(new_map);
        }

        mutex_unlock(&xps_map_mutex);

        kfree(new_dev_maps);
        return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);

#endif /* CONFIG_XPS */
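
/*
 * Example (sketch): a driver with one TX queue per CPU could pin queue i to
 * CPU i at probe time (error handling elided):
 *
 *      for (i = 0; i < dev->real_num_tx_queues; i++)
 *              netif_set_xps_queue(dev, cpumask_of(i), i);
 *
 * This mirrors what writing a CPU mask to
 * /sys/class/net/<iface>/queues/tx-<n>/xps_cpus does from user space.
 */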
/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
 */
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
        int rc;

        if (txq < 1 || txq > dev->num_tx_queues)
                return -EINVAL;

        if (dev->reg_state == NETREG_REGISTERED ||
            dev->reg_state == NETREG_UNREGISTERING) {
                ASSERT_RTNL();

                rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
                                                  txq);
                if (rc)
                        return rc;

                if (dev->num_tc)
                        netif_setup_tc(dev, txq);

                if (txq < dev->real_num_tx_queues) {
                        qdisc_reset_all_tx_gt(dev, txq);
#ifdef CONFIG_XPS
                        netif_reset_xps_queues_gt(dev, txq);
#endif
                }
        }

        dev->real_num_tx_queues = txq;
        return 0;
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
#ifdef CONFIG_SYSFS
/**
 *      netif_set_real_num_rx_queues - set actual number of RX queues used
 *      @dev: Network device
 *      @rxq: Actual number of RX queues
 *
 *      This must be called either with the rtnl_lock held or before
 *      registration of the net device.  Returns 0 on success, or a
 *      negative error code.  If called before registration, it always
 *      succeeds.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
        int rc;

        if (rxq < 1 || rxq > dev->num_rx_queues)
                return -EINVAL;

        if (dev->reg_state == NETREG_REGISTERED) {
                ASSERT_RTNL();

                rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
                                                  rxq);
                if (rc)
                        return rc;
        }

        dev->real_num_rx_queues = rxq;
        return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
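
/*
 * Example (sketch): a multiqueue driver that allocated its net_device with
 * the hardware maximum can later trim to what the device actually enabled
 * (hw_txqs/hw_rxqs are hypothetical driver values):
 *
 *      rtnl_lock();
 *      netif_set_real_num_tx_queues(dev, hw_txqs);
 *      netif_set_real_num_rx_queues(dev, hw_rxqs);
 *      rtnl_unlock();
 */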
/**
 * netif_get_num_default_rss_queues - default number of RSS queues
 *
 * This routine should set an upper limit on the number of RSS queues
 * used by default by multiqueue devices.
 */
int netif_get_num_default_rss_queues(void)
{
        return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
}
EXPORT_SYMBOL(netif_get_num_default_rss_queues);
static inline void __netif_reschedule(struct Qdisc *q)
{
        struct softnet_data *sd;
        unsigned long flags;

        local_irq_save(flags);
        sd = this_cpu_ptr(&softnet_data);
        q->next_sched = NULL;
        *sd->output_queue_tailp = q;
        sd->output_queue_tailp = &q->next_sched;
        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_restore(flags);
}

void __netif_schedule(struct Qdisc *q)
{
        if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
                __netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);
struct dev_kfree_skb_cb {
        enum skb_free_reason reason;
};

static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
{
        return (struct dev_kfree_skb_cb *)skb->cb;
}

void netif_schedule_queue(struct netdev_queue *txq)
{
        rcu_read_lock();
        if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
                struct Qdisc *q = rcu_dereference(txq->qdisc);

                __netif_schedule(q);
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL(netif_schedule_queue);
/**
 *      netif_wake_subqueue - allow sending packets on subqueue
 *      @dev: network device
 *      @queue_index: sub queue index
 *
 *      Resume individual transmit queue of a device with multiple transmit queues.
 */
void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);

        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
                struct Qdisc *q;

                rcu_read_lock();
                q = rcu_dereference(txq->qdisc);
                __netif_schedule(q);
                rcu_read_unlock();
        }
}
EXPORT_SYMBOL(netif_wake_subqueue);

void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
        if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
                struct Qdisc *q;

                rcu_read_lock();
                q = rcu_dereference(dev_queue->qdisc);
                __netif_schedule(q);
                rcu_read_unlock();
        }
}
EXPORT_SYMBOL(netif_tx_wake_queue);
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
        unsigned long flags;

        if (likely(atomic_read(&skb->users) == 1)) {
                smp_rmb();
                atomic_set(&skb->users, 0);
        } else if (likely(!atomic_dec_and_test(&skb->users))) {
                return;
        }
        get_kfree_skb_cb(skb)->reason = reason;
        local_irq_save(flags);
        skb->next = __this_cpu_read(softnet_data.completion_queue);
        __this_cpu_write(softnet_data.completion_queue, skb);
        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);

void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
{
        if (in_irq() || irqs_disabled())
                __dev_kfree_skb_irq(skb, reason);
        else
                dev_kfree_skb(skb);
}
EXPORT_SYMBOL(__dev_kfree_skb_any);
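
/*
 * Example (sketch): a TX-completion handler that may run in hardirq context
 * frees transmitted buffers through the _any wrappers, which pick the safe
 * path for the current context:
 *
 *      dev_kfree_skb_any(skb);         free, counted as a drop
 *      dev_consume_skb_any(skb);       free after successful transmit
 */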
/**
 *      netif_device_detach - mark device as removed
 *      @dev: network device
 *
 *      Mark device as removed from system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
        if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
            netif_running(dev)) {
                netif_tx_stop_all_queues(dev);
        }
}
EXPORT_SYMBOL(netif_device_detach);

/**
 *      netif_device_attach - mark device as attached
 *      @dev: network device
 *
 *      Mark device as attached from system and restart if needed.
 */
void netif_device_attach(struct net_device *dev)
{
        if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
            netif_running(dev)) {
                netif_tx_wake_all_queues(dev);
                __netdev_watchdog_up(dev);
        }
}
EXPORT_SYMBOL(netif_device_attach);
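
/*
 * Example (sketch with hypothetical driver helpers): drivers commonly
 * bracket suspend/resume with these calls, e.g. in dev_pm_ops callbacks:
 *
 *      static int my_suspend(struct device *d)
 *      {
 *              struct net_device *dev = dev_get_drvdata(d);
 *
 *              netif_device_detach(dev);       stop queues, mark absent
 *              my_hw_down(dev);
 *              return 0;
 *      }
 *
 *      static int my_resume(struct device *d)
 *      {
 *              struct net_device *dev = dev_get_drvdata(d);
 *
 *              my_hw_up(dev);
 *              netif_device_attach(dev);       mark present, wake queues
 *              return 0;
 *      }
 */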
static void skb_warn_bad_offload(const struct sk_buff *skb)
{
        static const netdev_features_t null_features = 0;
        struct net_device *dev = skb->dev;
        const char *driver = "";

        if (!net_ratelimit())
                return;

        if (dev && dev->dev.parent)
                driver = dev_driver_string(dev->dev.parent);

        WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
             "gso_type=%d ip_summed=%d\n",
             driver, dev ? &dev->features : &null_features,
             skb->sk ? &skb->sk->sk_route_caps : &null_features,
             skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
             skb_shinfo(skb)->gso_type, skb->ip_summed);
}
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		skb_warn_bad_offload(skb);
		return -EINVAL;
	}

	/* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity: the checksum could be wrong.
	 */
	if (skb_has_shared_frag(skb)) {
		ret = __skb_linearize(skb);
		if (ret)
			goto out;
	}

	offset = skb_checksum_start_offset(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
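
/* Example (illustrative sketch, an assumption rather than a pattern taken
 * from this file): a driver whose hardware cannot checksum a particular
 * packet can fall back to the helper above from its ndo_start_xmit() before
 * handing the frame to the hardware. foo_hw_can_csum() is hypothetical.
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    !foo_hw_can_csum(skb) &&		// hypothetical capability test
 *	    skb_checksum_help(skb))
 *		goto drop;			// helper failed, drop the frame
 */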
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
	unsigned int vlan_depth = skb->mac_len;
	__be16 type = skb->protocol;

	/* Tunnel gso handlers can set protocol to ethernet. */
	if (type == htons(ETH_P_TEB)) {
		struct ethhdr *eth;

		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
			return 0;

		eth = (struct ethhdr *)skb_mac_header(skb);
		type = eth->h_proto;
	}

	/* if skb->protocol is 802.1Q/AD then the header should already be
	 * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
	 * ETH_HLEN otherwise
	 */
	if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
		if (vlan_depth) {
			if (WARN_ON(vlan_depth < VLAN_HLEN))
				return 0;
			vlan_depth -= VLAN_HLEN;
		} else {
			vlan_depth = ETH_HLEN;
		}
		do {
			struct vlan_hdr *vh;

			if (unlikely(!pskb_may_pull(skb,
						    vlan_depth + VLAN_HLEN)))
				return 0;

			vh = (struct vlan_hdr *)(skb->data + vlan_depth);
			type = vh->h_vlan_encapsulated_proto;
			vlan_depth += VLAN_HLEN;
		} while (type == htons(ETH_P_8021Q) ||
			 type == htons(ETH_P_8021AD));
	}

	*depth = vlan_depth;

	return type;
}
/**
 *	skb_mac_gso_segment - mac layer segmentation handler.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 */
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
				    netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_offload *ptype;
	int vlan_depth = skb->mac_len;
	__be16 type = skb_network_protocol(skb, &vlan_depth);

	if (unlikely(!type))
		return ERR_PTR(-EINVAL);

	__skb_pull(skb, vlan_depth);

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &offload_base, list) {
		if (ptype->type == type && ptype->callbacks.gso_segment) {
			segs = ptype->callbacks.gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_mac_gso_segment);
/* openvswitch calls this on rx path, so we need a different check.
 */
static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
{
	if (tx_path)
		return skb->ip_summed != CHECKSUM_PARTIAL;
	else
		return skb->ip_summed == CHECKSUM_NONE;
}
/**
 *	__skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *	@tx_path: whether it is called in TX path
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 */
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
				  netdev_features_t features, bool tx_path)
{
	if (unlikely(skb_needs_check(skb, tx_path))) {
		int err;

		skb_warn_bad_offload(skb);

		err = skb_cow_head(skb, 0);
		if (err < 0)
			return ERR_PTR(err);
	}

	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
	SKB_GSO_CB(skb)->encap_level = 0;

	skb_reset_mac_header(skb);
	skb_reset_mac_len(skb);

	return skb_mac_gso_segment(skb, features);
}
EXPORT_SYMBOL(__skb_gso_segment);
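
/* Example (illustrative sketch): drivers normally reach this path through
 * the core's validate_xmit_skb(), but code doing its own software GSO
 * fallback could use the skb_gso_segment() wrapper and walk the returned
 * segment list. foo_xmit_one() is hypothetical.
 *
 *	struct sk_buff *segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 *
 *	if (IS_ERR(segs))
 *		return PTR_ERR(segs);
 *	if (segs) {
 *		consume_skb(skb);		// original skb is now redundant
 *		do {
 *			struct sk_buff *next = segs->next;
 *
 *			segs->next = NULL;
 *			foo_xmit_one(segs);	// hypothetical per-segment xmit
 *			segs = next;
 *		} while (segs);
 *	}
 */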
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (net_ratelimit()) {
		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
		dump_stack();
	}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
/* Actually, we should eliminate this check as soon as we know that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *pdev = dev->dev.parent;

		if (!pdev)
			return 0;
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			dma_addr_t addr = page_to_phys(skb_frag_page(frag));

			if (!pdev->dma_mask ||
			    addr + PAGE_SIZE - 1 > *pdev->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
/* If MPLS offload request, verify we are testing hardware MPLS features
 * instead of standard features for the netdev.
 */
#ifdef CONFIG_NET_MPLS_GSO
static netdev_features_t net_mpls_features(struct sk_buff *skb,
					   netdev_features_t features,
					   __be16 type)
{
	if (eth_p_mpls(type))
		features &= skb->dev->mpls_features;

	return features;
}
#else
static netdev_features_t net_mpls_features(struct sk_buff *skb,
					   netdev_features_t features,
					   __be16 type)
{
	return features;
}
#endif

static netdev_features_t harmonize_features(struct sk_buff *skb,
					    netdev_features_t features)
{
	int tmp;
	__be16 type;

	type = skb_network_protocol(skb, &tmp);
	features = net_mpls_features(skb, features, type);

	if (skb->ip_summed != CHECKSUM_NONE &&
	    !can_checksum_protocol(features, type)) {
		features &= ~NETIF_F_ALL_CSUM;
	} else if (illegal_highdma(skb->dev, skb)) {
		features &= ~NETIF_F_SG;
	}

	return features;
}
netdev_features_t netif_skb_features(struct sk_buff *skb)
{
	const struct net_device *dev = skb->dev;
	netdev_features_t features = dev->features;
	u16 gso_segs = skb_shinfo(skb)->gso_segs;
	__be16 protocol = skb->protocol;

	if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
		features &= ~NETIF_F_GSO_MASK;

	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;

		protocol = veh->h_vlan_encapsulated_proto;
	} else if (!vlan_tx_tag_present(skb)) {
		return harmonize_features(skb, features);
	}

	features = netdev_intersect_features(features,
					     dev->vlan_features |
					     NETIF_F_HW_VLAN_CTAG_TX |
					     NETIF_F_HW_VLAN_STAG_TX);

	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
		features = netdev_intersect_features(features,
						     NETIF_F_SG |
						     NETIF_F_HIGHDMA |
						     NETIF_F_FRAGLIST |
						     NETIF_F_GEN_CSUM |
						     NETIF_F_HW_VLAN_CTAG_TX |
						     NETIF_F_HW_VLAN_STAG_TX);

	return harmonize_features(skb, features);
}
EXPORT_SYMBOL(netif_skb_features);
static int xmit_one(struct sk_buff *skb, struct net_device *dev,
		    struct netdev_queue *txq, bool more)
{
	unsigned int len;
	int rc;

	if (!list_empty(&ptype_all))
		dev_queue_xmit_nit(skb, dev);

	len = skb->len;
	trace_net_dev_start_xmit(skb, dev);
	rc = netdev_start_xmit(skb, dev, txq, more);
	trace_net_dev_xmit(skb, rc, dev, len);

	return rc;
}
struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
				    struct netdev_queue *txq, int *ret)
{
	struct sk_buff *skb = first;
	int rc = NETDEV_TX_OK;

	while (skb) {
		struct sk_buff *next = skb->next;

		skb->next = NULL;
		rc = xmit_one(skb, dev, txq, next != NULL);
		if (unlikely(!dev_xmit_complete(rc))) {
			skb->next = next;
			goto out;
		}

		skb = next;
		if (netif_xmit_stopped(txq) && skb) {
			rc = NETDEV_TX_BUSY;
			break;
		}
	}

out:
	*ret = rc;
	return skb;
}
static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
					  netdev_features_t features)
{
	if (vlan_tx_tag_present(skb) &&
	    !vlan_hw_offload_capable(features, skb->vlan_proto)) {
		skb = __vlan_put_tag(skb, skb->vlan_proto,
				     vlan_tx_tag_get(skb));
		if (skb)
			skb->vlan_tci = 0;
	}
	return skb;
}
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
{
	netdev_features_t features;

	if (skb->next)
		return skb;

	features = netif_skb_features(skb);
	skb = validate_xmit_vlan(skb, features);
	if (unlikely(!skb))
		goto out_null;

	/* If encapsulation offload request, verify we are testing
	 * hardware encapsulation features instead of standard
	 * features for the netdev
	 */
	if (skb->encapsulation)
		features &= dev->hw_enc_features;

	if (netif_needs_gso(dev, skb, features)) {
		struct sk_buff *segs;

		segs = skb_gso_segment(skb, features);
		if (IS_ERR(segs)) {
			goto out_kfree_skb;
		} else if (segs) {
			consume_skb(skb);
			skb = segs;
		}
	} else {
		if (skb_needs_linearize(skb, features) &&
		    __skb_linearize(skb))
			goto out_kfree_skb;

		/* If packet is not checksummed and device does not
		 * support checksumming for this protocol, complete
		 * checksumming here.
		 */
		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			if (skb->encapsulation)
				skb_set_inner_transport_header(skb,
							       skb_checksum_start_offset(skb));
			else
				skb_set_transport_header(skb,
							 skb_checksum_start_offset(skb));
			if (!(features & NETIF_F_ALL_CSUM) &&
			    skb_checksum_help(skb))
				goto out_kfree_skb;
		}
	}

	return skb;

out_kfree_skb:
	kfree_skb(skb);
out_null:
	return NULL;
}
struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
{
	struct sk_buff *next, *head = NULL, *tail;

	for (; skb != NULL; skb = next) {
		next = skb->next;
		skb->next = NULL;

		/* in case skb won't be segmented, point to itself */
		skb->prev = skb;

		skb = validate_xmit_skb(skb, dev);
		if (!skb)
			continue;

		if (!head)
			head = skb;
		else
			tail->next = skb;
		/* If skb was segmented, skb->prev points to
		 * the last segment. If not, it still contains skb.
		 */
		tail = skb->prev;
	}
	return head;
}
static void qdisc_pkt_len_init(struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);

	qdisc_skb_cb(skb)->pkt_len = skb->len;

	/* To get more precise estimation of bytes sent on wire,
	 * we add to pkt_len the headers size of all segments
	 */
	if (shinfo->gso_size) {
		unsigned int hdr_len;
		u16 gso_segs = shinfo->gso_segs;

		/* mac layer + network layer */
		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);

		/* + transport layer */
		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
			hdr_len += tcp_hdrlen(skb);
		else
			hdr_len += sizeof(struct udphdr);

		if (shinfo->gso_type & SKB_GSO_DODGY)
			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
						shinfo->gso_size);

		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
	}
}
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	bool contended;
	int rc;

	qdisc_pkt_len_init(skb);
	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits __QDISC___STATE_RUNNING owner to get the lock more
	 * often and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
static void skb_update_prio(struct sk_buff *skb)
{
	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);

	if (!skb->priority && skb->sk && map) {
		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;

		if (prioidx < map->priomap_len)
			skb->priority = map->priomap[prioidx];
	}
}
#else
#define skb_update_prio(skb)
#endif

static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10
/**
 *	dev_loopback_xmit - loop back @skb
 *	@skb: buffer to transmit
 */
int dev_loopback_xmit(struct sk_buff *skb)
{
	skb_reset_mac_header(skb);
	__skb_pull(skb, skb_network_offset(skb));
	skb->pkt_type = PACKET_LOOPBACK;
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(skb));
	skb_dst_force(skb);
	netif_rx_ni(skb);
	return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);
/**
 *	__dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *	@accel_priv: private data used for L2 forwarding offload
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 */
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	skb_reset_mac_header(skb);

	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	/* If device/qdisc don't need skb->dst, release it right now while
	 * it's hot in this cpu cache.
	 */
	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

	txq = netdev_pick_tx(dev, skb, accel_priv);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	trace_net_dev_queue(skb);
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and grab the lock; it is not prone to deadlocks.
	   Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		if (txq->xmit_lock_owner != cpu) {

			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
				goto recursion_alert;

			skb = validate_xmit_skb(skb, dev);
			if (!skb)
				goto drop;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				__this_cpu_inc(xmit_recursion);
				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	rc = -ENETDOWN;
drop:
	rcu_read_unlock_bh();

	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
int dev_queue_xmit(struct sk_buff *skb)
{
	return __dev_queue_xmit(skb, NULL);
}
EXPORT_SYMBOL(dev_queue_xmit);
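
/* Example (illustrative sketch): a kernel module injecting a fully built
 * frame. The caller sets skb->dev and builds the headers first;
 * dev_queue_xmit() then consumes the skb whatever the outcome. foo_send()
 * and the IPv4 payload are assumptions for illustration only.
 *
 *	static int foo_send(struct net_device *dev, const void *payload, int len)
 *	{
 *		struct sk_buff *skb;
 *
 *		skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
 *		if (!skb)
 *			return -ENOMEM;
 *		skb_reserve(skb, LL_RESERVED_SPACE(dev));
 *		memcpy(skb_put(skb, len), payload, len);
 *		skb->dev = dev;
 *		skb->protocol = htons(ETH_P_IP);	// assumption: IPv4 payload
 *		if (dev_hard_header(skb, dev, ETH_P_IP, dev->broadcast,
 *				    dev->dev_addr, skb->len) < 0) {
 *			kfree_skb(skb);
 *			return -EINVAL;
 *		}
 *		return dev_queue_xmit(skb);	// may return positive NET_XMIT_* codes
 *	}
 */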
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	return __dev_queue_xmit(skb, accel_priv);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
/*=======================================================================
			Receiver routines
  =======================================================================*/

int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}

#ifdef CONFIG_RPS

/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);

struct static_key rps_needed __read_mostly;
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	    struct rps_dev_flow *rflow, u16 next_cpu)
{
	if (next_cpu != RPS_NO_CPU) {
#ifdef CONFIG_RFS_ACCEL
		struct netdev_rx_queue *rxqueue;
		struct rps_dev_flow_table *flow_table;
		struct rps_dev_flow *old_rflow;
		u32 flow_id;
		u16 rxq_index;
		int rc;

		/* Should we steer this flow to a different hardware queue? */
		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
		    !(dev->features & NETIF_F_NTUPLE))
			goto out;
		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		if (rxq_index == skb_get_rx_queue(skb))
			goto out;

		rxqueue = dev->_rx + rxq_index;
		flow_table = rcu_dereference(rxqueue->rps_flow_table);
		if (!flow_table)
			goto out;
		flow_id = skb_get_hash(skb) & flow_table->mask;
		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
							rxq_index, flow_id);
		if (rc < 0)
			goto out;
		old_rflow = rflow;
		rflow = &flow_table->flows[flow_id];
		rflow->filter = rc;
		if (old_rflow->filter == rflow->filter)
			old_rflow->filter = RPS_NO_FILTER;
	out:
#endif
		rflow->last_qtail =
			per_cpu(softnet_data, next_cpu).input_queue_head;
	}

	rflow->cpu = next_cpu;
	return rflow;
}
/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	struct netdev_rx_queue *rxqueue;
	struct rps_map *map;
	struct rps_dev_flow_table *flow_table;
	struct rps_sock_flow_table *sock_flow_table;
	int cpu = -1;
	u16 tcpu;
	u32 hash;

	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);

		if (unlikely(index >= dev->real_num_rx_queues)) {
			WARN_ONCE(dev->real_num_rx_queues > 1,
				  "%s received packet on queue %u, but number "
				  "of RX queues is %u\n",
				  dev->name, index, dev->real_num_rx_queues);
			goto done;
		}
		rxqueue = dev->_rx + index;
	} else
		rxqueue = dev->_rx;

	map = rcu_dereference(rxqueue->rps_map);
	if (map) {
		if (map->len == 1 &&
		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
			tcpu = map->cpus[0];
			if (cpu_online(tcpu))
				cpu = tcpu;
			goto done;
		}
	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
		goto done;
	}

	skb_reset_network_header(skb);
	hash = skb_get_hash(skb);
	if (!hash)
		goto done;

	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		u16 next_cpu;
		struct rps_dev_flow *rflow;

		rflow = &flow_table->flows[hash & flow_table->mask];
		tcpu = rflow->cpu;

		next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (equal to RPS_NO_CPU).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0)) {
			tcpu = next_cpu;
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
		}

		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}

	if (map) {
		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}
#ifdef CONFIG_RFS_ACCEL

/**
 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 * @dev: Device on which the filter was set
 * @rxq_index: RX queue index
 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 *
 * Drivers that implement ndo_rx_flow_steer() should periodically call
 * this function for each installed filter and remove the filters for
 * which it returns %true.
 */
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
			 u32 flow_id, u16 filter_id)
{
	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
	struct rps_dev_flow_table *flow_table;
	struct rps_dev_flow *rflow;
	bool expire = true;
	int cpu;

	rcu_read_lock();
	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	if (flow_table && flow_id <= flow_table->mask) {
		rflow = &flow_table->flows[flow_id];
		cpu = ACCESS_ONCE(rflow->cpu);
		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
			   rflow->last_qtail) <
		     (int)(10 * flow_table->mask)))
			expire = false;
	}
	rcu_read_unlock();
	return expire;
}
EXPORT_SYMBOL(rps_may_expire_flow);
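
/* Example (illustrative sketch): a driver implementing ndo_rx_flow_steer()
 * would scan its filter table periodically, e.g. from a timer or work item,
 * and free entries that the helper above flags as expirable. The foo_*
 * names and table layout are hypothetical.
 *
 *	for (i = 0; i < priv->n_filters; i++) {
 *		struct foo_filter *f = &priv->filters[i];
 *
 *		if (f->in_use &&
 *		    rps_may_expire_flow(priv->netdev, f->rxq_index,
 *					f->flow_id, i)) {
 *			foo_hw_remove_filter(priv, i);	// hypothetical hw op
 *			f->in_use = false;
 *		}
 *	}
 */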
#endif /* CONFIG_RFS_ACCEL */

/* Called from hardirq (IPI) context */
static void rps_trigger_softirq(void *data)
{
	struct softnet_data *sd = data;

	____napi_schedule(sd, &sd->backlog);
	sd->received_rps++;
}

#endif /* CONFIG_RPS */
/*
 * Check if this softnet_data structure is another cpu's.
 * If yes, queue it to our IPI list and return 1,
 * otherwise return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);

	if (sd != mysd) {
		sd->rps_ipi_next = mysd->rps_ipi_list;
		mysd->rps_ipi_list = sd;

		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
		return 1;
	}
#endif /* CONFIG_RPS */
	return 0;
}
#ifdef CONFIG_NET_FLOW_LIMIT
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif

static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
{
#ifdef CONFIG_NET_FLOW_LIMIT
	struct sd_flow_limit *fl;
	struct softnet_data *sd;
	unsigned int old_flow, new_flow;

	if (qlen < (netdev_max_backlog >> 1))
		return false;

	sd = this_cpu_ptr(&softnet_data);

	rcu_read_lock();
	fl = rcu_dereference(sd->flow_limit);
	if (fl) {
		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
		old_flow = fl->history[fl->history_head];
		fl->history[fl->history_head] = new_flow;

		fl->history_head++;
		fl->history_head &= FLOW_LIMIT_HISTORY - 1;

		if (likely(fl->buckets[old_flow]))
			fl->buckets[old_flow]--;

		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
			fl->count++;
			rcu_read_unlock();
			return true;
		}
	}
	rcu_read_unlock();
#endif
	return false;
}
/*
 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
	struct softnet_data *sd;
	unsigned long flags;
	unsigned int qlen;

	sd = &per_cpu(softnet_data, cpu);

	local_irq_save(flags);

	rps_lock(sd);
	qlen = skb_queue_len(&sd->input_pkt_queue);
	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
		if (skb_queue_len(&sd->input_pkt_queue)) {
enqueue:
			__skb_queue_tail(&sd->input_pkt_queue, skb);
			input_queue_tail_incr_save(sd, qtail);
			rps_unlock(sd);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		/* Schedule NAPI for backlog device
		 * We can use non atomic operation since we own the queue lock
		 */
		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
			if (!rps_ipi_queued(sd))
				____napi_schedule(sd, &sd->backlog);
		}
		goto enqueue;
	}

	sd->dropped++;
	rps_unlock(sd);

	local_irq_restore(flags);

	atomic_long_inc(&skb->dev->rx_dropped);
	kfree_skb(skb);
	return NET_RX_DROP;
}
static int netif_rx_internal(struct sk_buff *skb)
{
	int ret;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	trace_netif_rx(skb);
#ifdef CONFIG_RPS
	if (static_key_false(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu;

		preempt_disable();
		rcu_read_lock();

		cpu = get_rps_cpu(skb->dev, skb, &rflow);
		if (cpu < 0)
			cpu = smp_processor_id();

		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

		rcu_read_unlock();
		preempt_enable();
	} else
#endif
	{
		unsigned int qtail;

		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
		put_cpu();
	}
	return ret;
}
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 *	may be dropped during processing for congestion control or by the
 *	protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */
int netif_rx(struct sk_buff *skb)
{
	trace_netif_rx_entry(skb);

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL(netif_rx);
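
/* Example (illustrative sketch): a non-NAPI driver passing a received frame
 * up the stack from its interrupt handler. pkt_len and rx_buf stand in for
 * the driver's hardware state and are hypothetical.
 *
 *	skb = netdev_alloc_skb_ip_align(dev, pkt_len);
 *	if (!skb) {
 *		dev->stats.rx_dropped++;
 *		return;
 *	}
 *	memcpy(skb_put(skb, pkt_len), rx_buf, pkt_len);	// copy out of hw buffer
 *	skb->protocol = eth_type_trans(skb, dev);	// sets skb->dev, pkt_type
 *	netif_rx(skb);					// queue to per-cpu backlog
 */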
int netif_rx_ni(struct sk_buff *skb)
{
	int err;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();
	err = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return err;
}
EXPORT_SYMBOL(netif_rx_ni);
static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;

			clist = clist->next;

			WARN_ON(atomic_read(&skb->users));
			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
				trace_consume_skb(skb);
			else
				trace_kfree_skb(skb, net_tx_action);
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			root_lock = qdisc_lock(q);
			if (spin_trylock(root_lock)) {
				smp_mb__before_atomic();
				clear_bit(__QDISC_STATE_SCHED,
					  &q->state);
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				if (!test_bit(__QDISC_STATE_DEACTIVATED,
					      &q->state)) {
					__netif_reschedule(q);
				} else {
					smp_mb__before_atomic();
					clear_bit(__QDISC_STATE_SCHED,
						  &q->state);
				}
			}
		}
	}
}
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/* This hook is defined here for ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for a compare and two
 * extra stores right now if we don't have it on, but do have
 * CONFIG_NET_CLS_ACT.
 * NOTE: This doesn't stop any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 */
static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
{
	struct net_device *dev = skb->dev;
	u32 ttl = G_TC_RTTL(skb->tc_verd);
	int result = TC_ACT_OK;
	struct Qdisc *q;

	if (unlikely(MAX_RED_LOOP < ttl++)) {
		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
				     skb->skb_iif, dev->ifindex);
		return TC_ACT_SHOT;
	}

	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);

	q = rcu_dereference(rxq->qdisc);
	if (q != &noop_qdisc) {
		spin_lock(qdisc_lock(q));
		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
			result = qdisc_enqueue_root(skb, q);
		spin_unlock(qdisc_lock(q));
	}

	return result;
}
static inline struct sk_buff *handle_ing(struct sk_buff *skb,
					 struct packet_type **pt_prev,
					 int *ret, struct net_device *orig_dev)
{
	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);

	if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
		goto out;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	switch (ing_filter(skb, rxq)) {
	case TC_ACT_SHOT:
	case TC_ACT_STOLEN:
		kfree_skb(skb);
		return NULL;
	}

out:
	skb->tc_verd = 0;
	return skb;
}
#endif
/**
 *	netdev_rx_handler_register - register receive handler
 *	@dev: device to register a handler for
 *	@rx_handler: receive handler to register
 *	@rx_handler_data: data pointer that is used by rx handler
 *
 *	Register a receive handler for a device. This handler will then be
 *	called from __netif_receive_skb. A negative errno code is returned
 *	on a failure.
 *
 *	The caller must hold the rtnl_mutex.
 *
 *	For a general description of rx_handler, see enum rx_handler_result.
 */
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data)
{
	ASSERT_RTNL();

	if (dev->rx_handler)
		return -EBUSY;

	/* Note: rx_handler_data must be set before rx_handler */
	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
	rcu_assign_pointer(dev->rx_handler, rx_handler);

	return 0;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
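
/* Example (illustrative sketch): this is the hook used by bridge, bonding,
 * macvlan and friends. A minimal handler for a hypothetical aggregation
 * device could redirect every frame to its upper device; all foo_* names
 * are made up.
 *
 *	static rx_handler_result_t foo_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *		struct foo_port *port = rcu_dereference(skb->dev->rx_handler_data);
 *
 *		skb->dev = port->aggregate_dev;	// redirect to upper device
 *		return RX_HANDLER_ANOTHER;	// re-run another_round above
 *	}
 *
 *	// under rtnl_lock():
 *	err = netdev_rx_handler_register(lower_dev, foo_handle_frame, port);
 */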
/**
 *	netdev_rx_handler_unregister - unregister receive handler
 *	@dev: device to unregister a handler from
 *
 *	Unregister a receive handler from a device.
 *
 *	The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{
	ASSERT_RTNL();
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
	 * section has a guarantee to see a non NULL rx_handler_data
	 * as well.
	 */
	synchronize_net();
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
/*
 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 * the special handling of PFMEMALLOC skbs.
 */
static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
{
	switch (skb->protocol) {
	case htons(ETH_P_ARP):
	case htons(ETH_P_IP):
	case htons(ETH_P_IPV6):
	case htons(ETH_P_8021Q):
	case htons(ETH_P_8021AD):
		return true;
	default:
		return false;
	}
}
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	struct net_device *null_or_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

	rcu_read_lock();

another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
		skb = skb_vlan_untag(skb);
		if (unlikely(!skb))
			goto unlock;
	}

#ifdef CONFIG_NET_CLS_ACT
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	if (pfmemalloc)
		goto skip_taps;

	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

skip_taps:
#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto unlock;
ncls:
#endif

	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

	if (vlan_tx_tag_present(skb)) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto unlock;
	}

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			ret = NET_RX_SUCCESS;
			goto unlock;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	if (unlikely(vlan_tx_tag_present(skb))) {
		if (vlan_tx_tag_get_id(skb))
			skb->pkt_type = PACKET_OTHERHOST;
		/* Note: we might in the future use prio bits
		 * and set skb->priority like in vlan_do_receive()
		 * For the time being, just ignore Priority Code Point
		 */
		skb->vlan_tci = 0;
	}

	/* deliver only exact match when indicated */
	null_or_dev = deliver_exact ? skb->dev : NULL;

	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
		     ptype->dev == orig_dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not be able to escape explaining
		 * how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

unlock:
	rcu_read_unlock();
	return ret;
}
static int __netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
		unsigned long pflags = current->flags;

		/*
		 * PFMEMALLOC skbs are special, they should
		 * - be delivered to SOCK_MEMALLOC sockets only
		 * - stay away from userspace
		 * - have bounded memory usage
		 *
		 * Use PF_MEMALLOC as this saves us from propagating the allocation
		 * context down to all allocation sites.
		 */
		current->flags |= PF_MEMALLOC;
		ret = __netif_receive_skb_core(skb, true);
		tsk_restore_flags(current, pflags, PF_MEMALLOC);
	} else
		ret = __netif_receive_skb_core(skb, false);

	return ret;
}
static int netif_receive_skb_internal(struct sk_buff *skb)
{
	net_timestamp_check(netdev_tstamp_prequeue, skb);

	if (skb_defer_rx_timestamp(skb))
		return NET_RX_SUCCESS;

#ifdef CONFIG_RPS
	if (static_key_false(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu;

		rcu_read_lock();
		cpu = get_rps_cpu(skb->dev, skb, &rflow);

		if (cpu >= 0) {
			int ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

			rcu_read_unlock();
			return ret;
		}
		rcu_read_unlock();
	}
#endif
	return __netif_receive_skb(skb);
}
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	It always succeeds. The buffer may be dropped during processing
 *	for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	trace_netif_receive_skb_entry(skb);

	return netif_receive_skb_internal(skb);
}
EXPORT_SYMBOL(netif_receive_skb);
/* Network device is going away, flush any packets still pending
 * Called with irqs disabled.
 */
static void flush_backlog(void *arg)
{
	struct net_device *dev = arg;
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	struct sk_buff *skb, *tmp;

	rps_lock(sd);
	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
		if (skb->dev == dev) {
			__skb_unlink(skb, &sd->input_pkt_queue);
			kfree_skb(skb);
			input_queue_head_incr(sd);
		}
	}
	rps_unlock(sd);

	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
		if (skb->dev == dev) {
			__skb_unlink(skb, &sd->process_queue);
			kfree_skb(skb);
			input_queue_head_incr(sd);
		}
	}
}
*skb
)
3860 struct packet_offload
*ptype
;
3861 __be16 type
= skb
->protocol
;
3862 struct list_head
*head
= &offload_base
;
3865 BUILD_BUG_ON(sizeof(struct napi_gro_cb
) > sizeof(skb
->cb
));
3867 if (NAPI_GRO_CB(skb
)->count
== 1) {
3868 skb_shinfo(skb
)->gso_size
= 0;
3873 list_for_each_entry_rcu(ptype
, head
, list
) {
3874 if (ptype
->type
!= type
|| !ptype
->callbacks
.gro_complete
)
3877 err
= ptype
->callbacks
.gro_complete(skb
, 0);
3883 WARN_ON(&ptype
->list
== head
);
3885 return NET_RX_SUCCESS
;
3889 return netif_receive_skb_internal(skb
);
/* napi->gro_list contains packets ordered by age.
 * youngest packets at the head of it.
 * Complete skbs in reverse order to reduce latencies.
 */
void napi_gro_flush(struct napi_struct *napi, bool flush_old)
{
	struct sk_buff *skb, *prev = NULL;

	/* scan list and build reverse chain */
	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
		skb->prev = prev;
		prev = skb;
	}

	for (skb = prev; skb; skb = prev) {
		skb->next = NULL;

		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
			return;

		prev = skb->prev;
		napi_gro_complete(skb);
		napi->gro_count--;
	}

	napi->gro_list = NULL;
}
EXPORT_SYMBOL(napi_gro_flush);
static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff *p;
	unsigned int maclen = skb->dev->hard_header_len;
	u32 hash = skb_get_hash_raw(skb);

	for (p = napi->gro_list; p; p = p->next) {
		unsigned long diffs;

		NAPI_GRO_CB(p)->flush = 0;

		if (hash != skb_get_hash_raw(p)) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
		diffs |= p->vlan_tci ^ skb->vlan_tci;
		if (maclen == ETH_HLEN)
			diffs |= compare_ether_header(skb_mac_header(p),
						      skb_mac_header(skb));
		else if (!diffs)
			diffs = memcmp(skb_mac_header(p),
				       skb_mac_header(skb),
				       maclen);
		NAPI_GRO_CB(p)->same_flow = !diffs;
	}
}
static void skb_gro_reset_offset(struct sk_buff *skb)
{
	const struct skb_shared_info *pinfo = skb_shinfo(skb);
	const skb_frag_t *frag0 = &pinfo->frags[0];

	NAPI_GRO_CB(skb)->data_offset = 0;
	NAPI_GRO_CB(skb)->frag0 = NULL;
	NAPI_GRO_CB(skb)->frag0_len = 0;

	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
	    pinfo->nr_frags &&
	    !PageHighMem(skb_frag_page(frag0))) {
		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
	}
}
static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
{
	struct skb_shared_info *pinfo = skb_shinfo(skb);

	BUG_ON(skb->end - skb->tail < grow);

	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

	skb->data_len -= grow;
	skb->tail += grow;

	pinfo->frags[0].page_offset += grow;
	skb_frag_size_sub(&pinfo->frags[0], grow);

	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
		skb_frag_unref(skb, 0);
		memmove(pinfo->frags, pinfo->frags + 1,
			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
	}
}
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &offload_base;
	int same_flow;
	enum gro_result ret;
	int grow;

	if (!(skb->dev->features & NETIF_F_GRO))
		goto normal;

	if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
		goto normal;

	gro_list_prepare(napi, skb);

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;

		skb_set_network_header(skb, skb_gro_offset(skb));
		skb_reset_mac_len(skb);
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = 0;
		NAPI_GRO_CB(skb)->free = 0;
		NAPI_GRO_CB(skb)->udp_mark = 0;

		/* Setup for GRO checksum validation */
		switch (skb->ip_summed) {
		case CHECKSUM_COMPLETE:
			NAPI_GRO_CB(skb)->csum = skb->csum;
			NAPI_GRO_CB(skb)->csum_valid = 1;
			NAPI_GRO_CB(skb)->csum_cnt = 0;
			break;
		case CHECKSUM_UNNECESSARY:
			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
			NAPI_GRO_CB(skb)->csum_valid = 0;
			break;
		default:
			NAPI_GRO_CB(skb)->csum_cnt = 0;
			NAPI_GRO_CB(skb)->csum_valid = 0;
		}

		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
		break;
	}
	rcu_read_unlock();

	if (&ptype->list == head)
		goto normal;

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

	if (pp) {
		struct sk_buff *nskb = *pp;

		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		napi->gro_count--;
	}

	if (same_flow)
		goto ok;

	if (NAPI_GRO_CB(skb)->flush)
		goto normal;

	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
		struct sk_buff *nskb = napi->gro_list;

		/* locate the end of the list to select the 'oldest' flow */
		while (nskb->next) {
			pp = &nskb->next;
			nskb = *pp;
		}
		*pp = NULL;
		nskb->next = NULL;
		napi_gro_complete(nskb);
	} else {
		napi->gro_count++;
	}
	NAPI_GRO_CB(skb)->count = 1;
	NAPI_GRO_CB(skb)->age = jiffies;
	NAPI_GRO_CB(skb)->last = skb;
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	skb->next = napi->gro_list;
	napi->gro_list = skb;
	ret = GRO_HELD;

pull:
	grow = skb_gro_offset(skb) - skb_headlen(skb);
	if (grow > 0)
		gro_pull_from_frag0(skb, grow);
ok:
	return ret;

normal:
	ret = GRO_NORMAL;
	goto pull;
}
struct packet_offload *gro_find_receive_by_type(__be16 type)
{
	struct list_head *offload_head = &offload_base;
	struct packet_offload *ptype;

	list_for_each_entry_rcu(ptype, offload_head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;
		return ptype;
	}
	return NULL;
}
EXPORT_SYMBOL(gro_find_receive_by_type);
struct packet_offload *gro_find_complete_by_type(__be16 type)
{
	struct list_head *offload_head = &offload_base;
	struct packet_offload *ptype;

	list_for_each_entry_rcu(ptype, offload_head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_complete)
			continue;
		return ptype;
	}
	return NULL;
}
EXPORT_SYMBOL(gro_find_complete_by_type);
static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
{
	switch (ret) {
	case GRO_NORMAL:
		if (netif_receive_skb_internal(skb))
			ret = GRO_DROP;
		break;

	case GRO_DROP:
		kfree_skb(skb);
		break;

	case GRO_MERGED_FREE:
		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
			kmem_cache_free(skbuff_head_cache, skb);
		else
			__kfree_skb(skb);
		break;

	case GRO_HELD:
	case GRO_MERGED:
		break;
	}

	return ret;
}
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	trace_napi_gro_receive_entry(skb);

	skb_gro_reset_offset(skb);

	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
}
EXPORT_SYMBOL(napi_gro_receive);
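
/* Example (illustrative sketch): the canonical NAPI receive loop. A driver
 * schedules NAPI from its interrupt handler, and its poll callback feeds
 * packets through GRO, completing when it runs under budget. All foo_*
 * names are hypothetical.
 *
 *	static irqreturn_t foo_isr(int irq, void *data)
 *	{
 *		struct foo_priv *priv = data;
 *
 *		foo_disable_rx_irq(priv);		// hypothetical hw op
 *		napi_schedule(&priv->napi);
 *		return IRQ_HANDLED;
 *	}
 *
 *	static int foo_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct foo_priv *priv = container_of(napi, struct foo_priv, napi);
 *		struct sk_buff *skb;
 *		int done = 0;
 *
 *		while (done < budget && (skb = foo_next_rx_skb(priv))) {
 *			skb->protocol = eth_type_trans(skb, priv->netdev);
 *			napi_gro_receive(napi, skb);
 *			done++;
 *		}
 *		if (done < budget) {
 *			napi_complete(napi);
 *			foo_enable_rx_irq(priv);	// hypothetical hw op
 *		}
 *		return done;
 *	}
 *
 *	// at probe time:
 *	netif_napi_add(netdev, &priv->napi, foo_poll, NAPI_POLL_WEIGHT);
 */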
static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
{
	if (unlikely(skb->pfmemalloc)) {
		consume_skb(skb);
		return;
	}
	__skb_pull(skb, skb_headlen(skb));
	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
	skb->vlan_tci = 0;
	skb->dev = napi->dev;
	skb->skb_iif = 0;
	skb->encapsulation = 0;
	skb_shinfo(skb)->gso_type = 0;
	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));

	napi->skb = skb;
}
struct sk_buff *napi_get_frags(struct napi_struct *napi)
{
	struct sk_buff *skb = napi->skb;

	if (!skb) {
		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
		napi->skb = skb;
	}
	return skb;
}
EXPORT_SYMBOL(napi_get_frags);
static gro_result_t napi_frags_finish(struct napi_struct *napi,
				      struct sk_buff *skb,
				      gro_result_t ret)
{
	switch (ret) {
	case GRO_NORMAL:
	case GRO_HELD:
		__skb_push(skb, ETH_HLEN);
		skb->protocol = eth_type_trans(skb, skb->dev);
		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
			ret = GRO_DROP;
		break;

	case GRO_DROP:
	case GRO_MERGED_FREE:
		napi_reuse_skb(napi, skb);
		break;

	case GRO_MERGED:
		break;
	}

	return ret;
}
/* Upper GRO stack assumes network header starts at gro_offset=0
 * Drivers could call both napi_gro_frags() and napi_gro_receive()
 * We copy ethernet header into skb->data to have a common layout.
 */
static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
{
	struct sk_buff *skb = napi->skb;
	const struct ethhdr *eth;
	unsigned int hlen = sizeof(*eth);

	napi->skb = NULL;

	skb_reset_mac_header(skb);
	skb_gro_reset_offset(skb);

	eth = skb_gro_header_fast(skb, 0);
	if (unlikely(skb_gro_header_hard(skb, hlen))) {
		eth = skb_gro_header_slow(skb, hlen, 0);
		if (unlikely(!eth)) {
			napi_reuse_skb(napi, skb);
			return NULL;
		}
	} else {
		gro_pull_from_frag0(skb, hlen);
		NAPI_GRO_CB(skb)->frag0 += hlen;
		NAPI_GRO_CB(skb)->frag0_len -= hlen;
	}
	__skb_pull(skb, hlen);

	/*
	 * This works because the only protocols we care about don't require
	 * special handling.
	 * We'll fix it up properly in napi_frags_finish()
	 */
	skb->protocol = eth->h_proto;

	return skb;
}
gro_result_t napi_gro_frags(struct napi_struct *napi)
{
	struct sk_buff *skb = napi_frags_skb(napi);

	if (!skb)
		return GRO_DROP;

	trace_napi_gro_frags_entry(skb);

	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
}
EXPORT_SYMBOL(napi_gro_frags);
/* Compute the checksum from gro_offset and return the folded value
 * after adding in any pseudo checksum.
 */
__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
{
	__wsum wsum;
	__sum16 sum;

	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);

	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
	if (likely(!sum)) {
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
		    !skb->csum_complete_sw)
			netdev_rx_csum_fault(skb->dev);
	}

	NAPI_GRO_CB(skb)->csum = wsum;
	NAPI_GRO_CB(skb)->csum_valid = 1;

	return sum;
}
EXPORT_SYMBOL(__skb_gro_checksum_complete);
/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (remsd) {
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		while (remsd) {
			struct softnet_data *next = remsd->rps_ipi_next;

			if (cpu_online(remsd->cpu))
				smp_call_function_single_async(remsd->cpu,
							       &remsd->csd);
			remsd = next;
		}
	} else
#endif
		local_irq_enable();
}

static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	return sd->rps_ipi_list != NULL;
#else
	return false;
#endif
}
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

	/* Check if we have pending ipi, its better to send them now,
	 * not waiting net_rx_action() end.
	 */
	if (sd_has_rps_ipi_waiting(sd)) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}

	napi->weight = weight_p;
	local_irq_disable();
	while (1) {
		struct sk_buff *skb;

		while ((skb = __skb_dequeue(&sd->process_queue))) {
			local_irq_enable();
			__netif_receive_skb(skb);
			local_irq_disable();
			input_queue_head_incr(sd);
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}

		rps_lock(sd);
		if (skb_queue_empty(&sd->input_pkt_queue)) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set
			 * on backlog.
			 * We can use a plain write instead of clear_bit(),
			 * and we dont need an smp_mb() memory barrier.
			 */
			napi->state = 0;
			rps_unlock(sd);

			break;
		}

		skb_queue_splice_tail_init(&sd->input_pkt_queue,
					   &sd->process_queue);
		rps_unlock(sd);
	}
	local_irq_enable();

	return work;
}
/**
 * __napi_schedule - schedule for receive
 * @n: entry to schedule
 *
 * The entry's receive function will be scheduled to run.
 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 */
void __napi_schedule(struct napi_struct *n)
{
	unsigned long flags;

	local_irq_save(flags);
	____napi_schedule(this_cpu_ptr(&softnet_data), n);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__napi_schedule);
/**
 * __napi_schedule_irqoff - schedule for receive
 * @n: entry to schedule
 *
 * Variant of __napi_schedule() assuming hard irqs are masked
 */
void __napi_schedule_irqoff(struct napi_struct *n)
{
	____napi_schedule(this_cpu_ptr(&softnet_data), n);
}
EXPORT_SYMBOL(__napi_schedule_irqoff);
void __napi_complete(struct napi_struct *n)
{
	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
	BUG_ON(n->gro_list);

	list_del_init(&n->poll_list);
	smp_mb__before_atomic();
	clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
void napi_complete(struct napi_struct *n)
{
	unsigned long flags;

	/*
	 * don't let napi dequeue from the cpu poll list
	 * just in case its running on a different cpu
	 */
	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
		return;

	napi_gro_flush(n, false);

	if (likely(list_empty(&n->poll_list))) {
		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
	} else {
		/* If n->poll_list is not empty, we need to mask irqs */
		local_irq_save(flags);
		__napi_complete(n);
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL(napi_complete);
/* must be called under rcu_read_lock(), as we dont take a reference */
struct napi_struct *napi_by_id(unsigned int napi_id)
{
	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
	struct napi_struct *napi;

	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
		if (napi->napi_id == napi_id)
			return napi;

	return NULL;
}
EXPORT_SYMBOL_GPL(napi_by_id);
void napi_hash_add(struct napi_struct *napi)
{
	if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {

		spin_lock(&napi_hash_lock);

		/* 0 is not a valid id, we also skip an id that is taken
		 * we expect both events to be extremely rare
		 */
		napi->napi_id = 0;
		while (!napi->napi_id) {
			napi->napi_id = ++napi_gen_id;
			if (napi_by_id(napi->napi_id))
				napi->napi_id = 0;
		}

		hlist_add_head_rcu(&napi->napi_hash_node,
			&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);

		spin_unlock(&napi_hash_lock);
	}
}
EXPORT_SYMBOL_GPL(napi_hash_add);
/* Warning : caller is responsible to make sure rcu grace period
 * is respected before freeing memory containing @napi
 */
void napi_hash_del(struct napi_struct *napi)
{
	spin_lock(&napi_hash_lock);

	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
		hlist_del_rcu(&napi->napi_hash_node);

	spin_unlock(&napi_hash_lock);
}
EXPORT_SYMBOL_GPL(napi_hash_del);
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
		    int (*poll)(struct napi_struct *, int), int weight)
{
	INIT_LIST_HEAD(&napi->poll_list);
	napi->gro_count = 0;
	napi->gro_list = NULL;
	napi->skb = NULL;
	napi->poll = poll;
	if (weight > NAPI_POLL_WEIGHT)
		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
			    weight, dev->name);
	napi->weight = weight;
	list_add(&napi->dev_list, &dev->napi_list);
	napi->dev = dev;
#ifdef CONFIG_NETPOLL
	spin_lock_init(&napi->poll_lock);
	napi->poll_owner = -1;
#endif
	set_bit(NAPI_STATE_SCHED, &napi->state);
}
EXPORT_SYMBOL(netif_napi_add);
void netif_napi_del(struct napi_struct *napi)
{
	list_del_init(&napi->dev_list);
	napi_free_frags(napi);

	kfree_skb_list(napi->gro_list);
	napi->gro_list = NULL;
	napi->gro_count = 0;
}
EXPORT_SYMBOL(netif_napi_del);
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	LIST_HEAD(list);
	LIST_HEAD(repoll);
	void *have;

	local_irq_disable();
	list_splice_init(&sd->poll_list, &list);
	local_irq_enable();

	while (!list_empty(&list)) {
		struct napi_struct *n;
		int work, weight;

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
			break;

		n = list_first_entry(&list, struct napi_struct, poll_list);
		list_del_init(&n->poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi().  Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
			work = n->poll(n, weight);
			trace_napi_poll(n);
		}

		WARN_ON_ONCE(work > weight);

		budget -= work;

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n))) {
				napi_complete(n);
			} else {
				if (n->gro_list) {
					/* flush too old packets
					 * If HZ < 1000, flush all packets.
					 */
					napi_gro_flush(n, HZ >= 1000);
				}
				list_add_tail(&n->poll_list, &repoll);
			}
		}

		netpoll_poll_unlock(have);
	}

	if (!sd_has_rps_ipi_waiting(sd) &&
	    list_empty(&list) &&
	    list_empty(&repoll))
		return;

	local_irq_disable();

	list_splice_tail_init(&sd->poll_list, &list);
	list_splice_tail(&repoll, &list);
	list_splice(&list, &sd->poll_list);
	if (!list_empty(&sd->poll_list))
		__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	net_rps_action_and_irq_enable(sd);
}
struct netdev_adjacent {
	struct net_device *dev;

	/* upper master flag, there can only be one master device per list */
	bool master;

	/* counter for the number of times this device was added to us */
	u16 ref_nr;

	/* private field for the users */
	void *private;

	struct list_head list;
	struct rcu_head rcu;
};
static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
						 struct net_device *adj_dev,
						 struct list_head *adj_list)
{
	struct netdev_adjacent *adj;

	list_for_each_entry(adj, adj_list, list) {
		if (adj->dev == adj_dev)
			return adj;
	}
	return NULL;
}
/**
 * netdev_has_upper_dev - Check if device is linked to an upper device
 * @dev: device
 * @upper_dev: upper device to check
 *
 * Find out if a device is linked to specified upper device and return true
 * in case it is. Note that this checks only immediate upper device,
 * not through a complete stack of devices. The caller must hold the RTNL lock.
 */
bool netdev_has_upper_dev(struct net_device *dev,
			  struct net_device *upper_dev)
{
	ASSERT_RTNL();

	return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
}
EXPORT_SYMBOL(netdev_has_upper_dev);
/**
 * netdev_has_any_upper_dev - Check if device is linked to some device
 * @dev: device
 *
 * Find out if a device is linked to an upper device and return true in case
 * it is. The caller must hold the RTNL lock.
 */
static bool netdev_has_any_upper_dev(struct net_device *dev)
{
	ASSERT_RTNL();

	return !list_empty(&dev->all_adj_list.upper);
}
/**
 * netdev_master_upper_dev_get - Get master upper device
 * @dev: device
 *
 * Find a master upper device and return pointer to it or NULL in case
 * it's not there. The caller must hold the RTNL lock.
 */
struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
{
	struct netdev_adjacent *upper;

	ASSERT_RTNL();

	if (list_empty(&dev->adj_list.upper))
		return NULL;

	upper = list_first_entry(&dev->adj_list.upper,
				 struct netdev_adjacent, list);
	if (likely(upper->master))
		return upper->dev;
	return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get);
void *netdev_adjacent_get_private(struct list_head *adj_list)
{
	struct netdev_adjacent *adj;

	adj = list_entry(adj_list, struct netdev_adjacent, list);

	return adj->private;
}
EXPORT_SYMBOL(netdev_adjacent_get_private);
/**
 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next device from the dev's upper list, starting from iter
 * position. The caller must hold RCU read lock.
 */
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
						 struct list_head **iter)
{
	struct netdev_adjacent *upper;

	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());

	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);

	if (&upper->list == &dev->adj_list.upper)
		return NULL;

	*iter = &upper->list;

	return upper->dev;
}
EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
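
/* Example (illustrative sketch): walking the immediate upper devices with
 * the accessor above, which is how stacked setups (bond on VLAN on
 * Ethernet, etc.) are usually traversed:
 *
 *	struct list_head *iter = &dev->adj_list.upper;
 *	struct net_device *upper;
 *
 *	rcu_read_lock();
 *	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)) != NULL)
 *		pr_info("%s is stacked under %s\n", dev->name, upper->name);
 *	rcu_read_unlock();
 */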
/**
 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next device from the dev's upper list, starting from iter
 * position. The caller must hold RCU read lock.
 */
struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
						     struct list_head **iter)
{
	struct netdev_adjacent *upper;

	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());

	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);

	if (&upper->list == &dev->all_adj_list.upper)
		return NULL;

	*iter = &upper->list;

	return upper->dev;
}
EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
/**
 * netdev_lower_get_next_private - Get the next ->private from the
 *				   lower neighbour list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 * list, starting from iter position. The caller must either hold the
 * RTNL lock or its own locking that guarantees that the neighbour lower
 * list will remain unchanged.
 */
void *netdev_lower_get_next_private(struct net_device *dev,
				    struct list_head **iter)
{
	struct netdev_adjacent *lower;

	lower = list_entry(*iter, struct netdev_adjacent, list);

	if (&lower->list == &dev->adj_list.lower)
		return NULL;

	*iter = lower->list.next;

	return lower->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private);

/**
 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 *				       lower neighbour list, RCU
 *				       variant
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 * list, starting from iter position. The caller must hold RCU read lock.
 */
void *netdev_lower_get_next_private_rcu(struct net_device *dev,
					struct list_head **iter)
{
	struct netdev_adjacent *lower;

	WARN_ON_ONCE(!rcu_read_lock_held());

	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);

	if (&lower->list == &dev->adj_list.lower)
		return NULL;

	*iter = &lower->list;

	return lower->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);

/**
 * netdev_lower_get_next - Get the next device from the lower neighbour
 *			   list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent from the dev's lower neighbour
 * list, starting from iter position. The caller must hold RTNL lock or
 * its own locking that guarantees that the neighbour lower
 * list will remain unchanged.
 */
void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
{
	struct netdev_adjacent *lower;

	lower = list_entry((*iter)->next, struct netdev_adjacent, list);

	if (&lower->list == &dev->adj_list.lower)
		return NULL;

	*iter = &lower->list;

	return lower->dev;
}
EXPORT_SYMBOL(netdev_lower_get_next);

/**
 * netdev_lower_get_first_private_rcu - Get the first ->private from the
 *					lower neighbour list, RCU variant
 * @dev: device
 *
 * Gets the first netdev_adjacent->private from the dev's lower neighbour
 * list. The caller must hold RCU read lock.
 */
void *netdev_lower_get_first_private_rcu(struct net_device *dev)
{
	struct netdev_adjacent *lower;

	lower = list_first_or_null_rcu(&dev->adj_list.lower,
				       struct netdev_adjacent, list);
	if (lower)
		return lower->private;
	return NULL;
}
EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);

/**
 * netdev_master_upper_dev_get_rcu - Get master upper device
 * @dev: device
 *
 * Find a master upper device and return pointer to it or NULL in case
 * it's not there. The caller must hold the RCU read lock.
 */
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
{
	struct netdev_adjacent *upper;

	upper = list_first_or_null_rcu(&dev->adj_list.upper,
				       struct netdev_adjacent, list);
	if (upper && likely(upper->master))
		return upper->dev;
	return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);

static int netdev_adjacent_sysfs_add(struct net_device *dev,
				     struct net_device *adj_dev,
				     struct list_head *dev_list)
{
	char linkname[IFNAMSIZ+7];

	sprintf(linkname, dev_list == &dev->adj_list.upper ?
		"upper_%s" : "lower_%s", adj_dev->name);
	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
				 linkname);
}

static void netdev_adjacent_sysfs_del(struct net_device *dev,
				      char *name,
				      struct list_head *dev_list)
{
	char linkname[IFNAMSIZ+7];

	sprintf(linkname, dev_list == &dev->adj_list.upper ?
		"upper_%s" : "lower_%s", name);
	sysfs_remove_link(&(dev->dev.kobj), linkname);
}

static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
						 struct net_device *adj_dev,
						 struct list_head *dev_list)
{
	return (dev_list == &dev->adj_list.upper ||
		dev_list == &dev->adj_list.lower) &&
	       net_eq(dev_net(dev), dev_net(adj_dev));
}

static int __netdev_adjacent_dev_insert(struct net_device *dev,
					struct net_device *adj_dev,
					struct list_head *dev_list,
					void *private, bool master)
{
	struct netdev_adjacent *adj;
	int ret;

	adj = __netdev_find_adj(dev, adj_dev, dev_list);

	if (adj) {
		adj->ref_nr++;
		return 0;
	}

	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
	if (!adj)
		return -ENOMEM;

	adj->dev = adj_dev;
	adj->master = master;
	adj->ref_nr = 1;
	adj->private = private;
	dev_hold(adj_dev);

	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
		 adj_dev->name, dev->name, adj_dev->name);

	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
		if (ret)
			goto free_adj;
	}

	/* Ensure that master link is always the first item in list. */
	if (master) {
		ret = sysfs_create_link(&(dev->dev.kobj),
					&(adj_dev->dev.kobj), "master");
		if (ret)
			goto remove_symlinks;

		list_add_rcu(&adj->list, dev_list);
	} else {
		list_add_tail_rcu(&adj->list, dev_list);
	}

	return 0;

remove_symlinks:
	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
free_adj:
	kfree(adj);
	dev_put(adj_dev);

	return ret;
}

static void __netdev_adjacent_dev_remove(struct net_device *dev,
					 struct net_device *adj_dev,
					 struct list_head *dev_list)
{
	struct netdev_adjacent *adj;

	adj = __netdev_find_adj(dev, adj_dev, dev_list);

	if (!adj) {
		pr_err("tried to remove device %s from %s\n",
		       dev->name, adj_dev->name);
		BUG();
	}

	if (adj->ref_nr > 1) {
		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
			 adj->ref_nr-1);
		adj->ref_nr--;
		return;
	}

	if (adj->master)
		sysfs_remove_link(&(dev->dev.kobj), "master");

	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);

	list_del_rcu(&adj->list);
	pr_debug("dev_put for %s, because link removed from %s to %s\n",
		 adj_dev->name, dev->name, adj_dev->name);
	dev_put(adj_dev);
	kfree_rcu(adj, rcu);
}

static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
					    struct net_device *upper_dev,
					    struct list_head *up_list,
					    struct list_head *down_list,
					    void *private, bool master)
{
	int ret;

	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
					   master);
	if (ret)
		return ret;

	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
					   false);
	if (ret) {
		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
		return ret;
	}

	return 0;
}

static int __netdev_adjacent_dev_link(struct net_device *dev,
				      struct net_device *upper_dev)
{
	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
						&dev->all_adj_list.upper,
						&upper_dev->all_adj_list.lower,
						NULL, false);
}

static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}

static void __netdev_adjacent_dev_unlink(struct net_device *dev,
					 struct net_device *upper_dev)
{
	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
					   &dev->all_adj_list.upper,
					   &upper_dev->all_adj_list.lower);
}

static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
						struct net_device *upper_dev,
						void *private, bool master)
{
	int ret = __netdev_adjacent_dev_link(dev, upper_dev);

	if (ret)
		return ret;

	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
					       &dev->adj_list.upper,
					       &upper_dev->adj_list.lower,
					       private, master);
	if (ret) {
		__netdev_adjacent_dev_unlink(dev, upper_dev);
		return ret;
	}

	return 0;
}

static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
						   struct net_device *upper_dev)
{
	__netdev_adjacent_dev_unlink(dev, upper_dev);
	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
					   &dev->adj_list.upper,
					   &upper_dev->adj_list.lower);
}

static int __netdev_upper_dev_link(struct net_device *dev,
				   struct net_device *upper_dev, bool master,
				   void *private)
{
	struct netdev_adjacent *i, *j, *to_i, *to_j;
	int ret = 0;

	ASSERT_RTNL();

	if (dev == upper_dev)
		return -EBUSY;

	/* To prevent loops, check if dev is not upper device to upper_dev. */
	if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
		return -EBUSY;

	if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
		return -EEXIST;

	if (master && netdev_master_upper_dev_get(dev))
		return -EBUSY;

	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
						   master);
	if (ret)
		return ret;

	/* Now that we linked these devs, make all the upper_dev's
	 * all_adj_list.upper visible to every dev's all_adj_list.lower and
	 * vice versa, and don't forget the devices themselves. All of these
	 * links are non-neighbours.
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			pr_debug("Interlinking %s with %s, non-neighbour\n",
				 i->dev->name, j->dev->name);
			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
			if (ret)
				goto rollback_mesh;
		}
	}

	/* add dev to every upper_dev's upper device */
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		pr_debug("linking %s's upper device %s with %s\n",
			 upper_dev->name, i->dev->name, dev->name);
		ret = __netdev_adjacent_dev_link(dev, i->dev);
		if (ret)
			goto rollback_upper_mesh;
	}

	/* add upper_dev to every dev's lower device */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		pr_debug("linking %s's lower device %s with %s\n", dev->name,
			 i->dev->name, upper_dev->name);
		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
		if (ret)
			goto rollback_lower_mesh;
	}

	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
	return 0;

rollback_lower_mesh:
	to_i = i;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
	}

	i = NULL;

rollback_upper_mesh:
	to_i = i;
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(dev, i->dev);
	}

	i = j = NULL;

rollback_mesh:
	to_i = i;
	to_j = j;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			if (i == to_i && j == to_j)
				break;
			__netdev_adjacent_dev_unlink(i->dev, j->dev);
		}
		if (i == to_i)
			break;
	}

	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

	return ret;
}

/**
 * netdev_upper_dev_link - Add a link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Adds a link to device which is upper to this one. The caller must hold
 * the RTNL lock. On a failure a negative errno code is returned.
 * On success the reference counts are adjusted and the function
 * returns zero.
 */
int netdev_upper_dev_link(struct net_device *dev,
			  struct net_device *upper_dev)
{
	return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
}
EXPORT_SYMBOL(netdev_upper_dev_link);

/**
 * netdev_master_upper_dev_link - Add a master link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Adds a link to device which is upper to this one. In this case, only
 * one master upper device can be linked, although other non-master devices
 * might be linked as well. The caller must hold the RTNL lock.
 * On a failure a negative errno code is returned. On success the reference
 * counts are adjusted and the function returns zero.
 */
int netdev_master_upper_dev_link(struct net_device *dev,
				 struct net_device *upper_dev)
{
	return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);

int netdev_master_upper_dev_link_private(struct net_device *dev,
					 struct net_device *upper_dev,
					 void *private)
{
	return __netdev_upper_dev_link(dev, upper_dev, true, private);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
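
/* Illustrative sketch, not part of the original file: how a bonding-like
 * driver might attach a slave through the linking helpers above. The
 * function name and the placement of driver-private setup are assumptions
 * made for this example.
 */
static int __maybe_unused example_enslave(struct net_device *master,
					  struct net_device *slave)
{
	int err;

	ASSERT_RTNL();
	err = netdev_master_upper_dev_link(slave, master);
	if (err)
		return err;	/* loop (-EBUSY), duplicate (-EEXIST), ... */
	/* ... driver-private slave setup would go here ... */
	return 0;
}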

/**
 * netdev_upper_dev_unlink - Removes a link to upper device
 * @dev: device
 * @upper_dev: upper device to unlink
 *
 * Removes a link to device which is upper to this one. The caller must hold
 * the RTNL lock.
 */
void netdev_upper_dev_unlink(struct net_device *dev,
			     struct net_device *upper_dev)
{
	struct netdev_adjacent *i, *j;
	ASSERT_RTNL();

	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

	/* Here is the tricky part. We must remove all dev's lower
	 * devices from all upper_dev's upper devices and vice
	 * versa, to maintain the graph relationship.
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list)
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
			__netdev_adjacent_dev_unlink(i->dev, j->dev);

	/* remove also the devices themselves from the lower/upper device
	 * lists
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list)
		__netdev_adjacent_dev_unlink(i->dev, upper_dev);

	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
		__netdev_adjacent_dev_unlink(dev, i->dev);

	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);

void netdev_adjacent_add_links(struct net_device *dev)
{
	struct netdev_adjacent *iter;

	struct net *net = dev_net(dev);

	list_for_each_entry(iter, &dev->adj_list.upper, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_add(dev, iter->dev,
					  &dev->adj_list.upper);
	}

	list_for_each_entry(iter, &dev->adj_list.lower, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.upper);
		netdev_adjacent_sysfs_add(dev, iter->dev,
					  &dev->adj_list.lower);
	}
}

void netdev_adjacent_del_links(struct net_device *dev)
{
	struct netdev_adjacent *iter;

	struct net *net = dev_net(dev);

	list_for_each_entry(iter, &dev->adj_list.upper, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, dev->name,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_del(dev, iter->dev->name,
					  &dev->adj_list.upper);
	}

	list_for_each_entry(iter, &dev->adj_list.lower, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, dev->name,
					  &iter->dev->adj_list.upper);
		netdev_adjacent_sysfs_del(dev, iter->dev->name,
					  &dev->adj_list.lower);
	}
}

void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
{
	struct netdev_adjacent *iter;

	struct net *net = dev_net(dev);

	list_for_each_entry(iter, &dev->adj_list.upper, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, oldname,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.lower);
	}

	list_for_each_entry(iter, &dev->adj_list.lower, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, oldname,
					  &iter->dev->adj_list.upper);
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.upper);
	}
}

void *netdev_lower_dev_get_private(struct net_device *dev,
				   struct net_device *lower_dev)
{
	struct netdev_adjacent *lower;

	if (!lower_dev)
		return NULL;
	lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
	if (!lower)
		return NULL;

	return lower->private;
}
EXPORT_SYMBOL(netdev_lower_dev_get_private);

int dev_get_nest_level(struct net_device *dev,
		       bool (*type_check)(struct net_device *dev))
{
	struct net_device *lower = NULL;
	struct list_head *iter;
	int max_nest = -1;
	int nest;

	ASSERT_RTNL();

	netdev_for_each_lower_dev(dev, lower, iter) {
		nest = dev_get_nest_level(lower, type_check);
		if (max_nest < nest)
			max_nest = nest;
	}

	if (type_check(dev))
		max_nest++;

	return max_nest;
}
EXPORT_SYMBOL(dev_get_nest_level);
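
/* Illustrative sketch, not part of the original file: computing the VLAN
 * nesting depth of a stacked device with the recursion above. The
 * predicate and wrapper names are hypothetical; IFF_802_1Q_VLAN is the
 * priv_flag the 8021q driver sets on its devices.
 */
static bool example_is_vlan(struct net_device *dev)
{
	return !!(dev->priv_flags & IFF_802_1Q_VLAN);
}

static int __maybe_unused example_vlan_nest_level(struct net_device *dev)
{
	ASSERT_RTNL();	/* dev_get_nest_level() walks lower devices */
	return dev_get_nest_level(dev, example_is_vlan);
}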

static void dev_change_rx_flags(struct net_device *dev, int flags)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_change_rx_flags)
		ops->ndo_change_rx_flags(dev, flags);
}

static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
	unsigned int old_flags = dev->flags;
	kuid_t uid;
	kgid_t gid;

	ASSERT_RTNL();

	dev->flags |= IFF_PROMISC;
	dev->promiscuity += inc;
	if (dev->promiscuity == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch promisc and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_PROMISC;
		else {
			dev->promiscuity -= inc;
			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
				dev->name);
			return -EOVERFLOW;
		}
	}
	if (dev->flags != old_flags) {
		pr_info("device %s %s promiscuous mode\n",
			dev->name,
			dev->flags & IFF_PROMISC ? "entered" : "left");
		if (audit_enabled) {
			current_uid_gid(&uid, &gid);
			audit_log(current->audit_context, GFP_ATOMIC,
				AUDIT_ANOM_PROMISCUOUS,
				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
				dev->name, (dev->flags & IFF_PROMISC),
				(old_flags & IFF_PROMISC),
				from_kuid(&init_user_ns, audit_get_loginuid(current)),
				from_kuid(&init_user_ns, uid),
				from_kgid(&init_user_ns, gid),
				audit_get_sessionid(current));
		}

		dev_change_rx_flags(dev, IFF_PROMISC);
	}
	if (notify)
		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
	return 0;
}

/**
 *	dev_set_promiscuity	- update promiscuity count on a device
 *	@dev: device
 *	@inc: modifier
 *
 *	Add or remove promiscuity from a device. While the count in the device
 *	remains above zero the interface remains promiscuous. Once it hits zero
 *	the device reverts back to normal filtering operation. A negative inc
 *	value is used to drop promiscuity on the device.
 *	Return 0 if successful or a negative errno code on error.
 */
int dev_set_promiscuity(struct net_device *dev, int inc)
{
	unsigned int old_flags = dev->flags;
	int err;

	err = __dev_set_promiscuity(dev, inc, true);
	if (err < 0)
		return err;
	if (dev->flags != old_flags)
		dev_set_rx_mode(dev);
	return err;
}
EXPORT_SYMBOL(dev_set_promiscuity);
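
/* Illustrative sketch, not part of the original file: a packet-capture
 * style user takes one promiscuity reference on attach and drops it on
 * detach; the counter keeps the device promiscuous while any user holds
 * a reference. The function names are hypothetical.
 */
static int __maybe_unused example_capture_attach(struct net_device *dev)
{
	return dev_set_promiscuity(dev, 1);	/* may fail with -EOVERFLOW */
}

static void __maybe_unused example_capture_detach(struct net_device *dev)
{
	dev_set_promiscuity(dev, -1);
}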

static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
{
	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;

	ASSERT_RTNL();

	dev->flags |= IFF_ALLMULTI;
	dev->allmulti += inc;
	if (dev->allmulti == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch allmulti and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_ALLMULTI;
		else {
			dev->allmulti -= inc;
			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
				dev->name);
			return -EOVERFLOW;
		}
	}
	if (dev->flags ^ old_flags) {
		dev_change_rx_flags(dev, IFF_ALLMULTI);
		dev_set_rx_mode(dev);
		if (notify)
			__dev_notify_flags(dev, old_flags,
					   dev->gflags ^ old_gflags);
	}
	return 0;
}

/**
 *	dev_set_allmulti	- update allmulti count on a device
 *	@dev: device
 *	@inc: modifier
 *
 *	Add or remove reception of all multicast frames to a device. While the
 *	count in the device remains above zero the interface remains listening
 *	to all multicast frames. Once it hits zero the device reverts back to
 *	normal filtering operation. A negative @inc value is used to drop the
 *	counter when releasing a resource needing all multicasts.
 *	Return 0 if successful or a negative errno code on error.
 */

int dev_set_allmulti(struct net_device *dev, int inc)
{
	return __dev_set_allmulti(dev, inc, true);
}
EXPORT_SYMBOL(dev_set_allmulti);

/*
 *	Upload unicast and multicast address lists to device and
 *	configure RX filtering. When the device doesn't support unicast
 *	filtering it is put in promiscuous mode while unicast addresses
 *	are present.
 */
void __dev_set_rx_mode(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	/* dev_open will call this function so the list will stay sane. */
	if (!(dev->flags&IFF_UP))
		return;

	if (!netif_device_present(dev))
		return;

	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
		/* Unicast addresses changes may only happen under the rtnl,
		 * therefore calling __dev_set_promiscuity here is safe.
		 */
		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
			__dev_set_promiscuity(dev, 1, false);
			dev->uc_promisc = true;
		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
			__dev_set_promiscuity(dev, -1, false);
			dev->uc_promisc = false;
		}
	}

	if (ops->ndo_set_rx_mode)
		ops->ndo_set_rx_mode(dev);
}

void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}

/**
 *	dev_get_flags - get flags reported to userspace
 *	@dev: device
 *
 *	Get the combination of flag bits exported through APIs to userspace.
 */
unsigned int dev_get_flags(const struct net_device *dev)
{
	unsigned int flags;

	flags = (dev->flags & ~(IFF_PROMISC |
				IFF_ALLMULTI |
				IFF_RUNNING |
				IFF_LOWER_UP |
				IFF_DORMANT)) |
		(dev->gflags & (IFF_PROMISC |
				IFF_ALLMULTI));

	if (netif_running(dev)) {
		if (netif_oper_up(dev))
			flags |= IFF_RUNNING;
		if (netif_carrier_ok(dev))
			flags |= IFF_LOWER_UP;
		if (netif_dormant(dev))
			flags |= IFF_DORMANT;
	}

	return flags;
}
EXPORT_SYMBOL(dev_get_flags);

int __dev_change_flags(struct net_device *dev, unsigned int flags)
{
	unsigned int old_flags = dev->flags;
	int ret;

	ASSERT_RTNL();

	/*
	 *	Set the flags on our device.
	 */

	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
			       IFF_AUTOMEDIA)) |
		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
				    IFF_ALLMULTI));

	/*
	 *	Load in the correct multicast list now the flags have changed.
	 */

	if ((old_flags ^ flags) & IFF_MULTICAST)
		dev_change_rx_flags(dev, IFF_MULTICAST);

	dev_set_rx_mode(dev);

	/*
	 *	Have we downed the interface. We handle IFF_UP ourselves
	 *	according to user attempts to set it, rather than blindly
	 *	setting it.
	 */

	ret = 0;
	if ((old_flags ^ flags) & IFF_UP)
		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);

	if ((flags ^ dev->gflags) & IFF_PROMISC) {
		int inc = (flags & IFF_PROMISC) ? 1 : -1;
		unsigned int old_flags = dev->flags;

		dev->gflags ^= IFF_PROMISC;

		if (__dev_set_promiscuity(dev, inc, false) >= 0)
			if (dev->flags != old_flags)
				dev_set_rx_mode(dev);
	}

	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
	   is important. Some (broken) drivers set IFF_PROMISC, when
	   IFF_ALLMULTI is requested not asking us and not reporting.
	 */
	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;

		dev->gflags ^= IFF_ALLMULTI;
		__dev_set_allmulti(dev, inc, false);
	}

	return ret;
}

void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
			unsigned int gchanges)
{
	unsigned int changes = dev->flags ^ old_flags;

	if (gchanges)
		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);

	if (changes & IFF_UP) {
		if (dev->flags & IFF_UP)
			call_netdevice_notifiers(NETDEV_UP, dev);
		else
			call_netdevice_notifiers(NETDEV_DOWN, dev);
	}

	if (dev->flags & IFF_UP &&
	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
		struct netdev_notifier_change_info change_info;

		change_info.flags_changed = changes;
		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
					      &change_info.info);
	}
}

/**
 *	dev_change_flags - change device settings
 *	@dev: device
 *	@flags: device state flags
 *
 *	Change settings on device based state flags. The flags are
 *	in the userspace exported format.
 */
int dev_change_flags(struct net_device *dev, unsigned int flags)
{
	int ret;
	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;

	ret = __dev_change_flags(dev, flags);
	if (ret < 0)
		return ret;

	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
	__dev_notify_flags(dev, old_flags, changes);
	return ret;
}
EXPORT_SYMBOL(dev_change_flags);
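
/* Illustrative sketch, not part of the original file: bringing an
 * interface administratively up the way an SIOCSIFFLAGS handler would,
 * by OR-ing IFF_UP into the userspace-visible flags. The helper name is
 * an assumption made for this example.
 */
static int __maybe_unused example_bring_up(struct net_device *dev)
{
	ASSERT_RTNL();
	return dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
}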

static int __dev_set_mtu(struct net_device *dev, int new_mtu)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_change_mtu)
		return ops->ndo_change_mtu(dev, new_mtu);

	dev->mtu = new_mtu;
	return 0;
}

/**
 *	dev_set_mtu - Change maximum transfer unit
 *	@dev: device
 *	@new_mtu: new transfer unit
 *
 *	Change the maximum transfer size of the network device.
 */
int dev_set_mtu(struct net_device *dev, int new_mtu)
{
	int err, orig_mtu;

	if (new_mtu == dev->mtu)
		return 0;

	/* MTU must be positive. */
	if (new_mtu < 0)
		return -EINVAL;

	if (!netif_device_present(dev))
		return -ENODEV;

	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
	err = notifier_to_errno(err);
	if (err)
		return err;

	orig_mtu = dev->mtu;
	err = __dev_set_mtu(dev, new_mtu);

	if (!err) {
		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
		err = notifier_to_errno(err);
		if (err) {
			/* setting mtu back and notifying everyone again,
			 * so that they have a chance to revert changes.
			 */
			__dev_set_mtu(dev, orig_mtu);
			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
		}
	}
	return err;
}
EXPORT_SYMBOL(dev_set_mtu);
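
/* Illustrative sketch, not part of the original file: a tunnel-like
 * driver shrinking its device MTU to leave room for encapsulation. The
 * 50-byte overhead and the helper name are assumptions made for this
 * example.
 */
static int __maybe_unused example_shrink_mtu(struct net_device *dev)
{
	int overhead = 50;	/* hypothetical encapsulation overhead */

	return dev_set_mtu(dev, dev->mtu - overhead);
}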

/**
 *	dev_set_group - Change group this device belongs to
 *	@dev: device
 *	@new_group: group this device should belong to
 */
void dev_set_group(struct net_device *dev, int new_group)
{
	dev->group = new_group;
}
EXPORT_SYMBOL(dev_set_group);

/**
 *	dev_set_mac_address - Change Media Access Control Address
 *	@dev: device
 *	@sa: new address
 *
 *	Change the hardware (MAC) address of the device
 */
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int err;

	if (!ops->ndo_set_mac_address)
		return -EOPNOTSUPP;
	if (sa->sa_family != dev->type)
		return -EINVAL;
	if (!netif_device_present(dev))
		return -ENODEV;
	err = ops->ndo_set_mac_address(dev, sa);
	if (err)
		return err;
	dev->addr_assign_type = NET_ADDR_SET;
	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
	add_device_randomness(dev->dev_addr, dev->addr_len);
	return 0;
}
EXPORT_SYMBOL(dev_set_mac_address);
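
/* Illustrative sketch, not part of the original file: setting a MAC
 * address from a raw byte array; sa_family must match dev->type or
 * dev_set_mac_address() returns -EINVAL. The helper name is
 * hypothetical, and addr is assumed to hold dev->addr_len bytes.
 */
static int __maybe_unused example_set_mac(struct net_device *dev,
					  const u8 *addr)
{
	struct sockaddr sa;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, addr, dev->addr_len);
	return dev_set_mac_address(dev, &sa);
}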

/**
 *	dev_change_carrier - Change device carrier
 *	@dev: device
 *	@new_carrier: new value
 *
 *	Change device carrier
 */
int dev_change_carrier(struct net_device *dev, bool new_carrier)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_change_carrier)
		return -EOPNOTSUPP;
	if (!netif_device_present(dev))
		return -ENODEV;
	return ops->ndo_change_carrier(dev, new_carrier);
}
EXPORT_SYMBOL(dev_change_carrier);

/**
 *	dev_get_phys_port_id - Get device physical port ID
 *	@dev: device
 *	@ppid: port ID
 *
 *	Get device physical port ID
 */
int dev_get_phys_port_id(struct net_device *dev,
			 struct netdev_phys_port_id *ppid)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_get_phys_port_id)
		return -EOPNOTSUPP;
	return ops->ndo_get_phys_port_id(dev, ppid);
}
EXPORT_SYMBOL(dev_get_phys_port_id);

/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 */
static int dev_new_index(struct net *net)
{
	int ifindex = net->ifindex;
	for (;;) {
		if (++ifindex <= 0)
			ifindex = 1;
		if (!__dev_get_by_index(net, ifindex))
			return net->ifindex = ifindex;
	}
}

/* Delayed registration/unregistration */
static LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);

static void net_set_todo(struct net_device *dev)
{
	list_add_tail(&dev->todo_list, &net_todo_list);
	dev_net(dev)->dev_unreg_count++;
}

static void rollback_registered_many(struct list_head *head)
{
	struct net_device *dev, *tmp;
	LIST_HEAD(close_head);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
		/* Some devices call without registering
		 * for initialization unwind. Remove those
		 * devices and proceed with the remaining.
		 */
		if (dev->reg_state == NETREG_UNINITIALIZED) {
			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
				 dev->name, dev);

			WARN_ON(1);
			list_del(&dev->unreg_list);
			continue;
		}
		dev->dismantle = true;
		BUG_ON(dev->reg_state != NETREG_REGISTERED);
	}

	/* If device is running, close it first. */
	list_for_each_entry(dev, head, unreg_list)
		list_add_tail(&dev->close_list, &close_head);
	dev_close_many(&close_head);

	list_for_each_entry(dev, head, unreg_list) {
		/* And unlink it from device chain. */
		unlist_netdevice(dev);

		dev->reg_state = NETREG_UNREGISTERING;
	}

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list) {
		/* Shutdown queueing discipline. */
		dev_shutdown(dev);


		/* Notify protocols, that we are about to destroy
		   this device. They should clean all the things.
		*/
		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

		/*
		 *	Flush the unicast and multicast chains
		 */
		dev_uc_flush(dev);
		dev_mc_flush(dev);

		if (dev->netdev_ops->ndo_uninit)
			dev->netdev_ops->ndo_uninit(dev);

		if (!dev->rtnl_link_ops ||
		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);

		/* Notifier chain MUST detach us all upper devices. */
		WARN_ON(netdev_has_any_upper_dev(dev));

		/* Remove entries from kobject tree */
		netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
		/* Remove XPS queueing entries */
		netif_reset_xps_queues_gt(dev, 0);
#endif
	}

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list)
		dev_put(dev);
}

static void rollback_registered(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->unreg_list, &single);
	rollback_registered_many(&single);
	list_del(&single);
}

static netdev_features_t netdev_fix_features(struct net_device *dev,
	netdev_features_t features)
{
	/* Fix illegal checksum combinations */
	if ((features & NETIF_F_HW_CSUM) &&
	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
	}

	/* TSO requires that SG is present as well. */
	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
		features &= ~NETIF_F_ALL_TSO;
	}

	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
					!(features & NETIF_F_IP_CSUM)) {
		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO;
		features &= ~NETIF_F_TSO_ECN;
	}

	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
					 !(features & NETIF_F_IPV6_CSUM)) {
		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO6;
	}

	/* TSO ECN requires that TSO is present as well. */
	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
		features &= ~NETIF_F_TSO_ECN;

	/* Software GSO depends on SG. */
	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
		features &= ~NETIF_F_GSO;
	}

	/* UFO needs SG and checksumming */
	if (features & NETIF_F_UFO) {
		/* maybe split UFO into V4 and V6? */
		if (!((features & NETIF_F_GEN_CSUM) ||
		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
			netdev_dbg(dev,
				"Dropping NETIF_F_UFO since no checksum offload features.\n");
			features &= ~NETIF_F_UFO;
		}

		if (!(features & NETIF_F_SG)) {
			netdev_dbg(dev,
				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
			features &= ~NETIF_F_UFO;
		}
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	if (dev->netdev_ops->ndo_busy_poll)
		features |= NETIF_F_BUSY_POLL;
	else
#endif
		features &= ~NETIF_F_BUSY_POLL;

	return features;
}

int __netdev_update_features(struct net_device *dev)
{
	netdev_features_t features;
	int err = 0;

	ASSERT_RTNL();

	features = netdev_get_wanted_features(dev);

	if (dev->netdev_ops->ndo_fix_features)
		features = dev->netdev_ops->ndo_fix_features(dev, features);

	/* driver might be less strict about feature dependencies */
	features = netdev_fix_features(dev, features);

	if (dev->features == features)
		return 0;

	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
		&dev->features, &features);

	if (dev->netdev_ops->ndo_set_features)
		err = dev->netdev_ops->ndo_set_features(dev, features);

	if (unlikely(err < 0)) {
		netdev_err(dev,
			"set_features() failed (%d); wanted %pNF, left %pNF\n",
			err, &features, &dev->features);
		return -1;
	}

	if (!err)
		dev->features = features;

	return 1;
}

/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if it
 *	has changed. Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
	if (__netdev_update_features(dev))
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
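
/* Illustrative sketch, not part of the original file: drivers normally
 * mask unusable bits in their ndo_fix_features() hook and simply call
 * netdev_update_features() when the underlying condition changes; the
 * core then re-runs the fixups above and notifies only on a real change.
 * The private structure, its field, and the function names are
 * assumptions made for this example.
 */
struct example_priv {			/* hypothetical driver private data */
	bool tso_broken;
};

static netdev_features_t __maybe_unused
example_fix_features(struct net_device *dev, netdev_features_t features)
{
	struct example_priv *priv = netdev_priv(dev);

	if (priv->tso_broken)
		features &= ~NETIF_F_ALL_TSO;
	return features;
}

static void __maybe_unused example_tso_state_changed(struct net_device *dev)
{
	ASSERT_RTNL();
	netdev_update_features(dev);	/* re-evaluates the fixup hook */
}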

/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);

/**
 *	netif_stacked_transfer_operstate -	transfer operstate
 *	@rootdev: the root or lower level device to transfer state from
 *	@dev: the device to transfer operstate to
 *
 *	Transfer operational state from root to device. This is normally
 *	called when a stacking relationship exists between the root
 *	device and the device (a leaf device).
 */
void netif_stacked_transfer_operstate(const struct net_device *rootdev,
				      struct net_device *dev)
{
	if (rootdev->operstate == IF_OPER_DORMANT)
		netif_dormant_on(dev);
	else
		netif_dormant_off(dev);

	if (netif_carrier_ok(rootdev)) {
		if (!netif_carrier_ok(dev))
			netif_carrier_on(dev);
	} else {
		if (netif_carrier_ok(dev))
			netif_carrier_off(dev);
	}
}
EXPORT_SYMBOL(netif_stacked_transfer_operstate);

static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int i, count = dev->num_rx_queues;
	struct netdev_rx_queue *rx;

	BUG_ON(count < 1);

	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
	if (!rx)
		return -ENOMEM;

	dev->_rx = rx;

	for (i = 0; i < count; i++)
		rx[i].dev = dev;
	return 0;
}

static void netdev_init_one_queue(struct net_device *dev,
				  struct netdev_queue *queue, void *_unused)
{
	/* Initialize queue lock */
	spin_lock_init(&queue->_xmit_lock);
	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
	queue->xmit_lock_owner = -1;
	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
	queue->dev = dev;
#ifdef CONFIG_BQL
	dql_init(&queue->dql, HZ);
#endif
}

static void netif_free_tx_queues(struct net_device *dev)
{
	kvfree(dev->_tx);
}

static int netif_alloc_netdev_queues(struct net_device *dev)
{
	unsigned int count = dev->num_tx_queues;
	struct netdev_queue *tx;
	size_t sz = count * sizeof(*tx);

	BUG_ON(count < 1 || count > 0xffff);

	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!tx) {
		tx = vzalloc(sz);
		if (!tx)
			return -ENOMEM;
	}
	dev->_tx = tx;

	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
	spin_lock_init(&dev->tx_global_lock);

	return 0;
}

/**
 *	register_netdevice	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	Callers must hold the rtnl semaphore. You may want
 *	register_netdev() instead of this.
 *
 *	BUGS:
 *	The locking appears insufficient to guarantee two parallel registers
 *	will not get the same name.
 */

int register_netdevice(struct net_device *dev)
{
	int ret;
	struct net *net = dev_net(dev);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	might_sleep();

	/* When net_device's are persistent, this will be fatal. */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
	BUG_ON(!net);

	spin_lock_init(&dev->addr_list_lock);
	netdev_set_addr_lockdep_class(dev);

	dev->iflink = -1;

	ret = dev_get_valid_name(net, dev, dev->name);
	if (ret < 0)
		goto out;

	/* Init, if this function is available */
	if (dev->netdev_ops->ndo_init) {
		ret = dev->netdev_ops->ndo_init(dev);
		if (ret) {
			if (ret > 0)
				ret = -EIO;
			goto out;
		}
	}

	if (((dev->hw_features | dev->features) &
	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
		ret = -EINVAL;
		goto err_uninit;
	}

	ret = -EBUSY;
	if (!dev->ifindex)
		dev->ifindex = dev_new_index(net);
	else if (__dev_get_by_index(net, dev->ifindex))
		goto err_uninit;

	if (dev->iflink == -1)
		dev->iflink = dev->ifindex;

	/* Transfer changeable features to wanted_features and enable
	 * software offloads (GSO and GRO).
	 */
	dev->hw_features |= NETIF_F_SOFT_FEATURES;
	dev->features |= NETIF_F_SOFT_FEATURES;
	dev->wanted_features = dev->features & dev->hw_features;

	if (!(dev->flags & IFF_LOOPBACK)) {
		dev->hw_features |= NETIF_F_NOCACHE_COPY;
	}

	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
	 */
	dev->vlan_features |= NETIF_F_HIGHDMA;

	/* Make NETIF_F_SG inheritable to tunnel devices.
	 */
	dev->hw_enc_features |= NETIF_F_SG;

	/* Make NETIF_F_SG inheritable to MPLS.
	 */
	dev->mpls_features |= NETIF_F_SG;

	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		goto err_uninit;

	ret = netdev_register_kobject(dev);
	if (ret)
		goto err_uninit;
	dev->reg_state = NETREG_REGISTERED;

	__netdev_update_features(dev);

	/*
	 *	Default initial state at registry is that the
	 *	device is present.
	 */

	set_bit(__LINK_STATE_PRESENT, &dev->state);

	linkwatch_init_dev(dev);

	dev_init_scheduler(dev);
	dev_hold(dev);
	list_netdevice(dev);
	add_device_randomness(dev->dev_addr, dev->addr_len);

	/* If the device has permanent device address, driver should
	 * set dev_addr and also addr_assign_type should be set to
	 * NET_ADDR_PERM (default value).
	 */
	if (dev->addr_assign_type == NET_ADDR_PERM)
		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);

	/* Notify protocols, that a new device appeared. */
	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
	ret = notifier_to_errno(ret);
	if (ret) {
		rollback_registered(dev);
		dev->reg_state = NETREG_UNREGISTERED;
	}
	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	if (!dev->rtnl_link_ops ||
	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

out:
	return ret;

err_uninit:
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);
	goto out;
}
EXPORT_SYMBOL(register_netdevice);

/**
 *	init_dummy_netdev	- init a dummy network device for NAPI
 *	@dev: device to init
 *
 *	This takes a network device structure and initializes the minimum
 *	amount of fields so it can be used to schedule NAPI polls without
 *	registering a full blown interface. This is to be used by drivers
 *	that need to tie several hardware interfaces to a single NAPI
 *	poll scheduler due to HW limitations.
 */
int init_dummy_netdev(struct net_device *dev)
{
	/* Clear everything. Note we don't initialize spinlocks
	 * as they aren't supposed to be taken by any of the
	 * NAPI code and this dummy netdev is supposed to be
	 * only ever used for NAPI polls
	 */
	memset(dev, 0, sizeof(struct net_device));

	/* make sure we BUG if trying to hit standard
	 * register/unregister code path
	 */
	dev->reg_state = NETREG_DUMMY;

	/* NAPI wants this */
	INIT_LIST_HEAD(&dev->napi_list);

	/* a dummy interface is started by default */
	set_bit(__LINK_STATE_PRESENT, &dev->state);
	set_bit(__LINK_STATE_START, &dev->state);

	/* Note : We dont allocate pcpu_refcnt for dummy devices,
	 * because users of this 'device' dont need to change
	 * its refcount.
	 */

	return 0;
}
EXPORT_SYMBOL_GPL(init_dummy_netdev);

/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
 *	and expands the device name if you passed a format string to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = register_netdevice(dev);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdev);
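
/* Illustrative sketch, not part of the original file: the minimal
 * alloc/register lifecycle of an Ethernet device as a probe function
 * might perform it. example_netdev_ops and the probe name are
 * assumptions made for this example.
 */
static const struct net_device_ops example_netdev_ops;	/* hypothetical */

static int __maybe_unused example_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(0);	/* no private area, single queue */
	if (!dev)
		return -ENOMEM;
	dev->netdev_ops = &example_netdev_ops;

	err = register_netdev(dev);	/* takes rtnl_lock internally */
	if (err)
		free_netdev(dev);	/* safe: reg_state is UNINITIALIZED */
	return err;
}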

int netdev_refcnt_read(const struct net_device *dev)
{
	int i, refcnt = 0;

	for_each_possible_cpu(i)
		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
	return refcnt;
}
EXPORT_SYMBOL(netdev_refcnt_read);

/**
 * netdev_wait_allrefs - wait until all references are gone.
 * @dev: target net_device
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;
	int refcnt;

	linkwatch_forget_dev(dev);

	rebroadcast_time = warning_time = jiffies;
	refcnt = netdev_refcnt_read(dev);

	while (refcnt != 0) {
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

			__rtnl_unlock();
			rcu_barrier();
			rtnl_lock();

			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		msleep(250);

		refcnt = netdev_refcnt_read(dev);

		if (time_after(jiffies, warning_time + 10 * HZ)) {
			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
				 dev->name, refcnt);
			warning_time = jiffies;
		}
	}
}

/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *	...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 */
void netdev_run_todo(void)
{
	struct list_head list;

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	__rtnl_unlock();


	/* Wait for rcu callbacks to finish before next phase */
	if (!list_empty(&list))
		rcu_barrier();

	while (!list_empty(&list)) {
		struct net_device *dev
			= list_first_entry(&list, struct net_device, todo_list);
		list_del(&dev->todo_list);

		rtnl_lock();
		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
		__rtnl_unlock();

		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			pr_err("network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		on_each_cpu(flush_backlog, dev, 1);

		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(netdev_refcnt_read(dev));
		WARN_ON(rcu_access_pointer(dev->ip_ptr));
		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
		WARN_ON(dev->dn_ptr);

		if (dev->destructor)
			dev->destructor(dev);

		/* Report a network device has been unregistered */
		rtnl_lock();
		dev_net(dev)->dev_unreg_count--;
		__rtnl_unlock();
		wake_up(&netdev_unregistering_wq);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
}

/* Convert net_device_stats to rtnl_link_stats64. They have the same
 * fields in the same order, with only the type differing.
 */
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
			     const struct net_device_stats *netdev_stats)
{
#if BITS_PER_LONG == 64
	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
	memcpy(stats64, netdev_stats, sizeof(*stats64));
#else
	size_t i, n = sizeof(*stats64) / sizeof(u64);
	const unsigned long *src = (const unsigned long *)netdev_stats;
	u64 *dst = (u64 *)stats64;

	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
		     sizeof(*stats64) / sizeof(u64));
	for (i = 0; i < n; i++)
		dst[i] = src[i];
#endif
}
EXPORT_SYMBOL(netdev_stats_to_stats64);

/**
 *	dev_get_stats	- get network device statistics
 *	@dev: device to get statistics from
 *	@storage: place to store stats
 *
 *	Get network statistics from device. Return @storage.
 *	The device driver may provide its own method by setting
 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
 *	otherwise the internal statistics structure is used.
 */
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
					struct rtnl_link_stats64 *storage)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_get_stats64) {
		memset(storage, 0, sizeof(*storage));
		ops->ndo_get_stats64(dev, storage);
	} else if (ops->ndo_get_stats) {
		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
	} else {
		netdev_stats_to_stats64(storage, &dev->stats);
	}
	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
	return storage;
}
EXPORT_SYMBOL(dev_get_stats);
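
/* Illustrative sketch, not part of the original file: sampling a
 * device's counters through the dispatcher above; the caller owns the
 * storage, so this works regardless of which ndo the driver implements.
 * The function name is hypothetical.
 */
static void __maybe_unused example_log_drops(struct net_device *dev)
{
	struct rtnl_link_stats64 stats;

	dev_get_stats(dev, &stats);
	pr_debug("%s: rx_dropped=%llu tx_dropped=%llu\n",
		 dev->name, stats.rx_dropped, stats.tx_dropped);
}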

struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (queue)
		return queue;
	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue)
		return NULL;
	netdev_init_one_queue(dev, queue, NULL);
	queue->qdisc = &noop_qdisc;
	queue->qdisc_sleeping = &noop_qdisc;
	rcu_assign_pointer(dev->ingress_queue, queue);
#endif
	return queue;
}

static const struct ethtool_ops default_ethtool_ops;

void netdev_set_default_ethtool_ops(struct net_device *dev,
				    const struct ethtool_ops *ops)
{
	if (dev->ethtool_ops == &default_ethtool_ops)
		dev->ethtool_ops = ops;
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);

void netdev_freemem(struct net_device *dev)
{
	char *addr = (char *)dev - dev->padded;

	kvfree(addr);
}

/**
 *	alloc_netdev_mqs - allocate network device
 *	@sizeof_priv:	size of private data to allocate space for
 *	@name:		device name format string
 *	@name_assign_type: origin of device name
 *	@setup:		callback to initialize device
 *	@txqs:		the number of TX subqueues to allocate
 *	@rxqs:		the number of RX subqueues to allocate
 *
 *	Allocates a struct net_device with private data area for driver use
 *	and performs basic initialization.  Also allocates subqueue structs
 *	for each queue on the device.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
		unsigned char name_assign_type,
		void (*setup)(struct net_device *),
		unsigned int txqs, unsigned int rxqs)
{
	struct net_device *dev;
	size_t alloc_size;
	struct net_device *p;

	BUG_ON(strlen(name) >= sizeof(dev->name));

	if (txqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
		return NULL;
	}

	if (rxqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
		return NULL;
	}

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!p)
		p = vzalloc(alloc_size);
	if (!p)
		return NULL;

	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	dev->pcpu_refcnt = alloc_percpu(int);
	if (!dev->pcpu_refcnt)
		goto free_dev;

	if (dev_addr_init(dev))
		goto free_pcpu;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	dev->gso_max_size = GSO_MAX_SIZE;
	dev->gso_max_segs = GSO_MAX_SEGS;
	dev->gso_min_segs = 0;

	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->close_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	INIT_LIST_HEAD(&dev->adj_list.upper);
	INIT_LIST_HEAD(&dev->adj_list.lower);
	INIT_LIST_HEAD(&dev->all_adj_list.upper);
	INIT_LIST_HEAD(&dev->all_adj_list.lower);
	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
	setup(dev);

	dev->num_tx_queues = txqs;
	dev->real_num_tx_queues = txqs;
	if (netif_alloc_netdev_queues(dev))
		goto free_all;

	dev->num_rx_queues = rxqs;
	dev->real_num_rx_queues = rxqs;
	if (netif_alloc_rx_queues(dev))
		goto free_all;

	strcpy(dev->name, name);
	dev->name_assign_type = name_assign_type;
	dev->group = INIT_NETDEV_GROUP;
	if (!dev->ethtool_ops)
		dev->ethtool_ops = &default_ethtool_ops;
	return dev;

free_all:
	free_netdev(dev);
	return NULL;

free_pcpu:
	free_percpu(dev->pcpu_refcnt);
free_dev:
	netdev_freemem(dev);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
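
/* Illustrative sketch, not part of the original file: a multiqueue
 * allocation with a private area, as a driver would request it.
 * ether_setup() is the stock Ethernet initializer; the private struct,
 * the queue counts, and the wrapper name are assumptions made for this
 * example.
 */
struct example_mq_priv { int id; };	/* hypothetical private data */

static struct net_device * __maybe_unused example_alloc_mq(void)
{
	/* "eth%d" is expanded at register time; 8 TX and 8 RX queues */
	return alloc_netdev_mqs(sizeof(struct example_mq_priv), "eth%d",
				NET_NAME_UNKNOWN, ether_setup, 8, 8);
}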

/**
 *	free_netdev - free network device
 *	@dev: device
 *
 *	This function does the last stage of destroying an allocated device
 *	interface. The reference to the device object is released.
 *	If this is the last reference then it will be freed.
 */
void free_netdev(struct net_device *dev)
{
	struct napi_struct *p, *n;

	release_net(dev_net(dev));

	netif_free_tx_queues(dev);
	kfree(dev->_rx);

	kfree(rcu_dereference_protected(dev->ingress_queue, 1));

	/* Flush device addresses */
	dev_addr_flush(dev);

	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
		netif_napi_del(p);

	free_percpu(dev->pcpu_refcnt);
	dev->pcpu_refcnt = NULL;

	/*  Compatibility with error handling in drivers */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		netdev_freemem(dev);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	dev->reg_state = NETREG_RELEASED;

	/* will free via device release */
	put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);

/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 */
void synchronize_net(void)
{
	might_sleep();
	if (rtnl_is_locked())
		synchronize_rcu_expedited();
	else
		synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);

/**
 *	unregister_netdevice_queue - remove device from the kernel
 *	@dev: device
 *	@head: list
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *	If head not NULL, device is queued to be unregistered later.
 *
 *	Callers must hold the rtnl semaphore.  You may want
 *	unregister_netdev() instead of this.
 */

void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
	ASSERT_RTNL();

	if (head) {
		list_move_tail(&dev->unreg_list, head);
	} else {
		rollback_registered(dev);
		/* Finish processing unregister after unlock */
		net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_queue);

/**
 *	unregister_netdevice_many - unregister many devices
 *	@head: list of devices
 *
 *  Note: As most callers use a stack allocated list_head,
 *  we force a list_del() to make sure stack wont be corrupted later.
 */
void unregister_netdevice_many(struct list_head *head)
{
	struct net_device *dev;

	if (!list_empty(head)) {
		rollback_registered_many(head);
		list_for_each_entry(dev, head, unreg_list)
			net_set_todo(dev);
		list_del(head);
	}
}
EXPORT_SYMBOL(unregister_netdevice_many);

/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
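
/* Illustrative sketch, not part of the original file: the usual
 * teardown order in a driver remove path; unregistering (whose
 * rtnl_unlock triggers the todo processing) must precede the final
 * free. The function name is an assumption made for this example.
 */
static void __maybe_unused example_remove(struct net_device *dev)
{
	unregister_netdev(dev);		/* takes and releases rtnl_lock */
	free_netdev(dev);		/* drops the last reference */
}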

/**
 *	dev_change_net_namespace - move device to different nethost namespace
 *	@dev: device
 *	@net: network namespace
 *	@pat: If not NULL name pattern to try if the current device name
 *	      is already taken in the destination network namespace.
 *
 *	This function shuts down a device interface and moves it
 *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned.
 *
 *	Callers must hold the rtnl semaphore.
 */

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	int err;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registered */
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(dev_net(dev), net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	if (__dev_get_by_name(net, dev->name)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		if (dev_get_valid_name(net, dev, pat) < 0)
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice unregister_netdevice.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	err = -ENODEV;
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.

	   Note that dev->reg_state stays at NETREG_REGISTERED.
	   This is wanted because this way 8021q and macvlan know
	   the device is just moving and can keep their slaves up.
	*/
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
	rcu_barrier();
	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_uc_flush(dev);
	dev_mc_flush(dev);

	/* Send a netdev-removed uevent to the old namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
	netdev_adjacent_del_links(dev);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);

	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex)) {
		int iflink = (dev->iflink == dev->ifindex);
		dev->ifindex = dev_new_index(net);
		if (iflink)
			dev->iflink = dev->ifindex;
	}

	/* Send a netdev-add uevent to the new namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
	netdev_adjacent_add_links(dev);

	/* Fixup kobjects */
	err = device_rename(&dev->dev, dev->name);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

	synchronize_net();
	err = 0;
out:
	return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);

static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU. */
	if (!list_empty(&oldsd->poll_list)) {
		list_splice_init(&oldsd->poll_list, &sd->poll_list);
		raise_softirq_irqoff(NET_RX_SOFTIRQ);
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx_internal(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx_internal(skb);
		input_queue_head_incr(oldsd);
	}

	return NOTIFY_OK;
}

/**
 *	netdev_increment_features - increment feature set by one
 *	@all: current feature set
 *	@one: new feature set
 *	@mask: mask feature set
 *
 *	Computes a new feature set after adding a device with feature set
 *	@one to the master device with current feature set @all.  Will not
 *	enable anything that is off in @mask. Returns the new feature set.
 */
netdev_features_t netdev_increment_features(netdev_features_t all,
	netdev_features_t one, netdev_features_t mask)
{
	if (mask & NETIF_F_GEN_CSUM)
		mask |= NETIF_F_ALL_CSUM;
	mask |= NETIF_F_VLAN_CHALLENGED;

	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
	all &= one | ~NETIF_F_ALL_FOR_ALL;

	/* If one device supports hw checksumming, set for all. */
	if (all & NETIF_F_GEN_CSUM)
		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);

	return all;
}
EXPORT_SYMBOL(netdev_increment_features);
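
/* Illustrative sketch, not part of the original file: a master device
 * folding its slaves' feature sets together, the way bonding/team-style
 * drivers use the helper above. The function name and the choice of
 * master->features as both starting set and mask are assumptions made
 * for this example.
 */
static netdev_features_t __maybe_unused
example_compute_master_features(struct net_device *master)
{
	netdev_features_t all = master->features;	/* hypothetical start */
	struct net_device *lower;
	struct list_head *iter;

	netdev_for_each_lower_dev(master, lower, iter)
		all = netdev_increment_features(all, lower->features,
						master->features);
	return all;
}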

static struct hlist_head * __net_init netdev_create_hash(void)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	if (net != &init_net)
		INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}
/**
 *	netdev_drivername - network driver for the device
 *	@dev: network device
 *
 *	Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
	const struct device_driver *driver;
	const struct device *parent;
	const char *empty = "";

	parent = dev->dev.parent;
	if (!parent)
		return empty;

	driver = parent->driver;
	if (driver && driver->name)
		return driver->name;
	return empty;
}
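
/*
 * Illustrative sketch: callers use netdev_drivername() to pin a log
 * message on the responsible driver, e.g. the transmit watchdog in
 * sch_generic.c prints something along these lines. Not built here.
 */
#if 0
	WARN_ONCE(1, "NETDEV WATCHDOG: %s (%s): transmit queue timed out\n",
		  dev->name, netdev_drivername(dev));
#endif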
static void __netdev_printk(const char *level, const struct net_device *dev,
			    struct va_format *vaf)
{
	if (dev && dev->dev.parent) {
		dev_printk_emit(level[1] - '0',
				dev->dev.parent,
				"%s %s %s%s: %pV",
				dev_driver_string(dev->dev.parent),
				dev_name(dev->dev.parent),
				netdev_name(dev), netdev_reg_state(dev),
				vaf);
	} else if (dev) {
		printk("%s%s%s: %pV",
		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
	} else {
		printk("%s(NULL net_device): %pV", level, vaf);
	}
}
void netdev_printk(const char *level, const struct net_device *dev,
		   const char *format, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	__netdev_printk(level, dev, &vaf);

	va_end(args);
}
EXPORT_SYMBOL(netdev_printk);
#define define_netdev_printk_level(func, level)			\
void func(const struct net_device *dev, const char *fmt, ...)	\
{								\
	struct va_format vaf;					\
	va_list args;						\
								\
	va_start(args, fmt);					\
								\
	vaf.fmt = fmt;						\
	vaf.va = &args;						\
								\
	__netdev_printk(level, dev, &vaf);			\
								\
	va_end(args);						\
}								\
EXPORT_SYMBOL(func);
define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
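
/*
 * Illustrative sketch: drivers call the helpers generated above just like
 * dev_err()/dev_info(), and get a "driver bus-id netdev-name:" prefix for
 * free. Hypothetical driver code, not built.
 */
#if 0
static void example_link_change(struct net_device *dev, bool up)
{
	if (up)
		netdev_info(dev, "link up\n");
	else
		netdev_warn(dev, "link down\n");
}
#endif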
static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}
static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
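
/*
 * Illustrative sketch: any subsystem that keeps per-namespace state
 * follows the same pattern as netdev_net_ops above. Hypothetical names,
 * not built; registration would happen once via register_pernet_subsys().
 */
#if 0
static int __net_init example_net_init(struct net *net)
{
	/* allocate and initialise per-namespace state here */
	return 0;
}

static void __net_exit example_net_exit(struct net *net)
{
	/* release per-namespace state here */
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
};
#endif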
static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmoveable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			pr_emerg("%s: failed to move %s to init_net: %d\n",
				 __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}
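
/*
 * Illustrative sketch: a driver opts a device out of the migration above
 * by marking it namespace-local in its setup routine, as the loopback
 * driver does. Hypothetical setup function, not built.
 */
#if 0
static void example_setup(struct net_device *dev)
{
	dev->features |= NETIF_F_NETNS_LOCAL;	/* never leaves its netns */
}
#endif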
static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
	/* Return with the rtnl_lock held when there are no network
	 * devices unregistering in any network namespace in net_list.
	 */
	struct net *net;
	bool unregistering;
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&netdev_unregistering_wq, &wait,
				TASK_UNINTERRUPTIBLE);
		unregistering = false;
		rtnl_lock();
		list_for_each_entry(net, net_list, exit_list) {
			if (net->dev_unreg_count > 0) {
				unregistering = true;
				break;
			}
		}
		if (!unregistering)
			break;
		__rtnl_unlock();
		schedule();
	}
	finish_wait(&netdev_unregistering_wq, &wait);
}
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace. Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	/* To prevent network device cleanup code from dereferencing
	 * loopback devices or network devices that have been freed,
	 * wait here for all pending unregistrations to complete
	 * before unregistering the loopback device and allowing the
	 * network namespace to be freed.
	 *
	 * The netdev todo list containing all network devices
	 * unregistrations that happen in default_device_exit_batch
	 * will run in the rtnl_unlock() at the end of
	 * default_device_exit_batch.
	 */
	rtnl_lock_unregistering(net_list);
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	rtnl_unlock();
}
static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};
/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 */

/*
 *	This is called single threaded during boot, so no need
 *	to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */
	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->cpu = i;
#endif

		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present too. Since we now dynamically allocate and free
	 * the loopback device, maintain this invariant by keeping the
	 * loopback device as the first device on the list of network
	 * devices, so that it is the first device that appears and the
	 * last network device that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	rc = 0;
out:
	return rc;
}
subsys_initcall(net_dev_init);
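
/*
 * Illustrative sketch: subsys_initcall() places net_dev_init() ahead of
 * the device_initcall() level, so driver init code may assume the softnet
 * queues and notifier chains are ready. Hypothetical driver, not built.
 */
#if 0
static int __init example_driver_init(void)
{
	/* net_dev_init() has already run when this executes */
	return 0;
}
device_initcall(example_driver_init);
#endif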