2 * NET3 Protocol independent device support routines.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Derived from the non IP parts of dev.c 1.0.19
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
51 * Rudi Cilibrasi : Pass the right thing to
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/proc_fs.h>
101 #include <linux/seq_file.h>
102 #include <linux/stat.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <net/xfrm.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/module.h>
110 #include <linux/netpoll.h>
111 #include <linux/rcupdate.h>
112 #include <linux/delay.h>
113 #include <net/wext.h>
114 #include <net/iw_handler.h>
115 #include <asm/current.h>
116 #include <linux/audit.h>
117 #include <linux/dmaengine.h>
118 #include <linux/err.h>
119 #include <linux/ctype.h>
120 #include <linux/if_arp.h>
121 #include <linux/if_vlan.h>
122 #include <linux/ip.h>
124 #include <linux/ipv6.h>
125 #include <linux/in.h>
126 #include <linux/jhash.h>
127 #include <linux/random.h>
128 #include <trace/events/napi.h>
129 #include <trace/events/net.h>
130 #include <trace/events/skb.h>
131 #include <linux/pci.h>
132 #include <linux/inetdevice.h>
133 #include <linux/cpu_rmap.h>
134 #include <linux/static_key.h>
136 #include "net-sysfs.h"
138 /* Instead of increasing this, you should create a hash table. */
139 #define MAX_GRO_SKBS 8
141 /* This should be increased if a protocol with a bigger head is added. */
142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
145 * The list of packet types we will receive (as opposed to discard)
146 * and the routines to invoke.
148 * Why 16. Because with 16 the only overlap we get on a hash of the
149 * low nibble of the protocol value is RARP/SNAP/X.25.
151 * NOTE: That is no longer true with the addition of VLAN tags. Not
152 * sure which should go first, but I bet it won't make much
153 * difference if we are running VLANs. The good news is that
154 * this protocol won't be in the list unless compiled in, so
155 * the average user (w/out VLANs) will not be adversely affected.
172 #define PTYPE_HASH_SIZE (16)
173 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
175 static DEFINE_SPINLOCK(ptype_lock
);
176 static DEFINE_SPINLOCK(offload_lock
);
177 static struct list_head ptype_base
[PTYPE_HASH_SIZE
] __read_mostly
;
178 static struct list_head ptype_all __read_mostly
; /* Taps */
179 static struct list_head offload_base __read_mostly
;
182 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
185 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
187 * Writers must hold the rtnl semaphore while they loop through the
188 * dev_base_head list, and hold dev_base_lock for writing when they do the
189 * actual updates. This allows pure readers to access the list even
190 * while a writer is preparing to update it.
192 * To put it another way, dev_base_lock is held for writing only to
193 * protect against pure readers; the rtnl semaphore provides the
194 * protection against other writers.
196 * See, for example usages, register_netdevice() and
197 * unregister_netdevice(), which must be called with the rtnl
200 DEFINE_RWLOCK(dev_base_lock
);
201 EXPORT_SYMBOL(dev_base_lock
);
203 seqcount_t devnet_rename_seq
;
205 static inline void dev_base_seq_inc(struct net
*net
)
207 while (++net
->dev_base_seq
== 0);
210 static inline struct hlist_head
*dev_name_hash(struct net
*net
, const char *name
)
212 unsigned int hash
= full_name_hash(name
, strnlen(name
, IFNAMSIZ
));
214 return &net
->dev_name_head
[hash_32(hash
, NETDEV_HASHBITS
)];
217 static inline struct hlist_head
*dev_index_hash(struct net
*net
, int ifindex
)
219 return &net
->dev_index_head
[ifindex
& (NETDEV_HASHENTRIES
- 1)];
/* rps_lock()/rps_unlock(): acquire/release the spinlock embedded in the
 * softnet_data input_pkt_queue.  NOTE(review): the extraction dropped
 * surrounding lines here (braces and, presumably, CONFIG_RPS guards) —
 * confirm against the original tree. */
222 static inline void rps_lock(struct softnet_data
*sd
)
225 spin_lock(&sd
->input_pkt_queue
.lock
);
229 static inline void rps_unlock(struct softnet_data
*sd
)
232 spin_unlock(&sd
->input_pkt_queue
.lock
);
/* list_netdevice(): link @dev into its namespace's base list and into the
 * name/index hash tables, all under dev_base_lock (bh-safe write lock),
 * then bump the namespace generation counter.  NOTE(review): the return
 * statement and RTNL assertion appear to be missing from this extraction. */
236 /* Device list insertion */
237 static int list_netdevice(struct net_device
*dev
)
239 struct net
*net
= dev_net(dev
);
243 write_lock_bh(&dev_base_lock
);
244 list_add_tail_rcu(&dev
->dev_list
, &net
->dev_base_head
);
245 hlist_add_head_rcu(&dev
->name_hlist
, dev_name_hash(net
, dev
->name
));
246 hlist_add_head_rcu(&dev
->index_hlist
,
247 dev_index_hash(net
, dev
->ifindex
));
248 write_unlock_bh(&dev_base_lock
);
250 dev_base_seq_inc(net
);
/* unlist_netdevice(): inverse of list_netdevice() — unlink @dev from the
 * base list and both hashes under dev_base_lock and bump the generation
 * counter.  Callers must wait an RCU grace period before freeing dev. */
255 /* Device list removal
256 * caller must respect a RCU grace period before freeing/reusing dev
258 static void unlist_netdevice(struct net_device
*dev
)
262 /* Unlink dev from the device chain */
263 write_lock_bh(&dev_base_lock
);
264 list_del_rcu(&dev
->dev_list
);
265 hlist_del_rcu(&dev
->name_hlist
);
266 hlist_del_rcu(&dev
->index_hlist
);
267 write_unlock_bh(&dev_base_lock
);
269 dev_base_seq_inc(dev_net(dev
));
276 static RAW_NOTIFIER_HEAD(netdev_chain
);
279 * Device drivers call our routines to queue packets here. We empty the
280 * queue in the local softnet handler.
283 DEFINE_PER_CPU_ALIGNED(struct softnet_data
, softnet_data
);
284 EXPORT_PER_CPU_SYMBOL(softnet_data
);
286 #ifdef CONFIG_LOCKDEP
288 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
289 * according to dev->type
291 static const unsigned short netdev_lock_type
[] =
292 {ARPHRD_NETROM
, ARPHRD_ETHER
, ARPHRD_EETHER
, ARPHRD_AX25
,
293 ARPHRD_PRONET
, ARPHRD_CHAOS
, ARPHRD_IEEE802
, ARPHRD_ARCNET
,
294 ARPHRD_APPLETLK
, ARPHRD_DLCI
, ARPHRD_ATM
, ARPHRD_METRICOM
,
295 ARPHRD_IEEE1394
, ARPHRD_EUI64
, ARPHRD_INFINIBAND
, ARPHRD_SLIP
,
296 ARPHRD_CSLIP
, ARPHRD_SLIP6
, ARPHRD_CSLIP6
, ARPHRD_RSRVD
,
297 ARPHRD_ADAPT
, ARPHRD_ROSE
, ARPHRD_X25
, ARPHRD_HWX25
,
298 ARPHRD_PPP
, ARPHRD_CISCO
, ARPHRD_LAPB
, ARPHRD_DDCMP
,
299 ARPHRD_RAWHDLC
, ARPHRD_TUNNEL
, ARPHRD_TUNNEL6
, ARPHRD_FRAD
,
300 ARPHRD_SKIP
, ARPHRD_LOOPBACK
, ARPHRD_LOCALTLK
, ARPHRD_FDDI
,
301 ARPHRD_BIF
, ARPHRD_SIT
, ARPHRD_IPDDP
, ARPHRD_IPGRE
,
302 ARPHRD_PIMREG
, ARPHRD_HIPPI
, ARPHRD_ASH
, ARPHRD_ECONET
,
303 ARPHRD_IRDA
, ARPHRD_FCPP
, ARPHRD_FCAL
, ARPHRD_FCPL
,
304 ARPHRD_FCFABRIC
, ARPHRD_IEEE80211
, ARPHRD_IEEE80211_PRISM
,
305 ARPHRD_IEEE80211_RADIOTAP
, ARPHRD_PHONET
, ARPHRD_PHONET_PIPE
,
306 ARPHRD_IEEE802154
, ARPHRD_VOID
, ARPHRD_NONE
};
308 static const char *const netdev_lock_name
[] =
309 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
310 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
311 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
312 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
313 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
314 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
315 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
316 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
317 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
318 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
319 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
320 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
321 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
322 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
323 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
325 static struct lock_class_key netdev_xmit_lock_key
[ARRAY_SIZE(netdev_lock_type
)];
326 static struct lock_class_key netdev_addr_lock_key
[ARRAY_SIZE(netdev_lock_type
)];
/* netdev_lock_pos(): linear-search netdev_lock_type[] for @dev_type and
 * return its index; falls back to the last entry ("_xmit_NONE") when the
 * type is unknown.  NOTE(review): the loop-variable declaration and the
 * in-loop return appear to be missing from this extraction. */
328 static inline unsigned short netdev_lock_pos(unsigned short dev_type
)
332 for (i
= 0; i
< ARRAY_SIZE(netdev_lock_type
); i
++)
333 if (netdev_lock_type
[i
] == dev_type
)
335 /* the last key is used by default */
336 return ARRAY_SIZE(netdev_lock_type
) - 1;
/* Assign a per-device-type lockdep class (and printable name) to a txq
 * _xmit_lock so lockdep can tell stacked devices' locks apart. */
339 static inline void netdev_set_xmit_lockdep_class(spinlock_t
*lock
,
340 unsigned short dev_type
)
344 i
= netdev_lock_pos(dev_type
);
345 lockdep_set_class_and_name(lock
, &netdev_xmit_lock_key
[i
],
346 netdev_lock_name
[i
]);
/* Same idea for the device's addr_list_lock, keyed off dev->type. */
349 static inline void netdev_set_addr_lockdep_class(struct net_device
*dev
)
353 i
= netdev_lock_pos(dev
->type
);
354 lockdep_set_class_and_name(&dev
->addr_list_lock
,
355 &netdev_addr_lock_key
[i
],
356 netdev_lock_name
[i
]);
/* !CONFIG_LOCKDEP stubs: both helpers compile away to nothing. */
359 static inline void netdev_set_xmit_lockdep_class(spinlock_t
*lock
,
360 unsigned short dev_type
)
363 static inline void netdev_set_addr_lockdep_class(struct net_device
*dev
)
368 /*******************************************************************************
370 Protocol management and registration routines
372 *******************************************************************************/
375 * Add a protocol ID to the list. Now that the input handler is
376 * smarter we can dispense with all the messy stuff that used to be
379 * BEWARE!!! Protocol handlers, mangling input packets,
380 * MUST BE last in hash buckets and checking protocol handlers
381 * MUST start from promiscuous ptype_all chain in net_bh.
382 * It is true now, do not change it.
383 * Explanation follows: if protocol handler, mangling packet, will
384 * be the first on list, it is not able to sense, that packet
385 * is cloned and should be copied-on-write, so that it will
386 * change it and subsequent readers will get broken packet.
/* ptype_head(): choose the list a packet_type lives on — ETH_P_ALL taps
 * go on a dedicated list, everything else hashes into ptype_base[].
 * NOTE(review): the ETH_P_ALL branch's return (presumably &ptype_all)
 * is missing from this extraction — confirm against the original tree. */
390 static inline struct list_head
*ptype_head(const struct packet_type
*pt
)
392 if (pt
->type
== htons(ETH_P_ALL
))
395 return &ptype_base
[ntohs(pt
->type
) & PTYPE_HASH_MASK
];
399 * dev_add_pack - add packet handler
400 * @pt: packet type declaration
402 * Add a protocol handler to the networking stack. The passed &packet_type
403 * is linked into kernel lists and may not be freed until it has been
404 * removed from the kernel lists.
406 * This call does not sleep therefore it can not
407 * guarantee all CPU's that are in middle of receiving packets
408 * will see the new packet type (until the next received packet).
411 void dev_add_pack(struct packet_type
*pt
)
413 struct list_head
*head
= ptype_head(pt
);
415 spin_lock(&ptype_lock
);
416 list_add_rcu(&pt
->list
, head
);
417 spin_unlock(&ptype_lock
);
419 EXPORT_SYMBOL(dev_add_pack
);
422 * __dev_remove_pack - remove packet handler
423 * @pt: packet type declaration
425 * Remove a protocol handler that was previously added to the kernel
426 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
427 * from the kernel lists and can be freed or reused once this function
430 * The packet type might still be in use by receivers
431 * and must not be freed until after all the CPU's have gone
432 * through a quiescent state.
/* __dev_remove_pack(): walk the bucket @pt hashes to and RCU-unlink the
 * matching entry; warn if it was never registered.  Caller must still wait
 * for an RCU grace period before freeing @pt.  NOTE(review): the
 * entry-comparison and exit-label lines are missing from this extraction. */
434 void __dev_remove_pack(struct packet_type
*pt
)
436 struct list_head
*head
= ptype_head(pt
);
437 struct packet_type
*pt1
;
439 spin_lock(&ptype_lock
);
441 list_for_each_entry(pt1
, head
, list
) {
443 list_del_rcu(&pt
->list
);
448 pr_warn("dev_remove_pack: %p not found\n", pt
);
450 spin_unlock(&ptype_lock
);
452 EXPORT_SYMBOL(__dev_remove_pack
);
455 * dev_remove_pack - remove packet handler
456 * @pt: packet type declaration
458 * Remove a protocol handler that was previously added to the kernel
459 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
460 * from the kernel lists and can be freed or reused once this function
463 * This call sleeps to guarantee that no CPU is looking at the packet
/* dev_remove_pack(): sleeping wrapper around __dev_remove_pack().
 * NOTE(review): the synchronize_net() call implied by the kernel-doc
 * above appears to be missing from this extraction. */
466 void dev_remove_pack(struct packet_type
*pt
)
468 __dev_remove_pack(pt
);
472 EXPORT_SYMBOL(dev_remove_pack
);
476 * dev_add_offload - register offload handlers
477 * @po: protocol offload declaration
479 * Add protocol offload handlers to the networking stack. The passed
480 * &proto_offload is linked into kernel lists and may not be freed until
481 * it has been removed from the kernel lists.
483 * This call does not sleep therefore it can not
484 * guarantee all CPU's that are in middle of receiving packets
485 * will see the new offload handlers (until the next received packet).
487 void dev_add_offload(struct packet_offload
*po
)
489 struct list_head
*head
= &offload_base
;
491 spin_lock(&offload_lock
);
492 list_add_rcu(&po
->list
, head
);
493 spin_unlock(&offload_lock
);
495 EXPORT_SYMBOL(dev_add_offload
);
498 * __dev_remove_offload - remove offload handler
499 * @po: packet offload declaration
501 * Remove a protocol offload handler that was previously added to the
502 * kernel offload handlers by dev_add_offload(). The passed &offload_type
503 * is removed from the kernel lists and can be freed or reused once this
506 * The packet type might still be in use by receivers
507 * and must not be freed until after all the CPU's have gone
508 * through a quiescent state.
/* __dev_remove_offload(): scan offload_base for @po and RCU-unlink it;
 * warn when not found.  Caller must still respect an RCU grace period
 * before freeing @po.  NOTE(review): the entry-comparison and exit-label
 * lines are missing from this extraction. */
510 void __dev_remove_offload(struct packet_offload
*po
)
512 struct list_head
*head
= &offload_base
;
513 struct packet_offload
*po1
;
515 spin_lock(&offload_lock
);
517 list_for_each_entry(po1
, head
, list
) {
519 list_del_rcu(&po
->list
);
524 pr_warn("dev_remove_offload: %p not found\n", po
);
526 spin_unlock(&offload_lock
);
528 EXPORT_SYMBOL(__dev_remove_offload
);
531 * dev_remove_offload - remove packet offload handler
532 * @po: packet offload declaration
534 * Remove a packet offload handler that was previously added to the kernel
535 * offload handlers by dev_add_offload(). The passed &offload_type is
536 * removed from the kernel lists and can be freed or reused once this
539 * This call sleeps to guarantee that no CPU is looking at the packet
/* dev_remove_offload(): sleeping wrapper around __dev_remove_offload().
 * NOTE(review): the synchronize_net() implied by the kernel-doc above
 * appears to be missing from this extraction. */
542 void dev_remove_offload(struct packet_offload
*po
)
544 __dev_remove_offload(po
);
548 EXPORT_SYMBOL(dev_remove_offload
);
550 /******************************************************************************
552 Device Boot-time Settings Routines
554 *******************************************************************************/
556 /* Boot time configuration table */
557 static struct netdev_boot_setup dev_boot_setup
[NETDEV_BOOT_SETUP_MAX
];
560 * netdev_boot_setup_add - add new setup entry
561 * @name: name of the device
562 * @map: configured settings for the device
564 * Adds new setup entry to the dev_boot_setup list. The function
565 * returns 0 on error and 1 on success. This is a generic routine to
/* netdev_boot_setup_add(): claim the first free slot (empty or ' '-marked
 * name) in dev_boot_setup[], store @name and a copy of @map there.
 * Returns 1 on success, 0 when the table is full.  NOTE(review): the
 * loop-variable declaration, the `s = dev_boot_setup` assignment and the
 * loop break are missing from this extraction. */
568 static int netdev_boot_setup_add(char *name
, struct ifmap
*map
)
570 struct netdev_boot_setup
*s
;
574 for (i
= 0; i
< NETDEV_BOOT_SETUP_MAX
; i
++) {
575 if (s
[i
].name
[0] == '\0' || s
[i
].name
[0] == ' ') {
576 memset(s
[i
].name
, 0, sizeof(s
[i
].name
));
577 strlcpy(s
[i
].name
, name
, IFNAMSIZ
);
578 memcpy(&s
[i
].map
, map
, sizeof(s
[i
].map
));
583 return i
>= NETDEV_BOOT_SETUP_MAX
? 0 : 1;
587 * netdev_boot_setup_check - check boot time settings
588 * @dev: the netdevice
590 * Check boot time settings for the device.
591 * The found settings are set for the device to be used
592 * later in the device probing.
593 * Returns 0 if no settings found, 1 if they are.
/* Copies irq/base_addr/mem_start/mem_end from the matching boot-table
 * entry onto @dev.  NOTE(review): loop-variable declaration and the
 * return statements are missing from this extraction. */
595 int netdev_boot_setup_check(struct net_device
*dev
)
597 struct netdev_boot_setup
*s
= dev_boot_setup
;
600 for (i
= 0; i
< NETDEV_BOOT_SETUP_MAX
; i
++) {
601 if (s
[i
].name
[0] != '\0' && s
[i
].name
[0] != ' ' &&
602 !strcmp(dev
->name
, s
[i
].name
)) {
603 dev
->irq
= s
[i
].map
.irq
;
604 dev
->base_addr
= s
[i
].map
.base_addr
;
605 dev
->mem_start
= s
[i
].map
.mem_start
;
606 dev
->mem_end
= s
[i
].map
.mem_end
;
612 EXPORT_SYMBOL(netdev_boot_setup_check
);
616 * netdev_boot_base - get address from boot time settings
617 * @prefix: prefix for network device
618 * @unit: id for network device
620 * Check boot time settings for the base address of device.
621 * The found settings are set for the device to be used
622 * later in the device probing.
623 * Returns 0 if no settings found.
/* Builds "<prefix><unit>" and looks it up: an already-registered device
 * suppresses probing (base 1, per the comment below); otherwise the
 * boot table supplies the base address.  NOTE(review): the name[] buffer
 * declaration and the return 1/return 0 lines are missing here. */
625 unsigned long netdev_boot_base(const char *prefix
, int unit
)
627 const struct netdev_boot_setup
*s
= dev_boot_setup
;
631 sprintf(name
, "%s%d", prefix
, unit
);
634 * If device already registered then return base of 1
635 * to indicate not to probe for this interface
637 if (__dev_get_by_name(&init_net
, name
))
640 for (i
= 0; i
< NETDEV_BOOT_SETUP_MAX
; i
++)
641 if (!strcmp(name
, s
[i
].name
))
642 return s
[i
].map
.base_addr
;
647 * Saves at boot time configured settings for any netdevice.
/* Parses the "netdev=irq,base,mem_start,mem_end,name" kernel parameter
 * into an ifmap and records it via netdev_boot_setup_add().
 * NOTE(review): the ints[]/map declarations and the per-field
 * `if (ints[0] > N)` guards are missing from this extraction. */
649 int __init
netdev_boot_setup(char *str
)
654 str
= get_options(str
, ARRAY_SIZE(ints
), ints
);
659 memset(&map
, 0, sizeof(map
));
663 map
.base_addr
= ints
[2];
665 map
.mem_start
= ints
[3];
667 map
.mem_end
= ints
[4];
669 /* Add new entry to the list */
670 return netdev_boot_setup_add(str
, &map
);
673 __setup("netdev=", netdev_boot_setup
);
675 /*******************************************************************************
677 Device Interface Subroutines
679 *******************************************************************************/
682 * __dev_get_by_name - find a device by its name
683 * @net: the applicable net namespace
684 * @name: name to find
686 * Find an interface by name. Must be called under RTNL semaphore
687 * or @dev_base_lock. If the name is found a pointer to the device
688 * is returned. If the name is not found then %NULL is returned. The
689 * reference counters are not incremented so the caller must be
690 * careful with locks.
/* __dev_get_by_name(): non-locking name lookup — walks the name-hash
 * bucket comparing up to IFNAMSIZ bytes.  Caller must hold RTNL or
 * dev_base_lock (see kernel-doc above).  NOTE(review): the `return dev`
 * / `return NULL` lines are missing from this extraction. */
693 struct net_device
*__dev_get_by_name(struct net
*net
, const char *name
)
695 struct hlist_node
*p
;
696 struct net_device
*dev
;
697 struct hlist_head
*head
= dev_name_hash(net
, name
);
699 hlist_for_each_entry(dev
, p
, head
, name_hlist
)
700 if (!strncmp(dev
->name
, name
, IFNAMSIZ
))
705 EXPORT_SYMBOL(__dev_get_by_name
);
708 * dev_get_by_name_rcu - find a device by its name
709 * @net: the applicable net namespace
710 * @name: name to find
712 * Find an interface by name.
713 * If the name is found a pointer to the device is returned.
714 * If the name is not found then %NULL is returned.
715 * The reference counters are not incremented so the caller must be
716 * careful with locks. The caller must hold RCU lock.
/* RCU variant: identical walk but using hlist_for_each_entry_rcu, so it
 * is safe under rcu_read_lock() without dev_base_lock.  NOTE(review):
 * return lines missing here as well. */
719 struct net_device
*dev_get_by_name_rcu(struct net
*net
, const char *name
)
721 struct hlist_node
*p
;
722 struct net_device
*dev
;
723 struct hlist_head
*head
= dev_name_hash(net
, name
);
725 hlist_for_each_entry_rcu(dev
, p
, head
, name_hlist
)
726 if (!strncmp(dev
->name
, name
, IFNAMSIZ
))
731 EXPORT_SYMBOL(dev_get_by_name_rcu
);
734 * dev_get_by_name - find a device by its name
735 * @net: the applicable net namespace
736 * @name: name to find
738 * Find an interface by name. This can be called from any
739 * context and does its own locking. The returned handle has
740 * the usage count incremented and the caller must use dev_put() to
741 * release it when it is no longer needed. %NULL is returned if no
742 * matching device is found.
/* Refcounting wrapper over the RCU lookup.  NOTE(review): the
 * rcu_read_lock()/dev_hold()/rcu_read_unlock()/return sequence is
 * missing from this extraction — confirm against the original tree. */
745 struct net_device
*dev_get_by_name(struct net
*net
, const char *name
)
747 struct net_device
*dev
;
750 dev
= dev_get_by_name_rcu(net
, name
);
756 EXPORT_SYMBOL(dev_get_by_name
);
759 * __dev_get_by_index - find a device by its ifindex
760 * @net: the applicable net namespace
761 * @ifindex: index of device
763 * Search for an interface by index. Returns %NULL if the device
764 * is not found or a pointer to the device. The device has not
765 * had its reference counter increased so the caller must be careful
766 * about locking. The caller must hold either the RTNL semaphore
/* __dev_get_by_index(): non-locking ifindex lookup over the index-hash
 * bucket; caller must hold RTNL or dev_base_lock (see kernel-doc above).
 * NOTE(review): the return lines are missing from this extraction. */
770 struct net_device
*__dev_get_by_index(struct net
*net
, int ifindex
)
772 struct hlist_node
*p
;
773 struct net_device
*dev
;
774 struct hlist_head
*head
= dev_index_hash(net
, ifindex
);
776 hlist_for_each_entry(dev
, p
, head
, index_hlist
)
777 if (dev
->ifindex
== ifindex
)
782 EXPORT_SYMBOL(__dev_get_by_index
);
785 * dev_get_by_index_rcu - find a device by its ifindex
786 * @net: the applicable net namespace
787 * @ifindex: index of device
789 * Search for an interface by index. Returns %NULL if the device
790 * is not found or a pointer to the device. The device has not
791 * had its reference counter increased so the caller must be careful
792 * about locking. The caller must hold RCU lock.
/* RCU-safe variant of the ifindex lookup; same truncation caveat. */
795 struct net_device
*dev_get_by_index_rcu(struct net
*net
, int ifindex
)
797 struct hlist_node
*p
;
798 struct net_device
*dev
;
799 struct hlist_head
*head
= dev_index_hash(net
, ifindex
);
801 hlist_for_each_entry_rcu(dev
, p
, head
, index_hlist
)
802 if (dev
->ifindex
== ifindex
)
807 EXPORT_SYMBOL(dev_get_by_index_rcu
);
811 * dev_get_by_index - find a device by its ifindex
812 * @net: the applicable net namespace
813 * @ifindex: index of device
815 * Search for an interface by index. Returns NULL if the device
816 * is not found or a pointer to the device. The device returned has
817 * had a reference added and the pointer is safe until the user calls
818 * dev_put to indicate they have finished with it.
/* Refcounting wrapper over the RCU ifindex lookup.  NOTE(review): the
 * rcu_read_lock()/dev_hold()/rcu_read_unlock()/return sequence is
 * missing from this extraction. */
821 struct net_device
*dev_get_by_index(struct net
*net
, int ifindex
)
823 struct net_device
*dev
;
826 dev
= dev_get_by_index_rcu(net
, ifindex
);
832 EXPORT_SYMBOL(dev_get_by_index
);
835 * dev_getbyhwaddr_rcu - find a device by its hardware address
836 * @net: the applicable net namespace
837 * @type: media type of device
838 * @ha: hardware address
840 * Search for an interface by MAC address. Returns NULL if the device
841 * is not found or a pointer to the device.
842 * The caller must hold RCU or RTNL.
843 * The returned device has not had its ref count increased
844 * and the caller must therefore be careful about locking
/* dev_getbyhwaddr_rcu(): linear scan of the namespace's device list for a
 * device whose type matches and whose dev_addr equals @ha (addr_len
 * bytes).  Caller must hold RCU or RTNL (see kernel-doc above).
 * NOTE(review): the `const char *ha` parameter line and the return
 * statements are missing from this extraction. */
848 struct net_device
*dev_getbyhwaddr_rcu(struct net
*net
, unsigned short type
,
851 struct net_device
*dev
;
853 for_each_netdev_rcu(net
, dev
)
854 if (dev
->type
== type
&&
855 !memcmp(dev
->dev_addr
, ha
, dev
->addr_len
))
860 EXPORT_SYMBOL(dev_getbyhwaddr_rcu
);
/* __dev_getfirstbyhwtype(): first device of hardware @type, no refcount
 * taken.  NOTE(review): the RTNL assertion and return lines are missing
 * from this extraction. */
862 struct net_device
*__dev_getfirstbyhwtype(struct net
*net
, unsigned short type
)
864 struct net_device
*dev
;
867 for_each_netdev(net
, dev
)
868 if (dev
->type
== type
)
873 EXPORT_SYMBOL(__dev_getfirstbyhwtype
);
/* dev_getfirstbyhwtype(): RCU variant; presumably takes a reference on
 * the match before returning — the dev_hold/break/return lines are
 * missing from this extraction, confirm against the original tree. */
875 struct net_device
*dev_getfirstbyhwtype(struct net
*net
, unsigned short type
)
877 struct net_device
*dev
, *ret
= NULL
;
880 for_each_netdev_rcu(net
, dev
)
881 if (dev
->type
== type
) {
889 EXPORT_SYMBOL(dev_getfirstbyhwtype
);
892 * dev_get_by_flags_rcu - find any device with given flags
893 * @net: the applicable net namespace
894 * @if_flags: IFF_* values
895 * @mask: bitmask of bits in if_flags to check
897 * Search for any interface with the given flags. Returns NULL if a device
898 * is not found or a pointer to the device. Must be called inside
899 * rcu_read_lock(), and result refcount is unchanged.
/* dev_get_by_flags_rcu(): first device whose flags match @if_flags on the
 * bits selected by @mask; must run under rcu_read_lock() (see kernel-doc
 * above).  NOTE(review): the mask parameter line, the ret initialisation
 * and the ret-assignment/return lines are missing from this extraction. */
902 struct net_device
*dev_get_by_flags_rcu(struct net
*net
, unsigned short if_flags
,
905 struct net_device
*dev
, *ret
;
908 for_each_netdev_rcu(net
, dev
) {
909 if (((dev
->flags
^ if_flags
) & mask
) == 0) {
916 EXPORT_SYMBOL(dev_get_by_flags_rcu
);
919 * dev_valid_name - check if name is okay for network device
922 * Network device names need to be valid file names to
923 * to allow sysfs to work. We also disallow any kind of
/* dev_valid_name(): reject names that are too long, "." or "..", or that
 * contain '/' or whitespace (sysfs-unsafe).  NOTE(review): the empty-name
 * check, the per-character loop and the return true/false lines are
 * missing from this extraction. */
926 bool dev_valid_name(const char *name
)
930 if (strlen(name
) >= IFNAMSIZ
)
932 if (!strcmp(name
, ".") || !strcmp(name
, ".."))
936 if (*name
== '/' || isspace(*name
))
942 EXPORT_SYMBOL(dev_valid_name
);
945 * __dev_alloc_name - allocate a name for a device
946 * @net: network namespace to allocate the device name in
947 * @name: name format string
948 * @buf: scratch buffer and result name string
950 * Passed a format string - eg "lt%d" it will try and find a suitable
951 * id. It scans list of devices to build up a free map, then chooses
952 * the first empty slot. The caller must hold the dev_base or rtnl lock
953 * while allocating the name and adding the device in order to avoid
955 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
956 * Returns the number of the unit assigned or a negative errno code.
/* __dev_alloc_name(): expand a "%d"-style format (e.g. "eth%d") into the
 * first unit number not already used in @net.  A zeroed page serves as a
 * bitmap of in-use units; each existing device name is sscanf'ed against
 * the format, round-tripped through snprintf to weed out inexact matches,
 * and its unit bit set.  NOTE(review): several declarations (i, p),
 * the set_bit() call, the no-'%' fast path and the error returns are
 * missing from this extraction — confirm against the original tree. */
959 static int __dev_alloc_name(struct net
*net
, const char *name
, char *buf
)
963 const int max_netdevices
= 8*PAGE_SIZE
;
964 unsigned long *inuse
;
965 struct net_device
*d
;
967 p
= strnchr(name
, IFNAMSIZ
-1, '%');
970 * Verify the string as this thing may have come from
971 * the user. There must be either one "%d" and no other "%"
974 if (p
[1] != 'd' || strchr(p
+ 2, '%'))
977 /* Use one page as a bit array of possible slots */
978 inuse
= (unsigned long *) get_zeroed_page(GFP_ATOMIC
);
982 for_each_netdev(net
, d
) {
983 if (!sscanf(d
->name
, name
, &i
))
985 if (i
< 0 || i
>= max_netdevices
)
988 /* avoid cases where sscanf is not exact inverse of printf */
989 snprintf(buf
, IFNAMSIZ
, name
, i
)
;
990 if (!strncmp(buf
, d
->name
, IFNAMSIZ
))
994 i
= find_first_zero_bit(inuse
, max_netdevices
);
995 free_page((unsigned long) inuse
);
999 snprintf(buf
, IFNAMSIZ
, name
, i
);
1000 if (!__dev_get_by_name(net
, buf
))
1003 /* It is possible to run out of possible slots
1004 * when the name is long and there isn't enough space left
1005 * for the digits, or if all bits are used.
1011 * dev_alloc_name - allocate a name for a device
1013 * @name: name format string
1015 * Passed a format string - eg "lt%d" it will try and find a suitable
1016 * id. It scans list of devices to build up a free map, then chooses
1017 * the first empty slot. The caller must hold the dev_base or rtnl lock
1018 * while allocating the name and adding the device in order to avoid
1020 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021 * Returns the number of the unit assigned or a negative errno code.
/* dev_alloc_name(): public wrapper — runs __dev_alloc_name() in the
 * device's own namespace and, on success, copies the resulting name into
 * dev->name.  NOTE(review): the buf[]/net/ret declarations and the
 * `if (ret >= 0)` guard are missing from this extraction. */
1024 int dev_alloc_name(struct net_device
*dev
, const char *name
)
1030 BUG_ON(!dev_net(dev
));
1032 ret
= __dev_alloc_name(net
, name
, buf
);
1034 strlcpy(dev
->name
, buf
, IFNAMSIZ
);
1037 EXPORT_SYMBOL(dev_alloc_name
);
/* dev_alloc_name_ns(): same as above but for an explicit namespace —
 * used while moving devices between namespaces.  NOTE(review): the
 * name parameter line, buf declaration and success-guard are missing. */
1039 static int dev_alloc_name_ns(struct net
*net
,
1040 struct net_device
*dev
,
1046 ret
= __dev_alloc_name(net
, name
, buf
);
1048 strlcpy(dev
->name
, buf
, IFNAMSIZ
);
/* dev_get_valid_name(): validate @name; a '%' pattern goes through
 * dev_alloc_name_ns(), a duplicate fails, otherwise the literal name is
 * copied in.  NOTE(review): the name parameter line and the error-return
 * values (-EINVAL/-EEXIST, final return 0) are missing here. */
1052 static int dev_get_valid_name(struct net
*net
,
1053 struct net_device
*dev
,
1058 if (!dev_valid_name(name
))
1061 if (strchr(name
, '%'))
1062 return dev_alloc_name_ns(net
, dev
, name
);
1063 else if (__dev_get_by_name(net
, name
))
1065 else if (dev
->name
!= name
)
1066 strlcpy(dev
->name
, name
, IFNAMSIZ
);
1072 * dev_change_name - change name of a device
1074 * @newname: name (or format string) must be at least IFNAMSIZ
1076 * Change name of a device, can pass format strings "eth%d".
/* dev_change_name(): rename a *down* device.  Serialised against readers
 * by the devnet_rename_seq seqcount; picks/validates the new name,
 * renames the sysfs device, rehashes dev->name_hlist under dev_base_lock,
 * and notifies NETDEV_CHANGENAME listeners — rolling back to @oldname
 * once if a notifier vetoes the change.  NOTE(review): this extraction
 * is heavily truncated (missing declarations, error returns, the
 * rollback goto and its label) — treat the surviving lines as an
 * outline only and confirm against the original tree. */
1079 int dev_change_name(struct net_device
*dev
, const char *newname
)
1081 char oldname
[IFNAMSIZ
];
1087 BUG_ON(!dev_net(dev
));
1090 if (dev
->flags
& IFF_UP
)
1093 write_seqcount_begin(&devnet_rename_seq
);
1095 if (strncmp(newname
, dev
->name
, IFNAMSIZ
) == 0) {
1096 write_seqcount_end(&devnet_rename_seq
);
1100 memcpy(oldname
, dev
->name
, IFNAMSIZ
);
1102 err
= dev_get_valid_name(net
, dev
, newname
);
1104 write_seqcount_end(&devnet_rename_seq
);
1109 ret
= device_rename(&dev
->dev
, dev
->name
);
1111 memcpy(dev
->name
, oldname
, IFNAMSIZ
);
1112 write_seqcount_end(&devnet_rename_seq
);
1116 write_seqcount_end(&devnet_rename_seq
);
1118 write_lock_bh(&dev_base_lock
);
1119 hlist_del_rcu(&dev
->name_hlist
);
1120 write_unlock_bh(&dev_base_lock
);
1124 write_lock_bh(&dev_base_lock
);
1125 hlist_add_head_rcu(&dev
->name_hlist
, dev_name_hash(net
, dev
->name
));
1126 write_unlock_bh(&dev_base_lock
);
1128 ret
= call_netdevice_notifiers(NETDEV_CHANGENAME
, dev
);
1129 ret
= notifier_to_errno(ret
);
1132 /* err >= 0 after dev_alloc_name() or stores the first errno */
1135 write_seqcount_begin(&devnet_rename_seq
);
1136 memcpy(dev
->name
, oldname
, IFNAMSIZ
);
1139 pr_err("%s: name change rollback failed: %d\n",
1148 * dev_set_alias - change ifalias of a device
1150 * @alias: name up to IFALIASZ
1151 * @len: limit of bytes to copy from info
1153 * Set ifalias for a device,
/* dev_set_alias(): len == 0 frees any existing alias; otherwise the
 * alias buffer is krealloc'ed to len+1 and the string copied in with
 * NUL termination.  NOTE(review): the len==0 conditional, the
 * krealloc-failure check and the return statements are missing from
 * this extraction. */
1155 int dev_set_alias(struct net_device
*dev
, const char *alias
, size_t len
)
1161 if (len
>= IFALIASZ
)
1165 kfree(dev
->ifalias
);
1166 dev
->ifalias
= NULL
;
1170 new_ifalias
= krealloc(dev
->ifalias
, len
+ 1, GFP_KERNEL
);
1173 dev
->ifalias
= new_ifalias
;
1175 strlcpy(dev
->ifalias
, alias
, len
+1);
1181 * netdev_features_change - device changes features
1182 * @dev: device to cause notification
1184 * Called to indicate a device has changed features.
1186 void netdev_features_change(struct net_device
*dev
)
1188 call_netdevice_notifiers(NETDEV_FEAT_CHANGE
, dev
);
1190 EXPORT_SYMBOL(netdev_features_change
);
1193 * netdev_state_change - device changes state
1194 * @dev: device to cause notification
1196 * Called to indicate a device has changed state. This function calls
1197 * the notifier chains for netdev_chain and sends a NEWLINK message
1198 * to the routing socket.
/* netdev_state_change(): for an up device, fire NETDEV_CHANGE on the
 * notifier chain and announce RTM_NEWLINK to the routing socket; a no-op
 * when the device is down. */
1200 void netdev_state_change(struct net_device
*dev
)
1202 if (dev
->flags
& IFF_UP
) {
1203 call_netdevice_notifiers(NETDEV_CHANGE
, dev
);
1204 rtmsg_ifinfo(RTM_NEWLINK
, dev
, 0);
1207 EXPORT_SYMBOL(netdev_state_change
);
1210 * netdev_notify_peers - notify network peers about existence of @dev
1211 * @dev: network device
1213 * Generate traffic such that interested network peers are aware of
1214 * @dev, such as by generating a gratuitous ARP. This may be used when
1215 * a device wants to inform the rest of the network about some sort of
1216 * reconfiguration such as a failover event or virtual machine
/* netdev_notify_peers(): fires NETDEV_NOTIFY_PEERS on the notifier
 * chain.  NOTE(review): the rtnl_lock()/rtnl_unlock() pair around this
 * call appears to be missing from this extraction. */
1219 void netdev_notify_peers(struct net_device
*dev
)
1222 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS
, dev
);
1225 EXPORT_SYMBOL(netdev_notify_peers
);
1227 static int __dev_open(struct net_device
*dev
)
1229 const struct net_device_ops
*ops
= dev
->netdev_ops
;
1234 if (!netif_device_present(dev
))
1237 /* Block netpoll from trying to do any rx path servicing.
1238 * If we don't do this there is a chance ndo_poll_controller
1239 * or ndo_poll may be running while we open the device
1241 ret
= netpoll_rx_disable(dev
);
1245 ret
= call_netdevice_notifiers(NETDEV_PRE_UP
, dev
);
1246 ret
= notifier_to_errno(ret
);
1250 set_bit(__LINK_STATE_START
, &dev
->state
);
1252 if (ops
->ndo_validate_addr
)
1253 ret
= ops
->ndo_validate_addr(dev
);
1255 if (!ret
&& ops
->ndo_open
)
1256 ret
= ops
->ndo_open(dev
);
1258 netpoll_rx_enable(dev
);
1261 clear_bit(__LINK_STATE_START
, &dev
->state
);
1263 dev
->flags
|= IFF_UP
;
1264 net_dmaengine_get();
1265 dev_set_rx_mode(dev
);
1267 add_device_randomness(dev
->dev_addr
, dev
->addr_len
);
1274 * dev_open - prepare an interface for use.
1275 * @dev: device to open
1277 * Takes a device from down to up state. The device's private open
1278 * function is invoked and then the multicast lists are loaded. Finally
1279 * the device is moved into the up state and a %NETDEV_UP message is
1280 * sent to the netdev notifier chain.
1282 * Calling this function on an active interface is a nop. On a failure
1283 * a negative errno code is returned.
1285 int dev_open(struct net_device
*dev
)
1289 if (dev
->flags
& IFF_UP
)
1292 ret
= __dev_open(dev
);
1296 rtmsg_ifinfo(RTM_NEWLINK
, dev
, IFF_UP
|IFF_RUNNING
);
1297 call_netdevice_notifiers(NETDEV_UP
, dev
);
1301 EXPORT_SYMBOL(dev_open
);
1303 static int __dev_close_many(struct list_head
*head
)
1305 struct net_device
*dev
;
1310 list_for_each_entry(dev
, head
, unreg_list
) {
1311 call_netdevice_notifiers(NETDEV_GOING_DOWN
, dev
);
1313 clear_bit(__LINK_STATE_START
, &dev
->state
);
1315 /* Synchronize to scheduled poll. We cannot touch poll list, it
1316 * can be even on different cpu. So just clear netif_running().
1318 * dev->stop() will invoke napi_disable() on all of it's
1319 * napi_struct instances on this device.
1321 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1324 dev_deactivate_many(head
);
1326 list_for_each_entry(dev
, head
, unreg_list
) {
1327 const struct net_device_ops
*ops
= dev
->netdev_ops
;
1330 * Call the device specific close. This cannot fail.
1331 * Only if device is UP
1333 * We allow it to be called even after a DETACH hot-plug
1339 dev
->flags
&= ~IFF_UP
;
1340 net_dmaengine_put();
1346 static int __dev_close(struct net_device
*dev
)
1351 /* Temporarily disable netpoll until the interface is down */
1352 retval
= netpoll_rx_disable(dev
);
1356 list_add(&dev
->unreg_list
, &single
);
1357 retval
= __dev_close_many(&single
);
1360 netpoll_rx_enable(dev
);
1364 static int dev_close_many(struct list_head
*head
)
1366 struct net_device
*dev
, *tmp
;
1367 LIST_HEAD(tmp_list
);
1369 list_for_each_entry_safe(dev
, tmp
, head
, unreg_list
)
1370 if (!(dev
->flags
& IFF_UP
))
1371 list_move(&dev
->unreg_list
, &tmp_list
);
1373 __dev_close_many(head
);
1375 list_for_each_entry(dev
, head
, unreg_list
) {
1376 rtmsg_ifinfo(RTM_NEWLINK
, dev
, IFF_UP
|IFF_RUNNING
);
1377 call_netdevice_notifiers(NETDEV_DOWN
, dev
);
1380 /* rollback_registered_many needs the complete original list */
1381 list_splice(&tmp_list
, head
);
1386 * dev_close - shutdown an interface.
1387 * @dev: device to shutdown
1389 * This function moves an active device into down state. A
1390 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1391 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1394 int dev_close(struct net_device
*dev
)
1397 if (dev
->flags
& IFF_UP
) {
1400 /* Block netpoll rx while the interface is going down */
1401 ret
= netpoll_rx_disable(dev
);
1405 list_add(&dev
->unreg_list
, &single
);
1406 dev_close_many(&single
);
1409 netpoll_rx_enable(dev
);
1413 EXPORT_SYMBOL(dev_close
);
1417 * dev_disable_lro - disable Large Receive Offload on a device
1420 * Disable Large Receive Offload (LRO) on a net device. Must be
1421 * called under RTNL. This is needed if received packets may be
1422 * forwarded to another interface.
1424 void dev_disable_lro(struct net_device
*dev
)
1427 * If we're trying to disable lro on a vlan device
1428 * use the underlying physical device instead
1430 if (is_vlan_dev(dev
))
1431 dev
= vlan_dev_real_dev(dev
);
1433 dev
->wanted_features
&= ~NETIF_F_LRO
;
1434 netdev_update_features(dev
);
1436 if (unlikely(dev
->features
& NETIF_F_LRO
))
1437 netdev_WARN(dev
, "failed to disable LRO!\n");
1439 EXPORT_SYMBOL(dev_disable_lro
);
/* Non-zero until net_dev_init() has run; lets notifier registration
 * skip the replay of REGISTER/UP events during early boot.
 */
static int dev_boot_phase = 1;
1445 * register_netdevice_notifier - register a network notifier block
1448 * Register a notifier to be called when network device events occur.
1449 * The notifier passed is linked into the kernel structures and must
1450 * not be reused until it has been unregistered. A negative errno code
1451 * is returned on a failure.
1453 * When registered all registration and up events are replayed
1454 * to the new notifier to allow device to have a race free
1455 * view of the network device list.
1458 int register_netdevice_notifier(struct notifier_block
*nb
)
1460 struct net_device
*dev
;
1461 struct net_device
*last
;
1466 err
= raw_notifier_chain_register(&netdev_chain
, nb
);
1472 for_each_netdev(net
, dev
) {
1473 err
= nb
->notifier_call(nb
, NETDEV_REGISTER
, dev
);
1474 err
= notifier_to_errno(err
);
1478 if (!(dev
->flags
& IFF_UP
))
1481 nb
->notifier_call(nb
, NETDEV_UP
, dev
);
1492 for_each_netdev(net
, dev
) {
1496 if (dev
->flags
& IFF_UP
) {
1497 nb
->notifier_call(nb
, NETDEV_GOING_DOWN
, dev
);
1498 nb
->notifier_call(nb
, NETDEV_DOWN
, dev
);
1500 nb
->notifier_call(nb
, NETDEV_UNREGISTER
, dev
);
1505 raw_notifier_chain_unregister(&netdev_chain
, nb
);
1508 EXPORT_SYMBOL(register_netdevice_notifier
);
1511 * unregister_netdevice_notifier - unregister a network notifier block
1514 * Unregister a notifier previously registered by
1515 * register_netdevice_notifier(). The notifier is unlinked into the
1516 * kernel structures and may then be reused. A negative errno code
1517 * is returned on a failure.
1519 * After unregistering unregister and down device events are synthesized
1520 * for all devices on the device list to the removed notifier to remove
1521 * the need for special case cleanup code.
1524 int unregister_netdevice_notifier(struct notifier_block
*nb
)
1526 struct net_device
*dev
;
1531 err
= raw_notifier_chain_unregister(&netdev_chain
, nb
);
1536 for_each_netdev(net
, dev
) {
1537 if (dev
->flags
& IFF_UP
) {
1538 nb
->notifier_call(nb
, NETDEV_GOING_DOWN
, dev
);
1539 nb
->notifier_call(nb
, NETDEV_DOWN
, dev
);
1541 nb
->notifier_call(nb
, NETDEV_UNREGISTER
, dev
);
1548 EXPORT_SYMBOL(unregister_netdevice_notifier
);
1551 * call_netdevice_notifiers - call all network notifier blocks
1552 * @val: value passed unmodified to notifier function
1553 * @dev: net_device pointer passed unmodified to notifier function
1555 * Call all network notifier blocks. Parameters and return value
1556 * are as for raw_notifier_call_chain().
1559 int call_netdevice_notifiers(unsigned long val
, struct net_device
*dev
)
1562 return raw_notifier_call_chain(&netdev_chain
, val
, dev
);
1564 EXPORT_SYMBOL(call_netdevice_notifiers
);
1566 static struct static_key netstamp_needed __read_mostly
;
1567 #ifdef HAVE_JUMP_LABEL
1568 /* We are not allowed to call static_key_slow_dec() from irq context
1569 * If net_disable_timestamp() is called from irq context, defer the
1570 * static_key_slow_dec() calls.
1572 static atomic_t netstamp_needed_deferred
;
1575 void net_enable_timestamp(void)
1577 #ifdef HAVE_JUMP_LABEL
1578 int deferred
= atomic_xchg(&netstamp_needed_deferred
, 0);
1582 static_key_slow_dec(&netstamp_needed
);
1586 WARN_ON(in_interrupt());
1587 static_key_slow_inc(&netstamp_needed
);
1589 EXPORT_SYMBOL(net_enable_timestamp
);
1591 void net_disable_timestamp(void)
1593 #ifdef HAVE_JUMP_LABEL
1594 if (in_interrupt()) {
1595 atomic_inc(&netstamp_needed_deferred
);
1599 static_key_slow_dec(&netstamp_needed
);
1601 EXPORT_SYMBOL(net_disable_timestamp
);
1603 static inline void net_timestamp_set(struct sk_buff
*skb
)
1605 skb
->tstamp
.tv64
= 0;
1606 if (static_key_false(&netstamp_needed
))
1607 __net_timestamp(skb
);
/* Timestamp SKB if needed: only when timestamping is globally enabled,
 * COND holds, and the skb has not been stamped already.
 */
#define net_timestamp_check(COND, SKB)			\
	if (static_key_false(&netstamp_needed)) {	\
		if ((COND) && !(SKB)->tstamp.tv64)	\
			__net_timestamp(SKB);		\
	}						\
1616 static inline bool is_skb_forwardable(struct net_device *dev,
1617 struct sk_buff
*skb
)
1621 if (!(dev
->flags
& IFF_UP
))
1624 len
= dev
->mtu
+ dev
->hard_header_len
+ VLAN_HLEN
;
1625 if (skb
->len
<= len
)
1628 /* if TSO is enabled, we don't care about the length as the packet
1629 * could be forwarded without being segmented before
1631 if (skb_is_gso(skb
))
1638 * dev_forward_skb - loopback an skb to another netif
1640 * @dev: destination network device
1641 * @skb: buffer to forward
1644 * NET_RX_SUCCESS (no congestion)
1645 * NET_RX_DROP (packet was dropped, but freed)
1647 * dev_forward_skb can be used for injecting an skb from the
1648 * start_xmit function of one device into the receive queue
1649 * of another device.
1651 * The receiving device may be in another namespace, so
1652 * we have to clear all information in the skb that could
1653 * impact namespace isolation.
1655 int dev_forward_skb(struct net_device
*dev
, struct sk_buff
*skb
)
1657 if (skb_shinfo(skb
)->tx_flags
& SKBTX_DEV_ZEROCOPY
) {
1658 if (skb_copy_ubufs(skb
, GFP_ATOMIC
)) {
1659 atomic_long_inc(&dev
->rx_dropped
);
1668 if (unlikely(!is_skb_forwardable(dev
, skb
))) {
1669 atomic_long_inc(&dev
->rx_dropped
);
1676 skb
->tstamp
.tv64
= 0;
1677 skb
->pkt_type
= PACKET_HOST
;
1678 skb
->protocol
= eth_type_trans(skb
, dev
);
1682 return netif_rx(skb
);
1684 EXPORT_SYMBOL_GPL(dev_forward_skb
);
1686 static inline int deliver_skb(struct sk_buff
*skb
,
1687 struct packet_type
*pt_prev
,
1688 struct net_device
*orig_dev
)
1690 if (unlikely(skb_orphan_frags(skb
, GFP_ATOMIC
)))
1692 atomic_inc(&skb
->users
);
1693 return pt_prev
->func(skb
, skb
->dev
, pt_prev
, orig_dev
);
1696 static inline bool skb_loop_sk(struct packet_type
*ptype
, struct sk_buff
*skb
)
1698 if (!ptype
->af_packet_priv
|| !skb
->sk
)
1701 if (ptype
->id_match
)
1702 return ptype
->id_match(ptype
, skb
->sk
);
1703 else if ((struct sock
*)ptype
->af_packet_priv
== skb
->sk
)
1710 * Support routine. Sends outgoing frames to any network
1711 * taps currently in use.
1714 static void dev_queue_xmit_nit(struct sk_buff
*skb
, struct net_device
*dev
)
1716 struct packet_type
*ptype
;
1717 struct sk_buff
*skb2
= NULL
;
1718 struct packet_type
*pt_prev
= NULL
;
1721 list_for_each_entry_rcu(ptype
, &ptype_all
, list
) {
1722 /* Never send packets back to the socket
1723 * they originated from - MvS (miquels@drinkel.ow.org)
1725 if ((ptype
->dev
== dev
|| !ptype
->dev
) &&
1726 (!skb_loop_sk(ptype
, skb
))) {
1728 deliver_skb(skb2
, pt_prev
, skb
->dev
);
1733 skb2
= skb_clone(skb
, GFP_ATOMIC
);
1737 net_timestamp_set(skb2
);
1739 /* skb->nh should be correctly
1740 set by sender, so that the second statement is
1741 just protection against buggy protocols.
1743 skb_reset_mac_header(skb2
);
1745 if (skb_network_header(skb2
) < skb2
->data
||
1746 skb2
->network_header
> skb2
->tail
) {
1747 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1748 ntohs(skb2
->protocol
),
1750 skb_reset_network_header(skb2
);
1753 skb2
->transport_header
= skb2
->network_header
;
1754 skb2
->pkt_type
= PACKET_OUTGOING
;
1759 pt_prev
->func(skb2
, skb
->dev
, pt_prev
, skb
->dev
);
1764 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1765 * @dev: Network device
1766 * @txq: number of queues available
1768 * If real_num_tx_queues is changed the tc mappings may no longer be
1769 * valid. To resolve this verify the tc mapping remains valid and if
1770 * not NULL the mapping. With no priorities mapping to this
1771 * offset/count pair it will no longer be used. In the worst case TC0
1772 * is invalid nothing can be done so disable priority mappings. If is
1773 * expected that drivers will fix this mapping if they can before
1774 * calling netif_set_real_num_tx_queues.
1776 static void netif_setup_tc(struct net_device
*dev
, unsigned int txq
)
1779 struct netdev_tc_txq
*tc
= &dev
->tc_to_txq
[0];
1781 /* If TC0 is invalidated disable TC mapping */
1782 if (tc
->offset
+ tc
->count
> txq
) {
1783 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1788 /* Invalidated prio to tc mappings set to TC0 */
1789 for (i
= 1; i
< TC_BITMASK
+ 1; i
++) {
1790 int q
= netdev_get_prio_tc_map(dev
, i
);
1792 tc
= &dev
->tc_to_txq
[q
];
1793 if (tc
->offset
+ tc
->count
> txq
) {
1794 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1796 netdev_set_prio_tc_map(dev
, i
, 0);
/* Serializes all writers of dev->xps_maps; readers use RCU. */
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1806 static struct xps_map
*remove_xps_queue(struct xps_dev_maps
*dev_maps
,
1809 struct xps_map
*map
= NULL
;
1813 map
= xmap_dereference(dev_maps
->cpu_map
[cpu
]);
1815 for (pos
= 0; map
&& pos
< map
->len
; pos
++) {
1816 if (map
->queues
[pos
] == index
) {
1818 map
->queues
[pos
] = map
->queues
[--map
->len
];
1820 RCU_INIT_POINTER(dev_maps
->cpu_map
[cpu
], NULL
);
1821 kfree_rcu(map
, rcu
);
1831 static void netif_reset_xps_queues_gt(struct net_device
*dev
, u16 index
)
1833 struct xps_dev_maps
*dev_maps
;
1835 bool active
= false;
1837 mutex_lock(&xps_map_mutex
);
1838 dev_maps
= xmap_dereference(dev
->xps_maps
);
1843 for_each_possible_cpu(cpu
) {
1844 for (i
= index
; i
< dev
->num_tx_queues
; i
++) {
1845 if (!remove_xps_queue(dev_maps
, cpu
, i
))
1848 if (i
== dev
->num_tx_queues
)
1853 RCU_INIT_POINTER(dev
->xps_maps
, NULL
);
1854 kfree_rcu(dev_maps
, rcu
);
1857 for (i
= index
; i
< dev
->num_tx_queues
; i
++)
1858 netdev_queue_numa_node_write(netdev_get_tx_queue(dev
, i
),
1862 mutex_unlock(&xps_map_mutex
);
1865 static struct xps_map
*expand_xps_map(struct xps_map
*map
,
1868 struct xps_map
*new_map
;
1869 int alloc_len
= XPS_MIN_MAP_ALLOC
;
1872 for (pos
= 0; map
&& pos
< map
->len
; pos
++) {
1873 if (map
->queues
[pos
] != index
)
1878 /* Need to add queue to this CPU's existing map */
1880 if (pos
< map
->alloc_len
)
1883 alloc_len
= map
->alloc_len
* 2;
1886 /* Need to allocate new map to store queue on this CPU's map */
1887 new_map
= kzalloc_node(XPS_MAP_SIZE(alloc_len
), GFP_KERNEL
,
1892 for (i
= 0; i
< pos
; i
++)
1893 new_map
->queues
[i
] = map
->queues
[i
];
1894 new_map
->alloc_len
= alloc_len
;
1900 int netif_set_xps_queue(struct net_device
*dev
, struct cpumask
*mask
, u16 index
)
1902 struct xps_dev_maps
*dev_maps
, *new_dev_maps
= NULL
;
1903 struct xps_map
*map
, *new_map
;
1904 int maps_sz
= max_t(unsigned int, XPS_DEV_MAPS_SIZE
, L1_CACHE_BYTES
);
1905 int cpu
, numa_node_id
= -2;
1906 bool active
= false;
1908 mutex_lock(&xps_map_mutex
);
1910 dev_maps
= xmap_dereference(dev
->xps_maps
);
1912 /* allocate memory for queue storage */
1913 for_each_online_cpu(cpu
) {
1914 if (!cpumask_test_cpu(cpu
, mask
))
1918 new_dev_maps
= kzalloc(maps_sz
, GFP_KERNEL
);
1922 map
= dev_maps
? xmap_dereference(dev_maps
->cpu_map
[cpu
]) :
1925 map
= expand_xps_map(map
, cpu
, index
);
1929 RCU_INIT_POINTER(new_dev_maps
->cpu_map
[cpu
], map
);
1933 goto out_no_new_maps
;
1935 for_each_possible_cpu(cpu
) {
1936 if (cpumask_test_cpu(cpu
, mask
) && cpu_online(cpu
)) {
1937 /* add queue to CPU maps */
1940 map
= xmap_dereference(new_dev_maps
->cpu_map
[cpu
]);
1941 while ((pos
< map
->len
) && (map
->queues
[pos
] != index
))
1944 if (pos
== map
->len
)
1945 map
->queues
[map
->len
++] = index
;
1947 if (numa_node_id
== -2)
1948 numa_node_id
= cpu_to_node(cpu
);
1949 else if (numa_node_id
!= cpu_to_node(cpu
))
1952 } else if (dev_maps
) {
1953 /* fill in the new device map from the old device map */
1954 map
= xmap_dereference(dev_maps
->cpu_map
[cpu
]);
1955 RCU_INIT_POINTER(new_dev_maps
->cpu_map
[cpu
], map
);
1960 rcu_assign_pointer(dev
->xps_maps
, new_dev_maps
);
1962 /* Cleanup old maps */
1964 for_each_possible_cpu(cpu
) {
1965 new_map
= xmap_dereference(new_dev_maps
->cpu_map
[cpu
]);
1966 map
= xmap_dereference(dev_maps
->cpu_map
[cpu
]);
1967 if (map
&& map
!= new_map
)
1968 kfree_rcu(map
, rcu
);
1971 kfree_rcu(dev_maps
, rcu
);
1974 dev_maps
= new_dev_maps
;
1978 /* update Tx queue numa node */
1979 netdev_queue_numa_node_write(netdev_get_tx_queue(dev
, index
),
1980 (numa_node_id
>= 0) ? numa_node_id
:
1986 /* removes queue from unused CPUs */
1987 for_each_possible_cpu(cpu
) {
1988 if (cpumask_test_cpu(cpu
, mask
) && cpu_online(cpu
))
1991 if (remove_xps_queue(dev_maps
, cpu
, index
))
1995 /* free map if not active */
1997 RCU_INIT_POINTER(dev
->xps_maps
, NULL
);
1998 kfree_rcu(dev_maps
, rcu
);
2002 mutex_unlock(&xps_map_mutex
);
2006 /* remove any maps that we added */
2007 for_each_possible_cpu(cpu
) {
2008 new_map
= xmap_dereference(new_dev_maps
->cpu_map
[cpu
]);
2009 map
= dev_maps
? xmap_dereference(dev_maps
->cpu_map
[cpu
]) :
2011 if (new_map
&& new_map
!= map
)
2015 mutex_unlock(&xps_map_mutex
);
2017 kfree(new_dev_maps
);
2020 EXPORT_SYMBOL(netif_set_xps_queue
);
2024 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2025 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2027 int netif_set_real_num_tx_queues(struct net_device
*dev
, unsigned int txq
)
2031 if (txq
< 1 || txq
> dev
->num_tx_queues
)
2034 if (dev
->reg_state
== NETREG_REGISTERED
||
2035 dev
->reg_state
== NETREG_UNREGISTERING
) {
2038 rc
= netdev_queue_update_kobjects(dev
, dev
->real_num_tx_queues
,
2044 netif_setup_tc(dev
, txq
);
2046 if (txq
< dev
->real_num_tx_queues
) {
2047 qdisc_reset_all_tx_gt(dev
, txq
);
2049 netif_reset_xps_queues_gt(dev
, txq
);
2054 dev
->real_num_tx_queues
= txq
;
2057 EXPORT_SYMBOL(netif_set_real_num_tx_queues
);
2061 * netif_set_real_num_rx_queues - set actual number of RX queues used
2062 * @dev: Network device
2063 * @rxq: Actual number of RX queues
2065 * This must be called either with the rtnl_lock held or before
2066 * registration of the net device. Returns 0 on success, or a
2067 * negative error code. If called before registration, it always
2070 int netif_set_real_num_rx_queues(struct net_device
*dev
, unsigned int rxq
)
2074 if (rxq
< 1 || rxq
> dev
->num_rx_queues
)
2077 if (dev
->reg_state
== NETREG_REGISTERED
) {
2080 rc
= net_rx_queue_update_kobjects(dev
, dev
->real_num_rx_queues
,
2086 dev
->real_num_rx_queues
= rxq
;
2089 EXPORT_SYMBOL(netif_set_real_num_rx_queues
);
2093 * netif_get_num_default_rss_queues - default number of RSS queues
2095 * This routine should set an upper limit on the number of RSS queues
2096 * used by default by multiqueue devices.
2098 int netif_get_num_default_rss_queues(void)
2100 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES
, num_online_cpus());
2102 EXPORT_SYMBOL(netif_get_num_default_rss_queues
);
2104 static inline void __netif_reschedule(struct Qdisc
*q
)
2106 struct softnet_data
*sd
;
2107 unsigned long flags
;
2109 local_irq_save(flags
);
2110 sd
= &__get_cpu_var(softnet_data
);
2111 q
->next_sched
= NULL
;
2112 *sd
->output_queue_tailp
= q
;
2113 sd
->output_queue_tailp
= &q
->next_sched
;
2114 raise_softirq_irqoff(NET_TX_SOFTIRQ
);
2115 local_irq_restore(flags
);
2118 void __netif_schedule(struct Qdisc
*q
)
2120 if (!test_and_set_bit(__QDISC_STATE_SCHED
, &q
->state
))
2121 __netif_reschedule(q
);
2123 EXPORT_SYMBOL(__netif_schedule
);
2125 void dev_kfree_skb_irq(struct sk_buff
*skb
)
2127 if (atomic_dec_and_test(&skb
->users
)) {
2128 struct softnet_data
*sd
;
2129 unsigned long flags
;
2131 local_irq_save(flags
);
2132 sd
= &__get_cpu_var(softnet_data
);
2133 skb
->next
= sd
->completion_queue
;
2134 sd
->completion_queue
= skb
;
2135 raise_softirq_irqoff(NET_TX_SOFTIRQ
);
2136 local_irq_restore(flags
);
2139 EXPORT_SYMBOL(dev_kfree_skb_irq
);
/* Free @skb from any context: defer to the softirq path when called
 * with irqs off or in hardirq, free directly otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled())
		dev_kfree_skb_irq(skb);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
2152 * netif_device_detach - mark device as removed
2153 * @dev: network device
2155 * Mark device as removed from system and therefore no longer available.
2157 void netif_device_detach(struct net_device
*dev
)
2159 if (test_and_clear_bit(__LINK_STATE_PRESENT
, &dev
->state
) &&
2160 netif_running(dev
)) {
2161 netif_tx_stop_all_queues(dev
);
2164 EXPORT_SYMBOL(netif_device_detach
);
2167 * netif_device_attach - mark device as attached
2168 * @dev: network device
2170 * Mark device as attached from system and restart if needed.
2172 void netif_device_attach(struct net_device
*dev
)
2174 if (!test_and_set_bit(__LINK_STATE_PRESENT
, &dev
->state
) &&
2175 netif_running(dev
)) {
2176 netif_tx_wake_all_queues(dev
);
2177 __netdev_watchdog_up(dev
);
2180 EXPORT_SYMBOL(netif_device_attach
);
2182 static void skb_warn_bad_offload(const struct sk_buff
*skb
)
2184 static const netdev_features_t null_features
= 0;
2185 struct net_device
*dev
= skb
->dev
;
2186 const char *driver
= "";
2188 if (dev
&& dev
->dev
.parent
)
2189 driver
= dev_driver_string(dev
->dev
.parent
);
2191 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2192 "gso_type=%d ip_summed=%d\n",
2193 driver
, dev
? &dev
->features
: &null_features
,
2194 skb
->sk
? &skb
->sk
->sk_route_caps
: &null_features
,
2195 skb
->len
, skb
->data_len
, skb_shinfo(skb
)->gso_size
,
2196 skb_shinfo(skb
)->gso_type
, skb
->ip_summed
);
2200 * Invalidate hardware checksum when packet is to be mangled, and
2201 * complete checksum manually on outgoing path.
2203 int skb_checksum_help(struct sk_buff
*skb
)
2206 int ret
= 0, offset
;
2208 if (skb
->ip_summed
== CHECKSUM_COMPLETE
)
2209 goto out_set_summed
;
2211 if (unlikely(skb_shinfo(skb
)->gso_size
)) {
2212 skb_warn_bad_offload(skb
);
2216 /* Before computing a checksum, we should make sure no frag could
2217 * be modified by an external entity : checksum could be wrong.
2219 if (skb_has_shared_frag(skb
)) {
2220 ret
= __skb_linearize(skb
);
2225 offset
= skb_checksum_start_offset(skb
);
2226 BUG_ON(offset
>= skb_headlen(skb
));
2227 csum
= skb_checksum(skb
, offset
, skb
->len
- offset
, 0);
2229 offset
+= skb
->csum_offset
;
2230 BUG_ON(offset
+ sizeof(__sum16
) > skb_headlen(skb
));
2232 if (skb_cloned(skb
) &&
2233 !skb_clone_writable(skb
, offset
+ sizeof(__sum16
))) {
2234 ret
= pskb_expand_head(skb
, 0, 0, GFP_ATOMIC
);
2239 *(__sum16
*)(skb
->data
+ offset
) = csum_fold(csum
);
2241 skb
->ip_summed
= CHECKSUM_NONE
;
2245 EXPORT_SYMBOL(skb_checksum_help
);
2248 * skb_mac_gso_segment - mac layer segmentation handler.
2249 * @skb: buffer to segment
2250 * @features: features for the output path (see dev->features)
2252 struct sk_buff
*skb_mac_gso_segment(struct sk_buff
*skb
,
2253 netdev_features_t features
)
2255 struct sk_buff
*segs
= ERR_PTR(-EPROTONOSUPPORT
);
2256 struct packet_offload
*ptype
;
2257 __be16 type
= skb
->protocol
;
2259 while (type
== htons(ETH_P_8021Q
)) {
2260 int vlan_depth
= ETH_HLEN
;
2261 struct vlan_hdr
*vh
;
2263 if (unlikely(!pskb_may_pull(skb
, vlan_depth
+ VLAN_HLEN
)))
2264 return ERR_PTR(-EINVAL
);
2266 vh
= (struct vlan_hdr
*)(skb
->data
+ vlan_depth
);
2267 type
= vh
->h_vlan_encapsulated_proto
;
2268 vlan_depth
+= VLAN_HLEN
;
2271 __skb_pull(skb
, skb
->mac_len
);
2274 list_for_each_entry_rcu(ptype
, &offload_base
, list
) {
2275 if (ptype
->type
== type
&& ptype
->callbacks
.gso_segment
) {
2276 if (unlikely(skb
->ip_summed
!= CHECKSUM_PARTIAL
)) {
2279 err
= ptype
->callbacks
.gso_send_check(skb
);
2280 segs
= ERR_PTR(err
);
2281 if (err
|| skb_gso_ok(skb
, features
))
2283 __skb_push(skb
, (skb
->data
-
2284 skb_network_header(skb
)));
2286 segs
= ptype
->callbacks
.gso_segment(skb
, features
);
2292 __skb_push(skb
, skb
->data
- skb_mac_header(skb
));
2296 EXPORT_SYMBOL(skb_mac_gso_segment
);
2299 /* openvswitch calls this on rx path, so we need a different check.
2301 static inline bool skb_needs_check(struct sk_buff
*skb
, bool tx_path
)
2304 return skb
->ip_summed
!= CHECKSUM_PARTIAL
;
2306 return skb
->ip_summed
== CHECKSUM_NONE
;
2310 * __skb_gso_segment - Perform segmentation on skb.
2311 * @skb: buffer to segment
2312 * @features: features for the output path (see dev->features)
2313 * @tx_path: whether it is called in TX path
2315 * This function segments the given skb and returns a list of segments.
2317 * It may return NULL if the skb requires no segmentation. This is
2318 * only possible when GSO is used for verifying header integrity.
2320 struct sk_buff
*__skb_gso_segment(struct sk_buff
*skb
,
2321 netdev_features_t features
, bool tx_path
)
2323 if (unlikely(skb_needs_check(skb
, tx_path
))) {
2326 skb_warn_bad_offload(skb
);
2328 if (skb_header_cloned(skb
) &&
2329 (err
= pskb_expand_head(skb
, 0, 0, GFP_ATOMIC
)))
2330 return ERR_PTR(err
);
2333 SKB_GSO_CB(skb
)->mac_offset
= skb_headroom(skb
);
2334 skb_reset_mac_header(skb
);
2335 skb_reset_mac_len(skb
);
2337 return skb_mac_gso_segment(skb
, features
);
2339 EXPORT_SYMBOL(__skb_gso_segment
);
2341 /* Take action when hardware reception checksum errors are detected. */
2343 void netdev_rx_csum_fault(struct net_device
*dev
)
2345 if (net_ratelimit()) {
2346 pr_err("%s: hw csum failure\n", dev
? dev
->name
: "<unknown>");
2350 EXPORT_SYMBOL(netdev_rx_csum_fault
);
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *pdev = dev->dev.parent;

		if (!pdev)
			return 0;
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			dma_addr_t addr = page_to_phys(skb_frag_page(frag));

			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
/* Per-skb state stashed in skb->cb while a GSO chain is in flight:
 * saves the original destructor so it can be restored after the
 * segments are transmitted.
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2392 static void dev_gso_skb_destructor(struct sk_buff
*skb
)
2394 struct dev_gso_cb
*cb
;
2397 struct sk_buff
*nskb
= skb
->next
;
2399 skb
->next
= nskb
->next
;
2402 } while (skb
->next
);
2404 cb
= DEV_GSO_CB(skb
);
2406 cb
->destructor(skb
);
2410 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2411 * @skb: buffer to segment
2412 * @features: device features as applicable to this skb
2414 * This function segments the given skb and stores the list of segments
2417 static int dev_gso_segment(struct sk_buff
*skb
, netdev_features_t features
)
2419 struct sk_buff
*segs
;
2421 segs
= skb_gso_segment(skb
, features
);
2423 /* Verifying header integrity only. */
2428 return PTR_ERR(segs
);
2431 DEV_GSO_CB(skb
)->destructor
= skb
->destructor
;
2432 skb
->destructor
= dev_gso_skb_destructor
;
2437 static bool can_checksum_protocol(netdev_features_t features
, __be16 protocol
)
2439 return ((features
& NETIF_F_GEN_CSUM
) ||
2440 ((features
& NETIF_F_V4_CSUM
) &&
2441 protocol
== htons(ETH_P_IP
)) ||
2442 ((features
& NETIF_F_V6_CSUM
) &&
2443 protocol
== htons(ETH_P_IPV6
)) ||
2444 ((features
& NETIF_F_FCOE_CRC
) &&
2445 protocol
== htons(ETH_P_FCOE
)));
2448 static netdev_features_t
harmonize_features(struct sk_buff
*skb
,
2449 __be16 protocol
, netdev_features_t features
)
2451 if (skb
->ip_summed
!= CHECKSUM_NONE
&&
2452 !can_checksum_protocol(features
, protocol
)) {
2453 features
&= ~NETIF_F_ALL_CSUM
;
2454 features
&= ~NETIF_F_SG
;
2455 } else if (illegal_highdma(skb
->dev
, skb
)) {
2456 features
&= ~NETIF_F_SG
;
2462 netdev_features_t
netif_skb_features(struct sk_buff
*skb
)
2464 __be16 protocol
= skb
->protocol
;
2465 netdev_features_t features
= skb
->dev
->features
;
2467 if (skb_shinfo(skb
)->gso_segs
> skb
->dev
->gso_max_segs
)
2468 features
&= ~NETIF_F_GSO_MASK
;
2470 if (protocol
== htons(ETH_P_8021Q
)) {
2471 struct vlan_ethhdr
*veh
= (struct vlan_ethhdr
*)skb
->data
;
2472 protocol
= veh
->h_vlan_encapsulated_proto
;
2473 } else if (!vlan_tx_tag_present(skb
)) {
2474 return harmonize_features(skb
, protocol
, features
);
2477 features
&= (skb
->dev
->vlan_features
| NETIF_F_HW_VLAN_TX
);
2479 if (protocol
!= htons(ETH_P_8021Q
)) {
2480 return harmonize_features(skb
, protocol
, features
);
2482 features
&= NETIF_F_SG
| NETIF_F_HIGHDMA
| NETIF_F_FRAGLIST
|
2483 NETIF_F_GEN_CSUM
| NETIF_F_HW_VLAN_TX
;
2484 return harmonize_features(skb
, protocol
, features
);
2487 EXPORT_SYMBOL(netif_skb_features
);
2490 * Returns true if either:
2491 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2492 * 2. skb is fragmented and the device does not support SG.
2494 static inline int skb_needs_linearize(struct sk_buff
*skb
,
2497 return skb_is_nonlinear(skb
) &&
2498 ((skb_has_frag_list(skb
) &&
2499 !(features
& NETIF_F_FRAGLIST
)) ||
2500 (skb_shinfo(skb
)->nr_frags
&&
2501 !(features
& NETIF_F_SG
)));
2504 int dev_hard_start_xmit(struct sk_buff
*skb
, struct net_device
*dev
,
2505 struct netdev_queue
*txq
)
2507 const struct net_device_ops
*ops
= dev
->netdev_ops
;
2508 int rc
= NETDEV_TX_OK
;
2509 unsigned int skb_len
;
2511 if (likely(!skb
->next
)) {
2512 netdev_features_t features
;
2515 * If device doesn't need skb->dst, release it right now while
2516 * its hot in this cpu cache
2518 if (dev
->priv_flags
& IFF_XMIT_DST_RELEASE
)
2521 features
= netif_skb_features(skb
);
2523 if (vlan_tx_tag_present(skb
) &&
2524 !(features
& NETIF_F_HW_VLAN_TX
)) {
2525 skb
= __vlan_put_tag(skb
, vlan_tx_tag_get(skb
));
2532 /* If encapsulation offload request, verify we are testing
2533 * hardware encapsulation features instead of standard
2534 * features for the netdev
2536 if (skb
->encapsulation
)
2537 features
&= dev
->hw_enc_features
;
2539 if (netif_needs_gso(skb
, features
)) {
2540 if (unlikely(dev_gso_segment(skb
, features
)))
2545 if (skb_needs_linearize(skb
, features
) &&
2546 __skb_linearize(skb
))
2549 /* If packet is not checksummed and device does not
2550 * support checksumming for this protocol, complete
2551 * checksumming here.
2553 if (skb
->ip_summed
== CHECKSUM_PARTIAL
) {
2554 if (skb
->encapsulation
)
2555 skb_set_inner_transport_header(skb
,
2556 skb_checksum_start_offset(skb
));
2558 skb_set_transport_header(skb
,
2559 skb_checksum_start_offset(skb
));
2560 if (!(features
& NETIF_F_ALL_CSUM
) &&
2561 skb_checksum_help(skb
))
2566 if (!list_empty(&ptype_all
))
2567 dev_queue_xmit_nit(skb
, dev
);
2570 rc
= ops
->ndo_start_xmit(skb
, dev
);
2571 trace_net_dev_xmit(skb
, rc
, dev
, skb_len
);
2572 if (rc
== NETDEV_TX_OK
)
2573 txq_trans_update(txq
);
2579 struct sk_buff
*nskb
= skb
->next
;
2581 skb
->next
= nskb
->next
;
2585 * If device doesn't need nskb->dst, release it right now while
2586 * its hot in this cpu cache
2588 if (dev
->priv_flags
& IFF_XMIT_DST_RELEASE
)
2591 if (!list_empty(&ptype_all
))
2592 dev_queue_xmit_nit(nskb
, dev
);
2594 skb_len
= nskb
->len
;
2595 rc
= ops
->ndo_start_xmit(nskb
, dev
);
2596 trace_net_dev_xmit(nskb
, rc
, dev
, skb_len
);
2597 if (unlikely(rc
!= NETDEV_TX_OK
)) {
2598 if (rc
& ~NETDEV_TX_MASK
)
2599 goto out_kfree_gso_skb
;
2600 nskb
->next
= skb
->next
;
2604 txq_trans_update(txq
);
2605 if (unlikely(netif_xmit_stopped(txq
) && skb
->next
))
2606 return NETDEV_TX_BUSY
;
2607 } while (skb
->next
);
2610 if (likely(skb
->next
== NULL
))
2611 skb
->destructor
= DEV_GSO_CB(skb
)->destructor
;
2618 static void qdisc_pkt_len_init(struct sk_buff
*skb
)
2620 const struct skb_shared_info
*shinfo
= skb_shinfo(skb
);
2622 qdisc_skb_cb(skb
)->pkt_len
= skb
->len
;
2624 /* To get more precise estimation of bytes sent on wire,
2625 * we add to pkt_len the headers size of all segments
2627 if (shinfo
->gso_size
) {
2628 unsigned int hdr_len
;
2630 /* mac layer + network layer */
2631 hdr_len
= skb_transport_header(skb
) - skb_mac_header(skb
);
2633 /* + transport layer */
2634 if (likely(shinfo
->gso_type
& (SKB_GSO_TCPV4
| SKB_GSO_TCPV6
)))
2635 hdr_len
+= tcp_hdrlen(skb
);
2637 hdr_len
+= sizeof(struct udphdr
);
2638 qdisc_skb_cb(skb
)->pkt_len
+= (shinfo
->gso_segs
- 1) * hdr_len
;
2642 static inline int __dev_xmit_skb(struct sk_buff
*skb
, struct Qdisc
*q
,
2643 struct net_device
*dev
,
2644 struct netdev_queue
*txq
)
2646 spinlock_t
*root_lock
= qdisc_lock(q
);
2650 qdisc_pkt_len_init(skb
);
2651 qdisc_calculate_pkt_len(skb
, q
);
2653 * Heuristic to force contended enqueues to serialize on a
2654 * separate lock before trying to get qdisc main lock.
2655 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2656 * and dequeue packets faster.
2658 contended
= qdisc_is_running(q
);
2659 if (unlikely(contended
))
2660 spin_lock(&q
->busylock
);
2662 spin_lock(root_lock
);
2663 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED
, &q
->state
))) {
2666 } else if ((q
->flags
& TCQ_F_CAN_BYPASS
) && !qdisc_qlen(q
) &&
2667 qdisc_run_begin(q
)) {
2669 * This is a work-conserving queue; there are no old skbs
2670 * waiting to be sent out; and the qdisc is not running -
2671 * xmit the skb directly.
2673 if (!(dev
->priv_flags
& IFF_XMIT_DST_RELEASE
))
2676 qdisc_bstats_update(q
, skb
);
2678 if (sch_direct_xmit(skb
, q
, dev
, txq
, root_lock
)) {
2679 if (unlikely(contended
)) {
2680 spin_unlock(&q
->busylock
);
2687 rc
= NET_XMIT_SUCCESS
;
2690 rc
= q
->enqueue(skb
, q
) & NET_XMIT_MASK
;
2691 if (qdisc_run_begin(q
)) {
2692 if (unlikely(contended
)) {
2693 spin_unlock(&q
->busylock
);
2699 spin_unlock(root_lock
);
2700 if (unlikely(contended
))
2701 spin_unlock(&q
->busylock
);
2705 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2706 static void skb_update_prio(struct sk_buff
*skb
)
2708 struct netprio_map
*map
= rcu_dereference_bh(skb
->dev
->priomap
);
2710 if (!skb
->priority
&& skb
->sk
&& map
) {
2711 unsigned int prioidx
= skb
->sk
->sk_cgrp_prioidx
;
2713 if (prioidx
< map
->priomap_len
)
2714 skb
->priority
= map
->priomap
[prioidx
];
2718 #define skb_update_prio(skb)
2721 static DEFINE_PER_CPU(int, xmit_recursion
);
2722 #define RECURSION_LIMIT 10
2725 * dev_loopback_xmit - loop back @skb
2726 * @skb: buffer to transmit
2728 int dev_loopback_xmit(struct sk_buff
*skb
)
2730 skb_reset_mac_header(skb
);
2731 __skb_pull(skb
, skb_network_offset(skb
));
2732 skb
->pkt_type
= PACKET_LOOPBACK
;
2733 skb
->ip_summed
= CHECKSUM_UNNECESSARY
;
2734 WARN_ON(!skb_dst(skb
));
2739 EXPORT_SYMBOL(dev_loopback_xmit
);
2742 * dev_queue_xmit - transmit a buffer
2743 * @skb: buffer to transmit
2745 * Queue a buffer for transmission to a network device. The caller must
2746 * have set the device and priority and built the buffer before calling
2747 * this function. The function can be called from an interrupt.
2749 * A negative errno code is returned on a failure. A success does not
2750 * guarantee the frame will be transmitted as it may be dropped due
2751 * to congestion or traffic shaping.
2753 * -----------------------------------------------------------------------------------
2754 * I notice this method can also return errors from the queue disciplines,
2755 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2758 * Regardless of the return value, the skb is consumed, so it is currently
2759 * difficult to retry a send to this method. (You can bump the ref count
2760 * before sending to hold a reference for retry if you are careful.)
2762 * When calling this method, interrupts MUST be enabled. This is because
2763 * the BH enable code must have IRQs enabled so that it will not deadlock.
2766 int dev_queue_xmit(struct sk_buff
*skb
)
2768 struct net_device
*dev
= skb
->dev
;
2769 struct netdev_queue
*txq
;
2773 skb_reset_mac_header(skb
);
2775 /* Disable soft irqs for various locks below. Also
2776 * stops preemption for RCU.
2780 skb_update_prio(skb
);
2782 txq
= netdev_pick_tx(dev
, skb
);
2783 q
= rcu_dereference_bh(txq
->qdisc
);
2785 #ifdef CONFIG_NET_CLS_ACT
2786 skb
->tc_verd
= SET_TC_AT(skb
->tc_verd
, AT_EGRESS
);
2788 trace_net_dev_queue(skb
);
2790 rc
= __dev_xmit_skb(skb
, q
, dev
, txq
);
2794 /* The device has no queue. Common case for software devices:
2795 loopback, all the sorts of tunnels...
2797 Really, it is unlikely that netif_tx_lock protection is necessary
2798 here. (f.e. loopback and IP tunnels are clean ignoring statistics
2800 However, it is possible, that they rely on protection
2803 Check this and shot the lock. It is not prone from deadlocks.
2804 Either shot noqueue qdisc, it is even simpler 8)
2806 if (dev
->flags
& IFF_UP
) {
2807 int cpu
= smp_processor_id(); /* ok because BHs are off */
2809 if (txq
->xmit_lock_owner
!= cpu
) {
2811 if (__this_cpu_read(xmit_recursion
) > RECURSION_LIMIT
)
2812 goto recursion_alert
;
2814 HARD_TX_LOCK(dev
, txq
, cpu
);
2816 if (!netif_xmit_stopped(txq
)) {
2817 __this_cpu_inc(xmit_recursion
);
2818 rc
= dev_hard_start_xmit(skb
, dev
, txq
);
2819 __this_cpu_dec(xmit_recursion
);
2820 if (dev_xmit_complete(rc
)) {
2821 HARD_TX_UNLOCK(dev
, txq
);
2825 HARD_TX_UNLOCK(dev
, txq
);
2826 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2829 /* Recursion is detected! It is possible,
2833 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2839 rcu_read_unlock_bh();
2844 rcu_read_unlock_bh();
2847 EXPORT_SYMBOL(dev_queue_xmit
);
2850 /*=======================================================================
2852 =======================================================================*/
2854 int netdev_max_backlog __read_mostly
= 1000;
2855 EXPORT_SYMBOL(netdev_max_backlog
);
2857 int netdev_tstamp_prequeue __read_mostly
= 1;
2858 int netdev_budget __read_mostly
= 300;
2859 int weight_p __read_mostly
= 64; /* old backlog weight */
2861 /* Called with irq disabled */
2862 static inline void ____napi_schedule(struct softnet_data
*sd
,
2863 struct napi_struct
*napi
)
2865 list_add_tail(&napi
->poll_list
, &sd
->poll_list
);
2866 __raise_softirq_irqoff(NET_RX_SOFTIRQ
);
2871 /* One global table that all flow-based protocols share. */
2872 struct rps_sock_flow_table __rcu
*rps_sock_flow_table __read_mostly
;
2873 EXPORT_SYMBOL(rps_sock_flow_table
);
2875 struct static_key rps_needed __read_mostly
;
2877 static struct rps_dev_flow
*
2878 set_rps_cpu(struct net_device
*dev
, struct sk_buff
*skb
,
2879 struct rps_dev_flow
*rflow
, u16 next_cpu
)
2881 if (next_cpu
!= RPS_NO_CPU
) {
2882 #ifdef CONFIG_RFS_ACCEL
2883 struct netdev_rx_queue
*rxqueue
;
2884 struct rps_dev_flow_table
*flow_table
;
2885 struct rps_dev_flow
*old_rflow
;
2890 /* Should we steer this flow to a different hardware queue? */
2891 if (!skb_rx_queue_recorded(skb
) || !dev
->rx_cpu_rmap
||
2892 !(dev
->features
& NETIF_F_NTUPLE
))
2894 rxq_index
= cpu_rmap_lookup_index(dev
->rx_cpu_rmap
, next_cpu
);
2895 if (rxq_index
== skb_get_rx_queue(skb
))
2898 rxqueue
= dev
->_rx
+ rxq_index
;
2899 flow_table
= rcu_dereference(rxqueue
->rps_flow_table
);
2902 flow_id
= skb
->rxhash
& flow_table
->mask
;
2903 rc
= dev
->netdev_ops
->ndo_rx_flow_steer(dev
, skb
,
2904 rxq_index
, flow_id
);
2908 rflow
= &flow_table
->flows
[flow_id
];
2910 if (old_rflow
->filter
== rflow
->filter
)
2911 old_rflow
->filter
= RPS_NO_FILTER
;
2915 per_cpu(softnet_data
, next_cpu
).input_queue_head
;
2918 rflow
->cpu
= next_cpu
;
2923 * get_rps_cpu is called from netif_receive_skb and returns the target
2924 * CPU from the RPS map of the receiving queue for a given skb.
2925 * rcu_read_lock must be held on entry.
2927 static int get_rps_cpu(struct net_device
*dev
, struct sk_buff
*skb
,
2928 struct rps_dev_flow
**rflowp
)
2930 struct netdev_rx_queue
*rxqueue
;
2931 struct rps_map
*map
;
2932 struct rps_dev_flow_table
*flow_table
;
2933 struct rps_sock_flow_table
*sock_flow_table
;
2937 if (skb_rx_queue_recorded(skb
)) {
2938 u16 index
= skb_get_rx_queue(skb
);
2939 if (unlikely(index
>= dev
->real_num_rx_queues
)) {
2940 WARN_ONCE(dev
->real_num_rx_queues
> 1,
2941 "%s received packet on queue %u, but number "
2942 "of RX queues is %u\n",
2943 dev
->name
, index
, dev
->real_num_rx_queues
);
2946 rxqueue
= dev
->_rx
+ index
;
2950 map
= rcu_dereference(rxqueue
->rps_map
);
2952 if (map
->len
== 1 &&
2953 !rcu_access_pointer(rxqueue
->rps_flow_table
)) {
2954 tcpu
= map
->cpus
[0];
2955 if (cpu_online(tcpu
))
2959 } else if (!rcu_access_pointer(rxqueue
->rps_flow_table
)) {
2963 skb_reset_network_header(skb
);
2964 if (!skb_get_rxhash(skb
))
2967 flow_table
= rcu_dereference(rxqueue
->rps_flow_table
);
2968 sock_flow_table
= rcu_dereference(rps_sock_flow_table
);
2969 if (flow_table
&& sock_flow_table
) {
2971 struct rps_dev_flow
*rflow
;
2973 rflow
= &flow_table
->flows
[skb
->rxhash
& flow_table
->mask
];
2976 next_cpu
= sock_flow_table
->ents
[skb
->rxhash
&
2977 sock_flow_table
->mask
];
2980 * If the desired CPU (where last recvmsg was done) is
2981 * different from current CPU (one in the rx-queue flow
2982 * table entry), switch if one of the following holds:
2983 * - Current CPU is unset (equal to RPS_NO_CPU).
2984 * - Current CPU is offline.
2985 * - The current CPU's queue tail has advanced beyond the
2986 * last packet that was enqueued using this table entry.
2987 * This guarantees that all previous packets for the flow
2988 * have been dequeued, thus preserving in order delivery.
2990 if (unlikely(tcpu
!= next_cpu
) &&
2991 (tcpu
== RPS_NO_CPU
|| !cpu_online(tcpu
) ||
2992 ((int)(per_cpu(softnet_data
, tcpu
).input_queue_head
-
2993 rflow
->last_qtail
)) >= 0)) {
2995 rflow
= set_rps_cpu(dev
, skb
, rflow
, next_cpu
);
2998 if (tcpu
!= RPS_NO_CPU
&& cpu_online(tcpu
)) {
3006 tcpu
= map
->cpus
[((u64
) skb
->rxhash
* map
->len
) >> 32];
3008 if (cpu_online(tcpu
)) {
3018 #ifdef CONFIG_RFS_ACCEL
3021 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3022 * @dev: Device on which the filter was set
3023 * @rxq_index: RX queue index
3024 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3025 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3027 * Drivers that implement ndo_rx_flow_steer() should periodically call
3028 * this function for each installed filter and remove the filters for
3029 * which it returns %true.
3031 bool rps_may_expire_flow(struct net_device
*dev
, u16 rxq_index
,
3032 u32 flow_id
, u16 filter_id
)
3034 struct netdev_rx_queue
*rxqueue
= dev
->_rx
+ rxq_index
;
3035 struct rps_dev_flow_table
*flow_table
;
3036 struct rps_dev_flow
*rflow
;
3041 flow_table
= rcu_dereference(rxqueue
->rps_flow_table
);
3042 if (flow_table
&& flow_id
<= flow_table
->mask
) {
3043 rflow
= &flow_table
->flows
[flow_id
];
3044 cpu
= ACCESS_ONCE(rflow
->cpu
);
3045 if (rflow
->filter
== filter_id
&& cpu
!= RPS_NO_CPU
&&
3046 ((int)(per_cpu(softnet_data
, cpu
).input_queue_head
-
3047 rflow
->last_qtail
) <
3048 (int)(10 * flow_table
->mask
)))
3054 EXPORT_SYMBOL(rps_may_expire_flow
);
3056 #endif /* CONFIG_RFS_ACCEL */
3058 /* Called from hardirq (IPI) context */
3059 static void rps_trigger_softirq(void *data
)
3061 struct softnet_data
*sd
= data
;
3063 ____napi_schedule(sd
, &sd
->backlog
);
3067 #endif /* CONFIG_RPS */
3070 * Check if this softnet_data structure is another cpu one
3071 * If yes, queue it to our IPI list and return 1
3074 static int rps_ipi_queued(struct softnet_data
*sd
)
3077 struct softnet_data
*mysd
= &__get_cpu_var(softnet_data
);
3080 sd
->rps_ipi_next
= mysd
->rps_ipi_list
;
3081 mysd
->rps_ipi_list
= sd
;
3083 __raise_softirq_irqoff(NET_RX_SOFTIRQ
);
3086 #endif /* CONFIG_RPS */
3091 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3092 * queue (may be a remote CPU queue).
3094 static int enqueue_to_backlog(struct sk_buff
*skb
, int cpu
,
3095 unsigned int *qtail
)
3097 struct softnet_data
*sd
;
3098 unsigned long flags
;
3100 sd
= &per_cpu(softnet_data
, cpu
);
3102 local_irq_save(flags
);
3105 if (skb_queue_len(&sd
->input_pkt_queue
) <= netdev_max_backlog
) {
3106 if (skb_queue_len(&sd
->input_pkt_queue
)) {
3108 __skb_queue_tail(&sd
->input_pkt_queue
, skb
);
3109 input_queue_tail_incr_save(sd
, qtail
);
3111 local_irq_restore(flags
);
3112 return NET_RX_SUCCESS
;
3115 /* Schedule NAPI for backlog device
3116 * We can use non atomic operation since we own the queue lock
3118 if (!__test_and_set_bit(NAPI_STATE_SCHED
, &sd
->backlog
.state
)) {
3119 if (!rps_ipi_queued(sd
))
3120 ____napi_schedule(sd
, &sd
->backlog
);
3128 local_irq_restore(flags
);
3130 atomic_long_inc(&skb
->dev
->rx_dropped
);
3136 * netif_rx - post buffer to the network code
3137 * @skb: buffer to post
3139 * This function receives a packet from a device driver and queues it for
3140 * the upper (protocol) levels to process. It always succeeds. The buffer
3141 * may be dropped during processing for congestion control or by the
3145 * NET_RX_SUCCESS (no congestion)
3146 * NET_RX_DROP (packet was dropped)
3150 int netif_rx(struct sk_buff
*skb
)
3154 /* if netpoll wants it, pretend we never saw it */
3155 if (netpoll_rx(skb
))
3158 net_timestamp_check(netdev_tstamp_prequeue
, skb
);
3160 trace_netif_rx(skb
);
3162 if (static_key_false(&rps_needed
)) {
3163 struct rps_dev_flow voidflow
, *rflow
= &voidflow
;
3169 cpu
= get_rps_cpu(skb
->dev
, skb
, &rflow
);
3171 cpu
= smp_processor_id();
3173 ret
= enqueue_to_backlog(skb
, cpu
, &rflow
->last_qtail
);
3181 ret
= enqueue_to_backlog(skb
, get_cpu(), &qtail
);
3186 EXPORT_SYMBOL(netif_rx
);
3188 int netif_rx_ni(struct sk_buff
*skb
)
3193 err
= netif_rx(skb
);
3194 if (local_softirq_pending())
3200 EXPORT_SYMBOL(netif_rx_ni
);
3202 static void net_tx_action(struct softirq_action
*h
)
3204 struct softnet_data
*sd
= &__get_cpu_var(softnet_data
);
3206 if (sd
->completion_queue
) {
3207 struct sk_buff
*clist
;
3209 local_irq_disable();
3210 clist
= sd
->completion_queue
;
3211 sd
->completion_queue
= NULL
;
3215 struct sk_buff
*skb
= clist
;
3216 clist
= clist
->next
;
3218 WARN_ON(atomic_read(&skb
->users
));
3219 trace_kfree_skb(skb
, net_tx_action
);
3224 if (sd
->output_queue
) {
3227 local_irq_disable();
3228 head
= sd
->output_queue
;
3229 sd
->output_queue
= NULL
;
3230 sd
->output_queue_tailp
= &sd
->output_queue
;
3234 struct Qdisc
*q
= head
;
3235 spinlock_t
*root_lock
;
3237 head
= head
->next_sched
;
3239 root_lock
= qdisc_lock(q
);
3240 if (spin_trylock(root_lock
)) {
3241 smp_mb__before_clear_bit();
3242 clear_bit(__QDISC_STATE_SCHED
,
3245 spin_unlock(root_lock
);
3247 if (!test_bit(__QDISC_STATE_DEACTIVATED
,
3249 __netif_reschedule(q
);
3251 smp_mb__before_clear_bit();
3252 clear_bit(__QDISC_STATE_SCHED
,
3260 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3261 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3262 /* This hook is defined here for ATM LANE */
3263 int (*br_fdb_test_addr_hook
)(struct net_device
*dev
,
3264 unsigned char *addr
) __read_mostly
;
3265 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook
);
3268 #ifdef CONFIG_NET_CLS_ACT
3269 /* TODO: Maybe we should just force sch_ingress to be compiled in
3270 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3271 * a compare and 2 stores extra right now if we dont have it on
3272 * but have CONFIG_NET_CLS_ACT
3273 * NOTE: This doesn't stop any functionality; if you dont have
3274 * the ingress scheduler, you just can't add policies on ingress.
3277 static int ing_filter(struct sk_buff
*skb
, struct netdev_queue
*rxq
)
3279 struct net_device
*dev
= skb
->dev
;
3280 u32 ttl
= G_TC_RTTL(skb
->tc_verd
);
3281 int result
= TC_ACT_OK
;
3284 if (unlikely(MAX_RED_LOOP
< ttl
++)) {
3285 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3286 skb
->skb_iif
, dev
->ifindex
);
3290 skb
->tc_verd
= SET_TC_RTTL(skb
->tc_verd
, ttl
);
3291 skb
->tc_verd
= SET_TC_AT(skb
->tc_verd
, AT_INGRESS
);
3294 if (q
!= &noop_qdisc
) {
3295 spin_lock(qdisc_lock(q
));
3296 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED
, &q
->state
)))
3297 result
= qdisc_enqueue_root(skb
, q
);
3298 spin_unlock(qdisc_lock(q
));
3304 static inline struct sk_buff
*handle_ing(struct sk_buff
*skb
,
3305 struct packet_type
**pt_prev
,
3306 int *ret
, struct net_device
*orig_dev
)
3308 struct netdev_queue
*rxq
= rcu_dereference(skb
->dev
->ingress_queue
);
3310 if (!rxq
|| rxq
->qdisc
== &noop_qdisc
)
3314 *ret
= deliver_skb(skb
, *pt_prev
, orig_dev
);
3318 switch (ing_filter(skb
, rxq
)) {
3332 * netdev_rx_handler_register - register receive handler
3333 * @dev: device to register a handler for
3334 * @rx_handler: receive handler to register
3335 * @rx_handler_data: data pointer that is used by rx handler
3337 * Register a receive hander for a device. This handler will then be
3338 * called from __netif_receive_skb. A negative errno code is returned
3341 * The caller must hold the rtnl_mutex.
3343 * For a general description of rx_handler, see enum rx_handler_result.
3345 int netdev_rx_handler_register(struct net_device
*dev
,
3346 rx_handler_func_t
*rx_handler
,
3347 void *rx_handler_data
)
3351 if (dev
->rx_handler
)
3354 rcu_assign_pointer(dev
->rx_handler_data
, rx_handler_data
);
3355 rcu_assign_pointer(dev
->rx_handler
, rx_handler
);
3359 EXPORT_SYMBOL_GPL(netdev_rx_handler_register
);
3362 * netdev_rx_handler_unregister - unregister receive handler
3363 * @dev: device to unregister a handler from
3365 * Unregister a receive hander from a device.
3367 * The caller must hold the rtnl_mutex.
3369 void netdev_rx_handler_unregister(struct net_device
*dev
)
3373 RCU_INIT_POINTER(dev
->rx_handler
, NULL
);
3374 RCU_INIT_POINTER(dev
->rx_handler_data
, NULL
);
3376 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister
);
3379 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3380 * the special handling of PFMEMALLOC skbs.
3382 static bool skb_pfmemalloc_protocol(struct sk_buff
*skb
)
3384 switch (skb
->protocol
) {
3385 case __constant_htons(ETH_P_ARP
):
3386 case __constant_htons(ETH_P_IP
):
3387 case __constant_htons(ETH_P_IPV6
):
3388 case __constant_htons(ETH_P_8021Q
):
3395 static int __netif_receive_skb_core(struct sk_buff
*skb
, bool pfmemalloc
)
3397 struct packet_type
*ptype
, *pt_prev
;
3398 rx_handler_func_t
*rx_handler
;
3399 struct net_device
*orig_dev
;
3400 struct net_device
*null_or_dev
;
3401 bool deliver_exact
= false;
3402 int ret
= NET_RX_DROP
;
3405 net_timestamp_check(!netdev_tstamp_prequeue
, skb
);
3407 trace_netif_receive_skb(skb
);
3409 /* if we've gotten here through NAPI, check netpoll */
3410 if (netpoll_receive_skb(skb
))
3413 orig_dev
= skb
->dev
;
3415 skb_reset_network_header(skb
);
3416 if (!skb_transport_header_was_set(skb
))
3417 skb_reset_transport_header(skb
);
3418 skb_reset_mac_len(skb
);
3425 skb
->skb_iif
= skb
->dev
->ifindex
;
3427 __this_cpu_inc(softnet_data
.processed
);
3429 if (skb
->protocol
== cpu_to_be16(ETH_P_8021Q
)) {
3430 skb
= vlan_untag(skb
);
3435 #ifdef CONFIG_NET_CLS_ACT
3436 if (skb
->tc_verd
& TC_NCLS
) {
3437 skb
->tc_verd
= CLR_TC_NCLS(skb
->tc_verd
);
3445 list_for_each_entry_rcu(ptype
, &ptype_all
, list
) {
3446 if (!ptype
->dev
|| ptype
->dev
== skb
->dev
) {
3448 ret
= deliver_skb(skb
, pt_prev
, orig_dev
);
3454 #ifdef CONFIG_NET_CLS_ACT
3455 skb
= handle_ing(skb
, &pt_prev
, &ret
, orig_dev
);
3461 if (pfmemalloc
&& !skb_pfmemalloc_protocol(skb
))
3464 if (vlan_tx_tag_present(skb
)) {
3466 ret
= deliver_skb(skb
, pt_prev
, orig_dev
);
3469 if (vlan_do_receive(&skb
))
3471 else if (unlikely(!skb
))
3475 rx_handler
= rcu_dereference(skb
->dev
->rx_handler
);
3478 ret
= deliver_skb(skb
, pt_prev
, orig_dev
);
3481 switch (rx_handler(&skb
)) {
3482 case RX_HANDLER_CONSUMED
:
3484 case RX_HANDLER_ANOTHER
:
3486 case RX_HANDLER_EXACT
:
3487 deliver_exact
= true;
3488 case RX_HANDLER_PASS
:
3495 if (vlan_tx_nonzero_tag_present(skb
))
3496 skb
->pkt_type
= PACKET_OTHERHOST
;
3498 /* deliver only exact match when indicated */
3499 null_or_dev
= deliver_exact
? skb
->dev
: NULL
;
3501 type
= skb
->protocol
;
3502 list_for_each_entry_rcu(ptype
,
3503 &ptype_base
[ntohs(type
) & PTYPE_HASH_MASK
], list
) {
3504 if (ptype
->type
== type
&&
3505 (ptype
->dev
== null_or_dev
|| ptype
->dev
== skb
->dev
||
3506 ptype
->dev
== orig_dev
)) {
3508 ret
= deliver_skb(skb
, pt_prev
, orig_dev
);
3514 if (unlikely(skb_orphan_frags(skb
, GFP_ATOMIC
)))
3517 ret
= pt_prev
->func(skb
, skb
->dev
, pt_prev
, orig_dev
);
3520 atomic_long_inc(&skb
->dev
->rx_dropped
);
3522 /* Jamal, now you will not able to escape explaining
3523 * me how you were going to use this. :-)
3534 static int __netif_receive_skb(struct sk_buff
*skb
)
3538 if (sk_memalloc_socks() && skb_pfmemalloc(skb
)) {
3539 unsigned long pflags
= current
->flags
;
3542 * PFMEMALLOC skbs are special, they should
3543 * - be delivered to SOCK_MEMALLOC sockets only
3544 * - stay away from userspace
3545 * - have bounded memory usage
3547 * Use PF_MEMALLOC as this saves us from propagating the allocation
3548 * context down to all allocation sites.
3550 current
->flags
|= PF_MEMALLOC
;
3551 ret
= __netif_receive_skb_core(skb
, true);
3552 tsk_restore_flags(current
, pflags
, PF_MEMALLOC
);
3554 ret
= __netif_receive_skb_core(skb
, false);
3560 * netif_receive_skb - process receive buffer from network
3561 * @skb: buffer to process
3563 * netif_receive_skb() is the main receive data processing function.
3564 * It always succeeds. The buffer may be dropped during processing
3565 * for congestion control or by the protocol layers.
3567 * This function may only be called from softirq context and interrupts
3568 * should be enabled.
3570 * Return values (usually ignored):
3571 * NET_RX_SUCCESS: no congestion
3572 * NET_RX_DROP: packet was dropped
3574 int netif_receive_skb(struct sk_buff
*skb
)
3576 net_timestamp_check(netdev_tstamp_prequeue
, skb
);
3578 if (skb_defer_rx_timestamp(skb
))
3579 return NET_RX_SUCCESS
;
3582 if (static_key_false(&rps_needed
)) {
3583 struct rps_dev_flow voidflow
, *rflow
= &voidflow
;
3588 cpu
= get_rps_cpu(skb
->dev
, skb
, &rflow
);
3591 ret
= enqueue_to_backlog(skb
, cpu
, &rflow
->last_qtail
);
3598 return __netif_receive_skb(skb
);
3600 EXPORT_SYMBOL(netif_receive_skb
);
3602 /* Network device is going away, flush any packets still pending
3603 * Called with irqs disabled.
3605 static void flush_backlog(void *arg
)
3607 struct net_device
*dev
= arg
;
3608 struct softnet_data
*sd
= &__get_cpu_var(softnet_data
);
3609 struct sk_buff
*skb
, *tmp
;
3612 skb_queue_walk_safe(&sd
->input_pkt_queue
, skb
, tmp
) {
3613 if (skb
->dev
== dev
) {
3614 __skb_unlink(skb
, &sd
->input_pkt_queue
);
3616 input_queue_head_incr(sd
);
3621 skb_queue_walk_safe(&sd
->process_queue
, skb
, tmp
) {
3622 if (skb
->dev
== dev
) {
3623 __skb_unlink(skb
, &sd
->process_queue
);
3625 input_queue_head_incr(sd
);
3630 static int napi_gro_complete(struct sk_buff
*skb
)
3632 struct packet_offload
*ptype
;
3633 __be16 type
= skb
->protocol
;
3634 struct list_head
*head
= &offload_base
;
3637 BUILD_BUG_ON(sizeof(struct napi_gro_cb
) > sizeof(skb
->cb
));
3639 if (NAPI_GRO_CB(skb
)->count
== 1) {
3640 skb_shinfo(skb
)->gso_size
= 0;
3645 list_for_each_entry_rcu(ptype
, head
, list
) {
3646 if (ptype
->type
!= type
|| !ptype
->callbacks
.gro_complete
)
3649 err
= ptype
->callbacks
.gro_complete(skb
);
3655 WARN_ON(&ptype
->list
== head
);
3657 return NET_RX_SUCCESS
;
3661 return netif_receive_skb(skb
);
3664 /* napi->gro_list contains packets ordered by age.
3665 * youngest packets at the head of it.
3666 * Complete skbs in reverse order to reduce latencies.
3668 void napi_gro_flush(struct napi_struct
*napi
, bool flush_old
)
3670 struct sk_buff
*skb
, *prev
= NULL
;
3672 /* scan list and build reverse chain */
3673 for (skb
= napi
->gro_list
; skb
!= NULL
; skb
= skb
->next
) {
3678 for (skb
= prev
; skb
; skb
= prev
) {
3681 if (flush_old
&& NAPI_GRO_CB(skb
)->age
== jiffies
)
3685 napi_gro_complete(skb
);
3689 napi
->gro_list
= NULL
;
3691 EXPORT_SYMBOL(napi_gro_flush
);
3693 static void gro_list_prepare(struct napi_struct
*napi
, struct sk_buff
*skb
)
3696 unsigned int maclen
= skb
->dev
->hard_header_len
;
3698 for (p
= napi
->gro_list
; p
; p
= p
->next
) {
3699 unsigned long diffs
;
3701 diffs
= (unsigned long)p
->dev
^ (unsigned long)skb
->dev
;
3702 diffs
|= p
->vlan_tci
^ skb
->vlan_tci
;
3703 if (maclen
== ETH_HLEN
)
3704 diffs
|= compare_ether_header(skb_mac_header(p
),
3705 skb_gro_mac_header(skb
));
3707 diffs
= memcmp(skb_mac_header(p
),
3708 skb_gro_mac_header(skb
),
3710 NAPI_GRO_CB(p
)->same_flow
= !diffs
;
3711 NAPI_GRO_CB(p
)->flush
= 0;
3715 static enum gro_result
dev_gro_receive(struct napi_struct
*napi
, struct sk_buff
*skb
)
3717 struct sk_buff
**pp
= NULL
;
3718 struct packet_offload
*ptype
;
3719 __be16 type
= skb
->protocol
;
3720 struct list_head
*head
= &offload_base
;
3722 enum gro_result ret
;
3724 if (!(skb
->dev
->features
& NETIF_F_GRO
) || netpoll_rx_on(skb
))
3727 if (skb_is_gso(skb
) || skb_has_frag_list(skb
))
3730 gro_list_prepare(napi
, skb
);
3733 list_for_each_entry_rcu(ptype
, head
, list
) {
3734 if (ptype
->type
!= type
|| !ptype
->callbacks
.gro_receive
)
3737 skb_set_network_header(skb
, skb_gro_offset(skb
));
3738 skb_reset_mac_len(skb
);
3739 NAPI_GRO_CB(skb
)->same_flow
= 0;
3740 NAPI_GRO_CB(skb
)->flush
= 0;
3741 NAPI_GRO_CB(skb
)->free
= 0;
3743 pp
= ptype
->callbacks
.gro_receive(&napi
->gro_list
, skb
);
3748 if (&ptype
->list
== head
)
3751 same_flow
= NAPI_GRO_CB(skb
)->same_flow
;
3752 ret
= NAPI_GRO_CB(skb
)->free
? GRO_MERGED_FREE
: GRO_MERGED
;
3755 struct sk_buff
*nskb
= *pp
;
3759 napi_gro_complete(nskb
);
3766 if (NAPI_GRO_CB(skb
)->flush
|| napi
->gro_count
>= MAX_GRO_SKBS
)
3770 NAPI_GRO_CB(skb
)->count
= 1;
3771 NAPI_GRO_CB(skb
)->age
= jiffies
;
3772 skb_shinfo(skb
)->gso_size
= skb_gro_len(skb
);
3773 skb
->next
= napi
->gro_list
;
3774 napi
->gro_list
= skb
;
3778 if (skb_headlen(skb
) < skb_gro_offset(skb
)) {
3779 int grow
= skb_gro_offset(skb
) - skb_headlen(skb
);
3781 BUG_ON(skb
->end
- skb
->tail
< grow
);
3783 memcpy(skb_tail_pointer(skb
), NAPI_GRO_CB(skb
)->frag0
, grow
);
3786 skb
->data_len
-= grow
;
3788 skb_shinfo(skb
)->frags
[0].page_offset
+= grow
;
3789 skb_frag_size_sub(&skb_shinfo(skb
)->frags
[0], grow
);
3791 if (unlikely(!skb_frag_size(&skb_shinfo(skb
)->frags
[0]))) {
3792 skb_frag_unref(skb
, 0);
3793 memmove(skb_shinfo(skb
)->frags
,
3794 skb_shinfo(skb
)->frags
+ 1,
3795 --skb_shinfo(skb
)->nr_frags
* sizeof(skb_frag_t
));
3808 static gro_result_t
napi_skb_finish(gro_result_t ret
, struct sk_buff
*skb
)
3812 if (netif_receive_skb(skb
))
3820 case GRO_MERGED_FREE
:
3821 if (NAPI_GRO_CB(skb
)->free
== NAPI_GRO_FREE_STOLEN_HEAD
)
3822 kmem_cache_free(skbuff_head_cache
, skb
);
3835 static void skb_gro_reset_offset(struct sk_buff
*skb
)
3837 const struct skb_shared_info
*pinfo
= skb_shinfo(skb
);
3838 const skb_frag_t
*frag0
= &pinfo
->frags
[0];
3840 NAPI_GRO_CB(skb
)->data_offset
= 0;
3841 NAPI_GRO_CB(skb
)->frag0
= NULL
;
3842 NAPI_GRO_CB(skb
)->frag0_len
= 0;
3844 if (skb
->mac_header
== skb
->tail
&&
3846 !PageHighMem(skb_frag_page(frag0
))) {
3847 NAPI_GRO_CB(skb
)->frag0
= skb_frag_address(frag0
);
3848 NAPI_GRO_CB(skb
)->frag0_len
= skb_frag_size(frag0
);
3852 gro_result_t
napi_gro_receive(struct napi_struct
*napi
, struct sk_buff
*skb
)
3854 skb_gro_reset_offset(skb
);
3856 return napi_skb_finish(dev_gro_receive(napi
, skb
), skb
);
3858 EXPORT_SYMBOL(napi_gro_receive
);
3860 static void napi_reuse_skb(struct napi_struct
*napi
, struct sk_buff
*skb
)
3862 __skb_pull(skb
, skb_headlen(skb
));
3863 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3864 skb_reserve(skb
, NET_SKB_PAD
+ NET_IP_ALIGN
- skb_headroom(skb
));
3866 skb
->dev
= napi
->dev
;
3872 struct sk_buff
*napi_get_frags(struct napi_struct
*napi
)
3874 struct sk_buff
*skb
= napi
->skb
;
3877 skb
= netdev_alloc_skb_ip_align(napi
->dev
, GRO_MAX_HEAD
);
3883 EXPORT_SYMBOL(napi_get_frags
);
3885 static gro_result_t
napi_frags_finish(struct napi_struct
*napi
, struct sk_buff
*skb
,
3891 skb
->protocol
= eth_type_trans(skb
, skb
->dev
);
3893 if (ret
== GRO_HELD
)
3894 skb_gro_pull(skb
, -ETH_HLEN
);
3895 else if (netif_receive_skb(skb
))
3900 case GRO_MERGED_FREE
:
3901 napi_reuse_skb(napi
, skb
);
3911 static struct sk_buff
*napi_frags_skb(struct napi_struct
*napi
)
3913 struct sk_buff
*skb
= napi
->skb
;
3920 skb_reset_mac_header(skb
);
3921 skb_gro_reset_offset(skb
);
3923 off
= skb_gro_offset(skb
);
3924 hlen
= off
+ sizeof(*eth
);
3925 eth
= skb_gro_header_fast(skb
, off
);
3926 if (skb_gro_header_hard(skb
, hlen
)) {
3927 eth
= skb_gro_header_slow(skb
, hlen
, off
);
3928 if (unlikely(!eth
)) {
3929 napi_reuse_skb(napi
, skb
);
3935 skb_gro_pull(skb
, sizeof(*eth
));
3938 * This works because the only protocols we care about don't require
3939 * special handling. We'll fix it up properly at the end.
3941 skb
->protocol
= eth
->h_proto
;
3947 gro_result_t
napi_gro_frags(struct napi_struct
*napi
)
3949 struct sk_buff
*skb
= napi_frags_skb(napi
);
3954 return napi_frags_finish(napi
, skb
, dev_gro_receive(napi
, skb
));
3956 EXPORT_SYMBOL(napi_gro_frags
);
3959 * net_rps_action sends any pending IPI's for rps.
3960 * Note: called with local irq disabled, but exits with local irq enabled.
3962 static void net_rps_action_and_irq_enable(struct softnet_data
*sd
)
3965 struct softnet_data
*remsd
= sd
->rps_ipi_list
;
3968 sd
->rps_ipi_list
= NULL
;
3972 /* Send pending IPI's to kick RPS processing on remote cpus. */
3974 struct softnet_data
*next
= remsd
->rps_ipi_next
;
3976 if (cpu_online(remsd
->cpu
))
3977 __smp_call_function_single(remsd
->cpu
,
3986 static int process_backlog(struct napi_struct
*napi
, int quota
)
3989 struct softnet_data
*sd
= container_of(napi
, struct softnet_data
, backlog
);
3992 /* Check if we have pending ipi, its better to send them now,
3993 * not waiting net_rx_action() end.
3995 if (sd
->rps_ipi_list
) {
3996 local_irq_disable();
3997 net_rps_action_and_irq_enable(sd
);
4000 napi
->weight
= weight_p
;
4001 local_irq_disable();
4002 while (work
< quota
) {
4003 struct sk_buff
*skb
;
4006 while ((skb
= __skb_dequeue(&sd
->process_queue
))) {
4008 __netif_receive_skb(skb
);
4009 local_irq_disable();
4010 input_queue_head_incr(sd
);
4011 if (++work
>= quota
) {
4018 qlen
= skb_queue_len(&sd
->input_pkt_queue
);
4020 skb_queue_splice_tail_init(&sd
->input_pkt_queue
,
4021 &sd
->process_queue
);
4023 if (qlen
< quota
- work
) {
4025 * Inline a custom version of __napi_complete().
4026 * only current cpu owns and manipulates this napi,
4027 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4028 * we can use a plain write instead of clear_bit(),
4029 * and we dont need an smp_mb() memory barrier.
4031 list_del(&napi
->poll_list
);
4034 quota
= work
+ qlen
;
4044 * __napi_schedule - schedule for receive
4045 * @n: entry to schedule
4047 * The entry's receive function will be scheduled to run
4049 void __napi_schedule(struct napi_struct
*n
)
4051 unsigned long flags
;
4053 local_irq_save(flags
);
4054 ____napi_schedule(&__get_cpu_var(softnet_data
), n
);
4055 local_irq_restore(flags
);
4057 EXPORT_SYMBOL(__napi_schedule
);
4059 void __napi_complete(struct napi_struct
*n
)
4061 BUG_ON(!test_bit(NAPI_STATE_SCHED
, &n
->state
));
4062 BUG_ON(n
->gro_list
);
4064 list_del(&n
->poll_list
);
4065 smp_mb__before_clear_bit();
4066 clear_bit(NAPI_STATE_SCHED
, &n
->state
);
4068 EXPORT_SYMBOL(__napi_complete
);
4070 void napi_complete(struct napi_struct
*n
)
4072 unsigned long flags
;
4075 * don't let napi dequeue from the cpu poll list
4076 * just in case its running on a different cpu
4078 if (unlikely(test_bit(NAPI_STATE_NPSVC
, &n
->state
)))
4081 napi_gro_flush(n
, false);
4082 local_irq_save(flags
);
4084 local_irq_restore(flags
);
4086 EXPORT_SYMBOL(napi_complete
);
4088 void netif_napi_add(struct net_device
*dev
, struct napi_struct
*napi
,
4089 int (*poll
)(struct napi_struct
*, int), int weight
)
4091 INIT_LIST_HEAD(&napi
->poll_list
);
4092 napi
->gro_count
= 0;
4093 napi
->gro_list
= NULL
;
4096 napi
->weight
= weight
;
4097 list_add(&napi
->dev_list
, &dev
->napi_list
);
4099 #ifdef CONFIG_NETPOLL
4100 spin_lock_init(&napi
->poll_lock
);
4101 napi
->poll_owner
= -1;
4103 set_bit(NAPI_STATE_SCHED
, &napi
->state
);
4105 EXPORT_SYMBOL(netif_napi_add
);
4107 void netif_napi_del(struct napi_struct
*napi
)
4109 struct sk_buff
*skb
, *next
;
4111 list_del_init(&napi
->dev_list
);
4112 napi_free_frags(napi
);
4114 for (skb
= napi
->gro_list
; skb
; skb
= next
) {
4120 napi
->gro_list
= NULL
;
4121 napi
->gro_count
= 0;
4123 EXPORT_SYMBOL(netif_napi_del
);
4125 static void net_rx_action(struct softirq_action
*h
)
4127 struct softnet_data
*sd
= &__get_cpu_var(softnet_data
);
4128 unsigned long time_limit
= jiffies
+ 2;
4129 int budget
= netdev_budget
;
4132 local_irq_disable();
4134 while (!list_empty(&sd
->poll_list
)) {
4135 struct napi_struct
*n
;
4138 /* If softirq window is exhuasted then punt.
4139 * Allow this to run for 2 jiffies since which will allow
4140 * an average latency of 1.5/HZ.
4142 if (unlikely(budget
<= 0 || time_after(jiffies
, time_limit
)))
4147 /* Even though interrupts have been re-enabled, this
4148 * access is safe because interrupts can only add new
4149 * entries to the tail of this list, and only ->poll()
4150 * calls can remove this head entry from the list.
4152 n
= list_first_entry(&sd
->poll_list
, struct napi_struct
, poll_list
);
4154 have
= netpoll_poll_lock(n
);
4158 /* This NAPI_STATE_SCHED test is for avoiding a race
4159 * with netpoll's poll_napi(). Only the entity which
4160 * obtains the lock and sees NAPI_STATE_SCHED set will
4161 * actually make the ->poll() call. Therefore we avoid
4162 * accidentally calling ->poll() when NAPI is not scheduled.
4165 if (test_bit(NAPI_STATE_SCHED
, &n
->state
)) {
4166 work
= n
->poll(n
, weight
);
4170 WARN_ON_ONCE(work
> weight
);
4174 local_irq_disable();
4176 /* Drivers must not modify the NAPI state if they
4177 * consume the entire weight. In such cases this code
4178 * still "owns" the NAPI instance and therefore can
4179 * move the instance around on the list at-will.
4181 if (unlikely(work
== weight
)) {
4182 if (unlikely(napi_disable_pending(n
))) {
4185 local_irq_disable();
4188 /* flush too old packets
4189 * If HZ < 1000, flush all packets.
4192 napi_gro_flush(n
, HZ
>= 1000);
4193 local_irq_disable();
4195 list_move_tail(&n
->poll_list
, &sd
->poll_list
);
4199 netpoll_poll_unlock(have
);
4202 net_rps_action_and_irq_enable(sd
);
4204 #ifdef CONFIG_NET_DMA
4206 * There may not be any more sk_buffs coming right now, so push
4207 * any pending DMA copies to hardware
4209 dma_issue_pending_all();
4216 __raise_softirq_irqoff(NET_RX_SOFTIRQ
);
4220 #ifdef CONFIG_PROC_FS
4222 #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4224 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4225 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4226 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4228 static inline struct net_device
*dev_from_same_bucket(struct seq_file
*seq
, loff_t
*pos
)
4230 struct net
*net
= seq_file_net(seq
);
4231 struct net_device
*dev
;
4232 struct hlist_node
*p
;
4233 struct hlist_head
*h
;
4234 unsigned int count
= 0, offset
= get_offset(*pos
);
4236 h
= &net
->dev_name_head
[get_bucket(*pos
)];
4237 hlist_for_each_entry_rcu(dev
, p
, h
, name_hlist
) {
4238 if (++count
== offset
)
4245 static inline struct net_device
*dev_from_bucket(struct seq_file
*seq
, loff_t
*pos
)
4247 struct net_device
*dev
;
4248 unsigned int bucket
;
4251 dev
= dev_from_same_bucket(seq
, pos
);
4255 bucket
= get_bucket(*pos
) + 1;
4256 *pos
= set_bucket_offset(bucket
, 1);
4257 } while (bucket
< NETDEV_HASHENTRIES
);
4263 * This is invoked by the /proc filesystem handler to display a device
4266 void *dev_seq_start(struct seq_file
*seq
, loff_t
*pos
)
4271 return SEQ_START_TOKEN
;
4273 if (get_bucket(*pos
) >= NETDEV_HASHENTRIES
)
4276 return dev_from_bucket(seq
, pos
);
4279 void *dev_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
4282 return dev_from_bucket(seq
, pos
);
4285 void dev_seq_stop(struct seq_file
*seq
, void *v
)
4291 static void dev_seq_printf_stats(struct seq_file
*seq
, struct net_device
*dev
)
4293 struct rtnl_link_stats64 temp
;
4294 const struct rtnl_link_stats64
*stats
= dev_get_stats(dev
, &temp
);
4296 seq_printf(seq
, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4297 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4298 dev
->name
, stats
->rx_bytes
, stats
->rx_packets
,
4300 stats
->rx_dropped
+ stats
->rx_missed_errors
,
4301 stats
->rx_fifo_errors
,
4302 stats
->rx_length_errors
+ stats
->rx_over_errors
+
4303 stats
->rx_crc_errors
+ stats
->rx_frame_errors
,
4304 stats
->rx_compressed
, stats
->multicast
,
4305 stats
->tx_bytes
, stats
->tx_packets
,
4306 stats
->tx_errors
, stats
->tx_dropped
,
4307 stats
->tx_fifo_errors
, stats
->collisions
,
4308 stats
->tx_carrier_errors
+
4309 stats
->tx_aborted_errors
+
4310 stats
->tx_window_errors
+
4311 stats
->tx_heartbeat_errors
,
4312 stats
->tx_compressed
);
4316 * Called from the PROCfs module. This now uses the new arbitrary sized
4317 * /proc/net interface to create /proc/net/dev
4319 static int dev_seq_show(struct seq_file
*seq
, void *v
)
4321 if (v
== SEQ_START_TOKEN
)
4322 seq_puts(seq
, "Inter-| Receive "
4324 " face |bytes packets errs drop fifo frame "
4325 "compressed multicast|bytes packets errs "
4326 "drop fifo colls carrier compressed\n");
4328 dev_seq_printf_stats(seq
, v
);
4332 static struct softnet_data
*softnet_get_online(loff_t
*pos
)
4334 struct softnet_data
*sd
= NULL
;
4336 while (*pos
< nr_cpu_ids
)
4337 if (cpu_online(*pos
)) {
4338 sd
= &per_cpu(softnet_data
, *pos
);
4345 static void *softnet_seq_start(struct seq_file
*seq
, loff_t
*pos
)
4347 return softnet_get_online(pos
);
4350 static void *softnet_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
4353 return softnet_get_online(pos
);
4356 static void softnet_seq_stop(struct seq_file
*seq
, void *v
)
4360 static int softnet_seq_show(struct seq_file
*seq
, void *v
)
4362 struct softnet_data
*sd
= v
;
4364 seq_printf(seq
, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4365 sd
->processed
, sd
->dropped
, sd
->time_squeeze
, 0,
4366 0, 0, 0, 0, /* was fastroute */
4367 sd
->cpu_collision
, sd
->received_rps
);
4371 static const struct seq_operations dev_seq_ops
= {
4372 .start
= dev_seq_start
,
4373 .next
= dev_seq_next
,
4374 .stop
= dev_seq_stop
,
4375 .show
= dev_seq_show
,
4378 static int dev_seq_open(struct inode
*inode
, struct file
*file
)
4380 return seq_open_net(inode
, file
, &dev_seq_ops
,
4381 sizeof(struct seq_net_private
));
4384 static const struct file_operations dev_seq_fops
= {
4385 .owner
= THIS_MODULE
,
4386 .open
= dev_seq_open
,
4388 .llseek
= seq_lseek
,
4389 .release
= seq_release_net
,
4392 static const struct seq_operations softnet_seq_ops
= {
4393 .start
= softnet_seq_start
,
4394 .next
= softnet_seq_next
,
4395 .stop
= softnet_seq_stop
,
4396 .show
= softnet_seq_show
,
4399 static int softnet_seq_open(struct inode
*inode
, struct file
*file
)
4401 return seq_open(file
, &softnet_seq_ops
);
4404 static const struct file_operations softnet_seq_fops
= {
4405 .owner
= THIS_MODULE
,
4406 .open
= softnet_seq_open
,
4408 .llseek
= seq_lseek
,
4409 .release
= seq_release
,
4412 static void *ptype_get_idx(loff_t pos
)
4414 struct packet_type
*pt
= NULL
;
4418 list_for_each_entry_rcu(pt
, &ptype_all
, list
) {
4424 for (t
= 0; t
< PTYPE_HASH_SIZE
; t
++) {
4425 list_for_each_entry_rcu(pt
, &ptype_base
[t
], list
) {
4434 static void *ptype_seq_start(struct seq_file
*seq
, loff_t
*pos
)
4438 return *pos
? ptype_get_idx(*pos
- 1) : SEQ_START_TOKEN
;
4441 static void *ptype_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
4443 struct packet_type
*pt
;
4444 struct list_head
*nxt
;
4448 if (v
== SEQ_START_TOKEN
)
4449 return ptype_get_idx(0);
4452 nxt
= pt
->list
.next
;
4453 if (pt
->type
== htons(ETH_P_ALL
)) {
4454 if (nxt
!= &ptype_all
)
4457 nxt
= ptype_base
[0].next
;
4459 hash
= ntohs(pt
->type
) & PTYPE_HASH_MASK
;
4461 while (nxt
== &ptype_base
[hash
]) {
4462 if (++hash
>= PTYPE_HASH_SIZE
)
4464 nxt
= ptype_base
[hash
].next
;
4467 return list_entry(nxt
, struct packet_type
, list
);
4470 static void ptype_seq_stop(struct seq_file
*seq
, void *v
)
4476 static int ptype_seq_show(struct seq_file
*seq
, void *v
)
4478 struct packet_type
*pt
= v
;
4480 if (v
== SEQ_START_TOKEN
)
4481 seq_puts(seq
, "Type Device Function\n");
4482 else if (pt
->dev
== NULL
|| dev_net(pt
->dev
) == seq_file_net(seq
)) {
4483 if (pt
->type
== htons(ETH_P_ALL
))
4484 seq_puts(seq
, "ALL ");
4486 seq_printf(seq
, "%04x", ntohs(pt
->type
));
4488 seq_printf(seq
, " %-8s %pF\n",
4489 pt
->dev
? pt
->dev
->name
: "", pt
->func
);
4495 static const struct seq_operations ptype_seq_ops
= {
4496 .start
= ptype_seq_start
,
4497 .next
= ptype_seq_next
,
4498 .stop
= ptype_seq_stop
,
4499 .show
= ptype_seq_show
,
4502 static int ptype_seq_open(struct inode
*inode
, struct file
*file
)
4504 return seq_open_net(inode
, file
, &ptype_seq_ops
,
4505 sizeof(struct seq_net_private
));
4508 static const struct file_operations ptype_seq_fops
= {
4509 .owner
= THIS_MODULE
,
4510 .open
= ptype_seq_open
,
4512 .llseek
= seq_lseek
,
4513 .release
= seq_release_net
,
4517 static int __net_init
dev_proc_net_init(struct net
*net
)
4521 if (!proc_create("dev", S_IRUGO
, net
->proc_net
, &dev_seq_fops
))
4523 if (!proc_create("softnet_stat", S_IRUGO
, net
->proc_net
,
4526 if (!proc_create("ptype", S_IRUGO
, net
->proc_net
, &ptype_seq_fops
))
4529 if (wext_proc_init(net
))
4535 proc_net_remove(net
, "ptype");
4537 proc_net_remove(net
, "softnet_stat");
4539 proc_net_remove(net
, "dev");
4543 static void __net_exit
dev_proc_net_exit(struct net
*net
)
4545 wext_proc_exit(net
);
4547 proc_net_remove(net
, "ptype");
4548 proc_net_remove(net
, "softnet_stat");
4549 proc_net_remove(net
, "dev");
4552 static struct pernet_operations __net_initdata dev_proc_ops
= {
4553 .init
= dev_proc_net_init
,
4554 .exit
= dev_proc_net_exit
,
4557 static int __init
dev_proc_init(void)
4559 return register_pernet_subsys(&dev_proc_ops
);
4562 #define dev_proc_init() 0
4563 #endif /* CONFIG_PROC_FS */
4566 struct netdev_upper
{
4567 struct net_device
*dev
;
4569 struct list_head list
;
4570 struct rcu_head rcu
;
4571 struct list_head search_list
;
4574 static void __append_search_uppers(struct list_head
*search_list
,
4575 struct net_device
*dev
)
4577 struct netdev_upper
*upper
;
4579 list_for_each_entry(upper
, &dev
->upper_dev_list
, list
) {
4580 /* check if this upper is not already in search list */
4581 if (list_empty(&upper
->search_list
))
4582 list_add_tail(&upper
->search_list
, search_list
);
4586 static bool __netdev_search_upper_dev(struct net_device
*dev
,
4587 struct net_device
*upper_dev
)
4589 LIST_HEAD(search_list
);
4590 struct netdev_upper
*upper
;
4591 struct netdev_upper
*tmp
;
4594 __append_search_uppers(&search_list
, dev
);
4595 list_for_each_entry(upper
, &search_list
, search_list
) {
4596 if (upper
->dev
== upper_dev
) {
4600 __append_search_uppers(&search_list
, upper
->dev
);
4602 list_for_each_entry_safe(upper
, tmp
, &search_list
, search_list
)
4603 INIT_LIST_HEAD(&upper
->search_list
);
4607 static struct netdev_upper
*__netdev_find_upper(struct net_device
*dev
,
4608 struct net_device
*upper_dev
)
4610 struct netdev_upper
*upper
;
4612 list_for_each_entry(upper
, &dev
->upper_dev_list
, list
) {
4613 if (upper
->dev
== upper_dev
)
4620 * netdev_has_upper_dev - Check if device is linked to an upper device
4622 * @upper_dev: upper device to check
4624 * Find out if a device is linked to specified upper device and return true
4625 * in case it is. Note that this checks only immediate upper device,
4626 * not through a complete stack of devices. The caller must hold the RTNL lock.
4628 bool netdev_has_upper_dev(struct net_device
*dev
,
4629 struct net_device
*upper_dev
)
4633 return __netdev_find_upper(dev
, upper_dev
);
4635 EXPORT_SYMBOL(netdev_has_upper_dev
);
4638 * netdev_has_any_upper_dev - Check if device is linked to some device
4641 * Find out if a device is linked to an upper device and return true in case
4642 * it is. The caller must hold the RTNL lock.
4644 bool netdev_has_any_upper_dev(struct net_device
*dev
)
4648 return !list_empty(&dev
->upper_dev_list
);
4650 EXPORT_SYMBOL(netdev_has_any_upper_dev
);
4653 * netdev_master_upper_dev_get - Get master upper device
4656 * Find a master upper device and return pointer to it or NULL in case
4657 * it's not there. The caller must hold the RTNL lock.
4659 struct net_device
*netdev_master_upper_dev_get(struct net_device
*dev
)
4661 struct netdev_upper
*upper
;
4665 if (list_empty(&dev
->upper_dev_list
))
4668 upper
= list_first_entry(&dev
->upper_dev_list
,
4669 struct netdev_upper
, list
);
4670 if (likely(upper
->master
))
4674 EXPORT_SYMBOL(netdev_master_upper_dev_get
);
4677 * netdev_master_upper_dev_get_rcu - Get master upper device
4680 * Find a master upper device and return pointer to it or NULL in case
4681 * it's not there. The caller must hold the RCU read lock.
4683 struct net_device
*netdev_master_upper_dev_get_rcu(struct net_device
*dev
)
4685 struct netdev_upper
*upper
;
4687 upper
= list_first_or_null_rcu(&dev
->upper_dev_list
,
4688 struct netdev_upper
, list
);
4689 if (upper
&& likely(upper
->master
))
4693 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu
);
4695 static int __netdev_upper_dev_link(struct net_device
*dev
,
4696 struct net_device
*upper_dev
, bool master
)
4698 struct netdev_upper
*upper
;
4702 if (dev
== upper_dev
)
4705 /* To prevent loops, check if dev is not upper device to upper_dev. */
4706 if (__netdev_search_upper_dev(upper_dev
, dev
))
4709 if (__netdev_find_upper(dev
, upper_dev
))
4712 if (master
&& netdev_master_upper_dev_get(dev
))
4715 upper
= kmalloc(sizeof(*upper
), GFP_KERNEL
);
4719 upper
->dev
= upper_dev
;
4720 upper
->master
= master
;
4721 INIT_LIST_HEAD(&upper
->search_list
);
4723 /* Ensure that master upper link is always the first item in list. */
4725 list_add_rcu(&upper
->list
, &dev
->upper_dev_list
);
4727 list_add_tail_rcu(&upper
->list
, &dev
->upper_dev_list
);
4728 dev_hold(upper_dev
);
4734 * netdev_upper_dev_link - Add a link to the upper device
4736 * @upper_dev: new upper device
4738 * Adds a link to device which is upper to this one. The caller must hold
4739 * the RTNL lock. On a failure a negative errno code is returned.
4740 * On success the reference counts are adjusted and the function
4743 int netdev_upper_dev_link(struct net_device
*dev
,
4744 struct net_device
*upper_dev
)
4746 return __netdev_upper_dev_link(dev
, upper_dev
, false);
4748 EXPORT_SYMBOL(netdev_upper_dev_link
);
4751 * netdev_master_upper_dev_link - Add a master link to the upper device
4753 * @upper_dev: new upper device
4755 * Adds a link to device which is upper to this one. In this case, only
4756 * one master upper device can be linked, although other non-master devices
4757 * might be linked as well. The caller must hold the RTNL lock.
4758 * On a failure a negative errno code is returned. On success the reference
4759 * counts are adjusted and the function returns zero.
4761 int netdev_master_upper_dev_link(struct net_device
*dev
,
4762 struct net_device
*upper_dev
)
4764 return __netdev_upper_dev_link(dev
, upper_dev
, true);
4766 EXPORT_SYMBOL(netdev_master_upper_dev_link
);
4769 * netdev_upper_dev_unlink - Removes a link to upper device
4771 * @upper_dev: new upper device
4773 * Removes a link to device which is upper to this one. The caller must hold
4776 void netdev_upper_dev_unlink(struct net_device
*dev
,
4777 struct net_device
*upper_dev
)
4779 struct netdev_upper
*upper
;
4783 upper
= __netdev_find_upper(dev
, upper_dev
);
4786 list_del_rcu(&upper
->list
);
4788 kfree_rcu(upper
, rcu
);
4790 EXPORT_SYMBOL(netdev_upper_dev_unlink
);
4792 static void dev_change_rx_flags(struct net_device
*dev
, int flags
)
4794 const struct net_device_ops
*ops
= dev
->netdev_ops
;
4796 if ((dev
->flags
& IFF_UP
) && ops
->ndo_change_rx_flags
)
4797 ops
->ndo_change_rx_flags(dev
, flags
);
4800 static int __dev_set_promiscuity(struct net_device
*dev
, int inc
)
4802 unsigned int old_flags
= dev
->flags
;
4808 dev
->flags
|= IFF_PROMISC
;
4809 dev
->promiscuity
+= inc
;
4810 if (dev
->promiscuity
== 0) {
4813 * If inc causes overflow, untouch promisc and return error.
4816 dev
->flags
&= ~IFF_PROMISC
;
4818 dev
->promiscuity
-= inc
;
4819 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4824 if (dev
->flags
!= old_flags
) {
4825 pr_info("device %s %s promiscuous mode\n",
4827 dev
->flags
& IFF_PROMISC
? "entered" : "left");
4828 if (audit_enabled
) {
4829 current_uid_gid(&uid
, &gid
);
4830 audit_log(current
->audit_context
, GFP_ATOMIC
,
4831 AUDIT_ANOM_PROMISCUOUS
,
4832 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4833 dev
->name
, (dev
->flags
& IFF_PROMISC
),
4834 (old_flags
& IFF_PROMISC
),
4835 from_kuid(&init_user_ns
, audit_get_loginuid(current
)),
4836 from_kuid(&init_user_ns
, uid
),
4837 from_kgid(&init_user_ns
, gid
),
4838 audit_get_sessionid(current
));
4841 dev_change_rx_flags(dev
, IFF_PROMISC
);
4847 * dev_set_promiscuity - update promiscuity count on a device
4851 * Add or remove promiscuity from a device. While the count in the device
4852 * remains above zero the interface remains promiscuous. Once it hits zero
4853 * the device reverts back to normal filtering operation. A negative inc
4854 * value is used to drop promiscuity on the device.
4855 * Return 0 if successful or a negative errno code on error.
4857 int dev_set_promiscuity(struct net_device
*dev
, int inc
)
4859 unsigned int old_flags
= dev
->flags
;
4862 err
= __dev_set_promiscuity(dev
, inc
);
4865 if (dev
->flags
!= old_flags
)
4866 dev_set_rx_mode(dev
);
4869 EXPORT_SYMBOL(dev_set_promiscuity
);
4872 * dev_set_allmulti - update allmulti count on a device
4876 * Add or remove reception of all multicast frames to a device. While the
4877 * count in the device remains above zero the interface remains listening
4878 * to all interfaces. Once it hits zero the device reverts back to normal
4879 * filtering operation. A negative @inc value is used to drop the counter
4880 * when releasing a resource needing all multicasts.
4881 * Return 0 if successful or a negative errno code on error.
4884 int dev_set_allmulti(struct net_device
*dev
, int inc
)
4886 unsigned int old_flags
= dev
->flags
;
4890 dev
->flags
|= IFF_ALLMULTI
;
4891 dev
->allmulti
+= inc
;
4892 if (dev
->allmulti
== 0) {
4895 * If inc causes overflow, untouch allmulti and return error.
4898 dev
->flags
&= ~IFF_ALLMULTI
;
4900 dev
->allmulti
-= inc
;
4901 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4906 if (dev
->flags
^ old_flags
) {
4907 dev_change_rx_flags(dev
, IFF_ALLMULTI
);
4908 dev_set_rx_mode(dev
);
4912 EXPORT_SYMBOL(dev_set_allmulti
);
4915 * Upload unicast and multicast address lists to device and
4916 * configure RX filtering. When the device doesn't support unicast
4917 * filtering it is put in promiscuous mode while unicast addresses
4920 void __dev_set_rx_mode(struct net_device
*dev
)
4922 const struct net_device_ops
*ops
= dev
->netdev_ops
;
4924 /* dev_open will call this function so the list will stay sane. */
4925 if (!(dev
->flags
&IFF_UP
))
4928 if (!netif_device_present(dev
))
4931 if (!(dev
->priv_flags
& IFF_UNICAST_FLT
)) {
4932 /* Unicast addresses changes may only happen under the rtnl,
4933 * therefore calling __dev_set_promiscuity here is safe.
4935 if (!netdev_uc_empty(dev
) && !dev
->uc_promisc
) {
4936 __dev_set_promiscuity(dev
, 1);
4937 dev
->uc_promisc
= true;
4938 } else if (netdev_uc_empty(dev
) && dev
->uc_promisc
) {
4939 __dev_set_promiscuity(dev
, -1);
4940 dev
->uc_promisc
= false;
4944 if (ops
->ndo_set_rx_mode
)
4945 ops
->ndo_set_rx_mode(dev
);
4948 void dev_set_rx_mode(struct net_device
*dev
)
4950 netif_addr_lock_bh(dev
);
4951 __dev_set_rx_mode(dev
);
4952 netif_addr_unlock_bh(dev
);
4956 * dev_get_flags - get flags reported to userspace
4959 * Get the combination of flag bits exported through APIs to userspace.
4961 unsigned int dev_get_flags(const struct net_device
*dev
)
4965 flags
= (dev
->flags
& ~(IFF_PROMISC
|
4970 (dev
->gflags
& (IFF_PROMISC
|
4973 if (netif_running(dev
)) {
4974 if (netif_oper_up(dev
))
4975 flags
|= IFF_RUNNING
;
4976 if (netif_carrier_ok(dev
))
4977 flags
|= IFF_LOWER_UP
;
4978 if (netif_dormant(dev
))
4979 flags
|= IFF_DORMANT
;
4984 EXPORT_SYMBOL(dev_get_flags
);
4986 int __dev_change_flags(struct net_device
*dev
, unsigned int flags
)
4988 unsigned int old_flags
= dev
->flags
;
4994 * Set the flags on our device.
4997 dev
->flags
= (flags
& (IFF_DEBUG
| IFF_NOTRAILERS
| IFF_NOARP
|
4998 IFF_DYNAMIC
| IFF_MULTICAST
| IFF_PORTSEL
|
5000 (dev
->flags
& (IFF_UP
| IFF_VOLATILE
| IFF_PROMISC
|
5004 * Load in the correct multicast list now the flags have changed.
5007 if ((old_flags
^ flags
) & IFF_MULTICAST
)
5008 dev_change_rx_flags(dev
, IFF_MULTICAST
);
5010 dev_set_rx_mode(dev
);
5013 * Have we downed the interface. We handle IFF_UP ourselves
5014 * according to user attempts to set it, rather than blindly
5019 if ((old_flags
^ flags
) & IFF_UP
) { /* Bit is different ? */
5020 ret
= ((old_flags
& IFF_UP
) ? __dev_close
: __dev_open
)(dev
);
5023 dev_set_rx_mode(dev
);
5026 if ((flags
^ dev
->gflags
) & IFF_PROMISC
) {
5027 int inc
= (flags
& IFF_PROMISC
) ? 1 : -1;
5029 dev
->gflags
^= IFF_PROMISC
;
5030 dev_set_promiscuity(dev
, inc
);
5033 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5034 is important. Some (broken) drivers set IFF_PROMISC, when
5035 IFF_ALLMULTI is requested not asking us and not reporting.
5037 if ((flags
^ dev
->gflags
) & IFF_ALLMULTI
) {
5038 int inc
= (flags
& IFF_ALLMULTI
) ? 1 : -1;
5040 dev
->gflags
^= IFF_ALLMULTI
;
5041 dev_set_allmulti(dev
, inc
);
5047 void __dev_notify_flags(struct net_device
*dev
, unsigned int old_flags
)
5049 unsigned int changes
= dev
->flags
^ old_flags
;
5051 if (changes
& IFF_UP
) {
5052 if (dev
->flags
& IFF_UP
)
5053 call_netdevice_notifiers(NETDEV_UP
, dev
);
5055 call_netdevice_notifiers(NETDEV_DOWN
, dev
);
5058 if (dev
->flags
& IFF_UP
&&
5059 (changes
& ~(IFF_UP
| IFF_PROMISC
| IFF_ALLMULTI
| IFF_VOLATILE
)))
5060 call_netdevice_notifiers(NETDEV_CHANGE
, dev
);
5064 * dev_change_flags - change device settings
5066 * @flags: device state flags
5068 * Change settings on device based state flags. The flags are
5069 * in the userspace exported format.
5071 int dev_change_flags(struct net_device
*dev
, unsigned int flags
)
5074 unsigned int changes
, old_flags
= dev
->flags
;
5076 ret
= __dev_change_flags(dev
, flags
);
5080 changes
= old_flags
^ dev
->flags
;
5082 rtmsg_ifinfo(RTM_NEWLINK
, dev
, changes
);
5084 __dev_notify_flags(dev
, old_flags
);
5087 EXPORT_SYMBOL(dev_change_flags
);
5090 * dev_set_mtu - Change maximum transfer unit
5092 * @new_mtu: new transfer unit
5094 * Change the maximum transfer size of the network device.
5096 int dev_set_mtu(struct net_device
*dev
, int new_mtu
)
5098 const struct net_device_ops
*ops
= dev
->netdev_ops
;
5101 if (new_mtu
== dev
->mtu
)
5104 /* MTU must be positive. */
5108 if (!netif_device_present(dev
))
5112 if (ops
->ndo_change_mtu
)
5113 err
= ops
->ndo_change_mtu(dev
, new_mtu
);
5118 call_netdevice_notifiers(NETDEV_CHANGEMTU
, dev
);
5121 EXPORT_SYMBOL(dev_set_mtu
);
5124 * dev_set_group - Change group this device belongs to
5126 * @new_group: group this device should belong to
5128 void dev_set_group(struct net_device
*dev
, int new_group
)
5130 dev
->group
= new_group
;
5132 EXPORT_SYMBOL(dev_set_group
);
5135 * dev_set_mac_address - Change Media Access Control Address
5139 * Change the hardware (MAC) address of the device
5141 int dev_set_mac_address(struct net_device
*dev
, struct sockaddr
*sa
)
5143 const struct net_device_ops
*ops
= dev
->netdev_ops
;
5146 if (!ops
->ndo_set_mac_address
)
5148 if (sa
->sa_family
!= dev
->type
)
5150 if (!netif_device_present(dev
))
5152 err
= ops
->ndo_set_mac_address(dev
, sa
);
5155 dev
->addr_assign_type
= NET_ADDR_SET
;
5156 call_netdevice_notifiers(NETDEV_CHANGEADDR
, dev
);
5157 add_device_randomness(dev
->dev_addr
, dev
->addr_len
);
5160 EXPORT_SYMBOL(dev_set_mac_address
);
5163 * dev_change_carrier - Change device carrier
5165 * @new_carries: new value
5167 * Change device carrier
5169 int dev_change_carrier(struct net_device
*dev
, bool new_carrier
)
5171 const struct net_device_ops
*ops
= dev
->netdev_ops
;
5173 if (!ops
->ndo_change_carrier
)
5175 if (!netif_device_present(dev
))
5177 return ops
->ndo_change_carrier(dev
, new_carrier
);
5179 EXPORT_SYMBOL(dev_change_carrier
);
5182 * dev_new_index - allocate an ifindex
5183 * @net: the applicable net namespace
5185 * Returns a suitable unique value for a new device interface
5186 * number. The caller must hold the rtnl semaphore or the
5187 * dev_base_lock to be sure it remains unique.
5189 static int dev_new_index(struct net
*net
)
5191 int ifindex
= net
->ifindex
;
5195 if (!__dev_get_by_index(net
, ifindex
))
5196 return net
->ifindex
= ifindex
;
5200 /* Delayed registration/unregisteration */
5201 static LIST_HEAD(net_todo_list
);
5203 static void net_set_todo(struct net_device
*dev
)
5205 list_add_tail(&dev
->todo_list
, &net_todo_list
);
5208 static void rollback_registered_many(struct list_head
*head
)
5210 struct net_device
*dev
, *tmp
;
5212 BUG_ON(dev_boot_phase
);
5215 list_for_each_entry_safe(dev
, tmp
, head
, unreg_list
) {
5216 /* Some devices call without registering
5217 * for initialization unwind. Remove those
5218 * devices and proceed with the remaining.
5220 if (dev
->reg_state
== NETREG_UNINITIALIZED
) {
5221 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5225 list_del(&dev
->unreg_list
);
5228 dev
->dismantle
= true;
5229 BUG_ON(dev
->reg_state
!= NETREG_REGISTERED
);
5232 /* If device is running, close it first. */
5233 dev_close_many(head
);
5235 list_for_each_entry(dev
, head
, unreg_list
) {
5236 /* And unlink it from device chain. */
5237 unlist_netdevice(dev
);
5239 dev
->reg_state
= NETREG_UNREGISTERING
;
5244 list_for_each_entry(dev
, head
, unreg_list
) {
5245 /* Shutdown queueing discipline. */
5249 /* Notify protocols, that we are about to destroy
5250 this device. They should clean all the things.
5252 call_netdevice_notifiers(NETDEV_UNREGISTER
, dev
);
5254 if (!dev
->rtnl_link_ops
||
5255 dev
->rtnl_link_state
== RTNL_LINK_INITIALIZED
)
5256 rtmsg_ifinfo(RTM_DELLINK
, dev
, ~0U);
5259 * Flush the unicast and multicast chains
5264 if (dev
->netdev_ops
->ndo_uninit
)
5265 dev
->netdev_ops
->ndo_uninit(dev
);
5267 /* Notifier chain MUST detach us all upper devices. */
5268 WARN_ON(netdev_has_any_upper_dev(dev
));
5270 /* Remove entries from kobject tree */
5271 netdev_unregister_kobject(dev
);
5273 /* Remove XPS queueing entries */
5274 netif_reset_xps_queues_gt(dev
, 0);
5280 list_for_each_entry(dev
, head
, unreg_list
)
5284 static void rollback_registered(struct net_device
*dev
)
5288 list_add(&dev
->unreg_list
, &single
);
5289 rollback_registered_many(&single
);
5293 static netdev_features_t
netdev_fix_features(struct net_device
*dev
,
5294 netdev_features_t features
)
5296 /* Fix illegal checksum combinations */
5297 if ((features
& NETIF_F_HW_CSUM
) &&
5298 (features
& (NETIF_F_IP_CSUM
|NETIF_F_IPV6_CSUM
))) {
5299 netdev_warn(dev
, "mixed HW and IP checksum settings.\n");
5300 features
&= ~(NETIF_F_IP_CSUM
|NETIF_F_IPV6_CSUM
);
5303 /* Fix illegal SG+CSUM combinations. */
5304 if ((features
& NETIF_F_SG
) &&
5305 !(features
& NETIF_F_ALL_CSUM
)) {
5307 "Dropping NETIF_F_SG since no checksum feature.\n");
5308 features
&= ~NETIF_F_SG
;
5311 /* TSO requires that SG is present as well. */
5312 if ((features
& NETIF_F_ALL_TSO
) && !(features
& NETIF_F_SG
)) {
5313 netdev_dbg(dev
, "Dropping TSO features since no SG feature.\n");
5314 features
&= ~NETIF_F_ALL_TSO
;
5317 /* TSO ECN requires that TSO is present as well. */
5318 if ((features
& NETIF_F_ALL_TSO
) == NETIF_F_TSO_ECN
)
5319 features
&= ~NETIF_F_TSO_ECN
;
5321 /* Software GSO depends on SG. */
5322 if ((features
& NETIF_F_GSO
) && !(features
& NETIF_F_SG
)) {
5323 netdev_dbg(dev
, "Dropping NETIF_F_GSO since no SG feature.\n");
5324 features
&= ~NETIF_F_GSO
;
5327 /* UFO needs SG and checksumming */
5328 if (features
& NETIF_F_UFO
) {
5329 /* maybe split UFO into V4 and V6? */
5330 if (!((features
& NETIF_F_GEN_CSUM
) ||
5331 (features
& (NETIF_F_IP_CSUM
|NETIF_F_IPV6_CSUM
))
5332 == (NETIF_F_IP_CSUM
|NETIF_F_IPV6_CSUM
))) {
5334 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5335 features
&= ~NETIF_F_UFO
;
5338 if (!(features
& NETIF_F_SG
)) {
5340 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5341 features
&= ~NETIF_F_UFO
;
5348 int __netdev_update_features(struct net_device
*dev
)
5350 netdev_features_t features
;
5355 features
= netdev_get_wanted_features(dev
);
5357 if (dev
->netdev_ops
->ndo_fix_features
)
5358 features
= dev
->netdev_ops
->ndo_fix_features(dev
, features
);
5360 /* driver might be less strict about feature dependencies */
5361 features
= netdev_fix_features(dev
, features
);
5363 if (dev
->features
== features
)
5366 netdev_dbg(dev
, "Features changed: %pNF -> %pNF\n",
5367 &dev
->features
, &features
);
5369 if (dev
->netdev_ops
->ndo_set_features
)
5370 err
= dev
->netdev_ops
->ndo_set_features(dev
, features
);
5372 if (unlikely(err
< 0)) {
5374 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5375 err
, &features
, &dev
->features
);
5380 dev
->features
= features
;
5386 * netdev_update_features - recalculate device features
5387 * @dev: the device to check
5389 * Recalculate dev->features set and send notifications if it
5390 * has changed. Should be called after driver or hardware dependent
5391 * conditions might have changed that influence the features.
5393 void netdev_update_features(struct net_device
*dev
)
5395 if (__netdev_update_features(dev
))
5396 netdev_features_change(dev
);
5398 EXPORT_SYMBOL(netdev_update_features
);
5401 * netdev_change_features - recalculate device features
5402 * @dev: the device to check
5404 * Recalculate dev->features set and send notifications even
5405 * if they have not changed. Should be called instead of
5406 * netdev_update_features() if also dev->vlan_features might
5407 * have changed to allow the changes to be propagated to stacked
5410 void netdev_change_features(struct net_device
*dev
)
5412 __netdev_update_features(dev
);
5413 netdev_features_change(dev
);
5415 EXPORT_SYMBOL(netdev_change_features
);
5418 * netif_stacked_transfer_operstate - transfer operstate
5419 * @rootdev: the root or lower level device to transfer state from
5420 * @dev: the device to transfer operstate to
5422 * Transfer operational state from root to device. This is normally
5423 * called when a stacking relationship exists between the root
5424 * device and the device(a leaf device).
5426 void netif_stacked_transfer_operstate(const struct net_device
*rootdev
,
5427 struct net_device
*dev
)
5429 if (rootdev
->operstate
== IF_OPER_DORMANT
)
5430 netif_dormant_on(dev
);
5432 netif_dormant_off(dev
);
5434 if (netif_carrier_ok(rootdev
)) {
5435 if (!netif_carrier_ok(dev
))
5436 netif_carrier_on(dev
);
5438 if (netif_carrier_ok(dev
))
5439 netif_carrier_off(dev
);
5442 EXPORT_SYMBOL(netif_stacked_transfer_operstate
);
5445 static int netif_alloc_rx_queues(struct net_device
*dev
)
5447 unsigned int i
, count
= dev
->num_rx_queues
;
5448 struct netdev_rx_queue
*rx
;
5452 rx
= kcalloc(count
, sizeof(struct netdev_rx_queue
), GFP_KERNEL
);
5458 for (i
= 0; i
< count
; i
++)
5464 static void netdev_init_one_queue(struct net_device
*dev
,
5465 struct netdev_queue
*queue
, void *_unused
)
5467 /* Initialize queue lock */
5468 spin_lock_init(&queue
->_xmit_lock
);
5469 netdev_set_xmit_lockdep_class(&queue
->_xmit_lock
, dev
->type
);
5470 queue
->xmit_lock_owner
= -1;
5471 netdev_queue_numa_node_write(queue
, NUMA_NO_NODE
);
5474 dql_init(&queue
->dql
, HZ
);
5478 static int netif_alloc_netdev_queues(struct net_device
*dev
)
5480 unsigned int count
= dev
->num_tx_queues
;
5481 struct netdev_queue
*tx
;
5485 tx
= kcalloc(count
, sizeof(struct netdev_queue
), GFP_KERNEL
);
5491 netdev_for_each_tx_queue(dev
, netdev_init_one_queue
, NULL
);
5492 spin_lock_init(&dev
->tx_global_lock
);
5498 * register_netdevice - register a network device
5499 * @dev: device to register
5501 * Take a completed network device structure and add it to the kernel
5502 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5503 * chain. 0 is returned on success. A negative errno code is returned
5504 * on a failure to set up the device, or if the name is a duplicate.
5506 * Callers must hold the rtnl semaphore. You may want
5507 * register_netdev() instead of this.
5510 * The locking appears insufficient to guarantee two parallel registers
5511 * will not get the same name.
5514 int register_netdevice(struct net_device
*dev
)
5517 struct net
*net
= dev_net(dev
);
5519 BUG_ON(dev_boot_phase
);
5524 /* When net_device's are persistent, this will be fatal. */
5525 BUG_ON(dev
->reg_state
!= NETREG_UNINITIALIZED
);
5528 spin_lock_init(&dev
->addr_list_lock
);
5529 netdev_set_addr_lockdep_class(dev
);
5533 ret
= dev_get_valid_name(net
, dev
, dev
->name
);
5537 /* Init, if this function is available */
5538 if (dev
->netdev_ops
->ndo_init
) {
5539 ret
= dev
->netdev_ops
->ndo_init(dev
);
5547 if (((dev
->hw_features
| dev
->features
) & NETIF_F_HW_VLAN_FILTER
) &&
5548 (!dev
->netdev_ops
->ndo_vlan_rx_add_vid
||
5549 !dev
->netdev_ops
->ndo_vlan_rx_kill_vid
)) {
5550 netdev_WARN(dev
, "Buggy VLAN acceleration in driver!\n");
5557 dev
->ifindex
= dev_new_index(net
);
5558 else if (__dev_get_by_index(net
, dev
->ifindex
))
5561 if (dev
->iflink
== -1)
5562 dev
->iflink
= dev
->ifindex
;
5564 /* Transfer changeable features to wanted_features and enable
5565 * software offloads (GSO and GRO).
5567 dev
->hw_features
|= NETIF_F_SOFT_FEATURES
;
5568 dev
->features
|= NETIF_F_SOFT_FEATURES
;
5569 dev
->wanted_features
= dev
->features
& dev
->hw_features
;
5571 /* Turn on no cache copy if HW is doing checksum */
5572 if (!(dev
->flags
& IFF_LOOPBACK
)) {
5573 dev
->hw_features
|= NETIF_F_NOCACHE_COPY
;
5574 if (dev
->features
& NETIF_F_ALL_CSUM
) {
5575 dev
->wanted_features
|= NETIF_F_NOCACHE_COPY
;
5576 dev
->features
|= NETIF_F_NOCACHE_COPY
;
5580 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5582 dev
->vlan_features
|= NETIF_F_HIGHDMA
;
5584 ret
= call_netdevice_notifiers(NETDEV_POST_INIT
, dev
);
5585 ret
= notifier_to_errno(ret
);
5589 ret
= netdev_register_kobject(dev
);
5592 dev
->reg_state
= NETREG_REGISTERED
;
5594 __netdev_update_features(dev
);
5597 * Default initial state at registry is that the
5598 * device is present.
5601 set_bit(__LINK_STATE_PRESENT
, &dev
->state
);
5603 linkwatch_init_dev(dev
);
5605 dev_init_scheduler(dev
);
5607 list_netdevice(dev
);
5608 add_device_randomness(dev
->dev_addr
, dev
->addr_len
);
5610 /* If the device has permanent device address, driver should
5611 * set dev_addr and also addr_assign_type should be set to
5612 * NET_ADDR_PERM (default value).
5614 if (dev
->addr_assign_type
== NET_ADDR_PERM
)
5615 memcpy(dev
->perm_addr
, dev
->dev_addr
, dev
->addr_len
);
5617 /* Notify protocols, that a new device appeared. */
5618 ret
= call_netdevice_notifiers(NETDEV_REGISTER
, dev
);
5619 ret
= notifier_to_errno(ret
);
5621 rollback_registered(dev
);
5622 dev
->reg_state
= NETREG_UNREGISTERED
;
5625 * Prevent userspace races by waiting until the network
5626 * device is fully setup before sending notifications.
5628 if (!dev
->rtnl_link_ops
||
5629 dev
->rtnl_link_state
== RTNL_LINK_INITIALIZED
)
5630 rtmsg_ifinfo(RTM_NEWLINK
, dev
, ~0U);
5636 if (dev
->netdev_ops
->ndo_uninit
)
5637 dev
->netdev_ops
->ndo_uninit(dev
);
5640 EXPORT_SYMBOL(register_netdevice
);
5643 * init_dummy_netdev - init a dummy network device for NAPI
5644 * @dev: device to init
5646 * This takes a network device structure and initialize the minimum
5647 * amount of fields so it can be used to schedule NAPI polls without
5648 * registering a full blown interface. This is to be used by drivers
5649 * that need to tie several hardware interfaces to a single NAPI
5650 * poll scheduler due to HW limitations.
5652 int init_dummy_netdev(struct net_device
*dev
)
5654 /* Clear everything. Note we don't initialize spinlocks
5655 * are they aren't supposed to be taken by any of the
5656 * NAPI code and this dummy netdev is supposed to be
5657 * only ever used for NAPI polls
5659 memset(dev
, 0, sizeof(struct net_device
));
5661 /* make sure we BUG if trying to hit standard
5662 * register/unregister code path
5664 dev
->reg_state
= NETREG_DUMMY
;
5666 /* NAPI wants this */
5667 INIT_LIST_HEAD(&dev
->napi_list
);
5669 /* a dummy interface is started by default */
5670 set_bit(__LINK_STATE_PRESENT
, &dev
->state
);
5671 set_bit(__LINK_STATE_START
, &dev
->state
);
5673 /* Note : We dont allocate pcpu_refcnt for dummy devices,
5674 * because users of this 'device' dont need to change
5680 EXPORT_SYMBOL_GPL(init_dummy_netdev
);
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
 *	and expands the device name if you passed a format string to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = register_netdevice(dev);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdev);
5707 int netdev_refcnt_read(const struct net_device
*dev
)
5711 for_each_possible_cpu(i
)
5712 refcnt
+= *per_cpu_ptr(dev
->pcpu_refcnt
, i
);
5715 EXPORT_SYMBOL(netdev_refcnt_read
);
5718 * netdev_wait_allrefs - wait until all references are gone.
5719 * @dev: target net_device
5721 * This is called when unregistering network devices.
5723 * Any protocol or device that holds a reference should register
5724 * for netdevice notification, and cleanup and put back the
5725 * reference if they receive an UNREGISTER event.
5726 * We can get stuck here if buggy protocols don't correctly
5729 static void netdev_wait_allrefs(struct net_device
*dev
)
5731 unsigned long rebroadcast_time
, warning_time
;
5734 linkwatch_forget_dev(dev
);
5736 rebroadcast_time
= warning_time
= jiffies
;
5737 refcnt
= netdev_refcnt_read(dev
);
5739 while (refcnt
!= 0) {
5740 if (time_after(jiffies
, rebroadcast_time
+ 1 * HZ
)) {
5743 /* Rebroadcast unregister notification */
5744 call_netdevice_notifiers(NETDEV_UNREGISTER
, dev
);
5750 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL
, dev
);
5751 if (test_bit(__LINK_STATE_LINKWATCH_PENDING
,
5753 /* We must not have linkwatch events
5754 * pending on unregister. If this
5755 * happens, we simply run the queue
5756 * unscheduled, resulting in a noop
5759 linkwatch_run_queue();
5764 rebroadcast_time
= jiffies
;
5769 refcnt
= netdev_refcnt_read(dev
);
5771 if (time_after(jiffies
, warning_time
+ 10 * HZ
)) {
5772 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5774 warning_time
= jiffies
;
5783 * register_netdevice(x1);
5784 * register_netdevice(x2);
5786 * unregister_netdevice(y1);
5787 * unregister_netdevice(y2);
5793 * We are invoked by rtnl_unlock().
5794 * This allows us to deal with problems:
5795 * 1) We can delete sysfs objects which invoke hotplug
5796 * without deadlocking with linkwatch via keventd.
5797 * 2) Since we run with the RTNL semaphore not held, we can sleep
5798 * safely in order to wait for the netdev refcnt to drop to zero.
5800 * We must not return until all unregister events added during
5801 * the interval the lock was held have been completed.
5803 void netdev_run_todo(void)
5805 struct list_head list
;
5807 /* Snapshot list, allow later requests */
5808 list_replace_init(&net_todo_list
, &list
);
5813 /* Wait for rcu callbacks to finish before next phase */
5814 if (!list_empty(&list
))
5817 while (!list_empty(&list
)) {
5818 struct net_device
*dev
5819 = list_first_entry(&list
, struct net_device
, todo_list
);
5820 list_del(&dev
->todo_list
);
5823 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL
, dev
);
5826 if (unlikely(dev
->reg_state
!= NETREG_UNREGISTERING
)) {
5827 pr_err("network todo '%s' but state %d\n",
5828 dev
->name
, dev
->reg_state
);
5833 dev
->reg_state
= NETREG_UNREGISTERED
;
5835 on_each_cpu(flush_backlog
, dev
, 1);
5837 netdev_wait_allrefs(dev
);
5840 BUG_ON(netdev_refcnt_read(dev
));
5841 WARN_ON(rcu_access_pointer(dev
->ip_ptr
));
5842 WARN_ON(rcu_access_pointer(dev
->ip6_ptr
));
5843 WARN_ON(dev
->dn_ptr
);
5845 if (dev
->destructor
)
5846 dev
->destructor(dev
);
5848 /* Free network device */
5849 kobject_put(&dev
->dev
.kobj
);
5853 /* Convert net_device_stats to rtnl_link_stats64. They have the same
5854 * fields in the same order, with only the type differing.
5856 void netdev_stats_to_stats64(struct rtnl_link_stats64
*stats64
,
5857 const struct net_device_stats
*netdev_stats
)
5859 #if BITS_PER_LONG == 64
5860 BUILD_BUG_ON(sizeof(*stats64
) != sizeof(*netdev_stats
));
5861 memcpy(stats64
, netdev_stats
, sizeof(*stats64
));
5863 size_t i
, n
= sizeof(*stats64
) / sizeof(u64
);
5864 const unsigned long *src
= (const unsigned long *)netdev_stats
;
5865 u64
*dst
= (u64
*)stats64
;
5867 BUILD_BUG_ON(sizeof(*netdev_stats
) / sizeof(unsigned long) !=
5868 sizeof(*stats64
) / sizeof(u64
));
5869 for (i
= 0; i
< n
; i
++)
5873 EXPORT_SYMBOL(netdev_stats_to_stats64
);
5876 * dev_get_stats - get network device statistics
5877 * @dev: device to get statistics from
5878 * @storage: place to store stats
5880 * Get network statistics from device. Return @storage.
5881 * The device driver may provide its own method by setting
5882 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5883 * otherwise the internal statistics structure is used.
5885 struct rtnl_link_stats64
*dev_get_stats(struct net_device
*dev
,
5886 struct rtnl_link_stats64
*storage
)
5888 const struct net_device_ops
*ops
= dev
->netdev_ops
;
5890 if (ops
->ndo_get_stats64
) {
5891 memset(storage
, 0, sizeof(*storage
));
5892 ops
->ndo_get_stats64(dev
, storage
);
5893 } else if (ops
->ndo_get_stats
) {
5894 netdev_stats_to_stats64(storage
, ops
->ndo_get_stats(dev
));
5896 netdev_stats_to_stats64(storage
, &dev
->stats
);
5898 storage
->rx_dropped
+= atomic_long_read(&dev
->rx_dropped
);
5901 EXPORT_SYMBOL(dev_get_stats
);
/* Lazily create the ingress queue (only meaningful with CONFIG_NET_CLS_ACT). */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (queue)
		return queue;
	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue)
		return NULL;
	netdev_init_one_queue(dev, queue, NULL);
	queue->qdisc = &noop_qdisc;
	queue->qdisc_sleeping = &noop_qdisc;
	rcu_assign_pointer(dev->ingress_queue, queue);
#endif
	return queue;
}
5921 static const struct ethtool_ops default_ethtool_ops
;
5923 void netdev_set_default_ethtool_ops(struct net_device
*dev
,
5924 const struct ethtool_ops
*ops
)
5926 if (dev
->ethtool_ops
== &default_ethtool_ops
)
5927 dev
->ethtool_ops
= ops
;
5929 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops
);
5932 * alloc_netdev_mqs - allocate network device
5933 * @sizeof_priv: size of private data to allocate space for
5934 * @name: device name format string
5935 * @setup: callback to initialize device
5936 * @txqs: the number of TX subqueues to allocate
5937 * @rxqs: the number of RX subqueues to allocate
5939 * Allocates a struct net_device with private data area for driver use
5940 * and performs basic initialization. Also allocates subquue structs
5941 * for each queue on the device.
5943 struct net_device
*alloc_netdev_mqs(int sizeof_priv
, const char *name
,
5944 void (*setup
)(struct net_device
*),
5945 unsigned int txqs
, unsigned int rxqs
)
5947 struct net_device
*dev
;
5949 struct net_device
*p
;
5951 BUG_ON(strlen(name
) >= sizeof(dev
->name
));
5954 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5960 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5965 alloc_size
= sizeof(struct net_device
);
5967 /* ensure 32-byte alignment of private area */
5968 alloc_size
= ALIGN(alloc_size
, NETDEV_ALIGN
);
5969 alloc_size
+= sizeof_priv
;
5971 /* ensure 32-byte alignment of whole construct */
5972 alloc_size
+= NETDEV_ALIGN
- 1;
5974 p
= kzalloc(alloc_size
, GFP_KERNEL
);
5978 dev
= PTR_ALIGN(p
, NETDEV_ALIGN
);
5979 dev
->padded
= (char *)dev
- (char *)p
;
5981 dev
->pcpu_refcnt
= alloc_percpu(int);
5982 if (!dev
->pcpu_refcnt
)
5985 if (dev_addr_init(dev
))
5991 dev_net_set(dev
, &init_net
);
5993 dev
->gso_max_size
= GSO_MAX_SIZE
;
5994 dev
->gso_max_segs
= GSO_MAX_SEGS
;
5996 INIT_LIST_HEAD(&dev
->napi_list
);
5997 INIT_LIST_HEAD(&dev
->unreg_list
);
5998 INIT_LIST_HEAD(&dev
->link_watch_list
);
5999 INIT_LIST_HEAD(&dev
->upper_dev_list
);
6000 dev
->priv_flags
= IFF_XMIT_DST_RELEASE
;
6003 dev
->num_tx_queues
= txqs
;
6004 dev
->real_num_tx_queues
= txqs
;
6005 if (netif_alloc_netdev_queues(dev
))
6009 dev
->num_rx_queues
= rxqs
;
6010 dev
->real_num_rx_queues
= rxqs
;
6011 if (netif_alloc_rx_queues(dev
))
6015 strcpy(dev
->name
, name
);
6016 dev
->group
= INIT_NETDEV_GROUP
;
6017 if (!dev
->ethtool_ops
)
6018 dev
->ethtool_ops
= &default_ethtool_ops
;
6026 free_percpu(dev
->pcpu_refcnt
);
6036 EXPORT_SYMBOL(alloc_netdev_mqs
);
6039 * free_netdev - free network device
6042 * This function does the last stage of destroying an allocated device
6043 * interface. The reference to the device object is released.
6044 * If this is the last reference then it will be freed.
6046 void free_netdev(struct net_device
*dev
)
6048 struct napi_struct
*p
, *n
;
6050 release_net(dev_net(dev
));
6057 kfree(rcu_dereference_protected(dev
->ingress_queue
, 1));
6059 /* Flush device addresses */
6060 dev_addr_flush(dev
);
6062 list_for_each_entry_safe(p
, n
, &dev
->napi_list
, dev_list
)
6065 free_percpu(dev
->pcpu_refcnt
);
6066 dev
->pcpu_refcnt
= NULL
;
6068 /* Compatibility with error handling in drivers */
6069 if (dev
->reg_state
== NETREG_UNINITIALIZED
) {
6070 kfree((char *)dev
- dev
->padded
);
6074 BUG_ON(dev
->reg_state
!= NETREG_UNREGISTERED
);
6075 dev
->reg_state
= NETREG_RELEASED
;
6077 /* will free via device release */
6078 put_device(&dev
->dev
);
6080 EXPORT_SYMBOL(free_netdev
);
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 */
void synchronize_net(void)
{
	might_sleep();
	/* Expedite while RTNL is held so we don't stall other RTNL users. */
	if (rtnl_is_locked())
		synchronize_rcu_expedited();
	else
		synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
6099 * unregister_netdevice_queue - remove device from the kernel
6103 * This function shuts down a device interface and removes it
6104 * from the kernel tables.
6105 * If head not NULL, device is queued to be unregistered later.
6107 * Callers must hold the rtnl semaphore. You may want
6108 * unregister_netdev() instead of this.
6111 void unregister_netdevice_queue(struct net_device
*dev
, struct list_head
*head
)
6116 list_move_tail(&dev
->unreg_list
, head
);
6118 rollback_registered(dev
);
6119 /* Finish processing unregister after unlock */
6123 EXPORT_SYMBOL(unregister_netdevice_queue
);
6126 * unregister_netdevice_many - unregister many devices
6127 * @head: list of devices
6129 void unregister_netdevice_many(struct list_head
*head
)
6131 struct net_device
*dev
;
6133 if (!list_empty(head
)) {
6134 rollback_registered_many(head
);
6135 list_for_each_entry(dev
, head
, unreg_list
)
6139 EXPORT_SYMBOL(unregister_netdevice_many
);
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6161 * dev_change_net_namespace - move device to different nethost namespace
6163 * @net: network namespace
6164 * @pat: If not NULL name pattern to try if the current device name
6165 * is already taken in the destination network namespace.
6167 * This function shuts down a device interface and moves it
6168 * to a new network namespace. On success 0 is returned, on
6169 * a failure a netagive errno code is returned.
6171 * Callers must hold the rtnl semaphore.
6174 int dev_change_net_namespace(struct net_device
*dev
, struct net
*net
, const char *pat
)
6180 /* Don't allow namespace local devices to be moved. */
6182 if (dev
->features
& NETIF_F_NETNS_LOCAL
)
6185 /* Ensure the device has been registrered */
6186 if (dev
->reg_state
!= NETREG_REGISTERED
)
6189 /* Get out if there is nothing todo */
6191 if (net_eq(dev_net(dev
), net
))
6194 /* Pick the destination device name, and ensure
6195 * we can use it in the destination network namespace.
6198 if (__dev_get_by_name(net
, dev
->name
)) {
6199 /* We get here if we can't use the current device name */
6202 if (dev_get_valid_name(net
, dev
, pat
) < 0)
6207 * And now a mini version of register_netdevice unregister_netdevice.
6210 /* If device is running close it first. */
6213 /* And unlink it from device chain */
6215 unlist_netdevice(dev
);
6219 /* Shutdown queueing discipline. */
6222 /* Notify protocols, that we are about to destroy
6223 this device. They should clean all the things.
6225 Note that dev->reg_state stays at NETREG_REGISTERED.
6226 This is wanted because this way 8021q and macvlan know
6227 the device is just moving and can keep their slaves up.
6229 call_netdevice_notifiers(NETDEV_UNREGISTER
, dev
);
6231 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL
, dev
);
6232 rtmsg_ifinfo(RTM_DELLINK
, dev
, ~0U);
6235 * Flush the unicast and multicast chains
6240 /* Send a netdev-removed uevent to the old namespace */
6241 kobject_uevent(&dev
->dev
.kobj
, KOBJ_REMOVE
);
6243 /* Actually switch the network namespace */
6244 dev_net_set(dev
, net
);
6246 /* If there is an ifindex conflict assign a new one */
6247 if (__dev_get_by_index(net
, dev
->ifindex
)) {
6248 int iflink
= (dev
->iflink
== dev
->ifindex
);
6249 dev
->ifindex
= dev_new_index(net
);
6251 dev
->iflink
= dev
->ifindex
;
6254 /* Send a netdev-add uevent to the new namespace */
6255 kobject_uevent(&dev
->dev
.kobj
, KOBJ_ADD
);
6257 /* Fixup kobjects */
6258 err
= device_rename(&dev
->dev
, dev
->name
);
6261 /* Add the device back in the hashes */
6262 list_netdevice(dev
);
6264 /* Notify protocols, that a new device appeared. */
6265 call_netdevice_notifiers(NETDEV_REGISTER
, dev
);
6268 * Prevent userspace races by waiting until the network
6269 * device is fully setup before sending notifications.
6271 rtmsg_ifinfo(RTM_NEWLINK
, dev
, ~0U);
6278 EXPORT_SYMBOL_GPL(dev_change_net_namespace
);
6280 static int dev_cpu_callback(struct notifier_block
*nfb
,
6281 unsigned long action
,
6284 struct sk_buff
**list_skb
;
6285 struct sk_buff
*skb
;
6286 unsigned int cpu
, oldcpu
= (unsigned long)ocpu
;
6287 struct softnet_data
*sd
, *oldsd
;
6289 if (action
!= CPU_DEAD
&& action
!= CPU_DEAD_FROZEN
)
6292 local_irq_disable();
6293 cpu
= smp_processor_id();
6294 sd
= &per_cpu(softnet_data
, cpu
);
6295 oldsd
= &per_cpu(softnet_data
, oldcpu
);
6297 /* Find end of our completion_queue. */
6298 list_skb
= &sd
->completion_queue
;
6300 list_skb
= &(*list_skb
)->next
;
6301 /* Append completion queue from offline CPU. */
6302 *list_skb
= oldsd
->completion_queue
;
6303 oldsd
->completion_queue
= NULL
;
6305 /* Append output queue from offline CPU. */
6306 if (oldsd
->output_queue
) {
6307 *sd
->output_queue_tailp
= oldsd
->output_queue
;
6308 sd
->output_queue_tailp
= oldsd
->output_queue_tailp
;
6309 oldsd
->output_queue
= NULL
;
6310 oldsd
->output_queue_tailp
= &oldsd
->output_queue
;
6312 /* Append NAPI poll list from offline CPU. */
6313 if (!list_empty(&oldsd
->poll_list
)) {
6314 list_splice_init(&oldsd
->poll_list
, &sd
->poll_list
);
6315 raise_softirq_irqoff(NET_RX_SOFTIRQ
);
6318 raise_softirq_irqoff(NET_TX_SOFTIRQ
);
6321 /* Process offline CPU's input_pkt_queue */
6322 while ((skb
= __skb_dequeue(&oldsd
->process_queue
))) {
6324 input_queue_head_incr(oldsd
);
6326 while ((skb
= __skb_dequeue(&oldsd
->input_pkt_queue
))) {
6328 input_queue_head_incr(oldsd
);
6336 * netdev_increment_features - increment feature set by one
6337 * @all: current feature set
6338 * @one: new feature set
6339 * @mask: mask feature set
6341 * Computes a new feature set after adding a device with feature set
6342 * @one to the master device with current feature set @all. Will not
6343 * enable anything that is off in @mask. Returns the new feature set.
6345 netdev_features_t
netdev_increment_features(netdev_features_t all
,
6346 netdev_features_t one
, netdev_features_t mask
)
6348 if (mask
& NETIF_F_GEN_CSUM
)
6349 mask
|= NETIF_F_ALL_CSUM
;
6350 mask
|= NETIF_F_VLAN_CHALLENGED
;
6352 all
|= one
& (NETIF_F_ONE_FOR_ALL
|NETIF_F_ALL_CSUM
) & mask
;
6353 all
&= one
| ~NETIF_F_ALL_FOR_ALL
;
6355 /* If one device supports hw checksumming, set for all. */
6356 if (all
& NETIF_F_GEN_CSUM
)
6357 all
&= ~(NETIF_F_ALL_CSUM
& ~NETIF_F_GEN_CSUM
);
6361 EXPORT_SYMBOL(netdev_increment_features
);
6363 static struct hlist_head
*netdev_create_hash(void)
6366 struct hlist_head
*hash
;
6368 hash
= kmalloc(sizeof(*hash
) * NETDEV_HASHENTRIES
, GFP_KERNEL
);
6370 for (i
= 0; i
< NETDEV_HASHENTRIES
; i
++)
6371 INIT_HLIST_HEAD(&hash
[i
]);
6376 /* Initialize per network namespace state */
6377 static int __net_init
netdev_init(struct net
*net
)
6379 if (net
!= &init_net
)
6380 INIT_LIST_HEAD(&net
->dev_base_head
);
6382 net
->dev_name_head
= netdev_create_hash();
6383 if (net
->dev_name_head
== NULL
)
6386 net
->dev_index_head
= netdev_create_hash();
6387 if (net
->dev_index_head
== NULL
)
6393 kfree(net
->dev_name_head
);
6399 * netdev_drivername - network driver for the device
6400 * @dev: network device
6402 * Determine network driver for device.
6404 const char *netdev_drivername(const struct net_device
*dev
)
6406 const struct device_driver
*driver
;
6407 const struct device
*parent
;
6408 const char *empty
= "";
6410 parent
= dev
->dev
.parent
;
6414 driver
= parent
->driver
;
6415 if (driver
&& driver
->name
)
6416 return driver
->name
;
6420 static int __netdev_printk(const char *level
, const struct net_device
*dev
,
6421 struct va_format
*vaf
)
6425 if (dev
&& dev
->dev
.parent
) {
6426 r
= dev_printk_emit(level
[1] - '0',
6429 dev_driver_string(dev
->dev
.parent
),
6430 dev_name(dev
->dev
.parent
),
6431 netdev_name(dev
), vaf
);
6433 r
= printk("%s%s: %pV", level
, netdev_name(dev
), vaf
);
6435 r
= printk("%s(NULL net_device): %pV", level
, vaf
);
6441 int netdev_printk(const char *level
, const struct net_device
*dev
,
6442 const char *format
, ...)
6444 struct va_format vaf
;
6448 va_start(args
, format
);
6453 r
= __netdev_printk(level
, dev
, &vaf
);
6459 EXPORT_SYMBOL(netdev_printk
);
6461 #define define_netdev_printk_level(func, level) \
6462 int func(const struct net_device *dev, const char *fmt, ...) \
6465 struct va_format vaf; \
6468 va_start(args, fmt); \
6473 r = __netdev_printk(level, dev, &vaf); \
6479 EXPORT_SYMBOL(func);
6481 define_netdev_printk_level(netdev_emerg
, KERN_EMERG
);
6482 define_netdev_printk_level(netdev_alert
, KERN_ALERT
);
6483 define_netdev_printk_level(netdev_crit
, KERN_CRIT
);
6484 define_netdev_printk_level(netdev_err
, KERN_ERR
);
6485 define_netdev_printk_level(netdev_warn
, KERN_WARNING
);
6486 define_netdev_printk_level(netdev_notice
, KERN_NOTICE
);
6487 define_netdev_printk_level(netdev_info
, KERN_INFO
);
6489 static void __net_exit
netdev_exit(struct net
*net
)
6491 kfree(net
->dev_name_head
);
6492 kfree(net
->dev_index_head
);
6495 static struct pernet_operations __net_initdata netdev_net_ops
= {
6496 .init
= netdev_init
,
6497 .exit
= netdev_exit
,
6500 static void __net_exit
default_device_exit(struct net
*net
)
6502 struct net_device
*dev
, *aux
;
6504 * Push all migratable network devices back to the
6505 * initial network namespace
6508 for_each_netdev_safe(net
, dev
, aux
) {
6510 char fb_name
[IFNAMSIZ
];
6512 /* Ignore unmoveable devices (i.e. loopback) */
6513 if (dev
->features
& NETIF_F_NETNS_LOCAL
)
6516 /* Leave virtual devices for the generic cleanup */
6517 if (dev
->rtnl_link_ops
)
6520 /* Push remaining network devices to init_net */
6521 snprintf(fb_name
, IFNAMSIZ
, "dev%d", dev
->ifindex
);
6522 err
= dev_change_net_namespace(dev
, &init_net
, fb_name
);
6524 pr_emerg("%s: failed to move %s to init_net: %d\n",
6525 __func__
, dev
->name
, err
);
6532 static void __net_exit
default_device_exit_batch(struct list_head
*net_list
)
6534 /* At exit all network devices most be removed from a network
6535 * namespace. Do this in the reverse order of registration.
6536 * Do this across as many network namespaces as possible to
6537 * improve batching efficiency.
6539 struct net_device
*dev
;
6541 LIST_HEAD(dev_kill_list
);
6544 list_for_each_entry(net
, net_list
, exit_list
) {
6545 for_each_netdev_reverse(net
, dev
) {
6546 if (dev
->rtnl_link_ops
)
6547 dev
->rtnl_link_ops
->dellink(dev
, &dev_kill_list
);
6549 unregister_netdevice_queue(dev
, &dev_kill_list
);
6552 unregister_netdevice_many(&dev_kill_list
);
6553 list_del(&dev_kill_list
);
6557 static struct pernet_operations __net_initdata default_device_ops
= {
6558 .exit
= default_device_exit
,
6559 .exit_batch
= default_device_exit_batch
,
6563 * Initialize the DEV module. At boot time this walks the device list and
6564 * unhooks any devices that fail to initialise (normally hardware not
6565 * present) and leaves us with a valid list of present and active devices.
6570 * This is called single threaded during boot, so no need
6571 * to take the rtnl semaphore.
6573 static int __init
net_dev_init(void)
6575 int i
, rc
= -ENOMEM
;
6577 BUG_ON(!dev_boot_phase
);
6579 if (dev_proc_init())
6582 if (netdev_kobject_init())
6585 INIT_LIST_HEAD(&ptype_all
);
6586 for (i
= 0; i
< PTYPE_HASH_SIZE
; i
++)
6587 INIT_LIST_HEAD(&ptype_base
[i
]);
6589 INIT_LIST_HEAD(&offload_base
);
6591 if (register_pernet_subsys(&netdev_net_ops
))
6595 * Initialise the packet receive queues.
6598 for_each_possible_cpu(i
) {
6599 struct softnet_data
*sd
= &per_cpu(softnet_data
, i
);
6601 memset(sd
, 0, sizeof(*sd
));
6602 skb_queue_head_init(&sd
->input_pkt_queue
);
6603 skb_queue_head_init(&sd
->process_queue
);
6604 sd
->completion_queue
= NULL
;
6605 INIT_LIST_HEAD(&sd
->poll_list
);
6606 sd
->output_queue
= NULL
;
6607 sd
->output_queue_tailp
= &sd
->output_queue
;
6609 sd
->csd
.func
= rps_trigger_softirq
;
6615 sd
->backlog
.poll
= process_backlog
;
6616 sd
->backlog
.weight
= weight_p
;
6617 sd
->backlog
.gro_list
= NULL
;
6618 sd
->backlog
.gro_count
= 0;
6623 /* The loopback device is special if any other network devices
6624 * is present in a network namespace the loopback device must
6625 * be present. Since we now dynamically allocate and free the
6626 * loopback device ensure this invariant is maintained by
6627 * keeping the loopback device as the first device on the
6628 * list of network devices. Ensuring the loopback devices
6629 * is the first device that appears and the last network device
6632 if (register_pernet_device(&loopback_net_ops
))
6635 if (register_pernet_device(&default_device_ops
))
6638 open_softirq(NET_TX_SOFTIRQ
, net_tx_action
);
6639 open_softirq(NET_RX_SOFTIRQ
, net_rx_action
);
6641 hotcpu_notifier(dev_cpu_callback
, 0);
6649 subsys_initcall(net_dev_init
);