rtnetlink: add babel protocol recognition
[deliverable/linux.git] / net / core / dev.c
CommitLineData
1da177e4
LT
1/*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
02c30a84 10 * Authors: Ross Biro
1da177e4
LT
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75#include <asm/uaccess.h>
1da177e4 76#include <linux/bitops.h>
4fc268d2 77#include <linux/capability.h>
1da177e4
LT
78#include <linux/cpu.h>
79#include <linux/types.h>
80#include <linux/kernel.h>
08e9897d 81#include <linux/hash.h>
5a0e3ad6 82#include <linux/slab.h>
1da177e4 83#include <linux/sched.h>
4a3e2f71 84#include <linux/mutex.h>
1da177e4
LT
85#include <linux/string.h>
86#include <linux/mm.h>
87#include <linux/socket.h>
88#include <linux/sockios.h>
89#include <linux/errno.h>
90#include <linux/interrupt.h>
91#include <linux/if_ether.h>
92#include <linux/netdevice.h>
93#include <linux/etherdevice.h>
0187bdfb 94#include <linux/ethtool.h>
1da177e4
LT
95#include <linux/notifier.h>
96#include <linux/skbuff.h>
457c4cbc 97#include <net/net_namespace.h>
1da177e4
LT
98#include <net/sock.h>
99#include <linux/rtnetlink.h>
1da177e4 100#include <linux/stat.h>
1da177e4
LT
101#include <net/dst.h>
102#include <net/pkt_sched.h>
103#include <net/checksum.h>
44540960 104#include <net/xfrm.h>
1da177e4
LT
105#include <linux/highmem.h>
106#include <linux/init.h>
1da177e4 107#include <linux/module.h>
1da177e4
LT
108#include <linux/netpoll.h>
109#include <linux/rcupdate.h>
110#include <linux/delay.h>
1da177e4 111#include <net/iw_handler.h>
1da177e4 112#include <asm/current.h>
5bdb9886 113#include <linux/audit.h>
db217334 114#include <linux/dmaengine.h>
f6a78bfc 115#include <linux/err.h>
c7fa9d18 116#include <linux/ctype.h>
723e98b7 117#include <linux/if_arp.h>
6de329e2 118#include <linux/if_vlan.h>
8f0f2223 119#include <linux/ip.h>
ad55dcaf 120#include <net/ip.h>
25cd9ba0 121#include <net/mpls.h>
8f0f2223
DM
122#include <linux/ipv6.h>
123#include <linux/in.h>
b6b2fed1
DM
124#include <linux/jhash.h>
125#include <linux/random.h>
9cbc1cb8 126#include <trace/events/napi.h>
cf66ba58 127#include <trace/events/net.h>
07dc22e7 128#include <trace/events/skb.h>
5acbbd42 129#include <linux/pci.h>
caeda9b9 130#include <linux/inetdevice.h>
c445477d 131#include <linux/cpu_rmap.h>
c5905afb 132#include <linux/static_key.h>
af12fa6e 133#include <linux/hashtable.h>
60877a32 134#include <linux/vmalloc.h>
529d0489 135#include <linux/if_macvlan.h>
e7fd2885 136#include <linux/errqueue.h>
1da177e4 137
342709ef
PE
138#include "net-sysfs.h"
139
d565b0a1
HX
140/* Instead of increasing this, you should create a hash table. */
141#define MAX_GRO_SKBS 8
142
5d38a079
HX
143/* This should be increased if a protocol with a bigger head is added. */
144#define GRO_MAX_HEAD (MAX_HEADER + 128)
145
1da177e4 146static DEFINE_SPINLOCK(ptype_lock);
62532da9 147static DEFINE_SPINLOCK(offload_lock);
900ff8c6
CW
148struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
149struct list_head ptype_all __read_mostly; /* Taps */
62532da9 150static struct list_head offload_base __read_mostly;
1da177e4 151
ae78dbfa 152static int netif_rx_internal(struct sk_buff *skb);
54951194
LP
153static int call_netdevice_notifiers_info(unsigned long val,
154 struct net_device *dev,
155 struct netdev_notifier_info *info);
ae78dbfa 156
1da177e4 157/*
7562f876 158 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
1da177e4
LT
159 * semaphore.
160 *
c6d14c84 161 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
1da177e4
LT
162 *
163 * Writers must hold the rtnl semaphore while they loop through the
7562f876 164 * dev_base_head list, and hold dev_base_lock for writing when they do the
1da177e4
LT
165 * actual updates. This allows pure readers to access the list even
166 * while a writer is preparing to update it.
167 *
168 * To put it another way, dev_base_lock is held for writing only to
169 * protect against pure readers; the rtnl semaphore provides the
170 * protection against other writers.
171 *
172 * See, for example usages, register_netdevice() and
173 * unregister_netdevice(), which must be called with the rtnl
174 * semaphore held.
175 */
1da177e4 176DEFINE_RWLOCK(dev_base_lock);
1da177e4
LT
177EXPORT_SYMBOL(dev_base_lock);
178
af12fa6e
ET
179/* protects napi_hash addition/deletion and napi_gen_id */
180static DEFINE_SPINLOCK(napi_hash_lock);
181
182static unsigned int napi_gen_id;
183static DEFINE_HASHTABLE(napi_hash, 8);
184
18afa4b0 185static seqcount_t devnet_rename_seq;
c91f6df2 186
4e985ada
TG
187static inline void dev_base_seq_inc(struct net *net)
188{
189 while (++net->dev_base_seq == 0);
190}
191
881d966b 192static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
1da177e4 193{
95c96174
ED
194 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
195
08e9897d 196 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
1da177e4
LT
197}
198
881d966b 199static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
1da177e4 200{
7c28bd0b 201 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
1da177e4
LT
202}
203
e36fa2f7 204static inline void rps_lock(struct softnet_data *sd)
152102c7
CG
205{
206#ifdef CONFIG_RPS
e36fa2f7 207 spin_lock(&sd->input_pkt_queue.lock);
152102c7
CG
208#endif
209}
210
e36fa2f7 211static inline void rps_unlock(struct softnet_data *sd)
152102c7
CG
212{
213#ifdef CONFIG_RPS
e36fa2f7 214 spin_unlock(&sd->input_pkt_queue.lock);
152102c7
CG
215#endif
216}
217
ce286d32 218/* Device list insertion */
53759be9 219static void list_netdevice(struct net_device *dev)
ce286d32 220{
c346dca1 221 struct net *net = dev_net(dev);
ce286d32
EB
222
223 ASSERT_RTNL();
224
225 write_lock_bh(&dev_base_lock);
c6d14c84 226 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
72c9528b 227 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
fb699dfd
ED
228 hlist_add_head_rcu(&dev->index_hlist,
229 dev_index_hash(net, dev->ifindex));
ce286d32 230 write_unlock_bh(&dev_base_lock);
4e985ada
TG
231
232 dev_base_seq_inc(net);
ce286d32
EB
233}
234
fb699dfd
ED
235/* Device list removal
236 * caller must respect a RCU grace period before freeing/reusing dev
237 */
ce286d32
EB
238static void unlist_netdevice(struct net_device *dev)
239{
240 ASSERT_RTNL();
241
242 /* Unlink dev from the device chain */
243 write_lock_bh(&dev_base_lock);
c6d14c84 244 list_del_rcu(&dev->dev_list);
72c9528b 245 hlist_del_rcu(&dev->name_hlist);
fb699dfd 246 hlist_del_rcu(&dev->index_hlist);
ce286d32 247 write_unlock_bh(&dev_base_lock);
4e985ada
TG
248
249 dev_base_seq_inc(dev_net(dev));
ce286d32
EB
250}
251
1da177e4
LT
252/*
253 * Our notifier list
254 */
255
f07d5b94 256static RAW_NOTIFIER_HEAD(netdev_chain);
1da177e4
LT
257
258/*
259 * Device drivers call our routines to queue packets here. We empty the
260 * queue in the local softnet handler.
261 */
bea3348e 262
9958da05 263DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
d1b19dff 264EXPORT_PER_CPU_SYMBOL(softnet_data);
1da177e4 265
cf508b12 266#ifdef CONFIG_LOCKDEP
723e98b7 267/*
c773e847 268 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
723e98b7
JP
269 * according to dev->type
270 */
271static const unsigned short netdev_lock_type[] =
272 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
273 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
274 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
275 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
276 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
277 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
278 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
279 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
280 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
281 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
282 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
283 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
211ed865
PG
284 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
285 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
286 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
723e98b7 287
36cbd3dc 288static const char *const netdev_lock_name[] =
723e98b7
JP
289 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
290 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
291 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
292 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
293 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
294 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
295 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
296 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
297 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
298 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
299 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
300 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
211ed865
PG
301 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
302 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
303 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
723e98b7
JP
304
305static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
cf508b12 306static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
723e98b7
JP
307
308static inline unsigned short netdev_lock_pos(unsigned short dev_type)
309{
310 int i;
311
312 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
313 if (netdev_lock_type[i] == dev_type)
314 return i;
315 /* the last key is used by default */
316 return ARRAY_SIZE(netdev_lock_type) - 1;
317}
318
cf508b12
DM
319static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
320 unsigned short dev_type)
723e98b7
JP
321{
322 int i;
323
324 i = netdev_lock_pos(dev_type);
325 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
326 netdev_lock_name[i]);
327}
cf508b12
DM
328
329static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
330{
331 int i;
332
333 i = netdev_lock_pos(dev->type);
334 lockdep_set_class_and_name(&dev->addr_list_lock,
335 &netdev_addr_lock_key[i],
336 netdev_lock_name[i]);
337}
723e98b7 338#else
cf508b12
DM
339static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
340 unsigned short dev_type)
341{
342}
343static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
723e98b7
JP
344{
345}
346#endif
1da177e4
LT
347
348/*******************************************************************************
349
350 Protocol management and registration routines
351
352*******************************************************************************/
353
1da177e4
LT
354/*
355 * Add a protocol ID to the list. Now that the input handler is
356 * smarter we can dispense with all the messy stuff that used to be
357 * here.
358 *
359 * BEWARE!!! Protocol handlers, mangling input packets,
360 * MUST BE last in hash buckets and checking protocol handlers
361 * MUST start from promiscuous ptype_all chain in net_bh.
362 * It is true now, do not change it.
363 * Explanation follows: if protocol handler, mangling packet, will
364 * be the first on list, it is not able to sense, that packet
365 * is cloned and should be copied-on-write, so that it will
366 * change it and subsequent readers will get broken packet.
367 * --ANK (980803)
368 */
369
c07b68e8
ED
370static inline struct list_head *ptype_head(const struct packet_type *pt)
371{
372 if (pt->type == htons(ETH_P_ALL))
373 return &ptype_all;
374 else
375 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
376}
377
1da177e4
LT
378/**
379 * dev_add_pack - add packet handler
380 * @pt: packet type declaration
381 *
382 * Add a protocol handler to the networking stack. The passed &packet_type
383 * is linked into kernel lists and may not be freed until it has been
384 * removed from the kernel lists.
385 *
4ec93edb 386 * This call does not sleep therefore it can not
1da177e4
LT
387 * guarantee all CPU's that are in middle of receiving packets
388 * will see the new packet type (until the next received packet).
389 */
390
391void dev_add_pack(struct packet_type *pt)
392{
c07b68e8 393 struct list_head *head = ptype_head(pt);
1da177e4 394
c07b68e8
ED
395 spin_lock(&ptype_lock);
396 list_add_rcu(&pt->list, head);
397 spin_unlock(&ptype_lock);
1da177e4 398}
d1b19dff 399EXPORT_SYMBOL(dev_add_pack);
1da177e4 400
1da177e4
LT
401/**
402 * __dev_remove_pack - remove packet handler
403 * @pt: packet type declaration
404 *
405 * Remove a protocol handler that was previously added to the kernel
406 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
407 * from the kernel lists and can be freed or reused once this function
4ec93edb 408 * returns.
1da177e4
LT
409 *
410 * The packet type might still be in use by receivers
411 * and must not be freed until after all the CPU's have gone
412 * through a quiescent state.
413 */
414void __dev_remove_pack(struct packet_type *pt)
415{
c07b68e8 416 struct list_head *head = ptype_head(pt);
1da177e4
LT
417 struct packet_type *pt1;
418
c07b68e8 419 spin_lock(&ptype_lock);
1da177e4
LT
420
421 list_for_each_entry(pt1, head, list) {
422 if (pt == pt1) {
423 list_del_rcu(&pt->list);
424 goto out;
425 }
426 }
427
7b6cd1ce 428 pr_warn("dev_remove_pack: %p not found\n", pt);
1da177e4 429out:
c07b68e8 430 spin_unlock(&ptype_lock);
1da177e4 431}
d1b19dff
ED
432EXPORT_SYMBOL(__dev_remove_pack);
433
1da177e4
LT
434/**
435 * dev_remove_pack - remove packet handler
436 * @pt: packet type declaration
437 *
438 * Remove a protocol handler that was previously added to the kernel
439 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
440 * from the kernel lists and can be freed or reused once this function
441 * returns.
442 *
443 * This call sleeps to guarantee that no CPU is looking at the packet
444 * type after return.
445 */
446void dev_remove_pack(struct packet_type *pt)
447{
448 __dev_remove_pack(pt);
4ec93edb 449
1da177e4
LT
450 synchronize_net();
451}
d1b19dff 452EXPORT_SYMBOL(dev_remove_pack);
1da177e4 453
62532da9
VY
454
455/**
456 * dev_add_offload - register offload handlers
457 * @po: protocol offload declaration
458 *
459 * Add protocol offload handlers to the networking stack. The passed
460 * &proto_offload is linked into kernel lists and may not be freed until
461 * it has been removed from the kernel lists.
462 *
463 * This call does not sleep therefore it can not
464 * guarantee all CPU's that are in middle of receiving packets
465 * will see the new offload handlers (until the next received packet).
466 */
467void dev_add_offload(struct packet_offload *po)
468{
469 struct list_head *head = &offload_base;
470
471 spin_lock(&offload_lock);
472 list_add_rcu(&po->list, head);
473 spin_unlock(&offload_lock);
474}
475EXPORT_SYMBOL(dev_add_offload);
476
477/**
478 * __dev_remove_offload - remove offload handler
479 * @po: packet offload declaration
480 *
481 * Remove a protocol offload handler that was previously added to the
482 * kernel offload handlers by dev_add_offload(). The passed &offload_type
483 * is removed from the kernel lists and can be freed or reused once this
484 * function returns.
485 *
486 * The packet type might still be in use by receivers
487 * and must not be freed until after all the CPU's have gone
488 * through a quiescent state.
489 */
1d143d9f 490static void __dev_remove_offload(struct packet_offload *po)
62532da9
VY
491{
492 struct list_head *head = &offload_base;
493 struct packet_offload *po1;
494
c53aa505 495 spin_lock(&offload_lock);
62532da9
VY
496
497 list_for_each_entry(po1, head, list) {
498 if (po == po1) {
499 list_del_rcu(&po->list);
500 goto out;
501 }
502 }
503
504 pr_warn("dev_remove_offload: %p not found\n", po);
505out:
c53aa505 506 spin_unlock(&offload_lock);
62532da9 507}
62532da9
VY
508
509/**
510 * dev_remove_offload - remove packet offload handler
511 * @po: packet offload declaration
512 *
513 * Remove a packet offload handler that was previously added to the kernel
514 * offload handlers by dev_add_offload(). The passed &offload_type is
515 * removed from the kernel lists and can be freed or reused once this
516 * function returns.
517 *
518 * This call sleeps to guarantee that no CPU is looking at the packet
519 * type after return.
520 */
521void dev_remove_offload(struct packet_offload *po)
522{
523 __dev_remove_offload(po);
524
525 synchronize_net();
526}
527EXPORT_SYMBOL(dev_remove_offload);
528
1da177e4
LT
529/******************************************************************************
530
531 Device Boot-time Settings Routines
532
533*******************************************************************************/
534
535/* Boot time configuration table */
536static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
537
538/**
539 * netdev_boot_setup_add - add new setup entry
540 * @name: name of the device
541 * @map: configured settings for the device
542 *
543 * Adds new setup entry to the dev_boot_setup list. The function
544 * returns 0 on error and 1 on success. This is a generic routine to
545 * all netdevices.
546 */
547static int netdev_boot_setup_add(char *name, struct ifmap *map)
548{
549 struct netdev_boot_setup *s;
550 int i;
551
552 s = dev_boot_setup;
553 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
554 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
555 memset(s[i].name, 0, sizeof(s[i].name));
93b3cff9 556 strlcpy(s[i].name, name, IFNAMSIZ);
1da177e4
LT
557 memcpy(&s[i].map, map, sizeof(s[i].map));
558 break;
559 }
560 }
561
562 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
563}
564
565/**
566 * netdev_boot_setup_check - check boot time settings
567 * @dev: the netdevice
568 *
569 * Check boot time settings for the device.
570 * The found settings are set for the device to be used
571 * later in the device probing.
572 * Returns 0 if no settings found, 1 if they are.
573 */
574int netdev_boot_setup_check(struct net_device *dev)
575{
576 struct netdev_boot_setup *s = dev_boot_setup;
577 int i;
578
579 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
580 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
93b3cff9 581 !strcmp(dev->name, s[i].name)) {
1da177e4
LT
582 dev->irq = s[i].map.irq;
583 dev->base_addr = s[i].map.base_addr;
584 dev->mem_start = s[i].map.mem_start;
585 dev->mem_end = s[i].map.mem_end;
586 return 1;
587 }
588 }
589 return 0;
590}
d1b19dff 591EXPORT_SYMBOL(netdev_boot_setup_check);
1da177e4
LT
592
593
594/**
595 * netdev_boot_base - get address from boot time settings
596 * @prefix: prefix for network device
597 * @unit: id for network device
598 *
599 * Check boot time settings for the base address of device.
600 * The found settings are set for the device to be used
601 * later in the device probing.
602 * Returns 0 if no settings found.
603 */
604unsigned long netdev_boot_base(const char *prefix, int unit)
605{
606 const struct netdev_boot_setup *s = dev_boot_setup;
607 char name[IFNAMSIZ];
608 int i;
609
610 sprintf(name, "%s%d", prefix, unit);
611
612 /*
613 * If device already registered then return base of 1
614 * to indicate not to probe for this interface
615 */
881d966b 616 if (__dev_get_by_name(&init_net, name))
1da177e4
LT
617 return 1;
618
619 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
620 if (!strcmp(name, s[i].name))
621 return s[i].map.base_addr;
622 return 0;
623}
624
625/*
626 * Saves at boot time configured settings for any netdevice.
627 */
628int __init netdev_boot_setup(char *str)
629{
630 int ints[5];
631 struct ifmap map;
632
633 str = get_options(str, ARRAY_SIZE(ints), ints);
634 if (!str || !*str)
635 return 0;
636
637 /* Save settings */
638 memset(&map, 0, sizeof(map));
639 if (ints[0] > 0)
640 map.irq = ints[1];
641 if (ints[0] > 1)
642 map.base_addr = ints[2];
643 if (ints[0] > 2)
644 map.mem_start = ints[3];
645 if (ints[0] > 3)
646 map.mem_end = ints[4];
647
648 /* Add new entry to the list */
649 return netdev_boot_setup_add(str, &map);
650}
651
652__setup("netdev=", netdev_boot_setup);
653
654/*******************************************************************************
655
656 Device Interface Subroutines
657
658*******************************************************************************/
659
660/**
661 * __dev_get_by_name - find a device by its name
c4ea43c5 662 * @net: the applicable net namespace
1da177e4
LT
663 * @name: name to find
664 *
665 * Find an interface by name. Must be called under RTNL semaphore
666 * or @dev_base_lock. If the name is found a pointer to the device
667 * is returned. If the name is not found then %NULL is returned. The
668 * reference counters are not incremented so the caller must be
669 * careful with locks.
670 */
671
881d966b 672struct net_device *__dev_get_by_name(struct net *net, const char *name)
1da177e4 673{
0bd8d536
ED
674 struct net_device *dev;
675 struct hlist_head *head = dev_name_hash(net, name);
1da177e4 676
b67bfe0d 677 hlist_for_each_entry(dev, head, name_hlist)
1da177e4
LT
678 if (!strncmp(dev->name, name, IFNAMSIZ))
679 return dev;
0bd8d536 680
1da177e4
LT
681 return NULL;
682}
d1b19dff 683EXPORT_SYMBOL(__dev_get_by_name);
1da177e4 684
72c9528b
ED
685/**
686 * dev_get_by_name_rcu - find a device by its name
687 * @net: the applicable net namespace
688 * @name: name to find
689 *
690 * Find an interface by name.
691 * If the name is found a pointer to the device is returned.
692 * If the name is not found then %NULL is returned.
693 * The reference counters are not incremented so the caller must be
694 * careful with locks. The caller must hold RCU lock.
695 */
696
697struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
698{
72c9528b
ED
699 struct net_device *dev;
700 struct hlist_head *head = dev_name_hash(net, name);
701
b67bfe0d 702 hlist_for_each_entry_rcu(dev, head, name_hlist)
72c9528b
ED
703 if (!strncmp(dev->name, name, IFNAMSIZ))
704 return dev;
705
706 return NULL;
707}
708EXPORT_SYMBOL(dev_get_by_name_rcu);
709
1da177e4
LT
710/**
711 * dev_get_by_name - find a device by its name
c4ea43c5 712 * @net: the applicable net namespace
1da177e4
LT
713 * @name: name to find
714 *
715 * Find an interface by name. This can be called from any
716 * context and does its own locking. The returned handle has
717 * the usage count incremented and the caller must use dev_put() to
718 * release it when it is no longer needed. %NULL is returned if no
719 * matching device is found.
720 */
721
881d966b 722struct net_device *dev_get_by_name(struct net *net, const char *name)
1da177e4
LT
723{
724 struct net_device *dev;
725
72c9528b
ED
726 rcu_read_lock();
727 dev = dev_get_by_name_rcu(net, name);
1da177e4
LT
728 if (dev)
729 dev_hold(dev);
72c9528b 730 rcu_read_unlock();
1da177e4
LT
731 return dev;
732}
d1b19dff 733EXPORT_SYMBOL(dev_get_by_name);
1da177e4
LT
734
735/**
736 * __dev_get_by_index - find a device by its ifindex
c4ea43c5 737 * @net: the applicable net namespace
1da177e4
LT
738 * @ifindex: index of device
739 *
740 * Search for an interface by index. Returns %NULL if the device
741 * is not found or a pointer to the device. The device has not
742 * had its reference counter increased so the caller must be careful
743 * about locking. The caller must hold either the RTNL semaphore
744 * or @dev_base_lock.
745 */
746
881d966b 747struct net_device *__dev_get_by_index(struct net *net, int ifindex)
1da177e4 748{
0bd8d536
ED
749 struct net_device *dev;
750 struct hlist_head *head = dev_index_hash(net, ifindex);
1da177e4 751
b67bfe0d 752 hlist_for_each_entry(dev, head, index_hlist)
1da177e4
LT
753 if (dev->ifindex == ifindex)
754 return dev;
0bd8d536 755
1da177e4
LT
756 return NULL;
757}
d1b19dff 758EXPORT_SYMBOL(__dev_get_by_index);
1da177e4 759
fb699dfd
ED
760/**
761 * dev_get_by_index_rcu - find a device by its ifindex
762 * @net: the applicable net namespace
763 * @ifindex: index of device
764 *
765 * Search for an interface by index. Returns %NULL if the device
766 * is not found or a pointer to the device. The device has not
767 * had its reference counter increased so the caller must be careful
768 * about locking. The caller must hold RCU lock.
769 */
770
771struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
772{
fb699dfd
ED
773 struct net_device *dev;
774 struct hlist_head *head = dev_index_hash(net, ifindex);
775
b67bfe0d 776 hlist_for_each_entry_rcu(dev, head, index_hlist)
fb699dfd
ED
777 if (dev->ifindex == ifindex)
778 return dev;
779
780 return NULL;
781}
782EXPORT_SYMBOL(dev_get_by_index_rcu);
783
1da177e4
LT
784
785/**
786 * dev_get_by_index - find a device by its ifindex
c4ea43c5 787 * @net: the applicable net namespace
1da177e4
LT
788 * @ifindex: index of device
789 *
790 * Search for an interface by index. Returns NULL if the device
791 * is not found or a pointer to the device. The device returned has
792 * had a reference added and the pointer is safe until the user calls
793 * dev_put to indicate they have finished with it.
794 */
795
881d966b 796struct net_device *dev_get_by_index(struct net *net, int ifindex)
1da177e4
LT
797{
798 struct net_device *dev;
799
fb699dfd
ED
800 rcu_read_lock();
801 dev = dev_get_by_index_rcu(net, ifindex);
1da177e4
LT
802 if (dev)
803 dev_hold(dev);
fb699dfd 804 rcu_read_unlock();
1da177e4
LT
805 return dev;
806}
d1b19dff 807EXPORT_SYMBOL(dev_get_by_index);
1da177e4 808
5dbe7c17
NS
809/**
810 * netdev_get_name - get a netdevice name, knowing its ifindex.
811 * @net: network namespace
812 * @name: a pointer to the buffer where the name will be stored.
813 * @ifindex: the ifindex of the interface to get the name from.
814 *
815 * The use of raw_seqcount_begin() and cond_resched() before
816 * retrying is required as we want to give the writers a chance
817 * to complete when CONFIG_PREEMPT is not set.
818 */
819int netdev_get_name(struct net *net, char *name, int ifindex)
820{
821 struct net_device *dev;
822 unsigned int seq;
823
824retry:
825 seq = raw_seqcount_begin(&devnet_rename_seq);
826 rcu_read_lock();
827 dev = dev_get_by_index_rcu(net, ifindex);
828 if (!dev) {
829 rcu_read_unlock();
830 return -ENODEV;
831 }
832
833 strcpy(name, dev->name);
834 rcu_read_unlock();
835 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
836 cond_resched();
837 goto retry;
838 }
839
840 return 0;
841}
842
1da177e4 843/**
941666c2 844 * dev_getbyhwaddr_rcu - find a device by its hardware address
c4ea43c5 845 * @net: the applicable net namespace
1da177e4
LT
846 * @type: media type of device
847 * @ha: hardware address
848 *
849 * Search for an interface by MAC address. Returns NULL if the device
c506653d
ED
850 * is not found or a pointer to the device.
851 * The caller must hold RCU or RTNL.
941666c2 852 * The returned device has not had its ref count increased
1da177e4
LT
853 * and the caller must therefore be careful about locking
854 *
1da177e4
LT
855 */
856
941666c2
ED
857struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
858 const char *ha)
1da177e4
LT
859{
860 struct net_device *dev;
861
941666c2 862 for_each_netdev_rcu(net, dev)
1da177e4
LT
863 if (dev->type == type &&
864 !memcmp(dev->dev_addr, ha, dev->addr_len))
7562f876
PE
865 return dev;
866
867 return NULL;
1da177e4 868}
941666c2 869EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
cf309e3f 870
881d966b 871struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
1da177e4
LT
872{
873 struct net_device *dev;
874
4e9cac2b 875 ASSERT_RTNL();
881d966b 876 for_each_netdev(net, dev)
4e9cac2b 877 if (dev->type == type)
7562f876
PE
878 return dev;
879
880 return NULL;
4e9cac2b 881}
4e9cac2b
PM
882EXPORT_SYMBOL(__dev_getfirstbyhwtype);
883
881d966b 884struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
4e9cac2b 885{
99fe3c39 886 struct net_device *dev, *ret = NULL;
4e9cac2b 887
99fe3c39
ED
888 rcu_read_lock();
889 for_each_netdev_rcu(net, dev)
890 if (dev->type == type) {
891 dev_hold(dev);
892 ret = dev;
893 break;
894 }
895 rcu_read_unlock();
896 return ret;
1da177e4 897}
1da177e4
LT
898EXPORT_SYMBOL(dev_getfirstbyhwtype);
899
900/**
6c555490 901 * __dev_get_by_flags - find any device with given flags
c4ea43c5 902 * @net: the applicable net namespace
1da177e4
LT
903 * @if_flags: IFF_* values
904 * @mask: bitmask of bits in if_flags to check
905 *
906 * Search for any interface with the given flags. Returns NULL if a device
bb69ae04 907 * is not found or a pointer to the device. Must be called inside
6c555490 908 * rtnl_lock(), and result refcount is unchanged.
1da177e4
LT
909 */
910
6c555490
WC
911struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
912 unsigned short mask)
1da177e4 913{
7562f876 914 struct net_device *dev, *ret;
1da177e4 915
6c555490
WC
916 ASSERT_RTNL();
917
7562f876 918 ret = NULL;
6c555490 919 for_each_netdev(net, dev) {
1da177e4 920 if (((dev->flags ^ if_flags) & mask) == 0) {
7562f876 921 ret = dev;
1da177e4
LT
922 break;
923 }
924 }
7562f876 925 return ret;
1da177e4 926}
6c555490 927EXPORT_SYMBOL(__dev_get_by_flags);
1da177e4
LT
928
929/**
930 * dev_valid_name - check if name is okay for network device
931 * @name: name string
932 *
933 * Network device names need to be valid file names to
c7fa9d18
DM
934 * to allow sysfs to work. We also disallow any kind of
935 * whitespace.
1da177e4 936 */
95f050bf 937bool dev_valid_name(const char *name)
1da177e4 938{
c7fa9d18 939 if (*name == '\0')
95f050bf 940 return false;
b6fe17d6 941 if (strlen(name) >= IFNAMSIZ)
95f050bf 942 return false;
c7fa9d18 943 if (!strcmp(name, ".") || !strcmp(name, ".."))
95f050bf 944 return false;
c7fa9d18
DM
945
946 while (*name) {
947 if (*name == '/' || isspace(*name))
95f050bf 948 return false;
c7fa9d18
DM
949 name++;
950 }
95f050bf 951 return true;
1da177e4 952}
d1b19dff 953EXPORT_SYMBOL(dev_valid_name);
1da177e4
LT
954
955/**
b267b179
EB
956 * __dev_alloc_name - allocate a name for a device
957 * @net: network namespace to allocate the device name in
1da177e4 958 * @name: name format string
b267b179 959 * @buf: scratch buffer and result name string
1da177e4
LT
960 *
961 * Passed a format string - eg "lt%d" it will try and find a suitable
3041a069
SH
962 * id. It scans list of devices to build up a free map, then chooses
963 * the first empty slot. The caller must hold the dev_base or rtnl lock
964 * while allocating the name and adding the device in order to avoid
965 * duplicates.
966 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
967 * Returns the number of the unit assigned or a negative errno code.
1da177e4
LT
968 */
969
b267b179 970static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1da177e4
LT
971{
972 int i = 0;
1da177e4
LT
973 const char *p;
974 const int max_netdevices = 8*PAGE_SIZE;
cfcabdcc 975 unsigned long *inuse;
1da177e4
LT
976 struct net_device *d;
977
978 p = strnchr(name, IFNAMSIZ-1, '%');
979 if (p) {
980 /*
981 * Verify the string as this thing may have come from
982 * the user. There must be either one "%d" and no other "%"
983 * characters.
984 */
985 if (p[1] != 'd' || strchr(p + 2, '%'))
986 return -EINVAL;
987
988 /* Use one page as a bit array of possible slots */
cfcabdcc 989 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1da177e4
LT
990 if (!inuse)
991 return -ENOMEM;
992
881d966b 993 for_each_netdev(net, d) {
1da177e4
LT
994 if (!sscanf(d->name, name, &i))
995 continue;
996 if (i < 0 || i >= max_netdevices)
997 continue;
998
999 /* avoid cases where sscanf is not exact inverse of printf */
b267b179 1000 snprintf(buf, IFNAMSIZ, name, i);
1da177e4
LT
1001 if (!strncmp(buf, d->name, IFNAMSIZ))
1002 set_bit(i, inuse);
1003 }
1004
1005 i = find_first_zero_bit(inuse, max_netdevices);
1006 free_page((unsigned long) inuse);
1007 }
1008
d9031024
OP
1009 if (buf != name)
1010 snprintf(buf, IFNAMSIZ, name, i);
b267b179 1011 if (!__dev_get_by_name(net, buf))
1da177e4 1012 return i;
1da177e4
LT
1013
1014 /* It is possible to run out of possible slots
1015 * when the name is long and there isn't enough space left
1016 * for the digits, or if all bits are used.
1017 */
1018 return -ENFILE;
1019}
1020
b267b179
EB
1021/**
1022 * dev_alloc_name - allocate a name for a device
1023 * @dev: device
1024 * @name: name format string
1025 *
1026 * Passed a format string - eg "lt%d" it will try and find a suitable
1027 * id. It scans list of devices to build up a free map, then chooses
1028 * the first empty slot. The caller must hold the dev_base or rtnl lock
1029 * while allocating the name and adding the device in order to avoid
1030 * duplicates.
1031 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1032 * Returns the number of the unit assigned or a negative errno code.
1033 */
1034
1035int dev_alloc_name(struct net_device *dev, const char *name)
1036{
1037 char buf[IFNAMSIZ];
1038 struct net *net;
1039 int ret;
1040
c346dca1
YH
1041 BUG_ON(!dev_net(dev));
1042 net = dev_net(dev);
b267b179
EB
1043 ret = __dev_alloc_name(net, name, buf);
1044 if (ret >= 0)
1045 strlcpy(dev->name, buf, IFNAMSIZ);
1046 return ret;
1047}
d1b19dff 1048EXPORT_SYMBOL(dev_alloc_name);
b267b179 1049
828de4f6
G
1050static int dev_alloc_name_ns(struct net *net,
1051 struct net_device *dev,
1052 const char *name)
d9031024 1053{
828de4f6
G
1054 char buf[IFNAMSIZ];
1055 int ret;
8ce6cebc 1056
828de4f6
G
1057 ret = __dev_alloc_name(net, name, buf);
1058 if (ret >= 0)
1059 strlcpy(dev->name, buf, IFNAMSIZ);
1060 return ret;
1061}
1062
1063static int dev_get_valid_name(struct net *net,
1064 struct net_device *dev,
1065 const char *name)
1066{
1067 BUG_ON(!net);
8ce6cebc 1068
d9031024
OP
1069 if (!dev_valid_name(name))
1070 return -EINVAL;
1071
1c5cae81 1072 if (strchr(name, '%'))
828de4f6 1073 return dev_alloc_name_ns(net, dev, name);
d9031024
OP
1074 else if (__dev_get_by_name(net, name))
1075 return -EEXIST;
8ce6cebc
DL
1076 else if (dev->name != name)
1077 strlcpy(dev->name, name, IFNAMSIZ);
d9031024
OP
1078
1079 return 0;
1080}
1da177e4
LT
1081
1082/**
1083 * dev_change_name - change name of a device
1084 * @dev: device
1085 * @newname: name (or format string) must be at least IFNAMSIZ
1086 *
1087 * Change name of a device, can pass format strings "eth%d".
1088 * for wildcarding.
1089 */
cf04a4c7 1090int dev_change_name(struct net_device *dev, const char *newname)
1da177e4 1091{
238fa362 1092 unsigned char old_assign_type;
fcc5a03a 1093 char oldname[IFNAMSIZ];
1da177e4 1094 int err = 0;
fcc5a03a 1095 int ret;
881d966b 1096 struct net *net;
1da177e4
LT
1097
1098 ASSERT_RTNL();
c346dca1 1099 BUG_ON(!dev_net(dev));
1da177e4 1100
c346dca1 1101 net = dev_net(dev);
1da177e4
LT
1102 if (dev->flags & IFF_UP)
1103 return -EBUSY;
1104
30e6c9fa 1105 write_seqcount_begin(&devnet_rename_seq);
c91f6df2
BH
1106
1107 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
30e6c9fa 1108 write_seqcount_end(&devnet_rename_seq);
c8d90dca 1109 return 0;
c91f6df2 1110 }
c8d90dca 1111
fcc5a03a
HX
1112 memcpy(oldname, dev->name, IFNAMSIZ);
1113
828de4f6 1114 err = dev_get_valid_name(net, dev, newname);
c91f6df2 1115 if (err < 0) {
30e6c9fa 1116 write_seqcount_end(&devnet_rename_seq);
d9031024 1117 return err;
c91f6df2 1118 }
1da177e4 1119
6fe82a39
VF
1120 if (oldname[0] && !strchr(oldname, '%'))
1121 netdev_info(dev, "renamed from %s\n", oldname);
1122
238fa362
TG
1123 old_assign_type = dev->name_assign_type;
1124 dev->name_assign_type = NET_NAME_RENAMED;
1125
fcc5a03a 1126rollback:
a1b3f594
EB
1127 ret = device_rename(&dev->dev, dev->name);
1128 if (ret) {
1129 memcpy(dev->name, oldname, IFNAMSIZ);
238fa362 1130 dev->name_assign_type = old_assign_type;
30e6c9fa 1131 write_seqcount_end(&devnet_rename_seq);
a1b3f594 1132 return ret;
dcc99773 1133 }
7f988eab 1134
30e6c9fa 1135 write_seqcount_end(&devnet_rename_seq);
c91f6df2 1136
5bb025fa
VF
1137 netdev_adjacent_rename_links(dev, oldname);
1138
7f988eab 1139 write_lock_bh(&dev_base_lock);
372b2312 1140 hlist_del_rcu(&dev->name_hlist);
72c9528b
ED
1141 write_unlock_bh(&dev_base_lock);
1142
1143 synchronize_rcu();
1144
1145 write_lock_bh(&dev_base_lock);
1146 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
7f988eab
HX
1147 write_unlock_bh(&dev_base_lock);
1148
056925ab 1149 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
fcc5a03a
HX
1150 ret = notifier_to_errno(ret);
1151
1152 if (ret) {
91e9c07b
ED
1153 /* err >= 0 after dev_alloc_name() or stores the first errno */
1154 if (err >= 0) {
fcc5a03a 1155 err = ret;
30e6c9fa 1156 write_seqcount_begin(&devnet_rename_seq);
fcc5a03a 1157 memcpy(dev->name, oldname, IFNAMSIZ);
5bb025fa 1158 memcpy(oldname, newname, IFNAMSIZ);
238fa362
TG
1159 dev->name_assign_type = old_assign_type;
1160 old_assign_type = NET_NAME_RENAMED;
fcc5a03a 1161 goto rollback;
91e9c07b 1162 } else {
7b6cd1ce 1163 pr_err("%s: name change rollback failed: %d\n",
91e9c07b 1164 dev->name, ret);
fcc5a03a
HX
1165 }
1166 }
1da177e4
LT
1167
1168 return err;
1169}
1170
0b815a1a
SH
1171/**
1172 * dev_set_alias - change ifalias of a device
1173 * @dev: device
1174 * @alias: name up to IFALIASZ
f0db275a 1175 * @len: limit of bytes to copy from info
0b815a1a
SH
1176 *
1177 * Set ifalias for a device,
1178 */
1179int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1180{
7364e445
AK
1181 char *new_ifalias;
1182
0b815a1a
SH
1183 ASSERT_RTNL();
1184
1185 if (len >= IFALIASZ)
1186 return -EINVAL;
1187
96ca4a2c 1188 if (!len) {
388dfc2d
SK
1189 kfree(dev->ifalias);
1190 dev->ifalias = NULL;
96ca4a2c
OH
1191 return 0;
1192 }
1193
7364e445
AK
1194 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1195 if (!new_ifalias)
0b815a1a 1196 return -ENOMEM;
7364e445 1197 dev->ifalias = new_ifalias;
0b815a1a
SH
1198
1199 strlcpy(dev->ifalias, alias, len+1);
1200 return len;
1201}
1202
1203
d8a33ac4 1204/**
3041a069 1205 * netdev_features_change - device changes features
d8a33ac4
SH
1206 * @dev: device to cause notification
1207 *
1208 * Called to indicate a device has changed features.
1209 */
1210void netdev_features_change(struct net_device *dev)
1211{
056925ab 1212 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
d8a33ac4
SH
1213}
1214EXPORT_SYMBOL(netdev_features_change);
1215
1da177e4
LT
1216/**
1217 * netdev_state_change - device changes state
1218 * @dev: device to cause notification
1219 *
1220 * Called to indicate a device has changed state. This function calls
1221 * the notifier chains for netdev_chain and sends a NEWLINK message
1222 * to the routing socket.
1223 */
1224void netdev_state_change(struct net_device *dev)
1225{
1226 if (dev->flags & IFF_UP) {
54951194
LP
1227 struct netdev_notifier_change_info change_info;
1228
1229 change_info.flags_changed = 0;
1230 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1231 &change_info.info);
7f294054 1232 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1da177e4
LT
1233 }
1234}
d1b19dff 1235EXPORT_SYMBOL(netdev_state_change);
1da177e4 1236
ee89bab1
AW
1237/**
1238 * netdev_notify_peers - notify network peers about existence of @dev
1239 * @dev: network device
1240 *
1241 * Generate traffic such that interested network peers are aware of
1242 * @dev, such as by generating a gratuitous ARP. This may be used when
1243 * a device wants to inform the rest of the network about some sort of
1244 * reconfiguration such as a failover event or virtual machine
1245 * migration.
1246 */
1247void netdev_notify_peers(struct net_device *dev)
c1da4ac7 1248{
ee89bab1
AW
1249 rtnl_lock();
1250 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1251 rtnl_unlock();
c1da4ac7 1252}
ee89bab1 1253EXPORT_SYMBOL(netdev_notify_peers);
c1da4ac7 1254
bd380811 1255static int __dev_open(struct net_device *dev)
1da177e4 1256{
d314774c 1257 const struct net_device_ops *ops = dev->netdev_ops;
3b8bcfd5 1258 int ret;
1da177e4 1259
e46b66bc
BH
1260 ASSERT_RTNL();
1261
1da177e4
LT
1262 if (!netif_device_present(dev))
1263 return -ENODEV;
1264
ca99ca14
NH
1265 /* Block netpoll from trying to do any rx path servicing.
1266 * If we don't do this there is a chance ndo_poll_controller
1267 * or ndo_poll may be running while we open the device
1268 */
66b5552f 1269 netpoll_poll_disable(dev);
ca99ca14 1270
3b8bcfd5
JB
1271 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1272 ret = notifier_to_errno(ret);
1273 if (ret)
1274 return ret;
1275
1da177e4 1276 set_bit(__LINK_STATE_START, &dev->state);
bada339b 1277
d314774c
SH
1278 if (ops->ndo_validate_addr)
1279 ret = ops->ndo_validate_addr(dev);
bada339b 1280
d314774c
SH
1281 if (!ret && ops->ndo_open)
1282 ret = ops->ndo_open(dev);
1da177e4 1283
66b5552f 1284 netpoll_poll_enable(dev);
ca99ca14 1285
bada339b
JG
1286 if (ret)
1287 clear_bit(__LINK_STATE_START, &dev->state);
1288 else {
1da177e4 1289 dev->flags |= IFF_UP;
4417da66 1290 dev_set_rx_mode(dev);
1da177e4 1291 dev_activate(dev);
7bf23575 1292 add_device_randomness(dev->dev_addr, dev->addr_len);
1da177e4 1293 }
bada339b 1294
1da177e4
LT
1295 return ret;
1296}
1297
1298/**
bd380811
PM
1299 * dev_open - prepare an interface for use.
1300 * @dev: device to open
1da177e4 1301 *
bd380811
PM
1302 * Takes a device from down to up state. The device's private open
1303 * function is invoked and then the multicast lists are loaded. Finally
1304 * the device is moved into the up state and a %NETDEV_UP message is
1305 * sent to the netdev notifier chain.
1306 *
1307 * Calling this function on an active interface is a nop. On a failure
1308 * a negative errno code is returned.
1da177e4 1309 */
bd380811
PM
1310int dev_open(struct net_device *dev)
1311{
1312 int ret;
1313
bd380811
PM
1314 if (dev->flags & IFF_UP)
1315 return 0;
1316
bd380811
PM
1317 ret = __dev_open(dev);
1318 if (ret < 0)
1319 return ret;
1320
7f294054 1321 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
bd380811
PM
1322 call_netdevice_notifiers(NETDEV_UP, dev);
1323
1324 return ret;
1325}
1326EXPORT_SYMBOL(dev_open);
1327
44345724 1328static int __dev_close_many(struct list_head *head)
1da177e4 1329{
44345724 1330 struct net_device *dev;
e46b66bc 1331
bd380811 1332 ASSERT_RTNL();
9d5010db
DM
1333 might_sleep();
1334
5cde2829 1335 list_for_each_entry(dev, head, close_list) {
3f4df206 1336 /* Temporarily disable netpoll until the interface is down */
66b5552f 1337 netpoll_poll_disable(dev);
3f4df206 1338
44345724 1339 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1da177e4 1340
44345724 1341 clear_bit(__LINK_STATE_START, &dev->state);
1da177e4 1342
44345724
OP
1343 /* Synchronize to scheduled poll. We cannot touch poll list, it
1344 * can be even on different cpu. So just clear netif_running().
1345 *
1346 * dev->stop() will invoke napi_disable() on all of it's
1347 * napi_struct instances on this device.
1348 */
4e857c58 1349 smp_mb__after_atomic(); /* Commit netif_running(). */
44345724 1350 }
1da177e4 1351
44345724 1352 dev_deactivate_many(head);
d8b2a4d2 1353
5cde2829 1354 list_for_each_entry(dev, head, close_list) {
44345724 1355 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4 1356
44345724
OP
1357 /*
1358 * Call the device specific close. This cannot fail.
1359 * Only if device is UP
1360 *
1361 * We allow it to be called even after a DETACH hot-plug
1362 * event.
1363 */
1364 if (ops->ndo_stop)
1365 ops->ndo_stop(dev);
1366
44345724 1367 dev->flags &= ~IFF_UP;
66b5552f 1368 netpoll_poll_enable(dev);
44345724
OP
1369 }
1370
1371 return 0;
1372}
1373
1374static int __dev_close(struct net_device *dev)
1375{
f87e6f47 1376 int retval;
44345724
OP
1377 LIST_HEAD(single);
1378
5cde2829 1379 list_add(&dev->close_list, &single);
f87e6f47
LT
1380 retval = __dev_close_many(&single);
1381 list_del(&single);
ca99ca14 1382
f87e6f47 1383 return retval;
44345724
OP
1384}
1385
3fbd8758 1386static int dev_close_many(struct list_head *head)
44345724
OP
1387{
1388 struct net_device *dev, *tmp;
1da177e4 1389
5cde2829
EB
1390 /* Remove the devices that don't need to be closed */
1391 list_for_each_entry_safe(dev, tmp, head, close_list)
44345724 1392 if (!(dev->flags & IFF_UP))
5cde2829 1393 list_del_init(&dev->close_list);
44345724
OP
1394
1395 __dev_close_many(head);
1da177e4 1396
5cde2829 1397 list_for_each_entry_safe(dev, tmp, head, close_list) {
7f294054 1398 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
44345724 1399 call_netdevice_notifiers(NETDEV_DOWN, dev);
5cde2829 1400 list_del_init(&dev->close_list);
44345724 1401 }
bd380811
PM
1402
1403 return 0;
1404}
1405
1406/**
1407 * dev_close - shutdown an interface.
1408 * @dev: device to shutdown
1409 *
1410 * This function moves an active device into down state. A
1411 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1412 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1413 * chain.
1414 */
1415int dev_close(struct net_device *dev)
1416{
e14a5993
ED
1417 if (dev->flags & IFF_UP) {
1418 LIST_HEAD(single);
1da177e4 1419
5cde2829 1420 list_add(&dev->close_list, &single);
e14a5993
ED
1421 dev_close_many(&single);
1422 list_del(&single);
1423 }
da6e378b 1424 return 0;
1da177e4 1425}
d1b19dff 1426EXPORT_SYMBOL(dev_close);
1da177e4
LT
1427
1428
0187bdfb
BH
1429/**
1430 * dev_disable_lro - disable Large Receive Offload on a device
1431 * @dev: device
1432 *
1433 * Disable Large Receive Offload (LRO) on a net device. Must be
1434 * called under RTNL. This is needed if received packets may be
1435 * forwarded to another interface.
1436 */
1437void dev_disable_lro(struct net_device *dev)
1438{
f11970e3
NH
1439 /*
1440 * If we're trying to disable lro on a vlan device
1441 * use the underlying physical device instead
1442 */
1443 if (is_vlan_dev(dev))
1444 dev = vlan_dev_real_dev(dev);
1445
529d0489
MK
1446 /* the same for macvlan devices */
1447 if (netif_is_macvlan(dev))
1448 dev = macvlan_dev_real_dev(dev);
1449
bc5787c6
MM
1450 dev->wanted_features &= ~NETIF_F_LRO;
1451 netdev_update_features(dev);
27660515 1452
22d5969f
MM
1453 if (unlikely(dev->features & NETIF_F_LRO))
1454 netdev_WARN(dev, "failed to disable LRO!\n");
0187bdfb
BH
1455}
1456EXPORT_SYMBOL(dev_disable_lro);
1457
351638e7
JP
1458static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1459 struct net_device *dev)
1460{
1461 struct netdev_notifier_info info;
1462
1463 netdev_notifier_info_init(&info, dev);
1464 return nb->notifier_call(nb, val, &info);
1465}
0187bdfb 1466
881d966b
EB
1467static int dev_boot_phase = 1;
1468
1da177e4
LT
1469/**
1470 * register_netdevice_notifier - register a network notifier block
1471 * @nb: notifier
1472 *
1473 * Register a notifier to be called when network device events occur.
1474 * The notifier passed is linked into the kernel structures and must
1475 * not be reused until it has been unregistered. A negative errno code
1476 * is returned on a failure.
1477 *
1478 * When registered all registration and up events are replayed
4ec93edb 1479 * to the new notifier to allow device to have a race free
1da177e4
LT
1480 * view of the network device list.
1481 */
1482
1483int register_netdevice_notifier(struct notifier_block *nb)
1484{
1485 struct net_device *dev;
fcc5a03a 1486 struct net_device *last;
881d966b 1487 struct net *net;
1da177e4
LT
1488 int err;
1489
1490 rtnl_lock();
f07d5b94 1491 err = raw_notifier_chain_register(&netdev_chain, nb);
fcc5a03a
HX
1492 if (err)
1493 goto unlock;
881d966b
EB
1494 if (dev_boot_phase)
1495 goto unlock;
1496 for_each_net(net) {
1497 for_each_netdev(net, dev) {
351638e7 1498 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
881d966b
EB
1499 err = notifier_to_errno(err);
1500 if (err)
1501 goto rollback;
1502
1503 if (!(dev->flags & IFF_UP))
1504 continue;
1da177e4 1505
351638e7 1506 call_netdevice_notifier(nb, NETDEV_UP, dev);
881d966b 1507 }
1da177e4 1508 }
fcc5a03a
HX
1509
1510unlock:
1da177e4
LT
1511 rtnl_unlock();
1512 return err;
fcc5a03a
HX
1513
1514rollback:
1515 last = dev;
881d966b
EB
1516 for_each_net(net) {
1517 for_each_netdev(net, dev) {
1518 if (dev == last)
8f891489 1519 goto outroll;
fcc5a03a 1520
881d966b 1521 if (dev->flags & IFF_UP) {
351638e7
JP
1522 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1523 dev);
1524 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
881d966b 1525 }
351638e7 1526 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
fcc5a03a 1527 }
fcc5a03a 1528 }
c67625a1 1529
8f891489 1530outroll:
c67625a1 1531 raw_notifier_chain_unregister(&netdev_chain, nb);
fcc5a03a 1532 goto unlock;
1da177e4 1533}
d1b19dff 1534EXPORT_SYMBOL(register_netdevice_notifier);
1da177e4
LT
1535
1536/**
1537 * unregister_netdevice_notifier - unregister a network notifier block
1538 * @nb: notifier
1539 *
1540 * Unregister a notifier previously registered by
1541 * register_netdevice_notifier(). The notifier is unlinked into the
1542 * kernel structures and may then be reused. A negative errno code
1543 * is returned on a failure.
7d3d43da
EB
1544 *
1545 * After unregistering unregister and down device events are synthesized
1546 * for all devices on the device list to the removed notifier to remove
1547 * the need for special case cleanup code.
1da177e4
LT
1548 */
1549
1550int unregister_netdevice_notifier(struct notifier_block *nb)
1551{
7d3d43da
EB
1552 struct net_device *dev;
1553 struct net *net;
9f514950
HX
1554 int err;
1555
1556 rtnl_lock();
f07d5b94 1557 err = raw_notifier_chain_unregister(&netdev_chain, nb);
7d3d43da
EB
1558 if (err)
1559 goto unlock;
1560
1561 for_each_net(net) {
1562 for_each_netdev(net, dev) {
1563 if (dev->flags & IFF_UP) {
351638e7
JP
1564 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1565 dev);
1566 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
7d3d43da 1567 }
351638e7 1568 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
7d3d43da
EB
1569 }
1570 }
1571unlock:
9f514950
HX
1572 rtnl_unlock();
1573 return err;
1da177e4 1574}
d1b19dff 1575EXPORT_SYMBOL(unregister_netdevice_notifier);
1da177e4 1576
351638e7
JP
1577/**
1578 * call_netdevice_notifiers_info - call all network notifier blocks
1579 * @val: value passed unmodified to notifier function
1580 * @dev: net_device pointer passed unmodified to notifier function
1581 * @info: notifier information data
1582 *
1583 * Call all network notifier blocks. Parameters and return value
1584 * are as for raw_notifier_call_chain().
1585 */
1586
1d143d9f 1587static int call_netdevice_notifiers_info(unsigned long val,
1588 struct net_device *dev,
1589 struct netdev_notifier_info *info)
351638e7
JP
1590{
1591 ASSERT_RTNL();
1592 netdev_notifier_info_init(info, dev);
1593 return raw_notifier_call_chain(&netdev_chain, val, info);
1594}
351638e7 1595
1da177e4
LT
1596/**
1597 * call_netdevice_notifiers - call all network notifier blocks
1598 * @val: value passed unmodified to notifier function
c4ea43c5 1599 * @dev: net_device pointer passed unmodified to notifier function
1da177e4
LT
1600 *
1601 * Call all network notifier blocks. Parameters and return value
f07d5b94 1602 * are as for raw_notifier_call_chain().
1da177e4
LT
1603 */
1604
ad7379d4 1605int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1da177e4 1606{
351638e7
JP
1607 struct netdev_notifier_info info;
1608
1609 return call_netdevice_notifiers_info(val, dev, &info);
1da177e4 1610}
edf947f1 1611EXPORT_SYMBOL(call_netdevice_notifiers);
1da177e4 1612
c5905afb 1613static struct static_key netstamp_needed __read_mostly;
b90e5794 1614#ifdef HAVE_JUMP_LABEL
c5905afb 1615/* We are not allowed to call static_key_slow_dec() from irq context
b90e5794 1616 * If net_disable_timestamp() is called from irq context, defer the
c5905afb 1617 * static_key_slow_dec() calls.
b90e5794
ED
1618 */
1619static atomic_t netstamp_needed_deferred;
1620#endif
1da177e4
LT
1621
1622void net_enable_timestamp(void)
1623{
b90e5794
ED
1624#ifdef HAVE_JUMP_LABEL
1625 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1626
1627 if (deferred) {
1628 while (--deferred)
c5905afb 1629 static_key_slow_dec(&netstamp_needed);
b90e5794
ED
1630 return;
1631 }
1632#endif
c5905afb 1633 static_key_slow_inc(&netstamp_needed);
1da177e4 1634}
d1b19dff 1635EXPORT_SYMBOL(net_enable_timestamp);
1da177e4
LT
1636
1637void net_disable_timestamp(void)
1638{
b90e5794
ED
1639#ifdef HAVE_JUMP_LABEL
1640 if (in_interrupt()) {
1641 atomic_inc(&netstamp_needed_deferred);
1642 return;
1643 }
1644#endif
c5905afb 1645 static_key_slow_dec(&netstamp_needed);
1da177e4 1646}
d1b19dff 1647EXPORT_SYMBOL(net_disable_timestamp);
1da177e4 1648
3b098e2d 1649static inline void net_timestamp_set(struct sk_buff *skb)
1da177e4 1650{
588f0330 1651 skb->tstamp.tv64 = 0;
c5905afb 1652 if (static_key_false(&netstamp_needed))
a61bbcf2 1653 __net_timestamp(skb);
1da177e4
LT
1654}
1655
588f0330 1656#define net_timestamp_check(COND, SKB) \
c5905afb 1657 if (static_key_false(&netstamp_needed)) { \
588f0330
ED
1658 if ((COND) && !(SKB)->tstamp.tv64) \
1659 __net_timestamp(SKB); \
1660 } \
3b098e2d 1661
1ee481fb 1662bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
79b569f0
DL
1663{
1664 unsigned int len;
1665
1666 if (!(dev->flags & IFF_UP))
1667 return false;
1668
1669 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1670 if (skb->len <= len)
1671 return true;
1672
1673 /* if TSO is enabled, we don't care about the length as the packet
1674 * could be forwarded without being segmented before
1675 */
1676 if (skb_is_gso(skb))
1677 return true;
1678
1679 return false;
1680}
1ee481fb 1681EXPORT_SYMBOL_GPL(is_skb_forwardable);
79b569f0 1682
a0265d28
HX
1683int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1684{
1685 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1686 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1687 atomic_long_inc(&dev->rx_dropped);
1688 kfree_skb(skb);
1689 return NET_RX_DROP;
1690 }
1691 }
1692
1693 if (unlikely(!is_skb_forwardable(dev, skb))) {
1694 atomic_long_inc(&dev->rx_dropped);
1695 kfree_skb(skb);
1696 return NET_RX_DROP;
1697 }
1698
1699 skb_scrub_packet(skb, true);
1700 skb->protocol = eth_type_trans(skb, dev);
1701
1702 return 0;
1703}
1704EXPORT_SYMBOL_GPL(__dev_forward_skb);
1705
44540960
AB
1706/**
1707 * dev_forward_skb - loopback an skb to another netif
1708 *
1709 * @dev: destination network device
1710 * @skb: buffer to forward
1711 *
1712 * return values:
1713 * NET_RX_SUCCESS (no congestion)
6ec82562 1714 * NET_RX_DROP (packet was dropped, but freed)
44540960
AB
1715 *
1716 * dev_forward_skb can be used for injecting an skb from the
1717 * start_xmit function of one device into the receive queue
1718 * of another device.
1719 *
1720 * The receiving device may be in another namespace, so
1721 * we have to clear all information in the skb that could
1722 * impact namespace isolation.
1723 */
1724int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1725{
a0265d28 1726 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
44540960
AB
1727}
1728EXPORT_SYMBOL_GPL(dev_forward_skb);
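/*
 * Editor's sketch, not part of dev.c: a minimal, hypothetical use of
 * dev_forward_skb() from a pair-style device's ndo_start_xmit (in the
 * spirit of veth).  The private struct and its "peer" field are made up;
 * error handling is reduced to the bare minimum.
 */
struct example_pair_priv {
	struct net_device __rcu *peer;
};

static netdev_tx_t example_pair_xmit(struct sk_buff *skb,
				     struct net_device *dev)
{
	struct example_pair_priv *priv = netdev_priv(dev);
	struct net_device *peer;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (!peer) {
		kfree_skb(skb);
		dev->stats.tx_dropped++;
	} else if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS) {
		/* dev_forward_skb() already freed the skb on failure */
		dev->stats.tx_dropped++;
	}
	rcu_read_unlock();

	return NETDEV_TX_OK;
}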
1729
71d9dec2
CG
1730static inline int deliver_skb(struct sk_buff *skb,
1731 struct packet_type *pt_prev,
1732 struct net_device *orig_dev)
1733{
1080e512
MT
1734 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1735 return -ENOMEM;
71d9dec2
CG
1736 atomic_inc(&skb->users);
1737 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1738}
1739
c0de08d0
EL
1740static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1741{
a3d744e9 1742 if (!ptype->af_packet_priv || !skb->sk)
c0de08d0
EL
1743 return false;
1744
1745 if (ptype->id_match)
1746 return ptype->id_match(ptype, skb->sk);
1747 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1748 return true;
1749
1750 return false;
1751}
1752
1da177e4
LT
1753/*
1754 * Support routine. Sends outgoing frames to any network
1755 * taps currently in use.
1756 */
1757
f6a78bfc 1758static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1da177e4
LT
1759{
1760 struct packet_type *ptype;
71d9dec2
CG
1761 struct sk_buff *skb2 = NULL;
1762 struct packet_type *pt_prev = NULL;
a61bbcf2 1763
1da177e4
LT
1764 rcu_read_lock();
1765 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1766 /* Never send packets back to the socket
1767 * they originated from - MvS (miquels@drinkel.ow.org)
1768 */
1769 if ((ptype->dev == dev || !ptype->dev) &&
c0de08d0 1770 (!skb_loop_sk(ptype, skb))) {
71d9dec2
CG
1771 if (pt_prev) {
1772 deliver_skb(skb2, pt_prev, skb->dev);
1773 pt_prev = ptype;
1774 continue;
1775 }
1776
1777 skb2 = skb_clone(skb, GFP_ATOMIC);
1da177e4
LT
1778 if (!skb2)
1779 break;
1780
70978182
ED
1781 net_timestamp_set(skb2);
1782
1da177e4
LT
1783 /* skb->nh should be correctly
1784 set by sender, so that the second statement is
1785 just protection against buggy protocols.
1786 */
459a98ed 1787 skb_reset_mac_header(skb2);
1da177e4 1788
d56f90a7 1789 if (skb_network_header(skb2) < skb2->data ||
ced14f68 1790 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
e87cc472
JP
1791 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1792 ntohs(skb2->protocol),
1793 dev->name);
c1d2bbe1 1794 skb_reset_network_header(skb2);
1da177e4
LT
1795 }
1796
b0e380b1 1797 skb2->transport_header = skb2->network_header;
1da177e4 1798 skb2->pkt_type = PACKET_OUTGOING;
71d9dec2 1799 pt_prev = ptype;
1da177e4
LT
1800 }
1801 }
71d9dec2
CG
1802 if (pt_prev)
1803 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1da177e4
LT
1804 rcu_read_unlock();
1805}
1806
2c53040f
BH
1807/**
1808 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
4f57c087
JF
1809 * @dev: Network device
1810 * @txq: number of queues available
1811 *
1812 * If real_num_tx_queues is changed the tc mappings may no longer be
 1813 * valid. To resolve this, verify that the tc mapping remains valid and,
 1814 * if not, NULL the mapping. With no priorities mapping to this
 1815 * offset/count pair it will no longer be used. In the worst case, if
 1816 * TC0 is invalid, nothing can be done, so priority mappings are disabled.
 1817 * It is expected that drivers will fix this mapping if they can before
1818 * calling netif_set_real_num_tx_queues.
1819 */
bb134d22 1820static void netif_setup_tc(struct net_device *dev, unsigned int txq)
4f57c087
JF
1821{
1822 int i;
1823 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1824
1825 /* If TC0 is invalidated disable TC mapping */
1826 if (tc->offset + tc->count > txq) {
7b6cd1ce 1827 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
4f57c087
JF
1828 dev->num_tc = 0;
1829 return;
1830 }
1831
1832 /* Invalidated prio to tc mappings set to TC0 */
1833 for (i = 1; i < TC_BITMASK + 1; i++) {
1834 int q = netdev_get_prio_tc_map(dev, i);
1835
1836 tc = &dev->tc_to_txq[q];
1837 if (tc->offset + tc->count > txq) {
7b6cd1ce
JP
1838 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1839 i, q);
4f57c087
JF
1840 netdev_set_prio_tc_map(dev, i, 0);
1841 }
1842 }
1843}
1844
537c00de
AD
1845#ifdef CONFIG_XPS
1846static DEFINE_MUTEX(xps_map_mutex);
1847#define xmap_dereference(P) \
1848 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1849
10cdc3f3
AD
1850static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1851 int cpu, u16 index)
537c00de 1852{
10cdc3f3
AD
1853 struct xps_map *map = NULL;
1854 int pos;
537c00de 1855
10cdc3f3
AD
1856 if (dev_maps)
1857 map = xmap_dereference(dev_maps->cpu_map[cpu]);
537c00de 1858
10cdc3f3
AD
1859 for (pos = 0; map && pos < map->len; pos++) {
1860 if (map->queues[pos] == index) {
537c00de
AD
1861 if (map->len > 1) {
1862 map->queues[pos] = map->queues[--map->len];
1863 } else {
10cdc3f3 1864 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
537c00de
AD
1865 kfree_rcu(map, rcu);
1866 map = NULL;
1867 }
10cdc3f3 1868 break;
537c00de 1869 }
537c00de
AD
1870 }
1871
10cdc3f3
AD
1872 return map;
1873}
1874
024e9679 1875static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
10cdc3f3
AD
1876{
1877 struct xps_dev_maps *dev_maps;
024e9679 1878 int cpu, i;
10cdc3f3
AD
1879 bool active = false;
1880
1881 mutex_lock(&xps_map_mutex);
1882 dev_maps = xmap_dereference(dev->xps_maps);
1883
1884 if (!dev_maps)
1885 goto out_no_maps;
1886
1887 for_each_possible_cpu(cpu) {
024e9679
AD
1888 for (i = index; i < dev->num_tx_queues; i++) {
1889 if (!remove_xps_queue(dev_maps, cpu, i))
1890 break;
1891 }
1892 if (i == dev->num_tx_queues)
10cdc3f3
AD
1893 active = true;
1894 }
1895
1896 if (!active) {
537c00de
AD
1897 RCU_INIT_POINTER(dev->xps_maps, NULL);
1898 kfree_rcu(dev_maps, rcu);
1899 }
1900
024e9679
AD
1901 for (i = index; i < dev->num_tx_queues; i++)
1902 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1903 NUMA_NO_NODE);
1904
537c00de
AD
1905out_no_maps:
1906 mutex_unlock(&xps_map_mutex);
1907}
1908
01c5f864
AD
1909static struct xps_map *expand_xps_map(struct xps_map *map,
1910 int cpu, u16 index)
1911{
1912 struct xps_map *new_map;
1913 int alloc_len = XPS_MIN_MAP_ALLOC;
1914 int i, pos;
1915
1916 for (pos = 0; map && pos < map->len; pos++) {
1917 if (map->queues[pos] != index)
1918 continue;
1919 return map;
1920 }
1921
1922 /* Need to add queue to this CPU's existing map */
1923 if (map) {
1924 if (pos < map->alloc_len)
1925 return map;
1926
1927 alloc_len = map->alloc_len * 2;
1928 }
1929
1930 /* Need to allocate new map to store queue on this CPU's map */
1931 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1932 cpu_to_node(cpu));
1933 if (!new_map)
1934 return NULL;
1935
1936 for (i = 0; i < pos; i++)
1937 new_map->queues[i] = map->queues[i];
1938 new_map->alloc_len = alloc_len;
1939 new_map->len = pos;
1940
1941 return new_map;
1942}
1943
3573540c
MT
1944int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1945 u16 index)
537c00de 1946{
01c5f864 1947 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
537c00de 1948 struct xps_map *map, *new_map;
537c00de 1949 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
01c5f864
AD
1950 int cpu, numa_node_id = -2;
1951 bool active = false;
537c00de
AD
1952
1953 mutex_lock(&xps_map_mutex);
1954
1955 dev_maps = xmap_dereference(dev->xps_maps);
1956
01c5f864
AD
1957 /* allocate memory for queue storage */
1958 for_each_online_cpu(cpu) {
1959 if (!cpumask_test_cpu(cpu, mask))
1960 continue;
1961
1962 if (!new_dev_maps)
1963 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2bb60cb9
AD
1964 if (!new_dev_maps) {
1965 mutex_unlock(&xps_map_mutex);
01c5f864 1966 return -ENOMEM;
2bb60cb9 1967 }
01c5f864
AD
1968
1969 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1970 NULL;
1971
1972 map = expand_xps_map(map, cpu, index);
1973 if (!map)
1974 goto error;
1975
1976 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1977 }
1978
1979 if (!new_dev_maps)
1980 goto out_no_new_maps;
1981
537c00de 1982 for_each_possible_cpu(cpu) {
01c5f864
AD
1983 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1984 /* add queue to CPU maps */
1985 int pos = 0;
1986
1987 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1988 while ((pos < map->len) && (map->queues[pos] != index))
1989 pos++;
1990
1991 if (pos == map->len)
1992 map->queues[map->len++] = index;
537c00de 1993#ifdef CONFIG_NUMA
537c00de
AD
1994 if (numa_node_id == -2)
1995 numa_node_id = cpu_to_node(cpu);
1996 else if (numa_node_id != cpu_to_node(cpu))
1997 numa_node_id = -1;
537c00de 1998#endif
01c5f864
AD
1999 } else if (dev_maps) {
2000 /* fill in the new device map from the old device map */
2001 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2002 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
537c00de 2003 }
01c5f864 2004
537c00de
AD
2005 }
2006
01c5f864
AD
2007 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2008
537c00de 2009 /* Cleanup old maps */
01c5f864
AD
2010 if (dev_maps) {
2011 for_each_possible_cpu(cpu) {
2012 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2013 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2014 if (map && map != new_map)
2015 kfree_rcu(map, rcu);
2016 }
537c00de 2017
01c5f864 2018 kfree_rcu(dev_maps, rcu);
537c00de
AD
2019 }
2020
01c5f864
AD
2021 dev_maps = new_dev_maps;
2022 active = true;
537c00de 2023
01c5f864
AD
2024out_no_new_maps:
2025 /* update Tx queue numa node */
537c00de
AD
2026 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2027 (numa_node_id >= 0) ? numa_node_id :
2028 NUMA_NO_NODE);
2029
01c5f864
AD
2030 if (!dev_maps)
2031 goto out_no_maps;
2032
2033 /* removes queue from unused CPUs */
2034 for_each_possible_cpu(cpu) {
2035 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2036 continue;
2037
2038 if (remove_xps_queue(dev_maps, cpu, index))
2039 active = true;
2040 }
2041
2042 /* free map if not active */
2043 if (!active) {
2044 RCU_INIT_POINTER(dev->xps_maps, NULL);
2045 kfree_rcu(dev_maps, rcu);
2046 }
2047
2048out_no_maps:
537c00de
AD
2049 mutex_unlock(&xps_map_mutex);
2050
2051 return 0;
2052error:
01c5f864
AD
2053 /* remove any maps that we added */
2054 for_each_possible_cpu(cpu) {
2055 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2056 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2057 NULL;
2058 if (new_map && new_map != map)
2059 kfree(new_map);
2060 }
2061
537c00de
AD
2062 mutex_unlock(&xps_map_mutex);
2063
537c00de
AD
2064 kfree(new_dev_maps);
2065 return -ENOMEM;
2066}
2067EXPORT_SYMBOL(netif_set_xps_queue);
2068
2069#endif
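/*
 * Editor's sketch, not part of dev.c: a hypothetical driver pinning
 * transmit queue 0 of "dev" to CPUs 0 and 1 with netif_set_xps_queue().
 * Assumes CONFIG_XPS; purely illustrative.
 */
static int example_pin_queue0(struct net_device *dev)
{
	cpumask_var_t mask;
	int err;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_set_cpu(0, mask);
	cpumask_set_cpu(1, mask);

	err = netif_set_xps_queue(dev, mask, 0);	/* queue index 0 */

	free_cpumask_var(mask);
	return err;
}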
f0796d5c
JF
2070/*
2071 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 2072 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2073 */
e6484930 2074int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
f0796d5c 2075{
1d24eb48
TH
2076 int rc;
2077
e6484930
TH
2078 if (txq < 1 || txq > dev->num_tx_queues)
2079 return -EINVAL;
f0796d5c 2080
5c56580b
BH
2081 if (dev->reg_state == NETREG_REGISTERED ||
2082 dev->reg_state == NETREG_UNREGISTERING) {
e6484930
TH
2083 ASSERT_RTNL();
2084
1d24eb48
TH
2085 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2086 txq);
bf264145
TH
2087 if (rc)
2088 return rc;
2089
4f57c087
JF
2090 if (dev->num_tc)
2091 netif_setup_tc(dev, txq);
2092
024e9679 2093 if (txq < dev->real_num_tx_queues) {
e6484930 2094 qdisc_reset_all_tx_gt(dev, txq);
024e9679
AD
2095#ifdef CONFIG_XPS
2096 netif_reset_xps_queues_gt(dev, txq);
2097#endif
2098 }
f0796d5c 2099 }
e6484930
TH
2100
2101 dev->real_num_tx_queues = txq;
2102 return 0;
f0796d5c
JF
2103}
2104EXPORT_SYMBOL(netif_set_real_num_tx_queues);
56079431 2105
a953be53 2106#ifdef CONFIG_SYSFS
62fe0b40
BH
2107/**
2108 * netif_set_real_num_rx_queues - set actual number of RX queues used
2109 * @dev: Network device
2110 * @rxq: Actual number of RX queues
2111 *
2112 * This must be called either with the rtnl_lock held or before
2113 * registration of the net device. Returns 0 on success, or a
4e7f7951
BH
2114 * negative error code. If called before registration, it always
2115 * succeeds.
62fe0b40
BH
2116 */
2117int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2118{
2119 int rc;
2120
bd25fa7b
TH
2121 if (rxq < 1 || rxq > dev->num_rx_queues)
2122 return -EINVAL;
2123
62fe0b40
BH
2124 if (dev->reg_state == NETREG_REGISTERED) {
2125 ASSERT_RTNL();
2126
62fe0b40
BH
2127 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2128 rxq);
2129 if (rc)
2130 return rc;
62fe0b40
BH
2131 }
2132
2133 dev->real_num_rx_queues = rxq;
2134 return 0;
2135}
2136EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2137#endif
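/*
 * Editor's sketch, not part of dev.c: a driver resizing its active queue
 * counts under RTNL after a hypothetical channel-count change (e.g. in an
 * ethtool set_channels handler).  "count" is assumed to be validated
 * against num_tx_queues/num_rx_queues by the caller.
 */
static int example_set_channels(struct net_device *dev, unsigned int count)
{
	int err;

	ASSERT_RTNL();

	err = netif_set_real_num_tx_queues(dev, count);
	if (err)
		return err;

	return netif_set_real_num_rx_queues(dev, count);
}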
2138
2c53040f
BH
2139/**
2140 * netif_get_num_default_rss_queues - default number of RSS queues
16917b87
YM
2141 *
2142 * This routine should set an upper limit on the number of RSS queues
2143 * used by default by multiqueue devices.
2144 */
a55b138b 2145int netif_get_num_default_rss_queues(void)
16917b87
YM
2146{
2147 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2148}
2149EXPORT_SYMBOL(netif_get_num_default_rss_queues);
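/*
 * Editor's sketch, not part of dev.c: a common way a multiqueue driver
 * might size its rings, clamped by a made-up hardware limit.
 */
#define EXAMPLE_HW_MAX_QUEUES	16

static unsigned int example_pick_num_queues(void)
{
	return min_t(unsigned int, EXAMPLE_HW_MAX_QUEUES,
		     netif_get_num_default_rss_queues());
}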
2150
def82a1d 2151static inline void __netif_reschedule(struct Qdisc *q)
56079431 2152{
def82a1d
JP
2153 struct softnet_data *sd;
2154 unsigned long flags;
56079431 2155
def82a1d 2156 local_irq_save(flags);
903ceff7 2157 sd = this_cpu_ptr(&softnet_data);
a9cbd588
CG
2158 q->next_sched = NULL;
2159 *sd->output_queue_tailp = q;
2160 sd->output_queue_tailp = &q->next_sched;
def82a1d
JP
2161 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2162 local_irq_restore(flags);
2163}
2164
2165void __netif_schedule(struct Qdisc *q)
2166{
2167 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2168 __netif_reschedule(q);
56079431
DV
2169}
2170EXPORT_SYMBOL(__netif_schedule);
2171
e6247027
ED
2172struct dev_kfree_skb_cb {
2173 enum skb_free_reason reason;
2174};
2175
2176static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
56079431 2177{
e6247027
ED
2178 return (struct dev_kfree_skb_cb *)skb->cb;
2179}
2180
46e5da40
JF
2181void netif_schedule_queue(struct netdev_queue *txq)
2182{
2183 rcu_read_lock();
2184 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2185 struct Qdisc *q = rcu_dereference(txq->qdisc);
2186
2187 __netif_schedule(q);
2188 }
2189 rcu_read_unlock();
2190}
2191EXPORT_SYMBOL(netif_schedule_queue);
2192
2193/**
2194 * netif_wake_subqueue - allow sending packets on subqueue
2195 * @dev: network device
2196 * @queue_index: sub queue index
2197 *
2198 * Resume individual transmit queue of a device with multiple transmit queues.
2199 */
2200void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2201{
2202 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2203
2204 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2205 struct Qdisc *q;
2206
2207 rcu_read_lock();
2208 q = rcu_dereference(txq->qdisc);
2209 __netif_schedule(q);
2210 rcu_read_unlock();
2211 }
2212}
2213EXPORT_SYMBOL(netif_wake_subqueue);
2214
2215void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2216{
2217 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2218 struct Qdisc *q;
2219
2220 rcu_read_lock();
2221 q = rcu_dereference(dev_queue->qdisc);
2222 __netif_schedule(q);
2223 rcu_read_unlock();
2224 }
2225}
2226EXPORT_SYMBOL(netif_tx_wake_queue);
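/*
 * Editor's sketch, not part of dev.c: the usual TX flow-control pattern
 * built on top of netif_tx_wake_queue().  The free-descriptor threshold
 * and the calling convention are hypothetical.
 */
static void example_tx_complete(struct net_device *dev, unsigned int qidx,
				unsigned int free_descs)
{
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qidx);

	/* queue was stopped by the xmit path when the ring filled up */
	if (netif_tx_queue_stopped(txq) && free_descs > 16)
		netif_tx_wake_queue(txq);	/* reschedules the qdisc */
}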
2227
e6247027 2228void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
56079431 2229{
e6247027 2230 unsigned long flags;
56079431 2231
e6247027
ED
2232 if (likely(atomic_read(&skb->users) == 1)) {
2233 smp_rmb();
2234 atomic_set(&skb->users, 0);
2235 } else if (likely(!atomic_dec_and_test(&skb->users))) {
2236 return;
bea3348e 2237 }
e6247027
ED
2238 get_kfree_skb_cb(skb)->reason = reason;
2239 local_irq_save(flags);
2240 skb->next = __this_cpu_read(softnet_data.completion_queue);
2241 __this_cpu_write(softnet_data.completion_queue, skb);
2242 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2243 local_irq_restore(flags);
56079431 2244}
e6247027 2245EXPORT_SYMBOL(__dev_kfree_skb_irq);
56079431 2246
e6247027 2247void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
56079431
DV
2248{
2249 if (in_irq() || irqs_disabled())
e6247027 2250 __dev_kfree_skb_irq(skb, reason);
56079431
DV
2251 else
2252 dev_kfree_skb(skb);
2253}
e6247027 2254EXPORT_SYMBOL(__dev_kfree_skb_any);
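/*
 * Editor's sketch, not part of dev.c: freeing skbs from code that may run
 * in hard-IRQ or process context.  dev_consume_skb_any()/dev_kfree_skb_any()
 * are thin wrappers passing the corresponding skb_free_reason.
 */
static void example_tx_clean_one(struct sk_buff *skb, bool xmit_ok)
{
	if (xmit_ok)
		__dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);	/* normal completion */
	else
		__dev_kfree_skb_any(skb, SKB_REASON_DROPPED);	/* counted as a drop */
}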
56079431
DV
2255
2256
bea3348e
SH
2257/**
2258 * netif_device_detach - mark device as removed
2259 * @dev: network device
2260 *
2261 * Mark device as removed from system and therefore no longer available.
2262 */
56079431
DV
2263void netif_device_detach(struct net_device *dev)
2264{
2265 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2266 netif_running(dev)) {
d543103a 2267 netif_tx_stop_all_queues(dev);
56079431
DV
2268 }
2269}
2270EXPORT_SYMBOL(netif_device_detach);
2271
bea3348e
SH
2272/**
2273 * netif_device_attach - mark device as attached
2274 * @dev: network device
2275 *
2276 * Mark device as attached from system and restart if needed.
2277 */
56079431
DV
2278void netif_device_attach(struct net_device *dev)
2279{
2280 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2281 netif_running(dev)) {
d543103a 2282 netif_tx_wake_all_queues(dev);
4ec93edb 2283 __netdev_watchdog_up(dev);
56079431
DV
2284 }
2285}
2286EXPORT_SYMBOL(netif_device_attach);
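/*
 * Editor's sketch, not part of dev.c: the typical suspend/resume pairing
 * for the two helpers above.  The power-management hooks and hardware
 * helpers are hypothetical.
 */
static int example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stops all TX queues, clears "present" */
	/* ... put the hardware into a low-power state ... */
	return 0;
}

static int example_resume(struct net_device *dev)
{
	/* ... bring the hardware back up ... */
	netif_device_attach(dev);	/* wakes queues, restarts the watchdog */
	return 0;
}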
2287
36c92474
BH
2288static void skb_warn_bad_offload(const struct sk_buff *skb)
2289{
65e9d2fa 2290 static const netdev_features_t null_features = 0;
36c92474
BH
2291 struct net_device *dev = skb->dev;
2292 const char *driver = "";
2293
c846ad9b
BG
2294 if (!net_ratelimit())
2295 return;
2296
36c92474
BH
2297 if (dev && dev->dev.parent)
2298 driver = dev_driver_string(dev->dev.parent);
2299
2300 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2301 "gso_type=%d ip_summed=%d\n",
65e9d2fa
MM
2302 driver, dev ? &dev->features : &null_features,
2303 skb->sk ? &skb->sk->sk_route_caps : &null_features,
36c92474
BH
2304 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2305 skb_shinfo(skb)->gso_type, skb->ip_summed);
2306}
2307
1da177e4
LT
2308/*
2309 * Invalidate hardware checksum when packet is to be mangled, and
2310 * complete checksum manually on outgoing path.
2311 */
84fa7933 2312int skb_checksum_help(struct sk_buff *skb)
1da177e4 2313{
d3bc23e7 2314 __wsum csum;
663ead3b 2315 int ret = 0, offset;
1da177e4 2316
84fa7933 2317 if (skb->ip_summed == CHECKSUM_COMPLETE)
a430a43d
HX
2318 goto out_set_summed;
2319
2320 if (unlikely(skb_shinfo(skb)->gso_size)) {
36c92474
BH
2321 skb_warn_bad_offload(skb);
2322 return -EINVAL;
1da177e4
LT
2323 }
2324
cef401de
ED
2325 /* Before computing a checksum, we should make sure no frag could
 2326 * be modified by an external entity: checksum could be wrong.
2327 */
2328 if (skb_has_shared_frag(skb)) {
2329 ret = __skb_linearize(skb);
2330 if (ret)
2331 goto out;
2332 }
2333
55508d60 2334 offset = skb_checksum_start_offset(skb);
a030847e
HX
2335 BUG_ON(offset >= skb_headlen(skb));
2336 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2337
2338 offset += skb->csum_offset;
2339 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2340
2341 if (skb_cloned(skb) &&
2342 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1da177e4
LT
2343 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2344 if (ret)
2345 goto out;
2346 }
2347
a030847e 2348 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
a430a43d 2349out_set_summed:
1da177e4 2350 skb->ip_summed = CHECKSUM_NONE;
4ec93edb 2351out:
1da177e4
LT
2352 return ret;
2353}
d1b19dff 2354EXPORT_SYMBOL(skb_checksum_help);
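/*
 * Editor's sketch, not part of dev.c: a driver that cannot offload the
 * checksum for a particular packet can resolve CHECKSUM_PARTIAL in
 * software before queueing it to hardware.  "hw_can_csum" is an assumed
 * capability check done elsewhere.
 */
static int example_tx_csum(struct sk_buff *skb, bool hw_can_csum)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_csum)
		return skb_checksum_help(skb);	/* computes and stores the sum */
	return 0;
}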
1da177e4 2355
53d6471c 2356__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
f6a78bfc 2357{
4b9b1cdf 2358 unsigned int vlan_depth = skb->mac_len;
252e3346 2359 __be16 type = skb->protocol;
f6a78bfc 2360
19acc327
PS
2361 /* Tunnel gso handlers can set protocol to ethernet. */
2362 if (type == htons(ETH_P_TEB)) {
2363 struct ethhdr *eth;
2364
2365 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2366 return 0;
2367
2368 eth = (struct ethhdr *)skb_mac_header(skb);
2369 type = eth->h_proto;
2370 }
2371
4b9b1cdf
NA
2372 /* if skb->protocol is 802.1Q/AD then the header should already be
2373 * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2374 * ETH_HLEN otherwise
2375 */
2376 if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2377 if (vlan_depth) {
80019d31 2378 if (WARN_ON(vlan_depth < VLAN_HLEN))
4b9b1cdf
NA
2379 return 0;
2380 vlan_depth -= VLAN_HLEN;
2381 } else {
2382 vlan_depth = ETH_HLEN;
2383 }
2384 do {
2385 struct vlan_hdr *vh;
2386
2387 if (unlikely(!pskb_may_pull(skb,
2388 vlan_depth + VLAN_HLEN)))
2389 return 0;
2390
2391 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2392 type = vh->h_vlan_encapsulated_proto;
2393 vlan_depth += VLAN_HLEN;
2394 } while (type == htons(ETH_P_8021Q) ||
2395 type == htons(ETH_P_8021AD));
7b9c6090
JG
2396 }
2397
53d6471c
VY
2398 *depth = vlan_depth;
2399
ec5f0615
PS
2400 return type;
2401}
2402
2403/**
2404 * skb_mac_gso_segment - mac layer segmentation handler.
2405 * @skb: buffer to segment
2406 * @features: features for the output path (see dev->features)
2407 */
2408struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2409 netdev_features_t features)
2410{
2411 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2412 struct packet_offload *ptype;
53d6471c
VY
2413 int vlan_depth = skb->mac_len;
2414 __be16 type = skb_network_protocol(skb, &vlan_depth);
ec5f0615
PS
2415
2416 if (unlikely(!type))
2417 return ERR_PTR(-EINVAL);
2418
53d6471c 2419 __skb_pull(skb, vlan_depth);
f6a78bfc
HX
2420
2421 rcu_read_lock();
22061d80 2422 list_for_each_entry_rcu(ptype, &offload_base, list) {
f191a1d1 2423 if (ptype->type == type && ptype->callbacks.gso_segment) {
f191a1d1 2424 segs = ptype->callbacks.gso_segment(skb, features);
f6a78bfc
HX
2425 break;
2426 }
2427 }
2428 rcu_read_unlock();
2429
98e399f8 2430 __skb_push(skb, skb->data - skb_mac_header(skb));
576a30eb 2431
f6a78bfc
HX
2432 return segs;
2433}
05e8ef4a
PS
2434EXPORT_SYMBOL(skb_mac_gso_segment);
2435
2436
2437/* openvswitch calls this on rx path, so we need a different check.
2438 */
2439static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2440{
2441 if (tx_path)
2442 return skb->ip_summed != CHECKSUM_PARTIAL;
2443 else
2444 return skb->ip_summed == CHECKSUM_NONE;
2445}
2446
2447/**
2448 * __skb_gso_segment - Perform segmentation on skb.
2449 * @skb: buffer to segment
2450 * @features: features for the output path (see dev->features)
2451 * @tx_path: whether it is called in TX path
2452 *
2453 * This function segments the given skb and returns a list of segments.
2454 *
2455 * It may return NULL if the skb requires no segmentation. This is
2456 * only possible when GSO is used for verifying header integrity.
2457 */
2458struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2459 netdev_features_t features, bool tx_path)
2460{
2461 if (unlikely(skb_needs_check(skb, tx_path))) {
2462 int err;
2463
2464 skb_warn_bad_offload(skb);
2465
a40e0a66 2466 err = skb_cow_head(skb, 0);
2467 if (err < 0)
05e8ef4a
PS
2468 return ERR_PTR(err);
2469 }
2470
68c33163 2471 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3347c960
ED
2472 SKB_GSO_CB(skb)->encap_level = 0;
2473
05e8ef4a
PS
2474 skb_reset_mac_header(skb);
2475 skb_reset_mac_len(skb);
2476
2477 return skb_mac_gso_segment(skb, features);
2478}
12b0004d 2479EXPORT_SYMBOL(__skb_gso_segment);
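/*
 * Editor's sketch, not part of dev.c: a software-GSO fallback, roughly
 * what validate_xmit_skb() further below does when the device lacks the
 * required GSO feature.  Error handling is simplified; on failure the
 * caller is assumed to drop the skb.
 */
static struct sk_buff *example_sw_gso(struct sk_buff *skb,
				      netdev_features_t features)
{
	struct sk_buff *segs;

	segs = skb_gso_segment(skb, features);	/* wraps __skb_gso_segment() */
	if (IS_ERR(segs))
		return NULL;
	if (segs) {
		consume_skb(skb);	/* original skb is replaced by the list */
		skb = segs;
	}
	return skb;			/* possibly a list linked via skb->next */
}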
f6a78bfc 2480
fb286bb2
HX
2481/* Take action when hardware reception checksum errors are detected. */
2482#ifdef CONFIG_BUG
2483void netdev_rx_csum_fault(struct net_device *dev)
2484{
2485 if (net_ratelimit()) {
7b6cd1ce 2486 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
fb286bb2
HX
2487 dump_stack();
2488 }
2489}
2490EXPORT_SYMBOL(netdev_rx_csum_fault);
2491#endif
2492
1da177e4
LT
 2493/* Actually, we should eliminate this check as soon as we know that:
 2494 * 1. An IOMMU is present and allows mapping all the memory.
2495 * 2. No high memory really exists on this machine.
2496 */
2497
c1e756bf 2498static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1da177e4 2499{
3d3a8533 2500#ifdef CONFIG_HIGHMEM
1da177e4 2501 int i;
5acbbd42 2502 if (!(dev->features & NETIF_F_HIGHDMA)) {
ea2ab693
IC
2503 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2504 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2505 if (PageHighMem(skb_frag_page(frag)))
5acbbd42 2506 return 1;
ea2ab693 2507 }
5acbbd42 2508 }
1da177e4 2509
5acbbd42
FT
2510 if (PCI_DMA_BUS_IS_PHYS) {
2511 struct device *pdev = dev->dev.parent;
1da177e4 2512
9092c658
ED
2513 if (!pdev)
2514 return 0;
5acbbd42 2515 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
ea2ab693
IC
2516 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2517 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
5acbbd42
FT
2518 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2519 return 1;
2520 }
2521 }
3d3a8533 2522#endif
1da177e4
LT
2523 return 0;
2524}
1da177e4 2525
3b392ddb
SH
2526/* If MPLS offload request, verify we are testing hardware MPLS features
2527 * instead of standard features for the netdev.
2528 */
2529#ifdef CONFIG_NET_MPLS_GSO
2530static netdev_features_t net_mpls_features(struct sk_buff *skb,
2531 netdev_features_t features,
2532 __be16 type)
2533{
25cd9ba0 2534 if (eth_p_mpls(type))
3b392ddb
SH
2535 features &= skb->dev->mpls_features;
2536
2537 return features;
2538}
2539#else
2540static netdev_features_t net_mpls_features(struct sk_buff *skb,
2541 netdev_features_t features,
2542 __be16 type)
2543{
2544 return features;
2545}
2546#endif
2547
c8f44aff 2548static netdev_features_t harmonize_features(struct sk_buff *skb,
c1e756bf 2549 netdev_features_t features)
f01a5236 2550{
53d6471c 2551 int tmp;
3b392ddb
SH
2552 __be16 type;
2553
2554 type = skb_network_protocol(skb, &tmp);
2555 features = net_mpls_features(skb, features, type);
53d6471c 2556
c0d680e5 2557 if (skb->ip_summed != CHECKSUM_NONE &&
3b392ddb 2558 !can_checksum_protocol(features, type)) {
f01a5236 2559 features &= ~NETIF_F_ALL_CSUM;
c1e756bf 2560 } else if (illegal_highdma(skb->dev, skb)) {
f01a5236
JG
2561 features &= ~NETIF_F_SG;
2562 }
2563
2564 return features;
2565}
2566
c1e756bf 2567netdev_features_t netif_skb_features(struct sk_buff *skb)
58e998c6 2568{
fcbeb976
ED
2569 const struct net_device *dev = skb->dev;
2570 netdev_features_t features = dev->features;
2571 u16 gso_segs = skb_shinfo(skb)->gso_segs;
58e998c6
JG
2572 __be16 protocol = skb->protocol;
2573
fcbeb976 2574 if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
30b678d8
BH
2575 features &= ~NETIF_F_GSO_MASK;
2576
8ad227ff 2577 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
58e998c6
JG
2578 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2579 protocol = veh->h_vlan_encapsulated_proto;
f01a5236 2580 } else if (!vlan_tx_tag_present(skb)) {
c1e756bf 2581 return harmonize_features(skb, features);
f01a5236 2582 }
58e998c6 2583
db115037 2584 features = netdev_intersect_features(features,
fcbeb976 2585 dev->vlan_features |
db115037
MK
2586 NETIF_F_HW_VLAN_CTAG_TX |
2587 NETIF_F_HW_VLAN_STAG_TX);
f01a5236 2588
cdbaa0bb 2589 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
db115037
MK
2590 features = netdev_intersect_features(features,
2591 NETIF_F_SG |
2592 NETIF_F_HIGHDMA |
2593 NETIF_F_FRAGLIST |
2594 NETIF_F_GEN_CSUM |
2595 NETIF_F_HW_VLAN_CTAG_TX |
2596 NETIF_F_HW_VLAN_STAG_TX);
cdbaa0bb 2597
c1e756bf 2598 return harmonize_features(skb, features);
58e998c6 2599}
c1e756bf 2600EXPORT_SYMBOL(netif_skb_features);
58e998c6 2601
2ea25513 2602static int xmit_one(struct sk_buff *skb, struct net_device *dev,
95f6b3dd 2603 struct netdev_queue *txq, bool more)
f6a78bfc 2604{
2ea25513
DM
2605 unsigned int len;
2606 int rc;
00829823 2607
2ea25513
DM
2608 if (!list_empty(&ptype_all))
2609 dev_queue_xmit_nit(skb, dev);
fc741216 2610
2ea25513
DM
2611 len = skb->len;
2612 trace_net_dev_start_xmit(skb, dev);
95f6b3dd 2613 rc = netdev_start_xmit(skb, dev, txq, more);
2ea25513 2614 trace_net_dev_xmit(skb, rc, dev, len);
adf30907 2615
2ea25513
DM
2616 return rc;
2617}
7b9c6090 2618
8dcda22a
DM
2619struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2620 struct netdev_queue *txq, int *ret)
7f2e870f
DM
2621{
2622 struct sk_buff *skb = first;
2623 int rc = NETDEV_TX_OK;
7b9c6090 2624
7f2e870f
DM
2625 while (skb) {
2626 struct sk_buff *next = skb->next;
fc70fb64 2627
7f2e870f 2628 skb->next = NULL;
95f6b3dd 2629 rc = xmit_one(skb, dev, txq, next != NULL);
7f2e870f
DM
2630 if (unlikely(!dev_xmit_complete(rc))) {
2631 skb->next = next;
2632 goto out;
2633 }
6afff0ca 2634
7f2e870f
DM
2635 skb = next;
2636 if (netif_xmit_stopped(txq) && skb) {
2637 rc = NETDEV_TX_BUSY;
2638 break;
9ccb8975 2639 }
7f2e870f 2640 }
9ccb8975 2641
7f2e870f
DM
2642out:
2643 *ret = rc;
2644 return skb;
2645}
b40863c6 2646
1ff0dc94
ED
2647static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2648 netdev_features_t features)
f6a78bfc 2649{
eae3f88e
DM
2650 if (vlan_tx_tag_present(skb) &&
2651 !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2652 skb = __vlan_put_tag(skb, skb->vlan_proto,
2653 vlan_tx_tag_get(skb));
2654 if (skb)
2655 skb->vlan_tci = 0;
f6a78bfc 2656 }
eae3f88e
DM
2657 return skb;
2658}
f6a78bfc 2659
55a93b3e 2660static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
eae3f88e
DM
2661{
2662 netdev_features_t features;
f6a78bfc 2663
eae3f88e
DM
2664 if (skb->next)
2665 return skb;
068a2de5 2666
eae3f88e
DM
2667 features = netif_skb_features(skb);
2668 skb = validate_xmit_vlan(skb, features);
2669 if (unlikely(!skb))
2670 goto out_null;
7b9c6090 2671
eae3f88e
DM
2672 /* If encapsulation offload request, verify we are testing
2673 * hardware encapsulation features instead of standard
2674 * features for the netdev
2675 */
2676 if (skb->encapsulation)
2677 features &= dev->hw_enc_features;
2678
04ffcb25 2679 if (netif_needs_gso(dev, skb, features)) {
ce93718f
DM
2680 struct sk_buff *segs;
2681
2682 segs = skb_gso_segment(skb, features);
cecda693 2683 if (IS_ERR(segs)) {
ce93718f 2684 segs = NULL;
cecda693
JW
2685 } else if (segs) {
2686 consume_skb(skb);
2687 skb = segs;
f6a78bfc 2688 }
eae3f88e
DM
2689 } else {
2690 if (skb_needs_linearize(skb, features) &&
2691 __skb_linearize(skb))
2692 goto out_kfree_skb;
4ec93edb 2693
eae3f88e
DM
2694 /* If packet is not checksummed and device does not
2695 * support checksumming for this protocol, complete
2696 * checksumming here.
2697 */
2698 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2699 if (skb->encapsulation)
2700 skb_set_inner_transport_header(skb,
2701 skb_checksum_start_offset(skb));
2702 else
2703 skb_set_transport_header(skb,
2704 skb_checksum_start_offset(skb));
2705 if (!(features & NETIF_F_ALL_CSUM) &&
2706 skb_checksum_help(skb))
2707 goto out_kfree_skb;
7b9c6090 2708 }
0c772159 2709 }
7b9c6090 2710
eae3f88e 2711 return skb;
fc70fb64 2712
f6a78bfc
HX
2713out_kfree_skb:
2714 kfree_skb(skb);
eae3f88e
DM
2715out_null:
2716 return NULL;
2717}
6afff0ca 2718
55a93b3e
ED
2719struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2720{
2721 struct sk_buff *next, *head = NULL, *tail;
2722
bec3cfdc 2723 for (; skb != NULL; skb = next) {
55a93b3e
ED
2724 next = skb->next;
2725 skb->next = NULL;
bec3cfdc
ED
2726
 2727 /* in case skb won't be segmented, point to itself */
2728 skb->prev = skb;
2729
55a93b3e 2730 skb = validate_xmit_skb(skb, dev);
bec3cfdc
ED
2731 if (!skb)
2732 continue;
55a93b3e 2733
bec3cfdc
ED
2734 if (!head)
2735 head = skb;
2736 else
2737 tail->next = skb;
2738 /* If skb was segmented, skb->prev points to
2739 * the last segment. If not, it still contains skb.
2740 */
2741 tail = skb->prev;
55a93b3e
ED
2742 }
2743 return head;
f6a78bfc
HX
2744}
2745
1def9238
ED
2746static void qdisc_pkt_len_init(struct sk_buff *skb)
2747{
2748 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2749
2750 qdisc_skb_cb(skb)->pkt_len = skb->len;
2751
2752 /* To get more precise estimation of bytes sent on wire,
2753 * we add to pkt_len the headers size of all segments
2754 */
2755 if (shinfo->gso_size) {
757b8b1d 2756 unsigned int hdr_len;
15e5a030 2757 u16 gso_segs = shinfo->gso_segs;
1def9238 2758
757b8b1d
ED
2759 /* mac layer + network layer */
2760 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2761
2762 /* + transport layer */
1def9238
ED
2763 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2764 hdr_len += tcp_hdrlen(skb);
2765 else
2766 hdr_len += sizeof(struct udphdr);
15e5a030
JW
2767
2768 if (shinfo->gso_type & SKB_GSO_DODGY)
2769 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2770 shinfo->gso_size);
2771
2772 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
1def9238
ED
2773 }
2774}
2775
bbd8a0d3
KK
2776static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2777 struct net_device *dev,
2778 struct netdev_queue *txq)
2779{
2780 spinlock_t *root_lock = qdisc_lock(q);
a2da570d 2781 bool contended;
bbd8a0d3
KK
2782 int rc;
2783
1def9238 2784 qdisc_pkt_len_init(skb);
a2da570d 2785 qdisc_calculate_pkt_len(skb, q);
79640a4c
ED
2786 /*
2787 * Heuristic to force contended enqueues to serialize on a
2788 * separate lock before trying to get qdisc main lock.
9bf2b8c2
YX
2789 * This permits __QDISC___STATE_RUNNING owner to get the lock more
2790 * often and dequeue packets faster.
79640a4c 2791 */
a2da570d 2792 contended = qdisc_is_running(q);
79640a4c
ED
2793 if (unlikely(contended))
2794 spin_lock(&q->busylock);
2795
bbd8a0d3
KK
2796 spin_lock(root_lock);
2797 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2798 kfree_skb(skb);
2799 rc = NET_XMIT_DROP;
2800 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
bc135b23 2801 qdisc_run_begin(q)) {
bbd8a0d3
KK
2802 /*
2803 * This is a work-conserving queue; there are no old skbs
2804 * waiting to be sent out; and the qdisc is not running -
2805 * xmit the skb directly.
2806 */
bfe0d029 2807
bfe0d029
ED
2808 qdisc_bstats_update(q, skb);
2809
55a93b3e 2810 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
79640a4c
ED
2811 if (unlikely(contended)) {
2812 spin_unlock(&q->busylock);
2813 contended = false;
2814 }
bbd8a0d3 2815 __qdisc_run(q);
79640a4c 2816 } else
bc135b23 2817 qdisc_run_end(q);
bbd8a0d3
KK
2818
2819 rc = NET_XMIT_SUCCESS;
2820 } else {
a2da570d 2821 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
79640a4c
ED
2822 if (qdisc_run_begin(q)) {
2823 if (unlikely(contended)) {
2824 spin_unlock(&q->busylock);
2825 contended = false;
2826 }
2827 __qdisc_run(q);
2828 }
bbd8a0d3
KK
2829 }
2830 spin_unlock(root_lock);
79640a4c
ED
2831 if (unlikely(contended))
2832 spin_unlock(&q->busylock);
bbd8a0d3
KK
2833 return rc;
2834}
2835
86f8515f 2836#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
5bc1421e
NH
2837static void skb_update_prio(struct sk_buff *skb)
2838{
6977a79d 2839 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
5bc1421e 2840
91c68ce2
ED
2841 if (!skb->priority && skb->sk && map) {
2842 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2843
2844 if (prioidx < map->priomap_len)
2845 skb->priority = map->priomap[prioidx];
2846 }
5bc1421e
NH
2847}
2848#else
2849#define skb_update_prio(skb)
2850#endif
2851
745e20f1 2852static DEFINE_PER_CPU(int, xmit_recursion);
11a766ce 2853#define RECURSION_LIMIT 10
745e20f1 2854
95603e22
MM
2855/**
2856 * dev_loopback_xmit - loop back @skb
2857 * @skb: buffer to transmit
2858 */
2859int dev_loopback_xmit(struct sk_buff *skb)
2860{
2861 skb_reset_mac_header(skb);
2862 __skb_pull(skb, skb_network_offset(skb));
2863 skb->pkt_type = PACKET_LOOPBACK;
2864 skb->ip_summed = CHECKSUM_UNNECESSARY;
2865 WARN_ON(!skb_dst(skb));
2866 skb_dst_force(skb);
2867 netif_rx_ni(skb);
2868 return 0;
2869}
2870EXPORT_SYMBOL(dev_loopback_xmit);
2871
d29f749e 2872/**
9d08dd3d 2873 * __dev_queue_xmit - transmit a buffer
d29f749e 2874 * @skb: buffer to transmit
9d08dd3d 2875 * @accel_priv: private data used for L2 forwarding offload
d29f749e
DJ
2876 *
2877 * Queue a buffer for transmission to a network device. The caller must
2878 * have set the device and priority and built the buffer before calling
2879 * this function. The function can be called from an interrupt.
2880 *
2881 * A negative errno code is returned on a failure. A success does not
2882 * guarantee the frame will be transmitted as it may be dropped due
2883 * to congestion or traffic shaping.
2884 *
2885 * -----------------------------------------------------------------------------------
2886 * I notice this method can also return errors from the queue disciplines,
2887 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2888 * be positive.
2889 *
2890 * Regardless of the return value, the skb is consumed, so it is currently
2891 * difficult to retry a send to this method. (You can bump the ref count
2892 * before sending to hold a reference for retry if you are careful.)
2893 *
2894 * When calling this method, interrupts MUST be enabled. This is because
2895 * the BH enable code must have IRQs enabled so that it will not deadlock.
2896 * --BLG
2897 */
0a59f3a9 2898static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
1da177e4
LT
2899{
2900 struct net_device *dev = skb->dev;
dc2b4847 2901 struct netdev_queue *txq;
1da177e4
LT
2902 struct Qdisc *q;
2903 int rc = -ENOMEM;
2904
6d1ccff6
ED
2905 skb_reset_mac_header(skb);
2906
e7fd2885
WB
2907 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2908 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2909
4ec93edb
YH
2910 /* Disable soft irqs for various locks below. Also
2911 * stops preemption for RCU.
1da177e4 2912 */
4ec93edb 2913 rcu_read_lock_bh();
1da177e4 2914
5bc1421e
NH
2915 skb_update_prio(skb);
2916
02875878
ED
2917 /* If device/qdisc don't need skb->dst, release it right now while
 2918 * it's hot in this cpu cache.
2919 */
2920 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2921 skb_dst_drop(skb);
2922 else
2923 skb_dst_force(skb);
2924
f663dd9a 2925 txq = netdev_pick_tx(dev, skb, accel_priv);
a898def2 2926 q = rcu_dereference_bh(txq->qdisc);
37437bb2 2927
1da177e4 2928#ifdef CONFIG_NET_CLS_ACT
d1b19dff 2929 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1da177e4 2930#endif
cf66ba58 2931 trace_net_dev_queue(skb);
1da177e4 2932 if (q->enqueue) {
bbd8a0d3 2933 rc = __dev_xmit_skb(skb, q, dev, txq);
37437bb2 2934 goto out;
1da177e4
LT
2935 }
2936
2937 /* The device has no queue. Common case for software devices:
 2938 loopback, all sorts of tunnels...
 2939
932ff279
HX
 2940 Really, it is unlikely that netif_tx_lock protection is necessary
 2941 here. (e.g. loopback and IP tunnels are clean, ignoring statistics
1da177e4
LT
 2942 counters.)
 2943 However, it is possible that they rely on the protection
 2944 made by us here.
 2945
 2946 Check this and take the lock. It is not prone to deadlocks.
 2947 Either way, the noqueue qdisc case is even simpler 8)
2948 */
2949 if (dev->flags & IFF_UP) {
2950 int cpu = smp_processor_id(); /* ok because BHs are off */
2951
c773e847 2952 if (txq->xmit_lock_owner != cpu) {
1da177e4 2953
745e20f1
ED
2954 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2955 goto recursion_alert;
2956
1f59533f
JDB
2957 skb = validate_xmit_skb(skb, dev);
2958 if (!skb)
2959 goto drop;
2960
c773e847 2961 HARD_TX_LOCK(dev, txq, cpu);
1da177e4 2962
73466498 2963 if (!netif_xmit_stopped(txq)) {
745e20f1 2964 __this_cpu_inc(xmit_recursion);
ce93718f 2965 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
745e20f1 2966 __this_cpu_dec(xmit_recursion);
572a9d7b 2967 if (dev_xmit_complete(rc)) {
c773e847 2968 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
2969 goto out;
2970 }
2971 }
c773e847 2972 HARD_TX_UNLOCK(dev, txq);
e87cc472
JP
2973 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2974 dev->name);
1da177e4
LT
2975 } else {
2976 /* Recursion is detected! It is possible,
745e20f1
ED
2977 * unfortunately
2978 */
2979recursion_alert:
e87cc472
JP
2980 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2981 dev->name);
1da177e4
LT
2982 }
2983 }
2984
2985 rc = -ENETDOWN;
1f59533f 2986drop:
d4828d85 2987 rcu_read_unlock_bh();
1da177e4 2988
015f0688 2989 atomic_long_inc(&dev->tx_dropped);
1f59533f 2990 kfree_skb_list(skb);
1da177e4
LT
2991 return rc;
2992out:
d4828d85 2993 rcu_read_unlock_bh();
1da177e4
LT
2994 return rc;
2995}
f663dd9a
JW
2996
2997int dev_queue_xmit(struct sk_buff *skb)
2998{
2999 return __dev_queue_xmit(skb, NULL);
3000}
d1b19dff 3001EXPORT_SYMBOL(dev_queue_xmit);
1da177e4 3002
f663dd9a
JW
3003int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3004{
3005 return __dev_queue_xmit(skb, accel_priv);
3006}
3007EXPORT_SYMBOL(dev_queue_xmit_accel);
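/*
 * Editor's sketch, not part of dev.c: transmitting a fully built skb from
 * a virtual device or tunnel.  As the kerneldoc above notes, the caller
 * must have set skb->dev and built the headers; positive NET_XMIT_* codes
 * can come back from the qdisc layer.
 */
static int example_xmit_frame(struct sk_buff *skb, struct net_device *out)
{
	skb->dev = out;
	return dev_queue_xmit(skb);
}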
3008
1da177e4
LT
3009
3010/*=======================================================================
3011 Receiver routines
3012 =======================================================================*/
3013
6b2bedc3 3014int netdev_max_backlog __read_mostly = 1000;
c9e6bc64
ED
3015EXPORT_SYMBOL(netdev_max_backlog);
3016
3b098e2d 3017int netdev_tstamp_prequeue __read_mostly = 1;
6b2bedc3
SH
3018int netdev_budget __read_mostly = 300;
3019int weight_p __read_mostly = 64; /* old backlog weight */
1da177e4 3020
eecfd7c4
ED
3021/* Called with irq disabled */
3022static inline void ____napi_schedule(struct softnet_data *sd,
3023 struct napi_struct *napi)
3024{
3025 list_add_tail(&napi->poll_list, &sd->poll_list);
3026 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3027}
3028
bfb564e7
KK
3029#ifdef CONFIG_RPS
3030
3031/* One global table that all flow-based protocols share. */
6e3f7faf 3032struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
bfb564e7
KK
3033EXPORT_SYMBOL(rps_sock_flow_table);
3034
c5905afb 3035struct static_key rps_needed __read_mostly;
adc9300e 3036
c445477d
BH
3037static struct rps_dev_flow *
3038set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3039 struct rps_dev_flow *rflow, u16 next_cpu)
3040{
09994d1b 3041 if (next_cpu != RPS_NO_CPU) {
c445477d
BH
3042#ifdef CONFIG_RFS_ACCEL
3043 struct netdev_rx_queue *rxqueue;
3044 struct rps_dev_flow_table *flow_table;
3045 struct rps_dev_flow *old_rflow;
3046 u32 flow_id;
3047 u16 rxq_index;
3048 int rc;
3049
3050 /* Should we steer this flow to a different hardware queue? */
69a19ee6
BH
3051 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3052 !(dev->features & NETIF_F_NTUPLE))
c445477d
BH
3053 goto out;
3054 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3055 if (rxq_index == skb_get_rx_queue(skb))
3056 goto out;
3057
3058 rxqueue = dev->_rx + rxq_index;
3059 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3060 if (!flow_table)
3061 goto out;
61b905da 3062 flow_id = skb_get_hash(skb) & flow_table->mask;
c445477d
BH
3063 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3064 rxq_index, flow_id);
3065 if (rc < 0)
3066 goto out;
3067 old_rflow = rflow;
3068 rflow = &flow_table->flows[flow_id];
c445477d
BH
3069 rflow->filter = rc;
3070 if (old_rflow->filter == rflow->filter)
3071 old_rflow->filter = RPS_NO_FILTER;
3072 out:
3073#endif
3074 rflow->last_qtail =
09994d1b 3075 per_cpu(softnet_data, next_cpu).input_queue_head;
c445477d
BH
3076 }
3077
09994d1b 3078 rflow->cpu = next_cpu;
c445477d
BH
3079 return rflow;
3080}
3081
bfb564e7
KK
3082/*
3083 * get_rps_cpu is called from netif_receive_skb and returns the target
3084 * CPU from the RPS map of the receiving queue for a given skb.
3085 * rcu_read_lock must be held on entry.
3086 */
3087static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3088 struct rps_dev_flow **rflowp)
3089{
3090 struct netdev_rx_queue *rxqueue;
6e3f7faf 3091 struct rps_map *map;
bfb564e7
KK
3092 struct rps_dev_flow_table *flow_table;
3093 struct rps_sock_flow_table *sock_flow_table;
3094 int cpu = -1;
3095 u16 tcpu;
61b905da 3096 u32 hash;
bfb564e7
KK
3097
3098 if (skb_rx_queue_recorded(skb)) {
3099 u16 index = skb_get_rx_queue(skb);
62fe0b40
BH
3100 if (unlikely(index >= dev->real_num_rx_queues)) {
3101 WARN_ONCE(dev->real_num_rx_queues > 1,
3102 "%s received packet on queue %u, but number "
3103 "of RX queues is %u\n",
3104 dev->name, index, dev->real_num_rx_queues);
bfb564e7
KK
3105 goto done;
3106 }
3107 rxqueue = dev->_rx + index;
3108 } else
3109 rxqueue = dev->_rx;
3110
6e3f7faf
ED
3111 map = rcu_dereference(rxqueue->rps_map);
3112 if (map) {
85875236 3113 if (map->len == 1 &&
33d480ce 3114 !rcu_access_pointer(rxqueue->rps_flow_table)) {
6febfca9
CG
3115 tcpu = map->cpus[0];
3116 if (cpu_online(tcpu))
3117 cpu = tcpu;
3118 goto done;
3119 }
33d480ce 3120 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
bfb564e7 3121 goto done;
6febfca9 3122 }
bfb564e7 3123
2d47b459 3124 skb_reset_network_header(skb);
61b905da
TH
3125 hash = skb_get_hash(skb);
3126 if (!hash)
bfb564e7
KK
3127 goto done;
3128
fec5e652
TH
3129 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3130 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3131 if (flow_table && sock_flow_table) {
3132 u16 next_cpu;
3133 struct rps_dev_flow *rflow;
3134
61b905da 3135 rflow = &flow_table->flows[hash & flow_table->mask];
fec5e652
TH
3136 tcpu = rflow->cpu;
3137
61b905da 3138 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
fec5e652
TH
3139
3140 /*
3141 * If the desired CPU (where last recvmsg was done) is
3142 * different from current CPU (one in the rx-queue flow
3143 * table entry), switch if one of the following holds:
3144 * - Current CPU is unset (equal to RPS_NO_CPU).
3145 * - Current CPU is offline.
3146 * - The current CPU's queue tail has advanced beyond the
3147 * last packet that was enqueued using this table entry.
3148 * This guarantees that all previous packets for the flow
3149 * have been dequeued, thus preserving in order delivery.
3150 */
3151 if (unlikely(tcpu != next_cpu) &&
3152 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3153 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
baefa31d
TH
3154 rflow->last_qtail)) >= 0)) {
3155 tcpu = next_cpu;
c445477d 3156 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
baefa31d 3157 }
c445477d 3158
fec5e652
TH
3159 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3160 *rflowp = rflow;
3161 cpu = tcpu;
3162 goto done;
3163 }
3164 }
3165
0a9627f2 3166 if (map) {
8fc54f68 3167 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
0a9627f2
TH
3168 if (cpu_online(tcpu)) {
3169 cpu = tcpu;
3170 goto done;
3171 }
3172 }
3173
3174done:
0a9627f2
TH
3175 return cpu;
3176}
3177
c445477d
BH
3178#ifdef CONFIG_RFS_ACCEL
3179
3180/**
3181 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3182 * @dev: Device on which the filter was set
3183 * @rxq_index: RX queue index
3184 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3185 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3186 *
3187 * Drivers that implement ndo_rx_flow_steer() should periodically call
3188 * this function for each installed filter and remove the filters for
3189 * which it returns %true.
3190 */
3191bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3192 u32 flow_id, u16 filter_id)
3193{
3194 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3195 struct rps_dev_flow_table *flow_table;
3196 struct rps_dev_flow *rflow;
3197 bool expire = true;
3198 int cpu;
3199
3200 rcu_read_lock();
3201 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3202 if (flow_table && flow_id <= flow_table->mask) {
3203 rflow = &flow_table->flows[flow_id];
3204 cpu = ACCESS_ONCE(rflow->cpu);
3205 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3206 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3207 rflow->last_qtail) <
3208 (int)(10 * flow_table->mask)))
3209 expire = false;
3210 }
3211 rcu_read_unlock();
3212 return expire;
3213}
3214EXPORT_SYMBOL(rps_may_expire_flow);
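/*
 * Editor's sketch, not part of dev.c: how an RFS-accelerated driver might
 * age out hardware flow filters, as the kerneldoc above suggests.  The
 * filter table layout is entirely hypothetical.
 */
struct example_rfs_filter {
	u16 rxq_index;
	u32 flow_id;
	u16 filter_id;
	bool in_use;
};

static void example_expire_filters(struct net_device *dev,
				   struct example_rfs_filter *tbl, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		if (!tbl[i].in_use)
			continue;
		if (rps_may_expire_flow(dev, tbl[i].rxq_index,
					tbl[i].flow_id, tbl[i].filter_id)) {
			/* ... remove the filter from hardware here ... */
			tbl[i].in_use = false;
		}
	}
}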
3215
3216#endif /* CONFIG_RFS_ACCEL */
3217
0a9627f2 3218/* Called from hardirq (IPI) context */
e36fa2f7 3219static void rps_trigger_softirq(void *data)
0a9627f2 3220{
e36fa2f7
ED
3221 struct softnet_data *sd = data;
3222
eecfd7c4 3223 ____napi_schedule(sd, &sd->backlog);
dee42870 3224 sd->received_rps++;
0a9627f2 3225}
e36fa2f7 3226
fec5e652 3227#endif /* CONFIG_RPS */
0a9627f2 3228
e36fa2f7
ED
3229/*
 3230 * Check if this softnet_data structure belongs to another CPU.
3231 * If yes, queue it to our IPI list and return 1
3232 * If no, return 0
3233 */
3234static int rps_ipi_queued(struct softnet_data *sd)
3235{
3236#ifdef CONFIG_RPS
903ceff7 3237 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
e36fa2f7
ED
3238
3239 if (sd != mysd) {
3240 sd->rps_ipi_next = mysd->rps_ipi_list;
3241 mysd->rps_ipi_list = sd;
3242
3243 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3244 return 1;
3245 }
3246#endif /* CONFIG_RPS */
3247 return 0;
3248}
3249
99bbc707
WB
3250#ifdef CONFIG_NET_FLOW_LIMIT
3251int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3252#endif
3253
3254static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3255{
3256#ifdef CONFIG_NET_FLOW_LIMIT
3257 struct sd_flow_limit *fl;
3258 struct softnet_data *sd;
3259 unsigned int old_flow, new_flow;
3260
3261 if (qlen < (netdev_max_backlog >> 1))
3262 return false;
3263
903ceff7 3264 sd = this_cpu_ptr(&softnet_data);
99bbc707
WB
3265
3266 rcu_read_lock();
3267 fl = rcu_dereference(sd->flow_limit);
3268 if (fl) {
3958afa1 3269 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
99bbc707
WB
3270 old_flow = fl->history[fl->history_head];
3271 fl->history[fl->history_head] = new_flow;
3272
3273 fl->history_head++;
3274 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3275
3276 if (likely(fl->buckets[old_flow]))
3277 fl->buckets[old_flow]--;
3278
3279 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3280 fl->count++;
3281 rcu_read_unlock();
3282 return true;
3283 }
3284 }
3285 rcu_read_unlock();
3286#endif
3287 return false;
3288}
3289
0a9627f2
TH
3290/*
3291 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3292 * queue (may be a remote CPU queue).
3293 */
fec5e652
TH
3294static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3295 unsigned int *qtail)
0a9627f2 3296{
e36fa2f7 3297 struct softnet_data *sd;
0a9627f2 3298 unsigned long flags;
99bbc707 3299 unsigned int qlen;
0a9627f2 3300
e36fa2f7 3301 sd = &per_cpu(softnet_data, cpu);
0a9627f2
TH
3302
3303 local_irq_save(flags);
0a9627f2 3304
e36fa2f7 3305 rps_lock(sd);
99bbc707
WB
3306 qlen = skb_queue_len(&sd->input_pkt_queue);
3307 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
6e7676c1 3308 if (skb_queue_len(&sd->input_pkt_queue)) {
0a9627f2 3309enqueue:
e36fa2f7 3310 __skb_queue_tail(&sd->input_pkt_queue, skb);
76cc8b13 3311 input_queue_tail_incr_save(sd, qtail);
e36fa2f7 3312 rps_unlock(sd);
152102c7 3313 local_irq_restore(flags);
0a9627f2
TH
3314 return NET_RX_SUCCESS;
3315 }
3316
ebda37c2
ED
3317 /* Schedule NAPI for backlog device
 3318 * We can use a non-atomic operation since we own the queue lock
3319 */
3320 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
e36fa2f7 3321 if (!rps_ipi_queued(sd))
eecfd7c4 3322 ____napi_schedule(sd, &sd->backlog);
0a9627f2
TH
3323 }
3324 goto enqueue;
3325 }
3326
dee42870 3327 sd->dropped++;
e36fa2f7 3328 rps_unlock(sd);
0a9627f2 3329
0a9627f2
TH
3330 local_irq_restore(flags);
3331
caf586e5 3332 atomic_long_inc(&skb->dev->rx_dropped);
0a9627f2
TH
3333 kfree_skb(skb);
3334 return NET_RX_DROP;
3335}
1da177e4 3336
ae78dbfa 3337static int netif_rx_internal(struct sk_buff *skb)
1da177e4 3338{
b0e28f1e 3339 int ret;
1da177e4 3340
588f0330 3341 net_timestamp_check(netdev_tstamp_prequeue, skb);
1da177e4 3342
cf66ba58 3343 trace_netif_rx(skb);
df334545 3344#ifdef CONFIG_RPS
c5905afb 3345 if (static_key_false(&rps_needed)) {
fec5e652 3346 struct rps_dev_flow voidflow, *rflow = &voidflow;
b0e28f1e
ED
3347 int cpu;
3348
cece1945 3349 preempt_disable();
b0e28f1e 3350 rcu_read_lock();
fec5e652
TH
3351
3352 cpu = get_rps_cpu(skb->dev, skb, &rflow);
b0e28f1e
ED
3353 if (cpu < 0)
3354 cpu = smp_processor_id();
fec5e652
TH
3355
3356 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3357
b0e28f1e 3358 rcu_read_unlock();
cece1945 3359 preempt_enable();
adc9300e
ED
3360 } else
3361#endif
fec5e652
TH
3362 {
3363 unsigned int qtail;
3364 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3365 put_cpu();
3366 }
b0e28f1e 3367 return ret;
1da177e4 3368}
ae78dbfa
BH
3369
3370/**
3371 * netif_rx - post buffer to the network code
3372 * @skb: buffer to post
3373 *
3374 * This function receives a packet from a device driver and queues it for
3375 * the upper (protocol) levels to process. It always succeeds. The buffer
3376 * may be dropped during processing for congestion control or by the
3377 * protocol layers.
3378 *
3379 * return values:
3380 * NET_RX_SUCCESS (no congestion)
3381 * NET_RX_DROP (packet was dropped)
3382 *
3383 */
3384
3385int netif_rx(struct sk_buff *skb)
3386{
3387 trace_netif_rx_entry(skb);
3388
3389 return netif_rx_internal(skb);
3390}
d1b19dff 3391EXPORT_SYMBOL(netif_rx);
1da177e4
LT
3392
3393int netif_rx_ni(struct sk_buff *skb)
3394{
3395 int err;
3396
ae78dbfa
BH
3397 trace_netif_rx_ni_entry(skb);
3398
1da177e4 3399 preempt_disable();
ae78dbfa 3400 err = netif_rx_internal(skb);
1da177e4
LT
3401 if (local_softirq_pending())
3402 do_softirq();
3403 preempt_enable();
3404
3405 return err;
3406}
1da177e4
LT
3407EXPORT_SYMBOL(netif_rx_ni);
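/*
 * Editor's sketch, not part of dev.c: a non-NAPI driver handing a received
 * frame to the stack.  The copy-based receive and its length handling are
 * hypothetical; NAPI drivers would use netif_receive_skb() instead.
 */
static void example_rx_frame(struct net_device *dev, const void *data,
			     unsigned int len)
{
	struct sk_buff *skb = netdev_alloc_skb(dev, len);

	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, len), data, len);
	skb->protocol = eth_type_trans(skb, dev);

	netif_rx(skb);		/* use netif_rx_ni() from process context */
}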
3408
1da177e4
LT
3409static void net_tx_action(struct softirq_action *h)
3410{
903ceff7 3411 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
1da177e4
LT
3412
3413 if (sd->completion_queue) {
3414 struct sk_buff *clist;
3415
3416 local_irq_disable();
3417 clist = sd->completion_queue;
3418 sd->completion_queue = NULL;
3419 local_irq_enable();
3420
3421 while (clist) {
3422 struct sk_buff *skb = clist;
3423 clist = clist->next;
3424
547b792c 3425 WARN_ON(atomic_read(&skb->users));
e6247027
ED
3426 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3427 trace_consume_skb(skb);
3428 else
3429 trace_kfree_skb(skb, net_tx_action);
1da177e4
LT
3430 __kfree_skb(skb);
3431 }
3432 }
3433
3434 if (sd->output_queue) {
37437bb2 3435 struct Qdisc *head;
1da177e4
LT
3436
3437 local_irq_disable();
3438 head = sd->output_queue;
3439 sd->output_queue = NULL;
a9cbd588 3440 sd->output_queue_tailp = &sd->output_queue;
1da177e4
LT
3441 local_irq_enable();
3442
3443 while (head) {
37437bb2
DM
3444 struct Qdisc *q = head;
3445 spinlock_t *root_lock;
3446
1da177e4
LT
3447 head = head->next_sched;
3448
5fb66229 3449 root_lock = qdisc_lock(q);
37437bb2 3450 if (spin_trylock(root_lock)) {
4e857c58 3451 smp_mb__before_atomic();
def82a1d
JP
3452 clear_bit(__QDISC_STATE_SCHED,
3453 &q->state);
37437bb2
DM
3454 qdisc_run(q);
3455 spin_unlock(root_lock);
1da177e4 3456 } else {
195648bb 3457 if (!test_bit(__QDISC_STATE_DEACTIVATED,
e8a83e10 3458 &q->state)) {
195648bb 3459 __netif_reschedule(q);
e8a83e10 3460 } else {
4e857c58 3461 smp_mb__before_atomic();
e8a83e10
JP
3462 clear_bit(__QDISC_STATE_SCHED,
3463 &q->state);
3464 }
1da177e4
LT
3465 }
3466 }
3467 }
3468}
3469
ab95bfe0
JP
3470#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3471 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
da678292
MM
3472/* This hook is defined here for ATM LANE */
3473int (*br_fdb_test_addr_hook)(struct net_device *dev,
3474 unsigned char *addr) __read_mostly;
4fb019a0 3475EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
da678292 3476#endif
1da177e4 3477
1da177e4
LT
3478#ifdef CONFIG_NET_CLS_ACT
3479/* TODO: Maybe we should just force sch_ingress to be compiled in
 3480 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
 3481 * instructions (a compare and 2 extra stores) right now if we don't
 3482 * have it on but have CONFIG_NET_CLS_ACT
25985edc
LDM
 3483 * NOTE: This doesn't stop any functionality; if you don't have
3484 * the ingress scheduler, you just can't add policies on ingress.
1da177e4
LT
3485 *
3486 */
24824a09 3487static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
1da177e4 3488{
1da177e4 3489 struct net_device *dev = skb->dev;
f697c3e8 3490 u32 ttl = G_TC_RTTL(skb->tc_verd);
555353cf
DM
3491 int result = TC_ACT_OK;
3492 struct Qdisc *q;
4ec93edb 3493
de384830 3494 if (unlikely(MAX_RED_LOOP < ttl++)) {
e87cc472
JP
 3495 net_warn_ratelimited("Redir loop detected, dropping packet (%d->%d)\n",
3496 skb->skb_iif, dev->ifindex);
f697c3e8
HX
3497 return TC_ACT_SHOT;
3498 }
1da177e4 3499
f697c3e8
HX
3500 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3501 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
1da177e4 3502
46e5da40 3503 q = rcu_dereference(rxq->qdisc);
8d50b53d 3504 if (q != &noop_qdisc) {
83874000 3505 spin_lock(qdisc_lock(q));
a9312ae8
DM
3506 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3507 result = qdisc_enqueue_root(skb, q);
83874000
DM
3508 spin_unlock(qdisc_lock(q));
3509 }
f697c3e8
HX
3510
3511 return result;
3512}
86e65da9 3513
f697c3e8
HX
3514static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3515 struct packet_type **pt_prev,
3516 int *ret, struct net_device *orig_dev)
3517{
24824a09
ED
3518 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3519
46e5da40 3520 if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
f697c3e8 3521 goto out;
1da177e4 3522
f697c3e8
HX
3523 if (*pt_prev) {
3524 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3525 *pt_prev = NULL;
1da177e4
LT
3526 }
3527
24824a09 3528 switch (ing_filter(skb, rxq)) {
f697c3e8
HX
3529 case TC_ACT_SHOT:
3530 case TC_ACT_STOLEN:
3531 kfree_skb(skb);
3532 return NULL;
3533 }
3534
3535out:
3536 skb->tc_verd = 0;
3537 return skb;
1da177e4
LT
3538}
3539#endif
3540
ab95bfe0
JP
3541/**
3542 * netdev_rx_handler_register - register receive handler
3543 * @dev: device to register a handler for
3544 * @rx_handler: receive handler to register
93e2c32b 3545 * @rx_handler_data: data pointer that is used by rx handler
ab95bfe0 3546 *
e227867f 3547 * Register a receive handler for a device. This handler will then be
ab95bfe0
JP
3548 * called from __netif_receive_skb. A negative errno code is returned
3549 * on a failure.
3550 *
3551 * The caller must hold the rtnl_mutex.
8a4eb573
JP
3552 *
3553 * For a general description of rx_handler, see enum rx_handler_result.
ab95bfe0
JP
3554 */
3555int netdev_rx_handler_register(struct net_device *dev,
93e2c32b
JP
3556 rx_handler_func_t *rx_handler,
3557 void *rx_handler_data)
ab95bfe0
JP
3558{
3559 ASSERT_RTNL();
3560
3561 if (dev->rx_handler)
3562 return -EBUSY;
3563
00cfec37 3564 /* Note: rx_handler_data must be set before rx_handler */
93e2c32b 3565 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
ab95bfe0
JP
3566 rcu_assign_pointer(dev->rx_handler, rx_handler);
3567
3568 return 0;
3569}
3570EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3571
3572/**
3573 * netdev_rx_handler_unregister - unregister receive handler
3574 * @dev: device to unregister a handler from
3575 *
166ec369 3576 * Unregister a receive handler from a device.
ab95bfe0
JP
3577 *
3578 * The caller must hold the rtnl_mutex.
3579 */
3580void netdev_rx_handler_unregister(struct net_device *dev)
3581{
3582
3583 ASSERT_RTNL();
a9b3cd7f 3584 RCU_INIT_POINTER(dev->rx_handler, NULL);
00cfec37
ED
 3585 /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
 3586 * section is guaranteed to see a non-NULL rx_handler_data
 3587 * as well.
3588 */
3589 synchronize_net();
a9b3cd7f 3590 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
ab95bfe0
JP
3591}
3592EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
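/*
 * Illustrative sketch only: attaching and detaching an rx_handler on a
 * port device, in the style of bridge/bonding/team/macvlan.  The
 * "myport_" names and the port_priv pointer are hypothetical.
 */
static rx_handler_result_t myport_handle_frame(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        void *port_priv = rcu_dereference(skb->dev->rx_handler_data);

        if (!port_priv)
                return RX_HANDLER_PASS;         /* nothing to do, normal delivery */

        /* ... steer skb towards the master device, possibly updating
         * skb->dev and *pskb, then return RX_HANDLER_ANOTHER to restart
         * processing ...
         */
        return RX_HANDLER_PASS;
}

static int myport_attach(struct net_device *port_dev, void *port_priv)
{
        ASSERT_RTNL();                          /* rtnl_mutex must be held */

        return netdev_rx_handler_register(port_dev, myport_handle_frame,
                                          port_priv);
}

static void myport_detach(struct net_device *port_dev)
{
        ASSERT_RTNL();
        netdev_rx_handler_unregister(port_dev);
}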
3593
b4b9e355
MG
3594/*
3595 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3596 * the special handling of PFMEMALLOC skbs.
3597 */
3598static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3599{
3600 switch (skb->protocol) {
2b8837ae
JP
3601 case htons(ETH_P_ARP):
3602 case htons(ETH_P_IP):
3603 case htons(ETH_P_IPV6):
3604 case htons(ETH_P_8021Q):
3605 case htons(ETH_P_8021AD):
b4b9e355
MG
3606 return true;
3607 default:
3608 return false;
3609 }
3610}
3611
9754e293 3612static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
1da177e4
LT
3613{
3614 struct packet_type *ptype, *pt_prev;
ab95bfe0 3615 rx_handler_func_t *rx_handler;
f2ccd8fa 3616 struct net_device *orig_dev;
63d8ea7f 3617 struct net_device *null_or_dev;
8a4eb573 3618 bool deliver_exact = false;
1da177e4 3619 int ret = NET_RX_DROP;
252e3346 3620 __be16 type;
1da177e4 3621
588f0330 3622 net_timestamp_check(!netdev_tstamp_prequeue, skb);
81bbb3d4 3623
cf66ba58 3624 trace_netif_receive_skb(skb);
9b22ea56 3625
cc9bd5ce 3626 orig_dev = skb->dev;
8f903c70 3627
c1d2bbe1 3628 skb_reset_network_header(skb);
fda55eca
ED
3629 if (!skb_transport_header_was_set(skb))
3630 skb_reset_transport_header(skb);
0b5c9db1 3631 skb_reset_mac_len(skb);
1da177e4
LT
3632
3633 pt_prev = NULL;
3634
3635 rcu_read_lock();
3636
63d8ea7f 3637another_round:
b6858177 3638 skb->skb_iif = skb->dev->ifindex;
63d8ea7f
DM
3639
3640 __this_cpu_inc(softnet_data.processed);
3641
8ad227ff
PM
3642 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3643 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
0d5501c1 3644 skb = skb_vlan_untag(skb);
bcc6d479 3645 if (unlikely(!skb))
b4b9e355 3646 goto unlock;
bcc6d479
JP
3647 }
3648
1da177e4
LT
3649#ifdef CONFIG_NET_CLS_ACT
3650 if (skb->tc_verd & TC_NCLS) {
3651 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3652 goto ncls;
3653 }
3654#endif
3655
9754e293 3656 if (pfmemalloc)
b4b9e355
MG
3657 goto skip_taps;
3658
1da177e4 3659 list_for_each_entry_rcu(ptype, &ptype_all, list) {
63d8ea7f 3660 if (!ptype->dev || ptype->dev == skb->dev) {
4ec93edb 3661 if (pt_prev)
f2ccd8fa 3662 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
3663 pt_prev = ptype;
3664 }
3665 }
3666
b4b9e355 3667skip_taps:
1da177e4 3668#ifdef CONFIG_NET_CLS_ACT
f697c3e8
HX
3669 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3670 if (!skb)
b4b9e355 3671 goto unlock;
1da177e4
LT
3672ncls:
3673#endif
3674
9754e293 3675 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
b4b9e355
MG
3676 goto drop;
3677
2425717b
JF
3678 if (vlan_tx_tag_present(skb)) {
3679 if (pt_prev) {
3680 ret = deliver_skb(skb, pt_prev, orig_dev);
3681 pt_prev = NULL;
3682 }
48cc32d3 3683 if (vlan_do_receive(&skb))
2425717b
JF
3684 goto another_round;
3685 else if (unlikely(!skb))
b4b9e355 3686 goto unlock;
2425717b
JF
3687 }
3688
48cc32d3 3689 rx_handler = rcu_dereference(skb->dev->rx_handler);
ab95bfe0
JP
3690 if (rx_handler) {
3691 if (pt_prev) {
3692 ret = deliver_skb(skb, pt_prev, orig_dev);
3693 pt_prev = NULL;
3694 }
8a4eb573
JP
3695 switch (rx_handler(&skb)) {
3696 case RX_HANDLER_CONSUMED:
3bc1b1ad 3697 ret = NET_RX_SUCCESS;
b4b9e355 3698 goto unlock;
8a4eb573 3699 case RX_HANDLER_ANOTHER:
63d8ea7f 3700 goto another_round;
8a4eb573
JP
3701 case RX_HANDLER_EXACT:
3702 deliver_exact = true;
3703 case RX_HANDLER_PASS:
3704 break;
3705 default:
3706 BUG();
3707 }
ab95bfe0 3708 }
1da177e4 3709
d4b812de
ED
3710 if (unlikely(vlan_tx_tag_present(skb))) {
3711 if (vlan_tx_tag_get_id(skb))
3712 skb->pkt_type = PACKET_OTHERHOST;
3713 /* Note: we might in the future use prio bits
3714 * and set skb->priority like in vlan_do_receive()
3715 * For the time being, just ignore Priority Code Point
3716 */
3717 skb->vlan_tci = 0;
3718 }
48cc32d3 3719
63d8ea7f 3720 /* deliver only exact match when indicated */
8a4eb573 3721 null_or_dev = deliver_exact ? skb->dev : NULL;
1f3c8804 3722
1da177e4 3723 type = skb->protocol;
82d8a867
PE
3724 list_for_each_entry_rcu(ptype,
3725 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
63d8ea7f 3726 if (ptype->type == type &&
e3f48d37
JP
3727 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3728 ptype->dev == orig_dev)) {
4ec93edb 3729 if (pt_prev)
f2ccd8fa 3730 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
3731 pt_prev = ptype;
3732 }
3733 }
3734
3735 if (pt_prev) {
1080e512 3736 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
0e698bf6 3737 goto drop;
1080e512
MT
3738 else
3739 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4 3740 } else {
b4b9e355 3741drop:
caf586e5 3742 atomic_long_inc(&skb->dev->rx_dropped);
1da177e4
LT
3743 kfree_skb(skb);
 3744 /* Jamal, now you will not be able to escape explaining
 3745 * to me how you were going to use this. :-)
3746 */
3747 ret = NET_RX_DROP;
3748 }
3749
b4b9e355 3750unlock:
1da177e4 3751 rcu_read_unlock();
9754e293
DM
3752 return ret;
3753}
3754
3755static int __netif_receive_skb(struct sk_buff *skb)
3756{
3757 int ret;
3758
3759 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3760 unsigned long pflags = current->flags;
3761
3762 /*
3763 * PFMEMALLOC skbs are special, they should
3764 * - be delivered to SOCK_MEMALLOC sockets only
3765 * - stay away from userspace
3766 * - have bounded memory usage
3767 *
3768 * Use PF_MEMALLOC as this saves us from propagating the allocation
3769 * context down to all allocation sites.
3770 */
3771 current->flags |= PF_MEMALLOC;
3772 ret = __netif_receive_skb_core(skb, true);
3773 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3774 } else
3775 ret = __netif_receive_skb_core(skb, false);
3776
1da177e4
LT
3777 return ret;
3778}
0a9627f2 3779
ae78dbfa 3780static int netif_receive_skb_internal(struct sk_buff *skb)
0a9627f2 3781{
588f0330 3782 net_timestamp_check(netdev_tstamp_prequeue, skb);
3b098e2d 3783
c1f19b51
RC
3784 if (skb_defer_rx_timestamp(skb))
3785 return NET_RX_SUCCESS;
3786
df334545 3787#ifdef CONFIG_RPS
c5905afb 3788 if (static_key_false(&rps_needed)) {
3b098e2d
ED
3789 struct rps_dev_flow voidflow, *rflow = &voidflow;
3790 int cpu, ret;
fec5e652 3791
3b098e2d
ED
3792 rcu_read_lock();
3793
3794 cpu = get_rps_cpu(skb->dev, skb, &rflow);
0a9627f2 3795
3b098e2d
ED
3796 if (cpu >= 0) {
3797 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3798 rcu_read_unlock();
adc9300e 3799 return ret;
3b098e2d 3800 }
adc9300e 3801 rcu_read_unlock();
fec5e652 3802 }
1e94d72f 3803#endif
adc9300e 3804 return __netif_receive_skb(skb);
0a9627f2 3805}
ae78dbfa
BH
3806
3807/**
3808 * netif_receive_skb - process receive buffer from network
3809 * @skb: buffer to process
3810 *
3811 * netif_receive_skb() is the main receive data processing function.
3812 * It always succeeds. The buffer may be dropped during processing
3813 * for congestion control or by the protocol layers.
3814 *
3815 * This function may only be called from softirq context and interrupts
3816 * should be enabled.
3817 *
3818 * Return values (usually ignored):
3819 * NET_RX_SUCCESS: no congestion
3820 * NET_RX_DROP: packet was dropped
3821 */
3822int netif_receive_skb(struct sk_buff *skb)
3823{
3824 trace_netif_receive_skb_entry(skb);
3825
3826 return netif_receive_skb_internal(skb);
3827}
d1b19dff 3828EXPORT_SYMBOL(netif_receive_skb);
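/*
 * Illustrative sketch only: unlike netif_rx(), which may be called from
 * hard interrupt context, netif_receive_skb() must be called from softirq
 * context, typically from a driver's NAPI ->poll() callback.  The
 * "mydrv_" helper below is hypothetical.
 */
static struct sk_buff *mydrv_next_rx_skb(struct napi_struct *napi); /* hypothetical */

static int mydrv_poll_no_gro(struct napi_struct *napi, int budget)
{
        struct sk_buff *skb;
        int done = 0;

        while (done < budget && (skb = mydrv_next_rx_skb(napi)) != NULL) {
                skb->protocol = eth_type_trans(skb, napi->dev);
                netif_receive_skb(skb);         /* deliver directly, no backlog queue */
                done++;
        }
        return done;
}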
1da177e4 3829
88751275
ED
 3830/* Network device is going away, flush any packets still pending.
 3831 * Called with irqs disabled.
3832 */
152102c7 3833static void flush_backlog(void *arg)
6e583ce5 3834{
152102c7 3835 struct net_device *dev = arg;
903ceff7 3836 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6e583ce5
SH
3837 struct sk_buff *skb, *tmp;
3838
e36fa2f7 3839 rps_lock(sd);
6e7676c1 3840 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
6e583ce5 3841 if (skb->dev == dev) {
e36fa2f7 3842 __skb_unlink(skb, &sd->input_pkt_queue);
6e583ce5 3843 kfree_skb(skb);
76cc8b13 3844 input_queue_head_incr(sd);
6e583ce5 3845 }
6e7676c1 3846 }
e36fa2f7 3847 rps_unlock(sd);
6e7676c1
CG
3848
3849 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3850 if (skb->dev == dev) {
3851 __skb_unlink(skb, &sd->process_queue);
3852 kfree_skb(skb);
76cc8b13 3853 input_queue_head_incr(sd);
6e7676c1
CG
3854 }
3855 }
6e583ce5
SH
3856}
3857
d565b0a1
HX
3858static int napi_gro_complete(struct sk_buff *skb)
3859{
22061d80 3860 struct packet_offload *ptype;
d565b0a1 3861 __be16 type = skb->protocol;
22061d80 3862 struct list_head *head = &offload_base;
d565b0a1
HX
3863 int err = -ENOENT;
3864
c3c7c254
ED
3865 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3866
fc59f9a3
HX
3867 if (NAPI_GRO_CB(skb)->count == 1) {
3868 skb_shinfo(skb)->gso_size = 0;
d565b0a1 3869 goto out;
fc59f9a3 3870 }
d565b0a1
HX
3871
3872 rcu_read_lock();
3873 list_for_each_entry_rcu(ptype, head, list) {
f191a1d1 3874 if (ptype->type != type || !ptype->callbacks.gro_complete)
d565b0a1
HX
3875 continue;
3876
299603e8 3877 err = ptype->callbacks.gro_complete(skb, 0);
d565b0a1
HX
3878 break;
3879 }
3880 rcu_read_unlock();
3881
3882 if (err) {
3883 WARN_ON(&ptype->list == head);
3884 kfree_skb(skb);
3885 return NET_RX_SUCCESS;
3886 }
3887
3888out:
ae78dbfa 3889 return netif_receive_skb_internal(skb);
d565b0a1
HX
3890}
3891
2e71a6f8
ED
3892/* napi->gro_list contains packets ordered by age.
 3893 * The youngest packets are at its head.
3894 * Complete skbs in reverse order to reduce latencies.
3895 */
3896void napi_gro_flush(struct napi_struct *napi, bool flush_old)
d565b0a1 3897{
2e71a6f8 3898 struct sk_buff *skb, *prev = NULL;
d565b0a1 3899
2e71a6f8
ED
3900 /* scan list and build reverse chain */
3901 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3902 skb->prev = prev;
3903 prev = skb;
3904 }
3905
3906 for (skb = prev; skb; skb = prev) {
d565b0a1 3907 skb->next = NULL;
2e71a6f8
ED
3908
3909 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3910 return;
3911
3912 prev = skb->prev;
d565b0a1 3913 napi_gro_complete(skb);
2e71a6f8 3914 napi->gro_count--;
d565b0a1
HX
3915 }
3916
3917 napi->gro_list = NULL;
3918}
86cac58b 3919EXPORT_SYMBOL(napi_gro_flush);
d565b0a1 3920
89c5fa33
ED
3921static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3922{
3923 struct sk_buff *p;
3924 unsigned int maclen = skb->dev->hard_header_len;
0b4cec8c 3925 u32 hash = skb_get_hash_raw(skb);
89c5fa33
ED
3926
3927 for (p = napi->gro_list; p; p = p->next) {
3928 unsigned long diffs;
3929
0b4cec8c
TH
3930 NAPI_GRO_CB(p)->flush = 0;
3931
3932 if (hash != skb_get_hash_raw(p)) {
3933 NAPI_GRO_CB(p)->same_flow = 0;
3934 continue;
3935 }
3936
89c5fa33
ED
3937 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3938 diffs |= p->vlan_tci ^ skb->vlan_tci;
3939 if (maclen == ETH_HLEN)
3940 diffs |= compare_ether_header(skb_mac_header(p),
a50e233c 3941 skb_mac_header(skb));
89c5fa33
ED
3942 else if (!diffs)
3943 diffs = memcmp(skb_mac_header(p),
a50e233c 3944 skb_mac_header(skb),
89c5fa33
ED
3945 maclen);
3946 NAPI_GRO_CB(p)->same_flow = !diffs;
89c5fa33
ED
3947 }
3948}
3949
299603e8
JC
3950static void skb_gro_reset_offset(struct sk_buff *skb)
3951{
3952 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3953 const skb_frag_t *frag0 = &pinfo->frags[0];
3954
3955 NAPI_GRO_CB(skb)->data_offset = 0;
3956 NAPI_GRO_CB(skb)->frag0 = NULL;
3957 NAPI_GRO_CB(skb)->frag0_len = 0;
3958
3959 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3960 pinfo->nr_frags &&
3961 !PageHighMem(skb_frag_page(frag0))) {
3962 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3963 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
89c5fa33
ED
3964 }
3965}
3966
a50e233c
ED
3967static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3968{
3969 struct skb_shared_info *pinfo = skb_shinfo(skb);
3970
3971 BUG_ON(skb->end - skb->tail < grow);
3972
3973 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3974
3975 skb->data_len -= grow;
3976 skb->tail += grow;
3977
3978 pinfo->frags[0].page_offset += grow;
3979 skb_frag_size_sub(&pinfo->frags[0], grow);
3980
3981 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3982 skb_frag_unref(skb, 0);
3983 memmove(pinfo->frags, pinfo->frags + 1,
3984 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3985 }
3986}
3987
bb728820 3988static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
d565b0a1
HX
3989{
3990 struct sk_buff **pp = NULL;
22061d80 3991 struct packet_offload *ptype;
d565b0a1 3992 __be16 type = skb->protocol;
22061d80 3993 struct list_head *head = &offload_base;
0da2afd5 3994 int same_flow;
5b252f0c 3995 enum gro_result ret;
a50e233c 3996 int grow;
d565b0a1 3997
9c62a68d 3998 if (!(skb->dev->features & NETIF_F_GRO))
d565b0a1
HX
3999 goto normal;
4000
5a212329 4001 if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
f17f5c91
HX
4002 goto normal;
4003
89c5fa33
ED
4004 gro_list_prepare(napi, skb);
4005
d565b0a1
HX
4006 rcu_read_lock();
4007 list_for_each_entry_rcu(ptype, head, list) {
f191a1d1 4008 if (ptype->type != type || !ptype->callbacks.gro_receive)
d565b0a1
HX
4009 continue;
4010
86911732 4011 skb_set_network_header(skb, skb_gro_offset(skb));
efd9450e 4012 skb_reset_mac_len(skb);
d565b0a1
HX
4013 NAPI_GRO_CB(skb)->same_flow = 0;
4014 NAPI_GRO_CB(skb)->flush = 0;
5d38a079 4015 NAPI_GRO_CB(skb)->free = 0;
b582ef09 4016 NAPI_GRO_CB(skb)->udp_mark = 0;
d565b0a1 4017
662880f4
TH
4018 /* Setup for GRO checksum validation */
4019 switch (skb->ip_summed) {
4020 case CHECKSUM_COMPLETE:
4021 NAPI_GRO_CB(skb)->csum = skb->csum;
4022 NAPI_GRO_CB(skb)->csum_valid = 1;
4023 NAPI_GRO_CB(skb)->csum_cnt = 0;
4024 break;
4025 case CHECKSUM_UNNECESSARY:
4026 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4027 NAPI_GRO_CB(skb)->csum_valid = 0;
4028 break;
4029 default:
4030 NAPI_GRO_CB(skb)->csum_cnt = 0;
4031 NAPI_GRO_CB(skb)->csum_valid = 0;
4032 }
d565b0a1 4033
f191a1d1 4034 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
d565b0a1
HX
4035 break;
4036 }
4037 rcu_read_unlock();
4038
4039 if (&ptype->list == head)
4040 goto normal;
4041
0da2afd5 4042 same_flow = NAPI_GRO_CB(skb)->same_flow;
5d0d9be8 4043 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
0da2afd5 4044
d565b0a1
HX
4045 if (pp) {
4046 struct sk_buff *nskb = *pp;
4047
4048 *pp = nskb->next;
4049 nskb->next = NULL;
4050 napi_gro_complete(nskb);
4ae5544f 4051 napi->gro_count--;
d565b0a1
HX
4052 }
4053
0da2afd5 4054 if (same_flow)
d565b0a1
HX
4055 goto ok;
4056
600adc18 4057 if (NAPI_GRO_CB(skb)->flush)
d565b0a1 4058 goto normal;
d565b0a1 4059
600adc18
ED
4060 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4061 struct sk_buff *nskb = napi->gro_list;
4062
4063 /* locate the end of the list to select the 'oldest' flow */
4064 while (nskb->next) {
4065 pp = &nskb->next;
4066 nskb = *pp;
4067 }
4068 *pp = NULL;
4069 nskb->next = NULL;
4070 napi_gro_complete(nskb);
4071 } else {
4072 napi->gro_count++;
4073 }
d565b0a1 4074 NAPI_GRO_CB(skb)->count = 1;
2e71a6f8 4075 NAPI_GRO_CB(skb)->age = jiffies;
29e98242 4076 NAPI_GRO_CB(skb)->last = skb;
86911732 4077 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
d565b0a1
HX
4078 skb->next = napi->gro_list;
4079 napi->gro_list = skb;
5d0d9be8 4080 ret = GRO_HELD;
d565b0a1 4081
ad0f9904 4082pull:
a50e233c
ED
4083 grow = skb_gro_offset(skb) - skb_headlen(skb);
4084 if (grow > 0)
4085 gro_pull_from_frag0(skb, grow);
d565b0a1 4086ok:
5d0d9be8 4087 return ret;
d565b0a1
HX
4088
4089normal:
ad0f9904
HX
4090 ret = GRO_NORMAL;
4091 goto pull;
5d38a079 4092}
96e93eab 4093
bf5a755f
JC
4094struct packet_offload *gro_find_receive_by_type(__be16 type)
4095{
4096 struct list_head *offload_head = &offload_base;
4097 struct packet_offload *ptype;
4098
4099 list_for_each_entry_rcu(ptype, offload_head, list) {
4100 if (ptype->type != type || !ptype->callbacks.gro_receive)
4101 continue;
4102 return ptype;
4103 }
4104 return NULL;
4105}
e27a2f83 4106EXPORT_SYMBOL(gro_find_receive_by_type);
bf5a755f
JC
4107
4108struct packet_offload *gro_find_complete_by_type(__be16 type)
4109{
4110 struct list_head *offload_head = &offload_base;
4111 struct packet_offload *ptype;
4112
4113 list_for_each_entry_rcu(ptype, offload_head, list) {
4114 if (ptype->type != type || !ptype->callbacks.gro_complete)
4115 continue;
4116 return ptype;
4117 }
4118 return NULL;
4119}
e27a2f83 4120EXPORT_SYMBOL(gro_find_complete_by_type);
5d38a079 4121
bb728820 4122static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5d38a079 4123{
5d0d9be8
HX
4124 switch (ret) {
4125 case GRO_NORMAL:
ae78dbfa 4126 if (netif_receive_skb_internal(skb))
c7c4b3b6
BH
4127 ret = GRO_DROP;
4128 break;
5d38a079 4129
5d0d9be8 4130 case GRO_DROP:
5d38a079
HX
4131 kfree_skb(skb);
4132 break;
5b252f0c 4133
daa86548 4134 case GRO_MERGED_FREE:
d7e8883c
ED
4135 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4136 kmem_cache_free(skbuff_head_cache, skb);
4137 else
4138 __kfree_skb(skb);
daa86548
ED
4139 break;
4140
5b252f0c
BH
4141 case GRO_HELD:
4142 case GRO_MERGED:
4143 break;
5d38a079
HX
4144 }
4145
c7c4b3b6 4146 return ret;
5d0d9be8 4147}
5d0d9be8 4148
c7c4b3b6 4149gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5d0d9be8 4150{
ae78dbfa 4151 trace_napi_gro_receive_entry(skb);
86911732 4152
a50e233c
ED
4153 skb_gro_reset_offset(skb);
4154
89c5fa33 4155 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
d565b0a1
HX
4156}
4157EXPORT_SYMBOL(napi_gro_receive);
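/*
 * Illustrative sketch only: feeding received frames through GRO from a
 * driver's NAPI ->poll() callback.  napi_gro_receive() takes ownership
 * of the skb in every case, so the driver must not touch it afterwards.
 * The "mydrv_" name is hypothetical.
 */
static void mydrv_rx_one(struct napi_struct *napi, struct sk_buff *skb)
{
        skb->protocol = eth_type_trans(skb, napi->dev);
        skb_record_rx_queue(skb, 0);            /* single rx queue assumed */

        napi_gro_receive(napi, skb);            /* may merge, hold or deliver the skb */
}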
4158
d0c2b0d2 4159static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
96e93eab 4160{
93a35f59
ED
4161 if (unlikely(skb->pfmemalloc)) {
4162 consume_skb(skb);
4163 return;
4164 }
96e93eab 4165 __skb_pull(skb, skb_headlen(skb));
2a2a459e
ED
4166 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4167 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3701e513 4168 skb->vlan_tci = 0;
66c46d74 4169 skb->dev = napi->dev;
6d152e23 4170 skb->skb_iif = 0;
c3caf119
JC
4171 skb->encapsulation = 0;
4172 skb_shinfo(skb)->gso_type = 0;
e33d0ba8 4173 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
96e93eab
HX
4174
4175 napi->skb = skb;
4176}
96e93eab 4177
76620aaf 4178struct sk_buff *napi_get_frags(struct napi_struct *napi)
5d38a079 4179{
5d38a079 4180 struct sk_buff *skb = napi->skb;
5d38a079
HX
4181
4182 if (!skb) {
89d71a66 4183 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
84b9cd63 4184 napi->skb = skb;
80595d59 4185 }
96e93eab
HX
4186 return skb;
4187}
76620aaf 4188EXPORT_SYMBOL(napi_get_frags);
96e93eab 4189
a50e233c
ED
4190static gro_result_t napi_frags_finish(struct napi_struct *napi,
4191 struct sk_buff *skb,
4192 gro_result_t ret)
96e93eab 4193{
5d0d9be8
HX
4194 switch (ret) {
4195 case GRO_NORMAL:
a50e233c
ED
4196 case GRO_HELD:
4197 __skb_push(skb, ETH_HLEN);
4198 skb->protocol = eth_type_trans(skb, skb->dev);
4199 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
c7c4b3b6 4200 ret = GRO_DROP;
86911732 4201 break;
5d38a079 4202
5d0d9be8 4203 case GRO_DROP:
5d0d9be8
HX
4204 case GRO_MERGED_FREE:
4205 napi_reuse_skb(napi, skb);
4206 break;
5b252f0c
BH
4207
4208 case GRO_MERGED:
4209 break;
5d0d9be8 4210 }
5d38a079 4211
c7c4b3b6 4212 return ret;
5d38a079 4213}
5d0d9be8 4214
a50e233c
ED
4215/* Upper GRO stack assumes network header starts at gro_offset=0
4216 * Drivers could call both napi_gro_frags() and napi_gro_receive()
4217 * We copy ethernet header into skb->data to have a common layout.
4218 */
4adb9c4a 4219static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
76620aaf
HX
4220{
4221 struct sk_buff *skb = napi->skb;
a50e233c
ED
4222 const struct ethhdr *eth;
4223 unsigned int hlen = sizeof(*eth);
76620aaf
HX
4224
4225 napi->skb = NULL;
4226
a50e233c
ED
4227 skb_reset_mac_header(skb);
4228 skb_gro_reset_offset(skb);
4229
4230 eth = skb_gro_header_fast(skb, 0);
4231 if (unlikely(skb_gro_header_hard(skb, hlen))) {
4232 eth = skb_gro_header_slow(skb, hlen, 0);
4233 if (unlikely(!eth)) {
4234 napi_reuse_skb(napi, skb);
4235 return NULL;
4236 }
4237 } else {
4238 gro_pull_from_frag0(skb, hlen);
4239 NAPI_GRO_CB(skb)->frag0 += hlen;
4240 NAPI_GRO_CB(skb)->frag0_len -= hlen;
76620aaf 4241 }
a50e233c
ED
4242 __skb_pull(skb, hlen);
4243
4244 /*
4245 * This works because the only protocols we care about don't require
4246 * special handling.
4247 * We'll fix it up properly in napi_frags_finish()
4248 */
4249 skb->protocol = eth->h_proto;
76620aaf 4250
76620aaf
HX
4251 return skb;
4252}
76620aaf 4253
c7c4b3b6 4254gro_result_t napi_gro_frags(struct napi_struct *napi)
5d0d9be8 4255{
76620aaf 4256 struct sk_buff *skb = napi_frags_skb(napi);
5d0d9be8
HX
4257
4258 if (!skb)
c7c4b3b6 4259 return GRO_DROP;
5d0d9be8 4260
ae78dbfa
BH
4261 trace_napi_gro_frags_entry(skb);
4262
89c5fa33 4263 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
5d0d9be8 4264}
5d38a079
HX
4265EXPORT_SYMBOL(napi_gro_frags);
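/*
 * Illustrative sketch only: page-based receive with napi_get_frags() and
 * napi_gro_frags(), as used by drivers that attach rx pages as fragments
 * instead of copying into a linear buffer.  The "mydrv_" name and the
 * page/offset/len/truesize parameters are hypothetical.
 */
static void mydrv_rx_page(struct napi_struct *napi, struct page *page,
                          unsigned int offset, unsigned int len,
                          unsigned int truesize)
{
        struct sk_buff *skb = napi_get_frags(napi);

        if (unlikely(!skb)) {
                put_page(page);                 /* allocation failed, drop the frame */
                return;
        }

        /* Attach the page as frag 0.  napi_gro_frags() pulls the ethernet
         * header into the linear area and sets skb->protocol itself.
         */
        skb_add_rx_frag(skb, 0, page, offset, len, truesize);

        napi_gro_frags(napi);
}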
4266
573e8fca
TH
4267/* Compute the checksum from gro_offset and return the folded value
4268 * after adding in any pseudo checksum.
4269 */
4270__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4271{
4272 __wsum wsum;
4273 __sum16 sum;
4274
4275 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4276
4277 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4278 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4279 if (likely(!sum)) {
4280 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4281 !skb->csum_complete_sw)
4282 netdev_rx_csum_fault(skb->dev);
4283 }
4284
4285 NAPI_GRO_CB(skb)->csum = wsum;
4286 NAPI_GRO_CB(skb)->csum_valid = 1;
4287
4288 return sum;
4289}
4290EXPORT_SYMBOL(__skb_gro_checksum_complete);
4291
e326bed2 4292/*
855abcf0 4293 * net_rps_action_and_irq_enable sends any pending IPIs for rps.
e326bed2
ED
4294 * Note: called with local irq disabled, but exits with local irq enabled.
4295 */
4296static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4297{
4298#ifdef CONFIG_RPS
4299 struct softnet_data *remsd = sd->rps_ipi_list;
4300
4301 if (remsd) {
4302 sd->rps_ipi_list = NULL;
4303
4304 local_irq_enable();
4305
 4306 /* Send pending IPIs to kick RPS processing on remote cpus. */
4307 while (remsd) {
4308 struct softnet_data *next = remsd->rps_ipi_next;
4309
4310 if (cpu_online(remsd->cpu))
c46fff2a 4311 smp_call_function_single_async(remsd->cpu,
fce8ad15 4312 &remsd->csd);
e326bed2
ED
4313 remsd = next;
4314 }
4315 } else
4316#endif
4317 local_irq_enable();
4318}
4319
d75b1ade
ED
4320static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4321{
4322#ifdef CONFIG_RPS
4323 return sd->rps_ipi_list != NULL;
4324#else
4325 return false;
4326#endif
4327}
4328
bea3348e 4329static int process_backlog(struct napi_struct *napi, int quota)
1da177e4
LT
4330{
4331 int work = 0;
eecfd7c4 4332 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
1da177e4 4333
e326bed2
ED
 4334 /* Check if we have pending IPIs; it's better to send them now
 4335 * than to wait for net_rx_action() to end.
4336 */
d75b1ade 4337 if (sd_has_rps_ipi_waiting(sd)) {
e326bed2
ED
4338 local_irq_disable();
4339 net_rps_action_and_irq_enable(sd);
4340 }
d75b1ade 4341
bea3348e 4342 napi->weight = weight_p;
6e7676c1 4343 local_irq_disable();
11ef7a89 4344 while (1) {
1da177e4 4345 struct sk_buff *skb;
6e7676c1
CG
4346
4347 while ((skb = __skb_dequeue(&sd->process_queue))) {
4348 local_irq_enable();
4349 __netif_receive_skb(skb);
6e7676c1 4350 local_irq_disable();
76cc8b13
TH
4351 input_queue_head_incr(sd);
4352 if (++work >= quota) {
4353 local_irq_enable();
4354 return work;
4355 }
6e7676c1 4356 }
1da177e4 4357
e36fa2f7 4358 rps_lock(sd);
11ef7a89 4359 if (skb_queue_empty(&sd->input_pkt_queue)) {
eecfd7c4
ED
4360 /*
4361 * Inline a custom version of __napi_complete().
 4362 * Only the current cpu owns and manipulates this napi,
11ef7a89
TH
4363 * and NAPI_STATE_SCHED is the only possible flag set
4364 * on backlog.
4365 * We can use a plain write instead of clear_bit(),
eecfd7c4
ED
 4366 * and we don't need an smp_mb() memory barrier.
4367 */
eecfd7c4 4368 napi->state = 0;
11ef7a89 4369 rps_unlock(sd);
eecfd7c4 4370
11ef7a89 4371 break;
bea3348e 4372 }
11ef7a89
TH
4373
4374 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4375 &sd->process_queue);
e36fa2f7 4376 rps_unlock(sd);
6e7676c1
CG
4377 }
4378 local_irq_enable();
1da177e4 4379
bea3348e
SH
4380 return work;
4381}
1da177e4 4382
bea3348e
SH
4383/**
4384 * __napi_schedule - schedule for receive
c4ea43c5 4385 * @n: entry to schedule
bea3348e 4386 *
bc9ad166
ED
4387 * The entry's receive function will be scheduled to run.
4388 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
bea3348e 4389 */
b5606c2d 4390void __napi_schedule(struct napi_struct *n)
bea3348e
SH
4391{
4392 unsigned long flags;
1da177e4 4393
bea3348e 4394 local_irq_save(flags);
903ceff7 4395 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
bea3348e 4396 local_irq_restore(flags);
1da177e4 4397}
bea3348e
SH
4398EXPORT_SYMBOL(__napi_schedule);
4399
bc9ad166
ED
4400/**
4401 * __napi_schedule_irqoff - schedule for receive
4402 * @n: entry to schedule
4403 *
4404 * Variant of __napi_schedule() assuming hard irqs are masked
4405 */
4406void __napi_schedule_irqoff(struct napi_struct *n)
4407{
4408 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4409}
4410EXPORT_SYMBOL(__napi_schedule_irqoff);
4411
d565b0a1
HX
4412void __napi_complete(struct napi_struct *n)
4413{
4414 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4415 BUG_ON(n->gro_list);
4416
d75b1ade 4417 list_del_init(&n->poll_list);
4e857c58 4418 smp_mb__before_atomic();
d565b0a1
HX
4419 clear_bit(NAPI_STATE_SCHED, &n->state);
4420}
4421EXPORT_SYMBOL(__napi_complete);
4422
4423void napi_complete(struct napi_struct *n)
4424{
4425 unsigned long flags;
4426
4427 /*
4428 * don't let napi dequeue from the cpu poll list
 4430 * just in case it's running on a different cpu
4430 */
4431 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4432 return;
4433
2e71a6f8 4434 napi_gro_flush(n, false);
d75b1ade
ED
4435
4436 if (likely(list_empty(&n->poll_list))) {
4437 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4438 } else {
4439 /* If n->poll_list is not empty, we need to mask irqs */
4440 local_irq_save(flags);
4441 __napi_complete(n);
4442 local_irq_restore(flags);
4443 }
d565b0a1
HX
4444}
4445EXPORT_SYMBOL(napi_complete);
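/*
 * Illustrative sketch only: the usual interrupt/poll handshake built on
 * napi_schedule_prep()/__napi_schedule() and napi_complete().  The
 * "mydrv_" structure and helpers are hypothetical.
 */
struct mydrv_priv {                             /* hypothetical per-device state */
        struct napi_struct napi;
        /* ... rings, registers, locks ... */
};

static void mydrv_mask_rx_irq(struct mydrv_priv *priv);              /* hypothetical */
static void mydrv_unmask_rx_irq(struct mydrv_priv *priv);            /* hypothetical */
static int mydrv_clean_rx_ring(struct mydrv_priv *priv, int budget); /* hypothetical */

static irqreturn_t mydrv_interrupt(int irq, void *dev_id)
{
        struct mydrv_priv *priv = dev_id;

        if (napi_schedule_prep(&priv->napi)) {
                mydrv_mask_rx_irq(priv);        /* stop further rx interrupts */
                __napi_schedule(&priv->napi);   /* run ->poll() from NET_RX_SOFTIRQ */
        }
        return IRQ_HANDLED;
}

static int mydrv_poll(struct napi_struct *napi, int budget)
{
        struct mydrv_priv *priv = container_of(napi, struct mydrv_priv, napi);
        int work = mydrv_clean_rx_ring(priv, budget);

        if (work < budget) {
                napi_complete(napi);            /* done for now, may be rescheduled */
                mydrv_unmask_rx_irq(priv);      /* re-arm the device interrupt */
        }
        return work;
}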
4446
af12fa6e
ET
 4447/* must be called under rcu_read_lock(), as we don't take a reference */
4448struct napi_struct *napi_by_id(unsigned int napi_id)
4449{
4450 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4451 struct napi_struct *napi;
4452
4453 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4454 if (napi->napi_id == napi_id)
4455 return napi;
4456
4457 return NULL;
4458}
4459EXPORT_SYMBOL_GPL(napi_by_id);
4460
4461void napi_hash_add(struct napi_struct *napi)
4462{
4463 if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4464
4465 spin_lock(&napi_hash_lock);
4466
 4467 /* 0 is not a valid id; we also skip an id that is already taken.
 4468 * We expect both events to be extremely rare.
4469 */
4470 napi->napi_id = 0;
4471 while (!napi->napi_id) {
4472 napi->napi_id = ++napi_gen_id;
4473 if (napi_by_id(napi->napi_id))
4474 napi->napi_id = 0;
4475 }
4476
4477 hlist_add_head_rcu(&napi->napi_hash_node,
4478 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4479
4480 spin_unlock(&napi_hash_lock);
4481 }
4482}
4483EXPORT_SYMBOL_GPL(napi_hash_add);
4484
 4485/* Warning: the caller is responsible for making sure an rcu grace period
 4486 * has elapsed before freeing the memory containing @napi
4487 */
4488void napi_hash_del(struct napi_struct *napi)
4489{
4490 spin_lock(&napi_hash_lock);
4491
4492 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4493 hlist_del_rcu(&napi->napi_hash_node);
4494
4495 spin_unlock(&napi_hash_lock);
4496}
4497EXPORT_SYMBOL_GPL(napi_hash_del);
4498
d565b0a1
HX
4499void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4500 int (*poll)(struct napi_struct *, int), int weight)
4501{
4502 INIT_LIST_HEAD(&napi->poll_list);
4ae5544f 4503 napi->gro_count = 0;
d565b0a1 4504 napi->gro_list = NULL;
5d38a079 4505 napi->skb = NULL;
d565b0a1 4506 napi->poll = poll;
82dc3c63
ED
4507 if (weight > NAPI_POLL_WEIGHT)
4508 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4509 weight, dev->name);
d565b0a1
HX
4510 napi->weight = weight;
4511 list_add(&napi->dev_list, &dev->napi_list);
d565b0a1 4512 napi->dev = dev;
5d38a079 4513#ifdef CONFIG_NETPOLL
d565b0a1
HX
4514 spin_lock_init(&napi->poll_lock);
4515 napi->poll_owner = -1;
4516#endif
4517 set_bit(NAPI_STATE_SCHED, &napi->state);
4518}
4519EXPORT_SYMBOL(netif_napi_add);
4520
4521void netif_napi_del(struct napi_struct *napi)
4522{
d7b06636 4523 list_del_init(&napi->dev_list);
76620aaf 4524 napi_free_frags(napi);
d565b0a1 4525
289dccbe 4526 kfree_skb_list(napi->gro_list);
d565b0a1 4527 napi->gro_list = NULL;
4ae5544f 4528 napi->gro_count = 0;
d565b0a1
HX
4529}
4530EXPORT_SYMBOL(netif_napi_del);
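/*
 * Illustrative sketch only: typical NAPI registration and teardown in a
 * driver, reusing the hypothetical mydrv_priv/mydrv_poll from the sketch
 * above.  NAPI_POLL_WEIGHT (64) is the conventional weight.
 */
static void mydrv_napi_setup(struct net_device *dev)
{
        struct mydrv_priv *priv = netdev_priv(dev);

        netif_napi_add(dev, &priv->napi, mydrv_poll, NAPI_POLL_WEIGHT);
}

static int mydrv_open(struct net_device *dev)
{
        struct mydrv_priv *priv = netdev_priv(dev);

        napi_enable(&priv->napi);               /* allow ->poll() to be scheduled */
        /* ... enable device interrupts ... */
        return 0;
}

static int mydrv_stop(struct net_device *dev)
{
        struct mydrv_priv *priv = netdev_priv(dev);

        /* ... disable device interrupts ... */
        napi_disable(&priv->napi);              /* waits for an in-flight poll */
        return 0;
}

/* on full teardown, before free_netdev(): netif_napi_del(&priv->napi); */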
4531
1da177e4
LT
4532static void net_rx_action(struct softirq_action *h)
4533{
903ceff7 4534 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
24f8b238 4535 unsigned long time_limit = jiffies + 2;
51b0bded 4536 int budget = netdev_budget;
d75b1ade
ED
4537 LIST_HEAD(list);
4538 LIST_HEAD(repoll);
53fb95d3
MM
4539 void *have;
4540
1da177e4 4541 local_irq_disable();
d75b1ade
ED
4542 list_splice_init(&sd->poll_list, &list);
4543 local_irq_enable();
1da177e4 4544
d75b1ade 4545 while (!list_empty(&list)) {
bea3348e
SH
4546 struct napi_struct *n;
4547 int work, weight;
1da177e4 4548
d75b1ade 4549 /* If softirq window is exhausted then punt.
24f8b238
SH
 4550 * Allow this to run for 2 jiffies, which will allow
4551 * an average latency of 1.5/HZ.
bea3348e 4552 */
d1f41b67 4553 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
1da177e4
LT
4554 goto softnet_break;
4555
1da177e4 4556
d75b1ade
ED
4557 n = list_first_entry(&list, struct napi_struct, poll_list);
4558 list_del_init(&n->poll_list);
1da177e4 4559
bea3348e
SH
4560 have = netpoll_poll_lock(n);
4561
4562 weight = n->weight;
4563
0a7606c1
DM
4564 /* This NAPI_STATE_SCHED test is for avoiding a race
4565 * with netpoll's poll_napi(). Only the entity which
4566 * obtains the lock and sees NAPI_STATE_SCHED set will
4567 * actually make the ->poll() call. Therefore we avoid
25985edc 4568 * accidentally calling ->poll() when NAPI is not scheduled.
0a7606c1
DM
4569 */
4570 work = 0;
4ea7e386 4571 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
0a7606c1 4572 work = n->poll(n, weight);
4ea7e386
NH
4573 trace_napi_poll(n);
4574 }
bea3348e
SH
4575
4576 WARN_ON_ONCE(work > weight);
4577
4578 budget -= work;
4579
bea3348e
SH
4580 /* Drivers must not modify the NAPI state if they
4581 * consume the entire weight. In such cases this code
4582 * still "owns" the NAPI instance and therefore can
4583 * move the instance around on the list at-will.
4584 */
fed17f30 4585 if (unlikely(work == weight)) {
ff780cd8 4586 if (unlikely(napi_disable_pending(n))) {
ff780cd8 4587 napi_complete(n);
2e71a6f8
ED
4588 } else {
4589 if (n->gro_list) {
 4590 /* Flush packets that are too old.
 4591 * If HZ < 1000, flush all packets.
4592 */
2e71a6f8 4593 napi_gro_flush(n, HZ >= 1000);
2e71a6f8 4594 }
d75b1ade 4595 list_add_tail(&n->poll_list, &repoll);
2e71a6f8 4596 }
fed17f30 4597 }
bea3348e
SH
4598
4599 netpoll_poll_unlock(have);
1da177e4 4600 }
d75b1ade
ED
4601
4602 if (!sd_has_rps_ipi_waiting(sd) &&
4603 list_empty(&list) &&
4604 list_empty(&repoll))
4605 return;
1da177e4 4606out:
d75b1ade
ED
4607 local_irq_disable();
4608
4609 list_splice_tail_init(&sd->poll_list, &list);
4610 list_splice_tail(&repoll, &list);
4611 list_splice(&list, &sd->poll_list);
4612 if (!list_empty(&sd->poll_list))
4613 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4614
e326bed2 4615 net_rps_action_and_irq_enable(sd);
0a9627f2 4616
1da177e4
LT
4617 return;
4618
4619softnet_break:
dee42870 4620 sd->time_squeeze++;
1da177e4
LT
4621 goto out;
4622}
4623
aa9d8560 4624struct netdev_adjacent {
9ff162a8 4625 struct net_device *dev;
5d261913
VF
4626
 4627 /* upper master flag; there can only be one master device per list */
9ff162a8 4628 bool master;
5d261913 4629
5d261913
VF
4630 /* counter for the number of times this device was added to us */
4631 u16 ref_nr;
4632
402dae96
VF
4633 /* private field for the users */
4634 void *private;
4635
9ff162a8
JP
4636 struct list_head list;
4637 struct rcu_head rcu;
9ff162a8
JP
4638};
4639
5d261913
VF
4640static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4641 struct net_device *adj_dev,
2f268f12 4642 struct list_head *adj_list)
9ff162a8 4643{
5d261913 4644 struct netdev_adjacent *adj;
5d261913 4645
2f268f12 4646 list_for_each_entry(adj, adj_list, list) {
5d261913
VF
4647 if (adj->dev == adj_dev)
4648 return adj;
9ff162a8
JP
4649 }
4650 return NULL;
4651}
4652
4653/**
4654 * netdev_has_upper_dev - Check if device is linked to an upper device
4655 * @dev: device
4656 * @upper_dev: upper device to check
4657 *
 4658 * Find out if a device is linked to the specified upper device and return true
 4659 * in case it is. Note that this checks only the immediate upper device,
4660 * not through a complete stack of devices. The caller must hold the RTNL lock.
4661 */
4662bool netdev_has_upper_dev(struct net_device *dev,
4663 struct net_device *upper_dev)
4664{
4665 ASSERT_RTNL();
4666
2f268f12 4667 return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
9ff162a8
JP
4668}
4669EXPORT_SYMBOL(netdev_has_upper_dev);
4670
4671/**
4672 * netdev_has_any_upper_dev - Check if device is linked to some device
4673 * @dev: device
4674 *
4675 * Find out if a device is linked to an upper device and return true in case
4676 * it is. The caller must hold the RTNL lock.
4677 */
1d143d9f 4678static bool netdev_has_any_upper_dev(struct net_device *dev)
9ff162a8
JP
4679{
4680 ASSERT_RTNL();
4681
2f268f12 4682 return !list_empty(&dev->all_adj_list.upper);
9ff162a8 4683}
9ff162a8
JP
4684
4685/**
4686 * netdev_master_upper_dev_get - Get master upper device
4687 * @dev: device
4688 *
4689 * Find a master upper device and return pointer to it or NULL in case
4690 * it's not there. The caller must hold the RTNL lock.
4691 */
4692struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4693{
aa9d8560 4694 struct netdev_adjacent *upper;
9ff162a8
JP
4695
4696 ASSERT_RTNL();
4697
2f268f12 4698 if (list_empty(&dev->adj_list.upper))
9ff162a8
JP
4699 return NULL;
4700
2f268f12 4701 upper = list_first_entry(&dev->adj_list.upper,
aa9d8560 4702 struct netdev_adjacent, list);
9ff162a8
JP
4703 if (likely(upper->master))
4704 return upper->dev;
4705 return NULL;
4706}
4707EXPORT_SYMBOL(netdev_master_upper_dev_get);
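/*
 * Illustrative sketch only: querying the adjacency lists under RTNL, for
 * example to decide whether a port device may be enslaved.  The
 * "myport_" name is hypothetical.
 */
static bool myport_can_enslave(struct net_device *port_dev,
                               struct net_device *candidate_master)
{
        ASSERT_RTNL();

        /* already linked to this particular upper device? */
        if (netdev_has_upper_dev(port_dev, candidate_master))
                return false;

        /* already claimed by some other master device? */
        if (netdev_master_upper_dev_get(port_dev))
                return false;

        return true;
}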
4708
b6ccba4c
VF
4709void *netdev_adjacent_get_private(struct list_head *adj_list)
4710{
4711 struct netdev_adjacent *adj;
4712
4713 adj = list_entry(adj_list, struct netdev_adjacent, list);
4714
4715 return adj->private;
4716}
4717EXPORT_SYMBOL(netdev_adjacent_get_private);
4718
44a40855
VY
4719/**
4720 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4721 * @dev: device
4722 * @iter: list_head ** of the current position
4723 *
4724 * Gets the next device from the dev's upper list, starting from iter
4725 * position. The caller must hold RCU read lock.
4726 */
4727struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4728 struct list_head **iter)
4729{
4730 struct netdev_adjacent *upper;
4731
4732 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4733
4734 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4735
4736 if (&upper->list == &dev->adj_list.upper)
4737 return NULL;
4738
4739 *iter = &upper->list;
4740
4741 return upper->dev;
4742}
4743EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
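/*
 * Illustrative sketch only: walking the immediate upper devices of a
 * device under rcu_read_lock(), driving netdev_upper_get_next_dev_rcu()
 * directly (netdevice.h also wraps helpers like this in iteration
 * macros).  The "mydev_" name is hypothetical.
 */
static void mydev_walk_uppers(struct net_device *dev)
{
        struct list_head *iter = &dev->adj_list.upper;
        struct net_device *upper;

        rcu_read_lock();
        while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)) != NULL)
                pr_debug("%s is an upper device of %s\n",
                         upper->name, dev->name);
        rcu_read_unlock();
}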
4744
31088a11
VF
4745/**
 4746 * netdev_all_upper_get_next_dev_rcu - Get the next dev from the all-upper list
48311f46
VF
4747 * @dev: device
4748 * @iter: list_head ** of the current position
4749 *
 4750 * Gets the next device from the dev's all-upper (all_adj_list) list, starting from iter
4751 * position. The caller must hold RCU read lock.
4752 */
2f268f12
VF
4753struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4754 struct list_head **iter)
48311f46
VF
4755{
4756 struct netdev_adjacent *upper;
4757
85328240 4758 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
48311f46
VF
4759
4760 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4761
2f268f12 4762 if (&upper->list == &dev->all_adj_list.upper)
48311f46
VF
4763 return NULL;
4764
4765 *iter = &upper->list;
4766
4767 return upper->dev;
4768}
2f268f12 4769EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
48311f46 4770
31088a11
VF
4771/**
4772 * netdev_lower_get_next_private - Get the next ->private from the
4773 * lower neighbour list
4774 * @dev: device
4775 * @iter: list_head ** of the current position
4776 *
4777 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4778 * list, starting from iter position. The caller must hold either hold the
4779 * RTNL lock or its own locking that guarantees that the neighbour lower
4780 * list will remain unchainged.
4781 */
4782void *netdev_lower_get_next_private(struct net_device *dev,
4783 struct list_head **iter)
4784{
4785 struct netdev_adjacent *lower;
4786
4787 lower = list_entry(*iter, struct netdev_adjacent, list);
4788
4789 if (&lower->list == &dev->adj_list.lower)
4790 return NULL;
4791
6859e7df 4792 *iter = lower->list.next;
31088a11
VF
4793
4794 return lower->private;
4795}
4796EXPORT_SYMBOL(netdev_lower_get_next_private);
4797
4798/**
4799 * netdev_lower_get_next_private_rcu - Get the next ->private from the
4800 * lower neighbour list, RCU
4801 * variant
4802 * @dev: device
4803 * @iter: list_head ** of the current position
4804 *
4805 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4806 * list, starting from iter position. The caller must hold RCU read lock.
4807 */
4808void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4809 struct list_head **iter)
4810{
4811 struct netdev_adjacent *lower;
4812
4813 WARN_ON_ONCE(!rcu_read_lock_held());
4814
4815 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4816
4817 if (&lower->list == &dev->adj_list.lower)
4818 return NULL;
4819
6859e7df 4820 *iter = &lower->list;
31088a11
VF
4821
4822 return lower->private;
4823}
4824EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4825
4085ebe8
VY
4826/**
4827 * netdev_lower_get_next - Get the next device from the lower neighbour
4828 * list
4829 * @dev: device
4830 * @iter: list_head ** of the current position
4831 *
4832 * Gets the next netdev_adjacent from the dev's lower neighbour
 4833 * list, starting from iter position. The caller must hold the RTNL lock or
 4834 * its own locking that guarantees that the neighbour lower
 4835 * list will remain unchanged.
4836 */
4837void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4838{
4839 struct netdev_adjacent *lower;
4840
4841 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4842
4843 if (&lower->list == &dev->adj_list.lower)
4844 return NULL;
4845
4846 *iter = &lower->list;
4847
4848 return lower->dev;
4849}
4850EXPORT_SYMBOL(netdev_lower_get_next);
4851
e001bfad 4852/**
4853 * netdev_lower_get_first_private_rcu - Get the first ->private from the
4854 * lower neighbour list, RCU
4855 * variant
4856 * @dev: device
4857 *
4858 * Gets the first netdev_adjacent->private from the dev's lower neighbour
4859 * list. The caller must hold RCU read lock.
4860 */
4861void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4862{
4863 struct netdev_adjacent *lower;
4864
4865 lower = list_first_or_null_rcu(&dev->adj_list.lower,
4866 struct netdev_adjacent, list);
4867 if (lower)
4868 return lower->private;
4869 return NULL;
4870}
4871EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4872
9ff162a8
JP
4873/**
4874 * netdev_master_upper_dev_get_rcu - Get master upper device
4875 * @dev: device
4876 *
4877 * Find a master upper device and return pointer to it or NULL in case
4878 * it's not there. The caller must hold the RCU read lock.
4879 */
4880struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4881{
aa9d8560 4882 struct netdev_adjacent *upper;
9ff162a8 4883
2f268f12 4884 upper = list_first_or_null_rcu(&dev->adj_list.upper,
aa9d8560 4885 struct netdev_adjacent, list);
9ff162a8
JP
4886 if (upper && likely(upper->master))
4887 return upper->dev;
4888 return NULL;
4889}
4890EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4891
0a59f3a9 4892static int netdev_adjacent_sysfs_add(struct net_device *dev,
3ee32707
VF
4893 struct net_device *adj_dev,
4894 struct list_head *dev_list)
4895{
4896 char linkname[IFNAMSIZ+7];
4897 sprintf(linkname, dev_list == &dev->adj_list.upper ?
4898 "upper_%s" : "lower_%s", adj_dev->name);
4899 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4900 linkname);
4901}
0a59f3a9 4902static void netdev_adjacent_sysfs_del(struct net_device *dev,
3ee32707
VF
4903 char *name,
4904 struct list_head *dev_list)
4905{
4906 char linkname[IFNAMSIZ+7];
4907 sprintf(linkname, dev_list == &dev->adj_list.upper ?
4908 "upper_%s" : "lower_%s", name);
4909 sysfs_remove_link(&(dev->dev.kobj), linkname);
4910}
4911
7ce64c79
AF
4912static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4913 struct net_device *adj_dev,
4914 struct list_head *dev_list)
4915{
4916 return (dev_list == &dev->adj_list.upper ||
4917 dev_list == &dev->adj_list.lower) &&
4918 net_eq(dev_net(dev), dev_net(adj_dev));
4919}
3ee32707 4920
5d261913
VF
4921static int __netdev_adjacent_dev_insert(struct net_device *dev,
4922 struct net_device *adj_dev,
7863c054 4923 struct list_head *dev_list,
402dae96 4924 void *private, bool master)
5d261913
VF
4925{
4926 struct netdev_adjacent *adj;
842d67a7 4927 int ret;
5d261913 4928
7863c054 4929 adj = __netdev_find_adj(dev, adj_dev, dev_list);
5d261913
VF
4930
4931 if (adj) {
5d261913
VF
4932 adj->ref_nr++;
4933 return 0;
4934 }
4935
4936 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4937 if (!adj)
4938 return -ENOMEM;
4939
4940 adj->dev = adj_dev;
4941 adj->master = master;
5d261913 4942 adj->ref_nr = 1;
402dae96 4943 adj->private = private;
5d261913 4944 dev_hold(adj_dev);
2f268f12
VF
4945
4946 pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4947 adj_dev->name, dev->name, adj_dev->name);
5d261913 4948
7ce64c79 4949 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
3ee32707 4950 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5831d66e
VF
4951 if (ret)
4952 goto free_adj;
4953 }
4954
7863c054 4955 /* Ensure that master link is always the first item in list. */
842d67a7
VF
4956 if (master) {
4957 ret = sysfs_create_link(&(dev->dev.kobj),
4958 &(adj_dev->dev.kobj), "master");
4959 if (ret)
5831d66e 4960 goto remove_symlinks;
842d67a7 4961
7863c054 4962 list_add_rcu(&adj->list, dev_list);
842d67a7 4963 } else {
7863c054 4964 list_add_tail_rcu(&adj->list, dev_list);
842d67a7 4965 }
5d261913
VF
4966
4967 return 0;
842d67a7 4968
5831d66e 4969remove_symlinks:
7ce64c79 4970 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
3ee32707 4971 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
842d67a7
VF
4972free_adj:
4973 kfree(adj);
974daef7 4974 dev_put(adj_dev);
842d67a7
VF
4975
4976 return ret;
5d261913
VF
4977}
4978
1d143d9f 4979static void __netdev_adjacent_dev_remove(struct net_device *dev,
4980 struct net_device *adj_dev,
4981 struct list_head *dev_list)
5d261913
VF
4982{
4983 struct netdev_adjacent *adj;
4984
7863c054 4985 adj = __netdev_find_adj(dev, adj_dev, dev_list);
5d261913 4986
2f268f12
VF
4987 if (!adj) {
4988 pr_err("tried to remove device %s from %s\n",
4989 dev->name, adj_dev->name);
5d261913 4990 BUG();
2f268f12 4991 }
5d261913
VF
4992
4993 if (adj->ref_nr > 1) {
2f268f12
VF
4994 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4995 adj->ref_nr-1);
5d261913
VF
4996 adj->ref_nr--;
4997 return;
4998 }
4999
842d67a7
VF
5000 if (adj->master)
5001 sysfs_remove_link(&(dev->dev.kobj), "master");
5002
7ce64c79 5003 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
3ee32707 5004 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5831d66e 5005
5d261913 5006 list_del_rcu(&adj->list);
2f268f12
VF
5007 pr_debug("dev_put for %s, because link removed from %s to %s\n",
5008 adj_dev->name, dev->name, adj_dev->name);
5d261913
VF
5009 dev_put(adj_dev);
5010 kfree_rcu(adj, rcu);
5011}
5012
1d143d9f 5013static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5014 struct net_device *upper_dev,
5015 struct list_head *up_list,
5016 struct list_head *down_list,
5017 void *private, bool master)
5d261913
VF
5018{
5019 int ret;
5020
402dae96
VF
5021 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5022 master);
5d261913
VF
5023 if (ret)
5024 return ret;
5025
402dae96
VF
5026 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5027 false);
5d261913 5028 if (ret) {
2f268f12 5029 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5d261913
VF
5030 return ret;
5031 }
5032
5033 return 0;
5034}
5035
1d143d9f 5036static int __netdev_adjacent_dev_link(struct net_device *dev,
5037 struct net_device *upper_dev)
5d261913 5038{
2f268f12
VF
5039 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5040 &dev->all_adj_list.upper,
5041 &upper_dev->all_adj_list.lower,
402dae96 5042 NULL, false);
5d261913
VF
5043}
5044
1d143d9f 5045static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5046 struct net_device *upper_dev,
5047 struct list_head *up_list,
5048 struct list_head *down_list)
5d261913 5049{
2f268f12
VF
5050 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5051 __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5d261913
VF
5052}
5053
1d143d9f 5054static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5055 struct net_device *upper_dev)
5d261913 5056{
2f268f12
VF
5057 __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5058 &dev->all_adj_list.upper,
5059 &upper_dev->all_adj_list.lower);
5060}
5061
1d143d9f 5062static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5063 struct net_device *upper_dev,
5064 void *private, bool master)
2f268f12
VF
5065{
5066 int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5067
5068 if (ret)
5069 return ret;
5070
5071 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5072 &dev->adj_list.upper,
5073 &upper_dev->adj_list.lower,
402dae96 5074 private, master);
2f268f12
VF
5075 if (ret) {
5076 __netdev_adjacent_dev_unlink(dev, upper_dev);
5077 return ret;
5078 }
5079
5080 return 0;
5d261913
VF
5081}
5082
1d143d9f 5083static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5084 struct net_device *upper_dev)
2f268f12
VF
5085{
5086 __netdev_adjacent_dev_unlink(dev, upper_dev);
5087 __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5088 &dev->adj_list.upper,
5089 &upper_dev->adj_list.lower);
5090}
5d261913 5091
9ff162a8 5092static int __netdev_upper_dev_link(struct net_device *dev,
402dae96
VF
5093 struct net_device *upper_dev, bool master,
5094 void *private)
9ff162a8 5095{
5d261913
VF
5096 struct netdev_adjacent *i, *j, *to_i, *to_j;
5097 int ret = 0;
9ff162a8
JP
5098
5099 ASSERT_RTNL();
5100
5101 if (dev == upper_dev)
5102 return -EBUSY;
5103
5104 /* To prevent loops, check if dev is not upper device to upper_dev. */
2f268f12 5105 if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
9ff162a8
JP
5106 return -EBUSY;
5107
2f268f12 5108 if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
9ff162a8
JP
5109 return -EEXIST;
5110
5111 if (master && netdev_master_upper_dev_get(dev))
5112 return -EBUSY;
5113
402dae96
VF
5114 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5115 master);
5d261913
VF
5116 if (ret)
5117 return ret;
9ff162a8 5118
5d261913 5119 /* Now that we linked these devs, make all the upper_dev's
2f268f12 5120 * all_adj_list.upper visible to every dev's all_adj_list.lower and
5d261913
VF
 5121 * vice versa, and don't forget the devices themselves. All of these
5122 * links are non-neighbours.
5123 */
2f268f12
VF
5124 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5125 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5126 pr_debug("Interlinking %s with %s, non-neighbour\n",
5127 i->dev->name, j->dev->name);
5d261913
VF
5128 ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5129 if (ret)
5130 goto rollback_mesh;
5131 }
5132 }
5133
5134 /* add dev to every upper_dev's upper device */
2f268f12
VF
5135 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5136 pr_debug("linking %s's upper device %s with %s\n",
5137 upper_dev->name, i->dev->name, dev->name);
5d261913
VF
5138 ret = __netdev_adjacent_dev_link(dev, i->dev);
5139 if (ret)
5140 goto rollback_upper_mesh;
5141 }
5142
5143 /* add upper_dev to every dev's lower device */
2f268f12
VF
5144 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5145 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5146 i->dev->name, upper_dev->name);
5d261913
VF
5147 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5148 if (ret)
5149 goto rollback_lower_mesh;
5150 }
9ff162a8 5151
42e52bf9 5152 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
9ff162a8 5153 return 0;
5d261913
VF
5154
5155rollback_lower_mesh:
5156 to_i = i;
2f268f12 5157 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5d261913
VF
5158 if (i == to_i)
5159 break;
5160 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5161 }
5162
5163 i = NULL;
5164
5165rollback_upper_mesh:
5166 to_i = i;
2f268f12 5167 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5d261913
VF
5168 if (i == to_i)
5169 break;
5170 __netdev_adjacent_dev_unlink(dev, i->dev);
5171 }
5172
5173 i = j = NULL;
5174
5175rollback_mesh:
5176 to_i = i;
5177 to_j = j;
2f268f12
VF
5178 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5179 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5d261913
VF
5180 if (i == to_i && j == to_j)
5181 break;
5182 __netdev_adjacent_dev_unlink(i->dev, j->dev);
5183 }
5184 if (i == to_i)
5185 break;
5186 }
5187
2f268f12 5188 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5d261913
VF
5189
5190 return ret;
9ff162a8
JP
5191}
5192
5193/**
5194 * netdev_upper_dev_link - Add a link to the upper device
5195 * @dev: device
5196 * @upper_dev: new upper device
5197 *
5198 * Adds a link to device which is upper to this one. The caller must hold
5199 * the RTNL lock. On a failure a negative errno code is returned.
5200 * On success the reference counts are adjusted and the function
5201 * returns zero.
5202 */
5203int netdev_upper_dev_link(struct net_device *dev,
5204 struct net_device *upper_dev)
5205{
402dae96 5206 return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
9ff162a8
JP
5207}
5208EXPORT_SYMBOL(netdev_upper_dev_link);
5209
5210/**
5211 * netdev_master_upper_dev_link - Add a master link to the upper device
5212 * @dev: device
5213 * @upper_dev: new upper device
5214 *
5215 * Adds a link to device which is upper to this one. In this case, only
5216 * one master upper device can be linked, although other non-master devices
5217 * might be linked as well. The caller must hold the RTNL lock.
5218 * On a failure a negative errno code is returned. On success the reference
5219 * counts are adjusted and the function returns zero.
5220 */
5221int netdev_master_upper_dev_link(struct net_device *dev,
5222 struct net_device *upper_dev)
5223{
402dae96 5224 return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
9ff162a8
JP
5225}
5226EXPORT_SYMBOL(netdev_master_upper_dev_link);
5227
402dae96
VF
5228int netdev_master_upper_dev_link_private(struct net_device *dev,
5229 struct net_device *upper_dev,
5230 void *private)
5231{
5232 return __netdev_upper_dev_link(dev, upper_dev, true, private);
5233}
5234EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5235
9ff162a8
JP
5236/**
5237 * netdev_upper_dev_unlink - Removes a link to upper device
5238 * @dev: device
 5239 * @upper_dev: upper device to remove the link to
5240 *
5241 * Removes a link to device which is upper to this one. The caller must hold
5242 * the RTNL lock.
5243 */
5244void netdev_upper_dev_unlink(struct net_device *dev,
5245 struct net_device *upper_dev)
5246{
5d261913 5247 struct netdev_adjacent *i, *j;
9ff162a8
JP
5248 ASSERT_RTNL();
5249
2f268f12 5250 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5d261913
VF
5251
5252 /* Here is the tricky part. We must remove all dev's lower
5253 * devices from all upper_dev's upper devices and vice
5254 * versa, to maintain the graph relationship.
5255 */
2f268f12
VF
5256 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5257 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5d261913
VF
5258 __netdev_adjacent_dev_unlink(i->dev, j->dev);
5259
 5260 /* also remove the devices themselves from the lower/upper device
 5261 * lists
5262 */
2f268f12 5263 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5d261913
VF
5264 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5265
2f268f12 5266 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5d261913
VF
5267 __netdev_adjacent_dev_unlink(dev, i->dev);
5268
42e52bf9 5269 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
9ff162a8
JP
5270}
5271EXPORT_SYMBOL(netdev_upper_dev_unlink);
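/*
 * Illustrative sketch only: how a bonding/team-like master driver links
 * and unlinks a slave port using the upper-device API.  The "mymaster_"
 * names are hypothetical and error handling is reduced to the essentials.
 */
static int mymaster_enslave(struct net_device *master_dev,
                            struct net_device *slave_dev)
{
        int err;

        ASSERT_RTNL();

        err = netdev_master_upper_dev_link(slave_dev, master_dev);
        if (err)
                return err;

        /* ... update the master's port list, program hardware, etc ... */
        return 0;
}

static void mymaster_release(struct net_device *master_dev,
                             struct net_device *slave_dev)
{
        ASSERT_RTNL();

        netdev_upper_dev_unlink(slave_dev, master_dev);
        /* ... undo port-list and hardware changes ... */
}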
5272
4c75431a
AF
5273void netdev_adjacent_add_links(struct net_device *dev)
5274{
5275 struct netdev_adjacent *iter;
5276
5277 struct net *net = dev_net(dev);
5278
5279 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5280 if (!net_eq(net,dev_net(iter->dev)))
5281 continue;
5282 netdev_adjacent_sysfs_add(iter->dev, dev,
5283 &iter->dev->adj_list.lower);
5284 netdev_adjacent_sysfs_add(dev, iter->dev,
5285 &dev->adj_list.upper);
5286 }
5287
5288 list_for_each_entry(iter, &dev->adj_list.lower, list) {
5289 if (!net_eq(net,dev_net(iter->dev)))
5290 continue;
5291 netdev_adjacent_sysfs_add(iter->dev, dev,
5292 &iter->dev->adj_list.upper);
5293 netdev_adjacent_sysfs_add(dev, iter->dev,
5294 &dev->adj_list.lower);
5295 }
5296}
5297
5298void netdev_adjacent_del_links(struct net_device *dev)
5299{
5300 struct netdev_adjacent *iter;
5301
5302 struct net *net = dev_net(dev);
5303
5304 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5305 if (!net_eq(net,dev_net(iter->dev)))
5306 continue;
5307 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5308 &iter->dev->adj_list.lower);
5309 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5310 &dev->adj_list.upper);
5311 }
5312
5313 list_for_each_entry(iter, &dev->adj_list.lower, list) {
5314 if (!net_eq(net,dev_net(iter->dev)))
5315 continue;
5316 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5317 &iter->dev->adj_list.upper);
5318 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5319 &dev->adj_list.lower);
5320 }
5321}
5322
5bb025fa 5323void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
402dae96 5324{
5bb025fa 5325 struct netdev_adjacent *iter;
402dae96 5326
4c75431a
AF
5327 struct net *net = dev_net(dev);
5328
5bb025fa 5329 list_for_each_entry(iter, &dev->adj_list.upper, list) {
4c75431a
AF
5330 if (!net_eq(net,dev_net(iter->dev)))
5331 continue;
5bb025fa
VF
5332 netdev_adjacent_sysfs_del(iter->dev, oldname,
5333 &iter->dev->adj_list.lower);
5334 netdev_adjacent_sysfs_add(iter->dev, dev,
5335 &iter->dev->adj_list.lower);
5336 }
402dae96 5337
5bb025fa 5338 list_for_each_entry(iter, &dev->adj_list.lower, list) {
4c75431a
AF
5339 if (!net_eq(net,dev_net(iter->dev)))
5340 continue;
5bb025fa
VF
5341 netdev_adjacent_sysfs_del(iter->dev, oldname,
5342 &iter->dev->adj_list.upper);
5343 netdev_adjacent_sysfs_add(iter->dev, dev,
5344 &iter->dev->adj_list.upper);
5345 }
402dae96 5346}
402dae96
VF
5347
5348void *netdev_lower_dev_get_private(struct net_device *dev,
5349 struct net_device *lower_dev)
5350{
5351 struct netdev_adjacent *lower;
5352
5353 if (!lower_dev)
5354 return NULL;
5355 lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5356 if (!lower)
5357 return NULL;
5358
5359 return lower->private;
5360}
5361EXPORT_SYMBOL(netdev_lower_dev_get_private);
5362
4085ebe8
VY
5363
5364int dev_get_nest_level(struct net_device *dev,
5365 bool (*type_check)(struct net_device *dev))
5366{
5367 struct net_device *lower = NULL;
5368 struct list_head *iter;
5369 int max_nest = -1;
5370 int nest;
5371
5372 ASSERT_RTNL();
5373
5374 netdev_for_each_lower_dev(dev, lower, iter) {
5375 nest = dev_get_nest_level(lower, type_check);
5376 if (max_nest < nest)
5377 max_nest = nest;
5378 }
5379
5380 if (type_check(dev))
5381 max_nest++;
5382
5383 return max_nest;
5384}
5385EXPORT_SYMBOL(dev_get_nest_level);
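
/*
 * Editorial example (not part of dev.c): a sketch of how a stacked-device
 * driver might use dev_get_nest_level() to size lockdep nesting for its
 * locks. example_is_stacked() and example_report_nesting() are
 * hypothetical; the 802.1q and macvlan code use the same pattern with
 * their own type-check predicates. RTNL must be held.
 */
static bool example_is_stacked(struct net_device *dev)
{
        /* Hypothetical predicate: treat VLAN devices as "our" type. */
        return !!(dev->priv_flags & IFF_802_1Q_VLAN);
}

static void example_report_nesting(struct net_device *dev)
{
        int nest_level;

        ASSERT_RTNL();

        /* Depth of matching devices stacked below (and at) dev. */
        nest_level = dev_get_nest_level(dev, example_is_stacked);
        netdev_dbg(dev, "stacking depth of matching devices: %d\n",
                   nest_level);
}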
5386
b6c40d68
PM
5387static void dev_change_rx_flags(struct net_device *dev, int flags)
5388{
d314774c
SH
5389 const struct net_device_ops *ops = dev->netdev_ops;
5390
d2615bf4 5391 if (ops->ndo_change_rx_flags)
d314774c 5392 ops->ndo_change_rx_flags(dev, flags);
b6c40d68
PM
5393}
5394
991fb3f7 5395static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
1da177e4 5396{
b536db93 5397 unsigned int old_flags = dev->flags;
d04a48b0
EB
5398 kuid_t uid;
5399 kgid_t gid;
1da177e4 5400
24023451
PM
5401 ASSERT_RTNL();
5402
dad9b335
WC
5403 dev->flags |= IFF_PROMISC;
5404 dev->promiscuity += inc;
5405 if (dev->promiscuity == 0) {
5406 /*
5407 * Avoid overflow.
5408 * If inc causes an overflow, leave promiscuity untouched and return an error.
5409 */
5410 if (inc < 0)
5411 dev->flags &= ~IFF_PROMISC;
5412 else {
5413 dev->promiscuity -= inc;
7b6cd1ce
JP
5414 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5415 dev->name);
dad9b335
WC
5416 return -EOVERFLOW;
5417 }
5418 }
52609c0b 5419 if (dev->flags != old_flags) {
7b6cd1ce
JP
5420 pr_info("device %s %s promiscuous mode\n",
5421 dev->name,
5422 dev->flags & IFF_PROMISC ? "entered" : "left");
8192b0c4
DH
5423 if (audit_enabled) {
5424 current_uid_gid(&uid, &gid);
7759db82
KHK
5425 audit_log(current->audit_context, GFP_ATOMIC,
5426 AUDIT_ANOM_PROMISCUOUS,
5427 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5428 dev->name, (dev->flags & IFF_PROMISC),
5429 (old_flags & IFF_PROMISC),
e1760bd5 5430 from_kuid(&init_user_ns, audit_get_loginuid(current)),
d04a48b0
EB
5431 from_kuid(&init_user_ns, uid),
5432 from_kgid(&init_user_ns, gid),
7759db82 5433 audit_get_sessionid(current));
8192b0c4 5434 }
24023451 5435
b6c40d68 5436 dev_change_rx_flags(dev, IFF_PROMISC);
1da177e4 5437 }
991fb3f7
ND
5438 if (notify)
5439 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
dad9b335 5440 return 0;
1da177e4
LT
5441}
5442
4417da66
PM
5443/**
5444 * dev_set_promiscuity - update promiscuity count on a device
5445 * @dev: device
5446 * @inc: modifier
5447 *
5448 * Add or remove promiscuity from a device. While the count in the device
5449 * remains above zero the interface remains promiscuous. Once it hits zero
5450 * the device reverts to normal filtering operation. A negative @inc
5451 * value is used to drop promiscuity on the device.
dad9b335 5452 * Return 0 if successful or a negative errno code on error.
4417da66 5453 */
dad9b335 5454int dev_set_promiscuity(struct net_device *dev, int inc)
4417da66 5455{
b536db93 5456 unsigned int old_flags = dev->flags;
dad9b335 5457 int err;
4417da66 5458
991fb3f7 5459 err = __dev_set_promiscuity(dev, inc, true);
4b5a698e 5460 if (err < 0)
dad9b335 5461 return err;
4417da66
PM
5462 if (dev->flags != old_flags)
5463 dev_set_rx_mode(dev);
dad9b335 5464 return err;
4417da66 5465}
d1b19dff 5466EXPORT_SYMBOL(dev_set_promiscuity);
4417da66 5467
991fb3f7 5468static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
1da177e4 5469{
991fb3f7 5470 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
1da177e4 5471
24023451
PM
5472 ASSERT_RTNL();
5473
1da177e4 5474 dev->flags |= IFF_ALLMULTI;
dad9b335
WC
5475 dev->allmulti += inc;
5476 if (dev->allmulti == 0) {
5477 /*
5478 * Avoid overflow.
5479 * If inc causes an overflow, leave allmulti untouched and return an error.
5480 */
5481 if (inc < 0)
5482 dev->flags &= ~IFF_ALLMULTI;
5483 else {
5484 dev->allmulti -= inc;
7b6cd1ce
JP
5485 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5486 dev->name);
dad9b335
WC
5487 return -EOVERFLOW;
5488 }
5489 }
24023451 5490 if (dev->flags ^ old_flags) {
b6c40d68 5491 dev_change_rx_flags(dev, IFF_ALLMULTI);
4417da66 5492 dev_set_rx_mode(dev);
991fb3f7
ND
5493 if (notify)
5494 __dev_notify_flags(dev, old_flags,
5495 dev->gflags ^ old_gflags);
24023451 5496 }
dad9b335 5497 return 0;
4417da66 5498}
991fb3f7
ND
5499
5500/**
5501 * dev_set_allmulti - update allmulti count on a device
5502 * @dev: device
5503 * @inc: modifier
5504 *
5505 * Add or remove reception of all multicast frames on a device. While the
5506 * count in the device remains above zero the interface keeps receiving
5507 * all multicast frames. Once it hits zero the device reverts to normal
5508 * filtering operation. A negative @inc value is used to drop the counter
5509 * when releasing a resource needing all multicasts.
5510 * Return 0 if successful or a negative errno code on error.
5511 */
5512
5513int dev_set_allmulti(struct net_device *dev, int inc)
5514{
5515 return __dev_set_allmulti(dev, inc, true);
5516}
d1b19dff 5517EXPORT_SYMBOL(dev_set_allmulti);
4417da66
PM
5518
5519/*
5520 * Upload unicast and multicast address lists to device and
5521 * configure RX filtering. When the device doesn't support unicast
53ccaae1 5522 * filtering it is put in promiscuous mode while unicast addresses
4417da66
PM
5523 * are present.
5524 */
5525void __dev_set_rx_mode(struct net_device *dev)
5526{
d314774c
SH
5527 const struct net_device_ops *ops = dev->netdev_ops;
5528
4417da66
PM
5529 /* dev_open will call this function so the list will stay sane. */
5530 if (!(dev->flags&IFF_UP))
5531 return;
5532
5533 if (!netif_device_present(dev))
40b77c94 5534 return;
4417da66 5535
01789349 5536 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4417da66
PM
5537 /* Unicast address changes may only happen under the rtnl,
5538 * therefore calling __dev_set_promiscuity here is safe.
5539 */
32e7bfc4 5540 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
991fb3f7 5541 __dev_set_promiscuity(dev, 1, false);
2d348d1f 5542 dev->uc_promisc = true;
32e7bfc4 5543 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
991fb3f7 5544 __dev_set_promiscuity(dev, -1, false);
2d348d1f 5545 dev->uc_promisc = false;
4417da66 5546 }
4417da66 5547 }
01789349
JP
5548
5549 if (ops->ndo_set_rx_mode)
5550 ops->ndo_set_rx_mode(dev);
4417da66
PM
5551}
5552
5553void dev_set_rx_mode(struct net_device *dev)
5554{
b9e40857 5555 netif_addr_lock_bh(dev);
4417da66 5556 __dev_set_rx_mode(dev);
b9e40857 5557 netif_addr_unlock_bh(dev);
1da177e4
LT
5558}
5559
f0db275a
SH
5560/**
5561 * dev_get_flags - get flags reported to userspace
5562 * @dev: device
5563 *
5564 * Get the combination of flag bits exported through APIs to userspace.
5565 */
95c96174 5566unsigned int dev_get_flags(const struct net_device *dev)
1da177e4 5567{
95c96174 5568 unsigned int flags;
1da177e4
LT
5569
5570 flags = (dev->flags & ~(IFF_PROMISC |
5571 IFF_ALLMULTI |
b00055aa
SR
5572 IFF_RUNNING |
5573 IFF_LOWER_UP |
5574 IFF_DORMANT)) |
1da177e4
LT
5575 (dev->gflags & (IFF_PROMISC |
5576 IFF_ALLMULTI));
5577
b00055aa
SR
5578 if (netif_running(dev)) {
5579 if (netif_oper_up(dev))
5580 flags |= IFF_RUNNING;
5581 if (netif_carrier_ok(dev))
5582 flags |= IFF_LOWER_UP;
5583 if (netif_dormant(dev))
5584 flags |= IFF_DORMANT;
5585 }
1da177e4
LT
5586
5587 return flags;
5588}
d1b19dff 5589EXPORT_SYMBOL(dev_get_flags);
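
/*
 * Editorial example (not part of dev.c): reading the userspace-visible
 * flag combination, as a SIOCGIFFLAGS handler would see it. The
 * example_is_usable() helper is hypothetical.
 */
static bool example_is_usable(const struct net_device *dev)
{
        unsigned int flags = dev_get_flags(dev);

        /* Administratively up, operationally running, carrier present. */
        return (flags & (IFF_UP | IFF_RUNNING | IFF_LOWER_UP)) ==
               (IFF_UP | IFF_RUNNING | IFF_LOWER_UP);
}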
1da177e4 5590
bd380811 5591int __dev_change_flags(struct net_device *dev, unsigned int flags)
1da177e4 5592{
b536db93 5593 unsigned int old_flags = dev->flags;
bd380811 5594 int ret;
1da177e4 5595
24023451
PM
5596 ASSERT_RTNL();
5597
1da177e4
LT
5598 /*
5599 * Set the flags on our device.
5600 */
5601
5602 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5603 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5604 IFF_AUTOMEDIA)) |
5605 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5606 IFF_ALLMULTI));
5607
5608 /*
5609 * Load in the correct multicast list now the flags have changed.
5610 */
5611
b6c40d68
PM
5612 if ((old_flags ^ flags) & IFF_MULTICAST)
5613 dev_change_rx_flags(dev, IFF_MULTICAST);
24023451 5614
4417da66 5615 dev_set_rx_mode(dev);
1da177e4
LT
5616
5617 /*
5618 * Have we downed the interface? We handle IFF_UP ourselves
5619 * according to user attempts to set it, rather than blindly
5620 * setting it.
5621 */
5622
5623 ret = 0;
d215d10f 5624 if ((old_flags ^ flags) & IFF_UP)
bd380811 5625 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
1da177e4 5626
1da177e4 5627 if ((flags ^ dev->gflags) & IFF_PROMISC) {
d1b19dff 5628 int inc = (flags & IFF_PROMISC) ? 1 : -1;
991fb3f7 5629 unsigned int old_flags = dev->flags;
d1b19dff 5630
1da177e4 5631 dev->gflags ^= IFF_PROMISC;
991fb3f7
ND
5632
5633 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5634 if (dev->flags != old_flags)
5635 dev_set_rx_mode(dev);
1da177e4
LT
5636 }
5637
5638 /* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5639 is important. Some (broken) drivers set IFF_PROMISC when
5640 IFF_ALLMULTI is requested, without asking us and without reporting it.
5641 */
5642 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
d1b19dff
ED
5643 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5644
1da177e4 5645 dev->gflags ^= IFF_ALLMULTI;
991fb3f7 5646 __dev_set_allmulti(dev, inc, false);
1da177e4
LT
5647 }
5648
bd380811
PM
5649 return ret;
5650}
5651
a528c219
ND
5652void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5653 unsigned int gchanges)
bd380811
PM
5654{
5655 unsigned int changes = dev->flags ^ old_flags;
5656
a528c219 5657 if (gchanges)
7f294054 5658 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
a528c219 5659
bd380811
PM
5660 if (changes & IFF_UP) {
5661 if (dev->flags & IFF_UP)
5662 call_netdevice_notifiers(NETDEV_UP, dev);
5663 else
5664 call_netdevice_notifiers(NETDEV_DOWN, dev);
5665 }
5666
5667 if (dev->flags & IFF_UP &&
be9efd36
JP
5668 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5669 struct netdev_notifier_change_info change_info;
5670
5671 change_info.flags_changed = changes;
5672 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5673 &change_info.info);
5674 }
bd380811
PM
5675}
5676
5677/**
5678 * dev_change_flags - change device settings
5679 * @dev: device
5680 * @flags: device state flags
5681 *
5682 * Change settings on a device based on the supplied state flags. The flags are
5683 * in the userspace exported format.
5684 */
b536db93 5685int dev_change_flags(struct net_device *dev, unsigned int flags)
bd380811 5686{
b536db93 5687 int ret;
991fb3f7 5688 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
bd380811
PM
5689
5690 ret = __dev_change_flags(dev, flags);
5691 if (ret < 0)
5692 return ret;
5693
991fb3f7 5694 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
a528c219 5695 __dev_notify_flags(dev, old_flags, changes);
1da177e4
LT
5696 return ret;
5697}
d1b19dff 5698EXPORT_SYMBOL(dev_change_flags);
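
/*
 * Editorial example (not part of dev.c): toggling IFF_UP the way the
 * ioctl and rtnetlink paths do, by feeding the full flag word back in.
 * The example_set_admin_state() helper is hypothetical; the caller is
 * assumed to hold the RTNL lock.
 */
static int example_set_admin_state(struct net_device *dev, bool up)
{
        unsigned int flags = dev_get_flags(dev);

        ASSERT_RTNL();

        if (up)
                flags |= IFF_UP;
        else
                flags &= ~IFF_UP;

        return dev_change_flags(dev, flags);
}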
1da177e4 5699
2315dc91
VF
5700static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5701{
5702 const struct net_device_ops *ops = dev->netdev_ops;
5703
5704 if (ops->ndo_change_mtu)
5705 return ops->ndo_change_mtu(dev, new_mtu);
5706
5707 dev->mtu = new_mtu;
5708 return 0;
5709}
5710
f0db275a
SH
5711/**
5712 * dev_set_mtu - Change maximum transfer unit
5713 * @dev: device
5714 * @new_mtu: new transfer unit
5715 *
5716 * Change the maximum transfer size of the network device.
5717 */
1da177e4
LT
5718int dev_set_mtu(struct net_device *dev, int new_mtu)
5719{
2315dc91 5720 int err, orig_mtu;
1da177e4
LT
5721
5722 if (new_mtu == dev->mtu)
5723 return 0;
5724
5725 /* MTU must be positive. */
5726 if (new_mtu < 0)
5727 return -EINVAL;
5728
5729 if (!netif_device_present(dev))
5730 return -ENODEV;
5731
1d486bfb
VF
5732 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5733 err = notifier_to_errno(err);
5734 if (err)
5735 return err;
d314774c 5736
2315dc91
VF
5737 orig_mtu = dev->mtu;
5738 err = __dev_set_mtu(dev, new_mtu);
d314774c 5739
2315dc91
VF
5740 if (!err) {
5741 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5742 err = notifier_to_errno(err);
5743 if (err) {
5744 /* setting mtu back and notifying everyone again,
5745 * so that they have a chance to revert changes.
5746 */
5747 __dev_set_mtu(dev, orig_mtu);
5748 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5749 }
5750 }
1da177e4
LT
5751 return err;
5752}
d1b19dff 5753EXPORT_SYMBOL(dev_set_mtu);
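
/*
 * Editorial example (not part of dev.c): an administrative helper that
 * tries to enable jumbo frames on a device. example_enable_jumbo() and
 * EXAMPLE_JUMBO_MTU are hypothetical; RTNL is taken so the PRE/POST
 * CHANGEMTU notifiers run serialized, as dev_set_mtu() expects.
 */
#define EXAMPLE_JUMBO_MTU 9000

static int example_enable_jumbo(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_mtu(dev, EXAMPLE_JUMBO_MTU);
        rtnl_unlock();

        if (err)
                netdev_warn(dev, "could not raise MTU to %d: %d\n",
                            EXAMPLE_JUMBO_MTU, err);
        return err;
}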
1da177e4 5754
cbda10fa
VD
5755/**
5756 * dev_set_group - Change group this device belongs to
5757 * @dev: device
5758 * @new_group: group this device should belong to
5759 */
5760void dev_set_group(struct net_device *dev, int new_group)
5761{
5762 dev->group = new_group;
5763}
5764EXPORT_SYMBOL(dev_set_group);
5765
f0db275a
SH
5766/**
5767 * dev_set_mac_address - Change Media Access Control Address
5768 * @dev: device
5769 * @sa: new address
5770 *
5771 * Change the hardware (MAC) address of the device
5772 */
1da177e4
LT
5773int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5774{
d314774c 5775 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
5776 int err;
5777
d314774c 5778 if (!ops->ndo_set_mac_address)
1da177e4
LT
5779 return -EOPNOTSUPP;
5780 if (sa->sa_family != dev->type)
5781 return -EINVAL;
5782 if (!netif_device_present(dev))
5783 return -ENODEV;
d314774c 5784 err = ops->ndo_set_mac_address(dev, sa);
f6521516
JP
5785 if (err)
5786 return err;
fbdeca2d 5787 dev->addr_assign_type = NET_ADDR_SET;
f6521516 5788 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
7bf23575 5789 add_device_randomness(dev->dev_addr, dev->addr_len);
f6521516 5790 return 0;
1da177e4 5791}
d1b19dff 5792EXPORT_SYMBOL(dev_set_mac_address);
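
/*
 * Editorial example (not part of dev.c): building the struct sockaddr
 * that dev_set_mac_address() expects, as the SIOCSIFHWADDR path does.
 * The example_set_mac() helper and the new_mac buffer are hypothetical;
 * the caller is assumed to hold the RTNL lock.
 */
static int example_set_mac(struct net_device *dev, const u8 *new_mac)
{
        struct sockaddr sa;

        ASSERT_RTNL();

        sa.sa_family = dev->type;               /* e.g. ARPHRD_ETHER */
        memcpy(sa.sa_data, new_mac, dev->addr_len);

        return dev_set_mac_address(dev, &sa);
}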
1da177e4 5793
4bf84c35
JP
5794/**
5795 * dev_change_carrier - Change device carrier
5796 * @dev: device
691b3b7e 5797 * @new_carrier: new value
4bf84c35
JP
5798 *
5799 * Change device carrier
5800 */
5801int dev_change_carrier(struct net_device *dev, bool new_carrier)
5802{
5803 const struct net_device_ops *ops = dev->netdev_ops;
5804
5805 if (!ops->ndo_change_carrier)
5806 return -EOPNOTSUPP;
5807 if (!netif_device_present(dev))
5808 return -ENODEV;
5809 return ops->ndo_change_carrier(dev, new_carrier);
5810}
5811EXPORT_SYMBOL(dev_change_carrier);
5812
66b52b0d
JP
5813/**
5814 * dev_get_phys_port_id - Get device physical port ID
5815 * @dev: device
5816 * @ppid: port ID
5817 *
5818 * Get device physical port ID
5819 */
5820int dev_get_phys_port_id(struct net_device *dev,
5821 struct netdev_phys_port_id *ppid)
5822{
5823 const struct net_device_ops *ops = dev->netdev_ops;
5824
5825 if (!ops->ndo_get_phys_port_id)
5826 return -EOPNOTSUPP;
5827 return ops->ndo_get_phys_port_id(dev, ppid);
5828}
5829EXPORT_SYMBOL(dev_get_phys_port_id);
5830
1da177e4
LT
5831/**
5832 * dev_new_index - allocate an ifindex
c4ea43c5 5833 * @net: the applicable net namespace
1da177e4
LT
5834 *
5835 * Returns a suitable unique value for a new device interface
5836 * number. The caller must hold the rtnl semaphore or the
5837 * dev_base_lock to be sure it remains unique.
5838 */
881d966b 5839static int dev_new_index(struct net *net)
1da177e4 5840{
aa79e66e 5841 int ifindex = net->ifindex;
1da177e4
LT
5842 for (;;) {
5843 if (++ifindex <= 0)
5844 ifindex = 1;
881d966b 5845 if (!__dev_get_by_index(net, ifindex))
aa79e66e 5846 return net->ifindex = ifindex;
1da177e4
LT
5847 }
5848}
5849
1da177e4 5850/* Delayed registration/unregisteration */
3b5b34fd 5851static LIST_HEAD(net_todo_list);
200b916f 5852DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
1da177e4 5853
6f05f629 5854static void net_set_todo(struct net_device *dev)
1da177e4 5855{
1da177e4 5856 list_add_tail(&dev->todo_list, &net_todo_list);
50624c93 5857 dev_net(dev)->dev_unreg_count++;
1da177e4
LT
5858}
5859
9b5e383c 5860static void rollback_registered_many(struct list_head *head)
93ee31f1 5861{
e93737b0 5862 struct net_device *dev, *tmp;
5cde2829 5863 LIST_HEAD(close_head);
9b5e383c 5864
93ee31f1
DL
5865 BUG_ON(dev_boot_phase);
5866 ASSERT_RTNL();
5867
e93737b0 5868 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
9b5e383c 5869 /* Some devices call without registering
e93737b0
KK
5870 * for initialization unwind. Remove those
5871 * devices and proceed with the remaining.
9b5e383c
ED
5872 */
5873 if (dev->reg_state == NETREG_UNINITIALIZED) {
7b6cd1ce
JP
5874 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5875 dev->name, dev);
93ee31f1 5876
9b5e383c 5877 WARN_ON(1);
e93737b0
KK
5878 list_del(&dev->unreg_list);
5879 continue;
9b5e383c 5880 }
449f4544 5881 dev->dismantle = true;
9b5e383c 5882 BUG_ON(dev->reg_state != NETREG_REGISTERED);
44345724 5883 }
93ee31f1 5884
44345724 5885 /* If device is running, close it first. */
5cde2829
EB
5886 list_for_each_entry(dev, head, unreg_list)
5887 list_add_tail(&dev->close_list, &close_head);
5888 dev_close_many(&close_head);
93ee31f1 5889
44345724 5890 list_for_each_entry(dev, head, unreg_list) {
9b5e383c
ED
5891 /* And unlink it from device chain. */
5892 unlist_netdevice(dev);
93ee31f1 5893
9b5e383c
ED
5894 dev->reg_state = NETREG_UNREGISTERING;
5895 }
93ee31f1
DL
5896
5897 synchronize_net();
5898
9b5e383c
ED
5899 list_for_each_entry(dev, head, unreg_list) {
5900 /* Shutdown queueing discipline. */
5901 dev_shutdown(dev);
93ee31f1
DL
5902
5903
9b5e383c
ED
5904 /* Notify protocols that we are about to destroy
5905 this device. They should clean up all of their state.
5906 */
5907 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
93ee31f1 5908
9b5e383c
ED
5909 /*
5910 * Flush the unicast and multicast chains
5911 */
a748ee24 5912 dev_uc_flush(dev);
22bedad3 5913 dev_mc_flush(dev);
93ee31f1 5914
9b5e383c
ED
5915 if (dev->netdev_ops->ndo_uninit)
5916 dev->netdev_ops->ndo_uninit(dev);
93ee31f1 5917
56bfa7ee
RP
5918 if (!dev->rtnl_link_ops ||
5919 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5920 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5921
9ff162a8
JP
5922 /* Notifier chain MUST detach us all upper devices. */
5923 WARN_ON(netdev_has_any_upper_dev(dev));
93ee31f1 5924
9b5e383c
ED
5925 /* Remove entries from kobject tree */
5926 netdev_unregister_kobject(dev);
024e9679
AD
5927#ifdef CONFIG_XPS
5928 /* Remove XPS queueing entries */
5929 netif_reset_xps_queues_gt(dev, 0);
5930#endif
9b5e383c 5931 }
93ee31f1 5932
850a545b 5933 synchronize_net();
395264d5 5934
a5ee1551 5935 list_for_each_entry(dev, head, unreg_list)
9b5e383c
ED
5936 dev_put(dev);
5937}
5938
5939static void rollback_registered(struct net_device *dev)
5940{
5941 LIST_HEAD(single);
5942
5943 list_add(&dev->unreg_list, &single);
5944 rollback_registered_many(&single);
ceaaec98 5945 list_del(&single);
93ee31f1
DL
5946}
5947
c8f44aff
MM
5948static netdev_features_t netdev_fix_features(struct net_device *dev,
5949 netdev_features_t features)
b63365a2 5950{
57422dc5
MM
5951 /* Fix illegal checksum combinations */
5952 if ((features & NETIF_F_HW_CSUM) &&
5953 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 5954 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
57422dc5
MM
5955 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5956 }
5957
b63365a2 5958 /* TSO requires that SG is present as well. */
ea2d3688 5959 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6f404e44 5960 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
ea2d3688 5961 features &= ~NETIF_F_ALL_TSO;
b63365a2
HX
5962 }
5963
ec5f0615
PS
5964 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5965 !(features & NETIF_F_IP_CSUM)) {
5966 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5967 features &= ~NETIF_F_TSO;
5968 features &= ~NETIF_F_TSO_ECN;
5969 }
5970
5971 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5972 !(features & NETIF_F_IPV6_CSUM)) {
5973 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5974 features &= ~NETIF_F_TSO6;
5975 }
5976
31d8b9e0
BH
5977 /* TSO ECN requires that TSO is present as well. */
5978 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5979 features &= ~NETIF_F_TSO_ECN;
5980
212b573f
MM
5981 /* Software GSO depends on SG. */
5982 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6f404e44 5983 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
212b573f
MM
5984 features &= ~NETIF_F_GSO;
5985 }
5986
acd1130e 5987 /* UFO needs SG and checksumming */
b63365a2 5988 if (features & NETIF_F_UFO) {
79032644
MM
5989 /* maybe split UFO into V4 and V6? */
5990 if (!((features & NETIF_F_GEN_CSUM) ||
5991 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5992 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 5993 netdev_dbg(dev,
acd1130e 5994 "Dropping NETIF_F_UFO since no checksum offload features.\n");
b63365a2
HX
5995 features &= ~NETIF_F_UFO;
5996 }
5997
5998 if (!(features & NETIF_F_SG)) {
6f404e44 5999 netdev_dbg(dev,
acd1130e 6000 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
b63365a2
HX
6001 features &= ~NETIF_F_UFO;
6002 }
6003 }
6004
d0290214
JP
6005#ifdef CONFIG_NET_RX_BUSY_POLL
6006 if (dev->netdev_ops->ndo_busy_poll)
6007 features |= NETIF_F_BUSY_POLL;
6008 else
6009#endif
6010 features &= ~NETIF_F_BUSY_POLL;
6011
b63365a2
HX
6012 return features;
6013}
b63365a2 6014
6cb6a27c 6015int __netdev_update_features(struct net_device *dev)
5455c699 6016{
c8f44aff 6017 netdev_features_t features;
5455c699
MM
6018 int err = 0;
6019
87267485
MM
6020 ASSERT_RTNL();
6021
5455c699
MM
6022 features = netdev_get_wanted_features(dev);
6023
6024 if (dev->netdev_ops->ndo_fix_features)
6025 features = dev->netdev_ops->ndo_fix_features(dev, features);
6026
6027 /* driver might be less strict about feature dependencies */
6028 features = netdev_fix_features(dev, features);
6029
6030 if (dev->features == features)
6cb6a27c 6031 return 0;
5455c699 6032
c8f44aff
MM
6033 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6034 &dev->features, &features);
5455c699
MM
6035
6036 if (dev->netdev_ops->ndo_set_features)
6037 err = dev->netdev_ops->ndo_set_features(dev, features);
6038
6cb6a27c 6039 if (unlikely(err < 0)) {
5455c699 6040 netdev_err(dev,
c8f44aff
MM
6041 "set_features() failed (%d); wanted %pNF, left %pNF\n",
6042 err, &features, &dev->features);
6cb6a27c
MM
6043 return -1;
6044 }
6045
6046 if (!err)
6047 dev->features = features;
6048
6049 return 1;
6050}
6051
afe12cc8
MM
6052/**
6053 * netdev_update_features - recalculate device features
6054 * @dev: the device to check
6055 *
6056 * Recalculate dev->features set and send notifications if it
6057 * has changed. Should be called after driver- or hardware-dependent
6058 * conditions that influence the features might have changed.
6059 */
6cb6a27c
MM
6060void netdev_update_features(struct net_device *dev)
6061{
6062 if (__netdev_update_features(dev))
6063 netdev_features_change(dev);
5455c699
MM
6064}
6065EXPORT_SYMBOL(netdev_update_features);
6066
afe12cc8
MM
6067/**
6068 * netdev_change_features - recalculate device features
6069 * @dev: the device to check
6070 *
6071 * Recalculate dev->features set and send notifications even
6072 * if they have not changed. Should be called instead of
6073 * netdev_update_features() if dev->vlan_features might also
6074 * have changed, to allow the changes to be propagated to stacked
6075 * VLAN devices.
6076 */
6077void netdev_change_features(struct net_device *dev)
6078{
6079 __netdev_update_features(dev);
6080 netdev_features_change(dev);
6081}
6082EXPORT_SYMBOL(netdev_change_features);
6083
fc4a7489
PM
6084/**
6085 * netif_stacked_transfer_operstate - transfer operstate
6086 * @rootdev: the root or lower level device to transfer state from
6087 * @dev: the device to transfer operstate to
6088 *
6089 * Transfer operational state from root to device. This is normally
6090 * called when a stacking relationship exists between the root
6091 * device and the device (a leaf device).
6092 */
6093void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6094 struct net_device *dev)
6095{
6096 if (rootdev->operstate == IF_OPER_DORMANT)
6097 netif_dormant_on(dev);
6098 else
6099 netif_dormant_off(dev);
6100
6101 if (netif_carrier_ok(rootdev)) {
6102 if (!netif_carrier_ok(dev))
6103 netif_carrier_on(dev);
6104 } else {
6105 if (netif_carrier_ok(dev))
6106 netif_carrier_off(dev);
6107 }
6108}
6109EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6110
a953be53 6111#ifdef CONFIG_SYSFS
1b4bf461
ED
6112static int netif_alloc_rx_queues(struct net_device *dev)
6113{
1b4bf461 6114 unsigned int i, count = dev->num_rx_queues;
bd25fa7b 6115 struct netdev_rx_queue *rx;
1b4bf461 6116
bd25fa7b 6117 BUG_ON(count < 1);
1b4bf461 6118
bd25fa7b 6119 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
62b5942a 6120 if (!rx)
bd25fa7b 6121 return -ENOMEM;
62b5942a 6122
bd25fa7b
TH
6123 dev->_rx = rx;
6124
bd25fa7b 6125 for (i = 0; i < count; i++)
fe822240 6126 rx[i].dev = dev;
1b4bf461
ED
6127 return 0;
6128}
bf264145 6129#endif
1b4bf461 6130
aa942104
CG
6131static void netdev_init_one_queue(struct net_device *dev,
6132 struct netdev_queue *queue, void *_unused)
6133{
6134 /* Initialize queue lock */
6135 spin_lock_init(&queue->_xmit_lock);
6136 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6137 queue->xmit_lock_owner = -1;
b236da69 6138 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
aa942104 6139 queue->dev = dev;
114cf580
TH
6140#ifdef CONFIG_BQL
6141 dql_init(&queue->dql, HZ);
6142#endif
aa942104
CG
6143}
6144
60877a32
ED
6145static void netif_free_tx_queues(struct net_device *dev)
6146{
4cb28970 6147 kvfree(dev->_tx);
60877a32
ED
6148}
6149
e6484930
TH
6150static int netif_alloc_netdev_queues(struct net_device *dev)
6151{
6152 unsigned int count = dev->num_tx_queues;
6153 struct netdev_queue *tx;
60877a32 6154 size_t sz = count * sizeof(*tx);
e6484930 6155
60877a32 6156 BUG_ON(count < 1 || count > 0xffff);
62b5942a 6157
60877a32
ED
6158 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6159 if (!tx) {
6160 tx = vzalloc(sz);
6161 if (!tx)
6162 return -ENOMEM;
6163 }
e6484930 6164 dev->_tx = tx;
1d24eb48 6165
e6484930
TH
6166 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6167 spin_lock_init(&dev->tx_global_lock);
aa942104
CG
6168
6169 return 0;
e6484930
TH
6170}
6171
1da177e4
LT
6172/**
6173 * register_netdevice - register a network device
6174 * @dev: device to register
6175 *
6176 * Take a completed network device structure and add it to the kernel
6177 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6178 * chain. 0 is returned on success. A negative errno code is returned
6179 * on a failure to set up the device, or if the name is a duplicate.
6180 *
6181 * Callers must hold the rtnl semaphore. You may want
6182 * register_netdev() instead of this.
6183 *
6184 * BUGS:
6185 * The locking appears insufficient to guarantee two parallel registers
6186 * will not get the same name.
6187 */
6188
6189int register_netdevice(struct net_device *dev)
6190{
1da177e4 6191 int ret;
d314774c 6192 struct net *net = dev_net(dev);
1da177e4
LT
6193
6194 BUG_ON(dev_boot_phase);
6195 ASSERT_RTNL();
6196
b17a7c17
SH
6197 might_sleep();
6198
1da177e4
LT
6199 /* When net_device's are persistent, this will be fatal. */
6200 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
d314774c 6201 BUG_ON(!net);
1da177e4 6202
f1f28aa3 6203 spin_lock_init(&dev->addr_list_lock);
cf508b12 6204 netdev_set_addr_lockdep_class(dev);
1da177e4 6205
1da177e4
LT
6206 dev->iflink = -1;
6207
828de4f6 6208 ret = dev_get_valid_name(net, dev, dev->name);
0696c3a8
PP
6209 if (ret < 0)
6210 goto out;
6211
1da177e4 6212 /* Init, if this function is available */
d314774c
SH
6213 if (dev->netdev_ops->ndo_init) {
6214 ret = dev->netdev_ops->ndo_init(dev);
1da177e4
LT
6215 if (ret) {
6216 if (ret > 0)
6217 ret = -EIO;
90833aa4 6218 goto out;
1da177e4
LT
6219 }
6220 }
4ec93edb 6221
f646968f
PM
6222 if (((dev->hw_features | dev->features) &
6223 NETIF_F_HW_VLAN_CTAG_FILTER) &&
d2ed273d
MM
6224 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6225 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6226 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6227 ret = -EINVAL;
6228 goto err_uninit;
6229 }
6230
9c7dafbf
PE
6231 ret = -EBUSY;
6232 if (!dev->ifindex)
6233 dev->ifindex = dev_new_index(net);
6234 else if (__dev_get_by_index(net, dev->ifindex))
6235 goto err_uninit;
6236
1da177e4
LT
6237 if (dev->iflink == -1)
6238 dev->iflink = dev->ifindex;
6239
5455c699
MM
6240 /* Transfer changeable features to wanted_features and enable
6241 * software offloads (GSO and GRO).
6242 */
6243 dev->hw_features |= NETIF_F_SOFT_FEATURES;
14d1232f
MM
6244 dev->features |= NETIF_F_SOFT_FEATURES;
6245 dev->wanted_features = dev->features & dev->hw_features;
1da177e4 6246
34324dc2
MM
6247 if (!(dev->flags & IFF_LOOPBACK)) {
6248 dev->hw_features |= NETIF_F_NOCACHE_COPY;
c6e1a0d1
TH
6249 }
6250
1180e7d6 6251 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
16c3ea78 6252 */
1180e7d6 6253 dev->vlan_features |= NETIF_F_HIGHDMA;
16c3ea78 6254
ee579677
PS
6255 /* Make NETIF_F_SG inheritable to tunnel devices.
6256 */
6257 dev->hw_enc_features |= NETIF_F_SG;
6258
0d89d203
SH
6259 /* Make NETIF_F_SG inheritable to MPLS.
6260 */
6261 dev->mpls_features |= NETIF_F_SG;
6262
7ffbe3fd
JB
6263 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6264 ret = notifier_to_errno(ret);
6265 if (ret)
6266 goto err_uninit;
6267
8b41d188 6268 ret = netdev_register_kobject(dev);
b17a7c17 6269 if (ret)
7ce1b0ed 6270 goto err_uninit;
b17a7c17
SH
6271 dev->reg_state = NETREG_REGISTERED;
6272
6cb6a27c 6273 __netdev_update_features(dev);
8e9b59b2 6274
1da177e4
LT
6275 /*
6276 * Default initial state at registry is that the
6277 * device is present.
6278 */
6279
6280 set_bit(__LINK_STATE_PRESENT, &dev->state);
6281
8f4cccbb
BH
6282 linkwatch_init_dev(dev);
6283
1da177e4 6284 dev_init_scheduler(dev);
1da177e4 6285 dev_hold(dev);
ce286d32 6286 list_netdevice(dev);
7bf23575 6287 add_device_randomness(dev->dev_addr, dev->addr_len);
1da177e4 6288
948b337e
JP
6289 /* If the device has permanent device address, driver should
6290 * set dev_addr and also addr_assign_type should be set to
6291 * NET_ADDR_PERM (default value).
6292 */
6293 if (dev->addr_assign_type == NET_ADDR_PERM)
6294 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6295
1da177e4 6296 /* Notify protocols that a new device appeared. */
056925ab 6297 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
fcc5a03a 6298 ret = notifier_to_errno(ret);
93ee31f1
DL
6299 if (ret) {
6300 rollback_registered(dev);
6301 dev->reg_state = NETREG_UNREGISTERED;
6302 }
d90a909e
EB
6303 /*
6304 * Prevent userspace races by waiting until the network
6305 * device is fully setup before sending notifications.
6306 */
a2835763
PM
6307 if (!dev->rtnl_link_ops ||
6308 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7f294054 6309 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
1da177e4
LT
6310
6311out:
6312 return ret;
7ce1b0ed
HX
6313
6314err_uninit:
d314774c
SH
6315 if (dev->netdev_ops->ndo_uninit)
6316 dev->netdev_ops->ndo_uninit(dev);
7ce1b0ed 6317 goto out;
1da177e4 6318}
d1b19dff 6319EXPORT_SYMBOL(register_netdevice);
1da177e4 6320
937f1ba5
BH
6321/**
6322 * init_dummy_netdev - init a dummy network device for NAPI
6323 * @dev: device to init
6324 *
6325 * This takes a network device structure and initializes the minimum
6326 * number of fields so it can be used to schedule NAPI polls without
6327 * registering a full-blown interface. This is to be used by drivers
6328 * that need to tie several hardware interfaces to a single NAPI
6329 * poll scheduler due to HW limitations.
6330 */
6331int init_dummy_netdev(struct net_device *dev)
6332{
6333 /* Clear everything. Note we don't initialize spinlocks
6334 * as they aren't supposed to be taken by any of the
6335 * NAPI code and this dummy netdev is supposed to be
6336 * only ever used for NAPI polls
6337 */
6338 memset(dev, 0, sizeof(struct net_device));
6339
6340 /* make sure we BUG if trying to hit standard
6341 * register/unregister code path
6342 */
6343 dev->reg_state = NETREG_DUMMY;
6344
937f1ba5
BH
6345 /* NAPI wants this */
6346 INIT_LIST_HEAD(&dev->napi_list);
6347
6348 /* a dummy interface is started by default */
6349 set_bit(__LINK_STATE_PRESENT, &dev->state);
6350 set_bit(__LINK_STATE_START, &dev->state);
6351
29b4433d
ED
6352 /* Note: We don't allocate pcpu_refcnt for dummy devices,
6353 * because users of this 'device' don't need to change
6354 * its refcount.
6355 */
6356
937f1ba5
BH
6357 return 0;
6358}
6359EXPORT_SYMBOL_GPL(init_dummy_netdev);
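
/*
 * Editorial example (not part of dev.c): the pattern this helper exists
 * for - a driver with several hardware channels hanging its NAPI context
 * off one dummy, never-registered netdev. struct example_hw,
 * example_poll() and EXAMPLE_NAPI_WEIGHT are hypothetical.
 */
#define EXAMPLE_NAPI_WEIGHT 64

struct example_hw {
        struct net_device napi_dev;     /* dummy, never registered */
        struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
        /* ... process up to budget packets here ... */
        napi_complete(napi);
        return 0;
}

static void example_hw_init(struct example_hw *hw)
{
        init_dummy_netdev(&hw->napi_dev);
        netif_napi_add(&hw->napi_dev, &hw->napi, example_poll,
                       EXAMPLE_NAPI_WEIGHT);
        napi_enable(&hw->napi);
}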
6360
6361
1da177e4
LT
6362/**
6363 * register_netdev - register a network device
6364 * @dev: device to register
6365 *
6366 * Take a completed network device structure and add it to the kernel
6367 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6368 * chain. 0 is returned on success. A negative errno code is returned
6369 * on a failure to set up the device, or if the name is a duplicate.
6370 *
38b4da38 6371 * This is a wrapper around register_netdevice that takes the rtnl semaphore
1da177e4
LT
6372 * and expands the device name if you passed a format string to
6373 * alloc_netdev.
6374 */
6375int register_netdev(struct net_device *dev)
6376{
6377 int err;
6378
6379 rtnl_lock();
1da177e4 6380 err = register_netdevice(dev);
1da177e4
LT
6381 rtnl_unlock();
6382 return err;
6383}
6384EXPORT_SYMBOL(register_netdev);
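
/*
 * Editorial example (not part of dev.c): a minimal module-style lifecycle
 * around register_netdev()/unregister_netdev(). example_netdev, the
 * "exmpl%d" name template, example_setup() and example_netdev_ops are all
 * hypothetical; error handling follows the usual alloc -> register ->
 * unregister -> free order, with module_init/module_exit boilerplate
 * assumed elsewhere.
 */
static struct net_device *example_netdev;

static const struct net_device_ops example_netdev_ops = {
        /* A real driver would at least provide .ndo_start_xmit here. */
};

static void example_setup(struct net_device *dev)
{
        ether_setup(dev);                       /* Ethernet-style defaults */
        dev->netdev_ops = &example_netdev_ops;  /* required before register */
}

static int __init example_init(void)
{
        int err;

        example_netdev = alloc_netdev_mqs(0, "exmpl%d", NET_NAME_UNKNOWN,
                                          example_setup, 1, 1);
        if (!example_netdev)
                return -ENOMEM;

        err = register_netdev(example_netdev);  /* takes RTNL itself */
        if (err)
                free_netdev(example_netdev);
        return err;
}

static void __exit example_exit(void)
{
        unregister_netdev(example_netdev);
        free_netdev(example_netdev);
}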
6385
29b4433d
ED
6386int netdev_refcnt_read(const struct net_device *dev)
6387{
6388 int i, refcnt = 0;
6389
6390 for_each_possible_cpu(i)
6391 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6392 return refcnt;
6393}
6394EXPORT_SYMBOL(netdev_refcnt_read);
6395
2c53040f 6396/**
1da177e4 6397 * netdev_wait_allrefs - wait until all references are gone.
3de7a37b 6398 * @dev: target net_device
1da177e4
LT
6399 *
6400 * This is called when unregistering network devices.
6401 *
6402 * Any protocol or device that holds a reference should register
6403 * for netdevice notification, and clean up and put back the
6404 * reference if they receive an UNREGISTER event.
6405 * We can get stuck here if buggy protocols don't correctly
4ec93edb 6406 * call dev_put.
1da177e4
LT
6407 */
6408static void netdev_wait_allrefs(struct net_device *dev)
6409{
6410 unsigned long rebroadcast_time, warning_time;
29b4433d 6411 int refcnt;
1da177e4 6412
e014debe
ED
6413 linkwatch_forget_dev(dev);
6414
1da177e4 6415 rebroadcast_time = warning_time = jiffies;
29b4433d
ED
6416 refcnt = netdev_refcnt_read(dev);
6417
6418 while (refcnt != 0) {
1da177e4 6419 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6756ae4b 6420 rtnl_lock();
1da177e4
LT
6421
6422 /* Rebroadcast unregister notification */
056925ab 6423 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
1da177e4 6424
748e2d93 6425 __rtnl_unlock();
0115e8e3 6426 rcu_barrier();
748e2d93
ED
6427 rtnl_lock();
6428
0115e8e3 6429 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
1da177e4
LT
6430 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6431 &dev->state)) {
6432 /* We must not have linkwatch events
6433 * pending on unregister. If this
6434 * happens, we simply run the queue
6435 * unscheduled, resulting in a noop
6436 * for this device.
6437 */
6438 linkwatch_run_queue();
6439 }
6440
6756ae4b 6441 __rtnl_unlock();
1da177e4
LT
6442
6443 rebroadcast_time = jiffies;
6444 }
6445
6446 msleep(250);
6447
29b4433d
ED
6448 refcnt = netdev_refcnt_read(dev);
6449
1da177e4 6450 if (time_after(jiffies, warning_time + 10 * HZ)) {
7b6cd1ce
JP
6451 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6452 dev->name, refcnt);
1da177e4
LT
6453 warning_time = jiffies;
6454 }
6455 }
6456}
6457
6458/* The sequence is:
6459 *
6460 * rtnl_lock();
6461 * ...
6462 * register_netdevice(x1);
6463 * register_netdevice(x2);
6464 * ...
6465 * unregister_netdevice(y1);
6466 * unregister_netdevice(y2);
6467 * ...
6468 * rtnl_unlock();
6469 * free_netdev(y1);
6470 * free_netdev(y2);
6471 *
58ec3b4d 6472 * We are invoked by rtnl_unlock().
1da177e4 6473 * This allows us to deal with problems:
b17a7c17 6474 * 1) We can delete sysfs objects which invoke hotplug
1da177e4
LT
6475 * without deadlocking with linkwatch via keventd.
6476 * 2) Since we run with the RTNL semaphore not held, we can sleep
6477 * safely in order to wait for the netdev refcnt to drop to zero.
58ec3b4d
HX
6478 *
6479 * We must not return until all unregister events added during
6480 * the interval the lock was held have been completed.
1da177e4 6481 */
1da177e4
LT
6482void netdev_run_todo(void)
6483{
626ab0e6 6484 struct list_head list;
1da177e4 6485
1da177e4 6486 /* Snapshot list, allow later requests */
626ab0e6 6487 list_replace_init(&net_todo_list, &list);
58ec3b4d
HX
6488
6489 __rtnl_unlock();
626ab0e6 6490
0115e8e3
ED
6491
6492 /* Wait for rcu callbacks to finish before next phase */
850a545b
EB
6493 if (!list_empty(&list))
6494 rcu_barrier();
6495
1da177e4
LT
6496 while (!list_empty(&list)) {
6497 struct net_device *dev
e5e26d75 6498 = list_first_entry(&list, struct net_device, todo_list);
1da177e4
LT
6499 list_del(&dev->todo_list);
6500
748e2d93 6501 rtnl_lock();
0115e8e3 6502 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
748e2d93 6503 __rtnl_unlock();
0115e8e3 6504
b17a7c17 6505 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7b6cd1ce 6506 pr_err("network todo '%s' but state %d\n",
b17a7c17
SH
6507 dev->name, dev->reg_state);
6508 dump_stack();
6509 continue;
6510 }
1da177e4 6511
b17a7c17 6512 dev->reg_state = NETREG_UNREGISTERED;
1da177e4 6513
152102c7 6514 on_each_cpu(flush_backlog, dev, 1);
6e583ce5 6515
b17a7c17 6516 netdev_wait_allrefs(dev);
1da177e4 6517
b17a7c17 6518 /* paranoia */
29b4433d 6519 BUG_ON(netdev_refcnt_read(dev));
33d480ce
ED
6520 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6521 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
547b792c 6522 WARN_ON(dev->dn_ptr);
1da177e4 6523
b17a7c17
SH
6524 if (dev->destructor)
6525 dev->destructor(dev);
9093bbb2 6526
50624c93
EB
6527 /* Report a network device has been unregistered */
6528 rtnl_lock();
6529 dev_net(dev)->dev_unreg_count--;
6530 __rtnl_unlock();
6531 wake_up(&netdev_unregistering_wq);
6532
9093bbb2
SH
6533 /* Free network device */
6534 kobject_put(&dev->dev.kobj);
1da177e4 6535 }
1da177e4
LT
6536}
6537
3cfde79c
BH
6538/* Convert net_device_stats to rtnl_link_stats64. They have the same
6539 * fields in the same order, with only the type differing.
6540 */
77a1abf5
ED
6541void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6542 const struct net_device_stats *netdev_stats)
3cfde79c
BH
6543{
6544#if BITS_PER_LONG == 64
77a1abf5
ED
6545 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6546 memcpy(stats64, netdev_stats, sizeof(*stats64));
3cfde79c
BH
6547#else
6548 size_t i, n = sizeof(*stats64) / sizeof(u64);
6549 const unsigned long *src = (const unsigned long *)netdev_stats;
6550 u64 *dst = (u64 *)stats64;
6551
6552 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6553 sizeof(*stats64) / sizeof(u64));
6554 for (i = 0; i < n; i++)
6555 dst[i] = src[i];
6556#endif
6557}
77a1abf5 6558EXPORT_SYMBOL(netdev_stats_to_stats64);
3cfde79c 6559
eeda3fd6
SH
6560/**
6561 * dev_get_stats - get network device statistics
6562 * @dev: device to get statistics from
28172739 6563 * @storage: place to store stats
eeda3fd6 6564 *
d7753516
BH
6565 * Get network statistics from device. Return @storage.
6566 * The device driver may provide its own method by setting
6567 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6568 * otherwise the internal statistics structure is used.
eeda3fd6 6569 */
d7753516
BH
6570struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6571 struct rtnl_link_stats64 *storage)
7004bf25 6572{
eeda3fd6
SH
6573 const struct net_device_ops *ops = dev->netdev_ops;
6574
28172739
ED
6575 if (ops->ndo_get_stats64) {
6576 memset(storage, 0, sizeof(*storage));
caf586e5
ED
6577 ops->ndo_get_stats64(dev, storage);
6578 } else if (ops->ndo_get_stats) {
3cfde79c 6579 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
caf586e5
ED
6580 } else {
6581 netdev_stats_to_stats64(storage, &dev->stats);
28172739 6582 }
caf586e5 6583 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
015f0688 6584 storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
28172739 6585 return storage;
c45d286e 6586}
eeda3fd6 6587EXPORT_SYMBOL(dev_get_stats);
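
/*
 * Editorial example (not part of dev.c): snapshotting 64-bit statistics
 * into a caller-provided buffer, as the rtnetlink and procfs paths do.
 * The example_log_stats() helper is hypothetical; callers typically run
 * under RTNL or RCU so the device cannot disappear underneath them.
 */
static void example_log_stats(struct net_device *dev)
{
        struct rtnl_link_stats64 storage;
        const struct rtnl_link_stats64 *stats;

        stats = dev_get_stats(dev, &storage);   /* returns &storage */
        netdev_info(dev, "rx %llu pkts, tx %llu pkts, rx dropped %llu\n",
                    stats->rx_packets, stats->tx_packets,
                    stats->rx_dropped);
}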
c45d286e 6588
24824a09 6589struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
dc2b4847 6590{
24824a09 6591 struct netdev_queue *queue = dev_ingress_queue(dev);
dc2b4847 6592
24824a09
ED
6593#ifdef CONFIG_NET_CLS_ACT
6594 if (queue)
6595 return queue;
6596 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6597 if (!queue)
6598 return NULL;
6599 netdev_init_one_queue(dev, queue, NULL);
24824a09
ED
6600 queue->qdisc = &noop_qdisc;
6601 queue->qdisc_sleeping = &noop_qdisc;
6602 rcu_assign_pointer(dev->ingress_queue, queue);
6603#endif
6604 return queue;
bb949fbd
DM
6605}
6606
2c60db03
ED
6607static const struct ethtool_ops default_ethtool_ops;
6608
d07d7507
SG
6609void netdev_set_default_ethtool_ops(struct net_device *dev,
6610 const struct ethtool_ops *ops)
6611{
6612 if (dev->ethtool_ops == &default_ethtool_ops)
6613 dev->ethtool_ops = ops;
6614}
6615EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
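
/*
 * Editorial example (not part of dev.c): a bus or MAC support layer
 * providing fallback ethtool ops for devices whose drivers did not set
 * their own. example_ethtool_ops and example_attach() are hypothetical;
 * ethtool_op_get_link() is the generic helper from the ethtool core.
 */
static const struct ethtool_ops example_ethtool_ops = {
        .get_link       = ethtool_op_get_link,
};

static void example_attach(struct net_device *dev)
{
        /* Only takes effect if the driver left dev->ethtool_ops unset. */
        netdev_set_default_ethtool_ops(dev, &example_ethtool_ops);
}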
6616
74d332c1
ED
6617void netdev_freemem(struct net_device *dev)
6618{
6619 char *addr = (char *)dev - dev->padded;
6620
4cb28970 6621 kvfree(addr);
74d332c1
ED
6622}
6623
1da177e4 6624/**
36909ea4 6625 * alloc_netdev_mqs - allocate network device
c835a677
TG
6626 * @sizeof_priv: size of private data to allocate space for
6627 * @name: device name format string
6628 * @name_assign_type: origin of device name
6629 * @setup: callback to initialize device
6630 * @txqs: the number of TX subqueues to allocate
6631 * @rxqs: the number of RX subqueues to allocate
1da177e4
LT
6632 *
6633 * Allocates a struct net_device with private data area for driver use
90e51adf 6634 * and performs basic initialization. Also allocates subqueue structs
36909ea4 6635 * for each queue on the device.
1da177e4 6636 */
36909ea4 6637struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
c835a677 6638 unsigned char name_assign_type,
36909ea4
TH
6639 void (*setup)(struct net_device *),
6640 unsigned int txqs, unsigned int rxqs)
1da177e4 6641{
1da177e4 6642 struct net_device *dev;
7943986c 6643 size_t alloc_size;
1ce8e7b5 6644 struct net_device *p;
1da177e4 6645
b6fe17d6
SH
6646 BUG_ON(strlen(name) >= sizeof(dev->name));
6647
36909ea4 6648 if (txqs < 1) {
7b6cd1ce 6649 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
55513fb4
TH
6650 return NULL;
6651 }
6652
a953be53 6653#ifdef CONFIG_SYSFS
36909ea4 6654 if (rxqs < 1) {
7b6cd1ce 6655 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
36909ea4
TH
6656 return NULL;
6657 }
6658#endif
6659
fd2ea0a7 6660 alloc_size = sizeof(struct net_device);
d1643d24
AD
6661 if (sizeof_priv) {
6662 /* ensure 32-byte alignment of private area */
1ce8e7b5 6663 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
d1643d24
AD
6664 alloc_size += sizeof_priv;
6665 }
6666 /* ensure 32-byte alignment of whole construct */
1ce8e7b5 6667 alloc_size += NETDEV_ALIGN - 1;
1da177e4 6668
74d332c1
ED
6669 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6670 if (!p)
6671 p = vzalloc(alloc_size);
62b5942a 6672 if (!p)
1da177e4 6673 return NULL;
1da177e4 6674
1ce8e7b5 6675 dev = PTR_ALIGN(p, NETDEV_ALIGN);
1da177e4 6676 dev->padded = (char *)dev - (char *)p;
ab9c73cc 6677
29b4433d
ED
6678 dev->pcpu_refcnt = alloc_percpu(int);
6679 if (!dev->pcpu_refcnt)
74d332c1 6680 goto free_dev;
ab9c73cc 6681
ab9c73cc 6682 if (dev_addr_init(dev))
29b4433d 6683 goto free_pcpu;
ab9c73cc 6684
22bedad3 6685 dev_mc_init(dev);
a748ee24 6686 dev_uc_init(dev);
ccffad25 6687
c346dca1 6688 dev_net_set(dev, &init_net);
1da177e4 6689
8d3bdbd5 6690 dev->gso_max_size = GSO_MAX_SIZE;
30b678d8 6691 dev->gso_max_segs = GSO_MAX_SEGS;
fcbeb976 6692 dev->gso_min_segs = 0;
8d3bdbd5 6693
8d3bdbd5
DM
6694 INIT_LIST_HEAD(&dev->napi_list);
6695 INIT_LIST_HEAD(&dev->unreg_list);
5cde2829 6696 INIT_LIST_HEAD(&dev->close_list);
8d3bdbd5 6697 INIT_LIST_HEAD(&dev->link_watch_list);
2f268f12
VF
6698 INIT_LIST_HEAD(&dev->adj_list.upper);
6699 INIT_LIST_HEAD(&dev->adj_list.lower);
6700 INIT_LIST_HEAD(&dev->all_adj_list.upper);
6701 INIT_LIST_HEAD(&dev->all_adj_list.lower);
02875878 6702 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
8d3bdbd5
DM
6703 setup(dev);
6704
36909ea4
TH
6705 dev->num_tx_queues = txqs;
6706 dev->real_num_tx_queues = txqs;
ed9af2e8 6707 if (netif_alloc_netdev_queues(dev))
8d3bdbd5 6708 goto free_all;
e8a0464c 6709
a953be53 6710#ifdef CONFIG_SYSFS
36909ea4
TH
6711 dev->num_rx_queues = rxqs;
6712 dev->real_num_rx_queues = rxqs;
fe822240 6713 if (netif_alloc_rx_queues(dev))
8d3bdbd5 6714 goto free_all;
df334545 6715#endif
0a9627f2 6716
1da177e4 6717 strcpy(dev->name, name);
c835a677 6718 dev->name_assign_type = name_assign_type;
cbda10fa 6719 dev->group = INIT_NETDEV_GROUP;
2c60db03
ED
6720 if (!dev->ethtool_ops)
6721 dev->ethtool_ops = &default_ethtool_ops;
1da177e4 6722 return dev;
ab9c73cc 6723
8d3bdbd5
DM
6724free_all:
6725 free_netdev(dev);
6726 return NULL;
6727
29b4433d
ED
6728free_pcpu:
6729 free_percpu(dev->pcpu_refcnt);
74d332c1
ED
6730free_dev:
6731 netdev_freemem(dev);
ab9c73cc 6732 return NULL;
1da177e4 6733}
36909ea4 6734EXPORT_SYMBOL(alloc_netdev_mqs);
1da177e4
LT
6735
6736/**
6737 * free_netdev - free network device
6738 * @dev: device
6739 *
4ec93edb
YH
6740 * This function does the last stage of destroying an allocated device
6741 * interface. The reference to the device object is released.
1da177e4
LT
6742 * If this is the last reference then it will be freed.
6743 */
6744void free_netdev(struct net_device *dev)
6745{
d565b0a1
HX
6746 struct napi_struct *p, *n;
6747
f3005d7f
DL
6748 release_net(dev_net(dev));
6749
60877a32 6750 netif_free_tx_queues(dev);
a953be53 6751#ifdef CONFIG_SYSFS
fe822240
TH
6752 kfree(dev->_rx);
6753#endif
e8a0464c 6754
33d480ce 6755 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
24824a09 6756
f001fde5
JP
6757 /* Flush device addresses */
6758 dev_addr_flush(dev);
6759
d565b0a1
HX
6760 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6761 netif_napi_del(p);
6762
29b4433d
ED
6763 free_percpu(dev->pcpu_refcnt);
6764 dev->pcpu_refcnt = NULL;
6765
3041a069 6766 /* Compatibility with error handling in drivers */
1da177e4 6767 if (dev->reg_state == NETREG_UNINITIALIZED) {
74d332c1 6768 netdev_freemem(dev);
1da177e4
LT
6769 return;
6770 }
6771
6772 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6773 dev->reg_state = NETREG_RELEASED;
6774
43cb76d9
GKH
6775 /* will free via device release */
6776 put_device(&dev->dev);
1da177e4 6777}
d1b19dff 6778EXPORT_SYMBOL(free_netdev);
4ec93edb 6779
f0db275a
SH
6780/**
6781 * synchronize_net - Synchronize with packet receive processing
6782 *
6783 * Wait for packets currently being received to be done.
6784 * Does not block later packets from starting.
6785 */
4ec93edb 6786void synchronize_net(void)
1da177e4
LT
6787{
6788 might_sleep();
be3fc413
ED
6789 if (rtnl_is_locked())
6790 synchronize_rcu_expedited();
6791 else
6792 synchronize_rcu();
1da177e4 6793}
d1b19dff 6794EXPORT_SYMBOL(synchronize_net);
1da177e4
LT
6795
6796/**
44a0873d 6797 * unregister_netdevice_queue - remove device from the kernel
1da177e4 6798 * @dev: device
44a0873d 6799 * @head: list
6ebfbc06 6800 *
1da177e4 6801 * This function shuts down a device interface and removes it
d59b54b1 6802 * from the kernel tables.
44a0873d 6803 * If head not NULL, device is queued to be unregistered later.
1da177e4
LT
6804 *
6805 * Callers must hold the rtnl semaphore. You may want
6806 * unregister_netdev() instead of this.
6807 */
6808
44a0873d 6809void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
1da177e4 6810{
a6620712
HX
6811 ASSERT_RTNL();
6812
44a0873d 6813 if (head) {
9fdce099 6814 list_move_tail(&dev->unreg_list, head);
44a0873d
ED
6815 } else {
6816 rollback_registered(dev);
6817 /* Finish processing unregister after unlock */
6818 net_set_todo(dev);
6819 }
1da177e4 6820}
44a0873d 6821EXPORT_SYMBOL(unregister_netdevice_queue);
1da177e4 6822
9b5e383c
ED
6823/**
6824 * unregister_netdevice_many - unregister many devices
6825 * @head: list of devices
87757a91
ED
6826 *
6827 * Note: As most callers use a stack-allocated list_head,
6828 * we force a list_del() to make sure the stack won't be corrupted later.
9b5e383c
ED
6829 */
6830void unregister_netdevice_many(struct list_head *head)
6831{
6832 struct net_device *dev;
6833
6834 if (!list_empty(head)) {
6835 rollback_registered_many(head);
6836 list_for_each_entry(dev, head, unreg_list)
6837 net_set_todo(dev);
87757a91 6838 list_del(head);
9b5e383c
ED
6839 }
6840}
63c8099d 6841EXPORT_SYMBOL(unregister_netdevice_many);
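
/*
 * Editorial example (not part of dev.c): batching the teardown of several
 * devices so the expensive synchronize_net()/rcu_barrier() work in
 * rollback_registered_many() is paid once per batch rather than per
 * device. example_destroy_all() and the devs array are hypothetical;
 * RTNL must be held.
 */
static void example_destroy_all(struct net_device **devs, int count)
{
        LIST_HEAD(kill_list);
        int i;

        ASSERT_RTNL();

        for (i = 0; i < count; i++)
                unregister_netdevice_queue(devs[i], &kill_list);

        /* One notifier/RCU round-trip for the whole batch. */
        unregister_netdevice_many(&kill_list);
}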
9b5e383c 6842
1da177e4
LT
6843/**
6844 * unregister_netdev - remove device from the kernel
6845 * @dev: device
6846 *
6847 * This function shuts down a device interface and removes it
d59b54b1 6848 * from the kernel tables.
1da177e4
LT
6849 *
6850 * This is just a wrapper for unregister_netdevice that takes
6851 * the rtnl semaphore. In general you want to use this and not
6852 * unregister_netdevice.
6853 */
6854void unregister_netdev(struct net_device *dev)
6855{
6856 rtnl_lock();
6857 unregister_netdevice(dev);
6858 rtnl_unlock();
6859}
1da177e4
LT
6860EXPORT_SYMBOL(unregister_netdev);
6861
ce286d32
EB
6862/**
6863 * dev_change_net_namespace - move device to a different network namespace
6864 * @dev: device
6865 * @net: network namespace
6866 * @pat: If not NULL name pattern to try if the current device name
6867 * is already taken in the destination network namespace.
6868 *
6869 * This function shuts down a device interface and moves it
6870 * to a new network namespace. On success 0 is returned, on
6871 * a failure a negative errno code is returned.
6872 *
6873 * Callers must hold the rtnl semaphore.
6874 */
6875
6876int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6877{
ce286d32
EB
6878 int err;
6879
6880 ASSERT_RTNL();
6881
6882 /* Don't allow namespace local devices to be moved. */
6883 err = -EINVAL;
6884 if (dev->features & NETIF_F_NETNS_LOCAL)
6885 goto out;
6886
6887 /* Ensure the device has been registered */
ce286d32
EB
6888 if (dev->reg_state != NETREG_REGISTERED)
6889 goto out;
6890
6891 /* Get out if there is nothing to do */
6892 err = 0;
878628fb 6893 if (net_eq(dev_net(dev), net))
ce286d32
EB
6894 goto out;
6895
6896 /* Pick the destination device name, and ensure
6897 * we can use it in the destination network namespace.
6898 */
6899 err = -EEXIST;
d9031024 6900 if (__dev_get_by_name(net, dev->name)) {
ce286d32
EB
6901 /* We get here if we can't use the current device name */
6902 if (!pat)
6903 goto out;
828de4f6 6904 if (dev_get_valid_name(net, dev, pat) < 0)
ce286d32
EB
6905 goto out;
6906 }
6907
6908 /*
6909 * And now a mini version of register_netdevice() and unregister_netdevice().
6910 */
6911
6912 /* If device is running close it first. */
9b772652 6913 dev_close(dev);
ce286d32
EB
6914
6915 /* And unlink it from device chain */
6916 err = -ENODEV;
6917 unlist_netdevice(dev);
6918
6919 synchronize_net();
6920
6921 /* Shutdown queueing discipline. */
6922 dev_shutdown(dev);
6923
6924 /* Notify protocols that we are about to destroy
6925 this device. They should clean up all of their state.
3b27e105
DL
6926
6927 Note that dev->reg_state stays at NETREG_REGISTERED.
6928 This is wanted because this way 8021q and macvlan know
6929 the device is just moving and can keep their slaves up.
ce286d32
EB
6930 */
6931 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6549dd43
G
6932 rcu_barrier();
6933 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7f294054 6934 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
ce286d32
EB
6935
6936 /*
6937 * Flush the unicast and multicast chains
6938 */
a748ee24 6939 dev_uc_flush(dev);
22bedad3 6940 dev_mc_flush(dev);
ce286d32 6941
4e66ae2e
SH
6942 /* Send a netdev-removed uevent to the old namespace */
6943 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
4c75431a 6944 netdev_adjacent_del_links(dev);
4e66ae2e 6945
ce286d32 6946 /* Actually switch the network namespace */
c346dca1 6947 dev_net_set(dev, net);
ce286d32 6948
ce286d32
EB
6949 /* If there is an ifindex conflict assign a new one */
6950 if (__dev_get_by_index(net, dev->ifindex)) {
6951 int iflink = (dev->iflink == dev->ifindex);
6952 dev->ifindex = dev_new_index(net);
6953 if (iflink)
6954 dev->iflink = dev->ifindex;
6955 }
6956
4e66ae2e
SH
6957 /* Send a netdev-add uevent to the new namespace */
6958 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
4c75431a 6959 netdev_adjacent_add_links(dev);
4e66ae2e 6960
8b41d188 6961 /* Fixup kobjects */
a1b3f594 6962 err = device_rename(&dev->dev, dev->name);
8b41d188 6963 WARN_ON(err);
ce286d32
EB
6964
6965 /* Add the device back in the hashes */
6966 list_netdevice(dev);
6967
6968 /* Notify protocols, that a new device appeared. */
6969 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6970
d90a909e
EB
6971 /*
6972 * Prevent userspace races by waiting until the network
6973 * device is fully setup before sending notifications.
6974 */
7f294054 6975 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
d90a909e 6976
ce286d32
EB
6977 synchronize_net();
6978 err = 0;
6979out:
6980 return err;
6981}
463d0183 6982EXPORT_SYMBOL_GPL(dev_change_net_namespace);
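
/*
 * Editorial example (not part of dev.c): moving a device into another
 * namespace with a fallback name pattern, roughly what the rtnetlink
 * IFLA_NET_NS_PID/IFLA_NET_NS_FD handling ends up doing. example_move()
 * is hypothetical; the caller is assumed to already hold a reference on
 * @net.
 */
static int example_move(struct net_device *dev, struct net *net)
{
        int err;

        rtnl_lock();
        /* Fall back to "dev%d" if dev->name is taken in the target netns. */
        err = dev_change_net_namespace(dev, net, "dev%d");
        rtnl_unlock();

        return err;
}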
ce286d32 6983
1da177e4
LT
6984static int dev_cpu_callback(struct notifier_block *nfb,
6985 unsigned long action,
6986 void *ocpu)
6987{
6988 struct sk_buff **list_skb;
1da177e4
LT
6989 struct sk_buff *skb;
6990 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6991 struct softnet_data *sd, *oldsd;
6992
8bb78442 6993 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1da177e4
LT
6994 return NOTIFY_OK;
6995
6996 local_irq_disable();
6997 cpu = smp_processor_id();
6998 sd = &per_cpu(softnet_data, cpu);
6999 oldsd = &per_cpu(softnet_data, oldcpu);
7000
7001 /* Find end of our completion_queue. */
7002 list_skb = &sd->completion_queue;
7003 while (*list_skb)
7004 list_skb = &(*list_skb)->next;
7005 /* Append completion queue from offline CPU. */
7006 *list_skb = oldsd->completion_queue;
7007 oldsd->completion_queue = NULL;
7008
1da177e4 7009 /* Append output queue from offline CPU. */
a9cbd588
CG
7010 if (oldsd->output_queue) {
7011 *sd->output_queue_tailp = oldsd->output_queue;
7012 sd->output_queue_tailp = oldsd->output_queue_tailp;
7013 oldsd->output_queue = NULL;
7014 oldsd->output_queue_tailp = &oldsd->output_queue;
7015 }
264524d5
HC
7016 /* Append NAPI poll list from offline CPU. */
7017 if (!list_empty(&oldsd->poll_list)) {
7018 list_splice_init(&oldsd->poll_list, &sd->poll_list);
7019 raise_softirq_irqoff(NET_RX_SOFTIRQ);
7020 }
1da177e4
LT
7021
7022 raise_softirq_irqoff(NET_TX_SOFTIRQ);
7023 local_irq_enable();
7024
7025 /* Process offline CPU's input_pkt_queue */
76cc8b13 7026 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
ae78dbfa 7027 netif_rx_internal(skb);
76cc8b13 7028 input_queue_head_incr(oldsd);
fec5e652 7029 }
76cc8b13 7030 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
ae78dbfa 7031 netif_rx_internal(skb);
76cc8b13
TH
7032 input_queue_head_incr(oldsd);
7033 }
1da177e4
LT
7034
7035 return NOTIFY_OK;
7036}
1da177e4
LT
7037
7038
7f353bf2 7039/**
b63365a2
HX
7040 * netdev_increment_features - increment feature set by one
7041 * @all: current feature set
7042 * @one: new feature set
 7043 * @mask: mask limiting which features may be enabled
7f353bf2
HX
7044 *
7045 * Computes a new feature set after adding a device with feature set
b63365a2
HX
7046 * @one to the master device with current feature set @all. Will not
7047 * enable anything that is off in @mask. Returns the new feature set.
7f353bf2 7048 */
c8f44aff
MM
7049netdev_features_t netdev_increment_features(netdev_features_t all,
7050 netdev_features_t one, netdev_features_t mask)
b63365a2 7051{
1742f183
MM
7052 if (mask & NETIF_F_GEN_CSUM)
7053 mask |= NETIF_F_ALL_CSUM;
7054 mask |= NETIF_F_VLAN_CHALLENGED;
7f353bf2 7055
1742f183
MM
7056 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7057 all &= one | ~NETIF_F_ALL_FOR_ALL;
c6e1a0d1 7058
1742f183
MM
7059 /* If one device supports hw checksumming, set for all. */
7060 if (all & NETIF_F_GEN_CSUM)
7061 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7f353bf2
HX
7062
7063 return all;
7064}
b63365a2 7065EXPORT_SYMBOL(netdev_increment_features);
7f353bf2 7066
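/*
 * Illustrative sketch, not part of dev.c: how an aggregating driver
 * (bonding and team are the real users) might fold slave feature sets
 * into a master's.  Seeding the accumulator with NETIF_F_ALL_FOR_ALL so
 * the first slave defines the initial set is an assumption for this
 * example; real drivers iterate their own slave lists with their own
 * starting masks.  "example_master_features" is hypothetical.
 */
static netdev_features_t example_master_features(const struct net_device *s1,
						 const struct net_device *s2,
						 netdev_features_t mask)
{
	netdev_features_t all = NETIF_F_ALL_FOR_ALL;

	all = netdev_increment_features(all, s1->features, mask);
	all = netdev_increment_features(all, s2->features, mask);
	return all;
}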
430f03cd 7067static struct hlist_head * __net_init netdev_create_hash(void)
30d97d35
PE
7068{
7069 int i;
7070 struct hlist_head *hash;
7071
7072 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7073 if (hash != NULL)
7074 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7075 INIT_HLIST_HEAD(&hash[i]);
7076
7077 return hash;
7078}
7079
881d966b 7080/* Initialize per network namespace state */
4665079c 7081static int __net_init netdev_init(struct net *net)
881d966b 7082{
734b6541
RM
7083 if (net != &init_net)
7084 INIT_LIST_HEAD(&net->dev_base_head);
881d966b 7085
30d97d35
PE
7086 net->dev_name_head = netdev_create_hash();
7087 if (net->dev_name_head == NULL)
7088 goto err_name;
881d966b 7089
30d97d35
PE
7090 net->dev_index_head = netdev_create_hash();
7091 if (net->dev_index_head == NULL)
7092 goto err_idx;
881d966b
EB
7093
7094 return 0;
30d97d35
PE
7095
7096err_idx:
7097 kfree(net->dev_name_head);
7098err_name:
7099 return -ENOMEM;
881d966b
EB
7100}
7101
f0db275a
SH
7102/**
7103 * netdev_drivername - network driver for the device
7104 * @dev: network device
f0db275a
SH
7105 *
 7106 * Determine the name of the network driver bound to @dev, or an empty string if unknown.
7107 */
3019de12 7108const char *netdev_drivername(const struct net_device *dev)
6579e57b 7109{
cf04a4c7
SH
7110 const struct device_driver *driver;
7111 const struct device *parent;
3019de12 7112 const char *empty = "";
6579e57b
AV
7113
7114 parent = dev->dev.parent;
6579e57b 7115 if (!parent)
3019de12 7116 return empty;
6579e57b
AV
7117
7118 driver = parent->driver;
7119 if (driver && driver->name)
3019de12
DM
7120 return driver->name;
7121 return empty;
6579e57b
AV
7122}
7123
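/*
 * Illustrative sketch, not part of dev.c: netdev_drivername() is meant for
 * diagnostics; the tx watchdog in net/sched/sch_generic.c prints it when a
 * queue times out.  "example_report_stall" below is hypothetical.
 */
static void example_report_stall(struct net_device *dev, unsigned int txq)
{
	netdev_warn(dev, "tx queue %u appears stalled (driver: %s)\n",
		    txq, netdev_drivername(dev));
}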
6ea754eb
JP
7124static void __netdev_printk(const char *level, const struct net_device *dev,
7125 struct va_format *vaf)
256df2f3 7126{
b004ff49 7127 if (dev && dev->dev.parent) {
6ea754eb
JP
7128 dev_printk_emit(level[1] - '0',
7129 dev->dev.parent,
7130 "%s %s %s%s: %pV",
7131 dev_driver_string(dev->dev.parent),
7132 dev_name(dev->dev.parent),
7133 netdev_name(dev), netdev_reg_state(dev),
7134 vaf);
b004ff49 7135 } else if (dev) {
6ea754eb
JP
7136 printk("%s%s%s: %pV",
7137 level, netdev_name(dev), netdev_reg_state(dev), vaf);
b004ff49 7138 } else {
6ea754eb 7139 printk("%s(NULL net_device): %pV", level, vaf);
b004ff49 7140 }
256df2f3
JP
7141}
7142
6ea754eb
JP
7143void netdev_printk(const char *level, const struct net_device *dev,
7144 const char *format, ...)
256df2f3
JP
7145{
7146 struct va_format vaf;
7147 va_list args;
256df2f3
JP
7148
7149 va_start(args, format);
7150
7151 vaf.fmt = format;
7152 vaf.va = &args;
7153
6ea754eb 7154 __netdev_printk(level, dev, &vaf);
b004ff49 7155
256df2f3 7156 va_end(args);
256df2f3
JP
7157}
7158EXPORT_SYMBOL(netdev_printk);
7159
7160#define define_netdev_printk_level(func, level) \
6ea754eb 7161void func(const struct net_device *dev, const char *fmt, ...) \
256df2f3 7162{ \
256df2f3
JP
7163 struct va_format vaf; \
7164 va_list args; \
7165 \
7166 va_start(args, fmt); \
7167 \
7168 vaf.fmt = fmt; \
7169 vaf.va = &args; \
7170 \
6ea754eb 7171 __netdev_printk(level, dev, &vaf); \
b004ff49 7172 \
256df2f3 7173 va_end(args); \
256df2f3
JP
7174} \
7175EXPORT_SYMBOL(func);
7176
7177define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7178define_netdev_printk_level(netdev_alert, KERN_ALERT);
7179define_netdev_printk_level(netdev_crit, KERN_CRIT);
7180define_netdev_printk_level(netdev_err, KERN_ERR);
7181define_netdev_printk_level(netdev_warn, KERN_WARNING);
7182define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7183define_netdev_printk_level(netdev_info, KERN_INFO);
7184
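/*
 * Illustrative note, not part of dev.c: every helper generated above shares
 * the same calling convention, e.g.
 *
 *	netdev_err(dev, "reset failed with %d\n", err);
 *
 * and routes through __netdev_printk(), which prefixes the message with the
 * driver name, bus id and netdev name whenever dev->dev.parent is set.
 */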
4665079c 7185static void __net_exit netdev_exit(struct net *net)
881d966b
EB
7186{
7187 kfree(net->dev_name_head);
7188 kfree(net->dev_index_head);
7189}
7190
022cbae6 7191static struct pernet_operations __net_initdata netdev_net_ops = {
881d966b
EB
7192 .init = netdev_init,
7193 .exit = netdev_exit,
7194};
7195
4665079c 7196static void __net_exit default_device_exit(struct net *net)
ce286d32 7197{
e008b5fc 7198 struct net_device *dev, *aux;
ce286d32 7199 /*
e008b5fc 7200 * Push all migratable network devices back to the
ce286d32
EB
7201 * initial network namespace
7202 */
7203 rtnl_lock();
e008b5fc 7204 for_each_netdev_safe(net, dev, aux) {
ce286d32 7205 int err;
aca51397 7206 char fb_name[IFNAMSIZ];
ce286d32
EB
7207
 7208 /* Ignore unmovable devices (e.g. loopback) */
7209 if (dev->features & NETIF_F_NETNS_LOCAL)
7210 continue;
7211
e008b5fc
EB
7212 /* Leave virtual devices for the generic cleanup */
7213 if (dev->rtnl_link_ops)
7214 continue;
d0c082ce 7215
25985edc 7216 /* Push remaining network devices to init_net */
aca51397
PE
7217 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7218 err = dev_change_net_namespace(dev, &init_net, fb_name);
ce286d32 7219 if (err) {
7b6cd1ce
JP
7220 pr_emerg("%s: failed to move %s to init_net: %d\n",
7221 __func__, dev->name, err);
aca51397 7222 BUG();
ce286d32
EB
7223 }
7224 }
7225 rtnl_unlock();
7226}
7227
50624c93
EB
7228static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7229{
7230 /* Return with the rtnl_lock held when there are no network
7231 * devices unregistering in any network namespace in net_list.
7232 */
7233 struct net *net;
7234 bool unregistering;
7235 DEFINE_WAIT(wait);
7236
7237 for (;;) {
7238 prepare_to_wait(&netdev_unregistering_wq, &wait,
7239 TASK_UNINTERRUPTIBLE);
7240 unregistering = false;
7241 rtnl_lock();
7242 list_for_each_entry(net, net_list, exit_list) {
7243 if (net->dev_unreg_count > 0) {
7244 unregistering = true;
7245 break;
7246 }
7247 }
7248 if (!unregistering)
7249 break;
7250 __rtnl_unlock();
7251 schedule();
7252 }
7253 finish_wait(&netdev_unregistering_wq, &wait);
7254}
7255
04dc7f6b
EB
7256static void __net_exit default_device_exit_batch(struct list_head *net_list)
7257{
 7258 /* At exit, all network devices must be removed from a network
b595076a 7259 * namespace. Do this in the reverse order of registration.
04dc7f6b
EB
7260 * Do this across as many network namespaces as possible to
7261 * improve batching efficiency.
7262 */
7263 struct net_device *dev;
7264 struct net *net;
7265 LIST_HEAD(dev_kill_list);
7266
50624c93
EB
7267 /* To prevent network device cleanup code from dereferencing
 7268 * loopback devices or network devices that have been freed,
 7269 * wait here for all pending unregistrations to complete
 7270 * before unregistering the loopback device and allowing the
 7271 * network namespace to be freed.
7272 *
 7273 * The netdev todo list containing all network device
7274 * unregistrations that happen in default_device_exit_batch
7275 * will run in the rtnl_unlock() at the end of
7276 * default_device_exit_batch.
7277 */
7278 rtnl_lock_unregistering(net_list);
04dc7f6b
EB
7279 list_for_each_entry(net, net_list, exit_list) {
7280 for_each_netdev_reverse(net, dev) {
b0ab2fab 7281 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
04dc7f6b
EB
7282 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7283 else
7284 unregister_netdevice_queue(dev, &dev_kill_list);
7285 }
7286 }
7287 unregister_netdevice_many(&dev_kill_list);
7288 rtnl_unlock();
7289}
7290
022cbae6 7291static struct pernet_operations __net_initdata default_device_ops = {
ce286d32 7292 .exit = default_device_exit,
04dc7f6b 7293 .exit_batch = default_device_exit_batch,
ce286d32
EB
7294};
7295
1da177e4
LT
7296/*
7297 * Initialize the DEV module. At boot time this walks the device list and
 7298 * unhooks any devices that fail to initialize (normally hardware not
7299 * present) and leaves us with a valid list of present and active devices.
7300 *
7301 */
7302
7303/*
7304 * This is called single threaded during boot, so no need
7305 * to take the rtnl semaphore.
7306 */
7307static int __init net_dev_init(void)
7308{
7309 int i, rc = -ENOMEM;
7310
7311 BUG_ON(!dev_boot_phase);
7312
1da177e4
LT
7313 if (dev_proc_init())
7314 goto out;
7315
8b41d188 7316 if (netdev_kobject_init())
1da177e4
LT
7317 goto out;
7318
7319 INIT_LIST_HEAD(&ptype_all);
82d8a867 7320 for (i = 0; i < PTYPE_HASH_SIZE; i++)
1da177e4
LT
7321 INIT_LIST_HEAD(&ptype_base[i]);
7322
62532da9
VY
7323 INIT_LIST_HEAD(&offload_base);
7324
881d966b
EB
7325 if (register_pernet_subsys(&netdev_net_ops))
7326 goto out;
1da177e4
LT
7327
7328 /*
7329 * Initialise the packet receive queues.
7330 */
7331
6f912042 7332 for_each_possible_cpu(i) {
e36fa2f7 7333 struct softnet_data *sd = &per_cpu(softnet_data, i);
1da177e4 7334
e36fa2f7 7335 skb_queue_head_init(&sd->input_pkt_queue);
6e7676c1 7336 skb_queue_head_init(&sd->process_queue);
e36fa2f7 7337 INIT_LIST_HEAD(&sd->poll_list);
a9cbd588 7338 sd->output_queue_tailp = &sd->output_queue;
df334545 7339#ifdef CONFIG_RPS
e36fa2f7
ED
7340 sd->csd.func = rps_trigger_softirq;
7341 sd->csd.info = sd;
e36fa2f7 7342 sd->cpu = i;
1e94d72f 7343#endif
0a9627f2 7344
e36fa2f7
ED
7345 sd->backlog.poll = process_backlog;
7346 sd->backlog.weight = weight_p;
1da177e4
LT
7347 }
7348
1da177e4
LT
7349 dev_boot_phase = 0;
7350
505d4f73
EB
 7351 /* The loopback device is special: if any other network device
 7352 * is present in a network namespace, the loopback device must
 7353 * be present too. Since we now dynamically allocate and free the
 7354 * loopback device, ensure this invariant is maintained by
 7355 * keeping the loopback device the first device on the
 7356 * list of network devices, so that the loopback device
 7357 * is the first device that appears and the last network device
 7358 * that disappears.
7359 */
7360 if (register_pernet_device(&loopback_net_ops))
7361 goto out;
7362
7363 if (register_pernet_device(&default_device_ops))
7364 goto out;
7365
962cf36c
CM
7366 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7367 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
1da177e4
LT
7368
7369 hotcpu_notifier(dev_cpu_callback, 0);
7370 dst_init();
1da177e4
LT
7371 rc = 0;
7372out:
7373 return rc;
7374}
7375
7376subsys_initcall(net_dev_init);