/* net/core/dev.c */
/*
 * NET3 Protocol independent device support routines.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Derived from the non IP parts of dev.c 1.0.19
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 * Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 * Changes:
 *	D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *				to 2 if register_netdev gets called
 *				before net_dev_init & also removed a
 *				few lines of code in the process.
 *	Alan Cox	:	device private ioctl copies fields back.
 *	Alan Cox	:	Transmit queue code does relevant
 *				stunts to keep the queue safe.
 *	Alan Cox	:	Fixed double lock.
 *	Alan Cox	:	Fixed promisc NULL pointer trap
 *	????????	:	Support the full private ioctl range
 *	Alan Cox	:	Moved ioctl permission check into
 *				drivers
 *	Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *	Alan Cox	:	100 backlog just doesn't cut it when
 *				you start doing multicast video 8)
 *	Alan Cox	:	Rewrote net_bh and list manager.
 *	Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *	Alan Cox	:	Took out transmit every packet pass
 *				Saved a few bytes in the ioctl handler
 *	Alan Cox	:	Network driver sets packet type before
 *				calling netif_rx. Saves a function
 *				call a packet.
 *	Alan Cox	:	Hashed net_bh()
 *	Richard Kooijman:	Timestamp fixes.
 *	Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *	Alan Cox	:	Device lock protection.
 *	Alan Cox	:	Fixed nasty side effect of device close
 *				changes.
 *	Rudi Cilibrasi	:	Pass the right thing to
 *				set_mac_address()
 *	Dave Miller	:	32bit quantity for the device lock to
 *				make it work out on a Sparc.
 *	Bjorn Ekwall	:	Added KERNELD hack.
 *	Alan Cox	:	Cleaned up the backlog initialise.
 *	Craig Metz	:	SIOCGIFCONF fix if space for under
 *				1 device.
 *	Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *				is no device open function.
 *	Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *	Michael Chastain:	Fix signed/unsigned for SIOCGIFCONF
 *	Cyrus Durgin	:	Cleaned for KMOD
 *	Adam Sulmicki	:	Bug Fix : Network Device Unload
 *				A network device unload needs to purge
 *				the backlog queue.
 *	Paul Rusty Russell :	SIOCSIFNAME
 *	Pekka Riikonen	:	Netdev boot-time settings code
 *	Andrew Morton	:	Make unregister_netdevice wait
 *				indefinitely on dev->refcnt
 *	J Hadi Salim	:	- Backlog queue sampling
 *				- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info);

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id;
static DEFINE_HASHTABLE(napi_hash, 8);

static seqcount_t devnet_rename_seq;

static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

/*
 * Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 * Device drivers call our routines to queue packets here. We empty the
 * queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 * Add a protocol ID to the list. Now that the input handler is
 * smarter we can dispense with all the messy stuff that used to be
 * here.
 *
 * BEWARE!!! Protocol handlers, mangling input packets,
 * MUST BE last in hash buckets and checking protocol handlers
 * MUST start from promiscuous ptype_all chain in net_bh.
 * It is true now, do not change it.
 * Explanation follows: if protocol handler, mangling packet, will
 * be the first on list, it is not able to sense, that packet
 * is cloned and should be copied-on-write, so that it will
 * change it and subsequent readers will get broken packet.
 * --ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 * dev_add_pack - add packet handler
 * @pt: packet type declaration
 *
 * Add a protocol handler to the networking stack. The passed &packet_type
 * is linked into kernel lists and may not be freed until it has been
 * removed from the kernel lists.
 *
 * This call does not sleep therefore it can not
 * guarantee all CPU's that are in middle of receiving packets
 * will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

/**
 * __dev_remove_pack - remove packet handler
 * @pt: packet type declaration
 *
 * Remove a protocol handler that was previously added to the kernel
 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
 * from the kernel lists and can be freed or reused once this function
 * returns.
 *
 * The packet type might still be in use by receivers
 * and must not be freed until after all the CPU's have gone
 * through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 * dev_remove_pack - remove packet handler
 * @pt: packet type declaration
 *
 * Remove a protocol handler that was previously added to the kernel
 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
 * from the kernel lists and can be freed or reused once this function
 * returns.
 *
 * This call sleeps to guarantee that no CPU is looking at the packet
 * type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);

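/*
 * Illustrative sketch (not part of dev.c): a minimal ETH_P_ALL tap built on
 * dev_add_pack()/dev_remove_pack() above.  The names example_tap_rcv and
 * example_tap are hypothetical; a real user would live in its own module.
 */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* The handler owns this reference; inspect and then free it. */
	pr_debug("tap: %u bytes on %s\n", skb->len, dev->name);
	kfree_skb(skb);
	return 0;
}

static struct packet_type example_tap __read_mostly = {
	.type = htons(ETH_P_ALL),	/* every protocol */
	.dev  = NULL,			/* every device */
	.func = example_tap_rcv,
};

/* Register with dev_add_pack(&example_tap); tear down with
 * dev_remove_pack(&example_tap), which sleeps in synchronize_net().
 */
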
/**
 * dev_add_offload - register offload handlers
 * @po: protocol offload declaration
 *
 * Add protocol offload handlers to the networking stack. The passed
 * &proto_offload is linked into kernel lists and may not be freed until
 * it has been removed from the kernel lists.
 *
 * This call does not sleep therefore it can not
 * guarantee all CPU's that are in middle of receiving packets
 * will see the new offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct packet_offload *elem;

	spin_lock(&offload_lock);
	list_for_each_entry(elem, &offload_base, list) {
		if (po->priority < elem->priority)
			break;
	}
	list_add_rcu(&po->list, elem->list.prev);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 * __dev_remove_offload - remove offload handler
 * @po: packet offload declaration
 *
 * Remove a protocol offload handler that was previously added to the
 * kernel offload handlers by dev_add_offload(). The passed &offload_type
 * is removed from the kernel lists and can be freed or reused once this
 * function returns.
 *
 * The packet type might still be in use by receivers
 * and must not be freed until after all the CPU's have gone
 * through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}

/**
 * dev_remove_offload - remove packet offload handler
 * @po: packet offload declaration
 *
 * Remove a packet offload handler that was previously added to the kernel
 * offload handlers by dev_add_offload(). The passed &offload_type is
 * removed from the kernel lists and can be freed or reused once this
 * function returns.
 *
 * This call sleeps to guarantee that no CPU is looking at the packet
 * type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 * netdev_boot_setup_add - add new setup entry
 * @name: name of the device
 * @map: configured settings for the device
 *
 * Adds new setup entry to the dev_boot_setup list. The function
 * returns 0 on error and 1 on success. This is a generic routine to
 * all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 * netdev_boot_setup_check - check boot time settings
 * @dev: the netdevice
 *
 * Check boot time settings for the device.
 * The found settings are set for the device to be used
 * later in the device probing.
 * Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq = s[i].map.irq;
			dev->base_addr = s[i].map.base_addr;
			dev->mem_start = s[i].map.mem_start;
			dev->mem_end = s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 * netdev_boot_base - get address from boot time settings
 * @prefix: prefix for network device
 * @unit: id for network device
 *
 * Check boot time settings for the base address of device.
 * The found settings are set for the device to be used
 * later in the device probing.
 * Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

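/*
 * Illustrative usage sketch (not part of dev.c): with the parser above, a
 * kernel command line such as
 *
 *	netdev=9,0x300,0xd0000,0xd4000,eth0
 *
 * stores irq=9, base_addr=0x300 and the two memory bounds for "eth0";
 * netdev_boot_setup_check() then applies them when that device probes.
 * The concrete values here are made up for the example.
 */
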
/*******************************************************************************

			Device Interface Subroutines

*******************************************************************************/

/**
 * dev_get_iflink - get 'iflink' value of an interface
 * @dev: targeted interface
 *
 * Indicates the ifindex the interface is linked to.
 * Physical interfaces have the same 'ifindex' and 'iflink' values.
 */

int dev_get_iflink(const struct net_device *dev)
{
	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
		return dev->netdev_ops->ndo_get_iflink(dev);

	/* If dev->rtnl_link_ops is set, it's a virtual interface. */
	if (dev->rtnl_link_ops)
		return 0;

	return dev->ifindex;
}
EXPORT_SYMBOL(dev_get_iflink);

/**
 * __dev_get_by_name - find a device by its name
 * @net: the applicable net namespace
 * @name: name to find
 *
 * Find an interface by name. Must be called under RTNL semaphore
 * or @dev_base_lock. If the name is found a pointer to the device
 * is returned. If the name is not found then %NULL is returned. The
 * reference counters are not incremented so the caller must be
 * careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 * dev_get_by_name_rcu - find a device by its name
 * @net: the applicable net namespace
 * @name: name to find
 *
 * Find an interface by name.
 * If the name is found a pointer to the device is returned.
 * If the name is not found then %NULL is returned.
 * The reference counters are not incremented so the caller must be
 * careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 * dev_get_by_name - find a device by its name
 * @net: the applicable net namespace
 * @name: name to find
 *
 * Find an interface by name. This can be called from any
 * context and does its own locking. The returned handle has
 * the usage count incremented and the caller must use dev_put() to
 * release it when it is no longer needed. %NULL is returned if no
 * matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);

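/*
 * Illustrative sketch (not part of dev.c): the two lookup styles above.
 * The name "eth0" and example_name_lookup() are hypothetical.
 */
static void example_name_lookup(struct net *net)
{
	struct net_device *dev;

	/* Short-lived access: RCU lookup, no reference taken. */
	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, "eth0");
	if (dev)
		pr_info("%s has ifindex %d\n", dev->name, dev->ifindex);
	rcu_read_unlock();

	/* Longer-lived access: refcounted lookup, balanced by dev_put(). */
	dev = dev_get_by_name(net, "eth0");
	if (dev) {
		/* ... use dev ... */
		dev_put(dev);
	}
}
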
/**
 * __dev_get_by_index - find a device by its ifindex
 * @net: the applicable net namespace
 * @ifindex: index of device
 *
 * Search for an interface by index. Returns %NULL if the device
 * is not found or a pointer to the device. The device has not
 * had its reference counter increased so the caller must be careful
 * about locking. The caller must hold either the RTNL semaphore
 * or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 * dev_get_by_index_rcu - find a device by its ifindex
 * @net: the applicable net namespace
 * @ifindex: index of device
 *
 * Search for an interface by index. Returns %NULL if the device
 * is not found or a pointer to the device. The device has not
 * had its reference counter increased so the caller must be careful
 * about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 * dev_get_by_index - find a device by its ifindex
 * @net: the applicable net namespace
 * @ifindex: index of device
 *
 * Search for an interface by index. Returns NULL if the device
 * is not found or a pointer to the device. The device returned has
 * had a reference added and the pointer is safe until the user calls
 * dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 * netdev_get_name - get a netdevice name, knowing its ifindex.
 * @net: network namespace
 * @name: a pointer to the buffer where the name will be stored.
 * @ifindex: the ifindex of the interface to get the name from.
 *
 * The use of raw_seqcount_begin() and cond_resched() before
 * retrying is required as we want to give the writers a chance
 * to complete when CONFIG_PREEMPT is not set.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	unsigned int seq;

retry:
	seq = raw_seqcount_begin(&devnet_rename_seq);
	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (!dev) {
		rcu_read_unlock();
		return -ENODEV;
	}

	strcpy(name, dev->name);
	rcu_read_unlock();
	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
		cond_resched();
		goto retry;
	}

	return 0;
}

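/*
 * Illustrative sketch (not part of dev.c): resolving an ifindex to a name
 * under RCU without taking a reference.  example_ifindex_to_name() is
 * hypothetical; netdev_get_name() above does the same job with protection
 * against concurrent renames.
 */
static int example_ifindex_to_name(struct net *net, int ifindex, char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		strlcpy(name, dev->name, IFNAMSIZ);
	rcu_read_unlock();

	return dev ? 0 : -ENODEV;
}
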
/**
 * dev_getbyhwaddr_rcu - find a device by its hardware address
 * @net: the applicable net namespace
 * @type: media type of device
 * @ha: hardware address
 *
 * Search for an interface by MAC address. Returns NULL if the device
 * is not found or a pointer to the device.
 * The caller must hold RCU or RTNL.
 * The returned device has not had its ref count increased
 * and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

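/*
 * Illustrative sketch (not part of dev.c): finding an Ethernet device by
 * MAC address with dev_getbyhwaddr_rcu().  The address bytes and
 * example_find_by_mac() are hypothetical.
 */
static struct net_device *example_find_by_mac(struct net *net)
{
	static const char mac[ETH_ALEN] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
	if (dev)
		dev_hold(dev);	/* take a reference before leaving the RCU section */
	rcu_read_unlock();

	return dev;		/* caller must dev_put() a non-NULL result */
}
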
/**
 * __dev_get_by_flags - find any device with given flags
 * @net: the applicable net namespace
 * @if_flags: IFF_* values
 * @mask: bitmask of bits in if_flags to check
 *
 * Search for any interface with the given flags. Returns NULL if a device
 * is not found or a pointer to the device. Must be called inside
 * rtnl_lock(), and result refcount is unchanged.
 */

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
				      unsigned short mask)
{
	struct net_device *dev, *ret;

	ASSERT_RTNL();

	ret = NULL;
	for_each_netdev(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);

/**
 * dev_valid_name - check if name is okay for network device
 * @name: name string
 *
 * Network device names need to be valid file names to
 * allow sysfs to work. We also disallow any kind of
 * whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strlen(name) >= IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || *name == ':' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);

/**
 * __dev_alloc_name - allocate a name for a device
 * @net: network namespace to allocate the device name in
 * @name: name format string
 * @buf: scratch buffer and result name string
 *
 * Passed a format string - eg "lt%d" it will try and find a suitable
 * id. It scans list of devices to build up a free map, then chooses
 * the first empty slot. The caller must hold the dev_base or rtnl lock
 * while allocating the name and adding the device in order to avoid
 * duplicates.
 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 * Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 * dev_alloc_name - allocate a name for a device
 * @dev: device
 * @name: name format string
 *
 * Passed a format string - eg "lt%d" it will try and find a suitable
 * id. It scans list of devices to build up a free map, then chooses
 * the first empty slot. The caller must hold the dev_base or rtnl lock
 * while allocating the name and adding the device in order to avoid
 * duplicates.
 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 * Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);

static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

static int dev_get_valid_name(struct net *net,
			      struct net_device *dev,
			      const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}

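/*
 * Illustrative sketch (not part of dev.c): resolving a "%d" format name with
 * dev_alloc_name() above for a freshly allocated device.  "vnet%d" and
 * example_create() are hypothetical; error handling is reduced to a sketch.
 */
static struct net_device *example_create(void)
{
	struct net_device *dev;
	int unit;

	dev = alloc_netdev(0, "vnet%d", NET_NAME_ENUM, ether_setup);
	if (!dev)
		return NULL;

	rtnl_lock();
	unit = dev_alloc_name(dev, dev->name);	/* picks the first free vnet<N> */
	if (unit < 0) {
		rtnl_unlock();
		free_netdev(dev);
		return NULL;
	}
	/* register_netdevice(dev) would normally follow under the same lock. */
	rtnl_unlock();

	return dev;
}
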
/**
 * dev_change_name - change name of a device
 * @dev: device
 * @newname: name (or format string) must be at least IFNAMSIZ
 *
 * Change name of a device, can pass format strings "eth%d".
 * for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	netdev_adjacent_rename_links(dev, oldname);

	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}

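/*
 * Illustrative sketch (not part of dev.c): renaming a device, optionally
 * with a "%d" wildcard.  Requires RTNL and a device that is down; "wan%d"
 * and example_rename() are hypothetical.
 */
static int example_rename(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_name(dev, "wan%d");	/* picks the first free wan<N> */
	rtnl_unlock();

	return err;
}
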
/**
 * dev_set_alias - change ifalias of a device
 * @dev: device
 * @alias: name up to IFALIASZ
 * @len: limit of bytes to copy from info
 *
 * Set ifalias for a device,
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	char *new_ifalias;

	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		kfree(dev->ifalias);
		dev->ifalias = NULL;
		return 0;
	}

	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!new_ifalias)
		return -ENOMEM;
	dev->ifalias = new_ifalias;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}


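/*
 * Illustrative sketch (not part of dev.c): attaching a descriptive alias
 * to a device (shown by userspace tools as "alias ...").  The string and
 * example_label() are hypothetical.
 */
static int example_label(struct net_device *dev)
{
	static const char text[] = "uplink to core switch";
	int ret;

	rtnl_lock();
	ret = dev_set_alias(dev, text, strlen(text));	/* returns length or -errno */
	rtnl_unlock();

	return ret < 0 ? ret : 0;
}
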
/**
 * netdev_features_change - device changes features
 * @dev: device to cause notification
 *
 * Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 * netdev_state_change - device changes state
 * @dev: device to cause notification
 *
 * Called to indicate a device has changed state. This function calls
 * the notifier chains for netdev_chain and sends a NEWLINK message
 * to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		struct netdev_notifier_change_info change_info;

		change_info.flags_changed = 0;
		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
					      &change_info.info);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
	}
}
EXPORT_SYMBOL(netdev_state_change);

/**
 * netdev_notify_peers - notify network peers about existence of @dev
 * @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	if (!netif_device_present(dev))
		return -ENODEV;

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	netpoll_poll_disable(dev);

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	netpoll_poll_enable(dev);

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		dev->flags |= IFF_UP;
		dev_set_rx_mode(dev);
		dev_activate(dev);
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}

/**
 * dev_open - prepare an interface for use.
 * @dev: device to open
 *
 * Takes a device from down to up state. The device's private open
 * function is invoked and then the multicast lists are loaded. Finally
 * the device is moved into the up state and a %NETDEV_UP message is
 * sent to the netdev notifier chain.
 *
 * Calling this function on an active interface is a nop. On a failure
 * a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);

static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 * Call the device specific close. This cannot fail.
		 * Only if device is UP
		 *
		 * We allow it to be called even after a DETACH hot-plug
		 * event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		netpoll_poll_enable(dev);
	}

	return 0;
}

static int __dev_close(struct net_device *dev)
{
	int retval;
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	retval = __dev_close_many(&single);
	list_del(&single);

	return retval;
}

int dev_close_many(struct list_head *head, bool unlink)
{
	struct net_device *dev, *tmp;

	/* Remove the devices that don't need to be closed */
	list_for_each_entry_safe(dev, tmp, head, close_list)
		if (!(dev->flags & IFF_UP))
			list_del_init(&dev->close_list);

	__dev_close_many(head);

	list_for_each_entry_safe(dev, tmp, head, close_list) {
		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
		call_netdevice_notifiers(NETDEV_DOWN, dev);
		if (unlink)
			list_del_init(&dev->close_list);
	}

	return 0;
}
EXPORT_SYMBOL(dev_close_many);

/**
 * dev_close - shutdown an interface.
 * @dev: device to shutdown
 *
 * This function moves an active device into down state. A
 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 * chain.
 */
int dev_close(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		LIST_HEAD(single);

		list_add(&dev->close_list, &single);
		dev_close_many(&single, true);
		list_del(&single);
	}
	return 0;
}
EXPORT_SYMBOL(dev_close);


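/*
 * Illustrative sketch (not part of dev.c): bouncing an interface from
 * kernel code.  Both calls must run under RTNL; example_bounce() is
 * hypothetical.
 */
static int example_bounce(struct net_device *dev)
{
	int err;

	rtnl_lock();
	dev_close(dev);			/* no-op if the device is already down */
	err = dev_open(dev);		/* no-op if it is already IFF_UP */
	rtnl_unlock();

	return err;
}
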
/**
 * dev_disable_lro - disable Large Receive Offload on a device
 * @dev: device
 *
 * Disable Large Receive Offload (LRO) on a net device. Must be
 * called under RTNL. This is needed if received packets may be
 * forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	struct net_device *lower_dev;
	struct list_head *iter;

	dev->wanted_features &= ~NETIF_F_LRO;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_LRO))
		netdev_WARN(dev, "failed to disable LRO!\n");

	netdev_for_each_lower_dev(dev, lower_dev, iter)
		dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);

static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info;

	netdev_notifier_info_init(&info, dev);
	return nb->notifier_call(nb, val, &info);
}

static int dev_boot_phase = 1;

/**
 * register_netdevice_notifier - register a network notifier block
 * @nb: notifier
 *
 * Register a notifier to be called when network device events occur.
 * The notifier passed is linked into the kernel structures and must
 * not be reused until it has been unregistered. A negative errno code
 * is returned on a failure.
 *
 * When registered all registration and up events are replayed
 * to the new notifier to allow device to have a race free
 * view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);

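/*
 * Illustrative sketch (not part of dev.c): a minimal notifier that logs
 * up/down transitions.  example_netdev_event() and example_nb are
 * hypothetical names.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_UP:
		netdev_info(dev, "is up\n");
		break;
	case NETDEV_DOWN:
		netdev_info(dev, "is down\n");
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_nb = {
	.notifier_call = example_netdev_event,
};

/* register_netdevice_notifier(&example_nb) replays register/up events for
 * existing devices; unregister_netdevice_notifier(&example_nb) synthesizes
 * the matching down/unregister events, as documented below.
 */
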
/**
 * unregister_netdevice_notifier - unregister a network notifier block
 * @nb: notifier
 *
 * Unregister a notifier previously registered by
 * register_netdevice_notifier(). The notifier is unlinked from the
 * kernel structures and may then be reused. A negative errno code
 * is returned on a failure.
 *
 * After unregistering unregister and down device events are synthesized
 * for all devices on the device list to the removed notifier to remove
 * the need for special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	if (err)
		goto unlock;

	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}
unlock:
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

/**
 * call_netdevice_notifiers_info - call all network notifier blocks
 * @val: value passed unmodified to notifier function
 * @dev: net_device pointer passed unmodified to notifier function
 * @info: notifier information data
 *
 * Call all network notifier blocks. Parameters and return value
 * are as for raw_notifier_call_chain().
 */

static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info)
{
	ASSERT_RTNL();
	netdev_notifier_info_init(info, dev);
	return raw_notifier_call_chain(&netdev_chain, val, info);
}

/**
 * call_netdevice_notifiers - call all network notifier blocks
 * @val: value passed unmodified to notifier function
 * @dev: net_device pointer passed unmodified to notifier function
 *
 * Call all network notifier blocks. Parameters and return value
 * are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	struct netdev_notifier_info info;

	return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);

#ifdef CONFIG_NET_INGRESS
static struct static_key ingress_needed __read_mostly;

void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif

static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 */
static atomic_t netstamp_needed_deferred;
#endif

void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);

	if (deferred) {
		while (--deferred)
			static_key_slow_dec(&netstamp_needed);
		return;
	}
#endif
	static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	if (in_interrupt()) {
		atomic_inc(&netstamp_needed_deferred);
		return;
	}
#endif
	static_key_slow_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp.tv64 = 0;
	if (static_key_false(&netstamp_needed))
		__net_timestamp(skb);
}

#define net_timestamp_check(COND, SKB)				\
	if (static_key_false(&netstamp_needed)) {		\
		if ((COND) && !(SKB)->tstamp.tv64)		\
			__net_timestamp(SKB);			\
	}							\

1ee481fb 1703bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
79b569f0
DL
1704{
1705 unsigned int len;
1706
1707 if (!(dev->flags & IFF_UP))
1708 return false;
1709
1710 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1711 if (skb->len <= len)
1712 return true;
1713
1714 /* if TSO is enabled, we don't care about the length as the packet
 1715 * could be forwarded without being segmented first
1716 */
1717 if (skb_is_gso(skb))
1718 return true;
1719
1720 return false;
1721}
1ee481fb 1722EXPORT_SYMBOL_GPL(is_skb_forwardable);
79b569f0 1723
a0265d28
HX
1724int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1725{
bbbf2df0
WB
1726 if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1727 unlikely(!is_skb_forwardable(dev, skb))) {
a0265d28
HX
1728 atomic_long_inc(&dev->rx_dropped);
1729 kfree_skb(skb);
1730 return NET_RX_DROP;
1731 }
1732
1733 skb_scrub_packet(skb, true);
08b4b8ea 1734 skb->priority = 0;
a0265d28 1735 skb->protocol = eth_type_trans(skb, dev);
2c26d34b 1736 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
a0265d28
HX
1737
1738 return 0;
1739}
1740EXPORT_SYMBOL_GPL(__dev_forward_skb);
1741
44540960
AB
1742/**
1743 * dev_forward_skb - loopback an skb to another netif
1744 *
1745 * @dev: destination network device
1746 * @skb: buffer to forward
1747 *
1748 * return values:
1749 * NET_RX_SUCCESS (no congestion)
6ec82562 1750 * NET_RX_DROP (packet was dropped, but freed)
44540960
AB
1751 *
1752 * dev_forward_skb can be used for injecting an skb from the
1753 * start_xmit function of one device into the receive queue
1754 * of another device.
1755 *
1756 * The receiving device may be in another namespace, so
1757 * we have to clear all information in the skb that could
1758 * impact namespace isolation.
1759 */
1760int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1761{
a0265d28 1762 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
44540960
AB
1763}
1764EXPORT_SYMBOL_GPL(dev_forward_skb);
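An illustrative sketch (not part of dev.c) of the documented use case: injecting a frame from one device's start_xmit into another device's receive path, similar in spirit to a veth-like pair driver; example_get_peer() is a hypothetical lookup helper.

#include <linux/netdevice.h>
#include <linux/skbuff.h>

static netdev_tx_t example_pair_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct net_device *peer = example_get_peer(dev);        /* hypothetical */

        if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
                dev->stats.tx_dropped++;        /* skb was already freed */
        return NETDEV_TX_OK;
}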
1765
71d9dec2
CG
1766static inline int deliver_skb(struct sk_buff *skb,
1767 struct packet_type *pt_prev,
1768 struct net_device *orig_dev)
1769{
1080e512
MT
1770 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1771 return -ENOMEM;
71d9dec2
CG
1772 atomic_inc(&skb->users);
1773 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1774}
1775
7866a621
SN
1776static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1777 struct packet_type **pt,
fbcb2170
JP
1778 struct net_device *orig_dev,
1779 __be16 type,
7866a621
SN
1780 struct list_head *ptype_list)
1781{
1782 struct packet_type *ptype, *pt_prev = *pt;
1783
1784 list_for_each_entry_rcu(ptype, ptype_list, list) {
1785 if (ptype->type != type)
1786 continue;
1787 if (pt_prev)
fbcb2170 1788 deliver_skb(skb, pt_prev, orig_dev);
7866a621
SN
1789 pt_prev = ptype;
1790 }
1791 *pt = pt_prev;
1792}
1793
c0de08d0
EL
1794static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1795{
a3d744e9 1796 if (!ptype->af_packet_priv || !skb->sk)
c0de08d0
EL
1797 return false;
1798
1799 if (ptype->id_match)
1800 return ptype->id_match(ptype, skb->sk);
1801 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1802 return true;
1803
1804 return false;
1805}
1806
1da177e4
LT
1807/*
1808 * Support routine. Sends outgoing frames to any network
1809 * taps currently in use.
1810 */
1811
f6a78bfc 1812static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1da177e4
LT
1813{
1814 struct packet_type *ptype;
71d9dec2
CG
1815 struct sk_buff *skb2 = NULL;
1816 struct packet_type *pt_prev = NULL;
7866a621 1817 struct list_head *ptype_list = &ptype_all;
a61bbcf2 1818
1da177e4 1819 rcu_read_lock();
7866a621
SN
1820again:
1821 list_for_each_entry_rcu(ptype, ptype_list, list) {
1da177e4
LT
1822 /* Never send packets back to the socket
1823 * they originated from - MvS (miquels@drinkel.ow.org)
1824 */
7866a621
SN
1825 if (skb_loop_sk(ptype, skb))
1826 continue;
71d9dec2 1827
7866a621
SN
1828 if (pt_prev) {
1829 deliver_skb(skb2, pt_prev, skb->dev);
1830 pt_prev = ptype;
1831 continue;
1832 }
1da177e4 1833
7866a621
SN
1834 /* need to clone skb, done only once */
1835 skb2 = skb_clone(skb, GFP_ATOMIC);
1836 if (!skb2)
1837 goto out_unlock;
70978182 1838
7866a621 1839 net_timestamp_set(skb2);
1da177e4 1840
7866a621
SN
1841 /* skb->nh should be correctly
1842 * set by sender, so that the second statement is
1843 * just protection against buggy protocols.
1844 */
1845 skb_reset_mac_header(skb2);
1846
1847 if (skb_network_header(skb2) < skb2->data ||
1848 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1849 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1850 ntohs(skb2->protocol),
1851 dev->name);
1852 skb_reset_network_header(skb2);
1da177e4 1853 }
7866a621
SN
1854
1855 skb2->transport_header = skb2->network_header;
1856 skb2->pkt_type = PACKET_OUTGOING;
1857 pt_prev = ptype;
1858 }
1859
1860 if (ptype_list == &ptype_all) {
1861 ptype_list = &dev->ptype_all;
1862 goto again;
1da177e4 1863 }
7866a621 1864out_unlock:
71d9dec2
CG
1865 if (pt_prev)
1866 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1da177e4
LT
1867 rcu_read_unlock();
1868}
1869
2c53040f
BH
1870/**
1871 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
4f57c087
JF
1872 * @dev: Network device
1873 * @txq: number of queues available
1874 *
 1875 * If real_num_tx_queues is changed the tc mappings may no longer be
 1876 * valid. To resolve this, verify that the tc mapping remains valid and,
 1877 * if not, zero the mapping. With no priorities mapping to an
 1878 * offset/count pair it will no longer be used. In the worst case, if
 1879 * TC0 is invalid nothing can be done, so priority mappings are disabled
 1880 * entirely. It is expected that drivers will fix this mapping if they
 1881 * can before calling netif_set_real_num_tx_queues.
1882 */
bb134d22 1883static void netif_setup_tc(struct net_device *dev, unsigned int txq)
4f57c087
JF
1884{
1885 int i;
1886 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1887
1888 /* If TC0 is invalidated disable TC mapping */
1889 if (tc->offset + tc->count > txq) {
7b6cd1ce 1890 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
4f57c087
JF
1891 dev->num_tc = 0;
1892 return;
1893 }
1894
1895 /* Invalidated prio to tc mappings set to TC0 */
1896 for (i = 1; i < TC_BITMASK + 1; i++) {
1897 int q = netdev_get_prio_tc_map(dev, i);
1898
1899 tc = &dev->tc_to_txq[q];
1900 if (tc->offset + tc->count > txq) {
7b6cd1ce
JP
1901 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1902 i, q);
4f57c087
JF
1903 netdev_set_prio_tc_map(dev, i, 0);
1904 }
1905 }
1906}
1907
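netif_setup_tc() above only repairs mappings that a driver created earlier. As context, a minimal sketch (not from dev.c) of a hypothetical 8-queue driver establishing those mappings with the standard netdev_set_* helpers; the 2x4 queue split and priority assignments are made-up values.

#include <linux/netdevice.h>

static void example_setup_tc(struct net_device *dev)
{
        netdev_set_num_tc(dev, 2);
        netdev_set_tc_queue(dev, 0, 4, 0);      /* TC0 -> queues 0..3 */
        netdev_set_tc_queue(dev, 1, 4, 4);      /* TC1 -> queues 4..7 */
        netdev_set_prio_tc_map(dev, 0, 0);      /* priority 0 -> TC0 */
        netdev_set_prio_tc_map(dev, 1, 1);      /* priority 1 -> TC1 */
}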
537c00de
AD
1908#ifdef CONFIG_XPS
1909static DEFINE_MUTEX(xps_map_mutex);
1910#define xmap_dereference(P) \
1911 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1912
10cdc3f3
AD
1913static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1914 int cpu, u16 index)
537c00de 1915{
10cdc3f3
AD
1916 struct xps_map *map = NULL;
1917 int pos;
537c00de 1918
10cdc3f3
AD
1919 if (dev_maps)
1920 map = xmap_dereference(dev_maps->cpu_map[cpu]);
537c00de 1921
10cdc3f3
AD
1922 for (pos = 0; map && pos < map->len; pos++) {
1923 if (map->queues[pos] == index) {
537c00de
AD
1924 if (map->len > 1) {
1925 map->queues[pos] = map->queues[--map->len];
1926 } else {
10cdc3f3 1927 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
537c00de
AD
1928 kfree_rcu(map, rcu);
1929 map = NULL;
1930 }
10cdc3f3 1931 break;
537c00de 1932 }
537c00de
AD
1933 }
1934
10cdc3f3
AD
1935 return map;
1936}
1937
024e9679 1938static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
10cdc3f3
AD
1939{
1940 struct xps_dev_maps *dev_maps;
024e9679 1941 int cpu, i;
10cdc3f3
AD
1942 bool active = false;
1943
1944 mutex_lock(&xps_map_mutex);
1945 dev_maps = xmap_dereference(dev->xps_maps);
1946
1947 if (!dev_maps)
1948 goto out_no_maps;
1949
1950 for_each_possible_cpu(cpu) {
024e9679
AD
1951 for (i = index; i < dev->num_tx_queues; i++) {
1952 if (!remove_xps_queue(dev_maps, cpu, i))
1953 break;
1954 }
1955 if (i == dev->num_tx_queues)
10cdc3f3
AD
1956 active = true;
1957 }
1958
1959 if (!active) {
537c00de
AD
1960 RCU_INIT_POINTER(dev->xps_maps, NULL);
1961 kfree_rcu(dev_maps, rcu);
1962 }
1963
024e9679
AD
1964 for (i = index; i < dev->num_tx_queues; i++)
1965 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1966 NUMA_NO_NODE);
1967
537c00de
AD
1968out_no_maps:
1969 mutex_unlock(&xps_map_mutex);
1970}
1971
01c5f864
AD
1972static struct xps_map *expand_xps_map(struct xps_map *map,
1973 int cpu, u16 index)
1974{
1975 struct xps_map *new_map;
1976 int alloc_len = XPS_MIN_MAP_ALLOC;
1977 int i, pos;
1978
1979 for (pos = 0; map && pos < map->len; pos++) {
1980 if (map->queues[pos] != index)
1981 continue;
1982 return map;
1983 }
1984
1985 /* Need to add queue to this CPU's existing map */
1986 if (map) {
1987 if (pos < map->alloc_len)
1988 return map;
1989
1990 alloc_len = map->alloc_len * 2;
1991 }
1992
 1993 /* Need to allocate a new map to store the queue on this CPU */
1994 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1995 cpu_to_node(cpu));
1996 if (!new_map)
1997 return NULL;
1998
1999 for (i = 0; i < pos; i++)
2000 new_map->queues[i] = map->queues[i];
2001 new_map->alloc_len = alloc_len;
2002 new_map->len = pos;
2003
2004 return new_map;
2005}
2006
3573540c
MT
2007int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2008 u16 index)
537c00de 2009{
01c5f864 2010 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
537c00de 2011 struct xps_map *map, *new_map;
537c00de 2012 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
01c5f864
AD
2013 int cpu, numa_node_id = -2;
2014 bool active = false;
537c00de
AD
2015
2016 mutex_lock(&xps_map_mutex);
2017
2018 dev_maps = xmap_dereference(dev->xps_maps);
2019
01c5f864
AD
2020 /* allocate memory for queue storage */
2021 for_each_online_cpu(cpu) {
2022 if (!cpumask_test_cpu(cpu, mask))
2023 continue;
2024
2025 if (!new_dev_maps)
2026 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2bb60cb9
AD
2027 if (!new_dev_maps) {
2028 mutex_unlock(&xps_map_mutex);
01c5f864 2029 return -ENOMEM;
2bb60cb9 2030 }
01c5f864
AD
2031
2032 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2033 NULL;
2034
2035 map = expand_xps_map(map, cpu, index);
2036 if (!map)
2037 goto error;
2038
2039 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2040 }
2041
2042 if (!new_dev_maps)
2043 goto out_no_new_maps;
2044
537c00de 2045 for_each_possible_cpu(cpu) {
01c5f864
AD
2046 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2047 /* add queue to CPU maps */
2048 int pos = 0;
2049
2050 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2051 while ((pos < map->len) && (map->queues[pos] != index))
2052 pos++;
2053
2054 if (pos == map->len)
2055 map->queues[map->len++] = index;
537c00de 2056#ifdef CONFIG_NUMA
537c00de
AD
2057 if (numa_node_id == -2)
2058 numa_node_id = cpu_to_node(cpu);
2059 else if (numa_node_id != cpu_to_node(cpu))
2060 numa_node_id = -1;
537c00de 2061#endif
01c5f864
AD
2062 } else if (dev_maps) {
2063 /* fill in the new device map from the old device map */
2064 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2065 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
537c00de 2066 }
01c5f864 2067
537c00de
AD
2068 }
2069
01c5f864
AD
2070 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2071
537c00de 2072 /* Cleanup old maps */
01c5f864
AD
2073 if (dev_maps) {
2074 for_each_possible_cpu(cpu) {
2075 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2076 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2077 if (map && map != new_map)
2078 kfree_rcu(map, rcu);
2079 }
537c00de 2080
01c5f864 2081 kfree_rcu(dev_maps, rcu);
537c00de
AD
2082 }
2083
01c5f864
AD
2084 dev_maps = new_dev_maps;
2085 active = true;
537c00de 2086
01c5f864
AD
2087out_no_new_maps:
2088 /* update Tx queue numa node */
537c00de
AD
2089 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2090 (numa_node_id >= 0) ? numa_node_id :
2091 NUMA_NO_NODE);
2092
01c5f864
AD
2093 if (!dev_maps)
2094 goto out_no_maps;
2095
2096 /* removes queue from unused CPUs */
2097 for_each_possible_cpu(cpu) {
2098 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2099 continue;
2100
2101 if (remove_xps_queue(dev_maps, cpu, index))
2102 active = true;
2103 }
2104
2105 /* free map if not active */
2106 if (!active) {
2107 RCU_INIT_POINTER(dev->xps_maps, NULL);
2108 kfree_rcu(dev_maps, rcu);
2109 }
2110
2111out_no_maps:
537c00de
AD
2112 mutex_unlock(&xps_map_mutex);
2113
2114 return 0;
2115error:
01c5f864
AD
2116 /* remove any maps that we added */
2117 for_each_possible_cpu(cpu) {
2118 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2119 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2120 NULL;
2121 if (new_map && new_map != map)
2122 kfree(new_map);
2123 }
2124
537c00de
AD
2125 mutex_unlock(&xps_map_mutex);
2126
537c00de
AD
2127 kfree(new_dev_maps);
2128 return -ENOMEM;
2129}
2130EXPORT_SYMBOL(netif_set_xps_queue);
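A minimal sketch (not from dev.c) of how a driver might use the API above to pin each TX queue to one CPU; it assumes contiguous online CPU numbering and ignores return codes for brevity.

#include <linux/cpumask.h>
#include <linux/netdevice.h>

static void example_set_xps(struct net_device *dev)
{
        int i;

        for (i = 0; i < dev->real_num_tx_queues; i++)
                netif_set_xps_queue(dev, cpumask_of(i % num_online_cpus()), i);
}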
2131
2132#endif
f0796d5c
JF
2133/*
 2134 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 2135 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2136 */
e6484930 2137int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
f0796d5c 2138{
1d24eb48
TH
2139 int rc;
2140
e6484930
TH
2141 if (txq < 1 || txq > dev->num_tx_queues)
2142 return -EINVAL;
f0796d5c 2143
5c56580b
BH
2144 if (dev->reg_state == NETREG_REGISTERED ||
2145 dev->reg_state == NETREG_UNREGISTERING) {
e6484930
TH
2146 ASSERT_RTNL();
2147
1d24eb48
TH
2148 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2149 txq);
bf264145
TH
2150 if (rc)
2151 return rc;
2152
4f57c087
JF
2153 if (dev->num_tc)
2154 netif_setup_tc(dev, txq);
2155
024e9679 2156 if (txq < dev->real_num_tx_queues) {
e6484930 2157 qdisc_reset_all_tx_gt(dev, txq);
024e9679
AD
2158#ifdef CONFIG_XPS
2159 netif_reset_xps_queues_gt(dev, txq);
2160#endif
2161 }
f0796d5c 2162 }
e6484930
TH
2163
2164 dev->real_num_tx_queues = txq;
2165 return 0;
f0796d5c
JF
2166}
2167EXPORT_SYMBOL(netif_set_real_num_tx_queues);
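A minimal sketch (not from dev.c) of a hypothetical ethtool-style channel change that grows or shrinks the set of active queues under RTNL, using the helper above together with its RX counterpart defined just below.

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int example_set_channels(struct net_device *dev, unsigned int count)
{
        int err;

        ASSERT_RTNL();
        err = netif_set_real_num_tx_queues(dev, count);
        if (err)
                return err;
        return netif_set_real_num_rx_queues(dev, count);
}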
56079431 2168
a953be53 2169#ifdef CONFIG_SYSFS
62fe0b40
BH
2170/**
2171 * netif_set_real_num_rx_queues - set actual number of RX queues used
2172 * @dev: Network device
2173 * @rxq: Actual number of RX queues
2174 *
2175 * This must be called either with the rtnl_lock held or before
2176 * registration of the net device. Returns 0 on success, or a
4e7f7951
BH
2177 * negative error code. If called before registration, it always
2178 * succeeds.
62fe0b40
BH
2179 */
2180int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2181{
2182 int rc;
2183
bd25fa7b
TH
2184 if (rxq < 1 || rxq > dev->num_rx_queues)
2185 return -EINVAL;
2186
62fe0b40
BH
2187 if (dev->reg_state == NETREG_REGISTERED) {
2188 ASSERT_RTNL();
2189
62fe0b40
BH
2190 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2191 rxq);
2192 if (rc)
2193 return rc;
62fe0b40
BH
2194 }
2195
2196 dev->real_num_rx_queues = rxq;
2197 return 0;
2198}
2199EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2200#endif
2201
2c53040f
BH
2202/**
2203 * netif_get_num_default_rss_queues - default number of RSS queues
16917b87
YM
2204 *
2205 * This routine should set an upper limit on the number of RSS queues
2206 * used by default by multiqueue devices.
2207 */
a55b138b 2208int netif_get_num_default_rss_queues(void)
16917b87
YM
2209{
2210 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2211}
2212EXPORT_SYMBOL(netif_get_num_default_rss_queues);
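A minimal sketch (not from dev.c) of how a probe routine might combine this helper with alloc_etherdev_mqs(); hw_max stands in for whatever queue limit the hardware reports, and the zero private size is just for brevity.

#include <linux/etherdevice.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>

static struct net_device *example_alloc_netdev(unsigned int hw_max)
{
        unsigned int n = min_t(unsigned int, hw_max,
                               netif_get_num_default_rss_queues());

        return alloc_etherdev_mqs(0, n, n);     /* n TX and n RX queues */
}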
2213
def82a1d 2214static inline void __netif_reschedule(struct Qdisc *q)
56079431 2215{
def82a1d
JP
2216 struct softnet_data *sd;
2217 unsigned long flags;
56079431 2218
def82a1d 2219 local_irq_save(flags);
903ceff7 2220 sd = this_cpu_ptr(&softnet_data);
a9cbd588
CG
2221 q->next_sched = NULL;
2222 *sd->output_queue_tailp = q;
2223 sd->output_queue_tailp = &q->next_sched;
def82a1d
JP
2224 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2225 local_irq_restore(flags);
2226}
2227
2228void __netif_schedule(struct Qdisc *q)
2229{
2230 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2231 __netif_reschedule(q);
56079431
DV
2232}
2233EXPORT_SYMBOL(__netif_schedule);
2234
e6247027
ED
2235struct dev_kfree_skb_cb {
2236 enum skb_free_reason reason;
2237};
2238
2239static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
56079431 2240{
e6247027
ED
2241 return (struct dev_kfree_skb_cb *)skb->cb;
2242}
2243
46e5da40
JF
2244void netif_schedule_queue(struct netdev_queue *txq)
2245{
2246 rcu_read_lock();
2247 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2248 struct Qdisc *q = rcu_dereference(txq->qdisc);
2249
2250 __netif_schedule(q);
2251 }
2252 rcu_read_unlock();
2253}
2254EXPORT_SYMBOL(netif_schedule_queue);
2255
2256/**
2257 * netif_wake_subqueue - allow sending packets on subqueue
2258 * @dev: network device
2259 * @queue_index: sub queue index
2260 *
2261 * Resume individual transmit queue of a device with multiple transmit queues.
2262 */
2263void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2264{
2265 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2266
2267 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2268 struct Qdisc *q;
2269
2270 rcu_read_lock();
2271 q = rcu_dereference(txq->qdisc);
2272 __netif_schedule(q);
2273 rcu_read_unlock();
2274 }
2275}
2276EXPORT_SYMBOL(netif_wake_subqueue);
2277
2278void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2279{
2280 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2281 struct Qdisc *q;
2282
2283 rcu_read_lock();
2284 q = rcu_dereference(dev_queue->qdisc);
2285 __netif_schedule(q);
2286 rcu_read_unlock();
2287 }
2288}
2289EXPORT_SYMBOL(netif_tx_wake_queue);
2290
e6247027 2291void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
56079431 2292{
e6247027 2293 unsigned long flags;
56079431 2294
e6247027
ED
2295 if (likely(atomic_read(&skb->users) == 1)) {
2296 smp_rmb();
2297 atomic_set(&skb->users, 0);
2298 } else if (likely(!atomic_dec_and_test(&skb->users))) {
2299 return;
bea3348e 2300 }
e6247027
ED
2301 get_kfree_skb_cb(skb)->reason = reason;
2302 local_irq_save(flags);
2303 skb->next = __this_cpu_read(softnet_data.completion_queue);
2304 __this_cpu_write(softnet_data.completion_queue, skb);
2305 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2306 local_irq_restore(flags);
56079431 2307}
e6247027 2308EXPORT_SYMBOL(__dev_kfree_skb_irq);
56079431 2309
e6247027 2310void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
56079431
DV
2311{
2312 if (in_irq() || irqs_disabled())
e6247027 2313 __dev_kfree_skb_irq(skb, reason);
56079431
DV
2314 else
2315 dev_kfree_skb(skb);
2316}
e6247027 2317EXPORT_SYMBOL(__dev_kfree_skb_any);
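The two wrappers above are normally reached through the dev_kfree_skb_any() and dev_consume_skb_any() inlines. A minimal sketch (not from dev.c) of a hypothetical TX-completion handler that may run in hard-irq context:

#include <linux/netdevice.h>
#include <linux/skbuff.h>

static void example_tx_complete(struct sk_buff *skb, bool ok)
{
        if (ok)
                dev_consume_skb_any(skb);       /* delivered: not traced as a drop */
        else
                dev_kfree_skb_any(skb);         /* error path: traced as a drop */
}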
56079431
DV
2318
2319
bea3348e
SH
2320/**
2321 * netif_device_detach - mark device as removed
2322 * @dev: network device
2323 *
 2324 * Mark device as removed from the system and therefore no longer available.
2325 */
56079431
DV
2326void netif_device_detach(struct net_device *dev)
2327{
2328 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2329 netif_running(dev)) {
d543103a 2330 netif_tx_stop_all_queues(dev);
56079431
DV
2331 }
2332}
2333EXPORT_SYMBOL(netif_device_detach);
2334
bea3348e
SH
2335/**
2336 * netif_device_attach - mark device as attached
2337 * @dev: network device
2338 *
 2338 * Mark device as attached to the system and restart it if needed.
2340 */
56079431
DV
2341void netif_device_attach(struct net_device *dev)
2342{
2343 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2344 netif_running(dev)) {
d543103a 2345 netif_tx_wake_all_queues(dev);
4ec93edb 2346 __netdev_watchdog_up(dev);
56079431
DV
2347 }
2348}
2349EXPORT_SYMBOL(netif_device_attach);
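A minimal sketch (not from dev.c) of the usual suspend/resume pairing for the two helpers above; the example_* names and the hardware steps are placeholders.

#include <linux/netdevice.h>

static int example_suspend(struct net_device *dev)
{
        netif_device_detach(dev);       /* stops all TX queues if running */
        /* ... quiesce and power down the hardware ... */
        return 0;
}

static int example_resume(struct net_device *dev)
{
        /* ... power up and reprogram the hardware ... */
        netif_device_attach(dev);       /* wakes queues and the TX watchdog */
        return 0;
}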
2350
5605c762
JP
2351/*
 2352 * Returns a Tx hash based on the given packet descriptor and the number
 2353 * of Tx queues to be used as a distribution range.
2354 */
2355u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2356 unsigned int num_tx_queues)
2357{
2358 u32 hash;
2359 u16 qoffset = 0;
2360 u16 qcount = num_tx_queues;
2361
2362 if (skb_rx_queue_recorded(skb)) {
2363 hash = skb_get_rx_queue(skb);
2364 while (unlikely(hash >= num_tx_queues))
2365 hash -= num_tx_queues;
2366 return hash;
2367 }
2368
2369 if (dev->num_tc) {
2370 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2371 qoffset = dev->tc_to_txq[tc].offset;
2372 qcount = dev->tc_to_txq[tc].count;
2373 }
2374
2375 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2376}
2377EXPORT_SYMBOL(__skb_tx_hash);
2378
36c92474
BH
2379static void skb_warn_bad_offload(const struct sk_buff *skb)
2380{
65e9d2fa 2381 static const netdev_features_t null_features = 0;
36c92474
BH
2382 struct net_device *dev = skb->dev;
2383 const char *driver = "";
2384
c846ad9b
BG
2385 if (!net_ratelimit())
2386 return;
2387
36c92474
BH
2388 if (dev && dev->dev.parent)
2389 driver = dev_driver_string(dev->dev.parent);
2390
2391 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2392 "gso_type=%d ip_summed=%d\n",
65e9d2fa
MM
2393 driver, dev ? &dev->features : &null_features,
2394 skb->sk ? &skb->sk->sk_route_caps : &null_features,
36c92474
BH
2395 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2396 skb_shinfo(skb)->gso_type, skb->ip_summed);
2397}
2398
1da177e4
LT
2399/*
2400 * Invalidate hardware checksum when packet is to be mangled, and
2401 * complete checksum manually on outgoing path.
2402 */
84fa7933 2403int skb_checksum_help(struct sk_buff *skb)
1da177e4 2404{
d3bc23e7 2405 __wsum csum;
663ead3b 2406 int ret = 0, offset;
1da177e4 2407
84fa7933 2408 if (skb->ip_summed == CHECKSUM_COMPLETE)
a430a43d
HX
2409 goto out_set_summed;
2410
2411 if (unlikely(skb_shinfo(skb)->gso_size)) {
36c92474
BH
2412 skb_warn_bad_offload(skb);
2413 return -EINVAL;
1da177e4
LT
2414 }
2415
cef401de
ED
2416 /* Before computing a checksum, we should make sure no frag could
 2417 * be modified by an external entity: the checksum could be wrong.
2418 */
2419 if (skb_has_shared_frag(skb)) {
2420 ret = __skb_linearize(skb);
2421 if (ret)
2422 goto out;
2423 }
2424
55508d60 2425 offset = skb_checksum_start_offset(skb);
a030847e
HX
2426 BUG_ON(offset >= skb_headlen(skb));
2427 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2428
2429 offset += skb->csum_offset;
2430 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2431
2432 if (skb_cloned(skb) &&
2433 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1da177e4
LT
2434 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2435 if (ret)
2436 goto out;
2437 }
2438
a030847e 2439 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
a430a43d 2440out_set_summed:
1da177e4 2441 skb->ip_summed = CHECKSUM_NONE;
4ec93edb 2442out:
1da177e4
LT
2443 return ret;
2444}
d1b19dff 2445EXPORT_SYMBOL(skb_checksum_help);
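A minimal sketch (not from dev.c) of the common driver pattern around this helper: when the hardware cannot checksum a CHECKSUM_PARTIAL frame, finish it in software before handing the buffer to DMA; example_hw_can_csum() is a hypothetical capability test.

#include <linux/errno.h>
#include <linux/skbuff.h>

static int example_prep_tx_csum(struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            !example_hw_can_csum(skb) &&        /* hypothetical */
            skb_checksum_help(skb))
                return -EIO;                    /* software fallback failed */
        return 0;
}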
1da177e4 2446
53d6471c 2447__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
f6a78bfc 2448{
252e3346 2449 __be16 type = skb->protocol;
f6a78bfc 2450
19acc327
PS
2451 /* Tunnel gso handlers can set protocol to ethernet. */
2452 if (type == htons(ETH_P_TEB)) {
2453 struct ethhdr *eth;
2454
2455 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2456 return 0;
2457
2458 eth = (struct ethhdr *)skb_mac_header(skb);
2459 type = eth->h_proto;
2460 }
2461
d4bcef3f 2462 return __vlan_get_protocol(skb, type, depth);
ec5f0615
PS
2463}
2464
2465/**
2466 * skb_mac_gso_segment - mac layer segmentation handler.
2467 * @skb: buffer to segment
2468 * @features: features for the output path (see dev->features)
2469 */
2470struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2471 netdev_features_t features)
2472{
2473 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2474 struct packet_offload *ptype;
53d6471c
VY
2475 int vlan_depth = skb->mac_len;
2476 __be16 type = skb_network_protocol(skb, &vlan_depth);
ec5f0615
PS
2477
2478 if (unlikely(!type))
2479 return ERR_PTR(-EINVAL);
2480
53d6471c 2481 __skb_pull(skb, vlan_depth);
f6a78bfc
HX
2482
2483 rcu_read_lock();
22061d80 2484 list_for_each_entry_rcu(ptype, &offload_base, list) {
f191a1d1 2485 if (ptype->type == type && ptype->callbacks.gso_segment) {
f191a1d1 2486 segs = ptype->callbacks.gso_segment(skb, features);
f6a78bfc
HX
2487 break;
2488 }
2489 }
2490 rcu_read_unlock();
2491
98e399f8 2492 __skb_push(skb, skb->data - skb_mac_header(skb));
576a30eb 2493
f6a78bfc
HX
2494 return segs;
2495}
05e8ef4a
PS
2496EXPORT_SYMBOL(skb_mac_gso_segment);
2497
2498
2499/* openvswitch calls this on rx path, so we need a different check.
2500 */
2501static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2502{
2503 if (tx_path)
2504 return skb->ip_summed != CHECKSUM_PARTIAL;
2505 else
2506 return skb->ip_summed == CHECKSUM_NONE;
2507}
2508
2509/**
2510 * __skb_gso_segment - Perform segmentation on skb.
2511 * @skb: buffer to segment
2512 * @features: features for the output path (see dev->features)
2513 * @tx_path: whether it is called in TX path
2514 *
2515 * This function segments the given skb and returns a list of segments.
2516 *
2517 * It may return NULL if the skb requires no segmentation. This is
2518 * only possible when GSO is used for verifying header integrity.
2519 */
2520struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2521 netdev_features_t features, bool tx_path)
2522{
2523 if (unlikely(skb_needs_check(skb, tx_path))) {
2524 int err;
2525
2526 skb_warn_bad_offload(skb);
2527
a40e0a66 2528 err = skb_cow_head(skb, 0);
2529 if (err < 0)
05e8ef4a
PS
2530 return ERR_PTR(err);
2531 }
2532
68c33163 2533 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3347c960
ED
2534 SKB_GSO_CB(skb)->encap_level = 0;
2535
05e8ef4a
PS
2536 skb_reset_mac_header(skb);
2537 skb_reset_mac_len(skb);
2538
2539 return skb_mac_gso_segment(skb, features);
2540}
12b0004d 2541EXPORT_SYMBOL(__skb_gso_segment);
f6a78bfc 2542
fb286bb2
HX
2543/* Take action when hardware reception checksum errors are detected. */
2544#ifdef CONFIG_BUG
2545void netdev_rx_csum_fault(struct net_device *dev)
2546{
2547 if (net_ratelimit()) {
7b6cd1ce 2548 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
fb286bb2
HX
2549 dump_stack();
2550 }
2551}
2552EXPORT_SYMBOL(netdev_rx_csum_fault);
2553#endif
2554
1da177e4
LT
2555/* Actually, we should eliminate this check as soon as we know that:
 2556 * 1. An IOMMU is present and can map all the memory.
2557 * 2. No high memory really exists on this machine.
2558 */
2559
c1e756bf 2560static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1da177e4 2561{
3d3a8533 2562#ifdef CONFIG_HIGHMEM
1da177e4 2563 int i;
5acbbd42 2564 if (!(dev->features & NETIF_F_HIGHDMA)) {
ea2ab693
IC
2565 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2566 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2567 if (PageHighMem(skb_frag_page(frag)))
5acbbd42 2568 return 1;
ea2ab693 2569 }
5acbbd42 2570 }
1da177e4 2571
5acbbd42
FT
2572 if (PCI_DMA_BUS_IS_PHYS) {
2573 struct device *pdev = dev->dev.parent;
1da177e4 2574
9092c658
ED
2575 if (!pdev)
2576 return 0;
5acbbd42 2577 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
ea2ab693
IC
2578 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2579 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
5acbbd42
FT
2580 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2581 return 1;
2582 }
2583 }
3d3a8533 2584#endif
1da177e4
LT
2585 return 0;
2586}
1da177e4 2587
3b392ddb
SH
2588/* If this is an MPLS offload request, verify we are testing hardware MPLS features
2589 * instead of standard features for the netdev.
2590 */
d0edc7bf 2591#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3b392ddb
SH
2592static netdev_features_t net_mpls_features(struct sk_buff *skb,
2593 netdev_features_t features,
2594 __be16 type)
2595{
25cd9ba0 2596 if (eth_p_mpls(type))
3b392ddb
SH
2597 features &= skb->dev->mpls_features;
2598
2599 return features;
2600}
2601#else
2602static netdev_features_t net_mpls_features(struct sk_buff *skb,
2603 netdev_features_t features,
2604 __be16 type)
2605{
2606 return features;
2607}
2608#endif
2609
c8f44aff 2610static netdev_features_t harmonize_features(struct sk_buff *skb,
c1e756bf 2611 netdev_features_t features)
f01a5236 2612{
53d6471c 2613 int tmp;
3b392ddb
SH
2614 __be16 type;
2615
2616 type = skb_network_protocol(skb, &tmp);
2617 features = net_mpls_features(skb, features, type);
53d6471c 2618
c0d680e5 2619 if (skb->ip_summed != CHECKSUM_NONE &&
3b392ddb 2620 !can_checksum_protocol(features, type)) {
f01a5236 2621 features &= ~NETIF_F_ALL_CSUM;
c1e756bf 2622 } else if (illegal_highdma(skb->dev, skb)) {
f01a5236
JG
2623 features &= ~NETIF_F_SG;
2624 }
2625
2626 return features;
2627}
2628
e38f3025
TM
2629netdev_features_t passthru_features_check(struct sk_buff *skb,
2630 struct net_device *dev,
2631 netdev_features_t features)
2632{
2633 return features;
2634}
2635EXPORT_SYMBOL(passthru_features_check);
2636
8cb65d00
TM
2637static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2638 struct net_device *dev,
2639 netdev_features_t features)
2640{
2641 return vlan_features_check(skb, features);
2642}
2643
c1e756bf 2644netdev_features_t netif_skb_features(struct sk_buff *skb)
58e998c6 2645{
5f35227e 2646 struct net_device *dev = skb->dev;
fcbeb976
ED
2647 netdev_features_t features = dev->features;
2648 u16 gso_segs = skb_shinfo(skb)->gso_segs;
58e998c6 2649
fcbeb976 2650 if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
30b678d8
BH
2651 features &= ~NETIF_F_GSO_MASK;
2652
5f35227e
JG
 2653 /* If this is an encapsulation offload request, verify we are testing
2654 * hardware encapsulation features instead of standard
2655 * features for the netdev
2656 */
2657 if (skb->encapsulation)
2658 features &= dev->hw_enc_features;
2659
f5a7fb88
TM
2660 if (skb_vlan_tagged(skb))
2661 features = netdev_intersect_features(features,
2662 dev->vlan_features |
2663 NETIF_F_HW_VLAN_CTAG_TX |
2664 NETIF_F_HW_VLAN_STAG_TX);
f01a5236 2665
5f35227e
JG
2666 if (dev->netdev_ops->ndo_features_check)
2667 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2668 features);
8cb65d00
TM
2669 else
2670 features &= dflt_features_check(skb, dev, features);
5f35227e 2671
c1e756bf 2672 return harmonize_features(skb, features);
58e998c6 2673}
c1e756bf 2674EXPORT_SYMBOL(netif_skb_features);
58e998c6 2675
2ea25513 2676static int xmit_one(struct sk_buff *skb, struct net_device *dev,
95f6b3dd 2677 struct netdev_queue *txq, bool more)
f6a78bfc 2678{
2ea25513
DM
2679 unsigned int len;
2680 int rc;
00829823 2681
7866a621 2682 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2ea25513 2683 dev_queue_xmit_nit(skb, dev);
fc741216 2684
2ea25513
DM
2685 len = skb->len;
2686 trace_net_dev_start_xmit(skb, dev);
95f6b3dd 2687 rc = netdev_start_xmit(skb, dev, txq, more);
2ea25513 2688 trace_net_dev_xmit(skb, rc, dev, len);
adf30907 2689
2ea25513
DM
2690 return rc;
2691}
7b9c6090 2692
8dcda22a
DM
2693struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2694 struct netdev_queue *txq, int *ret)
7f2e870f
DM
2695{
2696 struct sk_buff *skb = first;
2697 int rc = NETDEV_TX_OK;
7b9c6090 2698
7f2e870f
DM
2699 while (skb) {
2700 struct sk_buff *next = skb->next;
fc70fb64 2701
7f2e870f 2702 skb->next = NULL;
95f6b3dd 2703 rc = xmit_one(skb, dev, txq, next != NULL);
7f2e870f
DM
2704 if (unlikely(!dev_xmit_complete(rc))) {
2705 skb->next = next;
2706 goto out;
2707 }
6afff0ca 2708
7f2e870f
DM
2709 skb = next;
2710 if (netif_xmit_stopped(txq) && skb) {
2711 rc = NETDEV_TX_BUSY;
2712 break;
9ccb8975 2713 }
7f2e870f 2714 }
9ccb8975 2715
7f2e870f
DM
2716out:
2717 *ret = rc;
2718 return skb;
2719}
b40863c6 2720
1ff0dc94
ED
2721static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2722 netdev_features_t features)
f6a78bfc 2723{
df8a39de 2724 if (skb_vlan_tag_present(skb) &&
5968250c
JP
2725 !vlan_hw_offload_capable(features, skb->vlan_proto))
2726 skb = __vlan_hwaccel_push_inside(skb);
eae3f88e
DM
2727 return skb;
2728}
f6a78bfc 2729
55a93b3e 2730static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
eae3f88e
DM
2731{
2732 netdev_features_t features;
f6a78bfc 2733
eae3f88e
DM
2734 if (skb->next)
2735 return skb;
068a2de5 2736
eae3f88e
DM
2737 features = netif_skb_features(skb);
2738 skb = validate_xmit_vlan(skb, features);
2739 if (unlikely(!skb))
2740 goto out_null;
7b9c6090 2741
8b86a61d 2742 if (netif_needs_gso(skb, features)) {
ce93718f
DM
2743 struct sk_buff *segs;
2744
2745 segs = skb_gso_segment(skb, features);
cecda693 2746 if (IS_ERR(segs)) {
af6dabc9 2747 goto out_kfree_skb;
cecda693
JW
2748 } else if (segs) {
2749 consume_skb(skb);
2750 skb = segs;
f6a78bfc 2751 }
eae3f88e
DM
2752 } else {
2753 if (skb_needs_linearize(skb, features) &&
2754 __skb_linearize(skb))
2755 goto out_kfree_skb;
4ec93edb 2756
eae3f88e
DM
2757 /* If packet is not checksummed and device does not
2758 * support checksumming for this protocol, complete
2759 * checksumming here.
2760 */
2761 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2762 if (skb->encapsulation)
2763 skb_set_inner_transport_header(skb,
2764 skb_checksum_start_offset(skb));
2765 else
2766 skb_set_transport_header(skb,
2767 skb_checksum_start_offset(skb));
2768 if (!(features & NETIF_F_ALL_CSUM) &&
2769 skb_checksum_help(skb))
2770 goto out_kfree_skb;
7b9c6090 2771 }
0c772159 2772 }
7b9c6090 2773
eae3f88e 2774 return skb;
fc70fb64 2775
f6a78bfc
HX
2776out_kfree_skb:
2777 kfree_skb(skb);
eae3f88e
DM
2778out_null:
2779 return NULL;
2780}
6afff0ca 2781
55a93b3e
ED
2782struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2783{
2784 struct sk_buff *next, *head = NULL, *tail;
2785
bec3cfdc 2786 for (; skb != NULL; skb = next) {
55a93b3e
ED
2787 next = skb->next;
2788 skb->next = NULL;
bec3cfdc
ED
2789
 2790 /* in case the skb won't be segmented, point it to itself */
2791 skb->prev = skb;
2792
55a93b3e 2793 skb = validate_xmit_skb(skb, dev);
bec3cfdc
ED
2794 if (!skb)
2795 continue;
55a93b3e 2796
bec3cfdc
ED
2797 if (!head)
2798 head = skb;
2799 else
2800 tail->next = skb;
2801 /* If skb was segmented, skb->prev points to
2802 * the last segment. If not, it still contains skb.
2803 */
2804 tail = skb->prev;
55a93b3e
ED
2805 }
2806 return head;
f6a78bfc
HX
2807}
2808
1def9238
ED
2809static void qdisc_pkt_len_init(struct sk_buff *skb)
2810{
2811 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2812
2813 qdisc_skb_cb(skb)->pkt_len = skb->len;
2814
2815 /* To get more precise estimation of bytes sent on wire,
2816 * we add to pkt_len the headers size of all segments
2817 */
2818 if (shinfo->gso_size) {
757b8b1d 2819 unsigned int hdr_len;
15e5a030 2820 u16 gso_segs = shinfo->gso_segs;
1def9238 2821
757b8b1d
ED
2822 /* mac layer + network layer */
2823 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2824
2825 /* + transport layer */
1def9238
ED
2826 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2827 hdr_len += tcp_hdrlen(skb);
2828 else
2829 hdr_len += sizeof(struct udphdr);
15e5a030
JW
2830
2831 if (shinfo->gso_type & SKB_GSO_DODGY)
2832 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2833 shinfo->gso_size);
2834
2835 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
1def9238
ED
2836 }
2837}
2838
bbd8a0d3
KK
2839static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2840 struct net_device *dev,
2841 struct netdev_queue *txq)
2842{
2843 spinlock_t *root_lock = qdisc_lock(q);
a2da570d 2844 bool contended;
bbd8a0d3
KK
2845 int rc;
2846
1def9238 2847 qdisc_pkt_len_init(skb);
a2da570d 2848 qdisc_calculate_pkt_len(skb, q);
79640a4c
ED
2849 /*
2850 * Heuristic to force contended enqueues to serialize on a
2851 * separate lock before trying to get qdisc main lock.
9bf2b8c2
YX
2852 * This permits __QDISC___STATE_RUNNING owner to get the lock more
2853 * often and dequeue packets faster.
79640a4c 2854 */
a2da570d 2855 contended = qdisc_is_running(q);
79640a4c
ED
2856 if (unlikely(contended))
2857 spin_lock(&q->busylock);
2858
bbd8a0d3
KK
2859 spin_lock(root_lock);
2860 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2861 kfree_skb(skb);
2862 rc = NET_XMIT_DROP;
2863 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
bc135b23 2864 qdisc_run_begin(q)) {
bbd8a0d3
KK
2865 /*
2866 * This is a work-conserving queue; there are no old skbs
2867 * waiting to be sent out; and the qdisc is not running -
2868 * xmit the skb directly.
2869 */
bfe0d029 2870
bfe0d029
ED
2871 qdisc_bstats_update(q, skb);
2872
55a93b3e 2873 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
79640a4c
ED
2874 if (unlikely(contended)) {
2875 spin_unlock(&q->busylock);
2876 contended = false;
2877 }
bbd8a0d3 2878 __qdisc_run(q);
79640a4c 2879 } else
bc135b23 2880 qdisc_run_end(q);
bbd8a0d3
KK
2881
2882 rc = NET_XMIT_SUCCESS;
2883 } else {
a2da570d 2884 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
79640a4c
ED
2885 if (qdisc_run_begin(q)) {
2886 if (unlikely(contended)) {
2887 spin_unlock(&q->busylock);
2888 contended = false;
2889 }
2890 __qdisc_run(q);
2891 }
bbd8a0d3
KK
2892 }
2893 spin_unlock(root_lock);
79640a4c
ED
2894 if (unlikely(contended))
2895 spin_unlock(&q->busylock);
bbd8a0d3
KK
2896 return rc;
2897}
2898
86f8515f 2899#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
5bc1421e
NH
2900static void skb_update_prio(struct sk_buff *skb)
2901{
6977a79d 2902 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
5bc1421e 2903
91c68ce2
ED
2904 if (!skb->priority && skb->sk && map) {
2905 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2906
2907 if (prioidx < map->priomap_len)
2908 skb->priority = map->priomap[prioidx];
2909 }
5bc1421e
NH
2910}
2911#else
2912#define skb_update_prio(skb)
2913#endif
2914
f60e5990 2915DEFINE_PER_CPU(int, xmit_recursion);
2916EXPORT_SYMBOL(xmit_recursion);
2917
11a766ce 2918#define RECURSION_LIMIT 10
745e20f1 2919
95603e22
MM
2920/**
2921 * dev_loopback_xmit - loop back @skb
2922 * @skb: buffer to transmit
2923 */
7026b1dd 2924int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb)
95603e22
MM
2925{
2926 skb_reset_mac_header(skb);
2927 __skb_pull(skb, skb_network_offset(skb));
2928 skb->pkt_type = PACKET_LOOPBACK;
2929 skb->ip_summed = CHECKSUM_UNNECESSARY;
2930 WARN_ON(!skb_dst(skb));
2931 skb_dst_force(skb);
2932 netif_rx_ni(skb);
2933 return 0;
2934}
2935EXPORT_SYMBOL(dev_loopback_xmit);
2936
638b2a69
JP
2937static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2938{
2939#ifdef CONFIG_XPS
2940 struct xps_dev_maps *dev_maps;
2941 struct xps_map *map;
2942 int queue_index = -1;
2943
2944 rcu_read_lock();
2945 dev_maps = rcu_dereference(dev->xps_maps);
2946 if (dev_maps) {
2947 map = rcu_dereference(
2948 dev_maps->cpu_map[skb->sender_cpu - 1]);
2949 if (map) {
2950 if (map->len == 1)
2951 queue_index = map->queues[0];
2952 else
2953 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
2954 map->len)];
2955 if (unlikely(queue_index >= dev->real_num_tx_queues))
2956 queue_index = -1;
2957 }
2958 }
2959 rcu_read_unlock();
2960
2961 return queue_index;
2962#else
2963 return -1;
2964#endif
2965}
2966
2967static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
2968{
2969 struct sock *sk = skb->sk;
2970 int queue_index = sk_tx_queue_get(sk);
2971
2972 if (queue_index < 0 || skb->ooo_okay ||
2973 queue_index >= dev->real_num_tx_queues) {
2974 int new_index = get_xps_queue(dev, skb);
2975 if (new_index < 0)
2976 new_index = skb_tx_hash(dev, skb);
2977
2978 if (queue_index != new_index && sk &&
2979 rcu_access_pointer(sk->sk_dst_cache))
2980 sk_tx_queue_set(sk, new_index);
2981
2982 queue_index = new_index;
2983 }
2984
2985 return queue_index;
2986}
2987
2988struct netdev_queue *netdev_pick_tx(struct net_device *dev,
2989 struct sk_buff *skb,
2990 void *accel_priv)
2991{
2992 int queue_index = 0;
2993
2994#ifdef CONFIG_XPS
2995 if (skb->sender_cpu == 0)
2996 skb->sender_cpu = raw_smp_processor_id() + 1;
2997#endif
2998
2999 if (dev->real_num_tx_queues != 1) {
3000 const struct net_device_ops *ops = dev->netdev_ops;
3001 if (ops->ndo_select_queue)
3002 queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3003 __netdev_pick_tx);
3004 else
3005 queue_index = __netdev_pick_tx(dev, skb);
3006
3007 if (!accel_priv)
3008 queue_index = netdev_cap_txqueue(dev, queue_index);
3009 }
3010
3011 skb_set_queue_mapping(skb, queue_index);
3012 return netdev_get_tx_queue(dev, queue_index);
3013}
3014
d29f749e 3015/**
9d08dd3d 3016 * __dev_queue_xmit - transmit a buffer
d29f749e 3017 * @skb: buffer to transmit
9d08dd3d 3018 * @accel_priv: private data used for L2 forwarding offload
d29f749e
DJ
3019 *
3020 * Queue a buffer for transmission to a network device. The caller must
3021 * have set the device and priority and built the buffer before calling
3022 * this function. The function can be called from an interrupt.
3023 *
3024 * A negative errno code is returned on a failure. A success does not
3025 * guarantee the frame will be transmitted as it may be dropped due
3026 * to congestion or traffic shaping.
3027 *
3028 * -----------------------------------------------------------------------------------
3029 * I notice this method can also return errors from the queue disciplines,
3030 * including NET_XMIT_DROP, which is a positive value. So, errors can also
3031 * be positive.
3032 *
3033 * Regardless of the return value, the skb is consumed, so it is currently
3034 * difficult to retry a send to this method. (You can bump the ref count
3035 * before sending to hold a reference for retry if you are careful.)
3036 *
3037 * When calling this method, interrupts MUST be enabled. This is because
3038 * the BH enable code must have IRQs enabled so that it will not deadlock.
3039 * --BLG
3040 */
0a59f3a9 3041static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
1da177e4
LT
3042{
3043 struct net_device *dev = skb->dev;
dc2b4847 3044 struct netdev_queue *txq;
1da177e4
LT
3045 struct Qdisc *q;
3046 int rc = -ENOMEM;
3047
6d1ccff6
ED
3048 skb_reset_mac_header(skb);
3049
e7fd2885
WB
3050 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3051 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3052
4ec93edb
YH
3053 /* Disable soft irqs for various locks below. Also
3054 * stops preemption for RCU.
1da177e4 3055 */
4ec93edb 3056 rcu_read_lock_bh();
1da177e4 3057
5bc1421e
NH
3058 skb_update_prio(skb);
3059
02875878
ED
3060 /* If device/qdisc don't need skb->dst, release it right now while
 3061 * it's hot in this cpu cache.
3062 */
3063 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3064 skb_dst_drop(skb);
3065 else
3066 skb_dst_force(skb);
3067
f663dd9a 3068 txq = netdev_pick_tx(dev, skb, accel_priv);
a898def2 3069 q = rcu_dereference_bh(txq->qdisc);
37437bb2 3070
1da177e4 3071#ifdef CONFIG_NET_CLS_ACT
d1b19dff 3072 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1da177e4 3073#endif
cf66ba58 3074 trace_net_dev_queue(skb);
1da177e4 3075 if (q->enqueue) {
bbd8a0d3 3076 rc = __dev_xmit_skb(skb, q, dev, txq);
37437bb2 3077 goto out;
1da177e4
LT
3078 }
3079
3080 /* The device has no queue. Common case for software devices:
3081 loopback, all the sorts of tunnels...
3082
932ff279
HX
3083 Really, it is unlikely that netif_tx_lock protection is necessary
3084 here. (f.e. loopback and IP tunnels are clean ignoring statistics
1da177e4
LT
3085 counters.)
 3086 However, it is possible that they rely on the protection
 3087 made by us here.
 3088
 3089 Check this and shoot the lock. It is not prone to deadlocks.
 3090 Or just shoot the noqueue qdisc; it is even simpler 8)
3091 */
3092 if (dev->flags & IFF_UP) {
3093 int cpu = smp_processor_id(); /* ok because BHs are off */
3094
c773e847 3095 if (txq->xmit_lock_owner != cpu) {
1da177e4 3096
745e20f1
ED
3097 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
3098 goto recursion_alert;
3099
1f59533f
JDB
3100 skb = validate_xmit_skb(skb, dev);
3101 if (!skb)
3102 goto drop;
3103
c773e847 3104 HARD_TX_LOCK(dev, txq, cpu);
1da177e4 3105
73466498 3106 if (!netif_xmit_stopped(txq)) {
745e20f1 3107 __this_cpu_inc(xmit_recursion);
ce93718f 3108 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
745e20f1 3109 __this_cpu_dec(xmit_recursion);
572a9d7b 3110 if (dev_xmit_complete(rc)) {
c773e847 3111 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
3112 goto out;
3113 }
3114 }
c773e847 3115 HARD_TX_UNLOCK(dev, txq);
e87cc472
JP
3116 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3117 dev->name);
1da177e4
LT
3118 } else {
3119 /* Recursion is detected! It is possible,
745e20f1
ED
3120 * unfortunately
3121 */
3122recursion_alert:
e87cc472
JP
3123 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3124 dev->name);
1da177e4
LT
3125 }
3126 }
3127
3128 rc = -ENETDOWN;
1f59533f 3129drop:
d4828d85 3130 rcu_read_unlock_bh();
1da177e4 3131
015f0688 3132 atomic_long_inc(&dev->tx_dropped);
1f59533f 3133 kfree_skb_list(skb);
1da177e4
LT
3134 return rc;
3135out:
d4828d85 3136 rcu_read_unlock_bh();
1da177e4
LT
3137 return rc;
3138}
f663dd9a 3139
7026b1dd 3140int dev_queue_xmit_sk(struct sock *sk, struct sk_buff *skb)
f663dd9a
JW
3141{
3142 return __dev_queue_xmit(skb, NULL);
3143}
7026b1dd 3144EXPORT_SYMBOL(dev_queue_xmit_sk);
1da177e4 3145
f663dd9a
JW
3146int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3147{
3148 return __dev_queue_xmit(skb, accel_priv);
3149}
3150EXPORT_SYMBOL(dev_queue_xmit_accel);
3151
1da177e4
LT
3152
3153/*=======================================================================
3154 Receiver routines
3155 =======================================================================*/
3156
6b2bedc3 3157int netdev_max_backlog __read_mostly = 1000;
c9e6bc64
ED
3158EXPORT_SYMBOL(netdev_max_backlog);
3159
3b098e2d 3160int netdev_tstamp_prequeue __read_mostly = 1;
6b2bedc3
SH
3161int netdev_budget __read_mostly = 300;
3162int weight_p __read_mostly = 64; /* old backlog weight */
1da177e4 3163
eecfd7c4
ED
3164/* Called with irq disabled */
3165static inline void ____napi_schedule(struct softnet_data *sd,
3166 struct napi_struct *napi)
3167{
3168 list_add_tail(&napi->poll_list, &sd->poll_list);
3169 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3170}
3171
bfb564e7
KK
3172#ifdef CONFIG_RPS
3173
3174/* One global table that all flow-based protocols share. */
6e3f7faf 3175struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
bfb564e7 3176EXPORT_SYMBOL(rps_sock_flow_table);
567e4b79
ED
3177u32 rps_cpu_mask __read_mostly;
3178EXPORT_SYMBOL(rps_cpu_mask);
bfb564e7 3179
c5905afb 3180struct static_key rps_needed __read_mostly;
adc9300e 3181
c445477d
BH
3182static struct rps_dev_flow *
3183set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3184 struct rps_dev_flow *rflow, u16 next_cpu)
3185{
a31196b0 3186 if (next_cpu < nr_cpu_ids) {
c445477d
BH
3187#ifdef CONFIG_RFS_ACCEL
3188 struct netdev_rx_queue *rxqueue;
3189 struct rps_dev_flow_table *flow_table;
3190 struct rps_dev_flow *old_rflow;
3191 u32 flow_id;
3192 u16 rxq_index;
3193 int rc;
3194
3195 /* Should we steer this flow to a different hardware queue? */
69a19ee6
BH
3196 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3197 !(dev->features & NETIF_F_NTUPLE))
c445477d
BH
3198 goto out;
3199 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3200 if (rxq_index == skb_get_rx_queue(skb))
3201 goto out;
3202
3203 rxqueue = dev->_rx + rxq_index;
3204 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3205 if (!flow_table)
3206 goto out;
61b905da 3207 flow_id = skb_get_hash(skb) & flow_table->mask;
c445477d
BH
3208 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3209 rxq_index, flow_id);
3210 if (rc < 0)
3211 goto out;
3212 old_rflow = rflow;
3213 rflow = &flow_table->flows[flow_id];
c445477d
BH
3214 rflow->filter = rc;
3215 if (old_rflow->filter == rflow->filter)
3216 old_rflow->filter = RPS_NO_FILTER;
3217 out:
3218#endif
3219 rflow->last_qtail =
09994d1b 3220 per_cpu(softnet_data, next_cpu).input_queue_head;
c445477d
BH
3221 }
3222
09994d1b 3223 rflow->cpu = next_cpu;
c445477d
BH
3224 return rflow;
3225}
3226
bfb564e7
KK
3227/*
3228 * get_rps_cpu is called from netif_receive_skb and returns the target
3229 * CPU from the RPS map of the receiving queue for a given skb.
3230 * rcu_read_lock must be held on entry.
3231 */
3232static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3233 struct rps_dev_flow **rflowp)
3234{
567e4b79
ED
3235 const struct rps_sock_flow_table *sock_flow_table;
3236 struct netdev_rx_queue *rxqueue = dev->_rx;
bfb564e7 3237 struct rps_dev_flow_table *flow_table;
567e4b79 3238 struct rps_map *map;
bfb564e7 3239 int cpu = -1;
567e4b79 3240 u32 tcpu;
61b905da 3241 u32 hash;
bfb564e7
KK
3242
3243 if (skb_rx_queue_recorded(skb)) {
3244 u16 index = skb_get_rx_queue(skb);
567e4b79 3245
62fe0b40
BH
3246 if (unlikely(index >= dev->real_num_rx_queues)) {
3247 WARN_ONCE(dev->real_num_rx_queues > 1,
3248 "%s received packet on queue %u, but number "
3249 "of RX queues is %u\n",
3250 dev->name, index, dev->real_num_rx_queues);
bfb564e7
KK
3251 goto done;
3252 }
567e4b79
ED
3253 rxqueue += index;
3254 }
bfb564e7 3255
567e4b79
ED
3256 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3257
3258 flow_table = rcu_dereference(rxqueue->rps_flow_table);
6e3f7faf 3259 map = rcu_dereference(rxqueue->rps_map);
567e4b79 3260 if (!flow_table && !map)
bfb564e7
KK
3261 goto done;
3262
2d47b459 3263 skb_reset_network_header(skb);
61b905da
TH
3264 hash = skb_get_hash(skb);
3265 if (!hash)
bfb564e7
KK
3266 goto done;
3267
fec5e652
TH
3268 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3269 if (flow_table && sock_flow_table) {
fec5e652 3270 struct rps_dev_flow *rflow;
567e4b79
ED
3271 u32 next_cpu;
3272 u32 ident;
3273
3274 /* First check into global flow table if there is a match */
3275 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3276 if ((ident ^ hash) & ~rps_cpu_mask)
3277 goto try_rps;
fec5e652 3278
567e4b79
ED
3279 next_cpu = ident & rps_cpu_mask;
3280
3281 /* OK, now we know there is a match,
3282 * we can look at the local (per receive queue) flow table
3283 */
61b905da 3284 rflow = &flow_table->flows[hash & flow_table->mask];
fec5e652
TH
3285 tcpu = rflow->cpu;
3286
fec5e652
TH
3287 /*
3288 * If the desired CPU (where last recvmsg was done) is
3289 * different from current CPU (one in the rx-queue flow
3290 * table entry), switch if one of the following holds:
a31196b0 3291 * - Current CPU is unset (>= nr_cpu_ids).
fec5e652
TH
3292 * - Current CPU is offline.
3293 * - The current CPU's queue tail has advanced beyond the
3294 * last packet that was enqueued using this table entry.
3295 * This guarantees that all previous packets for the flow
3296 * have been dequeued, thus preserving in order delivery.
3297 */
3298 if (unlikely(tcpu != next_cpu) &&
a31196b0 3299 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
fec5e652 3300 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
baefa31d
TH
3301 rflow->last_qtail)) >= 0)) {
3302 tcpu = next_cpu;
c445477d 3303 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
baefa31d 3304 }
c445477d 3305
a31196b0 3306 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
fec5e652
TH
3307 *rflowp = rflow;
3308 cpu = tcpu;
3309 goto done;
3310 }
3311 }
3312
567e4b79
ED
3313try_rps:
3314
0a9627f2 3315 if (map) {
8fc54f68 3316 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
0a9627f2
TH
3317 if (cpu_online(tcpu)) {
3318 cpu = tcpu;
3319 goto done;
3320 }
3321 }
3322
3323done:
0a9627f2
TH
3324 return cpu;
3325}
3326
c445477d
BH
3327#ifdef CONFIG_RFS_ACCEL
3328
3329/**
3330 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3331 * @dev: Device on which the filter was set
3332 * @rxq_index: RX queue index
3333 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3334 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3335 *
3336 * Drivers that implement ndo_rx_flow_steer() should periodically call
3337 * this function for each installed filter and remove the filters for
3338 * which it returns %true.
3339 */
3340bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3341 u32 flow_id, u16 filter_id)
3342{
3343 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3344 struct rps_dev_flow_table *flow_table;
3345 struct rps_dev_flow *rflow;
3346 bool expire = true;
a31196b0 3347 unsigned int cpu;
c445477d
BH
3348
3349 rcu_read_lock();
3350 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3351 if (flow_table && flow_id <= flow_table->mask) {
3352 rflow = &flow_table->flows[flow_id];
3353 cpu = ACCESS_ONCE(rflow->cpu);
a31196b0 3354 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
c445477d
BH
3355 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3356 rflow->last_qtail) <
3357 (int)(10 * flow_table->mask)))
3358 expire = false;
3359 }
3360 rcu_read_unlock();
3361 return expire;
3362}
3363EXPORT_SYMBOL(rps_may_expire_flow);
3364
3365#endif /* CONFIG_RFS_ACCEL */
3366
0a9627f2 3367/* Called from hardirq (IPI) context */
e36fa2f7 3368static void rps_trigger_softirq(void *data)
0a9627f2 3369{
e36fa2f7
ED
3370 struct softnet_data *sd = data;
3371
eecfd7c4 3372 ____napi_schedule(sd, &sd->backlog);
dee42870 3373 sd->received_rps++;
0a9627f2 3374}
e36fa2f7 3375
fec5e652 3376#endif /* CONFIG_RPS */
0a9627f2 3377
e36fa2f7
ED
3378/*
 3379 * Check if this softnet_data structure belongs to another CPU.
3380 * If yes, queue it to our IPI list and return 1
3381 * If no, return 0
3382 */
3383static int rps_ipi_queued(struct softnet_data *sd)
3384{
3385#ifdef CONFIG_RPS
903ceff7 3386 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
e36fa2f7
ED
3387
3388 if (sd != mysd) {
3389 sd->rps_ipi_next = mysd->rps_ipi_list;
3390 mysd->rps_ipi_list = sd;
3391
3392 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3393 return 1;
3394 }
3395#endif /* CONFIG_RPS */
3396 return 0;
3397}
3398
99bbc707
WB
3399#ifdef CONFIG_NET_FLOW_LIMIT
3400int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3401#endif
3402
3403static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3404{
3405#ifdef CONFIG_NET_FLOW_LIMIT
3406 struct sd_flow_limit *fl;
3407 struct softnet_data *sd;
3408 unsigned int old_flow, new_flow;
3409
3410 if (qlen < (netdev_max_backlog >> 1))
3411 return false;
3412
903ceff7 3413 sd = this_cpu_ptr(&softnet_data);
99bbc707
WB
3414
3415 rcu_read_lock();
3416 fl = rcu_dereference(sd->flow_limit);
3417 if (fl) {
3958afa1 3418 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
99bbc707
WB
3419 old_flow = fl->history[fl->history_head];
3420 fl->history[fl->history_head] = new_flow;
3421
3422 fl->history_head++;
3423 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3424
3425 if (likely(fl->buckets[old_flow]))
3426 fl->buckets[old_flow]--;
3427
3428 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3429 fl->count++;
3430 rcu_read_unlock();
3431 return true;
3432 }
3433 }
3434 rcu_read_unlock();
3435#endif
3436 return false;
3437}
3438
0a9627f2
TH
3439/*
3440 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3441 * queue (may be a remote CPU queue).
3442 */
fec5e652
TH
3443static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3444 unsigned int *qtail)
0a9627f2 3445{
e36fa2f7 3446 struct softnet_data *sd;
0a9627f2 3447 unsigned long flags;
99bbc707 3448 unsigned int qlen;
0a9627f2 3449
e36fa2f7 3450 sd = &per_cpu(softnet_data, cpu);
0a9627f2
TH
3451
3452 local_irq_save(flags);
0a9627f2 3453
e36fa2f7 3454 rps_lock(sd);
99bbc707
WB
3455 qlen = skb_queue_len(&sd->input_pkt_queue);
3456 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
e008f3f0 3457 if (qlen) {
0a9627f2 3458enqueue:
e36fa2f7 3459 __skb_queue_tail(&sd->input_pkt_queue, skb);
76cc8b13 3460 input_queue_tail_incr_save(sd, qtail);
e36fa2f7 3461 rps_unlock(sd);
152102c7 3462 local_irq_restore(flags);
0a9627f2
TH
3463 return NET_RX_SUCCESS;
3464 }
3465
ebda37c2
ED
3466 /* Schedule NAPI for backlog device
 3467 * We can use a non-atomic operation since we own the queue lock.
3468 */
3469 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
e36fa2f7 3470 if (!rps_ipi_queued(sd))
eecfd7c4 3471 ____napi_schedule(sd, &sd->backlog);
0a9627f2
TH
3472 }
3473 goto enqueue;
3474 }
3475
dee42870 3476 sd->dropped++;
e36fa2f7 3477 rps_unlock(sd);
0a9627f2 3478
0a9627f2
TH
3479 local_irq_restore(flags);
3480
caf586e5 3481 atomic_long_inc(&skb->dev->rx_dropped);
0a9627f2
TH
3482 kfree_skb(skb);
3483 return NET_RX_DROP;
3484}
1da177e4 3485
ae78dbfa 3486static int netif_rx_internal(struct sk_buff *skb)
1da177e4 3487{
b0e28f1e 3488 int ret;
1da177e4 3489
588f0330 3490 net_timestamp_check(netdev_tstamp_prequeue, skb);
1da177e4 3491
cf66ba58 3492 trace_netif_rx(skb);
df334545 3493#ifdef CONFIG_RPS
c5905afb 3494 if (static_key_false(&rps_needed)) {
fec5e652 3495 struct rps_dev_flow voidflow, *rflow = &voidflow;
b0e28f1e
ED
3496 int cpu;
3497
cece1945 3498 preempt_disable();
b0e28f1e 3499 rcu_read_lock();
fec5e652
TH
3500
3501 cpu = get_rps_cpu(skb->dev, skb, &rflow);
b0e28f1e
ED
3502 if (cpu < 0)
3503 cpu = smp_processor_id();
fec5e652
TH
3504
3505 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3506
b0e28f1e 3507 rcu_read_unlock();
cece1945 3508 preempt_enable();
adc9300e
ED
3509 } else
3510#endif
fec5e652
TH
3511 {
3512 unsigned int qtail;
3513 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3514 put_cpu();
3515 }
b0e28f1e 3516 return ret;
1da177e4 3517}
ae78dbfa
BH
3518
3519/**
3520 * netif_rx - post buffer to the network code
3521 * @skb: buffer to post
3522 *
3523 * This function receives a packet from a device driver and queues it for
3524 * the upper (protocol) levels to process. It always succeeds. The buffer
3525 * may be dropped during processing for congestion control or by the
3526 * protocol layers.
3527 *
3528 * return values:
3529 * NET_RX_SUCCESS (no congestion)
3530 * NET_RX_DROP (packet was dropped)
3531 *
3532 */
3533
3534int netif_rx(struct sk_buff *skb)
3535{
3536 trace_netif_rx_entry(skb);
3537
3538 return netif_rx_internal(skb);
3539}
d1b19dff 3540EXPORT_SYMBOL(netif_rx);
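/* Illustrative example -- not part of dev.c.  A minimal sketch of how a
 * hypothetical legacy (non-NAPI) driver could hand a received frame to
 * netif_rx() from its interrupt handler.  my_hw_rx_len() and
 * my_hw_copy_rx() are assumed device-specific helpers, used only to keep
 * the sketch short.
 */
static void my_driver_rx(struct net_device *dev)
{
	struct sk_buff *skb;
	unsigned int len = my_hw_rx_len(dev);		/* assumed helper */

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	my_hw_copy_rx(dev, skb_put(skb, len));		/* assumed helper */
	skb->protocol = eth_type_trans(skb, dev);

	/* Queues to the per-CPU backlog; the packet may still be dropped
	 * later for congestion control.
	 */
	netif_rx(skb);
}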
1da177e4
LT
3541
3542int netif_rx_ni(struct sk_buff *skb)
3543{
3544 int err;
3545
ae78dbfa
BH
3546 trace_netif_rx_ni_entry(skb);
3547
1da177e4 3548 preempt_disable();
ae78dbfa 3549 err = netif_rx_internal(skb);
1da177e4
LT
3550 if (local_softirq_pending())
3551 do_softirq();
3552 preempt_enable();
3553
3554 return err;
3555}
1da177e4
LT
3556EXPORT_SYMBOL(netif_rx_ni);
3557
1da177e4
LT
3558static void net_tx_action(struct softirq_action *h)
3559{
903ceff7 3560 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
1da177e4
LT
3561
3562 if (sd->completion_queue) {
3563 struct sk_buff *clist;
3564
3565 local_irq_disable();
3566 clist = sd->completion_queue;
3567 sd->completion_queue = NULL;
3568 local_irq_enable();
3569
3570 while (clist) {
3571 struct sk_buff *skb = clist;
3572 clist = clist->next;
3573
547b792c 3574 WARN_ON(atomic_read(&skb->users));
e6247027
ED
3575 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3576 trace_consume_skb(skb);
3577 else
3578 trace_kfree_skb(skb, net_tx_action);
1da177e4
LT
3579 __kfree_skb(skb);
3580 }
3581 }
3582
3583 if (sd->output_queue) {
37437bb2 3584 struct Qdisc *head;
1da177e4
LT
3585
3586 local_irq_disable();
3587 head = sd->output_queue;
3588 sd->output_queue = NULL;
a9cbd588 3589 sd->output_queue_tailp = &sd->output_queue;
1da177e4
LT
3590 local_irq_enable();
3591
3592 while (head) {
37437bb2
DM
3593 struct Qdisc *q = head;
3594 spinlock_t *root_lock;
3595
1da177e4
LT
3596 head = head->next_sched;
3597
5fb66229 3598 root_lock = qdisc_lock(q);
37437bb2 3599 if (spin_trylock(root_lock)) {
4e857c58 3600 smp_mb__before_atomic();
def82a1d
JP
3601 clear_bit(__QDISC_STATE_SCHED,
3602 &q->state);
37437bb2
DM
3603 qdisc_run(q);
3604 spin_unlock(root_lock);
1da177e4 3605 } else {
195648bb 3606 if (!test_bit(__QDISC_STATE_DEACTIVATED,
e8a83e10 3607 &q->state)) {
195648bb 3608 __netif_reschedule(q);
e8a83e10 3609 } else {
4e857c58 3610 smp_mb__before_atomic();
e8a83e10
JP
3611 clear_bit(__QDISC_STATE_SCHED,
3612 &q->state);
3613 }
1da177e4
LT
3614 }
3615 }
3616 }
3617}
3618
ab95bfe0
JP
3619#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3620 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
da678292
MM
3621/* This hook is defined here for ATM LANE */
3622int (*br_fdb_test_addr_hook)(struct net_device *dev,
3623 unsigned char *addr) __read_mostly;
4fb019a0 3624EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
da678292 3625#endif
1da177e4 3626
f697c3e8
HX
3627static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3628 struct packet_type **pt_prev,
3629 int *ret, struct net_device *orig_dev)
3630{
e7582bab 3631#ifdef CONFIG_NET_CLS_ACT
d2788d34
DB
3632 struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3633 struct tcf_result cl_res;
24824a09 3634
c9e99fd0
DB
3635 /* If there's at least one ingress present somewhere (so
3636 * we get here via enabled static key), remaining devices
3637 * that are not configured with an ingress qdisc will bail
d2788d34 3638 * out here.
c9e99fd0 3639 */
d2788d34 3640 if (!cl)
4577139b 3641 return skb;
f697c3e8
HX
3642 if (*pt_prev) {
3643 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3644 *pt_prev = NULL;
1da177e4
LT
3645 }
3646
3365495c 3647 qdisc_skb_cb(skb)->pkt_len = skb->len;
c9e99fd0 3648 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3365495c 3649 qdisc_bstats_update_cpu(cl->q, skb);
c9e99fd0 3650
d2788d34
DB
3651 switch (tc_classify(skb, cl, &cl_res)) {
3652 case TC_ACT_OK:
3653 case TC_ACT_RECLASSIFY:
3654 skb->tc_index = TC_H_MIN(cl_res.classid);
3655 break;
3656 case TC_ACT_SHOT:
3657 qdisc_qstats_drop_cpu(cl->q);
3658 case TC_ACT_STOLEN:
3659 case TC_ACT_QUEUED:
3660 kfree_skb(skb);
3661 return NULL;
3662 default:
3663 break;
f697c3e8 3664 }
e7582bab 3665#endif /* CONFIG_NET_CLS_ACT */
e687ad60
PN
3666 return skb;
3667}
1da177e4 3668
ab95bfe0
JP
3669/**
3670 * netdev_rx_handler_register - register receive handler
3671 * @dev: device to register a handler for
3672 * @rx_handler: receive handler to register
93e2c32b 3673 * @rx_handler_data: data pointer that is used by rx handler
ab95bfe0 3674 *
e227867f 3675 * Register a receive handler for a device. This handler will then be
ab95bfe0
JP
3676 * called from __netif_receive_skb. A negative errno code is returned
3677 * on a failure.
3678 *
3679 * The caller must hold the rtnl_mutex.
8a4eb573
JP
3680 *
3681 * For a general description of rx_handler, see enum rx_handler_result.
ab95bfe0
JP
3682 */
3683int netdev_rx_handler_register(struct net_device *dev,
93e2c32b
JP
3684 rx_handler_func_t *rx_handler,
3685 void *rx_handler_data)
ab95bfe0
JP
3686{
3687 ASSERT_RTNL();
3688
3689 if (dev->rx_handler)
3690 return -EBUSY;
3691
00cfec37 3692 /* Note: rx_handler_data must be set before rx_handler */
93e2c32b 3693 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
ab95bfe0
JP
3694 rcu_assign_pointer(dev->rx_handler, rx_handler);
3695
3696 return 0;
3697}
3698EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
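/* Illustrative example -- not part of dev.c.  A minimal sketch of an
 * rx_handler and its registration, loosely modelled on bridge/team-style
 * drivers.  struct my_port, my_port_wants() and my_port_attach() are
 * assumed names, not kernel APIs.
 */
struct my_port {
	struct net_device *upper_dev;
};

static rx_handler_result_t my_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct my_port *port = rcu_dereference(skb->dev->rx_handler_data);

	if (!my_port_wants(port, skb))		/* assumed helper */
		return RX_HANDLER_PASS;		/* continue normal delivery */

	skb->dev = port->upper_dev;		/* steer to the aggregate device */
	return RX_HANDLER_ANOTHER;		/* re-run the receive path */
}

static int my_port_attach(struct net_device *dev, struct my_port *port)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(dev, my_handle_frame, port);
	rtnl_unlock();
	return err;
}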
3699
3700/**
3701 * netdev_rx_handler_unregister - unregister receive handler
3702 * @dev: device to unregister a handler from
3703 *
166ec369 3704 * Unregister a receive handler from a device.
ab95bfe0
JP
3705 *
3706 * The caller must hold the rtnl_mutex.
3707 */
3708void netdev_rx_handler_unregister(struct net_device *dev)
3709{
3710
3711 ASSERT_RTNL();
a9b3cd7f 3712 RCU_INIT_POINTER(dev->rx_handler, NULL);
00cfec37
ED
3713 /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3714 * section has a guarantee to see a non NULL rx_handler_data
3715 * as well.
3716 */
3717 synchronize_net();
a9b3cd7f 3718 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
ab95bfe0
JP
3719}
3720EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3721
b4b9e355
MG
3722/*
3723 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3724 * the special handling of PFMEMALLOC skbs.
3725 */
3726static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3727{
3728 switch (skb->protocol) {
2b8837ae
JP
3729 case htons(ETH_P_ARP):
3730 case htons(ETH_P_IP):
3731 case htons(ETH_P_IPV6):
3732 case htons(ETH_P_8021Q):
3733 case htons(ETH_P_8021AD):
b4b9e355
MG
3734 return true;
3735 default:
3736 return false;
3737 }
3738}
3739
e687ad60
PN
3740static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
3741 int *ret, struct net_device *orig_dev)
3742{
e7582bab 3743#ifdef CONFIG_NETFILTER_INGRESS
e687ad60
PN
3744 if (nf_hook_ingress_active(skb)) {
3745 if (*pt_prev) {
3746 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3747 *pt_prev = NULL;
3748 }
3749
3750 return nf_hook_ingress(skb);
3751 }
e7582bab 3752#endif /* CONFIG_NETFILTER_INGRESS */
e687ad60
PN
3753 return 0;
3754}
e687ad60 3755
9754e293 3756static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
1da177e4
LT
3757{
3758 struct packet_type *ptype, *pt_prev;
ab95bfe0 3759 rx_handler_func_t *rx_handler;
f2ccd8fa 3760 struct net_device *orig_dev;
8a4eb573 3761 bool deliver_exact = false;
1da177e4 3762 int ret = NET_RX_DROP;
252e3346 3763 __be16 type;
1da177e4 3764
588f0330 3765 net_timestamp_check(!netdev_tstamp_prequeue, skb);
81bbb3d4 3766
cf66ba58 3767 trace_netif_receive_skb(skb);
9b22ea56 3768
cc9bd5ce 3769 orig_dev = skb->dev;
8f903c70 3770
c1d2bbe1 3771 skb_reset_network_header(skb);
fda55eca
ED
3772 if (!skb_transport_header_was_set(skb))
3773 skb_reset_transport_header(skb);
0b5c9db1 3774 skb_reset_mac_len(skb);
1da177e4
LT
3775
3776 pt_prev = NULL;
3777
3778 rcu_read_lock();
3779
63d8ea7f 3780another_round:
b6858177 3781 skb->skb_iif = skb->dev->ifindex;
63d8ea7f
DM
3782
3783 __this_cpu_inc(softnet_data.processed);
3784
8ad227ff
PM
3785 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3786 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
0d5501c1 3787 skb = skb_vlan_untag(skb);
bcc6d479 3788 if (unlikely(!skb))
b4b9e355 3789 goto unlock;
bcc6d479
JP
3790 }
3791
1da177e4
LT
3792#ifdef CONFIG_NET_CLS_ACT
3793 if (skb->tc_verd & TC_NCLS) {
3794 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3795 goto ncls;
3796 }
3797#endif
3798
9754e293 3799 if (pfmemalloc)
b4b9e355
MG
3800 goto skip_taps;
3801
1da177e4 3802 list_for_each_entry_rcu(ptype, &ptype_all, list) {
7866a621
SN
3803 if (pt_prev)
3804 ret = deliver_skb(skb, pt_prev, orig_dev);
3805 pt_prev = ptype;
3806 }
3807
3808 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3809 if (pt_prev)
3810 ret = deliver_skb(skb, pt_prev, orig_dev);
3811 pt_prev = ptype;
1da177e4
LT
3812 }
3813
b4b9e355 3814skip_taps:
1cf51900 3815#ifdef CONFIG_NET_INGRESS
4577139b
DB
3816 if (static_key_false(&ingress_needed)) {
3817 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3818 if (!skb)
3819 goto unlock;
e687ad60
PN
3820
3821 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
3822 goto unlock;
4577139b 3823 }
1cf51900
PN
3824#endif
3825#ifdef CONFIG_NET_CLS_ACT
4577139b 3826 skb->tc_verd = 0;
1da177e4
LT
3827ncls:
3828#endif
9754e293 3829 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
b4b9e355
MG
3830 goto drop;
3831
df8a39de 3832 if (skb_vlan_tag_present(skb)) {
2425717b
JF
3833 if (pt_prev) {
3834 ret = deliver_skb(skb, pt_prev, orig_dev);
3835 pt_prev = NULL;
3836 }
48cc32d3 3837 if (vlan_do_receive(&skb))
2425717b
JF
3838 goto another_round;
3839 else if (unlikely(!skb))
b4b9e355 3840 goto unlock;
2425717b
JF
3841 }
3842
48cc32d3 3843 rx_handler = rcu_dereference(skb->dev->rx_handler);
ab95bfe0
JP
3844 if (rx_handler) {
3845 if (pt_prev) {
3846 ret = deliver_skb(skb, pt_prev, orig_dev);
3847 pt_prev = NULL;
3848 }
8a4eb573
JP
3849 switch (rx_handler(&skb)) {
3850 case RX_HANDLER_CONSUMED:
3bc1b1ad 3851 ret = NET_RX_SUCCESS;
b4b9e355 3852 goto unlock;
8a4eb573 3853 case RX_HANDLER_ANOTHER:
63d8ea7f 3854 goto another_round;
8a4eb573
JP
3855 case RX_HANDLER_EXACT:
3856 deliver_exact = true;
3857 case RX_HANDLER_PASS:
3858 break;
3859 default:
3860 BUG();
3861 }
ab95bfe0 3862 }
1da177e4 3863
df8a39de
JP
3864 if (unlikely(skb_vlan_tag_present(skb))) {
3865 if (skb_vlan_tag_get_id(skb))
d4b812de
ED
3866 skb->pkt_type = PACKET_OTHERHOST;
3867 /* Note: we might in the future use prio bits
3868 * and set skb->priority like in vlan_do_receive()
3869 * For the time being, just ignore Priority Code Point
3870 */
3871 skb->vlan_tci = 0;
3872 }
48cc32d3 3873
7866a621
SN
3874 type = skb->protocol;
3875
63d8ea7f 3876 /* deliver only exact match when indicated */
7866a621
SN
3877 if (likely(!deliver_exact)) {
3878 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3879 &ptype_base[ntohs(type) &
3880 PTYPE_HASH_MASK]);
3881 }
1f3c8804 3882
7866a621
SN
3883 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3884 &orig_dev->ptype_specific);
3885
3886 if (unlikely(skb->dev != orig_dev)) {
3887 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3888 &skb->dev->ptype_specific);
1da177e4
LT
3889 }
3890
3891 if (pt_prev) {
1080e512 3892 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
0e698bf6 3893 goto drop;
1080e512
MT
3894 else
3895 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4 3896 } else {
b4b9e355 3897drop:
caf586e5 3898 atomic_long_inc(&skb->dev->rx_dropped);
1da177e4
LT
3899 kfree_skb(skb);
3900		/* Jamal, now you will not be able to escape explaining
3901		 * to me how you were going to use this. :-)
3902 */
3903 ret = NET_RX_DROP;
3904 }
3905
b4b9e355 3906unlock:
1da177e4 3907 rcu_read_unlock();
9754e293
DM
3908 return ret;
3909}
3910
3911static int __netif_receive_skb(struct sk_buff *skb)
3912{
3913 int ret;
3914
3915 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3916 unsigned long pflags = current->flags;
3917
3918 /*
3919 * PFMEMALLOC skbs are special, they should
3920 * - be delivered to SOCK_MEMALLOC sockets only
3921 * - stay away from userspace
3922 * - have bounded memory usage
3923 *
3924 * Use PF_MEMALLOC as this saves us from propagating the allocation
3925 * context down to all allocation sites.
3926 */
3927 current->flags |= PF_MEMALLOC;
3928 ret = __netif_receive_skb_core(skb, true);
3929 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3930 } else
3931 ret = __netif_receive_skb_core(skb, false);
3932
1da177e4
LT
3933 return ret;
3934}
0a9627f2 3935
ae78dbfa 3936static int netif_receive_skb_internal(struct sk_buff *skb)
0a9627f2 3937{
588f0330 3938 net_timestamp_check(netdev_tstamp_prequeue, skb);
3b098e2d 3939
c1f19b51
RC
3940 if (skb_defer_rx_timestamp(skb))
3941 return NET_RX_SUCCESS;
3942
df334545 3943#ifdef CONFIG_RPS
c5905afb 3944 if (static_key_false(&rps_needed)) {
3b098e2d
ED
3945 struct rps_dev_flow voidflow, *rflow = &voidflow;
3946 int cpu, ret;
fec5e652 3947
3b098e2d
ED
3948 rcu_read_lock();
3949
3950 cpu = get_rps_cpu(skb->dev, skb, &rflow);
0a9627f2 3951
3b098e2d
ED
3952 if (cpu >= 0) {
3953 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3954 rcu_read_unlock();
adc9300e 3955 return ret;
3b098e2d 3956 }
adc9300e 3957 rcu_read_unlock();
fec5e652 3958 }
1e94d72f 3959#endif
adc9300e 3960 return __netif_receive_skb(skb);
0a9627f2 3961}
ae78dbfa
BH
3962
3963/**
3964 * netif_receive_skb - process receive buffer from network
3965 * @skb: buffer to process
3966 *
3967 * netif_receive_skb() is the main receive data processing function.
3968 * It always succeeds. The buffer may be dropped during processing
3969 * for congestion control or by the protocol layers.
3970 *
3971 * This function may only be called from softirq context and interrupts
3972 * should be enabled.
3973 *
3974 * Return values (usually ignored):
3975 * NET_RX_SUCCESS: no congestion
3976 * NET_RX_DROP: packet was dropped
3977 */
7026b1dd 3978int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb)
ae78dbfa
BH
3979{
3980 trace_netif_receive_skb_entry(skb);
3981
3982 return netif_receive_skb_internal(skb);
3983}
7026b1dd 3984EXPORT_SYMBOL(netif_receive_skb_sk);
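/* Illustrative example -- not part of dev.c.  A minimal sketch of a virtual
 * device delivering a decapsulated frame from softirq context.  It assumes
 * the usual netif_receive_skb() inline wrapper (which passes skb->sk to
 * netif_receive_skb_sk()); my_decap() is an assumed helper.
 */
static int my_tunnel_deliver(struct net_device *dev, struct sk_buff *skb)
{
	if (my_decap(skb) < 0) {		/* assumed helper */
		kfree_skb(skb);
		return NET_RX_DROP;
	}

	skb->dev = dev;
	skb->protocol = eth_type_trans(skb, dev);

	/* Runs the full RX path: RPS, taps, rx_handler, protocol handlers. */
	return netif_receive_skb(skb);
}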
1da177e4 3985
88751275
ED
3986/* Network device is going away, flush any packets still pending
3987 * Called with irqs disabled.
3988 */
152102c7 3989static void flush_backlog(void *arg)
6e583ce5 3990{
152102c7 3991 struct net_device *dev = arg;
903ceff7 3992 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6e583ce5
SH
3993 struct sk_buff *skb, *tmp;
3994
e36fa2f7 3995 rps_lock(sd);
6e7676c1 3996 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
6e583ce5 3997 if (skb->dev == dev) {
e36fa2f7 3998 __skb_unlink(skb, &sd->input_pkt_queue);
6e583ce5 3999 kfree_skb(skb);
76cc8b13 4000 input_queue_head_incr(sd);
6e583ce5 4001 }
6e7676c1 4002 }
e36fa2f7 4003 rps_unlock(sd);
6e7676c1
CG
4004
4005 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4006 if (skb->dev == dev) {
4007 __skb_unlink(skb, &sd->process_queue);
4008 kfree_skb(skb);
76cc8b13 4009 input_queue_head_incr(sd);
6e7676c1
CG
4010 }
4011 }
6e583ce5
SH
4012}
4013
d565b0a1
HX
4014static int napi_gro_complete(struct sk_buff *skb)
4015{
22061d80 4016 struct packet_offload *ptype;
d565b0a1 4017 __be16 type = skb->protocol;
22061d80 4018 struct list_head *head = &offload_base;
d565b0a1
HX
4019 int err = -ENOENT;
4020
c3c7c254
ED
4021 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4022
fc59f9a3
HX
4023 if (NAPI_GRO_CB(skb)->count == 1) {
4024 skb_shinfo(skb)->gso_size = 0;
d565b0a1 4025 goto out;
fc59f9a3 4026 }
d565b0a1
HX
4027
4028 rcu_read_lock();
4029 list_for_each_entry_rcu(ptype, head, list) {
f191a1d1 4030 if (ptype->type != type || !ptype->callbacks.gro_complete)
d565b0a1
HX
4031 continue;
4032
299603e8 4033 err = ptype->callbacks.gro_complete(skb, 0);
d565b0a1
HX
4034 break;
4035 }
4036 rcu_read_unlock();
4037
4038 if (err) {
4039 WARN_ON(&ptype->list == head);
4040 kfree_skb(skb);
4041 return NET_RX_SUCCESS;
4042 }
4043
4044out:
ae78dbfa 4045 return netif_receive_skb_internal(skb);
d565b0a1
HX
4046}
4047
2e71a6f8
ED
4048/* napi->gro_list contains packets ordered by age.
4049 * The youngest packets are at the head of it.
4050 * Complete skbs in reverse order to reduce latencies.
4051 */
4052void napi_gro_flush(struct napi_struct *napi, bool flush_old)
d565b0a1 4053{
2e71a6f8 4054 struct sk_buff *skb, *prev = NULL;
d565b0a1 4055
2e71a6f8
ED
4056 /* scan list and build reverse chain */
4057 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4058 skb->prev = prev;
4059 prev = skb;
4060 }
4061
4062 for (skb = prev; skb; skb = prev) {
d565b0a1 4063 skb->next = NULL;
2e71a6f8
ED
4064
4065 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4066 return;
4067
4068 prev = skb->prev;
d565b0a1 4069 napi_gro_complete(skb);
2e71a6f8 4070 napi->gro_count--;
d565b0a1
HX
4071 }
4072
4073 napi->gro_list = NULL;
4074}
86cac58b 4075EXPORT_SYMBOL(napi_gro_flush);
d565b0a1 4076
89c5fa33
ED
4077static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4078{
4079 struct sk_buff *p;
4080 unsigned int maclen = skb->dev->hard_header_len;
0b4cec8c 4081 u32 hash = skb_get_hash_raw(skb);
89c5fa33
ED
4082
4083 for (p = napi->gro_list; p; p = p->next) {
4084 unsigned long diffs;
4085
0b4cec8c
TH
4086 NAPI_GRO_CB(p)->flush = 0;
4087
4088 if (hash != skb_get_hash_raw(p)) {
4089 NAPI_GRO_CB(p)->same_flow = 0;
4090 continue;
4091 }
4092
89c5fa33
ED
4093 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4094 diffs |= p->vlan_tci ^ skb->vlan_tci;
4095 if (maclen == ETH_HLEN)
4096 diffs |= compare_ether_header(skb_mac_header(p),
a50e233c 4097 skb_mac_header(skb));
89c5fa33
ED
4098 else if (!diffs)
4099 diffs = memcmp(skb_mac_header(p),
a50e233c 4100 skb_mac_header(skb),
89c5fa33
ED
4101 maclen);
4102 NAPI_GRO_CB(p)->same_flow = !diffs;
89c5fa33
ED
4103 }
4104}
4105
299603e8
JC
4106static void skb_gro_reset_offset(struct sk_buff *skb)
4107{
4108 const struct skb_shared_info *pinfo = skb_shinfo(skb);
4109 const skb_frag_t *frag0 = &pinfo->frags[0];
4110
4111 NAPI_GRO_CB(skb)->data_offset = 0;
4112 NAPI_GRO_CB(skb)->frag0 = NULL;
4113 NAPI_GRO_CB(skb)->frag0_len = 0;
4114
4115 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4116 pinfo->nr_frags &&
4117 !PageHighMem(skb_frag_page(frag0))) {
4118 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4119 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
89c5fa33
ED
4120 }
4121}
4122
a50e233c
ED
4123static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4124{
4125 struct skb_shared_info *pinfo = skb_shinfo(skb);
4126
4127 BUG_ON(skb->end - skb->tail < grow);
4128
4129 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4130
4131 skb->data_len -= grow;
4132 skb->tail += grow;
4133
4134 pinfo->frags[0].page_offset += grow;
4135 skb_frag_size_sub(&pinfo->frags[0], grow);
4136
4137 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4138 skb_frag_unref(skb, 0);
4139 memmove(pinfo->frags, pinfo->frags + 1,
4140 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4141 }
4142}
4143
bb728820 4144static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
d565b0a1
HX
4145{
4146 struct sk_buff **pp = NULL;
22061d80 4147 struct packet_offload *ptype;
d565b0a1 4148 __be16 type = skb->protocol;
22061d80 4149 struct list_head *head = &offload_base;
0da2afd5 4150 int same_flow;
5b252f0c 4151 enum gro_result ret;
a50e233c 4152 int grow;
d565b0a1 4153
9c62a68d 4154 if (!(skb->dev->features & NETIF_F_GRO))
d565b0a1
HX
4155 goto normal;
4156
5a212329 4157 if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
f17f5c91
HX
4158 goto normal;
4159
89c5fa33
ED
4160 gro_list_prepare(napi, skb);
4161
d565b0a1
HX
4162 rcu_read_lock();
4163 list_for_each_entry_rcu(ptype, head, list) {
f191a1d1 4164 if (ptype->type != type || !ptype->callbacks.gro_receive)
d565b0a1
HX
4165 continue;
4166
86911732 4167 skb_set_network_header(skb, skb_gro_offset(skb));
efd9450e 4168 skb_reset_mac_len(skb);
d565b0a1
HX
4169 NAPI_GRO_CB(skb)->same_flow = 0;
4170 NAPI_GRO_CB(skb)->flush = 0;
5d38a079 4171 NAPI_GRO_CB(skb)->free = 0;
b582ef09 4172 NAPI_GRO_CB(skb)->udp_mark = 0;
15e2396d 4173 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
d565b0a1 4174
662880f4
TH
4175 /* Setup for GRO checksum validation */
4176 switch (skb->ip_summed) {
4177 case CHECKSUM_COMPLETE:
4178 NAPI_GRO_CB(skb)->csum = skb->csum;
4179 NAPI_GRO_CB(skb)->csum_valid = 1;
4180 NAPI_GRO_CB(skb)->csum_cnt = 0;
4181 break;
4182 case CHECKSUM_UNNECESSARY:
4183 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4184 NAPI_GRO_CB(skb)->csum_valid = 0;
4185 break;
4186 default:
4187 NAPI_GRO_CB(skb)->csum_cnt = 0;
4188 NAPI_GRO_CB(skb)->csum_valid = 0;
4189 }
d565b0a1 4190
f191a1d1 4191 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
d565b0a1
HX
4192 break;
4193 }
4194 rcu_read_unlock();
4195
4196 if (&ptype->list == head)
4197 goto normal;
4198
0da2afd5 4199 same_flow = NAPI_GRO_CB(skb)->same_flow;
5d0d9be8 4200 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
0da2afd5 4201
d565b0a1
HX
4202 if (pp) {
4203 struct sk_buff *nskb = *pp;
4204
4205 *pp = nskb->next;
4206 nskb->next = NULL;
4207 napi_gro_complete(nskb);
4ae5544f 4208 napi->gro_count--;
d565b0a1
HX
4209 }
4210
0da2afd5 4211 if (same_flow)
d565b0a1
HX
4212 goto ok;
4213
600adc18 4214 if (NAPI_GRO_CB(skb)->flush)
d565b0a1 4215 goto normal;
d565b0a1 4216
600adc18
ED
4217 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4218 struct sk_buff *nskb = napi->gro_list;
4219
4220 /* locate the end of the list to select the 'oldest' flow */
4221 while (nskb->next) {
4222 pp = &nskb->next;
4223 nskb = *pp;
4224 }
4225 *pp = NULL;
4226 nskb->next = NULL;
4227 napi_gro_complete(nskb);
4228 } else {
4229 napi->gro_count++;
4230 }
d565b0a1 4231 NAPI_GRO_CB(skb)->count = 1;
2e71a6f8 4232 NAPI_GRO_CB(skb)->age = jiffies;
29e98242 4233 NAPI_GRO_CB(skb)->last = skb;
86911732 4234 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
d565b0a1
HX
4235 skb->next = napi->gro_list;
4236 napi->gro_list = skb;
5d0d9be8 4237 ret = GRO_HELD;
d565b0a1 4238
ad0f9904 4239pull:
a50e233c
ED
4240 grow = skb_gro_offset(skb) - skb_headlen(skb);
4241 if (grow > 0)
4242 gro_pull_from_frag0(skb, grow);
d565b0a1 4243ok:
5d0d9be8 4244 return ret;
d565b0a1
HX
4245
4246normal:
ad0f9904
HX
4247 ret = GRO_NORMAL;
4248 goto pull;
5d38a079 4249}
96e93eab 4250
bf5a755f
JC
4251struct packet_offload *gro_find_receive_by_type(__be16 type)
4252{
4253 struct list_head *offload_head = &offload_base;
4254 struct packet_offload *ptype;
4255
4256 list_for_each_entry_rcu(ptype, offload_head, list) {
4257 if (ptype->type != type || !ptype->callbacks.gro_receive)
4258 continue;
4259 return ptype;
4260 }
4261 return NULL;
4262}
e27a2f83 4263EXPORT_SYMBOL(gro_find_receive_by_type);
bf5a755f
JC
4264
4265struct packet_offload *gro_find_complete_by_type(__be16 type)
4266{
4267 struct list_head *offload_head = &offload_base;
4268 struct packet_offload *ptype;
4269
4270 list_for_each_entry_rcu(ptype, offload_head, list) {
4271 if (ptype->type != type || !ptype->callbacks.gro_complete)
4272 continue;
4273 return ptype;
4274 }
4275 return NULL;
4276}
e27a2f83 4277EXPORT_SYMBOL(gro_find_complete_by_type);
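/* Illustrative example -- not part of dev.c.  A minimal sketch of how an
 * encapsulation offload might use gro_find_receive_by_type() to hand the
 * inner packet to the matching offload, in the style of the GRE/UDP
 * offloads.  MY_ENCAP_HLEN and my_encap_gro_receive() are assumed names;
 * header validation and flush handling are intentionally omitted.
 */
#define MY_ENCAP_HLEN	8	/* assumed encapsulation header length */

static struct sk_buff **my_encap_gro_receive(struct sk_buff **head,
					     struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_offload *ptype;
	__be16 type = htons(ETH_P_IP);	/* assumed inner protocol */

	rcu_read_lock();
	ptype = gro_find_receive_by_type(type);
	if (!ptype)
		goto out_unlock;

	skb_gro_pull(skb, MY_ENCAP_HLEN);
	pp = ptype->callbacks.gro_receive(head, skb);

out_unlock:
	rcu_read_unlock();
	return pp;
}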
5d38a079 4278
bb728820 4279static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5d38a079 4280{
5d0d9be8
HX
4281 switch (ret) {
4282 case GRO_NORMAL:
ae78dbfa 4283 if (netif_receive_skb_internal(skb))
c7c4b3b6
BH
4284 ret = GRO_DROP;
4285 break;
5d38a079 4286
5d0d9be8 4287 case GRO_DROP:
5d38a079
HX
4288 kfree_skb(skb);
4289 break;
5b252f0c 4290
daa86548 4291 case GRO_MERGED_FREE:
d7e8883c
ED
4292 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4293 kmem_cache_free(skbuff_head_cache, skb);
4294 else
4295 __kfree_skb(skb);
daa86548
ED
4296 break;
4297
5b252f0c
BH
4298 case GRO_HELD:
4299 case GRO_MERGED:
4300 break;
5d38a079
HX
4301 }
4302
c7c4b3b6 4303 return ret;
5d0d9be8 4304}
5d0d9be8 4305
c7c4b3b6 4306gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5d0d9be8 4307{
ae78dbfa 4308 trace_napi_gro_receive_entry(skb);
86911732 4309
a50e233c
ED
4310 skb_gro_reset_offset(skb);
4311
89c5fa33 4312 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
d565b0a1
HX
4313}
4314EXPORT_SYMBOL(napi_gro_receive);
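/* Illustrative example -- not part of dev.c.  A minimal sketch of a driver
 * NAPI poll callback feeding received frames into GRO and completing NAPI
 * once the budget is no longer exhausted.  struct my_adapter and
 * my_hw_next_rx() are assumptions standing in for device-specific ring
 * handling.
 */
static int my_napi_poll(struct napi_struct *napi, int budget)
{
	struct my_adapter *adapter = container_of(napi, struct my_adapter, napi);
	int done = 0;

	while (done < budget) {
		struct sk_buff *skb = my_hw_next_rx(adapter);	/* assumed helper */

		if (!skb)
			break;

		skb->protocol = eth_type_trans(skb, adapter->netdev);
		napi_gro_receive(napi, skb);
		done++;
	}

	if (done < budget) {
		napi_complete_done(napi, done);	/* may arm gro_flush_timeout */
		/* re-enable device RX interrupts here */
	}

	return done;
}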
4315
d0c2b0d2 4316static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
96e93eab 4317{
93a35f59
ED
4318 if (unlikely(skb->pfmemalloc)) {
4319 consume_skb(skb);
4320 return;
4321 }
96e93eab 4322 __skb_pull(skb, skb_headlen(skb));
2a2a459e
ED
4323 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4324 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3701e513 4325 skb->vlan_tci = 0;
66c46d74 4326 skb->dev = napi->dev;
6d152e23 4327 skb->skb_iif = 0;
c3caf119
JC
4328 skb->encapsulation = 0;
4329 skb_shinfo(skb)->gso_type = 0;
e33d0ba8 4330 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
96e93eab
HX
4331
4332 napi->skb = skb;
4333}
96e93eab 4334
76620aaf 4335struct sk_buff *napi_get_frags(struct napi_struct *napi)
5d38a079 4336{
5d38a079 4337 struct sk_buff *skb = napi->skb;
5d38a079
HX
4338
4339 if (!skb) {
fd11a83d 4340 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
84b9cd63 4341 napi->skb = skb;
80595d59 4342 }
96e93eab
HX
4343 return skb;
4344}
76620aaf 4345EXPORT_SYMBOL(napi_get_frags);
96e93eab 4346
a50e233c
ED
4347static gro_result_t napi_frags_finish(struct napi_struct *napi,
4348 struct sk_buff *skb,
4349 gro_result_t ret)
96e93eab 4350{
5d0d9be8
HX
4351 switch (ret) {
4352 case GRO_NORMAL:
a50e233c
ED
4353 case GRO_HELD:
4354 __skb_push(skb, ETH_HLEN);
4355 skb->protocol = eth_type_trans(skb, skb->dev);
4356 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
c7c4b3b6 4357 ret = GRO_DROP;
86911732 4358 break;
5d38a079 4359
5d0d9be8 4360 case GRO_DROP:
5d0d9be8
HX
4361 case GRO_MERGED_FREE:
4362 napi_reuse_skb(napi, skb);
4363 break;
5b252f0c
BH
4364
4365 case GRO_MERGED:
4366 break;
5d0d9be8 4367 }
5d38a079 4368
c7c4b3b6 4369 return ret;
5d38a079 4370}
5d0d9be8 4371
a50e233c
ED
4372/* Upper GRO stack assumes network header starts at gro_offset=0
4373 * Drivers could call both napi_gro_frags() and napi_gro_receive()
4374 * We copy ethernet header into skb->data to have a common layout.
4375 */
4adb9c4a 4376static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
76620aaf
HX
4377{
4378 struct sk_buff *skb = napi->skb;
a50e233c
ED
4379 const struct ethhdr *eth;
4380 unsigned int hlen = sizeof(*eth);
76620aaf
HX
4381
4382 napi->skb = NULL;
4383
a50e233c
ED
4384 skb_reset_mac_header(skb);
4385 skb_gro_reset_offset(skb);
4386
4387 eth = skb_gro_header_fast(skb, 0);
4388 if (unlikely(skb_gro_header_hard(skb, hlen))) {
4389 eth = skb_gro_header_slow(skb, hlen, 0);
4390 if (unlikely(!eth)) {
4391 napi_reuse_skb(napi, skb);
4392 return NULL;
4393 }
4394 } else {
4395 gro_pull_from_frag0(skb, hlen);
4396 NAPI_GRO_CB(skb)->frag0 += hlen;
4397 NAPI_GRO_CB(skb)->frag0_len -= hlen;
76620aaf 4398 }
a50e233c
ED
4399 __skb_pull(skb, hlen);
4400
4401 /*
4402 * This works because the only protocols we care about don't require
4403 * special handling.
4404 * We'll fix it up properly in napi_frags_finish()
4405 */
4406 skb->protocol = eth->h_proto;
76620aaf 4407
76620aaf
HX
4408 return skb;
4409}
76620aaf 4410
c7c4b3b6 4411gro_result_t napi_gro_frags(struct napi_struct *napi)
5d0d9be8 4412{
76620aaf 4413 struct sk_buff *skb = napi_frags_skb(napi);
5d0d9be8
HX
4414
4415 if (!skb)
c7c4b3b6 4416 return GRO_DROP;
5d0d9be8 4417
ae78dbfa
BH
4418 trace_napi_gro_frags_entry(skb);
4419
89c5fa33 4420 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
5d0d9be8 4421}
5d38a079
HX
4422EXPORT_SYMBOL(napi_gro_frags);
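/* Illustrative example -- not part of dev.c.  A minimal sketch of the
 * napi_get_frags()/napi_gro_frags() pattern used by drivers that receive
 * directly into pages.  struct my_rx_desc, its fields and my_rx_page() are
 * assumptions; real drivers also deal with truesize accounting, multiple
 * fragments and error paths.
 */
static void my_receive_frag(struct napi_struct *napi, struct my_rx_desc *desc)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (!skb)
		return;		/* allocation failed; recycle the hardware buffer */

	/* Attach the received page fragment; the Ethernet header is pulled
	 * and parsed later by napi_frags_skb()/napi_frags_finish().
	 */
	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
			my_rx_page(desc), desc->offset, desc->len,
			PAGE_SIZE);

	napi_gro_frags(napi);
}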
4423
573e8fca
TH
4424/* Compute the checksum from gro_offset and return the folded value
4425 * after adding in any pseudo checksum.
4426 */
4427__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4428{
4429 __wsum wsum;
4430 __sum16 sum;
4431
4432 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4433
4434 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4435 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4436 if (likely(!sum)) {
4437 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4438 !skb->csum_complete_sw)
4439 netdev_rx_csum_fault(skb->dev);
4440 }
4441
4442 NAPI_GRO_CB(skb)->csum = wsum;
4443 NAPI_GRO_CB(skb)->csum_valid = 1;
4444
4445 return sum;
4446}
4447EXPORT_SYMBOL(__skb_gro_checksum_complete);
4448
e326bed2 4449/*
855abcf0 4450 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
e326bed2
ED
4451 * Note: called with local irq disabled, but exits with local irq enabled.
4452 */
4453static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4454{
4455#ifdef CONFIG_RPS
4456 struct softnet_data *remsd = sd->rps_ipi_list;
4457
4458 if (remsd) {
4459 sd->rps_ipi_list = NULL;
4460
4461 local_irq_enable();
4462
4463 /* Send pending IPI's to kick RPS processing on remote cpus. */
4464 while (remsd) {
4465 struct softnet_data *next = remsd->rps_ipi_next;
4466
4467 if (cpu_online(remsd->cpu))
c46fff2a 4468 smp_call_function_single_async(remsd->cpu,
fce8ad15 4469 &remsd->csd);
e326bed2
ED
4470 remsd = next;
4471 }
4472 } else
4473#endif
4474 local_irq_enable();
4475}
4476
d75b1ade
ED
4477static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4478{
4479#ifdef CONFIG_RPS
4480 return sd->rps_ipi_list != NULL;
4481#else
4482 return false;
4483#endif
4484}
4485
bea3348e 4486static int process_backlog(struct napi_struct *napi, int quota)
1da177e4
LT
4487{
4488 int work = 0;
eecfd7c4 4489 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
1da177e4 4490
e326bed2
ED
4491	/* Check if we have pending IPIs; it is better to send them now
4492	 * rather than waiting for net_rx_action() to end.
4493 */
d75b1ade 4494 if (sd_has_rps_ipi_waiting(sd)) {
e326bed2
ED
4495 local_irq_disable();
4496 net_rps_action_and_irq_enable(sd);
4497 }
d75b1ade 4498
bea3348e 4499 napi->weight = weight_p;
6e7676c1 4500 local_irq_disable();
11ef7a89 4501 while (1) {
1da177e4 4502 struct sk_buff *skb;
6e7676c1
CG
4503
4504 while ((skb = __skb_dequeue(&sd->process_queue))) {
4505 local_irq_enable();
4506 __netif_receive_skb(skb);
6e7676c1 4507 local_irq_disable();
76cc8b13
TH
4508 input_queue_head_incr(sd);
4509 if (++work >= quota) {
4510 local_irq_enable();
4511 return work;
4512 }
6e7676c1 4513 }
1da177e4 4514
e36fa2f7 4515 rps_lock(sd);
11ef7a89 4516 if (skb_queue_empty(&sd->input_pkt_queue)) {
eecfd7c4
ED
4517 /*
4518 * Inline a custom version of __napi_complete().
4519			 * Only the current cpu owns and manipulates this napi,
11ef7a89
TH
4520 * and NAPI_STATE_SCHED is the only possible flag set
4521 * on backlog.
4522 * We can use a plain write instead of clear_bit(),
eecfd7c4
ED
4523			 * and we don't need an smp_mb() memory barrier.
4524 */
eecfd7c4 4525 napi->state = 0;
11ef7a89 4526 rps_unlock(sd);
eecfd7c4 4527
11ef7a89 4528 break;
bea3348e 4529 }
11ef7a89
TH
4530
4531 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4532 &sd->process_queue);
e36fa2f7 4533 rps_unlock(sd);
6e7676c1
CG
4534 }
4535 local_irq_enable();
1da177e4 4536
bea3348e
SH
4537 return work;
4538}
1da177e4 4539
bea3348e
SH
4540/**
4541 * __napi_schedule - schedule for receive
c4ea43c5 4542 * @n: entry to schedule
bea3348e 4543 *
bc9ad166
ED
4544 * The entry's receive function will be scheduled to run.
4545 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
bea3348e 4546 */
b5606c2d 4547void __napi_schedule(struct napi_struct *n)
bea3348e
SH
4548{
4549 unsigned long flags;
1da177e4 4550
bea3348e 4551 local_irq_save(flags);
903ceff7 4552 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
bea3348e 4553 local_irq_restore(flags);
1da177e4 4554}
bea3348e
SH
4555EXPORT_SYMBOL(__napi_schedule);
4556
bc9ad166
ED
4557/**
4558 * __napi_schedule_irqoff - schedule for receive
4559 * @n: entry to schedule
4560 *
4561 * Variant of __napi_schedule() assuming hard irqs are masked
4562 */
4563void __napi_schedule_irqoff(struct napi_struct *n)
4564{
4565 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4566}
4567EXPORT_SYMBOL(__napi_schedule_irqoff);
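/* Illustrative example -- not part of dev.c.  A minimal sketch of a device
 * interrupt handler deferring RX work to NAPI.  It assumes the usual
 * napi_schedule_irqoff() wrapper from netdevice.h (hard irqs are already
 * masked inside an irq handler); struct my_adapter and my_disable_rx_irq()
 * are assumed names.
 */
static irqreturn_t my_isr(int irq, void *data)
{
	struct my_adapter *adapter = data;

	/* Mask further RX interrupts; the poll callback re-enables them
	 * after napi_complete_done().
	 */
	my_disable_rx_irq(adapter);		/* assumed helper */

	napi_schedule_irqoff(&adapter->napi);

	return IRQ_HANDLED;
}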
4568
d565b0a1
HX
4569void __napi_complete(struct napi_struct *n)
4570{
4571 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
d565b0a1 4572
d75b1ade 4573 list_del_init(&n->poll_list);
4e857c58 4574 smp_mb__before_atomic();
d565b0a1
HX
4575 clear_bit(NAPI_STATE_SCHED, &n->state);
4576}
4577EXPORT_SYMBOL(__napi_complete);
4578
3b47d303 4579void napi_complete_done(struct napi_struct *n, int work_done)
d565b0a1
HX
4580{
4581 unsigned long flags;
4582
4583 /*
4584 * don't let napi dequeue from the cpu poll list
4585 * just in case its running on a different cpu
4586 */
4587 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4588 return;
4589
3b47d303
ED
4590 if (n->gro_list) {
4591 unsigned long timeout = 0;
d75b1ade 4592
3b47d303
ED
4593 if (work_done)
4594 timeout = n->dev->gro_flush_timeout;
4595
4596 if (timeout)
4597 hrtimer_start(&n->timer, ns_to_ktime(timeout),
4598 HRTIMER_MODE_REL_PINNED);
4599 else
4600 napi_gro_flush(n, false);
4601 }
d75b1ade
ED
4602 if (likely(list_empty(&n->poll_list))) {
4603 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4604 } else {
4605 /* If n->poll_list is not empty, we need to mask irqs */
4606 local_irq_save(flags);
4607 __napi_complete(n);
4608 local_irq_restore(flags);
4609 }
d565b0a1 4610}
3b47d303 4611EXPORT_SYMBOL(napi_complete_done);
d565b0a1 4612
af12fa6e
ET
4613/* must be called under rcu_read_lock(), as we don't take a reference */
4614struct napi_struct *napi_by_id(unsigned int napi_id)
4615{
4616 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4617 struct napi_struct *napi;
4618
4619 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4620 if (napi->napi_id == napi_id)
4621 return napi;
4622
4623 return NULL;
4624}
4625EXPORT_SYMBOL_GPL(napi_by_id);
4626
4627void napi_hash_add(struct napi_struct *napi)
4628{
4629 if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4630
4631 spin_lock(&napi_hash_lock);
4632
4633		/* 0 is not a valid id; we also skip an id that is already taken.
4634		 * We expect both events to be extremely rare.
4635 */
4636 napi->napi_id = 0;
4637 while (!napi->napi_id) {
4638 napi->napi_id = ++napi_gen_id;
4639 if (napi_by_id(napi->napi_id))
4640 napi->napi_id = 0;
4641 }
4642
4643 hlist_add_head_rcu(&napi->napi_hash_node,
4644 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4645
4646 spin_unlock(&napi_hash_lock);
4647 }
4648}
4649EXPORT_SYMBOL_GPL(napi_hash_add);
4650
4651/* Warning : the caller is responsible for making sure an rcu grace period
4652 * is respected before freeing memory containing @napi
4653 */
4654void napi_hash_del(struct napi_struct *napi)
4655{
4656 spin_lock(&napi_hash_lock);
4657
4658 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4659 hlist_del_rcu(&napi->napi_hash_node);
4660
4661 spin_unlock(&napi_hash_lock);
4662}
4663EXPORT_SYMBOL_GPL(napi_hash_del);
4664
3b47d303
ED
4665static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4666{
4667 struct napi_struct *napi;
4668
4669 napi = container_of(timer, struct napi_struct, timer);
4670 if (napi->gro_list)
4671 napi_schedule(napi);
4672
4673 return HRTIMER_NORESTART;
4674}
4675
d565b0a1
HX
4676void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4677 int (*poll)(struct napi_struct *, int), int weight)
4678{
4679 INIT_LIST_HEAD(&napi->poll_list);
3b47d303
ED
4680 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4681 napi->timer.function = napi_watchdog;
4ae5544f 4682 napi->gro_count = 0;
d565b0a1 4683 napi->gro_list = NULL;
5d38a079 4684 napi->skb = NULL;
d565b0a1 4685 napi->poll = poll;
82dc3c63
ED
4686 if (weight > NAPI_POLL_WEIGHT)
4687 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4688 weight, dev->name);
d565b0a1
HX
4689 napi->weight = weight;
4690 list_add(&napi->dev_list, &dev->napi_list);
d565b0a1 4691 napi->dev = dev;
5d38a079 4692#ifdef CONFIG_NETPOLL
d565b0a1
HX
4693 spin_lock_init(&napi->poll_lock);
4694 napi->poll_owner = -1;
4695#endif
4696 set_bit(NAPI_STATE_SCHED, &napi->state);
4697}
4698EXPORT_SYMBOL(netif_napi_add);
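/* Illustrative example -- not part of dev.c.  A minimal sketch of NAPI
 * set-up: register the poll callback at probe time with the default weight,
 * then enable the instance from ndo_open.  struct my_adapter and
 * my_napi_poll() are the assumed names used in the sketches above.
 */
static int my_probe_napi(struct my_adapter *adapter)
{
	/* The instance starts "owned" (NAPI_STATE_SCHED is set here),
	 * so it cannot be scheduled until napi_enable() runs.
	 */
	netif_napi_add(adapter->netdev, &adapter->napi,
		       my_napi_poll, NAPI_POLL_WEIGHT);
	return 0;
}

static int my_open(struct net_device *netdev)
{
	struct my_adapter *adapter = netdev_priv(netdev);

	napi_enable(&adapter->napi);	/* allow napi_schedule() to run the poll */
	return 0;
}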
4699
3b47d303
ED
4700void napi_disable(struct napi_struct *n)
4701{
4702 might_sleep();
4703 set_bit(NAPI_STATE_DISABLE, &n->state);
4704
4705 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4706 msleep(1);
4707
4708 hrtimer_cancel(&n->timer);
4709
4710 clear_bit(NAPI_STATE_DISABLE, &n->state);
4711}
4712EXPORT_SYMBOL(napi_disable);
4713
d565b0a1
HX
4714void netif_napi_del(struct napi_struct *napi)
4715{
d7b06636 4716 list_del_init(&napi->dev_list);
76620aaf 4717 napi_free_frags(napi);
d565b0a1 4718
289dccbe 4719 kfree_skb_list(napi->gro_list);
d565b0a1 4720 napi->gro_list = NULL;
4ae5544f 4721 napi->gro_count = 0;
d565b0a1
HX
4722}
4723EXPORT_SYMBOL(netif_napi_del);
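/* Illustrative example -- not part of dev.c.  The matching teardown sketch:
 * napi_disable() sleeps until any in-flight poll has finished, after which
 * netif_napi_del() may release the GRO state.  struct my_adapter is the
 * assumed structure from the sketches above; netif_napi_del() is typically
 * called once the netdev is being unregistered.
 */
static void my_teardown_napi(struct my_adapter *adapter)
{
	napi_disable(&adapter->napi);	/* no new polls after this */
	netif_napi_del(&adapter->napi);
}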
4724
726ce70e
HX
4725static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4726{
4727 void *have;
4728 int work, weight;
4729
4730 list_del_init(&n->poll_list);
4731
4732 have = netpoll_poll_lock(n);
4733
4734 weight = n->weight;
4735
4736 /* This NAPI_STATE_SCHED test is for avoiding a race
4737 * with netpoll's poll_napi(). Only the entity which
4738 * obtains the lock and sees NAPI_STATE_SCHED set will
4739 * actually make the ->poll() call. Therefore we avoid
4740 * accidentally calling ->poll() when NAPI is not scheduled.
4741 */
4742 work = 0;
4743 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4744 work = n->poll(n, weight);
4745 trace_napi_poll(n);
4746 }
4747
4748 WARN_ON_ONCE(work > weight);
4749
4750 if (likely(work < weight))
4751 goto out_unlock;
4752
4753 /* Drivers must not modify the NAPI state if they
4754 * consume the entire weight. In such cases this code
4755 * still "owns" the NAPI instance and therefore can
4756 * move the instance around on the list at-will.
4757 */
4758 if (unlikely(napi_disable_pending(n))) {
4759 napi_complete(n);
4760 goto out_unlock;
4761 }
4762
4763 if (n->gro_list) {
4764 /* flush too old packets
4765 * If HZ < 1000, flush all packets.
4766 */
4767 napi_gro_flush(n, HZ >= 1000);
4768 }
4769
001ce546
HX
4770 /* Some drivers may have called napi_schedule
4771 * prior to exhausting their budget.
4772 */
4773 if (unlikely(!list_empty(&n->poll_list))) {
4774 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4775 n->dev ? n->dev->name : "backlog");
4776 goto out_unlock;
4777 }
4778
726ce70e
HX
4779 list_add_tail(&n->poll_list, repoll);
4780
4781out_unlock:
4782 netpoll_poll_unlock(have);
4783
4784 return work;
4785}
4786
1da177e4
LT
4787static void net_rx_action(struct softirq_action *h)
4788{
903ceff7 4789 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
24f8b238 4790 unsigned long time_limit = jiffies + 2;
51b0bded 4791 int budget = netdev_budget;
d75b1ade
ED
4792 LIST_HEAD(list);
4793 LIST_HEAD(repoll);
53fb95d3 4794
1da177e4 4795 local_irq_disable();
d75b1ade
ED
4796 list_splice_init(&sd->poll_list, &list);
4797 local_irq_enable();
1da177e4 4798
ceb8d5bf 4799 for (;;) {
bea3348e 4800 struct napi_struct *n;
1da177e4 4801
ceb8d5bf
HX
4802 if (list_empty(&list)) {
4803 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4804 return;
4805 break;
4806 }
4807
6bd373eb
HX
4808 n = list_first_entry(&list, struct napi_struct, poll_list);
4809 budget -= napi_poll(n, &repoll);
4810
d75b1ade 4811 /* If softirq window is exhausted then punt.
24f8b238
SH
4812 * Allow this to run for 2 jiffies since which will allow
4813 * an average latency of 1.5/HZ.
bea3348e 4814 */
ceb8d5bf
HX
4815 if (unlikely(budget <= 0 ||
4816 time_after_eq(jiffies, time_limit))) {
4817 sd->time_squeeze++;
4818 break;
4819 }
1da177e4 4820 }
d75b1ade 4821
d75b1ade
ED
4822 local_irq_disable();
4823
4824 list_splice_tail_init(&sd->poll_list, &list);
4825 list_splice_tail(&repoll, &list);
4826 list_splice(&list, &sd->poll_list);
4827 if (!list_empty(&sd->poll_list))
4828 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4829
e326bed2 4830 net_rps_action_and_irq_enable(sd);
1da177e4
LT
4831}
4832
aa9d8560 4833struct netdev_adjacent {
9ff162a8 4834 struct net_device *dev;
5d261913
VF
4835
4836 /* upper master flag, there can only be one master device per list */
9ff162a8 4837 bool master;
5d261913 4838
5d261913
VF
4839 /* counter for the number of times this device was added to us */
4840 u16 ref_nr;
4841
402dae96
VF
4842 /* private field for the users */
4843 void *private;
4844
9ff162a8
JP
4845 struct list_head list;
4846 struct rcu_head rcu;
9ff162a8
JP
4847};
4848
5d261913
VF
4849static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4850 struct net_device *adj_dev,
2f268f12 4851 struct list_head *adj_list)
9ff162a8 4852{
5d261913 4853 struct netdev_adjacent *adj;
5d261913 4854
2f268f12 4855 list_for_each_entry(adj, adj_list, list) {
5d261913
VF
4856 if (adj->dev == adj_dev)
4857 return adj;
9ff162a8
JP
4858 }
4859 return NULL;
4860}
4861
4862/**
4863 * netdev_has_upper_dev - Check if device is linked to an upper device
4864 * @dev: device
4865 * @upper_dev: upper device to check
4866 *
4867 * Find out if a device is linked to specified upper device and return true
4868 * in case it is. Note that this checks only immediate upper device,
4869 * not through a complete stack of devices. The caller must hold the RTNL lock.
4870 */
4871bool netdev_has_upper_dev(struct net_device *dev,
4872 struct net_device *upper_dev)
4873{
4874 ASSERT_RTNL();
4875
2f268f12 4876 return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
9ff162a8
JP
4877}
4878EXPORT_SYMBOL(netdev_has_upper_dev);
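/* Illustrative example -- not part of dev.c.  A minimal sketch of the
 * RTNL-protected query: is "dev" already linked below "master"?
 * my_can_enslave() is an assumed caller-side helper name.
 */
static bool my_can_enslave(struct net_device *master, struct net_device *dev)
{
	bool linked;

	rtnl_lock();
	linked = netdev_has_upper_dev(dev, master);
	rtnl_unlock();

	return !linked;		/* only enslave devices not yet linked */
}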
4879
4880/**
4881 * netdev_has_any_upper_dev - Check if device is linked to some device
4882 * @dev: device
4883 *
4884 * Find out if a device is linked to an upper device and return true in case
4885 * it is. The caller must hold the RTNL lock.
4886 */
1d143d9f 4887static bool netdev_has_any_upper_dev(struct net_device *dev)
9ff162a8
JP
4888{
4889 ASSERT_RTNL();
4890
2f268f12 4891 return !list_empty(&dev->all_adj_list.upper);
9ff162a8 4892}
9ff162a8
JP
4893
4894/**
4895 * netdev_master_upper_dev_get - Get master upper device
4896 * @dev: device
4897 *
4898 * Find a master upper device and return pointer to it or NULL in case
4899 * it's not there. The caller must hold the RTNL lock.
4900 */
4901struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4902{
aa9d8560 4903 struct netdev_adjacent *upper;
9ff162a8
JP
4904
4905 ASSERT_RTNL();
4906
2f268f12 4907 if (list_empty(&dev->adj_list.upper))
9ff162a8
JP
4908 return NULL;
4909
2f268f12 4910 upper = list_first_entry(&dev->adj_list.upper,
aa9d8560 4911 struct netdev_adjacent, list);
9ff162a8
JP
4912 if (likely(upper->master))
4913 return upper->dev;
4914 return NULL;
4915}
4916EXPORT_SYMBOL(netdev_master_upper_dev_get);
4917
b6ccba4c
VF
4918void *netdev_adjacent_get_private(struct list_head *adj_list)
4919{
4920 struct netdev_adjacent *adj;
4921
4922 adj = list_entry(adj_list, struct netdev_adjacent, list);
4923
4924 return adj->private;
4925}
4926EXPORT_SYMBOL(netdev_adjacent_get_private);
4927
44a40855
VY
4928/**
4929 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4930 * @dev: device
4931 * @iter: list_head ** of the current position
4932 *
4933 * Gets the next device from the dev's upper list, starting from iter
4934 * position. The caller must hold RCU read lock.
4935 */
4936struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4937 struct list_head **iter)
4938{
4939 struct netdev_adjacent *upper;
4940
4941 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4942
4943 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4944
4945 if (&upper->list == &dev->adj_list.upper)
4946 return NULL;
4947
4948 *iter = &upper->list;
4949
4950 return upper->dev;
4951}
4952EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4953
31088a11
VF
4954/**
4955 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
48311f46
VF
4956 * @dev: device
4957 * @iter: list_head ** of the current position
4958 *
4959 * Gets the next device from the dev's upper list, starting from iter
4960 * position. The caller must hold RCU read lock.
4961 */
2f268f12
VF
4962struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4963 struct list_head **iter)
48311f46
VF
4964{
4965 struct netdev_adjacent *upper;
4966
85328240 4967 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
48311f46
VF
4968
4969 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4970
2f268f12 4971 if (&upper->list == &dev->all_adj_list.upper)
48311f46
VF
4972 return NULL;
4973
4974 *iter = &upper->list;
4975
4976 return upper->dev;
4977}
2f268f12 4978EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
48311f46 4979
31088a11
VF
4980/**
4981 * netdev_lower_get_next_private - Get the next ->private from the
4982 * lower neighbour list
4983 * @dev: device
4984 * @iter: list_head ** of the current position
4985 *
4986 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4987 * list, starting from iter position. The caller must either hold the
4988 * RTNL lock or its own locking that guarantees that the neighbour lower
4989 * list will remain unchanged.
4990 */
4991void *netdev_lower_get_next_private(struct net_device *dev,
4992 struct list_head **iter)
4993{
4994 struct netdev_adjacent *lower;
4995
4996 lower = list_entry(*iter, struct netdev_adjacent, list);
4997
4998 if (&lower->list == &dev->adj_list.lower)
4999 return NULL;
5000
6859e7df 5001 *iter = lower->list.next;
31088a11
VF
5002
5003 return lower->private;
5004}
5005EXPORT_SYMBOL(netdev_lower_get_next_private);
5006
5007/**
5008 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5009 * lower neighbour list, RCU
5010 * variant
5011 * @dev: device
5012 * @iter: list_head ** of the current position
5013 *
5014 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5015 * list, starting from iter position. The caller must hold RCU read lock.
5016 */
5017void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5018 struct list_head **iter)
5019{
5020 struct netdev_adjacent *lower;
5021
5022 WARN_ON_ONCE(!rcu_read_lock_held());
5023
5024 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5025
5026 if (&lower->list == &dev->adj_list.lower)
5027 return NULL;
5028
6859e7df 5029 *iter = &lower->list;
31088a11
VF
5030
5031 return lower->private;
5032}
5033EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5034
4085ebe8
VY
5035/**
5036 * netdev_lower_get_next - Get the next device from the lower neighbour
5037 * list
5038 * @dev: device
5039 * @iter: list_head ** of the current position
5040 *
5041 * Gets the next netdev_adjacent from the dev's lower neighbour
5042 * list, starting from iter position. The caller must hold RTNL lock or
5043 * its own locking that guarantees that the neighbour lower
5044 * list will remain unchanged.
5045 */
5046void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5047{
5048 struct netdev_adjacent *lower;
5049
5050 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5051
5052 if (&lower->list == &dev->adj_list.lower)
5053 return NULL;
5054
5055 *iter = &lower->list;
5056
5057 return lower->dev;
5058}
5059EXPORT_SYMBOL(netdev_lower_get_next);
5060
e001bfad 5061/**
5062 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5063 * lower neighbour list, RCU
5064 * variant
5065 * @dev: device
5066 *
5067 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5068 * list. The caller must hold RCU read lock.
5069 */
5070void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5071{
5072 struct netdev_adjacent *lower;
5073
5074 lower = list_first_or_null_rcu(&dev->adj_list.lower,
5075 struct netdev_adjacent, list);
5076 if (lower)
5077 return lower->private;
5078 return NULL;
5079}
5080EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5081
9ff162a8
JP
5082/**
5083 * netdev_master_upper_dev_get_rcu - Get master upper device
5084 * @dev: device
5085 *
5086 * Find a master upper device and return pointer to it or NULL in case
5087 * it's not there. The caller must hold the RCU read lock.
5088 */
5089struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5090{
aa9d8560 5091 struct netdev_adjacent *upper;
9ff162a8 5092
2f268f12 5093 upper = list_first_or_null_rcu(&dev->adj_list.upper,
aa9d8560 5094 struct netdev_adjacent, list);
9ff162a8
JP
5095 if (upper && likely(upper->master))
5096 return upper->dev;
5097 return NULL;
5098}
5099EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5100
0a59f3a9 5101static int netdev_adjacent_sysfs_add(struct net_device *dev,
3ee32707
VF
5102 struct net_device *adj_dev,
5103 struct list_head *dev_list)
5104{
5105 char linkname[IFNAMSIZ+7];
5106 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5107 "upper_%s" : "lower_%s", adj_dev->name);
5108 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5109 linkname);
5110}
0a59f3a9 5111static void netdev_adjacent_sysfs_del(struct net_device *dev,
3ee32707
VF
5112 char *name,
5113 struct list_head *dev_list)
5114{
5115 char linkname[IFNAMSIZ+7];
5116 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5117 "upper_%s" : "lower_%s", name);
5118 sysfs_remove_link(&(dev->dev.kobj), linkname);
5119}
5120
7ce64c79
AF
5121static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5122 struct net_device *adj_dev,
5123 struct list_head *dev_list)
5124{
5125 return (dev_list == &dev->adj_list.upper ||
5126 dev_list == &dev->adj_list.lower) &&
5127 net_eq(dev_net(dev), dev_net(adj_dev));
5128}
3ee32707 5129
5d261913
VF
5130static int __netdev_adjacent_dev_insert(struct net_device *dev,
5131 struct net_device *adj_dev,
7863c054 5132 struct list_head *dev_list,
402dae96 5133 void *private, bool master)
5d261913
VF
5134{
5135 struct netdev_adjacent *adj;
842d67a7 5136 int ret;
5d261913 5137
7863c054 5138 adj = __netdev_find_adj(dev, adj_dev, dev_list);
5d261913
VF
5139
5140 if (adj) {
5d261913
VF
5141 adj->ref_nr++;
5142 return 0;
5143 }
5144
5145 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5146 if (!adj)
5147 return -ENOMEM;
5148
5149 adj->dev = adj_dev;
5150 adj->master = master;
5d261913 5151 adj->ref_nr = 1;
402dae96 5152 adj->private = private;
5d261913 5153 dev_hold(adj_dev);
2f268f12
VF
5154
5155 pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5156 adj_dev->name, dev->name, adj_dev->name);
5d261913 5157
7ce64c79 5158 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
3ee32707 5159 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5831d66e
VF
5160 if (ret)
5161 goto free_adj;
5162 }
5163
7863c054 5164 /* Ensure that master link is always the first item in list. */
842d67a7
VF
5165 if (master) {
5166 ret = sysfs_create_link(&(dev->dev.kobj),
5167 &(adj_dev->dev.kobj), "master");
5168 if (ret)
5831d66e 5169 goto remove_symlinks;
842d67a7 5170
7863c054 5171 list_add_rcu(&adj->list, dev_list);
842d67a7 5172 } else {
7863c054 5173 list_add_tail_rcu(&adj->list, dev_list);
842d67a7 5174 }
5d261913
VF
5175
5176 return 0;
842d67a7 5177
5831d66e 5178remove_symlinks:
7ce64c79 5179 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
3ee32707 5180 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
842d67a7
VF
5181free_adj:
5182 kfree(adj);
974daef7 5183 dev_put(adj_dev);
842d67a7
VF
5184
5185 return ret;
5d261913
VF
5186}
5187
1d143d9f 5188static void __netdev_adjacent_dev_remove(struct net_device *dev,
5189 struct net_device *adj_dev,
5190 struct list_head *dev_list)
5d261913
VF
5191{
5192 struct netdev_adjacent *adj;
5193
7863c054 5194 adj = __netdev_find_adj(dev, adj_dev, dev_list);
5d261913 5195
2f268f12
VF
5196 if (!adj) {
5197 pr_err("tried to remove device %s from %s\n",
5198 dev->name, adj_dev->name);
5d261913 5199 BUG();
2f268f12 5200 }
5d261913
VF
5201
5202 if (adj->ref_nr > 1) {
2f268f12
VF
5203 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5204 adj->ref_nr-1);
5d261913
VF
5205 adj->ref_nr--;
5206 return;
5207 }
5208
842d67a7
VF
5209 if (adj->master)
5210 sysfs_remove_link(&(dev->dev.kobj), "master");
5211
7ce64c79 5212 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
3ee32707 5213 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5831d66e 5214
5d261913 5215 list_del_rcu(&adj->list);
2f268f12
VF
5216 pr_debug("dev_put for %s, because link removed from %s to %s\n",
5217 adj_dev->name, dev->name, adj_dev->name);
5d261913
VF
5218 dev_put(adj_dev);
5219 kfree_rcu(adj, rcu);
5220}
5221
1d143d9f 5222static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5223 struct net_device *upper_dev,
5224 struct list_head *up_list,
5225 struct list_head *down_list,
5226 void *private, bool master)
5d261913
VF
5227{
5228 int ret;
5229
402dae96
VF
5230 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5231 master);
5d261913
VF
5232 if (ret)
5233 return ret;
5234
402dae96
VF
5235 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5236 false);
5d261913 5237 if (ret) {
2f268f12 5238 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5d261913
VF
5239 return ret;
5240 }
5241
5242 return 0;
5243}
5244
1d143d9f 5245static int __netdev_adjacent_dev_link(struct net_device *dev,
5246 struct net_device *upper_dev)
5d261913 5247{
2f268f12
VF
5248 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5249 &dev->all_adj_list.upper,
5250 &upper_dev->all_adj_list.lower,
402dae96 5251 NULL, false);
5d261913
VF
5252}
5253
1d143d9f 5254static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5255 struct net_device *upper_dev,
5256 struct list_head *up_list,
5257 struct list_head *down_list)
5d261913 5258{
2f268f12
VF
5259 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5260 __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5d261913
VF
5261}
5262
1d143d9f 5263static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5264 struct net_device *upper_dev)
5d261913 5265{
2f268f12
VF
5266 __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5267 &dev->all_adj_list.upper,
5268 &upper_dev->all_adj_list.lower);
5269}
5270
1d143d9f 5271static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5272 struct net_device *upper_dev,
5273 void *private, bool master)
2f268f12
VF
5274{
5275 int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5276
5277 if (ret)
5278 return ret;
5279
5280 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5281 &dev->adj_list.upper,
5282 &upper_dev->adj_list.lower,
402dae96 5283 private, master);
2f268f12
VF
5284 if (ret) {
5285 __netdev_adjacent_dev_unlink(dev, upper_dev);
5286 return ret;
5287 }
5288
5289 return 0;
5d261913
VF
5290}
5291
1d143d9f 5292static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5293 struct net_device *upper_dev)
2f268f12
VF
5294{
5295 __netdev_adjacent_dev_unlink(dev, upper_dev);
5296 __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5297 &dev->adj_list.upper,
5298 &upper_dev->adj_list.lower);
5299}
5d261913 5300
9ff162a8 5301static int __netdev_upper_dev_link(struct net_device *dev,
402dae96
VF
5302 struct net_device *upper_dev, bool master,
5303 void *private)
9ff162a8 5304{
5d261913
VF
5305 struct netdev_adjacent *i, *j, *to_i, *to_j;
5306 int ret = 0;
9ff162a8
JP
5307
5308 ASSERT_RTNL();
5309
5310 if (dev == upper_dev)
5311 return -EBUSY;
5312
5313 /* To prevent loops, check if dev is not upper device to upper_dev. */
2f268f12 5314 if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
9ff162a8
JP
5315 return -EBUSY;
5316
d66bf7dd 5317 if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper))
9ff162a8
JP
5318 return -EEXIST;
5319
5320 if (master && netdev_master_upper_dev_get(dev))
5321 return -EBUSY;
5322
402dae96
VF
5323 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5324 master);
5d261913
VF
5325 if (ret)
5326 return ret;
9ff162a8 5327
5d261913 5328 /* Now that we linked these devs, make all the upper_dev's
2f268f12 5329 * all_adj_list.upper visible to every dev's all_adj_list.lower and vice
5d261913
VF
 5330 * versa, and don't forget the devices themselves. All of these
5331 * links are non-neighbours.
5332 */
2f268f12
VF
5333 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5334 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5335 pr_debug("Interlinking %s with %s, non-neighbour\n",
5336 i->dev->name, j->dev->name);
5d261913
VF
5337 ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5338 if (ret)
5339 goto rollback_mesh;
5340 }
5341 }
5342
5343 /* add dev to every upper_dev's upper device */
2f268f12
VF
5344 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5345 pr_debug("linking %s's upper device %s with %s\n",
5346 upper_dev->name, i->dev->name, dev->name);
5d261913
VF
5347 ret = __netdev_adjacent_dev_link(dev, i->dev);
5348 if (ret)
5349 goto rollback_upper_mesh;
5350 }
5351
5352 /* add upper_dev to every dev's lower device */
2f268f12
VF
5353 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5354 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5355 i->dev->name, upper_dev->name);
5d261913
VF
5356 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5357 if (ret)
5358 goto rollback_lower_mesh;
5359 }
9ff162a8 5360
42e52bf9 5361 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
9ff162a8 5362 return 0;
5d261913
VF
5363
5364rollback_lower_mesh:
5365 to_i = i;
2f268f12 5366 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5d261913
VF
5367 if (i == to_i)
5368 break;
5369 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5370 }
5371
5372 i = NULL;
5373
5374rollback_upper_mesh:
5375 to_i = i;
2f268f12 5376 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5d261913
VF
5377 if (i == to_i)
5378 break;
5379 __netdev_adjacent_dev_unlink(dev, i->dev);
5380 }
5381
5382 i = j = NULL;
5383
5384rollback_mesh:
5385 to_i = i;
5386 to_j = j;
2f268f12
VF
5387 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5388 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5d261913
VF
5389 if (i == to_i && j == to_j)
5390 break;
5391 __netdev_adjacent_dev_unlink(i->dev, j->dev);
5392 }
5393 if (i == to_i)
5394 break;
5395 }
5396
2f268f12 5397 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5d261913
VF
5398
5399 return ret;
9ff162a8
JP
5400}
5401
5402/**
5403 * netdev_upper_dev_link - Add a link to the upper device
5404 * @dev: device
5405 * @upper_dev: new upper device
5406 *
 5407 * Adds a link to a device which is upper to this one. The caller must hold
5408 * the RTNL lock. On a failure a negative errno code is returned.
5409 * On success the reference counts are adjusted and the function
5410 * returns zero.
5411 */
5412int netdev_upper_dev_link(struct net_device *dev,
5413 struct net_device *upper_dev)
5414{
402dae96 5415 return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
9ff162a8
JP
5416}
5417EXPORT_SYMBOL(netdev_upper_dev_link);
5418
5419/**
5420 * netdev_master_upper_dev_link - Add a master link to the upper device
5421 * @dev: device
5422 * @upper_dev: new upper device
5423 *
 5424 * Adds a link to a device which is upper to this one. In this case, only
5425 * one master upper device can be linked, although other non-master devices
5426 * might be linked as well. The caller must hold the RTNL lock.
5427 * On a failure a negative errno code is returned. On success the reference
5428 * counts are adjusted and the function returns zero.
5429 */
5430int netdev_master_upper_dev_link(struct net_device *dev,
5431 struct net_device *upper_dev)
5432{
402dae96 5433 return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
9ff162a8
JP
5434}
5435EXPORT_SYMBOL(netdev_master_upper_dev_link);
5436
402dae96
VF
5437int netdev_master_upper_dev_link_private(struct net_device *dev,
5438 struct net_device *upper_dev,
5439 void *private)
5440{
5441 return __netdev_upper_dev_link(dev, upper_dev, true, private);
5442}
5443EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
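/*
 * Illustrative sketch, not part of the original file: how a bonding-style
 * driver might attach a slave beneath its master with per-link private data,
 * and detach it again on teardown.  "my_master", "my_slave" and
 * "my_slave_info" are hypothetical names; both calls assume the RTNL lock is
 * already held, as required by the functions above.
 */
static int my_attach_slave(struct net_device *my_master,
			   struct net_device *my_slave,
			   void *my_slave_info)
{
	ASSERT_RTNL();
	/* my_slave becomes a lower device of my_master, the single master */
	return netdev_master_upper_dev_link_private(my_slave, my_master,
						    my_slave_info);
}

static void my_detach_slave(struct net_device *my_master,
			    struct net_device *my_slave)
{
	ASSERT_RTNL();
	netdev_upper_dev_unlink(my_slave, my_master);
}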
5444
9ff162a8
JP
5445/**
5446 * netdev_upper_dev_unlink - Removes a link to upper device
5447 * @dev: device
 5448 * @upper_dev: upper device to unlink
 5449 *
 5450 * Removes a link to a device which is upper to this one. The caller must hold
5451 * the RTNL lock.
5452 */
5453void netdev_upper_dev_unlink(struct net_device *dev,
5454 struct net_device *upper_dev)
5455{
5d261913 5456 struct netdev_adjacent *i, *j;
9ff162a8
JP
5457 ASSERT_RTNL();
5458
2f268f12 5459 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5d261913
VF
5460
5461 /* Here is the tricky part. We must remove all dev's lower
5462 * devices from all upper_dev's upper devices and vice
5463 * versa, to maintain the graph relationship.
5464 */
2f268f12
VF
5465 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5466 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5d261913
VF
5467 __netdev_adjacent_dev_unlink(i->dev, j->dev);
5468
 5469 /* also remove the devices themselves from the lower/upper device
 5470 * lists
5471 */
2f268f12 5472 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5d261913
VF
5473 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5474
2f268f12 5475 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5d261913
VF
5476 __netdev_adjacent_dev_unlink(dev, i->dev);
5477
42e52bf9 5478 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
9ff162a8
JP
5479}
5480EXPORT_SYMBOL(netdev_upper_dev_unlink);
5481
61bd3857
MS
5482/**
5483 * netdev_bonding_info_change - Dispatch event about slave change
5484 * @dev: device
4a26e453 5485 * @bonding_info: info to dispatch
61bd3857
MS
5486 *
5487 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5488 * The caller must hold the RTNL lock.
5489 */
5490void netdev_bonding_info_change(struct net_device *dev,
5491 struct netdev_bonding_info *bonding_info)
5492{
5493 struct netdev_notifier_bonding_info info;
5494
5495 memcpy(&info.bonding_info, bonding_info,
5496 sizeof(struct netdev_bonding_info));
5497 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5498 &info.info);
5499}
5500EXPORT_SYMBOL(netdev_bonding_info_change);
5501
2ce1ee17 5502static void netdev_adjacent_add_links(struct net_device *dev)
4c75431a
AF
5503{
5504 struct netdev_adjacent *iter;
5505
5506 struct net *net = dev_net(dev);
5507
5508 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5509 if (!net_eq(net,dev_net(iter->dev)))
5510 continue;
5511 netdev_adjacent_sysfs_add(iter->dev, dev,
5512 &iter->dev->adj_list.lower);
5513 netdev_adjacent_sysfs_add(dev, iter->dev,
5514 &dev->adj_list.upper);
5515 }
5516
5517 list_for_each_entry(iter, &dev->adj_list.lower, list) {
5518 if (!net_eq(net,dev_net(iter->dev)))
5519 continue;
5520 netdev_adjacent_sysfs_add(iter->dev, dev,
5521 &iter->dev->adj_list.upper);
5522 netdev_adjacent_sysfs_add(dev, iter->dev,
5523 &dev->adj_list.lower);
5524 }
5525}
5526
2ce1ee17 5527static void netdev_adjacent_del_links(struct net_device *dev)
4c75431a
AF
5528{
5529 struct netdev_adjacent *iter;
5530
5531 struct net *net = dev_net(dev);
5532
5533 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5534 if (!net_eq(net,dev_net(iter->dev)))
5535 continue;
5536 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5537 &iter->dev->adj_list.lower);
5538 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5539 &dev->adj_list.upper);
5540 }
5541
5542 list_for_each_entry(iter, &dev->adj_list.lower, list) {
5543 if (!net_eq(net,dev_net(iter->dev)))
5544 continue;
5545 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5546 &iter->dev->adj_list.upper);
5547 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5548 &dev->adj_list.lower);
5549 }
5550}
5551
5bb025fa 5552void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
402dae96 5553{
5bb025fa 5554 struct netdev_adjacent *iter;
402dae96 5555
4c75431a
AF
5556 struct net *net = dev_net(dev);
5557
5bb025fa 5558 list_for_each_entry(iter, &dev->adj_list.upper, list) {
4c75431a
AF
5559 if (!net_eq(net,dev_net(iter->dev)))
5560 continue;
5bb025fa
VF
5561 netdev_adjacent_sysfs_del(iter->dev, oldname,
5562 &iter->dev->adj_list.lower);
5563 netdev_adjacent_sysfs_add(iter->dev, dev,
5564 &iter->dev->adj_list.lower);
5565 }
402dae96 5566
5bb025fa 5567 list_for_each_entry(iter, &dev->adj_list.lower, list) {
4c75431a
AF
5568 if (!net_eq(net,dev_net(iter->dev)))
5569 continue;
5bb025fa
VF
5570 netdev_adjacent_sysfs_del(iter->dev, oldname,
5571 &iter->dev->adj_list.upper);
5572 netdev_adjacent_sysfs_add(iter->dev, dev,
5573 &iter->dev->adj_list.upper);
5574 }
402dae96 5575}
402dae96
VF
5576
5577void *netdev_lower_dev_get_private(struct net_device *dev,
5578 struct net_device *lower_dev)
5579{
5580 struct netdev_adjacent *lower;
5581
5582 if (!lower_dev)
5583 return NULL;
5584 lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5585 if (!lower)
5586 return NULL;
5587
5588 return lower->private;
5589}
5590EXPORT_SYMBOL(netdev_lower_dev_get_private);
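/*
 * Illustrative sketch, not part of the original file: reading back the
 * private pointer stored when the lower device was linked with
 * netdev_master_upper_dev_link_private() above.  "my_master" and "my_slave"
 * are hypothetical devices.
 */
static void *my_get_slave_info(struct net_device *my_master,
			       struct net_device *my_slave)
{
	/* returns the 'private' passed at link time, or NULL if not linked */
	return netdev_lower_dev_get_private(my_master, my_slave);
}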
5591
4085ebe8
VY
5592
5593int dev_get_nest_level(struct net_device *dev,
5594 bool (*type_check)(struct net_device *dev))
5595{
5596 struct net_device *lower = NULL;
5597 struct list_head *iter;
5598 int max_nest = -1;
5599 int nest;
5600
5601 ASSERT_RTNL();
5602
5603 netdev_for_each_lower_dev(dev, lower, iter) {
5604 nest = dev_get_nest_level(lower, type_check);
5605 if (max_nest < nest)
5606 max_nest = nest;
5607 }
5608
5609 if (type_check(dev))
5610 max_nest++;
5611
5612 return max_nest;
5613}
5614EXPORT_SYMBOL(dev_get_nest_level);
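/*
 * Illustrative sketch, not part of the original file: a stacked driver
 * computing how deeply devices of its own type are nested, e.g. to pick a
 * lockdep subclass.  "my_is_stacked_dev" and its criterion are hypothetical;
 * real callers pass a check for their own device type.
 */
static bool my_is_stacked_dev(struct net_device *dev)
{
	return dev->priv_flags & IFF_MACVLAN;	/* example criterion only */
}

static int my_get_nest_level(struct net_device *dev)
{
	ASSERT_RTNL();
	return dev_get_nest_level(dev, my_is_stacked_dev);
}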
5615
b6c40d68
PM
5616static void dev_change_rx_flags(struct net_device *dev, int flags)
5617{
d314774c
SH
5618 const struct net_device_ops *ops = dev->netdev_ops;
5619
d2615bf4 5620 if (ops->ndo_change_rx_flags)
d314774c 5621 ops->ndo_change_rx_flags(dev, flags);
b6c40d68
PM
5622}
5623
991fb3f7 5624static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
1da177e4 5625{
b536db93 5626 unsigned int old_flags = dev->flags;
d04a48b0
EB
5627 kuid_t uid;
5628 kgid_t gid;
1da177e4 5629
24023451
PM
5630 ASSERT_RTNL();
5631
dad9b335
WC
5632 dev->flags |= IFF_PROMISC;
5633 dev->promiscuity += inc;
5634 if (dev->promiscuity == 0) {
5635 /*
5636 * Avoid overflow.
5637 * If inc causes overflow, untouch promisc and return error.
5638 */
5639 if (inc < 0)
5640 dev->flags &= ~IFF_PROMISC;
5641 else {
5642 dev->promiscuity -= inc;
7b6cd1ce
JP
5643 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5644 dev->name);
dad9b335
WC
5645 return -EOVERFLOW;
5646 }
5647 }
52609c0b 5648 if (dev->flags != old_flags) {
7b6cd1ce
JP
5649 pr_info("device %s %s promiscuous mode\n",
5650 dev->name,
5651 dev->flags & IFF_PROMISC ? "entered" : "left");
8192b0c4
DH
5652 if (audit_enabled) {
5653 current_uid_gid(&uid, &gid);
7759db82
KHK
5654 audit_log(current->audit_context, GFP_ATOMIC,
5655 AUDIT_ANOM_PROMISCUOUS,
5656 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5657 dev->name, (dev->flags & IFF_PROMISC),
5658 (old_flags & IFF_PROMISC),
e1760bd5 5659 from_kuid(&init_user_ns, audit_get_loginuid(current)),
d04a48b0
EB
5660 from_kuid(&init_user_ns, uid),
5661 from_kgid(&init_user_ns, gid),
7759db82 5662 audit_get_sessionid(current));
8192b0c4 5663 }
24023451 5664
b6c40d68 5665 dev_change_rx_flags(dev, IFF_PROMISC);
1da177e4 5666 }
991fb3f7
ND
5667 if (notify)
5668 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
dad9b335 5669 return 0;
1da177e4
LT
5670}
5671
4417da66
PM
5672/**
5673 * dev_set_promiscuity - update promiscuity count on a device
5674 * @dev: device
5675 * @inc: modifier
5676 *
5677 * Add or remove promiscuity from a device. While the count in the device
5678 * remains above zero the interface remains promiscuous. Once it hits zero
5679 * the device reverts back to normal filtering operation. A negative inc
5680 * value is used to drop promiscuity on the device.
dad9b335 5681 * Return 0 if successful or a negative errno code on error.
4417da66 5682 */
dad9b335 5683int dev_set_promiscuity(struct net_device *dev, int inc)
4417da66 5684{
b536db93 5685 unsigned int old_flags = dev->flags;
dad9b335 5686 int err;
4417da66 5687
991fb3f7 5688 err = __dev_set_promiscuity(dev, inc, true);
4b5a698e 5689 if (err < 0)
dad9b335 5690 return err;
4417da66
PM
5691 if (dev->flags != old_flags)
5692 dev_set_rx_mode(dev);
dad9b335 5693 return err;
4417da66 5694}
d1b19dff 5695EXPORT_SYMBOL(dev_set_promiscuity);
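/*
 * Illustrative sketch, not part of the original file: an upper layer forcing
 * a lower device into promiscuous mode for as long as it needs it, then
 * dropping its contribution to the counter.  "my_port_dev" is hypothetical;
 * both calls must run under the RTNL lock.
 */
static int my_enable_port_promisc(struct net_device *my_port_dev)
{
	ASSERT_RTNL();
	return dev_set_promiscuity(my_port_dev, 1);	/* count++ */
}

static void my_disable_port_promisc(struct net_device *my_port_dev)
{
	ASSERT_RTNL();
	dev_set_promiscuity(my_port_dev, -1);		/* count-- */
}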
4417da66 5696
991fb3f7 5697static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
1da177e4 5698{
991fb3f7 5699 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
1da177e4 5700
24023451
PM
5701 ASSERT_RTNL();
5702
1da177e4 5703 dev->flags |= IFF_ALLMULTI;
dad9b335
WC
5704 dev->allmulti += inc;
5705 if (dev->allmulti == 0) {
5706 /*
5707 * Avoid overflow.
5708 * If inc causes overflow, untouch allmulti and return error.
5709 */
5710 if (inc < 0)
5711 dev->flags &= ~IFF_ALLMULTI;
5712 else {
5713 dev->allmulti -= inc;
7b6cd1ce
JP
5714 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5715 dev->name);
dad9b335
WC
5716 return -EOVERFLOW;
5717 }
5718 }
24023451 5719 if (dev->flags ^ old_flags) {
b6c40d68 5720 dev_change_rx_flags(dev, IFF_ALLMULTI);
4417da66 5721 dev_set_rx_mode(dev);
991fb3f7
ND
5722 if (notify)
5723 __dev_notify_flags(dev, old_flags,
5724 dev->gflags ^ old_gflags);
24023451 5725 }
dad9b335 5726 return 0;
4417da66 5727}
991fb3f7
ND
5728
5729/**
5730 * dev_set_allmulti - update allmulti count on a device
5731 * @dev: device
5732 * @inc: modifier
5733 *
5734 * Add or remove reception of all multicast frames to a device. While the
5735 * count in the device remains above zero the interface remains listening
 5736 * to all multicast frames. Once it hits zero the device reverts back to normal
5737 * filtering operation. A negative @inc value is used to drop the counter
5738 * when releasing a resource needing all multicasts.
5739 * Return 0 if successful or a negative errno code on error.
5740 */
5741
5742int dev_set_allmulti(struct net_device *dev, int inc)
5743{
5744 return __dev_set_allmulti(dev, inc, true);
5745}
d1b19dff 5746EXPORT_SYMBOL(dev_set_allmulti);
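/*
 * Illustrative sketch, not part of the original file: the same counter
 * pattern as the promiscuity example above, applied to allmulti.
 * "my_port_dev" is hypothetical; the RTNL lock must be held.
 */
static int my_enable_port_allmulti(struct net_device *my_port_dev)
{
	ASSERT_RTNL();
	return dev_set_allmulti(my_port_dev, 1);	/* -1 to release */
}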
4417da66
PM
5747
5748/*
5749 * Upload unicast and multicast address lists to device and
5750 * configure RX filtering. When the device doesn't support unicast
53ccaae1 5751 * filtering it is put in promiscuous mode while unicast addresses
4417da66
PM
5752 * are present.
5753 */
5754void __dev_set_rx_mode(struct net_device *dev)
5755{
d314774c
SH
5756 const struct net_device_ops *ops = dev->netdev_ops;
5757
4417da66
PM
5758 /* dev_open will call this function so the list will stay sane. */
5759 if (!(dev->flags&IFF_UP))
5760 return;
5761
5762 if (!netif_device_present(dev))
40b77c94 5763 return;
4417da66 5764
01789349 5765 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4417da66
PM
5766 /* Unicast addresses changes may only happen under the rtnl,
5767 * therefore calling __dev_set_promiscuity here is safe.
5768 */
32e7bfc4 5769 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
991fb3f7 5770 __dev_set_promiscuity(dev, 1, false);
2d348d1f 5771 dev->uc_promisc = true;
32e7bfc4 5772 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
991fb3f7 5773 __dev_set_promiscuity(dev, -1, false);
2d348d1f 5774 dev->uc_promisc = false;
4417da66 5775 }
4417da66 5776 }
01789349
JP
5777
5778 if (ops->ndo_set_rx_mode)
5779 ops->ndo_set_rx_mode(dev);
4417da66
PM
5780}
5781
5782void dev_set_rx_mode(struct net_device *dev)
5783{
b9e40857 5784 netif_addr_lock_bh(dev);
4417da66 5785 __dev_set_rx_mode(dev);
b9e40857 5786 netif_addr_unlock_bh(dev);
1da177e4
LT
5787}
5788
f0db275a
SH
5789/**
5790 * dev_get_flags - get flags reported to userspace
5791 * @dev: device
5792 *
5793 * Get the combination of flag bits exported through APIs to userspace.
5794 */
95c96174 5795unsigned int dev_get_flags(const struct net_device *dev)
1da177e4 5796{
95c96174 5797 unsigned int flags;
1da177e4
LT
5798
5799 flags = (dev->flags & ~(IFF_PROMISC |
5800 IFF_ALLMULTI |
b00055aa
SR
5801 IFF_RUNNING |
5802 IFF_LOWER_UP |
5803 IFF_DORMANT)) |
1da177e4
LT
5804 (dev->gflags & (IFF_PROMISC |
5805 IFF_ALLMULTI));
5806
b00055aa
SR
5807 if (netif_running(dev)) {
5808 if (netif_oper_up(dev))
5809 flags |= IFF_RUNNING;
5810 if (netif_carrier_ok(dev))
5811 flags |= IFF_LOWER_UP;
5812 if (netif_dormant(dev))
5813 flags |= IFF_DORMANT;
5814 }
1da177e4
LT
5815
5816 return flags;
5817}
d1b19dff 5818EXPORT_SYMBOL(dev_get_flags);
1da177e4 5819
bd380811 5820int __dev_change_flags(struct net_device *dev, unsigned int flags)
1da177e4 5821{
b536db93 5822 unsigned int old_flags = dev->flags;
bd380811 5823 int ret;
1da177e4 5824
24023451
PM
5825 ASSERT_RTNL();
5826
1da177e4
LT
5827 /*
5828 * Set the flags on our device.
5829 */
5830
5831 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5832 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5833 IFF_AUTOMEDIA)) |
5834 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5835 IFF_ALLMULTI));
5836
5837 /*
5838 * Load in the correct multicast list now the flags have changed.
5839 */
5840
b6c40d68
PM
5841 if ((old_flags ^ flags) & IFF_MULTICAST)
5842 dev_change_rx_flags(dev, IFF_MULTICAST);
24023451 5843
4417da66 5844 dev_set_rx_mode(dev);
1da177e4
LT
5845
5846 /*
 5847 * Have we downed the interface? We handle IFF_UP ourselves
5848 * according to user attempts to set it, rather than blindly
5849 * setting it.
5850 */
5851
5852 ret = 0;
d215d10f 5853 if ((old_flags ^ flags) & IFF_UP)
bd380811 5854 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
1da177e4 5855
1da177e4 5856 if ((flags ^ dev->gflags) & IFF_PROMISC) {
d1b19dff 5857 int inc = (flags & IFF_PROMISC) ? 1 : -1;
991fb3f7 5858 unsigned int old_flags = dev->flags;
d1b19dff 5859
1da177e4 5860 dev->gflags ^= IFF_PROMISC;
991fb3f7
ND
5861
5862 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5863 if (dev->flags != old_flags)
5864 dev_set_rx_mode(dev);
1da177e4
LT
5865 }
5866
5867 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 5868 is important. Some (broken) drivers set IFF_PROMISC when
 5869 IFF_ALLMULTI is requested, without asking us and without reporting it.
5870 */
5871 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
d1b19dff
ED
5872 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5873
1da177e4 5874 dev->gflags ^= IFF_ALLMULTI;
991fb3f7 5875 __dev_set_allmulti(dev, inc, false);
1da177e4
LT
5876 }
5877
bd380811
PM
5878 return ret;
5879}
5880
a528c219
ND
5881void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5882 unsigned int gchanges)
bd380811
PM
5883{
5884 unsigned int changes = dev->flags ^ old_flags;
5885
a528c219 5886 if (gchanges)
7f294054 5887 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
a528c219 5888
bd380811
PM
5889 if (changes & IFF_UP) {
5890 if (dev->flags & IFF_UP)
5891 call_netdevice_notifiers(NETDEV_UP, dev);
5892 else
5893 call_netdevice_notifiers(NETDEV_DOWN, dev);
5894 }
5895
5896 if (dev->flags & IFF_UP &&
be9efd36
JP
5897 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5898 struct netdev_notifier_change_info change_info;
5899
5900 change_info.flags_changed = changes;
5901 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5902 &change_info.info);
5903 }
bd380811
PM
5904}
5905
5906/**
5907 * dev_change_flags - change device settings
5908 * @dev: device
5909 * @flags: device state flags
5910 *
5911 * Change settings on device based state flags. The flags are
5912 * in the userspace exported format.
5913 */
b536db93 5914int dev_change_flags(struct net_device *dev, unsigned int flags)
bd380811 5915{
b536db93 5916 int ret;
991fb3f7 5917 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
bd380811
PM
5918
5919 ret = __dev_change_flags(dev, flags);
5920 if (ret < 0)
5921 return ret;
5922
991fb3f7 5923 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
a528c219 5924 __dev_notify_flags(dev, old_flags, changes);
1da177e4
LT
5925 return ret;
5926}
d1b19dff 5927EXPORT_SYMBOL(dev_change_flags);
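/*
 * Illustrative sketch, not part of the original file: bringing an interface
 * administratively up from kernel code, the in-kernel equivalent of
 * "ip link set dev ... up".  "my_dev" is hypothetical; the RTNL lock must be
 * held around the call.
 */
static int my_bring_up(struct net_device *my_dev)
{
	ASSERT_RTNL();
	return dev_change_flags(my_dev, my_dev->flags | IFF_UP);
}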
1da177e4 5928
2315dc91
VF
5929static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5930{
5931 const struct net_device_ops *ops = dev->netdev_ops;
5932
5933 if (ops->ndo_change_mtu)
5934 return ops->ndo_change_mtu(dev, new_mtu);
5935
5936 dev->mtu = new_mtu;
5937 return 0;
5938}
5939
f0db275a
SH
5940/**
5941 * dev_set_mtu - Change maximum transfer unit
5942 * @dev: device
5943 * @new_mtu: new transfer unit
5944 *
5945 * Change the maximum transfer size of the network device.
5946 */
1da177e4
LT
5947int dev_set_mtu(struct net_device *dev, int new_mtu)
5948{
2315dc91 5949 int err, orig_mtu;
1da177e4
LT
5950
5951 if (new_mtu == dev->mtu)
5952 return 0;
5953
5954 /* MTU must be positive. */
5955 if (new_mtu < 0)
5956 return -EINVAL;
5957
5958 if (!netif_device_present(dev))
5959 return -ENODEV;
5960
1d486bfb
VF
5961 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5962 err = notifier_to_errno(err);
5963 if (err)
5964 return err;
d314774c 5965
2315dc91
VF
5966 orig_mtu = dev->mtu;
5967 err = __dev_set_mtu(dev, new_mtu);
d314774c 5968
2315dc91
VF
5969 if (!err) {
5970 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5971 err = notifier_to_errno(err);
5972 if (err) {
5973 /* setting mtu back and notifying everyone again,
5974 * so that they have a chance to revert changes.
5975 */
5976 __dev_set_mtu(dev, orig_mtu);
5977 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5978 }
5979 }
1da177e4
LT
5980 return err;
5981}
d1b19dff 5982EXPORT_SYMBOL(dev_set_mtu);
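/*
 * Illustrative sketch, not part of the original file: a stacked driver
 * shrinking a lower device's MTU, e.g. to leave room for an encapsulation
 * header.  "my_lower_dev" and MY_ENCAP_OVERHEAD are hypothetical; the RTNL
 * lock must be held.
 */
#define MY_ENCAP_OVERHEAD 50

static int my_shrink_lower_mtu(struct net_device *my_lower_dev)
{
	ASSERT_RTNL();
	return dev_set_mtu(my_lower_dev,
			   my_lower_dev->mtu - MY_ENCAP_OVERHEAD);
}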
1da177e4 5983
cbda10fa
VD
5984/**
5985 * dev_set_group - Change group this device belongs to
5986 * @dev: device
5987 * @new_group: group this device should belong to
5988 */
5989void dev_set_group(struct net_device *dev, int new_group)
5990{
5991 dev->group = new_group;
5992}
5993EXPORT_SYMBOL(dev_set_group);
5994
f0db275a
SH
5995/**
5996 * dev_set_mac_address - Change Media Access Control Address
5997 * @dev: device
5998 * @sa: new address
5999 *
6000 * Change the hardware (MAC) address of the device
6001 */
1da177e4
LT
6002int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6003{
d314774c 6004 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
6005 int err;
6006
d314774c 6007 if (!ops->ndo_set_mac_address)
1da177e4
LT
6008 return -EOPNOTSUPP;
6009 if (sa->sa_family != dev->type)
6010 return -EINVAL;
6011 if (!netif_device_present(dev))
6012 return -ENODEV;
d314774c 6013 err = ops->ndo_set_mac_address(dev, sa);
f6521516
JP
6014 if (err)
6015 return err;
fbdeca2d 6016 dev->addr_assign_type = NET_ADDR_SET;
f6521516 6017 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
7bf23575 6018 add_device_randomness(dev->dev_addr, dev->addr_len);
f6521516 6019 return 0;
1da177e4 6020}
d1b19dff 6021EXPORT_SYMBOL(dev_set_mac_address);
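/*
 * Illustrative sketch, not part of the original file: programming a new
 * hardware address from kernel code.  "my_dev" and "my_new_mac" (addr_len
 * bytes, 6 for Ethernet) are hypothetical; the RTNL lock must be held and
 * sa_family must match dev->type, as checked above.
 */
static int my_set_hw_addr(struct net_device *my_dev, const u8 *my_new_mac)
{
	struct sockaddr sa;

	ASSERT_RTNL();
	sa.sa_family = my_dev->type;
	memcpy(sa.sa_data, my_new_mac, my_dev->addr_len);
	return dev_set_mac_address(my_dev, &sa);
}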
1da177e4 6022
4bf84c35
JP
6023/**
6024 * dev_change_carrier - Change device carrier
6025 * @dev: device
691b3b7e 6026 * @new_carrier: new value
4bf84c35
JP
6027 *
6028 * Change device carrier
6029 */
6030int dev_change_carrier(struct net_device *dev, bool new_carrier)
6031{
6032 const struct net_device_ops *ops = dev->netdev_ops;
6033
6034 if (!ops->ndo_change_carrier)
6035 return -EOPNOTSUPP;
6036 if (!netif_device_present(dev))
6037 return -ENODEV;
6038 return ops->ndo_change_carrier(dev, new_carrier);
6039}
6040EXPORT_SYMBOL(dev_change_carrier);
6041
66b52b0d
JP
6042/**
6043 * dev_get_phys_port_id - Get device physical port ID
6044 * @dev: device
6045 * @ppid: port ID
6046 *
6047 * Get device physical port ID
6048 */
6049int dev_get_phys_port_id(struct net_device *dev,
02637fce 6050 struct netdev_phys_item_id *ppid)
66b52b0d
JP
6051{
6052 const struct net_device_ops *ops = dev->netdev_ops;
6053
6054 if (!ops->ndo_get_phys_port_id)
6055 return -EOPNOTSUPP;
6056 return ops->ndo_get_phys_port_id(dev, ppid);
6057}
6058EXPORT_SYMBOL(dev_get_phys_port_id);
6059
db24a904
DA
6060/**
6061 * dev_get_phys_port_name - Get device physical port name
6062 * @dev: device
6063 * @name: port name
6064 *
6065 * Get device physical port name
6066 */
6067int dev_get_phys_port_name(struct net_device *dev,
6068 char *name, size_t len)
6069{
6070 const struct net_device_ops *ops = dev->netdev_ops;
6071
6072 if (!ops->ndo_get_phys_port_name)
6073 return -EOPNOTSUPP;
6074 return ops->ndo_get_phys_port_name(dev, name, len);
6075}
6076EXPORT_SYMBOL(dev_get_phys_port_name);
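/*
 * Illustrative sketch, not part of the original file: querying a device's
 * physical port identity, similar in spirit to what rtnetlink does when
 * filling link attributes.  "my_dump_phys_port" and "my_dev" are
 * hypothetical; -EOPNOTSUPP simply means the driver has no such notion.
 */
static void my_dump_phys_port(struct net_device *my_dev)
{
	struct netdev_phys_item_id ppid;
	char name[IFNAMSIZ];

	if (!dev_get_phys_port_id(my_dev, &ppid))
		pr_debug("%s: phys port id of %d bytes\n",
			 my_dev->name, ppid.id_len);

	if (!dev_get_phys_port_name(my_dev, name, sizeof(name)))
		pr_debug("%s: phys port name %s\n", my_dev->name, name);
}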
6077
1da177e4
LT
6078/**
6079 * dev_new_index - allocate an ifindex
c4ea43c5 6080 * @net: the applicable net namespace
1da177e4
LT
6081 *
6082 * Returns a suitable unique value for a new device interface
6083 * number. The caller must hold the rtnl semaphore or the
6084 * dev_base_lock to be sure it remains unique.
6085 */
881d966b 6086static int dev_new_index(struct net *net)
1da177e4 6087{
aa79e66e 6088 int ifindex = net->ifindex;
1da177e4
LT
6089 for (;;) {
6090 if (++ifindex <= 0)
6091 ifindex = 1;
881d966b 6092 if (!__dev_get_by_index(net, ifindex))
aa79e66e 6093 return net->ifindex = ifindex;
1da177e4
LT
6094 }
6095}
6096
1da177e4 6097/* Delayed registration/unregisteration */
3b5b34fd 6098static LIST_HEAD(net_todo_list);
200b916f 6099DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
1da177e4 6100
6f05f629 6101static void net_set_todo(struct net_device *dev)
1da177e4 6102{
1da177e4 6103 list_add_tail(&dev->todo_list, &net_todo_list);
50624c93 6104 dev_net(dev)->dev_unreg_count++;
1da177e4
LT
6105}
6106
9b5e383c 6107static void rollback_registered_many(struct list_head *head)
93ee31f1 6108{
e93737b0 6109 struct net_device *dev, *tmp;
5cde2829 6110 LIST_HEAD(close_head);
9b5e383c 6111
93ee31f1
DL
6112 BUG_ON(dev_boot_phase);
6113 ASSERT_RTNL();
6114
e93737b0 6115 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
9b5e383c 6116 /* Some devices call without registering
e93737b0
KK
6117 * for initialization unwind. Remove those
6118 * devices and proceed with the remaining.
9b5e383c
ED
6119 */
6120 if (dev->reg_state == NETREG_UNINITIALIZED) {
7b6cd1ce
JP
6121 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6122 dev->name, dev);
93ee31f1 6123
9b5e383c 6124 WARN_ON(1);
e93737b0
KK
6125 list_del(&dev->unreg_list);
6126 continue;
9b5e383c 6127 }
449f4544 6128 dev->dismantle = true;
9b5e383c 6129 BUG_ON(dev->reg_state != NETREG_REGISTERED);
44345724 6130 }
93ee31f1 6131
44345724 6132 /* If device is running, close it first. */
5cde2829
EB
6133 list_for_each_entry(dev, head, unreg_list)
6134 list_add_tail(&dev->close_list, &close_head);
99c4a26a 6135 dev_close_many(&close_head, true);
93ee31f1 6136
44345724 6137 list_for_each_entry(dev, head, unreg_list) {
9b5e383c
ED
6138 /* And unlink it from device chain. */
6139 unlist_netdevice(dev);
93ee31f1 6140
9b5e383c
ED
6141 dev->reg_state = NETREG_UNREGISTERING;
6142 }
93ee31f1
DL
6143
6144 synchronize_net();
6145
9b5e383c 6146 list_for_each_entry(dev, head, unreg_list) {
395eea6c
MB
6147 struct sk_buff *skb = NULL;
6148
9b5e383c
ED
6149 /* Shutdown queueing discipline. */
6150 dev_shutdown(dev);
93ee31f1
DL
6151
6152
9b5e383c
ED
 6153 /* Notify protocols that we are about to destroy
 6154 this device. They should clean up all of their state.
6155 */
6156 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
93ee31f1 6157
395eea6c
MB
6158 if (!dev->rtnl_link_ops ||
6159 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6160 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6161 GFP_KERNEL);
6162
9b5e383c
ED
6163 /*
6164 * Flush the unicast and multicast chains
6165 */
a748ee24 6166 dev_uc_flush(dev);
22bedad3 6167 dev_mc_flush(dev);
93ee31f1 6168
9b5e383c
ED
6169 if (dev->netdev_ops->ndo_uninit)
6170 dev->netdev_ops->ndo_uninit(dev);
93ee31f1 6171
395eea6c
MB
6172 if (skb)
6173 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
56bfa7ee 6174
9ff162a8
JP
6175 /* Notifier chain MUST detach us all upper devices. */
6176 WARN_ON(netdev_has_any_upper_dev(dev));
93ee31f1 6177
9b5e383c
ED
6178 /* Remove entries from kobject tree */
6179 netdev_unregister_kobject(dev);
024e9679
AD
6180#ifdef CONFIG_XPS
6181 /* Remove XPS queueing entries */
6182 netif_reset_xps_queues_gt(dev, 0);
6183#endif
9b5e383c 6184 }
93ee31f1 6185
850a545b 6186 synchronize_net();
395264d5 6187
a5ee1551 6188 list_for_each_entry(dev, head, unreg_list)
9b5e383c
ED
6189 dev_put(dev);
6190}
6191
6192static void rollback_registered(struct net_device *dev)
6193{
6194 LIST_HEAD(single);
6195
6196 list_add(&dev->unreg_list, &single);
6197 rollback_registered_many(&single);
ceaaec98 6198 list_del(&single);
93ee31f1
DL
6199}
6200
c8f44aff
MM
6201static netdev_features_t netdev_fix_features(struct net_device *dev,
6202 netdev_features_t features)
b63365a2 6203{
57422dc5
MM
6204 /* Fix illegal checksum combinations */
6205 if ((features & NETIF_F_HW_CSUM) &&
6206 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 6207 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
57422dc5
MM
6208 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6209 }
6210
b63365a2 6211 /* TSO requires that SG is present as well. */
ea2d3688 6212 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6f404e44 6213 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
ea2d3688 6214 features &= ~NETIF_F_ALL_TSO;
b63365a2
HX
6215 }
6216
ec5f0615
PS
6217 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6218 !(features & NETIF_F_IP_CSUM)) {
6219 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6220 features &= ~NETIF_F_TSO;
6221 features &= ~NETIF_F_TSO_ECN;
6222 }
6223
6224 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6225 !(features & NETIF_F_IPV6_CSUM)) {
6226 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6227 features &= ~NETIF_F_TSO6;
6228 }
6229
31d8b9e0
BH
6230 /* TSO ECN requires that TSO is present as well. */
6231 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6232 features &= ~NETIF_F_TSO_ECN;
6233
212b573f
MM
6234 /* Software GSO depends on SG. */
6235 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6f404e44 6236 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
212b573f
MM
6237 features &= ~NETIF_F_GSO;
6238 }
6239
acd1130e 6240 /* UFO needs SG and checksumming */
b63365a2 6241 if (features & NETIF_F_UFO) {
79032644
MM
6242 /* maybe split UFO into V4 and V6? */
6243 if (!((features & NETIF_F_GEN_CSUM) ||
6244 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6245 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 6246 netdev_dbg(dev,
acd1130e 6247 "Dropping NETIF_F_UFO since no checksum offload features.\n");
b63365a2
HX
6248 features &= ~NETIF_F_UFO;
6249 }
6250
6251 if (!(features & NETIF_F_SG)) {
6f404e44 6252 netdev_dbg(dev,
acd1130e 6253 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
b63365a2
HX
6254 features &= ~NETIF_F_UFO;
6255 }
6256 }
6257
d0290214
JP
6258#ifdef CONFIG_NET_RX_BUSY_POLL
6259 if (dev->netdev_ops->ndo_busy_poll)
6260 features |= NETIF_F_BUSY_POLL;
6261 else
6262#endif
6263 features &= ~NETIF_F_BUSY_POLL;
6264
b63365a2
HX
6265 return features;
6266}
b63365a2 6267
6cb6a27c 6268int __netdev_update_features(struct net_device *dev)
5455c699 6269{
c8f44aff 6270 netdev_features_t features;
5455c699
MM
6271 int err = 0;
6272
87267485
MM
6273 ASSERT_RTNL();
6274
5455c699
MM
6275 features = netdev_get_wanted_features(dev);
6276
6277 if (dev->netdev_ops->ndo_fix_features)
6278 features = dev->netdev_ops->ndo_fix_features(dev, features);
6279
6280 /* driver might be less strict about feature dependencies */
6281 features = netdev_fix_features(dev, features);
6282
6283 if (dev->features == features)
6cb6a27c 6284 return 0;
5455c699 6285
c8f44aff
MM
6286 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6287 &dev->features, &features);
5455c699
MM
6288
6289 if (dev->netdev_ops->ndo_set_features)
6290 err = dev->netdev_ops->ndo_set_features(dev, features);
6291
6cb6a27c 6292 if (unlikely(err < 0)) {
5455c699 6293 netdev_err(dev,
c8f44aff
MM
6294 "set_features() failed (%d); wanted %pNF, left %pNF\n",
6295 err, &features, &dev->features);
6cb6a27c
MM
6296 return -1;
6297 }
6298
6299 if (!err)
6300 dev->features = features;
6301
6302 return 1;
6303}
6304
afe12cc8
MM
6305/**
6306 * netdev_update_features - recalculate device features
6307 * @dev: the device to check
6308 *
6309 * Recalculate dev->features set and send notifications if it
6310 * has changed. Should be called after driver or hardware dependent
6311 * conditions might have changed that influence the features.
6312 */
6cb6a27c
MM
6313void netdev_update_features(struct net_device *dev)
6314{
6315 if (__netdev_update_features(dev))
6316 netdev_features_change(dev);
5455c699
MM
6317}
6318EXPORT_SYMBOL(netdev_update_features);
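/*
 * Illustrative sketch, not part of the original file: the usual pattern for
 * a driver whose offload capabilities depend on runtime state.  The
 * ndo_fix_features hook trims the feature set, and the driver calls
 * netdev_update_features() (under RTNL) whenever that state changes.
 * "my_fix_features", "my_dev_jumbo_enabled" and the TSO rule are
 * hypothetical, and assume .ndo_fix_features = my_fix_features is wired
 * into the driver's netdev_ops.
 */
static bool my_dev_jumbo_enabled(struct net_device *dev)
{
	return dev->mtu > ETH_DATA_LEN;		/* hypothetical criterion */
}

static netdev_features_t my_fix_features(struct net_device *dev,
					 netdev_features_t features)
{
	/* pretend the hardware cannot do TSO while jumbo frames are on */
	if (my_dev_jumbo_enabled(dev))
		features &= ~NETIF_F_ALL_TSO;
	return features;
}

static void my_jumbo_config_changed(struct net_device *dev)
{
	ASSERT_RTNL();
	netdev_update_features(dev);	/* re-evaluates the feature set */
}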
6319
afe12cc8
MM
6320/**
6321 * netdev_change_features - recalculate device features
6322 * @dev: the device to check
6323 *
6324 * Recalculate dev->features set and send notifications even
6325 * if they have not changed. Should be called instead of
6326 * netdev_update_features() if also dev->vlan_features might
6327 * have changed to allow the changes to be propagated to stacked
6328 * VLAN devices.
6329 */
6330void netdev_change_features(struct net_device *dev)
6331{
6332 __netdev_update_features(dev);
6333 netdev_features_change(dev);
6334}
6335EXPORT_SYMBOL(netdev_change_features);
6336
fc4a7489
PM
6337/**
6338 * netif_stacked_transfer_operstate - transfer operstate
6339 * @rootdev: the root or lower level device to transfer state from
6340 * @dev: the device to transfer operstate to
6341 *
6342 * Transfer operational state from root to device. This is normally
6343 * called when a stacking relationship exists between the root
 6344 * device and the device (a leaf device).
6345 */
6346void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6347 struct net_device *dev)
6348{
6349 if (rootdev->operstate == IF_OPER_DORMANT)
6350 netif_dormant_on(dev);
6351 else
6352 netif_dormant_off(dev);
6353
6354 if (netif_carrier_ok(rootdev)) {
6355 if (!netif_carrier_ok(dev))
6356 netif_carrier_on(dev);
6357 } else {
6358 if (netif_carrier_ok(dev))
6359 netif_carrier_off(dev);
6360 }
6361}
6362EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6363
a953be53 6364#ifdef CONFIG_SYSFS
1b4bf461
ED
6365static int netif_alloc_rx_queues(struct net_device *dev)
6366{
1b4bf461 6367 unsigned int i, count = dev->num_rx_queues;
bd25fa7b 6368 struct netdev_rx_queue *rx;
10595902 6369 size_t sz = count * sizeof(*rx);
1b4bf461 6370
bd25fa7b 6371 BUG_ON(count < 1);
1b4bf461 6372
10595902
PG
6373 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6374 if (!rx) {
6375 rx = vzalloc(sz);
6376 if (!rx)
6377 return -ENOMEM;
6378 }
bd25fa7b
TH
6379 dev->_rx = rx;
6380
bd25fa7b 6381 for (i = 0; i < count; i++)
fe822240 6382 rx[i].dev = dev;
1b4bf461
ED
6383 return 0;
6384}
bf264145 6385#endif
1b4bf461 6386
aa942104
CG
6387static void netdev_init_one_queue(struct net_device *dev,
6388 struct netdev_queue *queue, void *_unused)
6389{
6390 /* Initialize queue lock */
6391 spin_lock_init(&queue->_xmit_lock);
6392 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6393 queue->xmit_lock_owner = -1;
b236da69 6394 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
aa942104 6395 queue->dev = dev;
114cf580
TH
6396#ifdef CONFIG_BQL
6397 dql_init(&queue->dql, HZ);
6398#endif
aa942104
CG
6399}
6400
60877a32
ED
6401static void netif_free_tx_queues(struct net_device *dev)
6402{
4cb28970 6403 kvfree(dev->_tx);
60877a32
ED
6404}
6405
e6484930
TH
6406static int netif_alloc_netdev_queues(struct net_device *dev)
6407{
6408 unsigned int count = dev->num_tx_queues;
6409 struct netdev_queue *tx;
60877a32 6410 size_t sz = count * sizeof(*tx);
e6484930 6411
60877a32 6412 BUG_ON(count < 1 || count > 0xffff);
62b5942a 6413
60877a32
ED
6414 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6415 if (!tx) {
6416 tx = vzalloc(sz);
6417 if (!tx)
6418 return -ENOMEM;
6419 }
e6484930 6420 dev->_tx = tx;
1d24eb48 6421
e6484930
TH
6422 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6423 spin_lock_init(&dev->tx_global_lock);
aa942104
CG
6424
6425 return 0;
e6484930
TH
6426}
6427
a2029240
DV
6428void netif_tx_stop_all_queues(struct net_device *dev)
6429{
6430 unsigned int i;
6431
6432 for (i = 0; i < dev->num_tx_queues; i++) {
6433 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6434 netif_tx_stop_queue(txq);
6435 }
6436}
6437EXPORT_SYMBOL(netif_tx_stop_all_queues);
6438
1da177e4
LT
6439/**
6440 * register_netdevice - register a network device
6441 * @dev: device to register
6442 *
6443 * Take a completed network device structure and add it to the kernel
6444 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6445 * chain. 0 is returned on success. A negative errno code is returned
6446 * on a failure to set up the device, or if the name is a duplicate.
6447 *
6448 * Callers must hold the rtnl semaphore. You may want
6449 * register_netdev() instead of this.
6450 *
6451 * BUGS:
6452 * The locking appears insufficient to guarantee two parallel registers
6453 * will not get the same name.
6454 */
6455
6456int register_netdevice(struct net_device *dev)
6457{
1da177e4 6458 int ret;
d314774c 6459 struct net *net = dev_net(dev);
1da177e4
LT
6460
6461 BUG_ON(dev_boot_phase);
6462 ASSERT_RTNL();
6463
b17a7c17
SH
6464 might_sleep();
6465
1da177e4
LT
6466 /* When net_device's are persistent, this will be fatal. */
6467 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
d314774c 6468 BUG_ON(!net);
1da177e4 6469
f1f28aa3 6470 spin_lock_init(&dev->addr_list_lock);
cf508b12 6471 netdev_set_addr_lockdep_class(dev);
1da177e4 6472
828de4f6 6473 ret = dev_get_valid_name(net, dev, dev->name);
0696c3a8
PP
6474 if (ret < 0)
6475 goto out;
6476
1da177e4 6477 /* Init, if this function is available */
d314774c
SH
6478 if (dev->netdev_ops->ndo_init) {
6479 ret = dev->netdev_ops->ndo_init(dev);
1da177e4
LT
6480 if (ret) {
6481 if (ret > 0)
6482 ret = -EIO;
90833aa4 6483 goto out;
1da177e4
LT
6484 }
6485 }
4ec93edb 6486
f646968f
PM
6487 if (((dev->hw_features | dev->features) &
6488 NETIF_F_HW_VLAN_CTAG_FILTER) &&
d2ed273d
MM
6489 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6490 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6491 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6492 ret = -EINVAL;
6493 goto err_uninit;
6494 }
6495
9c7dafbf
PE
6496 ret = -EBUSY;
6497 if (!dev->ifindex)
6498 dev->ifindex = dev_new_index(net);
6499 else if (__dev_get_by_index(net, dev->ifindex))
6500 goto err_uninit;
6501
5455c699
MM
6502 /* Transfer changeable features to wanted_features and enable
6503 * software offloads (GSO and GRO).
6504 */
6505 dev->hw_features |= NETIF_F_SOFT_FEATURES;
14d1232f
MM
6506 dev->features |= NETIF_F_SOFT_FEATURES;
6507 dev->wanted_features = dev->features & dev->hw_features;
1da177e4 6508
34324dc2
MM
6509 if (!(dev->flags & IFF_LOOPBACK)) {
6510 dev->hw_features |= NETIF_F_NOCACHE_COPY;
c6e1a0d1
TH
6511 }
6512
1180e7d6 6513 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
16c3ea78 6514 */
1180e7d6 6515 dev->vlan_features |= NETIF_F_HIGHDMA;
16c3ea78 6516
ee579677
PS
6517 /* Make NETIF_F_SG inheritable to tunnel devices.
6518 */
6519 dev->hw_enc_features |= NETIF_F_SG;
6520
0d89d203
SH
6521 /* Make NETIF_F_SG inheritable to MPLS.
6522 */
6523 dev->mpls_features |= NETIF_F_SG;
6524
7ffbe3fd
JB
6525 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6526 ret = notifier_to_errno(ret);
6527 if (ret)
6528 goto err_uninit;
6529
8b41d188 6530 ret = netdev_register_kobject(dev);
b17a7c17 6531 if (ret)
7ce1b0ed 6532 goto err_uninit;
b17a7c17
SH
6533 dev->reg_state = NETREG_REGISTERED;
6534
6cb6a27c 6535 __netdev_update_features(dev);
8e9b59b2 6536
1da177e4
LT
6537 /*
6538 * Default initial state at registry is that the
6539 * device is present.
6540 */
6541
6542 set_bit(__LINK_STATE_PRESENT, &dev->state);
6543
8f4cccbb
BH
6544 linkwatch_init_dev(dev);
6545
1da177e4 6546 dev_init_scheduler(dev);
1da177e4 6547 dev_hold(dev);
ce286d32 6548 list_netdevice(dev);
7bf23575 6549 add_device_randomness(dev->dev_addr, dev->addr_len);
1da177e4 6550
948b337e
JP
6551 /* If the device has permanent device address, driver should
6552 * set dev_addr and also addr_assign_type should be set to
6553 * NET_ADDR_PERM (default value).
6554 */
6555 if (dev->addr_assign_type == NET_ADDR_PERM)
6556 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6557
1da177e4 6558 /* Notify protocols that a new device appeared. */
056925ab 6559 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
fcc5a03a 6560 ret = notifier_to_errno(ret);
93ee31f1
DL
6561 if (ret) {
6562 rollback_registered(dev);
6563 dev->reg_state = NETREG_UNREGISTERED;
6564 }
d90a909e
EB
6565 /*
6566 * Prevent userspace races by waiting until the network
6567 * device is fully setup before sending notifications.
6568 */
a2835763
PM
6569 if (!dev->rtnl_link_ops ||
6570 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7f294054 6571 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
1da177e4
LT
6572
6573out:
6574 return ret;
7ce1b0ed
HX
6575
6576err_uninit:
d314774c
SH
6577 if (dev->netdev_ops->ndo_uninit)
6578 dev->netdev_ops->ndo_uninit(dev);
7ce1b0ed 6579 goto out;
1da177e4 6580}
d1b19dff 6581EXPORT_SYMBOL(register_netdevice);
1da177e4 6582
937f1ba5
BH
6583/**
6584 * init_dummy_netdev - init a dummy network device for NAPI
6585 * @dev: device to init
6586 *
 6587 * This takes a network device structure and initializes the minimum
 6588 * number of fields so it can be used to schedule NAPI polls without
6589 * registering a full blown interface. This is to be used by drivers
6590 * that need to tie several hardware interfaces to a single NAPI
6591 * poll scheduler due to HW limitations.
6592 */
6593int init_dummy_netdev(struct net_device *dev)
6594{
6595 /* Clear everything. Note we don't initialize spinlocks
 6596 * as they aren't supposed to be taken by any of the
6597 * NAPI code and this dummy netdev is supposed to be
6598 * only ever used for NAPI polls
6599 */
6600 memset(dev, 0, sizeof(struct net_device));
6601
6602 /* make sure we BUG if trying to hit standard
6603 * register/unregister code path
6604 */
6605 dev->reg_state = NETREG_DUMMY;
6606
937f1ba5
BH
6607 /* NAPI wants this */
6608 INIT_LIST_HEAD(&dev->napi_list);
6609
6610 /* a dummy interface is started by default */
6611 set_bit(__LINK_STATE_PRESENT, &dev->state);
6612 set_bit(__LINK_STATE_START, &dev->state);
6613
29b4433d
ED
 6614 /* Note: We don't allocate pcpu_refcnt for dummy devices,
 6615 * because users of this 'device' don't need to change
6616 * its refcount.
6617 */
6618
937f1ba5
BH
6619 return 0;
6620}
6621EXPORT_SYMBOL_GPL(init_dummy_netdev);
6622
6623
1da177e4
LT
6624/**
6625 * register_netdev - register a network device
6626 * @dev: device to register
6627 *
6628 * Take a completed network device structure and add it to the kernel
6629 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6630 * chain. 0 is returned on success. A negative errno code is returned
6631 * on a failure to set up the device, or if the name is a duplicate.
6632 *
38b4da38 6633 * This is a wrapper around register_netdevice that takes the rtnl semaphore
1da177e4
LT
6634 * and expands the device name if you passed a format string to
6635 * alloc_netdev.
6636 */
6637int register_netdev(struct net_device *dev)
6638{
6639 int err;
6640
6641 rtnl_lock();
1da177e4 6642 err = register_netdevice(dev);
1da177e4
LT
6643 rtnl_unlock();
6644 return err;
6645}
6646EXPORT_SYMBOL(register_netdev);
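/*
 * Illustrative sketch, not part of the original file: the classic probe-time
 * sequence built on register_netdev().  "my_priv", "my_probe_netdev" and the
 * caller-supplied netdev_ops are hypothetical driver pieces.
 */
struct my_priv {
	void __iomem *regs;
};

static int my_probe_netdev(const struct net_device_ops *my_netdev_ops)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(sizeof(struct my_priv));
	if (!dev)
		return -ENOMEM;

	dev->netdev_ops = my_netdev_ops;
	eth_hw_addr_random(dev);	/* placeholder until a real MAC is known */

	err = register_netdev(dev);	/* takes the RTNL lock internally */
	if (err) {
		free_netdev(dev);
		return err;
	}
	return 0;
}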
6647
29b4433d
ED
6648int netdev_refcnt_read(const struct net_device *dev)
6649{
6650 int i, refcnt = 0;
6651
6652 for_each_possible_cpu(i)
6653 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6654 return refcnt;
6655}
6656EXPORT_SYMBOL(netdev_refcnt_read);
6657
2c53040f 6658/**
1da177e4 6659 * netdev_wait_allrefs - wait until all references are gone.
3de7a37b 6660 * @dev: target net_device
1da177e4
LT
6661 *
6662 * This is called when unregistering network devices.
6663 *
6664 * Any protocol or device that holds a reference should register
 6665 * for netdevice notification, and clean up and put back the
6666 * reference if they receive an UNREGISTER event.
6667 * We can get stuck here if buggy protocols don't correctly
4ec93edb 6668 * call dev_put.
1da177e4
LT
6669 */
6670static void netdev_wait_allrefs(struct net_device *dev)
6671{
6672 unsigned long rebroadcast_time, warning_time;
29b4433d 6673 int refcnt;
1da177e4 6674
e014debe
ED
6675 linkwatch_forget_dev(dev);
6676
1da177e4 6677 rebroadcast_time = warning_time = jiffies;
29b4433d
ED
6678 refcnt = netdev_refcnt_read(dev);
6679
6680 while (refcnt != 0) {
1da177e4 6681 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6756ae4b 6682 rtnl_lock();
1da177e4
LT
6683
6684 /* Rebroadcast unregister notification */
056925ab 6685 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
1da177e4 6686
748e2d93 6687 __rtnl_unlock();
0115e8e3 6688 rcu_barrier();
748e2d93
ED
6689 rtnl_lock();
6690
0115e8e3 6691 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
1da177e4
LT
6692 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6693 &dev->state)) {
6694 /* We must not have linkwatch events
6695 * pending on unregister. If this
6696 * happens, we simply run the queue
6697 * unscheduled, resulting in a noop
6698 * for this device.
6699 */
6700 linkwatch_run_queue();
6701 }
6702
6756ae4b 6703 __rtnl_unlock();
1da177e4
LT
6704
6705 rebroadcast_time = jiffies;
6706 }
6707
6708 msleep(250);
6709
29b4433d
ED
6710 refcnt = netdev_refcnt_read(dev);
6711
1da177e4 6712 if (time_after(jiffies, warning_time + 10 * HZ)) {
7b6cd1ce
JP
6713 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6714 dev->name, refcnt);
1da177e4
LT
6715 warning_time = jiffies;
6716 }
6717 }
6718}
6719
6720/* The sequence is:
6721 *
6722 * rtnl_lock();
6723 * ...
6724 * register_netdevice(x1);
6725 * register_netdevice(x2);
6726 * ...
6727 * unregister_netdevice(y1);
6728 * unregister_netdevice(y2);
6729 * ...
6730 * rtnl_unlock();
6731 * free_netdev(y1);
6732 * free_netdev(y2);
6733 *
58ec3b4d 6734 * We are invoked by rtnl_unlock().
1da177e4 6735 * This allows us to deal with problems:
b17a7c17 6736 * 1) We can delete sysfs objects which invoke hotplug
1da177e4
LT
6737 * without deadlocking with linkwatch via keventd.
6738 * 2) Since we run with the RTNL semaphore not held, we can sleep
6739 * safely in order to wait for the netdev refcnt to drop to zero.
58ec3b4d
HX
6740 *
6741 * We must not return until all unregister events added during
6742 * the interval the lock was held have been completed.
1da177e4 6743 */
1da177e4
LT
6744void netdev_run_todo(void)
6745{
626ab0e6 6746 struct list_head list;
1da177e4 6747
1da177e4 6748 /* Snapshot list, allow later requests */
626ab0e6 6749 list_replace_init(&net_todo_list, &list);
58ec3b4d
HX
6750
6751 __rtnl_unlock();
626ab0e6 6752
0115e8e3
ED
6753
6754 /* Wait for rcu callbacks to finish before next phase */
850a545b
EB
6755 if (!list_empty(&list))
6756 rcu_barrier();
6757
1da177e4
LT
6758 while (!list_empty(&list)) {
6759 struct net_device *dev
e5e26d75 6760 = list_first_entry(&list, struct net_device, todo_list);
1da177e4
LT
6761 list_del(&dev->todo_list);
6762
748e2d93 6763 rtnl_lock();
0115e8e3 6764 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
748e2d93 6765 __rtnl_unlock();
0115e8e3 6766
b17a7c17 6767 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7b6cd1ce 6768 pr_err("network todo '%s' but state %d\n",
b17a7c17
SH
6769 dev->name, dev->reg_state);
6770 dump_stack();
6771 continue;
6772 }
1da177e4 6773
b17a7c17 6774 dev->reg_state = NETREG_UNREGISTERED;
1da177e4 6775
152102c7 6776 on_each_cpu(flush_backlog, dev, 1);
6e583ce5 6777
b17a7c17 6778 netdev_wait_allrefs(dev);
1da177e4 6779
b17a7c17 6780 /* paranoia */
29b4433d 6781 BUG_ON(netdev_refcnt_read(dev));
7866a621
SN
6782 BUG_ON(!list_empty(&dev->ptype_all));
6783 BUG_ON(!list_empty(&dev->ptype_specific));
33d480ce
ED
6784 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6785 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
547b792c 6786 WARN_ON(dev->dn_ptr);
1da177e4 6787
b17a7c17
SH
6788 if (dev->destructor)
6789 dev->destructor(dev);
9093bbb2 6790
50624c93
EB
6791 /* Report a network device has been unregistered */
6792 rtnl_lock();
6793 dev_net(dev)->dev_unreg_count--;
6794 __rtnl_unlock();
6795 wake_up(&netdev_unregistering_wq);
6796
9093bbb2
SH
6797 /* Free network device */
6798 kobject_put(&dev->dev.kobj);
1da177e4 6799 }
1da177e4
LT
6800}
6801
3cfde79c
BH
6802/* Convert net_device_stats to rtnl_link_stats64. They have the same
6803 * fields in the same order, with only the type differing.
6804 */
77a1abf5
ED
6805void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6806 const struct net_device_stats *netdev_stats)
3cfde79c
BH
6807{
6808#if BITS_PER_LONG == 64
77a1abf5
ED
6809 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6810 memcpy(stats64, netdev_stats, sizeof(*stats64));
3cfde79c
BH
6811#else
6812 size_t i, n = sizeof(*stats64) / sizeof(u64);
6813 const unsigned long *src = (const unsigned long *)netdev_stats;
6814 u64 *dst = (u64 *)stats64;
6815
6816 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6817 sizeof(*stats64) / sizeof(u64));
6818 for (i = 0; i < n; i++)
6819 dst[i] = src[i];
6820#endif
6821}
77a1abf5 6822EXPORT_SYMBOL(netdev_stats_to_stats64);
3cfde79c 6823
eeda3fd6
SH
6824/**
6825 * dev_get_stats - get network device statistics
6826 * @dev: device to get statistics from
28172739 6827 * @storage: place to store stats
eeda3fd6 6828 *
d7753516
BH
6829 * Get network statistics from device. Return @storage.
6830 * The device driver may provide its own method by setting
6831 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6832 * otherwise the internal statistics structure is used.
eeda3fd6 6833 */
d7753516
BH
6834struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6835 struct rtnl_link_stats64 *storage)
7004bf25 6836{
eeda3fd6
SH
6837 const struct net_device_ops *ops = dev->netdev_ops;
6838
28172739
ED
6839 if (ops->ndo_get_stats64) {
6840 memset(storage, 0, sizeof(*storage));
caf586e5
ED
6841 ops->ndo_get_stats64(dev, storage);
6842 } else if (ops->ndo_get_stats) {
3cfde79c 6843 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
caf586e5
ED
6844 } else {
6845 netdev_stats_to_stats64(storage, &dev->stats);
28172739 6846 }
caf586e5 6847 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
015f0688 6848 storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
28172739 6849 return storage;
c45d286e 6850}
eeda3fd6 6851EXPORT_SYMBOL(dev_get_stats);
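/*
 * Illustrative sketch, not part of the original file: taking a snapshot of a
 * device's 64-bit counters into caller-provided storage.  "my_rx_packets"
 * and "my_dev" are hypothetical.
 */
static u64 my_rx_packets(struct net_device *my_dev)
{
	struct rtnl_link_stats64 stats;

	dev_get_stats(my_dev, &stats);	/* fills the whole structure */
	return stats.rx_packets;
}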
c45d286e 6852
24824a09 6853struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
dc2b4847 6854{
24824a09 6855 struct netdev_queue *queue = dev_ingress_queue(dev);
dc2b4847 6856
24824a09
ED
6857#ifdef CONFIG_NET_CLS_ACT
6858 if (queue)
6859 return queue;
6860 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6861 if (!queue)
6862 return NULL;
6863 netdev_init_one_queue(dev, queue, NULL);
2ce1ee17 6864 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
24824a09
ED
6865 queue->qdisc_sleeping = &noop_qdisc;
6866 rcu_assign_pointer(dev->ingress_queue, queue);
6867#endif
6868 return queue;
bb949fbd
DM
6869}
6870
2c60db03
ED
6871static const struct ethtool_ops default_ethtool_ops;
6872
d07d7507
SG
6873void netdev_set_default_ethtool_ops(struct net_device *dev,
6874 const struct ethtool_ops *ops)
6875{
6876 if (dev->ethtool_ops == &default_ethtool_ops)
6877 dev->ethtool_ops = ops;
6878}
6879EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6880
74d332c1
ED
6881void netdev_freemem(struct net_device *dev)
6882{
6883 char *addr = (char *)dev - dev->padded;
6884
4cb28970 6885 kvfree(addr);
74d332c1
ED
6886}
6887
1da177e4 6888/**
36909ea4 6889 * alloc_netdev_mqs - allocate network device
c835a677
TG
6890 * @sizeof_priv: size of private data to allocate space for
6891 * @name: device name format string
6892 * @name_assign_type: origin of device name
6893 * @setup: callback to initialize device
6894 * @txqs: the number of TX subqueues to allocate
6895 * @rxqs: the number of RX subqueues to allocate
1da177e4
LT
6896 *
6897 * Allocates a struct net_device with private data area for driver use
90e51adf 6898 * and performs basic initialization. Also allocates subqueue structs
36909ea4 6899 * for each queue on the device.
1da177e4 6900 */
36909ea4 6901struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
c835a677 6902 unsigned char name_assign_type,
36909ea4
TH
6903 void (*setup)(struct net_device *),
6904 unsigned int txqs, unsigned int rxqs)
1da177e4 6905{
1da177e4 6906 struct net_device *dev;
7943986c 6907 size_t alloc_size;
1ce8e7b5 6908 struct net_device *p;
1da177e4 6909
b6fe17d6
SH
6910 BUG_ON(strlen(name) >= sizeof(dev->name));
6911
36909ea4 6912 if (txqs < 1) {
7b6cd1ce 6913 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
55513fb4
TH
6914 return NULL;
6915 }
6916
a953be53 6917#ifdef CONFIG_SYSFS
36909ea4 6918 if (rxqs < 1) {
7b6cd1ce 6919 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
36909ea4
TH
6920 return NULL;
6921 }
6922#endif
6923
fd2ea0a7 6924 alloc_size = sizeof(struct net_device);
d1643d24
AD
6925 if (sizeof_priv) {
6926 /* ensure 32-byte alignment of private area */
1ce8e7b5 6927 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
d1643d24
AD
6928 alloc_size += sizeof_priv;
6929 }
6930 /* ensure 32-byte alignment of whole construct */
1ce8e7b5 6931 alloc_size += NETDEV_ALIGN - 1;
1da177e4 6932
74d332c1
ED
6933 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6934 if (!p)
6935 p = vzalloc(alloc_size);
62b5942a 6936 if (!p)
1da177e4 6937 return NULL;
1da177e4 6938
1ce8e7b5 6939 dev = PTR_ALIGN(p, NETDEV_ALIGN);
1da177e4 6940 dev->padded = (char *)dev - (char *)p;
ab9c73cc 6941
29b4433d
ED
6942 dev->pcpu_refcnt = alloc_percpu(int);
6943 if (!dev->pcpu_refcnt)
74d332c1 6944 goto free_dev;
ab9c73cc 6945
ab9c73cc 6946 if (dev_addr_init(dev))
29b4433d 6947 goto free_pcpu;
ab9c73cc 6948
22bedad3 6949 dev_mc_init(dev);
a748ee24 6950 dev_uc_init(dev);
ccffad25 6951
c346dca1 6952 dev_net_set(dev, &init_net);
1da177e4 6953
8d3bdbd5 6954 dev->gso_max_size = GSO_MAX_SIZE;
30b678d8 6955 dev->gso_max_segs = GSO_MAX_SEGS;
fcbeb976 6956 dev->gso_min_segs = 0;
8d3bdbd5 6957
8d3bdbd5
DM
6958 INIT_LIST_HEAD(&dev->napi_list);
6959 INIT_LIST_HEAD(&dev->unreg_list);
5cde2829 6960 INIT_LIST_HEAD(&dev->close_list);
8d3bdbd5 6961 INIT_LIST_HEAD(&dev->link_watch_list);
2f268f12
VF
6962 INIT_LIST_HEAD(&dev->adj_list.upper);
6963 INIT_LIST_HEAD(&dev->adj_list.lower);
6964 INIT_LIST_HEAD(&dev->all_adj_list.upper);
6965 INIT_LIST_HEAD(&dev->all_adj_list.lower);
7866a621
SN
6966 INIT_LIST_HEAD(&dev->ptype_all);
6967 INIT_LIST_HEAD(&dev->ptype_specific);
02875878 6968 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
8d3bdbd5
DM
6969 setup(dev);
6970
36909ea4
TH
6971 dev->num_tx_queues = txqs;
6972 dev->real_num_tx_queues = txqs;
ed9af2e8 6973 if (netif_alloc_netdev_queues(dev))
8d3bdbd5 6974 goto free_all;
e8a0464c 6975
a953be53 6976#ifdef CONFIG_SYSFS
36909ea4
TH
6977 dev->num_rx_queues = rxqs;
6978 dev->real_num_rx_queues = rxqs;
fe822240 6979 if (netif_alloc_rx_queues(dev))
8d3bdbd5 6980 goto free_all;
df334545 6981#endif
0a9627f2 6982
1da177e4 6983 strcpy(dev->name, name);
c835a677 6984 dev->name_assign_type = name_assign_type;
cbda10fa 6985 dev->group = INIT_NETDEV_GROUP;
2c60db03
ED
6986 if (!dev->ethtool_ops)
6987 dev->ethtool_ops = &default_ethtool_ops;
e687ad60
PN
6988
6989 nf_hook_ingress_init(dev);
6990
1da177e4 6991 return dev;
ab9c73cc 6992
8d3bdbd5
DM
6993free_all:
6994 free_netdev(dev);
6995 return NULL;
6996
29b4433d
ED
6997free_pcpu:
6998 free_percpu(dev->pcpu_refcnt);
74d332c1
ED
6999free_dev:
7000 netdev_freemem(dev);
ab9c73cc 7001 return NULL;
1da177e4 7002}
36909ea4 7003EXPORT_SYMBOL(alloc_netdev_mqs);
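/*
 * Illustrative sketch, not part of the original file: allocating a
 * multiqueue Ethernet-like device through alloc_netdev_mqs().
 * "my_mq_priv", "my_setup", the "myeth%d" template and the queue counts are
 * hypothetical.
 */
struct my_mq_priv {
	unsigned int dummy;
};

static void my_setup(struct net_device *dev)
{
	ether_setup(dev);		/* fill in Ethernet defaults */
}

static struct net_device *my_alloc(void)
{
	/* 8 TX and 8 RX queues, name expanded from the "myeth%d" template */
	return alloc_netdev_mqs(sizeof(struct my_mq_priv), "myeth%d",
				NET_NAME_UNKNOWN, my_setup, 8, 8);
}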
1da177e4
LT
7004
7005/**
7006 * free_netdev - free network device
7007 * @dev: device
7008 *
4ec93edb
YH
7009 * This function does the last stage of destroying an allocated device
7010 * interface. The reference to the device object is released.
1da177e4
LT
7011 * If this is the last reference then it will be freed.
7012 */
7013void free_netdev(struct net_device *dev)
7014{
d565b0a1
HX
7015 struct napi_struct *p, *n;
7016
60877a32 7017 netif_free_tx_queues(dev);
a953be53 7018#ifdef CONFIG_SYSFS
10595902 7019 kvfree(dev->_rx);
fe822240 7020#endif
e8a0464c 7021
33d480ce 7022 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
24824a09 7023
f001fde5
JP
7024 /* Flush device addresses */
7025 dev_addr_flush(dev);
7026
d565b0a1
HX
7027 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7028 netif_napi_del(p);
7029
29b4433d
ED
7030 free_percpu(dev->pcpu_refcnt);
7031 dev->pcpu_refcnt = NULL;
7032
3041a069 7033 /* Compatibility with error handling in drivers */
1da177e4 7034 if (dev->reg_state == NETREG_UNINITIALIZED) {
74d332c1 7035 netdev_freemem(dev);
1da177e4
LT
7036 return;
7037 }
7038
7039 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7040 dev->reg_state = NETREG_RELEASED;
7041
43cb76d9
GKH
7042 /* will free via device release */
7043 put_device(&dev->dev);
1da177e4 7044}
d1b19dff 7045EXPORT_SYMBOL(free_netdev);
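/* [Editor's illustrative sketch, not part of dev.c] Ordering rules for
 * free_netdev(): a device that never completed registration may be handed
 * straight to free_netdev() (the NETREG_UNINITIALIZED branch above), while a
 * registered device must be unregistered first. "foo_remove" is hypothetical.
 */
#include <linux/netdevice.h>

static void foo_remove(struct net_device *dev)
{
	unregister_netdev(dev);		/* detach from the stack and wait for users */
	free_netdev(dev);		/* then drop the final reference */
}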
4ec93edb 7046
f0db275a
SH
7047/**
7048 * synchronize_net - Synchronize with packet receive processing
7049 *
7050 * Wait for packets currently being received to be done.
7051 * Does not block later packets from starting.
7052 */
4ec93edb 7053void synchronize_net(void)
1da177e4
LT
7054{
7055 might_sleep();
be3fc413
ED
7056 if (rtnl_is_locked())
7057 synchronize_rcu_expedited();
7058 else
7059 synchronize_rcu();
1da177e4 7060}
d1b19dff 7061EXPORT_SYMBOL(synchronize_net);
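/* [Editor's illustrative sketch, not part of dev.c] A common use of
 * synchronize_net(): unpublish an RCU-protected receive hook, wait for
 * packets currently being processed, then free the hook's state. The foo_*
 * names are hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo_hook {
	int id;					/* per-hook driver state */
};

static struct foo_hook __rcu *foo_hook_ptr;

static void foo_hook_teardown(struct foo_hook *hook)
{
	RCU_INIT_POINTER(foo_hook_ptr, NULL);	/* stop new readers finding it */
	synchronize_net();			/* wait for receive paths in flight */
	kfree(hook);				/* nobody can still be using it */
}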
1da177e4
LT
7062
7063/**
44a0873d 7064 * unregister_netdevice_queue - remove device from the kernel
1da177e4 7065 * @dev: device
44a0873d 7066 * @head: list
6ebfbc06 7067 *
1da177e4 7068 * This function shuts down a device interface and removes it
d59b54b1 7069 * from the kernel tables.
44a0873d 7070 * If head is not NULL, the device is queued to be unregistered later.

1da177e4
LT
7071 *
7072 * Callers must hold the rtnl semaphore. You may want
7073 * unregister_netdev() instead of this.
7074 */
7075
44a0873d 7076void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
1da177e4 7077{
a6620712
HX
7078 ASSERT_RTNL();
7079
44a0873d 7080 if (head) {
9fdce099 7081 list_move_tail(&dev->unreg_list, head);
44a0873d
ED
7082 } else {
7083 rollback_registered(dev);
7084 /* Finish processing unregister after unlock */
7085 net_set_todo(dev);
7086 }
1da177e4 7087}
44a0873d 7088EXPORT_SYMBOL(unregister_netdevice_queue);
1da177e4 7089
9b5e383c
ED
7090/**
7091 * unregister_netdevice_many - unregister many devices
7092 * @head: list of devices
87757a91
ED
7093 *
 7094 * Note: As most callers use a stack-allocated list_head,
 7095 * we force a list_del() to make sure the stack won't be corrupted later.
9b5e383c
ED
7096 */
7097void unregister_netdevice_many(struct list_head *head)
7098{
7099 struct net_device *dev;
7100
7101 if (!list_empty(head)) {
7102 rollback_registered_many(head);
7103 list_for_each_entry(dev, head, unreg_list)
7104 net_set_todo(dev);
87757a91 7105 list_del(head);
9b5e383c
ED
7106 }
7107}
63c8099d 7108EXPORT_SYMBOL(unregister_netdevice_many);
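/* [Editor's illustrative sketch, not part of dev.c] Batched teardown with a
 * stack list, as the note above suggests: queue each device with
 * unregister_netdevice_queue() and flush once with
 * unregister_netdevice_many(). foo_owns() is a hypothetical ownership test.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/string.h>

static bool foo_owns(const struct net_device *dev)
{
	return strncmp(dev->name, "foo", 3) == 0;	/* hypothetical check */
}

static void foo_destroy_all(struct net *net)
{
	struct net_device *dev, *aux;
	LIST_HEAD(kill_list);

	rtnl_lock();
	for_each_netdev_safe(net, dev, aux)
		if (foo_owns(dev))
			unregister_netdevice_queue(dev, &kill_list);
	/* one batched rollback; also list_del()s the stack list head */
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}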
9b5e383c 7109
1da177e4
LT
7110/**
7111 * unregister_netdev - remove device from the kernel
7112 * @dev: device
7113 *
7114 * This function shuts down a device interface and removes it
d59b54b1 7115 * from the kernel tables.
1da177e4
LT
7116 *
7117 * This is just a wrapper for unregister_netdevice that takes
7118 * the rtnl semaphore. In general you want to use this and not
7119 * unregister_netdevice.
7120 */
7121void unregister_netdev(struct net_device *dev)
7122{
7123 rtnl_lock();
7124 unregister_netdevice(dev);
7125 rtnl_unlock();
7126}
1da177e4
LT
7127EXPORT_SYMBOL(unregister_netdev);
7128
ce286d32
EB
7129/**
 7130 * dev_change_net_namespace - move device to a different network namespace
7131 * @dev: device
7132 * @net: network namespace
7133 * @pat: If not NULL name pattern to try if the current device name
7134 * is already taken in the destination network namespace.
7135 *
7136 * This function shuts down a device interface and moves it
7137 * to a new network namespace. On success 0 is returned, on
 7138 * a failure a negative errno code is returned.
7139 *
7140 * Callers must hold the rtnl semaphore.
7141 */
7142
7143int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7144{
ce286d32
EB
7145 int err;
7146
7147 ASSERT_RTNL();
7148
7149 /* Don't allow namespace local devices to be moved. */
7150 err = -EINVAL;
7151 if (dev->features & NETIF_F_NETNS_LOCAL)
7152 goto out;
7153
 7154 /* Ensure the device has been registered */
ce286d32
EB
7155 if (dev->reg_state != NETREG_REGISTERED)
7156 goto out;
7157
 7158 /* Get out if there is nothing to do */
7159 err = 0;
878628fb 7160 if (net_eq(dev_net(dev), net))
ce286d32
EB
7161 goto out;
7162
7163 /* Pick the destination device name, and ensure
7164 * we can use it in the destination network namespace.
7165 */
7166 err = -EEXIST;
d9031024 7167 if (__dev_get_by_name(net, dev->name)) {
ce286d32
EB
7168 /* We get here if we can't use the current device name */
7169 if (!pat)
7170 goto out;
828de4f6 7171 if (dev_get_valid_name(net, dev, pat) < 0)
ce286d32
EB
7172 goto out;
7173 }
7174
7175 /*
 7176 * And now a mini version of register_netdevice and unregister_netdevice.
7177 */
7178
7179 /* If device is running close it first. */
9b772652 7180 dev_close(dev);
ce286d32
EB
7181
7182 /* And unlink it from device chain */
7183 err = -ENODEV;
7184 unlist_netdevice(dev);
7185
7186 synchronize_net();
7187
7188 /* Shutdown queueing discipline. */
7189 dev_shutdown(dev);
7190
 7191 /* Notify protocols that we are about to destroy
7192 this device. They should clean all the things.
3b27e105
DL
7193
7194 Note that dev->reg_state stays at NETREG_REGISTERED.
 7195 This is wanted so that 8021q and macvlan know
7196 the device is just moving and can keep their slaves up.
ce286d32
EB
7197 */
7198 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6549dd43
G
7199 rcu_barrier();
7200 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7f294054 7201 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
ce286d32
EB
7202
7203 /*
7204 * Flush the unicast and multicast chains
7205 */
a748ee24 7206 dev_uc_flush(dev);
22bedad3 7207 dev_mc_flush(dev);
ce286d32 7208
4e66ae2e
SH
7209 /* Send a netdev-removed uevent to the old namespace */
7210 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
4c75431a 7211 netdev_adjacent_del_links(dev);
4e66ae2e 7212
ce286d32 7213 /* Actually switch the network namespace */
c346dca1 7214 dev_net_set(dev, net);
ce286d32 7215
ce286d32 7216 /* If there is an ifindex conflict assign a new one */
7a66bbc9 7217 if (__dev_get_by_index(net, dev->ifindex))
ce286d32 7218 dev->ifindex = dev_new_index(net);
ce286d32 7219
4e66ae2e
SH
7220 /* Send a netdev-add uevent to the new namespace */
7221 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
4c75431a 7222 netdev_adjacent_add_links(dev);
4e66ae2e 7223
8b41d188 7224 /* Fixup kobjects */
a1b3f594 7225 err = device_rename(&dev->dev, dev->name);
8b41d188 7226 WARN_ON(err);
ce286d32
EB
7227
7228 /* Add the device back in the hashes */
7229 list_netdevice(dev);
7230
7231 /* Notify protocols, that a new device appeared. */
7232 call_netdevice_notifiers(NETDEV_REGISTER, dev);
7233
d90a909e
EB
7234 /*
7235 * Prevent userspace races by waiting until the network
7236 * device is fully setup before sending notifications.
7237 */
7f294054 7238 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
d90a909e 7239
ce286d32
EB
7240 synchronize_net();
7241 err = 0;
7242out:
7243 return err;
7244}
463d0183 7245EXPORT_SYMBOL_GPL(dev_change_net_namespace);
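/* [Editor's illustrative sketch, not part of dev.c] Moving a device into
 * another namespace under RTNL, with "foo%d" as the fallback name pattern if
 * the current name is already taken there. The caller is assumed to hold a
 * reference on target_net; foo_move_dev is hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <net/net_namespace.h>

static int foo_move_dev(struct net_device *dev, struct net *target_net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, target_net, "foo%d");
	rtnl_unlock();

	return err;
}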
ce286d32 7246
1da177e4
LT
7247static int dev_cpu_callback(struct notifier_block *nfb,
7248 unsigned long action,
7249 void *ocpu)
7250{
7251 struct sk_buff **list_skb;
1da177e4
LT
7252 struct sk_buff *skb;
7253 unsigned int cpu, oldcpu = (unsigned long)ocpu;
7254 struct softnet_data *sd, *oldsd;
7255
8bb78442 7256 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1da177e4
LT
7257 return NOTIFY_OK;
7258
7259 local_irq_disable();
7260 cpu = smp_processor_id();
7261 sd = &per_cpu(softnet_data, cpu);
7262 oldsd = &per_cpu(softnet_data, oldcpu);
7263
7264 /* Find end of our completion_queue. */
7265 list_skb = &sd->completion_queue;
7266 while (*list_skb)
7267 list_skb = &(*list_skb)->next;
7268 /* Append completion queue from offline CPU. */
7269 *list_skb = oldsd->completion_queue;
7270 oldsd->completion_queue = NULL;
7271
1da177e4 7272 /* Append output queue from offline CPU. */
a9cbd588
CG
7273 if (oldsd->output_queue) {
7274 *sd->output_queue_tailp = oldsd->output_queue;
7275 sd->output_queue_tailp = oldsd->output_queue_tailp;
7276 oldsd->output_queue = NULL;
7277 oldsd->output_queue_tailp = &oldsd->output_queue;
7278 }
ac64da0b
ED
 7279 /* Append NAPI poll list from offline CPU, with one exception:
 7280 * process_backlog() must be called by the CPU owning the per-CPU backlog.
7281 * We properly handle process_queue & input_pkt_queue later.
7282 */
7283 while (!list_empty(&oldsd->poll_list)) {
7284 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7285 struct napi_struct,
7286 poll_list);
7287
7288 list_del_init(&napi->poll_list);
7289 if (napi->poll == process_backlog)
7290 napi->state = 0;
7291 else
7292 ____napi_schedule(sd, napi);
264524d5 7293 }
1da177e4
LT
7294
7295 raise_softirq_irqoff(NET_TX_SOFTIRQ);
7296 local_irq_enable();
7297
7298 /* Process offline CPU's input_pkt_queue */
76cc8b13 7299 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
91e83133 7300 netif_rx_ni(skb);
76cc8b13 7301 input_queue_head_incr(oldsd);
fec5e652 7302 }
ac64da0b 7303 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
91e83133 7304 netif_rx_ni(skb);
76cc8b13
TH
7305 input_queue_head_incr(oldsd);
7306 }
1da177e4
LT
7307
7308 return NOTIFY_OK;
7309}
1da177e4
LT
7310
7311
7f353bf2 7312/**
b63365a2
HX
7313 * netdev_increment_features - increment feature set by one
7314 * @all: current feature set
7315 * @one: new feature set
7316 * @mask: mask feature set
7f353bf2
HX
7317 *
7318 * Computes a new feature set after adding a device with feature set
b63365a2
HX
7319 * @one to the master device with current feature set @all. Will not
7320 * enable anything that is off in @mask. Returns the new feature set.
7f353bf2 7321 */
c8f44aff
MM
7322netdev_features_t netdev_increment_features(netdev_features_t all,
7323 netdev_features_t one, netdev_features_t mask)
b63365a2 7324{
1742f183
MM
7325 if (mask & NETIF_F_GEN_CSUM)
7326 mask |= NETIF_F_ALL_CSUM;
7327 mask |= NETIF_F_VLAN_CHALLENGED;
7f353bf2 7328
1742f183
MM
7329 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7330 all &= one | ~NETIF_F_ALL_FOR_ALL;
c6e1a0d1 7331
1742f183
MM
7332 /* If one device supports hw checksumming, set for all. */
7333 if (all & NETIF_F_GEN_CSUM)
7334 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7f353bf2
HX
7335
7336 return all;
7337}
b63365a2 7338EXPORT_SYMBOL(netdev_increment_features);
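/* [Editor's illustrative sketch, not part of dev.c] How an aggregating driver
 * (bond/team/bridge style) might fold each slave's features into the master's
 * advertised set with netdev_increment_features(). The foo_* names and the
 * FOO_MASTER_FEATURES mask are hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/list.h>

#define FOO_MASTER_FEATURES	(NETIF_F_ALL_CSUM | NETIF_F_SG | NETIF_F_ALL_TSO)

struct foo_slave {
	struct list_head list;
	struct net_device *dev;
};

static netdev_features_t foo_compute_features(struct list_head *slaves)
{
	netdev_features_t features = FOO_MASTER_FEATURES;
	struct foo_slave *s;

	list_for_each_entry(s, slaves, list)
		features = netdev_increment_features(features,
						     s->dev->features,
						     FOO_MASTER_FEATURES);
	return features;
}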
7f353bf2 7339
430f03cd 7340static struct hlist_head * __net_init netdev_create_hash(void)
30d97d35
PE
7341{
7342 int i;
7343 struct hlist_head *hash;
7344
7345 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7346 if (hash != NULL)
7347 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7348 INIT_HLIST_HEAD(&hash[i]);
7349
7350 return hash;
7351}
7352
881d966b 7353/* Initialize per network namespace state */
4665079c 7354static int __net_init netdev_init(struct net *net)
881d966b 7355{
734b6541
RM
7356 if (net != &init_net)
7357 INIT_LIST_HEAD(&net->dev_base_head);
881d966b 7358
30d97d35
PE
7359 net->dev_name_head = netdev_create_hash();
7360 if (net->dev_name_head == NULL)
7361 goto err_name;
881d966b 7362
30d97d35
PE
7363 net->dev_index_head = netdev_create_hash();
7364 if (net->dev_index_head == NULL)
7365 goto err_idx;
881d966b
EB
7366
7367 return 0;
30d97d35
PE
7368
7369err_idx:
7370 kfree(net->dev_name_head);
7371err_name:
7372 return -ENOMEM;
881d966b
EB
7373}
7374
f0db275a
SH
7375/**
7376 * netdev_drivername - network driver for the device
7377 * @dev: network device
f0db275a
SH
7378 *
7379 * Determine network driver for device.
7380 */
3019de12 7381const char *netdev_drivername(const struct net_device *dev)
6579e57b 7382{
cf04a4c7
SH
7383 const struct device_driver *driver;
7384 const struct device *parent;
3019de12 7385 const char *empty = "";
6579e57b
AV
7386
7387 parent = dev->dev.parent;
6579e57b 7388 if (!parent)
3019de12 7389 return empty;
6579e57b
AV
7390
7391 driver = parent->driver;
7392 if (driver && driver->name)
3019de12
DM
7393 return driver->name;
7394 return empty;
6579e57b
AV
7395}
7396
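/* [Editor's illustrative sketch, not part of dev.c] netdev_drivername() is
 * mostly useful in diagnostics; a hypothetical watchdog might report which
 * driver owns a stalled queue.
 */
#include <linux/netdevice.h>

static void foo_report_tx_stall(struct net_device *dev)
{
	netdev_warn(dev, "TX stall detected (driver: %s)\n",
		    netdev_drivername(dev));
}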
6ea754eb
JP
7397static void __netdev_printk(const char *level, const struct net_device *dev,
7398 struct va_format *vaf)
256df2f3 7399{
b004ff49 7400 if (dev && dev->dev.parent) {
6ea754eb
JP
7401 dev_printk_emit(level[1] - '0',
7402 dev->dev.parent,
7403 "%s %s %s%s: %pV",
7404 dev_driver_string(dev->dev.parent),
7405 dev_name(dev->dev.parent),
7406 netdev_name(dev), netdev_reg_state(dev),
7407 vaf);
b004ff49 7408 } else if (dev) {
6ea754eb
JP
7409 printk("%s%s%s: %pV",
7410 level, netdev_name(dev), netdev_reg_state(dev), vaf);
b004ff49 7411 } else {
6ea754eb 7412 printk("%s(NULL net_device): %pV", level, vaf);
b004ff49 7413 }
256df2f3
JP
7414}
7415
6ea754eb
JP
7416void netdev_printk(const char *level, const struct net_device *dev,
7417 const char *format, ...)
256df2f3
JP
7418{
7419 struct va_format vaf;
7420 va_list args;
256df2f3
JP
7421
7422 va_start(args, format);
7423
7424 vaf.fmt = format;
7425 vaf.va = &args;
7426
6ea754eb 7427 __netdev_printk(level, dev, &vaf);
b004ff49 7428
256df2f3 7429 va_end(args);
256df2f3
JP
7430}
7431EXPORT_SYMBOL(netdev_printk);
7432
7433#define define_netdev_printk_level(func, level) \
6ea754eb 7434void func(const struct net_device *dev, const char *fmt, ...) \
256df2f3 7435{ \
256df2f3
JP
7436 struct va_format vaf; \
7437 va_list args; \
7438 \
7439 va_start(args, fmt); \
7440 \
7441 vaf.fmt = fmt; \
7442 vaf.va = &args; \
7443 \
6ea754eb 7444 __netdev_printk(level, dev, &vaf); \
b004ff49 7445 \
256df2f3 7446 va_end(args); \
256df2f3
JP
7447} \
7448EXPORT_SYMBOL(func);
7449
7450define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7451define_netdev_printk_level(netdev_alert, KERN_ALERT);
7452define_netdev_printk_level(netdev_crit, KERN_CRIT);
7453define_netdev_printk_level(netdev_err, KERN_ERR);
7454define_netdev_printk_level(netdev_warn, KERN_WARNING);
7455define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7456define_netdev_printk_level(netdev_info, KERN_INFO);
7457
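/* [Editor's illustrative sketch, not part of dev.c] Drivers normally call the
 * per-level helpers generated above rather than netdev_printk() directly;
 * when a parent device is set, the message gains the driver, device and
 * interface name prefix. "foo_link_change" is hypothetical.
 */
#include <linux/netdevice.h>

static void foo_link_change(struct net_device *dev, bool up, int speed)
{
	if (up)
		netdev_info(dev, "link up, %d Mb/s\n", speed);
	else
		netdev_notice(dev, "link down\n");
}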
4665079c 7458static void __net_exit netdev_exit(struct net *net)
881d966b
EB
7459{
7460 kfree(net->dev_name_head);
7461 kfree(net->dev_index_head);
7462}
7463
022cbae6 7464static struct pernet_operations __net_initdata netdev_net_ops = {
881d966b
EB
7465 .init = netdev_init,
7466 .exit = netdev_exit,
7467};
7468
4665079c 7469static void __net_exit default_device_exit(struct net *net)
ce286d32 7470{
e008b5fc 7471 struct net_device *dev, *aux;
ce286d32 7472 /*
e008b5fc 7473 * Push all migratable network devices back to the
ce286d32
EB
7474 * initial network namespace
7475 */
7476 rtnl_lock();
e008b5fc 7477 for_each_netdev_safe(net, dev, aux) {
ce286d32 7478 int err;
aca51397 7479 char fb_name[IFNAMSIZ];
ce286d32
EB
7480
 7481 /* Ignore unmovable devices (e.g. loopback) */
7482 if (dev->features & NETIF_F_NETNS_LOCAL)
7483 continue;
7484
e008b5fc
EB
7485 /* Leave virtual devices for the generic cleanup */
7486 if (dev->rtnl_link_ops)
7487 continue;
d0c082ce 7488
25985edc 7489 /* Push remaining network devices to init_net */
aca51397
PE
7490 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7491 err = dev_change_net_namespace(dev, &init_net, fb_name);
ce286d32 7492 if (err) {
7b6cd1ce
JP
7493 pr_emerg("%s: failed to move %s to init_net: %d\n",
7494 __func__, dev->name, err);
aca51397 7495 BUG();
ce286d32
EB
7496 }
7497 }
7498 rtnl_unlock();
7499}
7500
50624c93
EB
7501static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7502{
7503 /* Return with the rtnl_lock held when there are no network
7504 * devices unregistering in any network namespace in net_list.
7505 */
7506 struct net *net;
7507 bool unregistering;
ff960a73 7508 DEFINE_WAIT_FUNC(wait, woken_wake_function);
50624c93 7509
ff960a73 7510 add_wait_queue(&netdev_unregistering_wq, &wait);
50624c93 7511 for (;;) {
50624c93
EB
7512 unregistering = false;
7513 rtnl_lock();
7514 list_for_each_entry(net, net_list, exit_list) {
7515 if (net->dev_unreg_count > 0) {
7516 unregistering = true;
7517 break;
7518 }
7519 }
7520 if (!unregistering)
7521 break;
7522 __rtnl_unlock();
ff960a73
PZ
7523
7524 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
50624c93 7525 }
ff960a73 7526 remove_wait_queue(&netdev_unregistering_wq, &wait);
50624c93
EB
7527}
7528
04dc7f6b
EB
7529static void __net_exit default_device_exit_batch(struct list_head *net_list)
7530{
 7531 /* At exit all network devices must be removed from a network
b595076a 7532 * namespace. Do this in the reverse order of registration.
04dc7f6b
EB
7533 * Do this across as many network namespaces as possible to
7534 * improve batching efficiency.
7535 */
7536 struct net_device *dev;
7537 struct net *net;
7538 LIST_HEAD(dev_kill_list);
7539
50624c93
EB
7540 /* To prevent network device cleanup code from dereferencing
7541 * loopback devices or network devices that have been freed
7542 * wait here for all pending unregistrations to complete,
 7543 * before unregistering the loopback device and allowing the
 7544 * network namespace to be freed.
7545 *
 7546 * The netdev todo list containing all network device
7547 * unregistrations that happen in default_device_exit_batch
7548 * will run in the rtnl_unlock() at the end of
7549 * default_device_exit_batch.
7550 */
7551 rtnl_lock_unregistering(net_list);
04dc7f6b
EB
7552 list_for_each_entry(net, net_list, exit_list) {
7553 for_each_netdev_reverse(net, dev) {
b0ab2fab 7554 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
04dc7f6b
EB
7555 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7556 else
7557 unregister_netdevice_queue(dev, &dev_kill_list);
7558 }
7559 }
7560 unregister_netdevice_many(&dev_kill_list);
7561 rtnl_unlock();
7562}
7563
022cbae6 7564static struct pernet_operations __net_initdata default_device_ops = {
ce286d32 7565 .exit = default_device_exit,
04dc7f6b 7566 .exit_batch = default_device_exit_batch,
ce286d32
EB
7567};
7568
1da177e4
LT
7569/*
7570 * Initialize the DEV module. At boot time this walks the device list and
7571 * unhooks any devices that fail to initialise (normally hardware not
7572 * present) and leaves us with a valid list of present and active devices.
7573 *
7574 */
7575
7576/*
7577 * This is called single threaded during boot, so no need
7578 * to take the rtnl semaphore.
7579 */
7580static int __init net_dev_init(void)
7581{
7582 int i, rc = -ENOMEM;
7583
7584 BUG_ON(!dev_boot_phase);
7585
1da177e4
LT
7586 if (dev_proc_init())
7587 goto out;
7588
8b41d188 7589 if (netdev_kobject_init())
1da177e4
LT
7590 goto out;
7591
7592 INIT_LIST_HEAD(&ptype_all);
82d8a867 7593 for (i = 0; i < PTYPE_HASH_SIZE; i++)
1da177e4
LT
7594 INIT_LIST_HEAD(&ptype_base[i]);
7595
62532da9
VY
7596 INIT_LIST_HEAD(&offload_base);
7597
881d966b
EB
7598 if (register_pernet_subsys(&netdev_net_ops))
7599 goto out;
1da177e4
LT
7600
7601 /*
7602 * Initialise the packet receive queues.
7603 */
7604
6f912042 7605 for_each_possible_cpu(i) {
e36fa2f7 7606 struct softnet_data *sd = &per_cpu(softnet_data, i);
1da177e4 7607
e36fa2f7 7608 skb_queue_head_init(&sd->input_pkt_queue);
6e7676c1 7609 skb_queue_head_init(&sd->process_queue);
e36fa2f7 7610 INIT_LIST_HEAD(&sd->poll_list);
a9cbd588 7611 sd->output_queue_tailp = &sd->output_queue;
df334545 7612#ifdef CONFIG_RPS
e36fa2f7
ED
7613 sd->csd.func = rps_trigger_softirq;
7614 sd->csd.info = sd;
e36fa2f7 7615 sd->cpu = i;
1e94d72f 7616#endif
0a9627f2 7617
e36fa2f7
ED
7618 sd->backlog.poll = process_backlog;
7619 sd->backlog.weight = weight_p;
1da177e4
LT
7620 }
7621
1da177e4
LT
7622 dev_boot_phase = 0;
7623
505d4f73
EB
 7624 /* The loopback device is special: if any other network device
 7625 * is present in a network namespace, the loopback device must
 7626 * be present too. Since we now dynamically allocate and free the
 7627 * loopback device, ensure this invariant is maintained by
 7628 * keeping the loopback device as the first device on the
 7629 * list of network devices, so that it is the first device
 7630 * that appears and the last network device
 7631 * that disappears.
7632 */
7633 if (register_pernet_device(&loopback_net_ops))
7634 goto out;
7635
7636 if (register_pernet_device(&default_device_ops))
7637 goto out;
7638
962cf36c
CM
7639 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7640 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
1da177e4
LT
7641
7642 hotcpu_notifier(dev_cpu_callback, 0);
7643 dst_init();
1da177e4
LT
7644 rc = 0;
7645out:
7646 return rc;
7647}
7648
7649subsys_initcall(net_dev_init);