net: convert lls to use time_in_range()
[deliverable/linux.git] / net / core / dev.c
1/*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
02c30a84 10 * Authors: Ross Biro
1da177e4
LT
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75#include <asm/uaccess.h>
1da177e4 76#include <linux/bitops.h>
4fc268d2 77#include <linux/capability.h>
1da177e4
LT
78#include <linux/cpu.h>
79#include <linux/types.h>
80#include <linux/kernel.h>
08e9897d 81#include <linux/hash.h>
5a0e3ad6 82#include <linux/slab.h>
1da177e4 83#include <linux/sched.h>
4a3e2f71 84#include <linux/mutex.h>
1da177e4
LT
85#include <linux/string.h>
86#include <linux/mm.h>
87#include <linux/socket.h>
88#include <linux/sockios.h>
89#include <linux/errno.h>
90#include <linux/interrupt.h>
91#include <linux/if_ether.h>
92#include <linux/netdevice.h>
93#include <linux/etherdevice.h>
0187bdfb 94#include <linux/ethtool.h>
1da177e4
LT
95#include <linux/notifier.h>
96#include <linux/skbuff.h>
457c4cbc 97#include <net/net_namespace.h>
1da177e4
LT
98#include <net/sock.h>
99#include <linux/rtnetlink.h>
1da177e4 100#include <linux/stat.h>
1da177e4
LT
101#include <net/dst.h>
102#include <net/pkt_sched.h>
103#include <net/checksum.h>
44540960 104#include <net/xfrm.h>
1da177e4
LT
105#include <linux/highmem.h>
106#include <linux/init.h>
1da177e4 107#include <linux/module.h>
1da177e4
LT
108#include <linux/netpoll.h>
109#include <linux/rcupdate.h>
110#include <linux/delay.h>
1da177e4 111#include <net/iw_handler.h>
1da177e4 112#include <asm/current.h>
5bdb9886 113#include <linux/audit.h>
db217334 114#include <linux/dmaengine.h>
f6a78bfc 115#include <linux/err.h>
c7fa9d18 116#include <linux/ctype.h>
723e98b7 117#include <linux/if_arp.h>
6de329e2 118#include <linux/if_vlan.h>
8f0f2223 119#include <linux/ip.h>
ad55dcaf 120#include <net/ip.h>
8f0f2223
DM
121#include <linux/ipv6.h>
122#include <linux/in.h>
b6b2fed1
DM
123#include <linux/jhash.h>
124#include <linux/random.h>
9cbc1cb8 125#include <trace/events/napi.h>
cf66ba58 126#include <trace/events/net.h>
07dc22e7 127#include <trace/events/skb.h>
5acbbd42 128#include <linux/pci.h>
caeda9b9 129#include <linux/inetdevice.h>
c445477d 130#include <linux/cpu_rmap.h>
c5905afb 131#include <linux/static_key.h>
af12fa6e 132#include <linux/hashtable.h>
60877a32 133#include <linux/vmalloc.h>
1da177e4 134
342709ef
PE
135#include "net-sysfs.h"
136
d565b0a1
HX
137/* Instead of increasing this, you should create a hash table. */
138#define MAX_GRO_SKBS 8
139
5d38a079
HX
140/* This should be increased if a protocol with a bigger head is added. */
141#define GRO_MAX_HEAD (MAX_HEADER + 128)
142
1da177e4 143static DEFINE_SPINLOCK(ptype_lock);
62532da9 144static DEFINE_SPINLOCK(offload_lock);
900ff8c6
CW
145struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
146struct list_head ptype_all __read_mostly; /* Taps */
62532da9 147static struct list_head offload_base __read_mostly;
1da177e4 148
1da177e4 149/*
7562f876 150 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
1da177e4
LT
151 * semaphore.
152 *
c6d14c84 153 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
1da177e4
LT
154 *
155 * Writers must hold the rtnl semaphore while they loop through the
7562f876 156 * dev_base_head list, and hold dev_base_lock for writing when they do the
1da177e4
LT
157 * actual updates. This allows pure readers to access the list even
158 * while a writer is preparing to update it.
159 *
160 * To put it another way, dev_base_lock is held for writing only to
161 * protect against pure readers; the rtnl semaphore provides the
162 * protection against other writers.
163 *
164 * See, for example usages, register_netdevice() and
165 * unregister_netdevice(), which must be called with the rtnl
166 * semaphore held.
167 */
1da177e4 168DEFINE_RWLOCK(dev_base_lock);
1da177e4
LT
169EXPORT_SYMBOL(dev_base_lock);
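As a sketch of the locking rules documented above (a hypothetical pure reader, not part of this file): a reader only needs dev_base_lock for reading, or rcu_read_lock() with the _rcu list walkers, while writers hold RTNL and take dev_base_lock for writing.

static void example_count_up_devices(struct net *net)
{
	struct net_device *dev;
	int up = 0;

	read_lock(&dev_base_lock);	/* pure reader: no RTNL needed */
	for_each_netdev(net, dev)
		if (dev->flags & IFF_UP)
			up++;
	read_unlock(&dev_base_lock);

	pr_info("%d device(s) up\n", up);
}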
170
af12fa6e
ET
171/* protects napi_hash addition/deletion and napi_gen_id */
172static DEFINE_SPINLOCK(napi_hash_lock);
173
174static unsigned int napi_gen_id;
175static DEFINE_HASHTABLE(napi_hash, 8);
176
30e6c9fa 177seqcount_t devnet_rename_seq;
c91f6df2 178
4e985ada
TG
179static inline void dev_base_seq_inc(struct net *net)
180{
181 while (++net->dev_base_seq == 0);
182}
183
881d966b 184static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
1da177e4 185{
95c96174
ED
186 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
187
08e9897d 188 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
1da177e4
LT
189}
190
881d966b 191static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
1da177e4 192{
7c28bd0b 193 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
1da177e4
LT
194}
195
e36fa2f7 196static inline void rps_lock(struct softnet_data *sd)
152102c7
CG
197{
198#ifdef CONFIG_RPS
e36fa2f7 199 spin_lock(&sd->input_pkt_queue.lock);
152102c7
CG
200#endif
201}
202
e36fa2f7 203static inline void rps_unlock(struct softnet_data *sd)
152102c7
CG
204{
205#ifdef CONFIG_RPS
e36fa2f7 206 spin_unlock(&sd->input_pkt_queue.lock);
152102c7
CG
207#endif
208}
209
ce286d32 210/* Device list insertion */
53759be9 211static void list_netdevice(struct net_device *dev)
ce286d32 212{
c346dca1 213 struct net *net = dev_net(dev);
ce286d32
EB
214
215 ASSERT_RTNL();
216
217 write_lock_bh(&dev_base_lock);
c6d14c84 218 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
72c9528b 219 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
fb699dfd
ED
220 hlist_add_head_rcu(&dev->index_hlist,
221 dev_index_hash(net, dev->ifindex));
ce286d32 222 write_unlock_bh(&dev_base_lock);
4e985ada
TG
223
224 dev_base_seq_inc(net);
ce286d32
EB
225}
226
fb699dfd
ED
227/* Device list removal
228 * caller must respect a RCU grace period before freeing/reusing dev
229 */
ce286d32
EB
230static void unlist_netdevice(struct net_device *dev)
231{
232 ASSERT_RTNL();
233
234 /* Unlink dev from the device chain */
235 write_lock_bh(&dev_base_lock);
c6d14c84 236 list_del_rcu(&dev->dev_list);
72c9528b 237 hlist_del_rcu(&dev->name_hlist);
fb699dfd 238 hlist_del_rcu(&dev->index_hlist);
ce286d32 239 write_unlock_bh(&dev_base_lock);
4e985ada
TG
240
241 dev_base_seq_inc(dev_net(dev));
ce286d32
EB
242}
243
1da177e4
LT
244/*
245 * Our notifier list
246 */
247
f07d5b94 248static RAW_NOTIFIER_HEAD(netdev_chain);
1da177e4
LT
249
250/*
251 * Device drivers call our routines to queue packets here. We empty the
252 * queue in the local softnet handler.
253 */
bea3348e 254
9958da05 255DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
d1b19dff 256EXPORT_PER_CPU_SYMBOL(softnet_data);
1da177e4 257
cf508b12 258#ifdef CONFIG_LOCKDEP
723e98b7 259/*
c773e847 260 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
723e98b7
JP
261 * according to dev->type
262 */
263static const unsigned short netdev_lock_type[] =
264 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
265 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
266 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
267 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
268 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
269 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
270 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
271 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
272 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
273 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
274 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
275 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
211ed865
PG
276 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
277 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
278 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
723e98b7 279
36cbd3dc 280static const char *const netdev_lock_name[] =
723e98b7
JP
281 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
282 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
283 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
284 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
285 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
286 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
287 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
288 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
289 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
290 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
291 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
292 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
211ed865
PG
293 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
294 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
295 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
723e98b7
JP
296
297static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
cf508b12 298static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
723e98b7
JP
299
300static inline unsigned short netdev_lock_pos(unsigned short dev_type)
301{
302 int i;
303
304 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
305 if (netdev_lock_type[i] == dev_type)
306 return i;
307 /* the last key is used by default */
308 return ARRAY_SIZE(netdev_lock_type) - 1;
309}
310
cf508b12
DM
311static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
312 unsigned short dev_type)
723e98b7
JP
313{
314 int i;
315
316 i = netdev_lock_pos(dev_type);
317 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
318 netdev_lock_name[i]);
319}
cf508b12
DM
320
321static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
322{
323 int i;
324
325 i = netdev_lock_pos(dev->type);
326 lockdep_set_class_and_name(&dev->addr_list_lock,
327 &netdev_addr_lock_key[i],
328 netdev_lock_name[i]);
329}
723e98b7 330#else
cf508b12
DM
331static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
332 unsigned short dev_type)
333{
334}
335static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
723e98b7
JP
336{
337}
338#endif
1da177e4
LT
339
340/*******************************************************************************
341
342 Protocol management and registration routines
343
344*******************************************************************************/
345
1da177e4
LT
346/*
347 * Add a protocol ID to the list. Now that the input handler is
348 * smarter we can dispense with all the messy stuff that used to be
349 * here.
350 *
351 * BEWARE!!! Protocol handlers, mangling input packets,
352 * MUST BE last in hash buckets and checking protocol handlers
353 * MUST start from promiscuous ptype_all chain in net_bh.
354 * It is true now, do not change it.
355 * Explanation follows: if protocol handler, mangling packet, will
356 * be the first on list, it is not able to sense, that packet
357 * is cloned and should be copied-on-write, so that it will
358 * change it and subsequent readers will get broken packet.
359 * --ANK (980803)
360 */
361
c07b68e8
ED
362static inline struct list_head *ptype_head(const struct packet_type *pt)
363{
364 if (pt->type == htons(ETH_P_ALL))
365 return &ptype_all;
366 else
367 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
368}
369
1da177e4
LT
370/**
371 * dev_add_pack - add packet handler
372 * @pt: packet type declaration
373 *
374 * Add a protocol handler to the networking stack. The passed &packet_type
375 * is linked into kernel lists and may not be freed until it has been
376 * removed from the kernel lists.
377 *
 378 * This call does not sleep, therefore it cannot guarantee that
 379 * all CPUs currently in the middle of receiving packets
 380 * will see the new packet type (until the next received packet).
381 */
382
383void dev_add_pack(struct packet_type *pt)
384{
c07b68e8 385 struct list_head *head = ptype_head(pt);
1da177e4 386
c07b68e8
ED
387 spin_lock(&ptype_lock);
388 list_add_rcu(&pt->list, head);
389 spin_unlock(&ptype_lock);
1da177e4 390}
d1b19dff 391EXPORT_SYMBOL(dev_add_pack);
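A minimal registration sketch (the handler and the packet_type instance below are hypothetical, not from this file). A tap registered for ETH_P_ALL lands on the ptype_all list and is handed a clone of each packet, which the handler must free.

static int example_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	/* inspect the clone here, then drop our reference */
	kfree_skb(skb);
	return 0;
}

static struct packet_type example_ptype __read_mostly = {
	.type = htons(ETH_P_ALL),	/* all protocols: goes on ptype_all */
	.func = example_rcv,
};

/* dev_add_pack(&example_ptype);  ...  dev_remove_pack(&example_ptype); */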
1da177e4 392
1da177e4
LT
393/**
394 * __dev_remove_pack - remove packet handler
395 * @pt: packet type declaration
396 *
397 * Remove a protocol handler that was previously added to the kernel
398 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
399 * from the kernel lists and can be freed or reused once this function
4ec93edb 400 * returns.
1da177e4
LT
401 *
402 * The packet type might still be in use by receivers
 403 * and must not be freed until after all the CPUs have gone
404 * through a quiescent state.
405 */
406void __dev_remove_pack(struct packet_type *pt)
407{
c07b68e8 408 struct list_head *head = ptype_head(pt);
1da177e4
LT
409 struct packet_type *pt1;
410
c07b68e8 411 spin_lock(&ptype_lock);
1da177e4
LT
412
413 list_for_each_entry(pt1, head, list) {
414 if (pt == pt1) {
415 list_del_rcu(&pt->list);
416 goto out;
417 }
418 }
419
7b6cd1ce 420 pr_warn("dev_remove_pack: %p not found\n", pt);
1da177e4 421out:
c07b68e8 422 spin_unlock(&ptype_lock);
1da177e4 423}
d1b19dff
ED
424EXPORT_SYMBOL(__dev_remove_pack);
425
1da177e4
LT
426/**
427 * dev_remove_pack - remove packet handler
428 * @pt: packet type declaration
429 *
430 * Remove a protocol handler that was previously added to the kernel
431 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
432 * from the kernel lists and can be freed or reused once this function
433 * returns.
434 *
435 * This call sleeps to guarantee that no CPU is looking at the packet
436 * type after return.
437 */
438void dev_remove_pack(struct packet_type *pt)
439{
440 __dev_remove_pack(pt);
4ec93edb 441
1da177e4
LT
442 synchronize_net();
443}
d1b19dff 444EXPORT_SYMBOL(dev_remove_pack);
1da177e4 445
62532da9
VY
446
447/**
448 * dev_add_offload - register offload handlers
449 * @po: protocol offload declaration
450 *
451 * Add protocol offload handlers to the networking stack. The passed
452 * &proto_offload is linked into kernel lists and may not be freed until
453 * it has been removed from the kernel lists.
454 *
 455 * This call does not sleep, therefore it cannot guarantee that
 456 * all CPUs currently in the middle of receiving packets
 457 * will see the new offload handlers (until the next received packet).
458 */
459void dev_add_offload(struct packet_offload *po)
460{
461 struct list_head *head = &offload_base;
462
463 spin_lock(&offload_lock);
464 list_add_rcu(&po->list, head);
465 spin_unlock(&offload_lock);
466}
467EXPORT_SYMBOL(dev_add_offload);
468
469/**
470 * __dev_remove_offload - remove offload handler
471 * @po: packet offload declaration
472 *
473 * Remove a protocol offload handler that was previously added to the
474 * kernel offload handlers by dev_add_offload(). The passed &offload_type
475 * is removed from the kernel lists and can be freed or reused once this
476 * function returns.
477 *
478 * The packet type might still be in use by receivers
 479 * and must not be freed until after all the CPUs have gone
480 * through a quiescent state.
481 */
482void __dev_remove_offload(struct packet_offload *po)
483{
484 struct list_head *head = &offload_base;
485 struct packet_offload *po1;
486
c53aa505 487 spin_lock(&offload_lock);
62532da9
VY
488
489 list_for_each_entry(po1, head, list) {
490 if (po == po1) {
491 list_del_rcu(&po->list);
492 goto out;
493 }
494 }
495
496 pr_warn("dev_remove_offload: %p not found\n", po);
497out:
c53aa505 498 spin_unlock(&offload_lock);
62532da9
VY
499}
500EXPORT_SYMBOL(__dev_remove_offload);
501
502/**
503 * dev_remove_offload - remove packet offload handler
504 * @po: packet offload declaration
505 *
506 * Remove a packet offload handler that was previously added to the kernel
507 * offload handlers by dev_add_offload(). The passed &offload_type is
508 * removed from the kernel lists and can be freed or reused once this
509 * function returns.
510 *
511 * This call sleeps to guarantee that no CPU is looking at the packet
512 * type after return.
513 */
514void dev_remove_offload(struct packet_offload *po)
515{
516 __dev_remove_offload(po);
517
518 synchronize_net();
519}
520EXPORT_SYMBOL(dev_remove_offload);
521
1da177e4
LT
522/******************************************************************************
523
524 Device Boot-time Settings Routines
525
526*******************************************************************************/
527
528/* Boot time configuration table */
529static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
530
531/**
532 * netdev_boot_setup_add - add new setup entry
533 * @name: name of the device
534 * @map: configured settings for the device
535 *
 536 * Adds a new setup entry to the dev_boot_setup list. The function
 537 * returns 0 on error and 1 on success. This is a generic routine for
 538 * all netdevices.
539 */
540static int netdev_boot_setup_add(char *name, struct ifmap *map)
541{
542 struct netdev_boot_setup *s;
543 int i;
544
545 s = dev_boot_setup;
546 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
547 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
548 memset(s[i].name, 0, sizeof(s[i].name));
93b3cff9 549 strlcpy(s[i].name, name, IFNAMSIZ);
1da177e4
LT
550 memcpy(&s[i].map, map, sizeof(s[i].map));
551 break;
552 }
553 }
554
555 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
556}
557
558/**
559 * netdev_boot_setup_check - check boot time settings
560 * @dev: the netdevice
561 *
562 * Check boot time settings for the device.
563 * The found settings are set for the device to be used
564 * later in the device probing.
 565 * Returns 0 if no settings are found, 1 if they are.
566 */
567int netdev_boot_setup_check(struct net_device *dev)
568{
569 struct netdev_boot_setup *s = dev_boot_setup;
570 int i;
571
572 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
573 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
93b3cff9 574 !strcmp(dev->name, s[i].name)) {
1da177e4
LT
575 dev->irq = s[i].map.irq;
576 dev->base_addr = s[i].map.base_addr;
577 dev->mem_start = s[i].map.mem_start;
578 dev->mem_end = s[i].map.mem_end;
579 return 1;
580 }
581 }
582 return 0;
583}
d1b19dff 584EXPORT_SYMBOL(netdev_boot_setup_check);
1da177e4
LT
585
586
587/**
588 * netdev_boot_base - get address from boot time settings
589 * @prefix: prefix for network device
590 * @unit: id for network device
591 *
592 * Check boot time settings for the base address of device.
593 * The found settings are set for the device to be used
594 * later in the device probing.
595 * Returns 0 if no settings found.
596 */
597unsigned long netdev_boot_base(const char *prefix, int unit)
598{
599 const struct netdev_boot_setup *s = dev_boot_setup;
600 char name[IFNAMSIZ];
601 int i;
602
603 sprintf(name, "%s%d", prefix, unit);
604
605 /*
 606 * If the device is already registered then return a base of 1
607 * to indicate not to probe for this interface
608 */
881d966b 609 if (__dev_get_by_name(&init_net, name))
1da177e4
LT
610 return 1;
611
612 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
613 if (!strcmp(name, s[i].name))
614 return s[i].map.base_addr;
615 return 0;
616}
617
618/*
619 * Saves at boot time configured settings for any netdevice.
620 */
621int __init netdev_boot_setup(char *str)
622{
623 int ints[5];
624 struct ifmap map;
625
626 str = get_options(str, ARRAY_SIZE(ints), ints);
627 if (!str || !*str)
628 return 0;
629
630 /* Save settings */
631 memset(&map, 0, sizeof(map));
632 if (ints[0] > 0)
633 map.irq = ints[1];
634 if (ints[0] > 1)
635 map.base_addr = ints[2];
636 if (ints[0] > 2)
637 map.mem_start = ints[3];
638 if (ints[0] > 3)
639 map.mem_end = ints[4];
640
641 /* Add new entry to the list */
642 return netdev_boot_setup_add(str, &map);
643}
644
645__setup("netdev=", netdev_boot_setup);
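Going by the parser above, the boot parameter takes up to four integers (irq, base_addr, mem_start, mem_end) followed by the interface name; any integers omitted stay zero in the saved ifmap. For example (illustrative values only):

	netdev=9,0x300,0,0,eth0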
646
647/*******************************************************************************
648
649 Device Interface Subroutines
650
651*******************************************************************************/
652
653/**
654 * __dev_get_by_name - find a device by its name
c4ea43c5 655 * @net: the applicable net namespace
1da177e4
LT
656 * @name: name to find
657 *
658 * Find an interface by name. Must be called under RTNL semaphore
659 * or @dev_base_lock. If the name is found a pointer to the device
660 * is returned. If the name is not found then %NULL is returned. The
661 * reference counters are not incremented so the caller must be
662 * careful with locks.
663 */
664
881d966b 665struct net_device *__dev_get_by_name(struct net *net, const char *name)
1da177e4 666{
0bd8d536
ED
667 struct net_device *dev;
668 struct hlist_head *head = dev_name_hash(net, name);
1da177e4 669
b67bfe0d 670 hlist_for_each_entry(dev, head, name_hlist)
1da177e4
LT
671 if (!strncmp(dev->name, name, IFNAMSIZ))
672 return dev;
0bd8d536 673
1da177e4
LT
674 return NULL;
675}
d1b19dff 676EXPORT_SYMBOL(__dev_get_by_name);
1da177e4 677
72c9528b
ED
678/**
679 * dev_get_by_name_rcu - find a device by its name
680 * @net: the applicable net namespace
681 * @name: name to find
682 *
683 * Find an interface by name.
684 * If the name is found a pointer to the device is returned.
685 * If the name is not found then %NULL is returned.
686 * The reference counters are not incremented so the caller must be
687 * careful with locks. The caller must hold RCU lock.
688 */
689
690struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
691{
72c9528b
ED
692 struct net_device *dev;
693 struct hlist_head *head = dev_name_hash(net, name);
694
b67bfe0d 695 hlist_for_each_entry_rcu(dev, head, name_hlist)
72c9528b
ED
696 if (!strncmp(dev->name, name, IFNAMSIZ))
697 return dev;
698
699 return NULL;
700}
701EXPORT_SYMBOL(dev_get_by_name_rcu);
702
1da177e4
LT
703/**
704 * dev_get_by_name - find a device by its name
c4ea43c5 705 * @net: the applicable net namespace
1da177e4
LT
706 * @name: name to find
707 *
708 * Find an interface by name. This can be called from any
709 * context and does its own locking. The returned handle has
710 * the usage count incremented and the caller must use dev_put() to
711 * release it when it is no longer needed. %NULL is returned if no
712 * matching device is found.
713 */
714
881d966b 715struct net_device *dev_get_by_name(struct net *net, const char *name)
1da177e4
LT
716{
717 struct net_device *dev;
718
72c9528b
ED
719 rcu_read_lock();
720 dev = dev_get_by_name_rcu(net, name);
1da177e4
LT
721 if (dev)
722 dev_hold(dev);
72c9528b 723 rcu_read_unlock();
1da177e4
LT
724 return dev;
725}
d1b19dff 726EXPORT_SYMBOL(dev_get_by_name);
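A usage sketch (hypothetical caller): the reference taken by dev_get_by_name() must be dropped with dev_put() once the caller is done with the device.

static int example_get_ifindex(struct net *net, const char *name)
{
	struct net_device *dev;
	int ifindex;

	dev = dev_get_by_name(net, name);
	if (!dev)
		return -ENODEV;

	ifindex = dev->ifindex;
	dev_put(dev);		/* release the reference taken above */
	return ifindex;
}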
1da177e4
LT
727
728/**
729 * __dev_get_by_index - find a device by its ifindex
c4ea43c5 730 * @net: the applicable net namespace
1da177e4
LT
731 * @ifindex: index of device
732 *
733 * Search for an interface by index. Returns %NULL if the device
734 * is not found or a pointer to the device. The device has not
735 * had its reference counter increased so the caller must be careful
736 * about locking. The caller must hold either the RTNL semaphore
737 * or @dev_base_lock.
738 */
739
881d966b 740struct net_device *__dev_get_by_index(struct net *net, int ifindex)
1da177e4 741{
0bd8d536
ED
742 struct net_device *dev;
743 struct hlist_head *head = dev_index_hash(net, ifindex);
1da177e4 744
b67bfe0d 745 hlist_for_each_entry(dev, head, index_hlist)
1da177e4
LT
746 if (dev->ifindex == ifindex)
747 return dev;
0bd8d536 748
1da177e4
LT
749 return NULL;
750}
d1b19dff 751EXPORT_SYMBOL(__dev_get_by_index);
1da177e4 752
fb699dfd
ED
753/**
754 * dev_get_by_index_rcu - find a device by its ifindex
755 * @net: the applicable net namespace
756 * @ifindex: index of device
757 *
758 * Search for an interface by index. Returns %NULL if the device
759 * is not found or a pointer to the device. The device has not
760 * had its reference counter increased so the caller must be careful
761 * about locking. The caller must hold RCU lock.
762 */
763
764struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
765{
fb699dfd
ED
766 struct net_device *dev;
767 struct hlist_head *head = dev_index_hash(net, ifindex);
768
b67bfe0d 769 hlist_for_each_entry_rcu(dev, head, index_hlist)
fb699dfd
ED
770 if (dev->ifindex == ifindex)
771 return dev;
772
773 return NULL;
774}
775EXPORT_SYMBOL(dev_get_by_index_rcu);
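A usage sketch (hypothetical caller): no reference is taken, so the returned pointer may only be used inside the RCU read-side critical section.

static bool example_ifindex_is_up(struct net *net, int ifindex)
{
	struct net_device *dev;
	bool up = false;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		up = !!(dev->flags & IFF_UP);
	rcu_read_unlock();

	return up;
}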
776
1da177e4
LT
777
778/**
779 * dev_get_by_index - find a device by its ifindex
c4ea43c5 780 * @net: the applicable net namespace
1da177e4
LT
781 * @ifindex: index of device
782 *
783 * Search for an interface by index. Returns NULL if the device
784 * is not found or a pointer to the device. The device returned has
785 * had a reference added and the pointer is safe until the user calls
786 * dev_put to indicate they have finished with it.
787 */
788
881d966b 789struct net_device *dev_get_by_index(struct net *net, int ifindex)
1da177e4
LT
790{
791 struct net_device *dev;
792
fb699dfd
ED
793 rcu_read_lock();
794 dev = dev_get_by_index_rcu(net, ifindex);
1da177e4
LT
795 if (dev)
796 dev_hold(dev);
fb699dfd 797 rcu_read_unlock();
1da177e4
LT
798 return dev;
799}
d1b19dff 800EXPORT_SYMBOL(dev_get_by_index);
1da177e4
LT
801
802/**
941666c2 803 * dev_getbyhwaddr_rcu - find a device by its hardware address
c4ea43c5 804 * @net: the applicable net namespace
1da177e4
LT
805 * @type: media type of device
806 * @ha: hardware address
807 *
808 * Search for an interface by MAC address. Returns NULL if the device
c506653d
ED
809 * is not found or a pointer to the device.
810 * The caller must hold RCU or RTNL.
941666c2 811 * The returned device has not had its ref count increased
1da177e4
LT
812 * and the caller must therefore be careful about locking
813 *
1da177e4
LT
814 */
815
941666c2
ED
816struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
817 const char *ha)
1da177e4
LT
818{
819 struct net_device *dev;
820
941666c2 821 for_each_netdev_rcu(net, dev)
1da177e4
LT
822 if (dev->type == type &&
823 !memcmp(dev->dev_addr, ha, dev->addr_len))
7562f876
PE
824 return dev;
825
826 return NULL;
1da177e4 827}
941666c2 828EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
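A usage sketch (hypothetical caller): dev_getbyhwaddr_rcu() must run under rcu_read_lock() or RTNL, and the result is only valid inside that section unless a reference is taken explicitly.

static bool example_mac_in_use(struct net *net, const char *addr)
{
	struct net_device *dev;
	bool found;

	rcu_read_lock();
	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, addr);
	found = dev != NULL;
	rcu_read_unlock();

	return found;
}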
cf309e3f 829
881d966b 830struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
1da177e4
LT
831{
832 struct net_device *dev;
833
4e9cac2b 834 ASSERT_RTNL();
881d966b 835 for_each_netdev(net, dev)
4e9cac2b 836 if (dev->type == type)
7562f876
PE
837 return dev;
838
839 return NULL;
4e9cac2b 840}
4e9cac2b
PM
841EXPORT_SYMBOL(__dev_getfirstbyhwtype);
842
881d966b 843struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
4e9cac2b 844{
99fe3c39 845 struct net_device *dev, *ret = NULL;
4e9cac2b 846
99fe3c39
ED
847 rcu_read_lock();
848 for_each_netdev_rcu(net, dev)
849 if (dev->type == type) {
850 dev_hold(dev);
851 ret = dev;
852 break;
853 }
854 rcu_read_unlock();
855 return ret;
1da177e4 856}
1da177e4
LT
857EXPORT_SYMBOL(dev_getfirstbyhwtype);
858
859/**
bb69ae04 860 * dev_get_by_flags_rcu - find any device with given flags
c4ea43c5 861 * @net: the applicable net namespace
1da177e4
LT
862 * @if_flags: IFF_* values
863 * @mask: bitmask of bits in if_flags to check
864 *
865 * Search for any interface with the given flags. Returns NULL if a device
bb69ae04
ED
866 * is not found or a pointer to the device. Must be called inside
867 * rcu_read_lock(), and result refcount is unchanged.
1da177e4
LT
868 */
869
bb69ae04 870struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
d1b19dff 871 unsigned short mask)
1da177e4 872{
7562f876 873 struct net_device *dev, *ret;
1da177e4 874
7562f876 875 ret = NULL;
c6d14c84 876 for_each_netdev_rcu(net, dev) {
1da177e4 877 if (((dev->flags ^ if_flags) & mask) == 0) {
7562f876 878 ret = dev;
1da177e4
LT
879 break;
880 }
881 }
7562f876 882 return ret;
1da177e4 883}
bb69ae04 884EXPORT_SYMBOL(dev_get_by_flags_rcu);
1da177e4
LT
885
886/**
887 * dev_valid_name - check if name is okay for network device
888 * @name: name string
889 *
 890 * Network device names need to be valid file names
 891 * to allow sysfs to work. We also disallow any kind of
 892 * whitespace.
1da177e4 893 */
95f050bf 894bool dev_valid_name(const char *name)
1da177e4 895{
c7fa9d18 896 if (*name == '\0')
95f050bf 897 return false;
b6fe17d6 898 if (strlen(name) >= IFNAMSIZ)
95f050bf 899 return false;
c7fa9d18 900 if (!strcmp(name, ".") || !strcmp(name, ".."))
95f050bf 901 return false;
c7fa9d18
DM
902
903 while (*name) {
904 if (*name == '/' || isspace(*name))
95f050bf 905 return false;
c7fa9d18
DM
906 name++;
907 }
95f050bf 908 return true;
1da177e4 909}
d1b19dff 910EXPORT_SYMBOL(dev_valid_name);
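For example, "eth0" passes these checks, while an empty string, ".", "..", any name of IFNAMSIZ or more characters, or a name containing '/' or whitespace is rejected.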
1da177e4
LT
911
912/**
b267b179
EB
913 * __dev_alloc_name - allocate a name for a device
914 * @net: network namespace to allocate the device name in
1da177e4 915 * @name: name format string
b267b179 916 * @buf: scratch buffer and result name string
1da177e4
LT
917 *
 918 * Passed a format string - eg "lt%d" - it will try to find a suitable
 919 * id. It scans the list of devices to build up a free map, then chooses
920 * the first empty slot. The caller must hold the dev_base or rtnl lock
921 * while allocating the name and adding the device in order to avoid
922 * duplicates.
923 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
924 * Returns the number of the unit assigned or a negative errno code.
1da177e4
LT
925 */
926
b267b179 927static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1da177e4
LT
928{
929 int i = 0;
1da177e4
LT
930 const char *p;
931 const int max_netdevices = 8*PAGE_SIZE;
cfcabdcc 932 unsigned long *inuse;
1da177e4
LT
933 struct net_device *d;
934
935 p = strnchr(name, IFNAMSIZ-1, '%');
936 if (p) {
937 /*
938 * Verify the string as this thing may have come from
 939 * the user. There must be exactly one "%d" and no other "%"
940 * characters.
941 */
942 if (p[1] != 'd' || strchr(p + 2, '%'))
943 return -EINVAL;
944
945 /* Use one page as a bit array of possible slots */
cfcabdcc 946 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1da177e4
LT
947 if (!inuse)
948 return -ENOMEM;
949
881d966b 950 for_each_netdev(net, d) {
1da177e4
LT
951 if (!sscanf(d->name, name, &i))
952 continue;
953 if (i < 0 || i >= max_netdevices)
954 continue;
955
956 /* avoid cases where sscanf is not exact inverse of printf */
b267b179 957 snprintf(buf, IFNAMSIZ, name, i);
1da177e4
LT
958 if (!strncmp(buf, d->name, IFNAMSIZ))
959 set_bit(i, inuse);
960 }
961
962 i = find_first_zero_bit(inuse, max_netdevices);
963 free_page((unsigned long) inuse);
964 }
965
d9031024
OP
966 if (buf != name)
967 snprintf(buf, IFNAMSIZ, name, i);
b267b179 968 if (!__dev_get_by_name(net, buf))
1da177e4 969 return i;
1da177e4
LT
970
971 /* It is possible to run out of possible slots
972 * when the name is long and there isn't enough space left
973 * for the digits, or if all bits are used.
974 */
975 return -ENFILE;
976}
977
b267b179
EB
978/**
979 * dev_alloc_name - allocate a name for a device
980 * @dev: device
981 * @name: name format string
982 *
 983 * Passed a format string - eg "lt%d" - it will try to find a suitable
 984 * id. It scans the list of devices to build up a free map, then chooses
985 * the first empty slot. The caller must hold the dev_base or rtnl lock
986 * while allocating the name and adding the device in order to avoid
987 * duplicates.
988 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
989 * Returns the number of the unit assigned or a negative errno code.
990 */
991
992int dev_alloc_name(struct net_device *dev, const char *name)
993{
994 char buf[IFNAMSIZ];
995 struct net *net;
996 int ret;
997
c346dca1
YH
998 BUG_ON(!dev_net(dev));
999 net = dev_net(dev);
b267b179
EB
1000 ret = __dev_alloc_name(net, name, buf);
1001 if (ret >= 0)
1002 strlcpy(dev->name, buf, IFNAMSIZ);
1003 return ret;
1004}
d1b19dff 1005EXPORT_SYMBOL(dev_alloc_name);
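A usage sketch (hypothetical caller holding RTNL, with an illustrative "example%d" template):

static int example_name_device(struct net_device *dev)
{
	int unit;

	unit = dev_alloc_name(dev, "example%d");
	if (unit < 0)
		return unit;

	/* dev->name now holds e.g. "example0"; unit is the number chosen */
	return 0;
}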
b267b179 1006
828de4f6
G
1007static int dev_alloc_name_ns(struct net *net,
1008 struct net_device *dev,
1009 const char *name)
d9031024 1010{
828de4f6
G
1011 char buf[IFNAMSIZ];
1012 int ret;
8ce6cebc 1013
828de4f6
G
1014 ret = __dev_alloc_name(net, name, buf);
1015 if (ret >= 0)
1016 strlcpy(dev->name, buf, IFNAMSIZ);
1017 return ret;
1018}
1019
1020static int dev_get_valid_name(struct net *net,
1021 struct net_device *dev,
1022 const char *name)
1023{
1024 BUG_ON(!net);
8ce6cebc 1025
d9031024
OP
1026 if (!dev_valid_name(name))
1027 return -EINVAL;
1028
1c5cae81 1029 if (strchr(name, '%'))
828de4f6 1030 return dev_alloc_name_ns(net, dev, name);
d9031024
OP
1031 else if (__dev_get_by_name(net, name))
1032 return -EEXIST;
8ce6cebc
DL
1033 else if (dev->name != name)
1034 strlcpy(dev->name, name, IFNAMSIZ);
d9031024
OP
1035
1036 return 0;
1037}
1da177e4
LT
1038
1039/**
1040 * dev_change_name - change name of a device
1041 * @dev: device
1042 * @newname: name (or format string) must be at least IFNAMSIZ
1043 *
 1044 * Change the name of a device. Format strings such as "eth%d"
 1045 * may be passed for wildcarding.
1046 */
cf04a4c7 1047int dev_change_name(struct net_device *dev, const char *newname)
1da177e4 1048{
fcc5a03a 1049 char oldname[IFNAMSIZ];
1da177e4 1050 int err = 0;
fcc5a03a 1051 int ret;
881d966b 1052 struct net *net;
1da177e4
LT
1053
1054 ASSERT_RTNL();
c346dca1 1055 BUG_ON(!dev_net(dev));
1da177e4 1056
c346dca1 1057 net = dev_net(dev);
1da177e4
LT
1058 if (dev->flags & IFF_UP)
1059 return -EBUSY;
1060
30e6c9fa 1061 write_seqcount_begin(&devnet_rename_seq);
c91f6df2
BH
1062
1063 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
30e6c9fa 1064 write_seqcount_end(&devnet_rename_seq);
c8d90dca 1065 return 0;
c91f6df2 1066 }
c8d90dca 1067
fcc5a03a
HX
1068 memcpy(oldname, dev->name, IFNAMSIZ);
1069
828de4f6 1070 err = dev_get_valid_name(net, dev, newname);
c91f6df2 1071 if (err < 0) {
30e6c9fa 1072 write_seqcount_end(&devnet_rename_seq);
d9031024 1073 return err;
c91f6df2 1074 }
1da177e4 1075
fcc5a03a 1076rollback:
a1b3f594
EB
1077 ret = device_rename(&dev->dev, dev->name);
1078 if (ret) {
1079 memcpy(dev->name, oldname, IFNAMSIZ);
30e6c9fa 1080 write_seqcount_end(&devnet_rename_seq);
a1b3f594 1081 return ret;
dcc99773 1082 }
7f988eab 1083
30e6c9fa 1084 write_seqcount_end(&devnet_rename_seq);
c91f6df2 1085
7f988eab 1086 write_lock_bh(&dev_base_lock);
372b2312 1087 hlist_del_rcu(&dev->name_hlist);
72c9528b
ED
1088 write_unlock_bh(&dev_base_lock);
1089
1090 synchronize_rcu();
1091
1092 write_lock_bh(&dev_base_lock);
1093 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
7f988eab
HX
1094 write_unlock_bh(&dev_base_lock);
1095
056925ab 1096 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
fcc5a03a
HX
1097 ret = notifier_to_errno(ret);
1098
1099 if (ret) {
91e9c07b
ED
1100 /* err >= 0 after dev_alloc_name() or stores the first errno */
1101 if (err >= 0) {
fcc5a03a 1102 err = ret;
30e6c9fa 1103 write_seqcount_begin(&devnet_rename_seq);
fcc5a03a
HX
1104 memcpy(dev->name, oldname, IFNAMSIZ);
1105 goto rollback;
91e9c07b 1106 } else {
7b6cd1ce 1107 pr_err("%s: name change rollback failed: %d\n",
91e9c07b 1108 dev->name, ret);
fcc5a03a
HX
1109 }
1110 }
1da177e4
LT
1111
1112 return err;
1113}
1114
0b815a1a
SH
1115/**
1116 * dev_set_alias - change ifalias of a device
1117 * @dev: device
1118 * @alias: name up to IFALIASZ
f0db275a 1119 * @len: limit of bytes to copy from info
0b815a1a
SH
1120 *
 1121 * Set the ifalias for a device.
1122 */
1123int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1124{
7364e445
AK
1125 char *new_ifalias;
1126
0b815a1a
SH
1127 ASSERT_RTNL();
1128
1129 if (len >= IFALIASZ)
1130 return -EINVAL;
1131
96ca4a2c 1132 if (!len) {
388dfc2d
SK
1133 kfree(dev->ifalias);
1134 dev->ifalias = NULL;
96ca4a2c
OH
1135 return 0;
1136 }
1137
7364e445
AK
1138 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1139 if (!new_ifalias)
0b815a1a 1140 return -ENOMEM;
7364e445 1141 dev->ifalias = new_ifalias;
0b815a1a
SH
1142
1143 strlcpy(dev->ifalias, alias, len+1);
1144 return len;
1145}
1146
1147
d8a33ac4 1148/**
3041a069 1149 * netdev_features_change - device changes features
d8a33ac4
SH
1150 * @dev: device to cause notification
1151 *
1152 * Called to indicate a device has changed features.
1153 */
1154void netdev_features_change(struct net_device *dev)
1155{
056925ab 1156 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
d8a33ac4
SH
1157}
1158EXPORT_SYMBOL(netdev_features_change);
1159
1da177e4
LT
1160/**
1161 * netdev_state_change - device changes state
1162 * @dev: device to cause notification
1163 *
1164 * Called to indicate a device has changed state. This function calls
1165 * the notifier chains for netdev_chain and sends a NEWLINK message
1166 * to the routing socket.
1167 */
1168void netdev_state_change(struct net_device *dev)
1169{
1170 if (dev->flags & IFF_UP) {
056925ab 1171 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1da177e4
LT
1172 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1173 }
1174}
d1b19dff 1175EXPORT_SYMBOL(netdev_state_change);
1da177e4 1176
ee89bab1
AW
1177/**
1178 * netdev_notify_peers - notify network peers about existence of @dev
1179 * @dev: network device
1180 *
1181 * Generate traffic such that interested network peers are aware of
1182 * @dev, such as by generating a gratuitous ARP. This may be used when
1183 * a device wants to inform the rest of the network about some sort of
1184 * reconfiguration such as a failover event or virtual machine
1185 * migration.
1186 */
1187void netdev_notify_peers(struct net_device *dev)
c1da4ac7 1188{
ee89bab1
AW
1189 rtnl_lock();
1190 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1191 rtnl_unlock();
c1da4ac7 1192}
ee89bab1 1193EXPORT_SYMBOL(netdev_notify_peers);
c1da4ac7 1194
bd380811 1195static int __dev_open(struct net_device *dev)
1da177e4 1196{
d314774c 1197 const struct net_device_ops *ops = dev->netdev_ops;
3b8bcfd5 1198 int ret;
1da177e4 1199
e46b66bc
BH
1200 ASSERT_RTNL();
1201
1da177e4
LT
1202 if (!netif_device_present(dev))
1203 return -ENODEV;
1204
ca99ca14
NH
1205 /* Block netpoll from trying to do any rx path servicing.
1206 * If we don't do this there is a chance ndo_poll_controller
1207 * or ndo_poll may be running while we open the device
1208 */
da6e378b 1209 netpoll_rx_disable(dev);
ca99ca14 1210
3b8bcfd5
JB
1211 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1212 ret = notifier_to_errno(ret);
1213 if (ret)
1214 return ret;
1215
1da177e4 1216 set_bit(__LINK_STATE_START, &dev->state);
bada339b 1217
d314774c
SH
1218 if (ops->ndo_validate_addr)
1219 ret = ops->ndo_validate_addr(dev);
bada339b 1220
d314774c
SH
1221 if (!ret && ops->ndo_open)
1222 ret = ops->ndo_open(dev);
1da177e4 1223
ca99ca14
NH
1224 netpoll_rx_enable(dev);
1225
bada339b
JG
1226 if (ret)
1227 clear_bit(__LINK_STATE_START, &dev->state);
1228 else {
1da177e4 1229 dev->flags |= IFF_UP;
b4bd07c2 1230 net_dmaengine_get();
4417da66 1231 dev_set_rx_mode(dev);
1da177e4 1232 dev_activate(dev);
7bf23575 1233 add_device_randomness(dev->dev_addr, dev->addr_len);
1da177e4 1234 }
bada339b 1235
1da177e4
LT
1236 return ret;
1237}
1238
1239/**
bd380811
PM
1240 * dev_open - prepare an interface for use.
1241 * @dev: device to open
1da177e4 1242 *
bd380811
PM
1243 * Takes a device from down to up state. The device's private open
1244 * function is invoked and then the multicast lists are loaded. Finally
1245 * the device is moved into the up state and a %NETDEV_UP message is
1246 * sent to the netdev notifier chain.
1247 *
1248 * Calling this function on an active interface is a nop. On a failure
1249 * a negative errno code is returned.
1da177e4 1250 */
bd380811
PM
1251int dev_open(struct net_device *dev)
1252{
1253 int ret;
1254
bd380811
PM
1255 if (dev->flags & IFF_UP)
1256 return 0;
1257
bd380811
PM
1258 ret = __dev_open(dev);
1259 if (ret < 0)
1260 return ret;
1261
bd380811
PM
1262 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1263 call_netdevice_notifiers(NETDEV_UP, dev);
1264
1265 return ret;
1266}
1267EXPORT_SYMBOL(dev_open);
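A usage sketch (hypothetical caller): dev_open() must run under RTNL, and on success a %NETDEV_UP notification is sent as documented above.

static int example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev);	/* nop if the device is already up */
	rtnl_unlock();

	return err;
}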
1268
44345724 1269static int __dev_close_many(struct list_head *head)
1da177e4 1270{
44345724 1271 struct net_device *dev;
e46b66bc 1272
bd380811 1273 ASSERT_RTNL();
9d5010db
DM
1274 might_sleep();
1275
44345724 1276 list_for_each_entry(dev, head, unreg_list) {
44345724 1277 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1da177e4 1278
44345724 1279 clear_bit(__LINK_STATE_START, &dev->state);
1da177e4 1280
44345724
OP
1281 /* Synchronize to scheduled poll. We cannot touch poll list, it
1282 * can be even on different cpu. So just clear netif_running().
1283 *
 1284 * dev->stop() will invoke napi_disable() on all of its
1285 * napi_struct instances on this device.
1286 */
1287 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1288 }
1da177e4 1289
44345724 1290 dev_deactivate_many(head);
d8b2a4d2 1291
44345724
OP
1292 list_for_each_entry(dev, head, unreg_list) {
1293 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4 1294
44345724
OP
1295 /*
1296 * Call the device specific close. This cannot fail.
1297 * Only if device is UP
1298 *
1299 * We allow it to be called even after a DETACH hot-plug
1300 * event.
1301 */
1302 if (ops->ndo_stop)
1303 ops->ndo_stop(dev);
1304
44345724 1305 dev->flags &= ~IFF_UP;
44345724
OP
1306 net_dmaengine_put();
1307 }
1308
1309 return 0;
1310}
1311
1312static int __dev_close(struct net_device *dev)
1313{
f87e6f47 1314 int retval;
44345724
OP
1315 LIST_HEAD(single);
1316
ca99ca14 1317 /* Temporarily disable netpoll until the interface is down */
da6e378b 1318 netpoll_rx_disable(dev);
ca99ca14 1319
44345724 1320 list_add(&dev->unreg_list, &single);
f87e6f47
LT
1321 retval = __dev_close_many(&single);
1322 list_del(&single);
ca99ca14
NH
1323
1324 netpoll_rx_enable(dev);
f87e6f47 1325 return retval;
44345724
OP
1326}
1327
3fbd8758 1328static int dev_close_many(struct list_head *head)
44345724
OP
1329{
1330 struct net_device *dev, *tmp;
1331 LIST_HEAD(tmp_list);
1da177e4 1332
44345724
OP
1333 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1334 if (!(dev->flags & IFF_UP))
1335 list_move(&dev->unreg_list, &tmp_list);
1336
1337 __dev_close_many(head);
1da177e4 1338
44345724
OP
1339 list_for_each_entry(dev, head, unreg_list) {
1340 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1341 call_netdevice_notifiers(NETDEV_DOWN, dev);
1342 }
bd380811 1343
44345724
OP
1344 /* rollback_registered_many needs the complete original list */
1345 list_splice(&tmp_list, head);
bd380811
PM
1346 return 0;
1347}
1348
1349/**
1350 * dev_close - shutdown an interface.
1351 * @dev: device to shutdown
1352 *
1353 * This function moves an active device into down state. A
1354 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1355 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1356 * chain.
1357 */
1358int dev_close(struct net_device *dev)
1359{
e14a5993
ED
1360 if (dev->flags & IFF_UP) {
1361 LIST_HEAD(single);
1da177e4 1362
ca99ca14 1363 /* Block netpoll rx while the interface is going down */
da6e378b 1364 netpoll_rx_disable(dev);
ca99ca14 1365
e14a5993
ED
1366 list_add(&dev->unreg_list, &single);
1367 dev_close_many(&single);
1368 list_del(&single);
ca99ca14
NH
1369
1370 netpoll_rx_enable(dev);
e14a5993 1371 }
da6e378b 1372 return 0;
1da177e4 1373}
d1b19dff 1374EXPORT_SYMBOL(dev_close);
1da177e4
LT
1375
1376
0187bdfb
BH
1377/**
1378 * dev_disable_lro - disable Large Receive Offload on a device
1379 * @dev: device
1380 *
1381 * Disable Large Receive Offload (LRO) on a net device. Must be
1382 * called under RTNL. This is needed if received packets may be
1383 * forwarded to another interface.
1384 */
1385void dev_disable_lro(struct net_device *dev)
1386{
f11970e3
NH
1387 /*
1388 * If we're trying to disable lro on a vlan device
1389 * use the underlying physical device instead
1390 */
1391 if (is_vlan_dev(dev))
1392 dev = vlan_dev_real_dev(dev);
1393
bc5787c6
MM
1394 dev->wanted_features &= ~NETIF_F_LRO;
1395 netdev_update_features(dev);
27660515 1396
22d5969f
MM
1397 if (unlikely(dev->features & NETIF_F_LRO))
1398 netdev_WARN(dev, "failed to disable LRO!\n");
0187bdfb
BH
1399}
1400EXPORT_SYMBOL(dev_disable_lro);
1401
351638e7
JP
1402static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1403 struct net_device *dev)
1404{
1405 struct netdev_notifier_info info;
1406
1407 netdev_notifier_info_init(&info, dev);
1408 return nb->notifier_call(nb, val, &info);
1409}
0187bdfb 1410
881d966b
EB
1411static int dev_boot_phase = 1;
1412
1da177e4
LT
1413/**
1414 * register_netdevice_notifier - register a network notifier block
1415 * @nb: notifier
1416 *
1417 * Register a notifier to be called when network device events occur.
1418 * The notifier passed is linked into the kernel structures and must
1419 * not be reused until it has been unregistered. A negative errno code
1420 * is returned on a failure.
1421 *
 1422 * When registered, all registration and up events are replayed
 1423 * to the new notifier so that it gets a race-free
 1424 * view of the network device list.
1425 */
1426
1427int register_netdevice_notifier(struct notifier_block *nb)
1428{
1429 struct net_device *dev;
fcc5a03a 1430 struct net_device *last;
881d966b 1431 struct net *net;
1da177e4
LT
1432 int err;
1433
1434 rtnl_lock();
f07d5b94 1435 err = raw_notifier_chain_register(&netdev_chain, nb);
fcc5a03a
HX
1436 if (err)
1437 goto unlock;
881d966b
EB
1438 if (dev_boot_phase)
1439 goto unlock;
1440 for_each_net(net) {
1441 for_each_netdev(net, dev) {
351638e7 1442 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
881d966b
EB
1443 err = notifier_to_errno(err);
1444 if (err)
1445 goto rollback;
1446
1447 if (!(dev->flags & IFF_UP))
1448 continue;
1da177e4 1449
351638e7 1450 call_netdevice_notifier(nb, NETDEV_UP, dev);
881d966b 1451 }
1da177e4 1452 }
fcc5a03a
HX
1453
1454unlock:
1da177e4
LT
1455 rtnl_unlock();
1456 return err;
fcc5a03a
HX
1457
1458rollback:
1459 last = dev;
881d966b
EB
1460 for_each_net(net) {
1461 for_each_netdev(net, dev) {
1462 if (dev == last)
8f891489 1463 goto outroll;
fcc5a03a 1464
881d966b 1465 if (dev->flags & IFF_UP) {
351638e7
JP
1466 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1467 dev);
1468 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
881d966b 1469 }
351638e7 1470 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
fcc5a03a 1471 }
fcc5a03a 1472 }
c67625a1 1473
8f891489 1474outroll:
c67625a1 1475 raw_notifier_chain_unregister(&netdev_chain, nb);
fcc5a03a 1476 goto unlock;
1da177e4 1477}
d1b19dff 1478EXPORT_SYMBOL(register_netdevice_notifier);
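A client sketch (hypothetical notifier, not from this file): the callback receives a struct netdev_notifier_info pointer, unpacked with the netdev_notifier_info_to_dev() accessor that accompanies the info plumbing used above. Existing devices are replayed as NETDEV_REGISTER/NETDEV_UP at registration time.

static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_UP)
		pr_info("%s is up\n", dev->name);

	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
	.notifier_call = example_netdev_event,
};

/* register_netdevice_notifier(&example_netdev_nb); */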
1da177e4
LT
1479
1480/**
1481 * unregister_netdevice_notifier - unregister a network notifier block
1482 * @nb: notifier
1483 *
1484 * Unregister a notifier previously registered by
 1485 * register_netdevice_notifier(). The notifier is unlinked from the
 1486 * kernel structures and may then be reused. A negative errno code
1487 * is returned on a failure.
7d3d43da
EB
1488 *
 1489 * After unregistering, unregister and down device events are synthesized
 1490 * for all devices on the device list and delivered to the removed
 1491 * notifier, removing the need for special-case cleanup code.
1da177e4
LT
1492 */
1493
1494int unregister_netdevice_notifier(struct notifier_block *nb)
1495{
7d3d43da
EB
1496 struct net_device *dev;
1497 struct net *net;
9f514950
HX
1498 int err;
1499
1500 rtnl_lock();
f07d5b94 1501 err = raw_notifier_chain_unregister(&netdev_chain, nb);
7d3d43da
EB
1502 if (err)
1503 goto unlock;
1504
1505 for_each_net(net) {
1506 for_each_netdev(net, dev) {
1507 if (dev->flags & IFF_UP) {
351638e7
JP
1508 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1509 dev);
1510 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
7d3d43da 1511 }
351638e7 1512 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
7d3d43da
EB
1513 }
1514 }
1515unlock:
9f514950
HX
1516 rtnl_unlock();
1517 return err;
1da177e4 1518}
d1b19dff 1519EXPORT_SYMBOL(unregister_netdevice_notifier);
1da177e4 1520
351638e7
JP
1521/**
1522 * call_netdevice_notifiers_info - call all network notifier blocks
1523 * @val: value passed unmodified to notifier function
1524 * @dev: net_device pointer passed unmodified to notifier function
1525 * @info: notifier information data
1526 *
1527 * Call all network notifier blocks. Parameters and return value
1528 * are as for raw_notifier_call_chain().
1529 */
1530
1531int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev,
1532 struct netdev_notifier_info *info)
1533{
1534 ASSERT_RTNL();
1535 netdev_notifier_info_init(info, dev);
1536 return raw_notifier_call_chain(&netdev_chain, val, info);
1537}
1538EXPORT_SYMBOL(call_netdevice_notifiers_info);
1539
1da177e4
LT
1540/**
1541 * call_netdevice_notifiers - call all network notifier blocks
1542 * @val: value passed unmodified to notifier function
c4ea43c5 1543 * @dev: net_device pointer passed unmodified to notifier function
1da177e4
LT
1544 *
1545 * Call all network notifier blocks. Parameters and return value
f07d5b94 1546 * are as for raw_notifier_call_chain().
1da177e4
LT
1547 */
1548
ad7379d4 1549int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1da177e4 1550{
351638e7
JP
1551 struct netdev_notifier_info info;
1552
1553 return call_netdevice_notifiers_info(val, dev, &info);
1da177e4 1554}
edf947f1 1555EXPORT_SYMBOL(call_netdevice_notifiers);
1da177e4 1556
c5905afb 1557static struct static_key netstamp_needed __read_mostly;
b90e5794 1558#ifdef HAVE_JUMP_LABEL
c5905afb 1559/* We are not allowed to call static_key_slow_dec() from irq context
b90e5794 1560 * If net_disable_timestamp() is called from irq context, defer the
c5905afb 1561 * static_key_slow_dec() calls.
b90e5794
ED
1562 */
1563static atomic_t netstamp_needed_deferred;
1564#endif
1da177e4
LT
1565
1566void net_enable_timestamp(void)
1567{
b90e5794
ED
1568#ifdef HAVE_JUMP_LABEL
1569 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1570
1571 if (deferred) {
1572 while (--deferred)
c5905afb 1573 static_key_slow_dec(&netstamp_needed);
b90e5794
ED
1574 return;
1575 }
1576#endif
c5905afb 1577 static_key_slow_inc(&netstamp_needed);
1da177e4 1578}
d1b19dff 1579EXPORT_SYMBOL(net_enable_timestamp);
1da177e4
LT
1580
1581void net_disable_timestamp(void)
1582{
b90e5794
ED
1583#ifdef HAVE_JUMP_LABEL
1584 if (in_interrupt()) {
1585 atomic_inc(&netstamp_needed_deferred);
1586 return;
1587 }
1588#endif
c5905afb 1589 static_key_slow_dec(&netstamp_needed);
1da177e4 1590}
d1b19dff 1591EXPORT_SYMBOL(net_disable_timestamp);
1da177e4 1592
3b098e2d 1593static inline void net_timestamp_set(struct sk_buff *skb)
1da177e4 1594{
588f0330 1595 skb->tstamp.tv64 = 0;
c5905afb 1596 if (static_key_false(&netstamp_needed))
a61bbcf2 1597 __net_timestamp(skb);
1da177e4
LT
1598}
1599
588f0330 1600#define net_timestamp_check(COND, SKB) \
c5905afb 1601 if (static_key_false(&netstamp_needed)) { \
588f0330
ED
1602 if ((COND) && !(SKB)->tstamp.tv64) \
1603 __net_timestamp(SKB); \
1604 } \
3b098e2d 1605
79b569f0
DL
1606static inline bool is_skb_forwardable(struct net_device *dev,
1607 struct sk_buff *skb)
1608{
1609 unsigned int len;
1610
1611 if (!(dev->flags & IFF_UP))
1612 return false;
1613
1614 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1615 if (skb->len <= len)
1616 return true;
1617
1618 /* if TSO is enabled, we don't care about the length as the packet
1619 * could be forwarded without being segmented before
1620 */
1621 if (skb_is_gso(skb))
1622 return true;
1623
1624 return false;
1625}
1626
44540960
AB
1627/**
1628 * dev_forward_skb - loopback an skb to another netif
1629 *
1630 * @dev: destination network device
1631 * @skb: buffer to forward
1632 *
1633 * return values:
1634 * NET_RX_SUCCESS (no congestion)
6ec82562 1635 * NET_RX_DROP (packet was dropped, but freed)
44540960
AB
1636 *
1637 * dev_forward_skb can be used for injecting an skb from the
1638 * start_xmit function of one device into the receive queue
1639 * of another device.
1640 *
1641 * The receiving device may be in another namespace, so
1642 * we have to clear all information in the skb that could
1643 * impact namespace isolation.
1644 */
1645int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1646{
48c83012
MT
1647 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1648 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1649 atomic_long_inc(&dev->rx_dropped);
1650 kfree_skb(skb);
1651 return NET_RX_DROP;
1652 }
1653 }
1654
79b569f0 1655 if (unlikely(!is_skb_forwardable(dev, skb))) {
caf586e5 1656 atomic_long_inc(&dev->rx_dropped);
6ec82562 1657 kfree_skb(skb);
44540960 1658 return NET_RX_DROP;
6ec82562 1659 }
621e84d6 1660 skb_scrub_packet(skb);
44540960 1661 skb->protocol = eth_type_trans(skb, dev);
44540960
AB
1662 return netif_rx(skb);
1663}
1664EXPORT_SYMBOL_GPL(dev_forward_skb);
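A sketch of the use case described above (a hypothetical pair device, with the peer kept in driver-private data): the start_xmit path of one device injects the skb into the receive queue of the other.

struct example_priv {
	struct net_device *peer;	/* the other end of the pair */
};

static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	if (dev_forward_skb(priv->peer, skb) == NET_RX_SUCCESS)
		dev->stats.tx_packets++;
	else
		dev->stats.tx_dropped++;

	return NETDEV_TX_OK;
}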
1665
71d9dec2
CG
1666static inline int deliver_skb(struct sk_buff *skb,
1667 struct packet_type *pt_prev,
1668 struct net_device *orig_dev)
1669{
1080e512
MT
1670 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1671 return -ENOMEM;
71d9dec2
CG
1672 atomic_inc(&skb->users);
1673 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1674}
1675
c0de08d0
EL
1676static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1677{
a3d744e9 1678 if (!ptype->af_packet_priv || !skb->sk)
c0de08d0
EL
1679 return false;
1680
1681 if (ptype->id_match)
1682 return ptype->id_match(ptype, skb->sk);
1683 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1684 return true;
1685
1686 return false;
1687}
1688
1da177e4
LT
1689/*
1690 * Support routine. Sends outgoing frames to any network
1691 * taps currently in use.
1692 */
1693
f6a78bfc 1694static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1da177e4
LT
1695{
1696 struct packet_type *ptype;
71d9dec2
CG
1697 struct sk_buff *skb2 = NULL;
1698 struct packet_type *pt_prev = NULL;
a61bbcf2 1699
1da177e4
LT
1700 rcu_read_lock();
1701 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1702 /* Never send packets back to the socket
1703 * they originated from - MvS (miquels@drinkel.ow.org)
1704 */
1705 if ((ptype->dev == dev || !ptype->dev) &&
c0de08d0 1706 (!skb_loop_sk(ptype, skb))) {
71d9dec2
CG
1707 if (pt_prev) {
1708 deliver_skb(skb2, pt_prev, skb->dev);
1709 pt_prev = ptype;
1710 continue;
1711 }
1712
1713 skb2 = skb_clone(skb, GFP_ATOMIC);
1da177e4
LT
1714 if (!skb2)
1715 break;
1716
70978182
ED
1717 net_timestamp_set(skb2);
1718
1da177e4
LT
1719 /* skb->nh should be correctly
1720 set by the sender, so that the second statement is
1721 just protection against buggy protocols.
1722 */
459a98ed 1723 skb_reset_mac_header(skb2);
1da177e4 1724
d56f90a7 1725 if (skb_network_header(skb2) < skb2->data ||
ced14f68 1726 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
e87cc472
JP
1727 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1728 ntohs(skb2->protocol),
1729 dev->name);
c1d2bbe1 1730 skb_reset_network_header(skb2);
1da177e4
LT
1731 }
1732
b0e380b1 1733 skb2->transport_header = skb2->network_header;
1da177e4 1734 skb2->pkt_type = PACKET_OUTGOING;
71d9dec2 1735 pt_prev = ptype;
1da177e4
LT
1736 }
1737 }
71d9dec2
CG
1738 if (pt_prev)
1739 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1da177e4
LT
1740 rcu_read_unlock();
1741}
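/* Editor's note: a hedged illustration (not part of dev.c) of the kind of
 * ETH_P_ALL tap that dev_queue_xmit_nit() feeds on transmit. The demo_*
 * names are invented; dev_add_pack()/dev_remove_pack() and the packet_type
 * callback signature are the real interfaces.
 */
static int demo_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			struct packet_type *pt, struct net_device *orig_dev)
{
	/* Outgoing frames show up here with pkt_type == PACKET_OUTGOING
	 * and a timestamp set by net_timestamp_set() above. The handler
	 * owns the clone it is given, so drop it when done.
	 */
	kfree_skb(skb);
	return 0;
}

static struct packet_type demo_tap __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),	/* lands on the ptype_all list  */
	.func = demo_tap_rcv,		/* .dev left NULL: every device */
};

/* Register with dev_add_pack(&demo_tap), remove with dev_remove_pack(). */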
1742
2c53040f
BH
1743/**
1744 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
4f57c087
JF
1745 * @dev: Network device
1746 * @txq: number of queues available
1747 *
1748 * If real_num_tx_queues is changed, the tc mappings may no longer be
1749 * valid. To resolve this, verify that each tc mapping remains valid and,
1750 * if not, NULL the mapping. Once no priorities map to an
1751 * offset/count pair, it will no longer be used. In the worst case, if TC0
1752 * is invalid, nothing can be done, so priority mappings are disabled. It is
1753 * expected that drivers will fix this mapping, if they can, before
1754 * calling netif_set_real_num_tx_queues.
1755 */
bb134d22 1756static void netif_setup_tc(struct net_device *dev, unsigned int txq)
4f57c087
JF
1757{
1758 int i;
1759 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1760
1761 /* If TC0 is invalidated disable TC mapping */
1762 if (tc->offset + tc->count > txq) {
7b6cd1ce 1763 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
4f57c087
JF
1764 dev->num_tc = 0;
1765 return;
1766 }
1767
1768 /* Invalidated prio to tc mappings set to TC0 */
1769 for (i = 1; i < TC_BITMASK + 1; i++) {
1770 int q = netdev_get_prio_tc_map(dev, i);
1771
1772 tc = &dev->tc_to_txq[q];
1773 if (tc->offset + tc->count > txq) {
7b6cd1ce
JP
1774 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1775 i, q);
4f57c087
JF
1776 netdev_set_prio_tc_map(dev, i, 0);
1777 }
1778 }
1779}
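/* Editor's note: a hedged sketch (not part of dev.c) of the driver-side
 * setup that netif_setup_tc() later re-validates when real_num_tx_queues
 * shrinks. The 2-class, 8-queue layout is invented for illustration; the
 * netdev_set_num_tc()/netdev_set_tc_queue()/netdev_set_prio_tc_map()
 * helpers are assumed to be the usual <linux/netdevice.h> interfaces.
 */
static void demo_setup_two_tcs(struct net_device *dev)
{
	int prio;

	netdev_set_num_tc(dev, 2);		/* two traffic classes */
	netdev_set_tc_queue(dev, 0, 4, 0);	/* TC0 -> queues 0..3  */
	netdev_set_tc_queue(dev, 1, 4, 4);	/* TC1 -> queues 4..7  */

	/* Priorities 0..7 ride TC0, 8..15 ride TC1 (TC_BITMASK is 15). */
	for (prio = 0; prio <= TC_BITMASK; prio++)
		netdev_set_prio_tc_map(dev, prio, prio < 8 ? 0 : 1);
}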
1780
537c00de
AD
1781#ifdef CONFIG_XPS
1782static DEFINE_MUTEX(xps_map_mutex);
1783#define xmap_dereference(P) \
1784 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1785
10cdc3f3
AD
1786static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1787 int cpu, u16 index)
537c00de 1788{
10cdc3f3
AD
1789 struct xps_map *map = NULL;
1790 int pos;
537c00de 1791
10cdc3f3
AD
1792 if (dev_maps)
1793 map = xmap_dereference(dev_maps->cpu_map[cpu]);
537c00de 1794
10cdc3f3
AD
1795 for (pos = 0; map && pos < map->len; pos++) {
1796 if (map->queues[pos] == index) {
537c00de
AD
1797 if (map->len > 1) {
1798 map->queues[pos] = map->queues[--map->len];
1799 } else {
10cdc3f3 1800 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
537c00de
AD
1801 kfree_rcu(map, rcu);
1802 map = NULL;
1803 }
10cdc3f3 1804 break;
537c00de 1805 }
537c00de
AD
1806 }
1807
10cdc3f3
AD
1808 return map;
1809}
1810
024e9679 1811static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
10cdc3f3
AD
1812{
1813 struct xps_dev_maps *dev_maps;
024e9679 1814 int cpu, i;
10cdc3f3
AD
1815 bool active = false;
1816
1817 mutex_lock(&xps_map_mutex);
1818 dev_maps = xmap_dereference(dev->xps_maps);
1819
1820 if (!dev_maps)
1821 goto out_no_maps;
1822
1823 for_each_possible_cpu(cpu) {
024e9679
AD
1824 for (i = index; i < dev->num_tx_queues; i++) {
1825 if (!remove_xps_queue(dev_maps, cpu, i))
1826 break;
1827 }
1828 if (i == dev->num_tx_queues)
10cdc3f3
AD
1829 active = true;
1830 }
1831
1832 if (!active) {
537c00de
AD
1833 RCU_INIT_POINTER(dev->xps_maps, NULL);
1834 kfree_rcu(dev_maps, rcu);
1835 }
1836
024e9679
AD
1837 for (i = index; i < dev->num_tx_queues; i++)
1838 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1839 NUMA_NO_NODE);
1840
537c00de
AD
1841out_no_maps:
1842 mutex_unlock(&xps_map_mutex);
1843}
1844
01c5f864
AD
1845static struct xps_map *expand_xps_map(struct xps_map *map,
1846 int cpu, u16 index)
1847{
1848 struct xps_map *new_map;
1849 int alloc_len = XPS_MIN_MAP_ALLOC;
1850 int i, pos;
1851
1852 for (pos = 0; map && pos < map->len; pos++) {
1853 if (map->queues[pos] != index)
1854 continue;
1855 return map;
1856 }
1857
1858 /* Need to add queue to this CPU's existing map */
1859 if (map) {
1860 if (pos < map->alloc_len)
1861 return map;
1862
1863 alloc_len = map->alloc_len * 2;
1864 }
1865
1866 /* Need to allocate new map to store queue on this CPU's map */
1867 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1868 cpu_to_node(cpu));
1869 if (!new_map)
1870 return NULL;
1871
1872 for (i = 0; i < pos; i++)
1873 new_map->queues[i] = map->queues[i];
1874 new_map->alloc_len = alloc_len;
1875 new_map->len = pos;
1876
1877 return new_map;
1878}
1879
537c00de
AD
1880int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1881{
01c5f864 1882 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
537c00de 1883 struct xps_map *map, *new_map;
537c00de 1884 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
01c5f864
AD
1885 int cpu, numa_node_id = -2;
1886 bool active = false;
537c00de
AD
1887
1888 mutex_lock(&xps_map_mutex);
1889
1890 dev_maps = xmap_dereference(dev->xps_maps);
1891
01c5f864
AD
1892 /* allocate memory for queue storage */
1893 for_each_online_cpu(cpu) {
1894 if (!cpumask_test_cpu(cpu, mask))
1895 continue;
1896
1897 if (!new_dev_maps)
1898 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2bb60cb9
AD
1899 if (!new_dev_maps) {
1900 mutex_unlock(&xps_map_mutex);
01c5f864 1901 return -ENOMEM;
2bb60cb9 1902 }
01c5f864
AD
1903
1904 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1905 NULL;
1906
1907 map = expand_xps_map(map, cpu, index);
1908 if (!map)
1909 goto error;
1910
1911 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1912 }
1913
1914 if (!new_dev_maps)
1915 goto out_no_new_maps;
1916
537c00de 1917 for_each_possible_cpu(cpu) {
01c5f864
AD
1918 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1919 /* add queue to CPU maps */
1920 int pos = 0;
1921
1922 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1923 while ((pos < map->len) && (map->queues[pos] != index))
1924 pos++;
1925
1926 if (pos == map->len)
1927 map->queues[map->len++] = index;
537c00de 1928#ifdef CONFIG_NUMA
537c00de
AD
1929 if (numa_node_id == -2)
1930 numa_node_id = cpu_to_node(cpu);
1931 else if (numa_node_id != cpu_to_node(cpu))
1932 numa_node_id = -1;
537c00de 1933#endif
01c5f864
AD
1934 } else if (dev_maps) {
1935 /* fill in the new device map from the old device map */
1936 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1937 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
537c00de 1938 }
01c5f864 1939
537c00de
AD
1940 }
1941
01c5f864
AD
1942 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1943
537c00de 1944 /* Cleanup old maps */
01c5f864
AD
1945 if (dev_maps) {
1946 for_each_possible_cpu(cpu) {
1947 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1948 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1949 if (map && map != new_map)
1950 kfree_rcu(map, rcu);
1951 }
537c00de 1952
01c5f864 1953 kfree_rcu(dev_maps, rcu);
537c00de
AD
1954 }
1955
01c5f864
AD
1956 dev_maps = new_dev_maps;
1957 active = true;
537c00de 1958
01c5f864
AD
1959out_no_new_maps:
1960 /* update Tx queue numa node */
537c00de
AD
1961 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1962 (numa_node_id >= 0) ? numa_node_id :
1963 NUMA_NO_NODE);
1964
01c5f864
AD
1965 if (!dev_maps)
1966 goto out_no_maps;
1967
1968 /* removes queue from unused CPUs */
1969 for_each_possible_cpu(cpu) {
1970 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
1971 continue;
1972
1973 if (remove_xps_queue(dev_maps, cpu, index))
1974 active = true;
1975 }
1976
1977 /* free map if not active */
1978 if (!active) {
1979 RCU_INIT_POINTER(dev->xps_maps, NULL);
1980 kfree_rcu(dev_maps, rcu);
1981 }
1982
1983out_no_maps:
537c00de
AD
1984 mutex_unlock(&xps_map_mutex);
1985
1986 return 0;
1987error:
01c5f864
AD
1988 /* remove any maps that we added */
1989 for_each_possible_cpu(cpu) {
1990 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1991 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1992 NULL;
1993 if (new_map && new_map != map)
1994 kfree(new_map);
1995 }
1996
537c00de
AD
1997 mutex_unlock(&xps_map_mutex);
1998
537c00de
AD
1999 kfree(new_dev_maps);
2000 return -ENOMEM;
2001}
2002EXPORT_SYMBOL(netif_set_xps_queue);
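/* Editor's note: a hedged example (not part of dev.c) of a driver handing
 * each TX queue a one-CPU XPS mask via netif_set_xps_queue(); the 1:1
 * queue-to-CPU layout and the demo_* name are purely illustrative.
 */
static void demo_set_xps(struct net_device *dev)
{
	cpumask_var_t mask;
	int i;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return;

	for (i = 0; i < dev->real_num_tx_queues; i++) {
		cpumask_clear(mask);
		cpumask_set_cpu(i % num_online_cpus(), mask);
		/* -ENOMEM is ignored here; XPS is only an optimisation. */
		netif_set_xps_queue(dev, mask, i);
	}
	free_cpumask_var(mask);
}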
2003
2004#endif
f0796d5c
JF
2005/*
2006 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2007 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2008 */
e6484930 2009int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
f0796d5c 2010{
1d24eb48
TH
2011 int rc;
2012
e6484930
TH
2013 if (txq < 1 || txq > dev->num_tx_queues)
2014 return -EINVAL;
f0796d5c 2015
5c56580b
BH
2016 if (dev->reg_state == NETREG_REGISTERED ||
2017 dev->reg_state == NETREG_UNREGISTERING) {
e6484930
TH
2018 ASSERT_RTNL();
2019
1d24eb48
TH
2020 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2021 txq);
bf264145
TH
2022 if (rc)
2023 return rc;
2024
4f57c087
JF
2025 if (dev->num_tc)
2026 netif_setup_tc(dev, txq);
2027
024e9679 2028 if (txq < dev->real_num_tx_queues) {
e6484930 2029 qdisc_reset_all_tx_gt(dev, txq);
024e9679
AD
2030#ifdef CONFIG_XPS
2031 netif_reset_xps_queues_gt(dev, txq);
2032#endif
2033 }
f0796d5c 2034 }
e6484930
TH
2035
2036 dev->real_num_tx_queues = txq;
2037 return 0;
f0796d5c
JF
2038}
2039EXPORT_SYMBOL(netif_set_real_num_tx_queues);
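/* Editor's note: a hedged usage sketch (not part of dev.c). A driver that
 * allocated dev->num_tx_queues rings but only brought "active" of them up
 * would typically call this under rtnl_lock, which the function asserts
 * once the device is registered. demo_* is an invented name.
 */
static int demo_shrink_tx(struct net_device *dev, unsigned int active)
{
	int err;

	rtnl_lock();
	err = netif_set_real_num_tx_queues(dev, active);
	rtnl_unlock();

	return err;	/* 0, or -EINVAL if active is out of range */
}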
56079431 2040
62fe0b40
BH
2041#ifdef CONFIG_RPS
2042/**
2043 * netif_set_real_num_rx_queues - set actual number of RX queues used
2044 * @dev: Network device
2045 * @rxq: Actual number of RX queues
2046 *
2047 * This must be called either with the rtnl_lock held or before
2048 * registration of the net device. Returns 0 on success, or a
4e7f7951
BH
2049 * negative error code. If called before registration, it always
2050 * succeeds.
62fe0b40
BH
2051 */
2052int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2053{
2054 int rc;
2055
bd25fa7b
TH
2056 if (rxq < 1 || rxq > dev->num_rx_queues)
2057 return -EINVAL;
2058
62fe0b40
BH
2059 if (dev->reg_state == NETREG_REGISTERED) {
2060 ASSERT_RTNL();
2061
62fe0b40
BH
2062 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2063 rxq);
2064 if (rc)
2065 return rc;
62fe0b40
BH
2066 }
2067
2068 dev->real_num_rx_queues = rxq;
2069 return 0;
2070}
2071EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2072#endif
2073
2c53040f
BH
2074/**
2075 * netif_get_num_default_rss_queues - default number of RSS queues
16917b87
YM
2076 *
2077 * This routine should set an upper limit on the number of RSS queues
2078 * used by default by multiqueue devices.
2079 */
a55b138b 2080int netif_get_num_default_rss_queues(void)
16917b87
YM
2081{
2082 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2083}
2084EXPORT_SYMBOL(netif_get_num_default_rss_queues);
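/* Editor's note: a hedged sketch (not part of dev.c) of the intended use:
 * a driver caps its default ring count with this helper instead of using
 * num_online_cpus() directly. "hw_max_rings" is an invented device limit.
 */
static unsigned int demo_default_rings(unsigned int hw_max_rings)
{
	return min_t(unsigned int, hw_max_rings,
		     netif_get_num_default_rss_queues());
}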
2085
def82a1d 2086static inline void __netif_reschedule(struct Qdisc *q)
56079431 2087{
def82a1d
JP
2088 struct softnet_data *sd;
2089 unsigned long flags;
56079431 2090
def82a1d
JP
2091 local_irq_save(flags);
2092 sd = &__get_cpu_var(softnet_data);
a9cbd588
CG
2093 q->next_sched = NULL;
2094 *sd->output_queue_tailp = q;
2095 sd->output_queue_tailp = &q->next_sched;
def82a1d
JP
2096 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2097 local_irq_restore(flags);
2098}
2099
2100void __netif_schedule(struct Qdisc *q)
2101{
2102 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2103 __netif_reschedule(q);
56079431
DV
2104}
2105EXPORT_SYMBOL(__netif_schedule);
2106
bea3348e 2107void dev_kfree_skb_irq(struct sk_buff *skb)
56079431 2108{
3578b0c8 2109 if (atomic_dec_and_test(&skb->users)) {
bea3348e
SH
2110 struct softnet_data *sd;
2111 unsigned long flags;
56079431 2112
bea3348e
SH
2113 local_irq_save(flags);
2114 sd = &__get_cpu_var(softnet_data);
2115 skb->next = sd->completion_queue;
2116 sd->completion_queue = skb;
2117 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2118 local_irq_restore(flags);
2119 }
56079431 2120}
bea3348e 2121EXPORT_SYMBOL(dev_kfree_skb_irq);
56079431
DV
2122
2123void dev_kfree_skb_any(struct sk_buff *skb)
2124{
2125 if (in_irq() || irqs_disabled())
2126 dev_kfree_skb_irq(skb);
2127 else
2128 dev_kfree_skb(skb);
2129}
2130EXPORT_SYMBOL(dev_kfree_skb_any);
2131
2132
bea3348e
SH
2133/**
2134 * netif_device_detach - mark device as removed
2135 * @dev: network device
2136 *
2137 * Mark the device as removed from the system and therefore no longer available.
2138 */
56079431
DV
2139void netif_device_detach(struct net_device *dev)
2140{
2141 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2142 netif_running(dev)) {
d543103a 2143 netif_tx_stop_all_queues(dev);
56079431
DV
2144 }
2145}
2146EXPORT_SYMBOL(netif_device_detach);
2147
bea3348e
SH
2148/**
2149 * netif_device_attach - mark device as attached
2150 * @dev: network device
2151 *
2152 * Mark the device as attached to the system and restart it if needed.
2153 */
56079431
DV
2154void netif_device_attach(struct net_device *dev)
2155{
2156 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2157 netif_running(dev)) {
d543103a 2158 netif_tx_wake_all_queues(dev);
4ec93edb 2159 __netdev_watchdog_up(dev);
56079431
DV
2160 }
2161}
2162EXPORT_SYMBOL(netif_device_attach);
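/* Editor's note: a hedged sketch (not part of dev.c) of the usual
 * suspend/resume pairing for these helpers; the demo_* names are invented
 * and the hardware-specific steps are left as comments.
 */
static int demo_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stops all TX queues if running */
	/* ...driver-specific hardware quiesce would go here... */
	return 0;
}

static int demo_resume(struct net_device *dev)
{
	/* ...driver-specific hardware re-init would go here... */
	netif_device_attach(dev);	/* wakes queues, restarts watchdog */
	return 0;
}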
2163
36c92474
BH
2164static void skb_warn_bad_offload(const struct sk_buff *skb)
2165{
65e9d2fa 2166 static const netdev_features_t null_features = 0;
36c92474
BH
2167 struct net_device *dev = skb->dev;
2168 const char *driver = "";
2169
c846ad9b
BG
2170 if (!net_ratelimit())
2171 return;
2172
36c92474
BH
2173 if (dev && dev->dev.parent)
2174 driver = dev_driver_string(dev->dev.parent);
2175
2176 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2177 "gso_type=%d ip_summed=%d\n",
65e9d2fa
MM
2178 driver, dev ? &dev->features : &null_features,
2179 skb->sk ? &skb->sk->sk_route_caps : &null_features,
36c92474
BH
2180 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2181 skb_shinfo(skb)->gso_type, skb->ip_summed);
2182}
2183
1da177e4
LT
2184/*
2185 * Invalidate hardware checksum when packet is to be mangled, and
2186 * complete checksum manually on outgoing path.
2187 */
84fa7933 2188int skb_checksum_help(struct sk_buff *skb)
1da177e4 2189{
d3bc23e7 2190 __wsum csum;
663ead3b 2191 int ret = 0, offset;
1da177e4 2192
84fa7933 2193 if (skb->ip_summed == CHECKSUM_COMPLETE)
a430a43d
HX
2194 goto out_set_summed;
2195
2196 if (unlikely(skb_shinfo(skb)->gso_size)) {
36c92474
BH
2197 skb_warn_bad_offload(skb);
2198 return -EINVAL;
1da177e4
LT
2199 }
2200
cef401de
ED
2201 /* Before computing a checksum, we should make sure no frag could
2202 * be modified by an external entity: the checksum could be wrong.
2203 */
2204 if (skb_has_shared_frag(skb)) {
2205 ret = __skb_linearize(skb);
2206 if (ret)
2207 goto out;
2208 }
2209
55508d60 2210 offset = skb_checksum_start_offset(skb);
a030847e
HX
2211 BUG_ON(offset >= skb_headlen(skb));
2212 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2213
2214 offset += skb->csum_offset;
2215 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2216
2217 if (skb_cloned(skb) &&
2218 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1da177e4
LT
2219 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2220 if (ret)
2221 goto out;
2222 }
2223
a030847e 2224 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
a430a43d 2225out_set_summed:
1da177e4 2226 skb->ip_summed = CHECKSUM_NONE;
4ec93edb 2227out:
1da177e4
LT
2228 return ret;
2229}
d1b19dff 2230EXPORT_SYMBOL(skb_checksum_help);
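/* Editor's note: a hedged sketch (not part of dev.c). A driver whose
 * hardware cannot checksum a particular frame can resolve CHECKSUM_PARTIAL
 * in software exactly as the core path above does; demo_* is invented.
 */
static int demo_fixup_csum(struct sk_buff *skb)
{
	if (skb->ip_summed != CHECKSUM_PARTIAL)
		return 0;		/* nothing pending */

	/* Folds the pending checksum into the payload; on success
	 * skb->ip_summed is set to CHECKSUM_NONE.
	 */
	return skb_checksum_help(skb);
}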
1da177e4 2231
ec5f0615 2232__be16 skb_network_protocol(struct sk_buff *skb)
f6a78bfc 2233{
252e3346 2234 __be16 type = skb->protocol;
c80a8512 2235 int vlan_depth = ETH_HLEN;
f6a78bfc 2236
19acc327
PS
2237 /* Tunnel gso handlers can set protocol to ethernet. */
2238 if (type == htons(ETH_P_TEB)) {
2239 struct ethhdr *eth;
2240
2241 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2242 return 0;
2243
2244 eth = (struct ethhdr *)skb_mac_header(skb);
2245 type = eth->h_proto;
2246 }
2247
8ad227ff 2248 while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
c8d5bcd1 2249 struct vlan_hdr *vh;
7b9c6090 2250
c8d5bcd1 2251 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
ec5f0615 2252 return 0;
7b9c6090 2253
c8d5bcd1
JG
2254 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2255 type = vh->h_vlan_encapsulated_proto;
2256 vlan_depth += VLAN_HLEN;
7b9c6090
JG
2257 }
2258
ec5f0615
PS
2259 return type;
2260}
2261
2262/**
2263 * skb_mac_gso_segment - mac layer segmentation handler.
2264 * @skb: buffer to segment
2265 * @features: features for the output path (see dev->features)
2266 */
2267struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2268 netdev_features_t features)
2269{
2270 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2271 struct packet_offload *ptype;
2272 __be16 type = skb_network_protocol(skb);
2273
2274 if (unlikely(!type))
2275 return ERR_PTR(-EINVAL);
2276
f6a78bfc
HX
2277 __skb_pull(skb, skb->mac_len);
2278
2279 rcu_read_lock();
22061d80 2280 list_for_each_entry_rcu(ptype, &offload_base, list) {
f191a1d1 2281 if (ptype->type == type && ptype->callbacks.gso_segment) {
84fa7933 2282 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
05e8ef4a
PS
2283 int err;
2284
f191a1d1 2285 err = ptype->callbacks.gso_send_check(skb);
a430a43d
HX
2286 segs = ERR_PTR(err);
2287 if (err || skb_gso_ok(skb, features))
2288 break;
d56f90a7
ACM
2289 __skb_push(skb, (skb->data -
2290 skb_network_header(skb)));
a430a43d 2291 }
f191a1d1 2292 segs = ptype->callbacks.gso_segment(skb, features);
f6a78bfc
HX
2293 break;
2294 }
2295 }
2296 rcu_read_unlock();
2297
98e399f8 2298 __skb_push(skb, skb->data - skb_mac_header(skb));
576a30eb 2299
f6a78bfc
HX
2300 return segs;
2301}
05e8ef4a
PS
2302EXPORT_SYMBOL(skb_mac_gso_segment);
2303
2304
2305/* openvswitch calls this on rx path, so we need a different check.
2306 */
2307static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2308{
2309 if (tx_path)
2310 return skb->ip_summed != CHECKSUM_PARTIAL;
2311 else
2312 return skb->ip_summed == CHECKSUM_NONE;
2313}
2314
2315/**
2316 * __skb_gso_segment - Perform segmentation on skb.
2317 * @skb: buffer to segment
2318 * @features: features for the output path (see dev->features)
2319 * @tx_path: whether it is called in TX path
2320 *
2321 * This function segments the given skb and returns a list of segments.
2322 *
2323 * It may return NULL if the skb requires no segmentation. This is
2324 * only possible when GSO is used for verifying header integrity.
2325 */
2326struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2327 netdev_features_t features, bool tx_path)
2328{
2329 if (unlikely(skb_needs_check(skb, tx_path))) {
2330 int err;
2331
2332 skb_warn_bad_offload(skb);
2333
2334 if (skb_header_cloned(skb) &&
2335 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2336 return ERR_PTR(err);
2337 }
2338
68c33163 2339 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
05e8ef4a
PS
2340 skb_reset_mac_header(skb);
2341 skb_reset_mac_len(skb);
2342
2343 return skb_mac_gso_segment(skb, features);
2344}
12b0004d 2345EXPORT_SYMBOL(__skb_gso_segment);
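/* Editor's note: a hedged sketch (not part of dev.c) of a caller that
 * cannot hand a GSO skb to hardware, so it segments in software and emits
 * each piece through dev_queue_xmit(). demo_* is an invented name;
 * skb_gso_segment() is the usual wrapper around __skb_gso_segment().
 */
static int demo_segment_and_xmit(struct sk_buff *skb,
				 netdev_features_t features)
{
	struct sk_buff *segs, *nskb;

	segs = skb_gso_segment(skb, features);
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)
		return dev_queue_xmit(skb);	/* no segmentation needed */

	consume_skb(skb);			/* original now superseded */
	while (segs) {
		nskb = segs;
		segs = segs->next;
		nskb->next = NULL;
		dev_queue_xmit(nskb);
	}
	return 0;
}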
f6a78bfc 2346
fb286bb2
HX
2347/* Take action when hardware reception checksum errors are detected. */
2348#ifdef CONFIG_BUG
2349void netdev_rx_csum_fault(struct net_device *dev)
2350{
2351 if (net_ratelimit()) {
7b6cd1ce 2352 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
fb286bb2
HX
2353 dump_stack();
2354 }
2355}
2356EXPORT_SYMBOL(netdev_rx_csum_fault);
2357#endif
2358
1da177e4
LT
2359/* Actually, we should eliminate this check as soon as we know that:
2360 * 1. An IOMMU is present and is able to map all of the memory.
2361 * 2. No high memory really exists on this machine.
2362 */
2363
9092c658 2364static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1da177e4 2365{
3d3a8533 2366#ifdef CONFIG_HIGHMEM
1da177e4 2367 int i;
5acbbd42 2368 if (!(dev->features & NETIF_F_HIGHDMA)) {
ea2ab693
IC
2369 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2370 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2371 if (PageHighMem(skb_frag_page(frag)))
5acbbd42 2372 return 1;
ea2ab693 2373 }
5acbbd42 2374 }
1da177e4 2375
5acbbd42
FT
2376 if (PCI_DMA_BUS_IS_PHYS) {
2377 struct device *pdev = dev->dev.parent;
1da177e4 2378
9092c658
ED
2379 if (!pdev)
2380 return 0;
5acbbd42 2381 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
ea2ab693
IC
2382 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2383 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
5acbbd42
FT
2384 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2385 return 1;
2386 }
2387 }
3d3a8533 2388#endif
1da177e4
LT
2389 return 0;
2390}
1da177e4 2391
f6a78bfc
HX
2392struct dev_gso_cb {
2393 void (*destructor)(struct sk_buff *skb);
2394};
2395
2396#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2397
2398static void dev_gso_skb_destructor(struct sk_buff *skb)
2399{
2400 struct dev_gso_cb *cb;
2401
2402 do {
2403 struct sk_buff *nskb = skb->next;
2404
2405 skb->next = nskb->next;
2406 nskb->next = NULL;
2407 kfree_skb(nskb);
2408 } while (skb->next);
2409
2410 cb = DEV_GSO_CB(skb);
2411 if (cb->destructor)
2412 cb->destructor(skb);
2413}
2414
2415/**
2416 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2417 * @skb: buffer to segment
91ecb63c 2418 * @features: device features as applicable to this skb
f6a78bfc
HX
2419 *
2420 * This function segments the given skb and stores the list of segments
2421 * in skb->next.
2422 */
c8f44aff 2423static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
f6a78bfc 2424{
f6a78bfc 2425 struct sk_buff *segs;
576a30eb
HX
2426
2427 segs = skb_gso_segment(skb, features);
2428
2429 /* Verifying header integrity only. */
2430 if (!segs)
2431 return 0;
f6a78bfc 2432
801678c5 2433 if (IS_ERR(segs))
f6a78bfc
HX
2434 return PTR_ERR(segs);
2435
2436 skb->next = segs;
2437 DEV_GSO_CB(skb)->destructor = skb->destructor;
2438 skb->destructor = dev_gso_skb_destructor;
2439
2440 return 0;
2441}
2442
c8f44aff
MM
2443static netdev_features_t harmonize_features(struct sk_buff *skb,
2444 __be16 protocol, netdev_features_t features)
f01a5236 2445{
c0d680e5
EC
2446 if (skb->ip_summed != CHECKSUM_NONE &&
2447 !can_checksum_protocol(features, protocol)) {
f01a5236 2448 features &= ~NETIF_F_ALL_CSUM;
f01a5236
JG
2449 } else if (illegal_highdma(skb->dev, skb)) {
2450 features &= ~NETIF_F_SG;
2451 }
2452
2453 return features;
2454}
2455
c8f44aff 2456netdev_features_t netif_skb_features(struct sk_buff *skb)
58e998c6
JG
2457{
2458 __be16 protocol = skb->protocol;
c8f44aff 2459 netdev_features_t features = skb->dev->features;
58e998c6 2460
30b678d8
BH
2461 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2462 features &= ~NETIF_F_GSO_MASK;
2463
8ad227ff 2464 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
58e998c6
JG
2465 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2466 protocol = veh->h_vlan_encapsulated_proto;
f01a5236
JG
2467 } else if (!vlan_tx_tag_present(skb)) {
2468 return harmonize_features(skb, protocol, features);
2469 }
58e998c6 2470
8ad227ff
PM
2471 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2472 NETIF_F_HW_VLAN_STAG_TX);
f01a5236 2473
8ad227ff 2474 if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) {
f01a5236
JG
2475 return harmonize_features(skb, protocol, features);
2476 } else {
2477 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
8ad227ff
PM
2478 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2479 NETIF_F_HW_VLAN_STAG_TX;
f01a5236
JG
2480 return harmonize_features(skb, protocol, features);
2481 }
58e998c6 2482}
f01a5236 2483EXPORT_SYMBOL(netif_skb_features);
58e998c6 2484
6afff0ca
JF
2485/*
2486 * Returns true if either:
2487 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
d1a53dfd 2488 * 2. skb is fragmented and the device does not support SG.
6afff0ca
JF
2489 */
2490static inline int skb_needs_linearize(struct sk_buff *skb,
6708c9e5 2491 netdev_features_t features)
6afff0ca 2492{
02932ce9
JG
2493 return skb_is_nonlinear(skb) &&
2494 ((skb_has_frag_list(skb) &&
2495 !(features & NETIF_F_FRAGLIST)) ||
e1e78db6 2496 (skb_shinfo(skb)->nr_frags &&
02932ce9 2497 !(features & NETIF_F_SG)));
6afff0ca
JF
2498}
2499
fd2ea0a7
DM
2500int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2501 struct netdev_queue *txq)
f6a78bfc 2502{
00829823 2503 const struct net_device_ops *ops = dev->netdev_ops;
572a9d7b 2504 int rc = NETDEV_TX_OK;
ec764bf0 2505 unsigned int skb_len;
00829823 2506
f6a78bfc 2507 if (likely(!skb->next)) {
c8f44aff 2508 netdev_features_t features;
fc741216 2509
93f154b5 2510 /*
25985edc 2511 * If device doesn't need skb->dst, release it right now while
93f154b5
ED
2512 * it's hot in this CPU's cache
2513 */
adf30907
ED
2514 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2515 skb_dst_drop(skb);
2516
fc741216
JG
2517 features = netif_skb_features(skb);
2518
7b9c6090 2519 if (vlan_tx_tag_present(skb) &&
86a9bad3
PM
2520 !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2521 skb = __vlan_put_tag(skb, skb->vlan_proto,
2522 vlan_tx_tag_get(skb));
7b9c6090
JG
2523 if (unlikely(!skb))
2524 goto out;
2525
2526 skb->vlan_tci = 0;
2527 }
2528
fc70fb64
AD
2529 /* If encapsulation offload request, verify we are testing
2530 * hardware encapsulation features instead of standard
2531 * features for the netdev
2532 */
2533 if (skb->encapsulation)
2534 features &= dev->hw_enc_features;
2535
fc741216 2536 if (netif_needs_gso(skb, features)) {
91ecb63c 2537 if (unlikely(dev_gso_segment(skb, features)))
9ccb8975
DM
2538 goto out_kfree_skb;
2539 if (skb->next)
2540 goto gso;
6afff0ca 2541 } else {
02932ce9 2542 if (skb_needs_linearize(skb, features) &&
6afff0ca
JF
2543 __skb_linearize(skb))
2544 goto out_kfree_skb;
2545
2546 /* If packet is not checksummed and device does not
2547 * support checksumming for this protocol, complete
2548 * checksumming here.
2549 */
2550 if (skb->ip_summed == CHECKSUM_PARTIAL) {
fc70fb64
AD
2551 if (skb->encapsulation)
2552 skb_set_inner_transport_header(skb,
2553 skb_checksum_start_offset(skb));
2554 else
2555 skb_set_transport_header(skb,
2556 skb_checksum_start_offset(skb));
03634668 2557 if (!(features & NETIF_F_ALL_CSUM) &&
6afff0ca
JF
2558 skb_checksum_help(skb))
2559 goto out_kfree_skb;
2560 }
9ccb8975
DM
2561 }
2562
b40863c6
ED
2563 if (!list_empty(&ptype_all))
2564 dev_queue_xmit_nit(skb, dev);
2565
ec764bf0 2566 skb_len = skb->len;
ac45f602 2567 rc = ops->ndo_start_xmit(skb, dev);
ec764bf0 2568 trace_net_dev_xmit(skb, rc, dev, skb_len);
ec634fe3 2569 if (rc == NETDEV_TX_OK)
08baf561 2570 txq_trans_update(txq);
ac45f602 2571 return rc;
f6a78bfc
HX
2572 }
2573
576a30eb 2574gso:
f6a78bfc
HX
2575 do {
2576 struct sk_buff *nskb = skb->next;
f6a78bfc
HX
2577
2578 skb->next = nskb->next;
2579 nskb->next = NULL;
068a2de5 2580
b40863c6
ED
2581 if (!list_empty(&ptype_all))
2582 dev_queue_xmit_nit(nskb, dev);
2583
ec764bf0 2584 skb_len = nskb->len;
00829823 2585 rc = ops->ndo_start_xmit(nskb, dev);
ec764bf0 2586 trace_net_dev_xmit(nskb, rc, dev, skb_len);
ec634fe3 2587 if (unlikely(rc != NETDEV_TX_OK)) {
572a9d7b
PM
2588 if (rc & ~NETDEV_TX_MASK)
2589 goto out_kfree_gso_skb;
f54d9e8d 2590 nskb->next = skb->next;
f6a78bfc
HX
2591 skb->next = nskb;
2592 return rc;
2593 }
08baf561 2594 txq_trans_update(txq);
73466498 2595 if (unlikely(netif_xmit_stopped(txq) && skb->next))
f54d9e8d 2596 return NETDEV_TX_BUSY;
f6a78bfc 2597 } while (skb->next);
4ec93edb 2598
572a9d7b 2599out_kfree_gso_skb:
0c772159 2600 if (likely(skb->next == NULL)) {
572a9d7b 2601 skb->destructor = DEV_GSO_CB(skb)->destructor;
0c772159
SS
2602 consume_skb(skb);
2603 return rc;
2604 }
f6a78bfc
HX
2605out_kfree_skb:
2606 kfree_skb(skb);
7b9c6090 2607out:
572a9d7b 2608 return rc;
f6a78bfc
HX
2609}
2610
1def9238
ED
2611static void qdisc_pkt_len_init(struct sk_buff *skb)
2612{
2613 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2614
2615 qdisc_skb_cb(skb)->pkt_len = skb->len;
2616
2617 /* To get a more precise estimate of the bytes sent on the wire,
2618 * we add the header size of every segment to pkt_len
2619 */
2620 if (shinfo->gso_size) {
757b8b1d 2621 unsigned int hdr_len;
15e5a030 2622 u16 gso_segs = shinfo->gso_segs;
1def9238 2623
757b8b1d
ED
2624 /* mac layer + network layer */
2625 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2626
2627 /* + transport layer */
1def9238
ED
2628 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2629 hdr_len += tcp_hdrlen(skb);
2630 else
2631 hdr_len += sizeof(struct udphdr);
15e5a030
JW
2632
2633 if (shinfo->gso_type & SKB_GSO_DODGY)
2634 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2635 shinfo->gso_size);
2636
2637 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
1def9238
ED
2638 }
2639}
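/* Editor's note: a worked example of the estimate above, with illustrative
 * numbers only. For a SKB_GSO_TCPV4 packet with skb->len = 14546,
 * gso_size = 1448, gso_segs = 10 and hdr_len = 66 (14 Ethernet + 20 IPv4 +
 * 32 TCP with options), the on-wire estimate becomes
 * pkt_len = 14546 + (10 - 1) * 66 = 15140 bytes.
 */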
2640
bbd8a0d3
KK
2641static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2642 struct net_device *dev,
2643 struct netdev_queue *txq)
2644{
2645 spinlock_t *root_lock = qdisc_lock(q);
a2da570d 2646 bool contended;
bbd8a0d3
KK
2647 int rc;
2648
1def9238 2649 qdisc_pkt_len_init(skb);
a2da570d 2650 qdisc_calculate_pkt_len(skb, q);
79640a4c
ED
2651 /*
2652 * Heuristic to force contended enqueues to serialize on a
2653 * separate lock before trying to get qdisc main lock.
2654 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2655 * and dequeue packets faster.
2656 */
a2da570d 2657 contended = qdisc_is_running(q);
79640a4c
ED
2658 if (unlikely(contended))
2659 spin_lock(&q->busylock);
2660
bbd8a0d3
KK
2661 spin_lock(root_lock);
2662 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2663 kfree_skb(skb);
2664 rc = NET_XMIT_DROP;
2665 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
bc135b23 2666 qdisc_run_begin(q)) {
bbd8a0d3
KK
2667 /*
2668 * This is a work-conserving queue; there are no old skbs
2669 * waiting to be sent out; and the qdisc is not running -
2670 * xmit the skb directly.
2671 */
7fee226a
ED
2672 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2673 skb_dst_force(skb);
bfe0d029 2674
bfe0d029
ED
2675 qdisc_bstats_update(q, skb);
2676
79640a4c
ED
2677 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2678 if (unlikely(contended)) {
2679 spin_unlock(&q->busylock);
2680 contended = false;
2681 }
bbd8a0d3 2682 __qdisc_run(q);
79640a4c 2683 } else
bc135b23 2684 qdisc_run_end(q);
bbd8a0d3
KK
2685
2686 rc = NET_XMIT_SUCCESS;
2687 } else {
7fee226a 2688 skb_dst_force(skb);
a2da570d 2689 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
79640a4c
ED
2690 if (qdisc_run_begin(q)) {
2691 if (unlikely(contended)) {
2692 spin_unlock(&q->busylock);
2693 contended = false;
2694 }
2695 __qdisc_run(q);
2696 }
bbd8a0d3
KK
2697 }
2698 spin_unlock(root_lock);
79640a4c
ED
2699 if (unlikely(contended))
2700 spin_unlock(&q->busylock);
bbd8a0d3
KK
2701 return rc;
2702}
2703
5bc1421e
NH
2704#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2705static void skb_update_prio(struct sk_buff *skb)
2706{
6977a79d 2707 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
5bc1421e 2708
91c68ce2
ED
2709 if (!skb->priority && skb->sk && map) {
2710 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2711
2712 if (prioidx < map->priomap_len)
2713 skb->priority = map->priomap[prioidx];
2714 }
5bc1421e
NH
2715}
2716#else
2717#define skb_update_prio(skb)
2718#endif
2719
745e20f1 2720static DEFINE_PER_CPU(int, xmit_recursion);
11a766ce 2721#define RECURSION_LIMIT 10
745e20f1 2722
95603e22
MM
2723/**
2724 * dev_loopback_xmit - loop back @skb
2725 * @skb: buffer to transmit
2726 */
2727int dev_loopback_xmit(struct sk_buff *skb)
2728{
2729 skb_reset_mac_header(skb);
2730 __skb_pull(skb, skb_network_offset(skb));
2731 skb->pkt_type = PACKET_LOOPBACK;
2732 skb->ip_summed = CHECKSUM_UNNECESSARY;
2733 WARN_ON(!skb_dst(skb));
2734 skb_dst_force(skb);
2735 netif_rx_ni(skb);
2736 return 0;
2737}
2738EXPORT_SYMBOL(dev_loopback_xmit);
2739
d29f749e
DJ
2740/**
2741 * dev_queue_xmit - transmit a buffer
2742 * @skb: buffer to transmit
2743 *
2744 * Queue a buffer for transmission to a network device. The caller must
2745 * have set the device and priority and built the buffer before calling
2746 * this function. The function can be called from an interrupt.
2747 *
2748 * A negative errno code is returned on a failure. A success does not
2749 * guarantee the frame will be transmitted as it may be dropped due
2750 * to congestion or traffic shaping.
2751 *
2752 * -----------------------------------------------------------------------------------
2753 * I notice this method can also return errors from the queue disciplines,
2754 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2755 * be positive.
2756 *
2757 * Regardless of the return value, the skb is consumed, so it is currently
2758 * difficult to retry a send to this method. (You can bump the ref count
2759 * before sending to hold a reference for retry if you are careful.)
2760 *
2761 * When calling this method, interrupts MUST be enabled. This is because
2762 * the BH enable code must have IRQs enabled so that it will not deadlock.
2763 * --BLG
2764 */
1da177e4
LT
2765int dev_queue_xmit(struct sk_buff *skb)
2766{
2767 struct net_device *dev = skb->dev;
dc2b4847 2768 struct netdev_queue *txq;
1da177e4
LT
2769 struct Qdisc *q;
2770 int rc = -ENOMEM;
2771
6d1ccff6
ED
2772 skb_reset_mac_header(skb);
2773
4ec93edb
YH
2774 /* Disable soft irqs for various locks below. Also
2775 * stops preemption for RCU.
1da177e4 2776 */
4ec93edb 2777 rcu_read_lock_bh();
1da177e4 2778
5bc1421e
NH
2779 skb_update_prio(skb);
2780
8c4c49df 2781 txq = netdev_pick_tx(dev, skb);
a898def2 2782 q = rcu_dereference_bh(txq->qdisc);
37437bb2 2783
1da177e4 2784#ifdef CONFIG_NET_CLS_ACT
d1b19dff 2785 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1da177e4 2786#endif
cf66ba58 2787 trace_net_dev_queue(skb);
1da177e4 2788 if (q->enqueue) {
bbd8a0d3 2789 rc = __dev_xmit_skb(skb, q, dev, txq);
37437bb2 2790 goto out;
1da177e4
LT
2791 }
2792
2793 /* The device has no queue. Common case for software devices:
2794 loopback, all the sorts of tunnels...
2795
932ff279
HX
2796 Really, it is unlikely that netif_tx_lock protection is necessary
2797 here. (e.g. loopback and IP tunnels are clean, ignoring statistics
1da177e4
LT
2798 counters.)
2799 However, it is possible that they rely on the protection
2800 we provide here.
2801
2802 Check this and take the lock. It is not prone to deadlocks.
2803 Either shoot the noqueue qdisc, it is even simpler 8)
2804 */
2805 if (dev->flags & IFF_UP) {
2806 int cpu = smp_processor_id(); /* ok because BHs are off */
2807
c773e847 2808 if (txq->xmit_lock_owner != cpu) {
1da177e4 2809
745e20f1
ED
2810 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2811 goto recursion_alert;
2812
c773e847 2813 HARD_TX_LOCK(dev, txq, cpu);
1da177e4 2814
73466498 2815 if (!netif_xmit_stopped(txq)) {
745e20f1 2816 __this_cpu_inc(xmit_recursion);
572a9d7b 2817 rc = dev_hard_start_xmit(skb, dev, txq);
745e20f1 2818 __this_cpu_dec(xmit_recursion);
572a9d7b 2819 if (dev_xmit_complete(rc)) {
c773e847 2820 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
2821 goto out;
2822 }
2823 }
c773e847 2824 HARD_TX_UNLOCK(dev, txq);
e87cc472
JP
2825 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2826 dev->name);
1da177e4
LT
2827 } else {
2828 /* Recursion is detected! It is possible,
745e20f1
ED
2829 * unfortunately
2830 */
2831recursion_alert:
e87cc472
JP
2832 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2833 dev->name);
1da177e4
LT
2834 }
2835 }
2836
2837 rc = -ENETDOWN;
d4828d85 2838 rcu_read_unlock_bh();
1da177e4 2839
1da177e4
LT
2840 kfree_skb(skb);
2841 return rc;
2842out:
d4828d85 2843 rcu_read_unlock_bh();
1da177e4
LT
2844 return rc;
2845}
d1b19dff 2846EXPORT_SYMBOL(dev_queue_xmit);
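/* Editor's note: a hedged sketch (not part of dev.c) of the caller contract
 * described above: the skb is fully built, skb->dev and skb->priority are
 * set before the call, and the skb must not be referenced afterwards
 * because it is consumed whatever the return value. demo_* is invented.
 */
static void demo_xmit(struct sk_buff *skb, struct net_device *dev)
{
	skb->dev = dev;
	skb->priority = TC_PRIO_CONTROL;	/* illustrative priority */

	if (dev_queue_xmit(skb) != NET_XMIT_SUCCESS)
		pr_debug("demo: frame dropped or shaped by the qdisc\n");
	/* no further use of skb here, even on error */
}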
1da177e4
LT
2847
2848
2849/*=======================================================================
2850 Receiver routines
2851 =======================================================================*/
2852
6b2bedc3 2853int netdev_max_backlog __read_mostly = 1000;
c9e6bc64
ED
2854EXPORT_SYMBOL(netdev_max_backlog);
2855
3b098e2d 2856int netdev_tstamp_prequeue __read_mostly = 1;
6b2bedc3
SH
2857int netdev_budget __read_mostly = 300;
2858int weight_p __read_mostly = 64; /* old backlog weight */
1da177e4 2859
eecfd7c4
ED
2860/* Called with irq disabled */
2861static inline void ____napi_schedule(struct softnet_data *sd,
2862 struct napi_struct *napi)
2863{
2864 list_add_tail(&napi->poll_list, &sd->poll_list);
2865 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2866}
2867
bfb564e7
KK
2868#ifdef CONFIG_RPS
2869
2870/* One global table that all flow-based protocols share. */
6e3f7faf 2871struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
bfb564e7
KK
2872EXPORT_SYMBOL(rps_sock_flow_table);
2873
c5905afb 2874struct static_key rps_needed __read_mostly;
adc9300e 2875
c445477d
BH
2876static struct rps_dev_flow *
2877set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2878 struct rps_dev_flow *rflow, u16 next_cpu)
2879{
09994d1b 2880 if (next_cpu != RPS_NO_CPU) {
c445477d
BH
2881#ifdef CONFIG_RFS_ACCEL
2882 struct netdev_rx_queue *rxqueue;
2883 struct rps_dev_flow_table *flow_table;
2884 struct rps_dev_flow *old_rflow;
2885 u32 flow_id;
2886 u16 rxq_index;
2887 int rc;
2888
2889 /* Should we steer this flow to a different hardware queue? */
69a19ee6
BH
2890 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2891 !(dev->features & NETIF_F_NTUPLE))
c445477d
BH
2892 goto out;
2893 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2894 if (rxq_index == skb_get_rx_queue(skb))
2895 goto out;
2896
2897 rxqueue = dev->_rx + rxq_index;
2898 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2899 if (!flow_table)
2900 goto out;
2901 flow_id = skb->rxhash & flow_table->mask;
2902 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2903 rxq_index, flow_id);
2904 if (rc < 0)
2905 goto out;
2906 old_rflow = rflow;
2907 rflow = &flow_table->flows[flow_id];
c445477d
BH
2908 rflow->filter = rc;
2909 if (old_rflow->filter == rflow->filter)
2910 old_rflow->filter = RPS_NO_FILTER;
2911 out:
2912#endif
2913 rflow->last_qtail =
09994d1b 2914 per_cpu(softnet_data, next_cpu).input_queue_head;
c445477d
BH
2915 }
2916
09994d1b 2917 rflow->cpu = next_cpu;
c445477d
BH
2918 return rflow;
2919}
2920
bfb564e7
KK
2921/*
2922 * get_rps_cpu is called from netif_receive_skb and returns the target
2923 * CPU from the RPS map of the receiving queue for a given skb.
2924 * rcu_read_lock must be held on entry.
2925 */
2926static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2927 struct rps_dev_flow **rflowp)
2928{
2929 struct netdev_rx_queue *rxqueue;
6e3f7faf 2930 struct rps_map *map;
bfb564e7
KK
2931 struct rps_dev_flow_table *flow_table;
2932 struct rps_sock_flow_table *sock_flow_table;
2933 int cpu = -1;
2934 u16 tcpu;
2935
2936 if (skb_rx_queue_recorded(skb)) {
2937 u16 index = skb_get_rx_queue(skb);
62fe0b40
BH
2938 if (unlikely(index >= dev->real_num_rx_queues)) {
2939 WARN_ONCE(dev->real_num_rx_queues > 1,
2940 "%s received packet on queue %u, but number "
2941 "of RX queues is %u\n",
2942 dev->name, index, dev->real_num_rx_queues);
bfb564e7
KK
2943 goto done;
2944 }
2945 rxqueue = dev->_rx + index;
2946 } else
2947 rxqueue = dev->_rx;
2948
6e3f7faf
ED
2949 map = rcu_dereference(rxqueue->rps_map);
2950 if (map) {
85875236 2951 if (map->len == 1 &&
33d480ce 2952 !rcu_access_pointer(rxqueue->rps_flow_table)) {
6febfca9
CG
2953 tcpu = map->cpus[0];
2954 if (cpu_online(tcpu))
2955 cpu = tcpu;
2956 goto done;
2957 }
33d480ce 2958 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
bfb564e7 2959 goto done;
6febfca9 2960 }
bfb564e7 2961
2d47b459 2962 skb_reset_network_header(skb);
bfb564e7
KK
2963 if (!skb_get_rxhash(skb))
2964 goto done;
2965
fec5e652
TH
2966 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2967 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2968 if (flow_table && sock_flow_table) {
2969 u16 next_cpu;
2970 struct rps_dev_flow *rflow;
2971
2972 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2973 tcpu = rflow->cpu;
2974
2975 next_cpu = sock_flow_table->ents[skb->rxhash &
2976 sock_flow_table->mask];
2977
2978 /*
2979 * If the desired CPU (where last recvmsg was done) is
2980 * different from current CPU (one in the rx-queue flow
2981 * table entry), switch if one of the following holds:
2982 * - Current CPU is unset (equal to RPS_NO_CPU).
2983 * - Current CPU is offline.
2984 * - The current CPU's queue tail has advanced beyond the
2985 * last packet that was enqueued using this table entry.
2986 * This guarantees that all previous packets for the flow
2987 * have been dequeued, thus preserving in order delivery.
2988 */
2989 if (unlikely(tcpu != next_cpu) &&
2990 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2991 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
baefa31d
TH
2992 rflow->last_qtail)) >= 0)) {
2993 tcpu = next_cpu;
c445477d 2994 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
baefa31d 2995 }
c445477d 2996
fec5e652
TH
2997 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2998 *rflowp = rflow;
2999 cpu = tcpu;
3000 goto done;
3001 }
3002 }
3003
0a9627f2 3004 if (map) {
fec5e652 3005 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
0a9627f2
TH
3006
3007 if (cpu_online(tcpu)) {
3008 cpu = tcpu;
3009 goto done;
3010 }
3011 }
3012
3013done:
0a9627f2
TH
3014 return cpu;
3015}
3016
c445477d
BH
3017#ifdef CONFIG_RFS_ACCEL
3018
3019/**
3020 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3021 * @dev: Device on which the filter was set
3022 * @rxq_index: RX queue index
3023 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3024 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3025 *
3026 * Drivers that implement ndo_rx_flow_steer() should periodically call
3027 * this function for each installed filter and remove the filters for
3028 * which it returns %true.
3029 */
3030bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3031 u32 flow_id, u16 filter_id)
3032{
3033 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3034 struct rps_dev_flow_table *flow_table;
3035 struct rps_dev_flow *rflow;
3036 bool expire = true;
3037 int cpu;
3038
3039 rcu_read_lock();
3040 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3041 if (flow_table && flow_id <= flow_table->mask) {
3042 rflow = &flow_table->flows[flow_id];
3043 cpu = ACCESS_ONCE(rflow->cpu);
3044 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3045 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3046 rflow->last_qtail) <
3047 (int)(10 * flow_table->mask)))
3048 expire = false;
3049 }
3050 rcu_read_unlock();
3051 return expire;
3052}
3053EXPORT_SYMBOL(rps_may_expire_flow);
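/* Editor's note: a hedged sketch (not part of dev.c) of the periodic scan
 * the comment above asks ndo_rx_flow_steer() implementers to run. The
 * demo_filter table and its fields are invented; only rps_may_expire_flow()
 * and its argument order are taken from the interface above.
 */
struct demo_filter {
	bool inserted;
	u16 rxq_index;		/* queue the filter steers to            */
	u32 flow_id;		/* flow_id passed to ndo_rx_flow_steer()  */
	u16 filter_id;		/* value returned by ndo_rx_flow_steer()  */
};

static void demo_expire_filters(struct net_device *dev,
				struct demo_filter *tbl, unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (!tbl[i].inserted)
			continue;
		if (rps_may_expire_flow(dev, tbl[i].rxq_index,
					tbl[i].flow_id, tbl[i].filter_id)) {
			/* ...remove the hardware filter here... */
			tbl[i].inserted = false;
		}
	}
}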
3054
3055#endif /* CONFIG_RFS_ACCEL */
3056
0a9627f2 3057/* Called from hardirq (IPI) context */
e36fa2f7 3058static void rps_trigger_softirq(void *data)
0a9627f2 3059{
e36fa2f7
ED
3060 struct softnet_data *sd = data;
3061
eecfd7c4 3062 ____napi_schedule(sd, &sd->backlog);
dee42870 3063 sd->received_rps++;
0a9627f2 3064}
e36fa2f7 3065
fec5e652 3066#endif /* CONFIG_RPS */
0a9627f2 3067
e36fa2f7
ED
3068/*
3069 * Check if this softnet_data structure belongs to another CPU.
3070 * If yes, queue it to our IPI list and return 1
3071 * If no, return 0
3072 */
3073static int rps_ipi_queued(struct softnet_data *sd)
3074{
3075#ifdef CONFIG_RPS
3076 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3077
3078 if (sd != mysd) {
3079 sd->rps_ipi_next = mysd->rps_ipi_list;
3080 mysd->rps_ipi_list = sd;
3081
3082 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3083 return 1;
3084 }
3085#endif /* CONFIG_RPS */
3086 return 0;
3087}
3088
99bbc707
WB
3089#ifdef CONFIG_NET_FLOW_LIMIT
3090int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3091#endif
3092
3093static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3094{
3095#ifdef CONFIG_NET_FLOW_LIMIT
3096 struct sd_flow_limit *fl;
3097 struct softnet_data *sd;
3098 unsigned int old_flow, new_flow;
3099
3100 if (qlen < (netdev_max_backlog >> 1))
3101 return false;
3102
3103 sd = &__get_cpu_var(softnet_data);
3104
3105 rcu_read_lock();
3106 fl = rcu_dereference(sd->flow_limit);
3107 if (fl) {
3108 new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
3109 old_flow = fl->history[fl->history_head];
3110 fl->history[fl->history_head] = new_flow;
3111
3112 fl->history_head++;
3113 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3114
3115 if (likely(fl->buckets[old_flow]))
3116 fl->buckets[old_flow]--;
3117
3118 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3119 fl->count++;
3120 rcu_read_unlock();
3121 return true;
3122 }
3123 }
3124 rcu_read_unlock();
3125#endif
3126 return false;
3127}
3128
0a9627f2
TH
3129/*
3130 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3131 * queue (may be a remote CPU queue).
3132 */
fec5e652
TH
3133static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3134 unsigned int *qtail)
0a9627f2 3135{
e36fa2f7 3136 struct softnet_data *sd;
0a9627f2 3137 unsigned long flags;
99bbc707 3138 unsigned int qlen;
0a9627f2 3139
e36fa2f7 3140 sd = &per_cpu(softnet_data, cpu);
0a9627f2
TH
3141
3142 local_irq_save(flags);
0a9627f2 3143
e36fa2f7 3144 rps_lock(sd);
99bbc707
WB
3145 qlen = skb_queue_len(&sd->input_pkt_queue);
3146 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
6e7676c1 3147 if (skb_queue_len(&sd->input_pkt_queue)) {
0a9627f2 3148enqueue:
e36fa2f7 3149 __skb_queue_tail(&sd->input_pkt_queue, skb);
76cc8b13 3150 input_queue_tail_incr_save(sd, qtail);
e36fa2f7 3151 rps_unlock(sd);
152102c7 3152 local_irq_restore(flags);
0a9627f2
TH
3153 return NET_RX_SUCCESS;
3154 }
3155
ebda37c2
ED
3156 /* Schedule NAPI for backlog device
3157 * We can use a non-atomic operation since we own the queue lock
3158 */
3159 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
e36fa2f7 3160 if (!rps_ipi_queued(sd))
eecfd7c4 3161 ____napi_schedule(sd, &sd->backlog);
0a9627f2
TH
3162 }
3163 goto enqueue;
3164 }
3165
dee42870 3166 sd->dropped++;
e36fa2f7 3167 rps_unlock(sd);
0a9627f2 3168
0a9627f2
TH
3169 local_irq_restore(flags);
3170
caf586e5 3171 atomic_long_inc(&skb->dev->rx_dropped);
0a9627f2
TH
3172 kfree_skb(skb);
3173 return NET_RX_DROP;
3174}
1da177e4 3175
1da177e4
LT
3176/**
3177 * netif_rx - post buffer to the network code
3178 * @skb: buffer to post
3179 *
3180 * This function receives a packet from a device driver and queues it for
3181 * the upper (protocol) levels to process. It always succeeds. The buffer
3182 * may be dropped during processing for congestion control or by the
3183 * protocol layers.
3184 *
3185 * return values:
3186 * NET_RX_SUCCESS (no congestion)
1da177e4
LT
3187 * NET_RX_DROP (packet was dropped)
3188 *
3189 */
3190
3191int netif_rx(struct sk_buff *skb)
3192{
b0e28f1e 3193 int ret;
1da177e4
LT
3194
3195 /* if netpoll wants it, pretend we never saw it */
3196 if (netpoll_rx(skb))
3197 return NET_RX_DROP;
3198
588f0330 3199 net_timestamp_check(netdev_tstamp_prequeue, skb);
1da177e4 3200
cf66ba58 3201 trace_netif_rx(skb);
df334545 3202#ifdef CONFIG_RPS
c5905afb 3203 if (static_key_false(&rps_needed)) {
fec5e652 3204 struct rps_dev_flow voidflow, *rflow = &voidflow;
b0e28f1e
ED
3205 int cpu;
3206
cece1945 3207 preempt_disable();
b0e28f1e 3208 rcu_read_lock();
fec5e652
TH
3209
3210 cpu = get_rps_cpu(skb->dev, skb, &rflow);
b0e28f1e
ED
3211 if (cpu < 0)
3212 cpu = smp_processor_id();
fec5e652
TH
3213
3214 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3215
b0e28f1e 3216 rcu_read_unlock();
cece1945 3217 preempt_enable();
adc9300e
ED
3218 } else
3219#endif
fec5e652
TH
3220 {
3221 unsigned int qtail;
3222 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3223 put_cpu();
3224 }
b0e28f1e 3225 return ret;
1da177e4 3226}
d1b19dff 3227EXPORT_SYMBOL(netif_rx);
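/* Editor's note: a hedged sketch (not part of dev.c) of the classic
 * non-NAPI receive path that ends in netif_rx(); "data"/"len" stand for a
 * frame already copied out of device memory and demo_* is invented.
 */
static void demo_rx_one(struct net_device *dev, const void *data,
			unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, len), data, len);		/* copy the frame     */
	skb->protocol = eth_type_trans(skb, dev);	/* also sets skb->dev */

	netif_rx(skb);		/* queue for the protocol layers */
}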
1da177e4
LT
3228
3229int netif_rx_ni(struct sk_buff *skb)
3230{
3231 int err;
3232
3233 preempt_disable();
3234 err = netif_rx(skb);
3235 if (local_softirq_pending())
3236 do_softirq();
3237 preempt_enable();
3238
3239 return err;
3240}
1da177e4
LT
3241EXPORT_SYMBOL(netif_rx_ni);
3242
1da177e4
LT
3243static void net_tx_action(struct softirq_action *h)
3244{
3245 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3246
3247 if (sd->completion_queue) {
3248 struct sk_buff *clist;
3249
3250 local_irq_disable();
3251 clist = sd->completion_queue;
3252 sd->completion_queue = NULL;
3253 local_irq_enable();
3254
3255 while (clist) {
3256 struct sk_buff *skb = clist;
3257 clist = clist->next;
3258
547b792c 3259 WARN_ON(atomic_read(&skb->users));
07dc22e7 3260 trace_kfree_skb(skb, net_tx_action);
1da177e4
LT
3261 __kfree_skb(skb);
3262 }
3263 }
3264
3265 if (sd->output_queue) {
37437bb2 3266 struct Qdisc *head;
1da177e4
LT
3267
3268 local_irq_disable();
3269 head = sd->output_queue;
3270 sd->output_queue = NULL;
a9cbd588 3271 sd->output_queue_tailp = &sd->output_queue;
1da177e4
LT
3272 local_irq_enable();
3273
3274 while (head) {
37437bb2
DM
3275 struct Qdisc *q = head;
3276 spinlock_t *root_lock;
3277
1da177e4
LT
3278 head = head->next_sched;
3279
5fb66229 3280 root_lock = qdisc_lock(q);
37437bb2 3281 if (spin_trylock(root_lock)) {
def82a1d
JP
3282 smp_mb__before_clear_bit();
3283 clear_bit(__QDISC_STATE_SCHED,
3284 &q->state);
37437bb2
DM
3285 qdisc_run(q);
3286 spin_unlock(root_lock);
1da177e4 3287 } else {
195648bb 3288 if (!test_bit(__QDISC_STATE_DEACTIVATED,
e8a83e10 3289 &q->state)) {
195648bb 3290 __netif_reschedule(q);
e8a83e10
JP
3291 } else {
3292 smp_mb__before_clear_bit();
3293 clear_bit(__QDISC_STATE_SCHED,
3294 &q->state);
3295 }
1da177e4
LT
3296 }
3297 }
3298 }
3299}
3300
ab95bfe0
JP
3301#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3302 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
da678292
MM
3303/* This hook is defined here for ATM LANE */
3304int (*br_fdb_test_addr_hook)(struct net_device *dev,
3305 unsigned char *addr) __read_mostly;
4fb019a0 3306EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
da678292 3307#endif
1da177e4 3308
1da177e4
LT
3309#ifdef CONFIG_NET_CLS_ACT
3310/* TODO: Maybe we should just force sch_ingress to be compiled in
3311 * when CONFIG_NET_CLS_ACT is? Otherwise we pay some useless instructions
3312 * (a compare and 2 extra stores) right now if we don't have it enabled
3313 * but do have CONFIG_NET_CLS_ACT.
25985edc
LDM
3314 * NOTE: This doesn't stop any functionality; if you don't have
3315 * the ingress scheduler, you just can't add policies on ingress.
1da177e4
LT
3316 *
3317 */
24824a09 3318static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
1da177e4 3319{
1da177e4 3320 struct net_device *dev = skb->dev;
f697c3e8 3321 u32 ttl = G_TC_RTTL(skb->tc_verd);
555353cf
DM
3322 int result = TC_ACT_OK;
3323 struct Qdisc *q;
4ec93edb 3324
de384830 3325 if (unlikely(MAX_RED_LOOP < ttl++)) {
e87cc472
JP
3326 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3327 skb->skb_iif, dev->ifindex);
f697c3e8
HX
3328 return TC_ACT_SHOT;
3329 }
1da177e4 3330
f697c3e8
HX
3331 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3332 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
1da177e4 3333
83874000 3334 q = rxq->qdisc;
8d50b53d 3335 if (q != &noop_qdisc) {
83874000 3336 spin_lock(qdisc_lock(q));
a9312ae8
DM
3337 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3338 result = qdisc_enqueue_root(skb, q);
83874000
DM
3339 spin_unlock(qdisc_lock(q));
3340 }
f697c3e8
HX
3341
3342 return result;
3343}
86e65da9 3344
f697c3e8
HX
3345static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3346 struct packet_type **pt_prev,
3347 int *ret, struct net_device *orig_dev)
3348{
24824a09
ED
3349 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3350
3351 if (!rxq || rxq->qdisc == &noop_qdisc)
f697c3e8 3352 goto out;
1da177e4 3353
f697c3e8
HX
3354 if (*pt_prev) {
3355 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3356 *pt_prev = NULL;
1da177e4
LT
3357 }
3358
24824a09 3359 switch (ing_filter(skb, rxq)) {
f697c3e8
HX
3360 case TC_ACT_SHOT:
3361 case TC_ACT_STOLEN:
3362 kfree_skb(skb);
3363 return NULL;
3364 }
3365
3366out:
3367 skb->tc_verd = 0;
3368 return skb;
1da177e4
LT
3369}
3370#endif
3371
ab95bfe0
JP
3372/**
3373 * netdev_rx_handler_register - register receive handler
3374 * @dev: device to register a handler for
3375 * @rx_handler: receive handler to register
93e2c32b 3376 * @rx_handler_data: data pointer that is used by rx handler
ab95bfe0
JP
3377 *
3378 * Register a receive handler for a device. This handler will then be
3379 * called from __netif_receive_skb. A negative errno code is returned
3380 * on a failure.
3381 *
3382 * The caller must hold the rtnl_mutex.
8a4eb573
JP
3383 *
3384 * For a general description of rx_handler, see enum rx_handler_result.
ab95bfe0
JP
3385 */
3386int netdev_rx_handler_register(struct net_device *dev,
93e2c32b
JP
3387 rx_handler_func_t *rx_handler,
3388 void *rx_handler_data)
ab95bfe0
JP
3389{
3390 ASSERT_RTNL();
3391
3392 if (dev->rx_handler)
3393 return -EBUSY;
3394
00cfec37 3395 /* Note: rx_handler_data must be set before rx_handler */
93e2c32b 3396 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
ab95bfe0
JP
3397 rcu_assign_pointer(dev->rx_handler, rx_handler);
3398
3399 return 0;
3400}
3401EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
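/* Editor's note: a hedged sketch (not part of dev.c) of a pass-through
 * rx_handler of the kind bridge/bonding/team register. The demo_* names
 * are invented; the registration call and the rx_handler_result_t
 * contract are the interfaces documented above.
 */
static rx_handler_result_t demo_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;

	if (skb->pkt_type == PACKET_LOOPBACK)
		return RX_HANDLER_PASS;		/* let the stack see it */

	/* A real handler could steal the skb (RX_HANDLER_CONSUMED) or
	 * retarget *pskb to an upper device and return RX_HANDLER_ANOTHER.
	 */
	return RX_HANDLER_PASS;
}

static int demo_enslave(struct net_device *port_dev, void *port_priv)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(port_dev, demo_handle_frame,
					 port_priv);
	rtnl_unlock();

	return err;	/* -EBUSY if another handler is already attached */
}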
3402
3403/**
3404 * netdev_rx_handler_unregister - unregister receive handler
3405 * @dev: device to unregister a handler from
3406 *
166ec369 3407 * Unregister a receive handler from a device.
ab95bfe0
JP
3408 *
3409 * The caller must hold the rtnl_mutex.
3410 */
3411void netdev_rx_handler_unregister(struct net_device *dev)
3412{
3413
3414 ASSERT_RTNL();
a9b3cd7f 3415 RCU_INIT_POINTER(dev->rx_handler, NULL);
00cfec37
ED
3416 /* a reader seeing a non-NULL rx_handler in a rcu_read_lock()
3417 * section is guaranteed to see a non-NULL rx_handler_data
3418 * as well.
3419 */
3420 synchronize_net();
a9b3cd7f 3421 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
ab95bfe0
JP
3422}
3423EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3424
b4b9e355
MG
3425/*
3426 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3427 * the special handling of PFMEMALLOC skbs.
3428 */
3429static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3430{
3431 switch (skb->protocol) {
3432 case __constant_htons(ETH_P_ARP):
3433 case __constant_htons(ETH_P_IP):
3434 case __constant_htons(ETH_P_IPV6):
3435 case __constant_htons(ETH_P_8021Q):
8ad227ff 3436 case __constant_htons(ETH_P_8021AD):
b4b9e355
MG
3437 return true;
3438 default:
3439 return false;
3440 }
3441}
3442
9754e293 3443static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
1da177e4
LT
3444{
3445 struct packet_type *ptype, *pt_prev;
ab95bfe0 3446 rx_handler_func_t *rx_handler;
f2ccd8fa 3447 struct net_device *orig_dev;
63d8ea7f 3448 struct net_device *null_or_dev;
8a4eb573 3449 bool deliver_exact = false;
1da177e4 3450 int ret = NET_RX_DROP;
252e3346 3451 __be16 type;
1da177e4 3452
588f0330 3453 net_timestamp_check(!netdev_tstamp_prequeue, skb);
81bbb3d4 3454
cf66ba58 3455 trace_netif_receive_skb(skb);
9b22ea56 3456
1da177e4 3457 /* if we've gotten here through NAPI, check netpoll */
bea3348e 3458 if (netpoll_receive_skb(skb))
b4b9e355 3459 goto out;
1da177e4 3460
cc9bd5ce 3461 orig_dev = skb->dev;
8f903c70 3462
c1d2bbe1 3463 skb_reset_network_header(skb);
fda55eca
ED
3464 if (!skb_transport_header_was_set(skb))
3465 skb_reset_transport_header(skb);
0b5c9db1 3466 skb_reset_mac_len(skb);
1da177e4
LT
3467
3468 pt_prev = NULL;
3469
3470 rcu_read_lock();
3471
63d8ea7f 3472another_round:
b6858177 3473 skb->skb_iif = skb->dev->ifindex;
63d8ea7f
DM
3474
3475 __this_cpu_inc(softnet_data.processed);
3476
8ad227ff
PM
3477 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3478 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
bcc6d479
JP
3479 skb = vlan_untag(skb);
3480 if (unlikely(!skb))
b4b9e355 3481 goto unlock;
bcc6d479
JP
3482 }
3483
1da177e4
LT
3484#ifdef CONFIG_NET_CLS_ACT
3485 if (skb->tc_verd & TC_NCLS) {
3486 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3487 goto ncls;
3488 }
3489#endif
3490
9754e293 3491 if (pfmemalloc)
b4b9e355
MG
3492 goto skip_taps;
3493
1da177e4 3494 list_for_each_entry_rcu(ptype, &ptype_all, list) {
63d8ea7f 3495 if (!ptype->dev || ptype->dev == skb->dev) {
4ec93edb 3496 if (pt_prev)
f2ccd8fa 3497 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
3498 pt_prev = ptype;
3499 }
3500 }
3501
b4b9e355 3502skip_taps:
1da177e4 3503#ifdef CONFIG_NET_CLS_ACT
f697c3e8
HX
3504 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3505 if (!skb)
b4b9e355 3506 goto unlock;
1da177e4
LT
3507ncls:
3508#endif
3509
9754e293 3510 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
b4b9e355
MG
3511 goto drop;
3512
2425717b
JF
3513 if (vlan_tx_tag_present(skb)) {
3514 if (pt_prev) {
3515 ret = deliver_skb(skb, pt_prev, orig_dev);
3516 pt_prev = NULL;
3517 }
48cc32d3 3518 if (vlan_do_receive(&skb))
2425717b
JF
3519 goto another_round;
3520 else if (unlikely(!skb))
b4b9e355 3521 goto unlock;
2425717b
JF
3522 }
3523
48cc32d3 3524 rx_handler = rcu_dereference(skb->dev->rx_handler);
ab95bfe0
JP
3525 if (rx_handler) {
3526 if (pt_prev) {
3527 ret = deliver_skb(skb, pt_prev, orig_dev);
3528 pt_prev = NULL;
3529 }
8a4eb573
JP
3530 switch (rx_handler(&skb)) {
3531 case RX_HANDLER_CONSUMED:
3bc1b1ad 3532 ret = NET_RX_SUCCESS;
b4b9e355 3533 goto unlock;
8a4eb573 3534 case RX_HANDLER_ANOTHER:
63d8ea7f 3535 goto another_round;
8a4eb573
JP
3536 case RX_HANDLER_EXACT:
3537 deliver_exact = true;
3538 case RX_HANDLER_PASS:
3539 break;
3540 default:
3541 BUG();
3542 }
ab95bfe0 3543 }
1da177e4 3544
48cc32d3
FZ
3545 if (vlan_tx_nonzero_tag_present(skb))
3546 skb->pkt_type = PACKET_OTHERHOST;
3547
63d8ea7f 3548 /* deliver only exact match when indicated */
8a4eb573 3549 null_or_dev = deliver_exact ? skb->dev : NULL;
1f3c8804 3550
1da177e4 3551 type = skb->protocol;
82d8a867
PE
3552 list_for_each_entry_rcu(ptype,
3553 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
63d8ea7f 3554 if (ptype->type == type &&
e3f48d37
JP
3555 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3556 ptype->dev == orig_dev)) {
4ec93edb 3557 if (pt_prev)
f2ccd8fa 3558 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
3559 pt_prev = ptype;
3560 }
3561 }
3562
3563 if (pt_prev) {
1080e512 3564 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
0e698bf6 3565 goto drop;
1080e512
MT
3566 else
3567 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4 3568 } else {
b4b9e355 3569drop:
caf586e5 3570 atomic_long_inc(&skb->dev->rx_dropped);
1da177e4
LT
3571 kfree_skb(skb);
3572 /* Jamal, now you will not be able to escape explaining
3573 * to me how you were going to use this. :-)
3574 */
3575 ret = NET_RX_DROP;
3576 }
3577
b4b9e355 3578unlock:
1da177e4 3579 rcu_read_unlock();
b4b9e355 3580out:
9754e293
DM
3581 return ret;
3582}
3583
3584static int __netif_receive_skb(struct sk_buff *skb)
3585{
3586 int ret;
3587
3588 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3589 unsigned long pflags = current->flags;
3590
3591 /*
3592 * PFMEMALLOC skbs are special, they should
3593 * - be delivered to SOCK_MEMALLOC sockets only
3594 * - stay away from userspace
3595 * - have bounded memory usage
3596 *
3597 * Use PF_MEMALLOC as this saves us from propagating the allocation
3598 * context down to all allocation sites.
3599 */
3600 current->flags |= PF_MEMALLOC;
3601 ret = __netif_receive_skb_core(skb, true);
3602 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3603 } else
3604 ret = __netif_receive_skb_core(skb, false);
3605
1da177e4
LT
3606 return ret;
3607}
0a9627f2
TH
3608
3609/**
3610 * netif_receive_skb - process receive buffer from network
3611 * @skb: buffer to process
3612 *
3613 * netif_receive_skb() is the main receive data processing function.
3614 * It always succeeds. The buffer may be dropped during processing
3615 * for congestion control or by the protocol layers.
3616 *
3617 * This function may only be called from softirq context and interrupts
3618 * should be enabled.
3619 *
3620 * Return values (usually ignored):
3621 * NET_RX_SUCCESS: no congestion
3622 * NET_RX_DROP: packet was dropped
3623 */
3624int netif_receive_skb(struct sk_buff *skb)
3625{
588f0330 3626 net_timestamp_check(netdev_tstamp_prequeue, skb);
3b098e2d 3627
c1f19b51
RC
3628 if (skb_defer_rx_timestamp(skb))
3629 return NET_RX_SUCCESS;
3630
df334545 3631#ifdef CONFIG_RPS
c5905afb 3632 if (static_key_false(&rps_needed)) {
3b098e2d
ED
3633 struct rps_dev_flow voidflow, *rflow = &voidflow;
3634 int cpu, ret;
fec5e652 3635
3b098e2d
ED
3636 rcu_read_lock();
3637
3638 cpu = get_rps_cpu(skb->dev, skb, &rflow);
0a9627f2 3639
3b098e2d
ED
3640 if (cpu >= 0) {
3641 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3642 rcu_read_unlock();
adc9300e 3643 return ret;
3b098e2d 3644 }
adc9300e 3645 rcu_read_unlock();
fec5e652 3646 }
1e94d72f 3647#endif
adc9300e 3648 return __netif_receive_skb(skb);
0a9627f2 3649}
d1b19dff 3650EXPORT_SYMBOL(netif_receive_skb);
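A minimal sketch of a call site for the function documented above, assuming a hypothetical driver that copies a completed DMA buffer into an skb before handing it to the stack; demo_deliver() and the buffer handling are illustrative only:

#include <linux/netdevice.h>
#include <linux/etherdevice.h>

/* Must run in softirq context, e.g. from a NAPI poll handler. */
static void demo_deliver(struct net_device *dev, const void *data,
			 unsigned int len)
{
	struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);

	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}
	memcpy(skb_put(skb, len), data, len);	/* copy out of the DMA ring */
	skb->protocol = eth_type_trans(skb, dev);
	netif_receive_skb(skb);	/* returns NET_RX_SUCCESS or NET_RX_DROP */
}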
1da177e4 3651
88751275
ED
3652/* Network device is going away, flush any packets still pending
3653 * Called with irqs disabled.
3654 */
152102c7 3655static void flush_backlog(void *arg)
6e583ce5 3656{
152102c7 3657 struct net_device *dev = arg;
e36fa2f7 3658 struct softnet_data *sd = &__get_cpu_var(softnet_data);
6e583ce5
SH
3659 struct sk_buff *skb, *tmp;
3660
e36fa2f7 3661 rps_lock(sd);
6e7676c1 3662 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
6e583ce5 3663 if (skb->dev == dev) {
e36fa2f7 3664 __skb_unlink(skb, &sd->input_pkt_queue);
6e583ce5 3665 kfree_skb(skb);
76cc8b13 3666 input_queue_head_incr(sd);
6e583ce5 3667 }
6e7676c1 3668 }
e36fa2f7 3669 rps_unlock(sd);
6e7676c1
CG
3670
3671 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3672 if (skb->dev == dev) {
3673 __skb_unlink(skb, &sd->process_queue);
3674 kfree_skb(skb);
76cc8b13 3675 input_queue_head_incr(sd);
6e7676c1
CG
3676 }
3677 }
6e583ce5
SH
3678}
3679
d565b0a1
HX
3680static int napi_gro_complete(struct sk_buff *skb)
3681{
22061d80 3682 struct packet_offload *ptype;
d565b0a1 3683 __be16 type = skb->protocol;
22061d80 3684 struct list_head *head = &offload_base;
d565b0a1
HX
3685 int err = -ENOENT;
3686
c3c7c254
ED
3687 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3688
fc59f9a3
HX
3689 if (NAPI_GRO_CB(skb)->count == 1) {
3690 skb_shinfo(skb)->gso_size = 0;
d565b0a1 3691 goto out;
fc59f9a3 3692 }
d565b0a1
HX
3693
3694 rcu_read_lock();
3695 list_for_each_entry_rcu(ptype, head, list) {
f191a1d1 3696 if (ptype->type != type || !ptype->callbacks.gro_complete)
d565b0a1
HX
3697 continue;
3698
f191a1d1 3699 err = ptype->callbacks.gro_complete(skb);
d565b0a1
HX
3700 break;
3701 }
3702 rcu_read_unlock();
3703
3704 if (err) {
3705 WARN_ON(&ptype->list == head);
3706 kfree_skb(skb);
3707 return NET_RX_SUCCESS;
3708 }
3709
3710out:
d565b0a1
HX
3711 return netif_receive_skb(skb);
3712}
3713
2e71a6f8
ED
3714/* napi->gro_list contains packets ordered by age.
3715 * The youngest packets are at the head of it.
3716 * Complete skbs in reverse order to reduce latencies.
3717 */
3718void napi_gro_flush(struct napi_struct *napi, bool flush_old)
d565b0a1 3719{
2e71a6f8 3720 struct sk_buff *skb, *prev = NULL;
d565b0a1 3721
2e71a6f8
ED
3722 /* scan list and build reverse chain */
3723 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3724 skb->prev = prev;
3725 prev = skb;
3726 }
3727
3728 for (skb = prev; skb; skb = prev) {
d565b0a1 3729 skb->next = NULL;
2e71a6f8
ED
3730
3731 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3732 return;
3733
3734 prev = skb->prev;
d565b0a1 3735 napi_gro_complete(skb);
2e71a6f8 3736 napi->gro_count--;
d565b0a1
HX
3737 }
3738
3739 napi->gro_list = NULL;
3740}
86cac58b 3741EXPORT_SYMBOL(napi_gro_flush);
d565b0a1 3742
89c5fa33
ED
3743static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3744{
3745 struct sk_buff *p;
3746 unsigned int maclen = skb->dev->hard_header_len;
3747
3748 for (p = napi->gro_list; p; p = p->next) {
3749 unsigned long diffs;
3750
3751 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3752 diffs |= p->vlan_tci ^ skb->vlan_tci;
3753 if (maclen == ETH_HLEN)
3754 diffs |= compare_ether_header(skb_mac_header(p),
3755 skb_gro_mac_header(skb));
3756 else if (!diffs)
3757 diffs = memcmp(skb_mac_header(p),
3758 skb_gro_mac_header(skb),
3759 maclen);
3760 NAPI_GRO_CB(p)->same_flow = !diffs;
3761 NAPI_GRO_CB(p)->flush = 0;
3762 }
3763}
3764
bb728820 3765static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
d565b0a1
HX
3766{
3767 struct sk_buff **pp = NULL;
22061d80 3768 struct packet_offload *ptype;
d565b0a1 3769 __be16 type = skb->protocol;
22061d80 3770 struct list_head *head = &offload_base;
0da2afd5 3771 int same_flow;
5b252f0c 3772 enum gro_result ret;
d565b0a1 3773
ce9e76c8 3774 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
d565b0a1
HX
3775 goto normal;
3776
21dc3301 3777 if (skb_is_gso(skb) || skb_has_frag_list(skb))
f17f5c91
HX
3778 goto normal;
3779
89c5fa33
ED
3780 gro_list_prepare(napi, skb);
3781
d565b0a1
HX
3782 rcu_read_lock();
3783 list_for_each_entry_rcu(ptype, head, list) {
f191a1d1 3784 if (ptype->type != type || !ptype->callbacks.gro_receive)
d565b0a1
HX
3785 continue;
3786
86911732 3787 skb_set_network_header(skb, skb_gro_offset(skb));
efd9450e 3788 skb_reset_mac_len(skb);
d565b0a1
HX
3789 NAPI_GRO_CB(skb)->same_flow = 0;
3790 NAPI_GRO_CB(skb)->flush = 0;
5d38a079 3791 NAPI_GRO_CB(skb)->free = 0;
d565b0a1 3792
f191a1d1 3793 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
d565b0a1
HX
3794 break;
3795 }
3796 rcu_read_unlock();
3797
3798 if (&ptype->list == head)
3799 goto normal;
3800
0da2afd5 3801 same_flow = NAPI_GRO_CB(skb)->same_flow;
5d0d9be8 3802 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
0da2afd5 3803
d565b0a1
HX
3804 if (pp) {
3805 struct sk_buff *nskb = *pp;
3806
3807 *pp = nskb->next;
3808 nskb->next = NULL;
3809 napi_gro_complete(nskb);
4ae5544f 3810 napi->gro_count--;
d565b0a1
HX
3811 }
3812
0da2afd5 3813 if (same_flow)
d565b0a1
HX
3814 goto ok;
3815
4ae5544f 3816 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
d565b0a1 3817 goto normal;
d565b0a1 3818
4ae5544f 3819 napi->gro_count++;
d565b0a1 3820 NAPI_GRO_CB(skb)->count = 1;
2e71a6f8 3821 NAPI_GRO_CB(skb)->age = jiffies;
86911732 3822 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
d565b0a1
HX
3823 skb->next = napi->gro_list;
3824 napi->gro_list = skb;
5d0d9be8 3825 ret = GRO_HELD;
d565b0a1 3826
ad0f9904 3827pull:
cb18978c
HX
3828 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3829 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3830
3831 BUG_ON(skb->end - skb->tail < grow);
3832
3833 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3834
3835 skb->tail += grow;
3836 skb->data_len -= grow;
3837
3838 skb_shinfo(skb)->frags[0].page_offset += grow;
9e903e08 3839 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
cb18978c 3840
9e903e08 3841 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
ea2ab693 3842 skb_frag_unref(skb, 0);
cb18978c
HX
3843 memmove(skb_shinfo(skb)->frags,
3844 skb_shinfo(skb)->frags + 1,
e5093aec 3845 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
cb18978c 3846 }
ad0f9904
HX
3847 }
3848
d565b0a1 3849ok:
5d0d9be8 3850 return ret;
d565b0a1
HX
3851
3852normal:
ad0f9904
HX
3853 ret = GRO_NORMAL;
3854 goto pull;
5d38a079 3855}
96e93eab 3856
5d38a079 3857
bb728820 3858static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5d38a079 3859{
5d0d9be8
HX
3860 switch (ret) {
3861 case GRO_NORMAL:
c7c4b3b6
BH
3862 if (netif_receive_skb(skb))
3863 ret = GRO_DROP;
3864 break;
5d38a079 3865
5d0d9be8 3866 case GRO_DROP:
5d38a079
HX
3867 kfree_skb(skb);
3868 break;
5b252f0c 3869
daa86548 3870 case GRO_MERGED_FREE:
d7e8883c
ED
3871 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3872 kmem_cache_free(skbuff_head_cache, skb);
3873 else
3874 __kfree_skb(skb);
daa86548
ED
3875 break;
3876
5b252f0c
BH
3877 case GRO_HELD:
3878 case GRO_MERGED:
3879 break;
5d38a079
HX
3880 }
3881
c7c4b3b6 3882 return ret;
5d0d9be8 3883}
5d0d9be8 3884
ca07e43e 3885static void skb_gro_reset_offset(struct sk_buff *skb)
78a478d0 3886{
ca07e43e
ED
3887 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3888 const skb_frag_t *frag0 = &pinfo->frags[0];
3889
78a478d0
HX
3890 NAPI_GRO_CB(skb)->data_offset = 0;
3891 NAPI_GRO_CB(skb)->frag0 = NULL;
7489594c 3892 NAPI_GRO_CB(skb)->frag0_len = 0;
78a478d0 3893
ced14f68 3894 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
ca07e43e
ED
3895 pinfo->nr_frags &&
3896 !PageHighMem(skb_frag_page(frag0))) {
3897 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3898 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
7489594c 3899 }
78a478d0 3900}
78a478d0 3901
c7c4b3b6 3902gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5d0d9be8 3903{
86911732
HX
3904 skb_gro_reset_offset(skb);
3905
89c5fa33 3906 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
d565b0a1
HX
3907}
3908EXPORT_SYMBOL(napi_gro_receive);
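To show where dev_gro_receive() sits in a driver's receive path, here is a hedged sketch of a NAPI poll routine feeding frames through napi_gro_receive(); struct demo_ring and demo_rx_frame() are made-up stand-ins for a driver's ring-buffer code:

struct demo_ring {			/* hypothetical per-queue state */
	struct napi_struct napi;
	struct net_device *netdev;
};

static struct sk_buff *demo_rx_frame(struct demo_ring *ring);	/* made up */

static int demo_poll(struct napi_struct *napi, int budget)
{
	struct demo_ring *ring = container_of(napi, struct demo_ring, napi);
	int work = 0;

	while (work < budget) {
		struct sk_buff *skb = demo_rx_frame(ring);

		if (!skb)
			break;
		skb->protocol = eth_type_trans(skb, ring->netdev);
		napi_gro_receive(napi, skb);	/* merge, hold or deliver */
		work++;
	}

	if (work < budget)
		napi_complete(napi);	/* flushes gro_list; the driver then
					 * re-enables its RX interrupt */
	return work;
}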
3909
d0c2b0d2 3910static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
96e93eab 3911{
96e93eab 3912 __skb_pull(skb, skb_headlen(skb));
2a2a459e
ED
3913 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3914 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3701e513 3915 skb->vlan_tci = 0;
66c46d74 3916 skb->dev = napi->dev;
6d152e23 3917 skb->skb_iif = 0;
96e93eab
HX
3918
3919 napi->skb = skb;
3920}
96e93eab 3921
76620aaf 3922struct sk_buff *napi_get_frags(struct napi_struct *napi)
5d38a079 3923{
5d38a079 3924 struct sk_buff *skb = napi->skb;
5d38a079
HX
3925
3926 if (!skb) {
89d71a66
ED
3927 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3928 if (skb)
3929 napi->skb = skb;
80595d59 3930 }
96e93eab
HX
3931 return skb;
3932}
76620aaf 3933EXPORT_SYMBOL(napi_get_frags);
96e93eab 3934
bb728820 3935static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
c7c4b3b6 3936 gro_result_t ret)
96e93eab 3937{
5d0d9be8
HX
3938 switch (ret) {
3939 case GRO_NORMAL:
86911732 3940 case GRO_HELD:
e76b69cc 3941 skb->protocol = eth_type_trans(skb, skb->dev);
86911732 3942
c7c4b3b6
BH
3943 if (ret == GRO_HELD)
3944 skb_gro_pull(skb, -ETH_HLEN);
3945 else if (netif_receive_skb(skb))
3946 ret = GRO_DROP;
86911732 3947 break;
5d38a079 3948
5d0d9be8 3949 case GRO_DROP:
5d0d9be8
HX
3950 case GRO_MERGED_FREE:
3951 napi_reuse_skb(napi, skb);
3952 break;
5b252f0c
BH
3953
3954 case GRO_MERGED:
3955 break;
5d0d9be8 3956 }
5d38a079 3957
c7c4b3b6 3958 return ret;
5d38a079 3959}
5d0d9be8 3960
4adb9c4a 3961static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
76620aaf
HX
3962{
3963 struct sk_buff *skb = napi->skb;
3964 struct ethhdr *eth;
a5b1cf28
HX
3965 unsigned int hlen;
3966 unsigned int off;
76620aaf
HX
3967
3968 napi->skb = NULL;
3969
3970 skb_reset_mac_header(skb);
3971 skb_gro_reset_offset(skb);
3972
a5b1cf28
HX
3973 off = skb_gro_offset(skb);
3974 hlen = off + sizeof(*eth);
3975 eth = skb_gro_header_fast(skb, off);
3976 if (skb_gro_header_hard(skb, hlen)) {
3977 eth = skb_gro_header_slow(skb, hlen, off);
3978 if (unlikely(!eth)) {
3979 napi_reuse_skb(napi, skb);
3980 skb = NULL;
3981 goto out;
3982 }
76620aaf
HX
3983 }
3984
3985 skb_gro_pull(skb, sizeof(*eth));
3986
3987 /*
3988 * This works because the only protocols we care about don't require
3989 * special handling. We'll fix it up properly at the end.
3990 */
3991 skb->protocol = eth->h_proto;
3992
3993out:
3994 return skb;
3995}
76620aaf 3996
c7c4b3b6 3997gro_result_t napi_gro_frags(struct napi_struct *napi)
5d0d9be8 3998{
76620aaf 3999 struct sk_buff *skb = napi_frags_skb(napi);
5d0d9be8
HX
4000
4001 if (!skb)
c7c4b3b6 4002 return GRO_DROP;
5d0d9be8 4003
89c5fa33 4004 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
5d0d9be8 4005}
5d38a079
HX
4006EXPORT_SYMBOL(napi_gro_frags);
4007
e326bed2
ED
4008/*
4009 * net_rps_action sends any pending IPIs for RPS.
4010 * Note: called with local irq disabled, but exits with local irq enabled.
4011 */
4012static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4013{
4014#ifdef CONFIG_RPS
4015 struct softnet_data *remsd = sd->rps_ipi_list;
4016
4017 if (remsd) {
4018 sd->rps_ipi_list = NULL;
4019
4020 local_irq_enable();
4021
4022 /* Send pending IPI's to kick RPS processing on remote cpus. */
4023 while (remsd) {
4024 struct softnet_data *next = remsd->rps_ipi_next;
4025
4026 if (cpu_online(remsd->cpu))
4027 __smp_call_function_single(remsd->cpu,
4028 &remsd->csd, 0);
4029 remsd = next;
4030 }
4031 } else
4032#endif
4033 local_irq_enable();
4034}
4035
bea3348e 4036static int process_backlog(struct napi_struct *napi, int quota)
1da177e4
LT
4037{
4038 int work = 0;
eecfd7c4 4039 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
1da177e4 4040
e326bed2
ED
4041#ifdef CONFIG_RPS
4042 /* Check if we have pending IPIs; it's better to send them now
4043 * rather than waiting for net_rx_action() to end.
4044 */
4045 if (sd->rps_ipi_list) {
4046 local_irq_disable();
4047 net_rps_action_and_irq_enable(sd);
4048 }
4049#endif
bea3348e 4050 napi->weight = weight_p;
6e7676c1
CG
4051 local_irq_disable();
4052 while (work < quota) {
1da177e4 4053 struct sk_buff *skb;
6e7676c1
CG
4054 unsigned int qlen;
4055
4056 while ((skb = __skb_dequeue(&sd->process_queue))) {
4057 local_irq_enable();
4058 __netif_receive_skb(skb);
6e7676c1 4059 local_irq_disable();
76cc8b13
TH
4060 input_queue_head_incr(sd);
4061 if (++work >= quota) {
4062 local_irq_enable();
4063 return work;
4064 }
6e7676c1 4065 }
1da177e4 4066
e36fa2f7 4067 rps_lock(sd);
6e7676c1 4068 qlen = skb_queue_len(&sd->input_pkt_queue);
76cc8b13 4069 if (qlen)
6e7676c1
CG
4070 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4071 &sd->process_queue);
76cc8b13 4072
6e7676c1 4073 if (qlen < quota - work) {
eecfd7c4
ED
4074 /*
4075 * Inline a custom version of __napi_complete().
4076 * Only the current cpu owns and manipulates this napi,
4077 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4078 * We can use a plain write instead of clear_bit(),
4079 * and we don't need an smp_mb() memory barrier.
4080 */
4081 list_del(&napi->poll_list);
4082 napi->state = 0;
4083
6e7676c1 4084 quota = work + qlen;
bea3348e 4085 }
e36fa2f7 4086 rps_unlock(sd);
6e7676c1
CG
4087 }
4088 local_irq_enable();
1da177e4 4089
bea3348e
SH
4090 return work;
4091}
1da177e4 4092
bea3348e
SH
4093/**
4094 * __napi_schedule - schedule for receive
c4ea43c5 4095 * @n: entry to schedule
bea3348e
SH
4096 *
4097 * The entry's receive function will be scheduled to run
4098 */
b5606c2d 4099void __napi_schedule(struct napi_struct *n)
bea3348e
SH
4100{
4101 unsigned long flags;
1da177e4 4102
bea3348e 4103 local_irq_save(flags);
eecfd7c4 4104 ____napi_schedule(&__get_cpu_var(softnet_data), n);
bea3348e 4105 local_irq_restore(flags);
1da177e4 4106}
bea3348e
SH
4107EXPORT_SYMBOL(__napi_schedule);
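For context, drivers normally reach __napi_schedule() through the napi_schedule()/napi_schedule_prep() wrappers from their interrupt handler; the sketch below is illustrative, and struct demo_priv plus demo_disable_rx_irq() are hypothetical:

#include <linux/interrupt.h>
#include <linux/netdevice.h>

struct demo_priv {
	struct napi_struct napi;
	/* ... device registers, locks, ... */
};

static void demo_disable_rx_irq(struct demo_priv *priv);	/* made up */

static irqreturn_t demo_isr(int irq, void *dev_id)
{
	struct demo_priv *priv = dev_id;

	if (napi_schedule_prep(&priv->napi)) {
		demo_disable_rx_irq(priv);	/* quiesce the source first */
		__napi_schedule(&priv->napi);	/* queue ->poll() on NET_RX_SOFTIRQ */
	}
	return IRQ_HANDLED;
}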
4108
d565b0a1
HX
4109void __napi_complete(struct napi_struct *n)
4110{
4111 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4112 BUG_ON(n->gro_list);
4113
4114 list_del(&n->poll_list);
4115 smp_mb__before_clear_bit();
4116 clear_bit(NAPI_STATE_SCHED, &n->state);
4117}
4118EXPORT_SYMBOL(__napi_complete);
4119
4120void napi_complete(struct napi_struct *n)
4121{
4122 unsigned long flags;
4123
4124 /*
4125 * don't let napi dequeue from the cpu poll list
4126 * just in case it's running on a different cpu
4127 */
4128 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4129 return;
4130
2e71a6f8 4131 napi_gro_flush(n, false);
d565b0a1
HX
4132 local_irq_save(flags);
4133 __napi_complete(n);
4134 local_irq_restore(flags);
4135}
4136EXPORT_SYMBOL(napi_complete);
4137
af12fa6e
ET
4138 /* must be called under rcu_read_lock(), as we don't take a reference */
4139struct napi_struct *napi_by_id(unsigned int napi_id)
4140{
4141 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4142 struct napi_struct *napi;
4143
4144 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4145 if (napi->napi_id == napi_id)
4146 return napi;
4147
4148 return NULL;
4149}
4150EXPORT_SYMBOL_GPL(napi_by_id);
4151
4152void napi_hash_add(struct napi_struct *napi)
4153{
4154 if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4155
4156 spin_lock(&napi_hash_lock);
4157
4158 /* 0 is not a valid id; we also skip an id that is already taken.
4159 * We expect both events to be extremely rare.
4160 */
4161 napi->napi_id = 0;
4162 while (!napi->napi_id) {
4163 napi->napi_id = ++napi_gen_id;
4164 if (napi_by_id(napi->napi_id))
4165 napi->napi_id = 0;
4166 }
4167
4168 hlist_add_head_rcu(&napi->napi_hash_node,
4169 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4170
4171 spin_unlock(&napi_hash_lock);
4172 }
4173}
4174EXPORT_SYMBOL_GPL(napi_hash_add);
4175
4176/* Warning: the caller is responsible for making sure an rcu grace period
4177 * is respected before freeing the memory containing @napi
4178 */
4179void napi_hash_del(struct napi_struct *napi)
4180{
4181 spin_lock(&napi_hash_lock);
4182
4183 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4184 hlist_del_rcu(&napi->napi_hash_node);
4185
4186 spin_unlock(&napi_hash_lock);
4187}
4188EXPORT_SYMBOL_GPL(napi_hash_del);
4189
d565b0a1
HX
4190void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4191 int (*poll)(struct napi_struct *, int), int weight)
4192{
4193 INIT_LIST_HEAD(&napi->poll_list);
4ae5544f 4194 napi->gro_count = 0;
d565b0a1 4195 napi->gro_list = NULL;
5d38a079 4196 napi->skb = NULL;
d565b0a1 4197 napi->poll = poll;
82dc3c63
ED
4198 if (weight > NAPI_POLL_WEIGHT)
4199 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4200 weight, dev->name);
d565b0a1
HX
4201 napi->weight = weight;
4202 list_add(&napi->dev_list, &dev->napi_list);
d565b0a1 4203 napi->dev = dev;
5d38a079 4204#ifdef CONFIG_NETPOLL
d565b0a1
HX
4205 spin_lock_init(&napi->poll_lock);
4206 napi->poll_owner = -1;
4207#endif
4208 set_bit(NAPI_STATE_SCHED, &napi->state);
4209}
4210EXPORT_SYMBOL(netif_napi_add);
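A driver typically pairs netif_napi_add() with napi_enable() at probe or open time; a brief sketch reusing the hypothetical demo_ring and demo_poll() from the earlier examples:

static int demo_setup_napi(struct demo_ring *ring, struct net_device *dev)
{
	/* NAPI_POLL_WEIGHT (64) is the recommended weight; anything larger
	 * trips the pr_err_once() warning in netif_napi_add() above.
	 */
	netif_napi_add(dev, &ring->napi, demo_poll, NAPI_POLL_WEIGHT);
	napi_enable(&ring->napi);	/* usually done from ndo_open */
	return 0;
}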
4211
4212void netif_napi_del(struct napi_struct *napi)
4213{
4214 struct sk_buff *skb, *next;
4215
d7b06636 4216 list_del_init(&napi->dev_list);
76620aaf 4217 napi_free_frags(napi);
d565b0a1
HX
4218
4219 for (skb = napi->gro_list; skb; skb = next) {
4220 next = skb->next;
4221 skb->next = NULL;
4222 kfree_skb(skb);
4223 }
4224
4225 napi->gro_list = NULL;
4ae5544f 4226 napi->gro_count = 0;
d565b0a1
HX
4227}
4228EXPORT_SYMBOL(netif_napi_del);
4229
1da177e4
LT
4230static void net_rx_action(struct softirq_action *h)
4231{
e326bed2 4232 struct softnet_data *sd = &__get_cpu_var(softnet_data);
24f8b238 4233 unsigned long time_limit = jiffies + 2;
51b0bded 4234 int budget = netdev_budget;
53fb95d3
MM
4235 void *have;
4236
1da177e4
LT
4237 local_irq_disable();
4238
e326bed2 4239 while (!list_empty(&sd->poll_list)) {
bea3348e
SH
4240 struct napi_struct *n;
4241 int work, weight;
1da177e4 4242
bea3348e 4243 /* If the softirq window is exhausted then punt.
24f8b238
SH
4244 * Allow this to run for 2 jiffies, which will allow
4245 * an average latency of 1.5/HZ.
bea3348e 4246 */
d1f41b67 4247 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
1da177e4
LT
4248 goto softnet_break;
4249
4250 local_irq_enable();
4251
bea3348e
SH
4252 /* Even though interrupts have been re-enabled, this
4253 * access is safe because interrupts can only add new
4254 * entries to the tail of this list, and only ->poll()
4255 * calls can remove this head entry from the list.
4256 */
e326bed2 4257 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
1da177e4 4258
bea3348e
SH
4259 have = netpoll_poll_lock(n);
4260
4261 weight = n->weight;
4262
0a7606c1
DM
4263 /* This NAPI_STATE_SCHED test is for avoiding a race
4264 * with netpoll's poll_napi(). Only the entity which
4265 * obtains the lock and sees NAPI_STATE_SCHED set will
4266 * actually make the ->poll() call. Therefore we avoid
25985edc 4267 * accidentally calling ->poll() when NAPI is not scheduled.
0a7606c1
DM
4268 */
4269 work = 0;
4ea7e386 4270 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
0a7606c1 4271 work = n->poll(n, weight);
4ea7e386
NH
4272 trace_napi_poll(n);
4273 }
bea3348e
SH
4274
4275 WARN_ON_ONCE(work > weight);
4276
4277 budget -= work;
4278
4279 local_irq_disable();
4280
4281 /* Drivers must not modify the NAPI state if they
4282 * consume the entire weight. In such cases this code
4283 * still "owns" the NAPI instance and therefore can
4284 * move the instance around on the list at-will.
4285 */
fed17f30 4286 if (unlikely(work == weight)) {
ff780cd8
HX
4287 if (unlikely(napi_disable_pending(n))) {
4288 local_irq_enable();
4289 napi_complete(n);
4290 local_irq_disable();
2e71a6f8
ED
4291 } else {
4292 if (n->gro_list) {
4293 /* flush too old packets
4294 * If HZ < 1000, flush all packets.
4295 */
4296 local_irq_enable();
4297 napi_gro_flush(n, HZ >= 1000);
4298 local_irq_disable();
4299 }
e326bed2 4300 list_move_tail(&n->poll_list, &sd->poll_list);
2e71a6f8 4301 }
fed17f30 4302 }
bea3348e
SH
4303
4304 netpoll_poll_unlock(have);
1da177e4
LT
4305 }
4306out:
e326bed2 4307 net_rps_action_and_irq_enable(sd);
0a9627f2 4308
db217334
CL
4309#ifdef CONFIG_NET_DMA
4310 /*
4311 * There may not be any more sk_buffs coming right now, so push
4312 * any pending DMA copies to hardware
4313 */
2ba05622 4314 dma_issue_pending_all();
db217334 4315#endif
bea3348e 4316
1da177e4
LT
4317 return;
4318
4319softnet_break:
dee42870 4320 sd->time_squeeze++;
1da177e4
LT
4321 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4322 goto out;
4323}
4324
9ff162a8
JP
4325struct netdev_upper {
4326 struct net_device *dev;
4327 bool master;
4328 struct list_head list;
4329 struct rcu_head rcu;
4330 struct list_head search_list;
4331};
4332
4333static void __append_search_uppers(struct list_head *search_list,
4334 struct net_device *dev)
4335{
4336 struct netdev_upper *upper;
4337
4338 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4339 /* check if this upper is not already in the search list */
4340 if (list_empty(&upper->search_list))
4341 list_add_tail(&upper->search_list, search_list);
4342 }
4343}
4344
4345static bool __netdev_search_upper_dev(struct net_device *dev,
4346 struct net_device *upper_dev)
4347{
4348 LIST_HEAD(search_list);
4349 struct netdev_upper *upper;
4350 struct netdev_upper *tmp;
4351 bool ret = false;
4352
4353 __append_search_uppers(&search_list, dev);
4354 list_for_each_entry(upper, &search_list, search_list) {
4355 if (upper->dev == upper_dev) {
4356 ret = true;
4357 break;
4358 }
4359 __append_search_uppers(&search_list, upper->dev);
4360 }
4361 list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4362 INIT_LIST_HEAD(&upper->search_list);
4363 return ret;
4364}
4365
4366static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4367 struct net_device *upper_dev)
4368{
4369 struct netdev_upper *upper;
4370
4371 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4372 if (upper->dev == upper_dev)
4373 return upper;
4374 }
4375 return NULL;
4376}
4377
4378/**
4379 * netdev_has_upper_dev - Check if device is linked to an upper device
4380 * @dev: device
4381 * @upper_dev: upper device to check
4382 *
4383 * Find out if a device is linked to the specified upper device and return true
4384 * in case it is. Note that this checks only immediate upper device,
4385 * not through a complete stack of devices. The caller must hold the RTNL lock.
4386 */
4387bool netdev_has_upper_dev(struct net_device *dev,
4388 struct net_device *upper_dev)
4389{
4390 ASSERT_RTNL();
4391
4392 return __netdev_find_upper(dev, upper_dev);
4393}
4394EXPORT_SYMBOL(netdev_has_upper_dev);
4395
4396/**
4397 * netdev_has_any_upper_dev - Check if device is linked to some device
4398 * @dev: device
4399 *
4400 * Find out if a device is linked to an upper device and return true in case
4401 * it is. The caller must hold the RTNL lock.
4402 */
4403bool netdev_has_any_upper_dev(struct net_device *dev)
4404{
4405 ASSERT_RTNL();
4406
4407 return !list_empty(&dev->upper_dev_list);
4408}
4409EXPORT_SYMBOL(netdev_has_any_upper_dev);
4410
4411/**
4412 * netdev_master_upper_dev_get - Get master upper device
4413 * @dev: device
4414 *
4415 * Find a master upper device and return pointer to it or NULL in case
4416 * it's not there. The caller must hold the RTNL lock.
4417 */
4418struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4419{
4420 struct netdev_upper *upper;
4421
4422 ASSERT_RTNL();
4423
4424 if (list_empty(&dev->upper_dev_list))
4425 return NULL;
4426
4427 upper = list_first_entry(&dev->upper_dev_list,
4428 struct netdev_upper, list);
4429 if (likely(upper->master))
4430 return upper->dev;
4431 return NULL;
4432}
4433EXPORT_SYMBOL(netdev_master_upper_dev_get);
4434
4435/**
4436 * netdev_master_upper_dev_get_rcu - Get master upper device
4437 * @dev: device
4438 *
4439 * Find a master upper device and return pointer to it or NULL in case
4440 * it's not there. The caller must hold the RCU read lock.
4441 */
4442struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4443{
4444 struct netdev_upper *upper;
4445
4446 upper = list_first_or_null_rcu(&dev->upper_dev_list,
4447 struct netdev_upper, list);
4448 if (upper && likely(upper->master))
4449 return upper->dev;
4450 return NULL;
4451}
4452EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4453
4454static int __netdev_upper_dev_link(struct net_device *dev,
4455 struct net_device *upper_dev, bool master)
4456{
4457 struct netdev_upper *upper;
4458
4459 ASSERT_RTNL();
4460
4461 if (dev == upper_dev)
4462 return -EBUSY;
4463
4464 /* To prevent loops, check that dev is not an upper device of upper_dev. */
4465 if (__netdev_search_upper_dev(upper_dev, dev))
4466 return -EBUSY;
4467
4468 if (__netdev_find_upper(dev, upper_dev))
4469 return -EEXIST;
4470
4471 if (master && netdev_master_upper_dev_get(dev))
4472 return -EBUSY;
4473
4474 upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4475 if (!upper)
4476 return -ENOMEM;
4477
4478 upper->dev = upper_dev;
4479 upper->master = master;
4480 INIT_LIST_HEAD(&upper->search_list);
4481
4482 /* Ensure that the master upper link is always the first item in the list. */
4483 if (master)
4484 list_add_rcu(&upper->list, &dev->upper_dev_list);
4485 else
4486 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4487 dev_hold(upper_dev);
42e52bf9 4488 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
9ff162a8
JP
4489 return 0;
4490}
4491
4492/**
4493 * netdev_upper_dev_link - Add a link to the upper device
4494 * @dev: device
4495 * @upper_dev: new upper device
4496 *
4497 * Adds a link to device which is upper to this one. The caller must hold
4498 * the RTNL lock. On a failure a negative errno code is returned.
4499 * On success the reference counts are adjusted and the function
4500 * returns zero.
4501 */
4502int netdev_upper_dev_link(struct net_device *dev,
4503 struct net_device *upper_dev)
4504{
4505 return __netdev_upper_dev_link(dev, upper_dev, false);
4506}
4507EXPORT_SYMBOL(netdev_upper_dev_link);
4508
4509/**
4510 * netdev_master_upper_dev_link - Add a master link to the upper device
4511 * @dev: device
4512 * @upper_dev: new upper device
4513 *
4514 * Adds a link to device which is upper to this one. In this case, only
4515 * one master upper device can be linked, although other non-master devices
4516 * might be linked as well. The caller must hold the RTNL lock.
4517 * On a failure a negative errno code is returned. On success the reference
4518 * counts are adjusted and the function returns zero.
4519 */
4520int netdev_master_upper_dev_link(struct net_device *dev,
4521 struct net_device *upper_dev)
4522{
4523 return __netdev_upper_dev_link(dev, upper_dev, true);
4524}
4525EXPORT_SYMBOL(netdev_master_upper_dev_link);
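A master-type driver (bonding- or bridge-like) would use these helpers roughly as follows when enslaving and releasing a port device; this is a sketch only, with error handling trimmed and hypothetical naming:

#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static int demo_enslave(struct net_device *master, struct net_device *port)
{
	int err;

	ASSERT_RTNL();
	err = netdev_master_upper_dev_link(port, master);
	if (err)
		return err;	/* -EBUSY on loops, -EEXIST if already linked */
	/* ... sync addresses, program hardware, etc. ... */
	return 0;
}

static void demo_release(struct net_device *master, struct net_device *port)
{
	ASSERT_RTNL();
	netdev_upper_dev_unlink(port, master);
}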
4526
4527/**
4528 * netdev_upper_dev_unlink - Removes a link to upper device
4529 * @dev: device
4530 * @upper_dev: new upper device
4531 *
4532 * Removes a link to device which is upper to this one. The caller must hold
4533 * the RTNL lock.
4534 */
4535void netdev_upper_dev_unlink(struct net_device *dev,
4536 struct net_device *upper_dev)
4537{
4538 struct netdev_upper *upper;
4539
4540 ASSERT_RTNL();
4541
4542 upper = __netdev_find_upper(dev, upper_dev);
4543 if (!upper)
4544 return;
4545 list_del_rcu(&upper->list);
4546 dev_put(upper_dev);
4547 kfree_rcu(upper, rcu);
42e52bf9 4548 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
9ff162a8
JP
4549}
4550EXPORT_SYMBOL(netdev_upper_dev_unlink);
4551
b6c40d68
PM
4552static void dev_change_rx_flags(struct net_device *dev, int flags)
4553{
d314774c
SH
4554 const struct net_device_ops *ops = dev->netdev_ops;
4555
4556 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4557 ops->ndo_change_rx_flags(dev, flags);
b6c40d68
PM
4558}
4559
dad9b335 4560static int __dev_set_promiscuity(struct net_device *dev, int inc)
1da177e4 4561{
b536db93 4562 unsigned int old_flags = dev->flags;
d04a48b0
EB
4563 kuid_t uid;
4564 kgid_t gid;
1da177e4 4565
24023451
PM
4566 ASSERT_RTNL();
4567
dad9b335
WC
4568 dev->flags |= IFF_PROMISC;
4569 dev->promiscuity += inc;
4570 if (dev->promiscuity == 0) {
4571 /*
4572 * Avoid overflow.
4573 * If inc causes overflow, untouch promisc and return error.
4574 */
4575 if (inc < 0)
4576 dev->flags &= ~IFF_PROMISC;
4577 else {
4578 dev->promiscuity -= inc;
7b6cd1ce
JP
4579 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4580 dev->name);
dad9b335
WC
4581 return -EOVERFLOW;
4582 }
4583 }
52609c0b 4584 if (dev->flags != old_flags) {
7b6cd1ce
JP
4585 pr_info("device %s %s promiscuous mode\n",
4586 dev->name,
4587 dev->flags & IFF_PROMISC ? "entered" : "left");
8192b0c4
DH
4588 if (audit_enabled) {
4589 current_uid_gid(&uid, &gid);
7759db82
KHK
4590 audit_log(current->audit_context, GFP_ATOMIC,
4591 AUDIT_ANOM_PROMISCUOUS,
4592 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4593 dev->name, (dev->flags & IFF_PROMISC),
4594 (old_flags & IFF_PROMISC),
e1760bd5 4595 from_kuid(&init_user_ns, audit_get_loginuid(current)),
d04a48b0
EB
4596 from_kuid(&init_user_ns, uid),
4597 from_kgid(&init_user_ns, gid),
7759db82 4598 audit_get_sessionid(current));
8192b0c4 4599 }
24023451 4600
b6c40d68 4601 dev_change_rx_flags(dev, IFF_PROMISC);
1da177e4 4602 }
dad9b335 4603 return 0;
1da177e4
LT
4604}
4605
4417da66
PM
4606/**
4607 * dev_set_promiscuity - update promiscuity count on a device
4608 * @dev: device
4609 * @inc: modifier
4610 *
4611 * Add or remove promiscuity from a device. While the count in the device
4612 * remains above zero the interface remains promiscuous. Once it hits zero
4613 * the device reverts back to normal filtering operation. A negative inc
4614 * value is used to drop promiscuity on the device.
dad9b335 4615 * Return 0 if successful or a negative errno code on error.
4417da66 4616 */
dad9b335 4617int dev_set_promiscuity(struct net_device *dev, int inc)
4417da66 4618{
b536db93 4619 unsigned int old_flags = dev->flags;
dad9b335 4620 int err;
4417da66 4621
dad9b335 4622 err = __dev_set_promiscuity(dev, inc);
4b5a698e 4623 if (err < 0)
dad9b335 4624 return err;
4417da66
PM
4625 if (dev->flags != old_flags)
4626 dev_set_rx_mode(dev);
dad9b335 4627 return err;
4417da66 4628}
d1b19dff 4629EXPORT_SYMBOL(dev_set_promiscuity);
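The reference-count semantics above mean every increment must eventually be balanced by a matching decrement; a small, hypothetical packet-tap example:

static int demo_tap_start(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);	/* may fail with -EOVERFLOW */
	rtnl_unlock();
	return err;
}

static void demo_tap_stop(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);		/* drop our reference */
	rtnl_unlock();
}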
4417da66 4630
1da177e4
LT
4631/**
4632 * dev_set_allmulti - update allmulti count on a device
4633 * @dev: device
4634 * @inc: modifier
4635 *
4636 * Add or remove reception of all multicast frames to a device. While the
4637 * count in the device remains above zero the interface remains listening
4638 * to all multicast frames. Once it hits zero the device reverts back to normal
4639 * filtering operation. A negative @inc value is used to drop the counter
4640 * when releasing a resource needing all multicasts.
dad9b335 4641 * Return 0 if successful or a negative errno code on error.
1da177e4
LT
4642 */
4643
dad9b335 4644int dev_set_allmulti(struct net_device *dev, int inc)
1da177e4 4645{
b536db93 4646 unsigned int old_flags = dev->flags;
1da177e4 4647
24023451
PM
4648 ASSERT_RTNL();
4649
1da177e4 4650 dev->flags |= IFF_ALLMULTI;
dad9b335
WC
4651 dev->allmulti += inc;
4652 if (dev->allmulti == 0) {
4653 /*
4654 * Avoid overflow.
4655 * If inc causes overflow, untouch allmulti and return error.
4656 */
4657 if (inc < 0)
4658 dev->flags &= ~IFF_ALLMULTI;
4659 else {
4660 dev->allmulti -= inc;
7b6cd1ce
JP
4661 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4662 dev->name);
dad9b335
WC
4663 return -EOVERFLOW;
4664 }
4665 }
24023451 4666 if (dev->flags ^ old_flags) {
b6c40d68 4667 dev_change_rx_flags(dev, IFF_ALLMULTI);
4417da66 4668 dev_set_rx_mode(dev);
24023451 4669 }
dad9b335 4670 return 0;
4417da66 4671}
d1b19dff 4672EXPORT_SYMBOL(dev_set_allmulti);
4417da66
PM
4673
4674/*
4675 * Upload unicast and multicast address lists to device and
4676 * configure RX filtering. When the device doesn't support unicast
53ccaae1 4677 * filtering it is put in promiscuous mode while unicast addresses
4417da66
PM
4678 * are present.
4679 */
4680void __dev_set_rx_mode(struct net_device *dev)
4681{
d314774c
SH
4682 const struct net_device_ops *ops = dev->netdev_ops;
4683
4417da66
PM
4684 /* dev_open will call this function so the list will stay sane. */
4685 if (!(dev->flags&IFF_UP))
4686 return;
4687
4688 if (!netif_device_present(dev))
40b77c94 4689 return;
4417da66 4690
01789349 4691 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4417da66
PM
4692 /* Unicast address changes may only happen under the rtnl,
4693 * therefore calling __dev_set_promiscuity here is safe.
4694 */
32e7bfc4 4695 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4417da66 4696 __dev_set_promiscuity(dev, 1);
2d348d1f 4697 dev->uc_promisc = true;
32e7bfc4 4698 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4417da66 4699 __dev_set_promiscuity(dev, -1);
2d348d1f 4700 dev->uc_promisc = false;
4417da66 4701 }
4417da66 4702 }
01789349
JP
4703
4704 if (ops->ndo_set_rx_mode)
4705 ops->ndo_set_rx_mode(dev);
4417da66
PM
4706}
4707
4708void dev_set_rx_mode(struct net_device *dev)
4709{
b9e40857 4710 netif_addr_lock_bh(dev);
4417da66 4711 __dev_set_rx_mode(dev);
b9e40857 4712 netif_addr_unlock_bh(dev);
1da177e4
LT
4713}
4714
f0db275a
SH
4715/**
4716 * dev_get_flags - get flags reported to userspace
4717 * @dev: device
4718 *
4719 * Get the combination of flag bits exported through APIs to userspace.
4720 */
95c96174 4721unsigned int dev_get_flags(const struct net_device *dev)
1da177e4 4722{
95c96174 4723 unsigned int flags;
1da177e4
LT
4724
4725 flags = (dev->flags & ~(IFF_PROMISC |
4726 IFF_ALLMULTI |
b00055aa
SR
4727 IFF_RUNNING |
4728 IFF_LOWER_UP |
4729 IFF_DORMANT)) |
1da177e4
LT
4730 (dev->gflags & (IFF_PROMISC |
4731 IFF_ALLMULTI));
4732
b00055aa
SR
4733 if (netif_running(dev)) {
4734 if (netif_oper_up(dev))
4735 flags |= IFF_RUNNING;
4736 if (netif_carrier_ok(dev))
4737 flags |= IFF_LOWER_UP;
4738 if (netif_dormant(dev))
4739 flags |= IFF_DORMANT;
4740 }
1da177e4
LT
4741
4742 return flags;
4743}
d1b19dff 4744EXPORT_SYMBOL(dev_get_flags);
1da177e4 4745
bd380811 4746int __dev_change_flags(struct net_device *dev, unsigned int flags)
1da177e4 4747{
b536db93 4748 unsigned int old_flags = dev->flags;
bd380811 4749 int ret;
1da177e4 4750
24023451
PM
4751 ASSERT_RTNL();
4752
1da177e4
LT
4753 /*
4754 * Set the flags on our device.
4755 */
4756
4757 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4758 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4759 IFF_AUTOMEDIA)) |
4760 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4761 IFF_ALLMULTI));
4762
4763 /*
4764 * Load in the correct multicast list now the flags have changed.
4765 */
4766
b6c40d68
PM
4767 if ((old_flags ^ flags) & IFF_MULTICAST)
4768 dev_change_rx_flags(dev, IFF_MULTICAST);
24023451 4769
4417da66 4770 dev_set_rx_mode(dev);
1da177e4
LT
4771
4772 /*
4773 * Have we downed the interface. We handle IFF_UP ourselves
4774 * according to user attempts to set it, rather than blindly
4775 * setting it.
4776 */
4777
4778 ret = 0;
4779 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
bd380811 4780 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
1da177e4
LT
4781
4782 if (!ret)
4417da66 4783 dev_set_rx_mode(dev);
1da177e4
LT
4784 }
4785
1da177e4 4786 if ((flags ^ dev->gflags) & IFF_PROMISC) {
d1b19dff
ED
4787 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4788
1da177e4
LT
4789 dev->gflags ^= IFF_PROMISC;
4790 dev_set_promiscuity(dev, inc);
4791 }
4792
4793 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4794 is important. Some (broken) drivers set IFF_PROMISC when
4795 IFF_ALLMULTI is requested, without asking us and without reporting it.
4796 */
4797 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
d1b19dff
ED
4798 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4799
1da177e4
LT
4800 dev->gflags ^= IFF_ALLMULTI;
4801 dev_set_allmulti(dev, inc);
4802 }
4803
bd380811
PM
4804 return ret;
4805}
4806
4807void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4808{
4809 unsigned int changes = dev->flags ^ old_flags;
4810
4811 if (changes & IFF_UP) {
4812 if (dev->flags & IFF_UP)
4813 call_netdevice_notifiers(NETDEV_UP, dev);
4814 else
4815 call_netdevice_notifiers(NETDEV_DOWN, dev);
4816 }
4817
4818 if (dev->flags & IFF_UP &&
be9efd36
JP
4819 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
4820 struct netdev_notifier_change_info change_info;
4821
4822 change_info.flags_changed = changes;
4823 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
4824 &change_info.info);
4825 }
bd380811
PM
4826}
4827
4828/**
4829 * dev_change_flags - change device settings
4830 * @dev: device
4831 * @flags: device state flags
4832 *
4833 * Change settings on device based state flags. The flags are
4834 * in the userspace exported format.
4835 */
b536db93 4836int dev_change_flags(struct net_device *dev, unsigned int flags)
bd380811 4837{
b536db93
ED
4838 int ret;
4839 unsigned int changes, old_flags = dev->flags;
bd380811
PM
4840
4841 ret = __dev_change_flags(dev, flags);
4842 if (ret < 0)
4843 return ret;
4844
4845 changes = old_flags ^ dev->flags;
7c355f53
TG
4846 if (changes)
4847 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
1da177e4 4848
bd380811 4849 __dev_notify_flags(dev, old_flags);
1da177e4
LT
4850 return ret;
4851}
d1b19dff 4852EXPORT_SYMBOL(dev_change_flags);
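In-kernel callers use dev_change_flags() much like the SIOCSIFFLAGS ioctl path does; a minimal sketch that brings an interface up under RTNL (demo_bring_up() is hypothetical):

static int demo_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev->flags | IFF_UP);
	rtnl_unlock();
	return err;	/* 0, or the error from __dev_open() */
}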
1da177e4 4853
f0db275a
SH
4854/**
4855 * dev_set_mtu - Change maximum transfer unit
4856 * @dev: device
4857 * @new_mtu: new transfer unit
4858 *
4859 * Change the maximum transfer size of the network device.
4860 */
1da177e4
LT
4861int dev_set_mtu(struct net_device *dev, int new_mtu)
4862{
d314774c 4863 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
4864 int err;
4865
4866 if (new_mtu == dev->mtu)
4867 return 0;
4868
4869 /* MTU must be positive. */
4870 if (new_mtu < 0)
4871 return -EINVAL;
4872
4873 if (!netif_device_present(dev))
4874 return -ENODEV;
4875
4876 err = 0;
d314774c
SH
4877 if (ops->ndo_change_mtu)
4878 err = ops->ndo_change_mtu(dev, new_mtu);
1da177e4
LT
4879 else
4880 dev->mtu = new_mtu;
d314774c 4881
e3d8fabe 4882 if (!err)
056925ab 4883 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
1da177e4
LT
4884 return err;
4885}
d1b19dff 4886EXPORT_SYMBOL(dev_set_mtu);
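Callers hold RTNL, since ndo_change_mtu() and the NETDEV_CHANGEMTU notifier expect it; an illustrative sketch (the 9000-byte jumbo value is only an example, and the driver may still reject it):

static int demo_enable_jumbo(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);	/* driver's ndo_change_mtu may refuse */
	rtnl_unlock();
	return err;
}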
1da177e4 4887
cbda10fa
VD
4888/**
4889 * dev_set_group - Change group this device belongs to
4890 * @dev: device
4891 * @new_group: group this device should belong to
4892 */
4893void dev_set_group(struct net_device *dev, int new_group)
4894{
4895 dev->group = new_group;
4896}
4897EXPORT_SYMBOL(dev_set_group);
4898
f0db275a
SH
4899/**
4900 * dev_set_mac_address - Change Media Access Control Address
4901 * @dev: device
4902 * @sa: new address
4903 *
4904 * Change the hardware (MAC) address of the device
4905 */
1da177e4
LT
4906int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4907{
d314774c 4908 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
4909 int err;
4910
d314774c 4911 if (!ops->ndo_set_mac_address)
1da177e4
LT
4912 return -EOPNOTSUPP;
4913 if (sa->sa_family != dev->type)
4914 return -EINVAL;
4915 if (!netif_device_present(dev))
4916 return -ENODEV;
d314774c 4917 err = ops->ndo_set_mac_address(dev, sa);
f6521516
JP
4918 if (err)
4919 return err;
fbdeca2d 4920 dev->addr_assign_type = NET_ADDR_SET;
f6521516 4921 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
7bf23575 4922 add_device_randomness(dev->dev_addr, dev->addr_len);
f6521516 4923 return 0;
1da177e4 4924}
d1b19dff 4925EXPORT_SYMBOL(dev_set_mac_address);
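The new address is passed as a struct sockaddr whose sa_family must match dev->type (ARPHRD_ETHER for Ethernet); a hedged example, with demo_set_mac() and its mac argument purely illustrative:

static int demo_set_mac(struct net_device *dev, const u8 *mac)
{
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;		/* e.g. ARPHRD_ETHER */
	memcpy(sa.sa_data, mac, dev->addr_len);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);	/* NETDEV_CHANGEADDR on success */
	rtnl_unlock();
	return err;
}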
1da177e4 4926
4bf84c35
JP
4927/**
4928 * dev_change_carrier - Change device carrier
4929 * @dev: device
691b3b7e 4930 * @new_carrier: new value
4bf84c35
JP
4931 *
4932 * Change device carrier
4933 */
4934int dev_change_carrier(struct net_device *dev, bool new_carrier)
4935{
4936 const struct net_device_ops *ops = dev->netdev_ops;
4937
4938 if (!ops->ndo_change_carrier)
4939 return -EOPNOTSUPP;
4940 if (!netif_device_present(dev))
4941 return -ENODEV;
4942 return ops->ndo_change_carrier(dev, new_carrier);
4943}
4944EXPORT_SYMBOL(dev_change_carrier);
4945
1da177e4
LT
4946/**
4947 * dev_new_index - allocate an ifindex
c4ea43c5 4948 * @net: the applicable net namespace
1da177e4
LT
4949 *
4950 * Returns a suitable unique value for a new device interface
4951 * number. The caller must hold the rtnl semaphore or the
4952 * dev_base_lock to be sure it remains unique.
4953 */
881d966b 4954static int dev_new_index(struct net *net)
1da177e4 4955{
aa79e66e 4956 int ifindex = net->ifindex;
1da177e4
LT
4957 for (;;) {
4958 if (++ifindex <= 0)
4959 ifindex = 1;
881d966b 4960 if (!__dev_get_by_index(net, ifindex))
aa79e66e 4961 return net->ifindex = ifindex;
1da177e4
LT
4962 }
4963}
4964
1da177e4 4965 /* Delayed registration/unregistration */
3b5b34fd 4966static LIST_HEAD(net_todo_list);
1da177e4 4967
6f05f629 4968static void net_set_todo(struct net_device *dev)
1da177e4 4969{
1da177e4 4970 list_add_tail(&dev->todo_list, &net_todo_list);
1da177e4
LT
4971}
4972
9b5e383c 4973static void rollback_registered_many(struct list_head *head)
93ee31f1 4974{
e93737b0 4975 struct net_device *dev, *tmp;
9b5e383c 4976
93ee31f1
DL
4977 BUG_ON(dev_boot_phase);
4978 ASSERT_RTNL();
4979
e93737b0 4980 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
9b5e383c 4981 /* Some devices call unregister without ever having registered,
e93737b0
KK
4982 * as part of initialization unwind. Remove those
4983 * devices and proceed with the remaining ones.
9b5e383c
ED
4984 */
4985 if (dev->reg_state == NETREG_UNINITIALIZED) {
7b6cd1ce
JP
4986 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
4987 dev->name, dev);
93ee31f1 4988
9b5e383c 4989 WARN_ON(1);
e93737b0
KK
4990 list_del(&dev->unreg_list);
4991 continue;
9b5e383c 4992 }
449f4544 4993 dev->dismantle = true;
9b5e383c 4994 BUG_ON(dev->reg_state != NETREG_REGISTERED);
44345724 4995 }
93ee31f1 4996
44345724
OP
4997 /* If device is running, close it first. */
4998 dev_close_many(head);
93ee31f1 4999
44345724 5000 list_for_each_entry(dev, head, unreg_list) {
9b5e383c
ED
5001 /* And unlink it from device chain. */
5002 unlist_netdevice(dev);
93ee31f1 5003
9b5e383c
ED
5004 dev->reg_state = NETREG_UNREGISTERING;
5005 }
93ee31f1
DL
5006
5007 synchronize_net();
5008
9b5e383c
ED
5009 list_for_each_entry(dev, head, unreg_list) {
5010 /* Shutdown queueing discipline. */
5011 dev_shutdown(dev);
93ee31f1
DL
5012
5013
9b5e383c
ED
5014 /* Notify protocols that we are about to destroy
5015 this device. They should clean up all their state.
5016 */
5017 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
93ee31f1 5018
a2835763
PM
5019 if (!dev->rtnl_link_ops ||
5020 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5021 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5022
9b5e383c
ED
5023 /*
5024 * Flush the unicast and multicast chains
5025 */
a748ee24 5026 dev_uc_flush(dev);
22bedad3 5027 dev_mc_flush(dev);
93ee31f1 5028
9b5e383c
ED
5029 if (dev->netdev_ops->ndo_uninit)
5030 dev->netdev_ops->ndo_uninit(dev);
93ee31f1 5031
9ff162a8
JP
5032 /* Notifier chain MUST detach us all upper devices. */
5033 WARN_ON(netdev_has_any_upper_dev(dev));
93ee31f1 5034
9b5e383c
ED
5035 /* Remove entries from kobject tree */
5036 netdev_unregister_kobject(dev);
024e9679
AD
5037#ifdef CONFIG_XPS
5038 /* Remove XPS queueing entries */
5039 netif_reset_xps_queues_gt(dev, 0);
5040#endif
9b5e383c 5041 }
93ee31f1 5042
850a545b 5043 synchronize_net();
395264d5 5044
a5ee1551 5045 list_for_each_entry(dev, head, unreg_list)
9b5e383c
ED
5046 dev_put(dev);
5047}
5048
5049static void rollback_registered(struct net_device *dev)
5050{
5051 LIST_HEAD(single);
5052
5053 list_add(&dev->unreg_list, &single);
5054 rollback_registered_many(&single);
ceaaec98 5055 list_del(&single);
93ee31f1
DL
5056}
5057
c8f44aff
MM
5058static netdev_features_t netdev_fix_features(struct net_device *dev,
5059 netdev_features_t features)
b63365a2 5060{
57422dc5
MM
5061 /* Fix illegal checksum combinations */
5062 if ((features & NETIF_F_HW_CSUM) &&
5063 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 5064 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
57422dc5
MM
5065 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5066 }
5067
b63365a2 5068 /* TSO requires that SG is present as well. */
ea2d3688 5069 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6f404e44 5070 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
ea2d3688 5071 features &= ~NETIF_F_ALL_TSO;
b63365a2
HX
5072 }
5073
ec5f0615
PS
5074 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5075 !(features & NETIF_F_IP_CSUM)) {
5076 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5077 features &= ~NETIF_F_TSO;
5078 features &= ~NETIF_F_TSO_ECN;
5079 }
5080
5081 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5082 !(features & NETIF_F_IPV6_CSUM)) {
5083 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5084 features &= ~NETIF_F_TSO6;
5085 }
5086
31d8b9e0
BH
5087 /* TSO ECN requires that TSO is present as well. */
5088 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5089 features &= ~NETIF_F_TSO_ECN;
5090
212b573f
MM
5091 /* Software GSO depends on SG. */
5092 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6f404e44 5093 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
212b573f
MM
5094 features &= ~NETIF_F_GSO;
5095 }
5096
acd1130e 5097 /* UFO needs SG and checksumming */
b63365a2 5098 if (features & NETIF_F_UFO) {
79032644
MM
5099 /* maybe split UFO into V4 and V6? */
5100 if (!((features & NETIF_F_GEN_CSUM) ||
5101 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5102 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 5103 netdev_dbg(dev,
acd1130e 5104 "Dropping NETIF_F_UFO since no checksum offload features.\n");
b63365a2
HX
5105 features &= ~NETIF_F_UFO;
5106 }
5107
5108 if (!(features & NETIF_F_SG)) {
6f404e44 5109 netdev_dbg(dev,
acd1130e 5110 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
b63365a2
HX
5111 features &= ~NETIF_F_UFO;
5112 }
5113 }
5114
5115 return features;
5116}
b63365a2 5117
6cb6a27c 5118int __netdev_update_features(struct net_device *dev)
5455c699 5119{
c8f44aff 5120 netdev_features_t features;
5455c699
MM
5121 int err = 0;
5122
87267485
MM
5123 ASSERT_RTNL();
5124
5455c699
MM
5125 features = netdev_get_wanted_features(dev);
5126
5127 if (dev->netdev_ops->ndo_fix_features)
5128 features = dev->netdev_ops->ndo_fix_features(dev, features);
5129
5130 /* driver might be less strict about feature dependencies */
5131 features = netdev_fix_features(dev, features);
5132
5133 if (dev->features == features)
6cb6a27c 5134 return 0;
5455c699 5135
c8f44aff
MM
5136 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5137 &dev->features, &features);
5455c699
MM
5138
5139 if (dev->netdev_ops->ndo_set_features)
5140 err = dev->netdev_ops->ndo_set_features(dev, features);
5141
6cb6a27c 5142 if (unlikely(err < 0)) {
5455c699 5143 netdev_err(dev,
c8f44aff
MM
5144 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5145 err, &features, &dev->features);
6cb6a27c
MM
5146 return -1;
5147 }
5148
5149 if (!err)
5150 dev->features = features;
5151
5152 return 1;
5153}
5154
afe12cc8
MM
5155/**
5156 * netdev_update_features - recalculate device features
5157 * @dev: the device to check
5158 *
5159 * Recalculate dev->features set and send notifications if it
5160 * has changed. Should be called after driver or hardware dependent
5161 * conditions that influence the features might have changed.
5162 */
6cb6a27c
MM
5163void netdev_update_features(struct net_device *dev)
5164{
5165 if (__netdev_update_features(dev))
5166 netdev_features_change(dev);
5455c699
MM
5167}
5168EXPORT_SYMBOL(netdev_update_features);
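A driver would typically call this after toggling bits in dev->hw_features or dev->wanted_features from its own configuration path; a brief, hypothetical sketch:

/* Illustrative: the hardware can no longer do scatter-gather, so drop
 * NETIF_F_SG and let the core re-run netdev_fix_features() and send a
 * notification if dev->features actually changed.
 */
static void demo_disable_sg(struct net_device *dev)
{
	ASSERT_RTNL();
	dev->hw_features &= ~NETIF_F_SG;
	dev->wanted_features &= ~NETIF_F_SG;
	netdev_update_features(dev);
}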
5169
afe12cc8
MM
5170/**
5171 * netdev_change_features - recalculate device features
5172 * @dev: the device to check
5173 *
5174 * Recalculate dev->features set and send notifications even
5175 * if they have not changed. Should be called instead of
5176 * netdev_update_features() if also dev->vlan_features might
5177 * have changed to allow the changes to be propagated to stacked
5178 * VLAN devices.
5179 */
5180void netdev_change_features(struct net_device *dev)
5181{
5182 __netdev_update_features(dev);
5183 netdev_features_change(dev);
5184}
5185EXPORT_SYMBOL(netdev_change_features);
5186
fc4a7489
PM
5187/**
5188 * netif_stacked_transfer_operstate - transfer operstate
5189 * @rootdev: the root or lower level device to transfer state from
5190 * @dev: the device to transfer operstate to
5191 *
5192 * Transfer operational state from root to device. This is normally
5193 * called when a stacking relationship exists between the root
5194 * device and the device (a leaf device).
5195 */
5196void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5197 struct net_device *dev)
5198{
5199 if (rootdev->operstate == IF_OPER_DORMANT)
5200 netif_dormant_on(dev);
5201 else
5202 netif_dormant_off(dev);
5203
5204 if (netif_carrier_ok(rootdev)) {
5205 if (!netif_carrier_ok(dev))
5206 netif_carrier_on(dev);
5207 } else {
5208 if (netif_carrier_ok(dev))
5209 netif_carrier_off(dev);
5210 }
5211}
5212EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5213
bf264145 5214#ifdef CONFIG_RPS
1b4bf461
ED
5215static int netif_alloc_rx_queues(struct net_device *dev)
5216{
1b4bf461 5217 unsigned int i, count = dev->num_rx_queues;
bd25fa7b 5218 struct netdev_rx_queue *rx;
1b4bf461 5219
bd25fa7b 5220 BUG_ON(count < 1);
1b4bf461 5221
bd25fa7b 5222 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
62b5942a 5223 if (!rx)
bd25fa7b 5224 return -ENOMEM;
62b5942a 5225
bd25fa7b
TH
5226 dev->_rx = rx;
5227
bd25fa7b 5228 for (i = 0; i < count; i++)
fe822240 5229 rx[i].dev = dev;
1b4bf461
ED
5230 return 0;
5231}
bf264145 5232#endif
1b4bf461 5233
aa942104
CG
5234static void netdev_init_one_queue(struct net_device *dev,
5235 struct netdev_queue *queue, void *_unused)
5236{
5237 /* Initialize queue lock */
5238 spin_lock_init(&queue->_xmit_lock);
5239 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5240 queue->xmit_lock_owner = -1;
b236da69 5241 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
aa942104 5242 queue->dev = dev;
114cf580
TH
5243#ifdef CONFIG_BQL
5244 dql_init(&queue->dql, HZ);
5245#endif
aa942104
CG
5246}
5247
60877a32
ED
5248static void netif_free_tx_queues(struct net_device *dev)
5249{
5250 if (is_vmalloc_addr(dev->_tx))
5251 vfree(dev->_tx);
5252 else
5253 kfree(dev->_tx);
5254}
5255
e6484930
TH
5256static int netif_alloc_netdev_queues(struct net_device *dev)
5257{
5258 unsigned int count = dev->num_tx_queues;
5259 struct netdev_queue *tx;
60877a32 5260 size_t sz = count * sizeof(*tx);
e6484930 5261
60877a32 5262 BUG_ON(count < 1 || count > 0xffff);
62b5942a 5263
60877a32
ED
5264 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5265 if (!tx) {
5266 tx = vzalloc(sz);
5267 if (!tx)
5268 return -ENOMEM;
5269 }
e6484930 5270 dev->_tx = tx;
1d24eb48 5271
e6484930
TH
5272 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5273 spin_lock_init(&dev->tx_global_lock);
aa942104
CG
5274
5275 return 0;
e6484930
TH
5276}
5277
1da177e4
LT
5278/**
5279 * register_netdevice - register a network device
5280 * @dev: device to register
5281 *
5282 * Take a completed network device structure and add it to the kernel
5283 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5284 * chain. 0 is returned on success. A negative errno code is returned
5285 * on a failure to set up the device, or if the name is a duplicate.
5286 *
5287 * Callers must hold the rtnl semaphore. You may want
5288 * register_netdev() instead of this.
5289 *
5290 * BUGS:
5291 * The locking appears insufficient to guarantee two parallel registers
5292 * will not get the same name.
5293 */
5294
5295int register_netdevice(struct net_device *dev)
5296{
1da177e4 5297 int ret;
d314774c 5298 struct net *net = dev_net(dev);
1da177e4
LT
5299
5300 BUG_ON(dev_boot_phase);
5301 ASSERT_RTNL();
5302
b17a7c17
SH
5303 might_sleep();
5304
1da177e4
LT
5305 /* When net_device's are persistent, this will be fatal. */
5306 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
d314774c 5307 BUG_ON(!net);
1da177e4 5308
f1f28aa3 5309 spin_lock_init(&dev->addr_list_lock);
cf508b12 5310 netdev_set_addr_lockdep_class(dev);
1da177e4 5311
1da177e4
LT
5312 dev->iflink = -1;
5313
828de4f6 5314 ret = dev_get_valid_name(net, dev, dev->name);
0696c3a8
PP
5315 if (ret < 0)
5316 goto out;
5317
1da177e4 5318 /* Init, if this function is available */
d314774c
SH
5319 if (dev->netdev_ops->ndo_init) {
5320 ret = dev->netdev_ops->ndo_init(dev);
1da177e4
LT
5321 if (ret) {
5322 if (ret > 0)
5323 ret = -EIO;
90833aa4 5324 goto out;
1da177e4
LT
5325 }
5326 }
4ec93edb 5327
f646968f
PM
5328 if (((dev->hw_features | dev->features) &
5329 NETIF_F_HW_VLAN_CTAG_FILTER) &&
d2ed273d
MM
5330 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5331 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5332 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5333 ret = -EINVAL;
5334 goto err_uninit;
5335 }
5336
9c7dafbf
PE
5337 ret = -EBUSY;
5338 if (!dev->ifindex)
5339 dev->ifindex = dev_new_index(net);
5340 else if (__dev_get_by_index(net, dev->ifindex))
5341 goto err_uninit;
5342
1da177e4
LT
5343 if (dev->iflink == -1)
5344 dev->iflink = dev->ifindex;
5345
5455c699
MM
5346 /* Transfer changeable features to wanted_features and enable
5347 * software offloads (GSO and GRO).
5348 */
5349 dev->hw_features |= NETIF_F_SOFT_FEATURES;
14d1232f
MM
5350 dev->features |= NETIF_F_SOFT_FEATURES;
5351 dev->wanted_features = dev->features & dev->hw_features;
1da177e4 5352
c6e1a0d1 5353 /* Turn on no cache copy if HW is doing checksum */
34324dc2
MM
5354 if (!(dev->flags & IFF_LOOPBACK)) {
5355 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5356 if (dev->features & NETIF_F_ALL_CSUM) {
5357 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5358 dev->features |= NETIF_F_NOCACHE_COPY;
5359 }
c6e1a0d1
TH
5360 }
5361
1180e7d6 5362 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
16c3ea78 5363 */
1180e7d6 5364 dev->vlan_features |= NETIF_F_HIGHDMA;
16c3ea78 5365
ee579677
PS
5366 /* Make NETIF_F_SG inheritable to tunnel devices.
5367 */
5368 dev->hw_enc_features |= NETIF_F_SG;
5369
0d89d203
SH
5370 /* Make NETIF_F_SG inheritable to MPLS.
5371 */
5372 dev->mpls_features |= NETIF_F_SG;
5373
7ffbe3fd
JB
5374 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5375 ret = notifier_to_errno(ret);
5376 if (ret)
5377 goto err_uninit;
5378
8b41d188 5379 ret = netdev_register_kobject(dev);
b17a7c17 5380 if (ret)
7ce1b0ed 5381 goto err_uninit;
b17a7c17
SH
5382 dev->reg_state = NETREG_REGISTERED;
5383
6cb6a27c 5384 __netdev_update_features(dev);
8e9b59b2 5385
1da177e4
LT
5386 /*
5387 * Default initial state at registry is that the
5388 * device is present.
5389 */
5390
5391 set_bit(__LINK_STATE_PRESENT, &dev->state);
5392
8f4cccbb
BH
5393 linkwatch_init_dev(dev);
5394
1da177e4 5395 dev_init_scheduler(dev);
1da177e4 5396 dev_hold(dev);
ce286d32 5397 list_netdevice(dev);
7bf23575 5398 add_device_randomness(dev->dev_addr, dev->addr_len);
1da177e4 5399
948b337e
JP
5400 /* If the device has permanent device address, driver should
5401 * set dev_addr and also addr_assign_type should be set to
5402 * NET_ADDR_PERM (default value).
5403 */
5404 if (dev->addr_assign_type == NET_ADDR_PERM)
5405 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5406
1da177e4 5407 /* Notify protocols that a new device appeared. */
056925ab 5408 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
fcc5a03a 5409 ret = notifier_to_errno(ret);
93ee31f1
DL
5410 if (ret) {
5411 rollback_registered(dev);
5412 dev->reg_state = NETREG_UNREGISTERED;
5413 }
d90a909e
EB
5414 /*
5415 * Prevent userspace races by waiting until the network
5416 * device is fully setup before sending notifications.
5417 */
a2835763
PM
5418 if (!dev->rtnl_link_ops ||
5419 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5420 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
1da177e4
LT
5421
5422out:
5423 return ret;
7ce1b0ed
HX
5424
5425err_uninit:
d314774c
SH
5426 if (dev->netdev_ops->ndo_uninit)
5427 dev->netdev_ops->ndo_uninit(dev);
7ce1b0ed 5428 goto out;
1da177e4 5429}
d1b19dff 5430EXPORT_SYMBOL(register_netdevice);
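/*
 * Illustrative sketch (hypothetical "foo" code, not from this file):
 * register_netdevice() is meant for callers that already hold the RTNL
 * lock, such as an rtnl_link_ops->newlink() handler.
 */
static int foo_newlink_example(struct net_device *dev)
{
	int err;

	ASSERT_RTNL();			/* rtnetlink core holds the lock here */

	err = register_netdevice(dev);
	if (err)
		return err;

	netif_carrier_off(dev);		/* common follow-up while still under RTNL */
	return 0;
}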
1da177e4 5431
937f1ba5
BH
5432/**
5433 * init_dummy_netdev - init a dummy network device for NAPI
5434 * @dev: device to init
5435 *
5436 * This takes a network device structure and initializes the minimum
5437 * number of fields so it can be used to schedule NAPI polls without
5438 * registering a full blown interface. This is to be used by drivers
5439 * that need to tie several hardware interfaces to a single NAPI
5440 * poll scheduler due to HW limitations.
5441 */
5442int init_dummy_netdev(struct net_device *dev)
5443{
5444 /* Clear everything. Note we don't initialize spinlocks
5445 * as they aren't supposed to be taken by any of the
5446 * NAPI code and this dummy netdev is supposed to be
5447 * only ever used for NAPI polls
5448 */
5449 memset(dev, 0, sizeof(struct net_device));
5450
5451 /* make sure we BUG if trying to hit standard
5452 * register/unregister code path
5453 */
5454 dev->reg_state = NETREG_DUMMY;
5455
937f1ba5
BH
5456 /* NAPI wants this */
5457 INIT_LIST_HEAD(&dev->napi_list);
5458
5459 /* a dummy interface is started by default */
5460 set_bit(__LINK_STATE_PRESENT, &dev->state);
5461 set_bit(__LINK_STATE_START, &dev->state);
5462
29b4433d
ED
5463 /* Note : We don't allocate pcpu_refcnt for dummy devices,
5464 * because users of this 'device' don't need to change
5465 * its refcount.
5466 */
5467
937f1ba5
BH
5468 return 0;
5469}
5470EXPORT_SYMBOL_GPL(init_dummy_netdev);
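/*
 * Illustrative sketch (hypothetical "foo" driver): hardware with several
 * rings sharing one NAPI context can hang its napi_struct off a dummy
 * netdev that is never registered. The weight of 64 is an arbitrary
 * assumption.
 */
struct foo_napi_anchor_example {
	struct net_device napi_dev;	/* dummy device, only a NAPI anchor */
	struct napi_struct napi;
};

static void foo_init_napi_example(struct foo_napi_anchor_example *a,
				  int (*poll)(struct napi_struct *, int))
{
	init_dummy_netdev(&a->napi_dev);
	netif_napi_add(&a->napi_dev, &a->napi, poll, 64);
	napi_enable(&a->napi);
}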
5471
5472
1da177e4
LT
5473/**
5474 * register_netdev - register a network device
5475 * @dev: device to register
5476 *
5477 * Take a completed network device structure and add it to the kernel
5478 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5479 * chain. 0 is returned on success. A negative errno code is returned
5480 * on a failure to set up the device, or if the name is a duplicate.
5481 *
38b4da38 5482 * This is a wrapper around register_netdevice that takes the rtnl semaphore
1da177e4
LT
5483 * and expands the device name if you passed a format string to
5484 * alloc_netdev.
5485 */
5486int register_netdev(struct net_device *dev)
5487{
5488 int err;
5489
5490 rtnl_lock();
1da177e4 5491 err = register_netdevice(dev);
1da177e4
LT
5492 rtnl_unlock();
5493 return err;
5494}
5495EXPORT_SYMBOL(register_netdev);
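/*
 * Illustrative sketch (hypothetical probe path): the usual driver pattern
 * built on alloc_etherdev()/register_netdev()/free_netdev(). No private
 * area is allocated in this sketch; real drivers pass their priv size.
 */
static int foo_probe_example(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(0);
	if (!dev)
		return -ENOMEM;

	/* dev->netdev_ops, MAC address, features, etc. would be set up here */

	err = register_netdev(dev);	/* takes the RTNL lock itself */
	if (err) {
		free_netdev(dev);
		return err;
	}
	return 0;
}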
5496
29b4433d
ED
5497int netdev_refcnt_read(const struct net_device *dev)
5498{
5499 int i, refcnt = 0;
5500
5501 for_each_possible_cpu(i)
5502 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5503 return refcnt;
5504}
5505EXPORT_SYMBOL(netdev_refcnt_read);
5506
2c53040f 5507/**
1da177e4 5508 * netdev_wait_allrefs - wait until all references are gone.
3de7a37b 5509 * @dev: target net_device
1da177e4
LT
5510 *
5511 * This is called when unregistering network devices.
5512 *
5513 * Any protocol or device that holds a reference should register
5514 * for netdevice notification, and cleanup and put back the
5515 * reference if they receive an UNREGISTER event.
5516 * We can get stuck here if buggy protocols don't correctly
4ec93edb 5517 * call dev_put.
1da177e4
LT
5518 */
5519static void netdev_wait_allrefs(struct net_device *dev)
5520{
5521 unsigned long rebroadcast_time, warning_time;
29b4433d 5522 int refcnt;
1da177e4 5523
e014debe
ED
5524 linkwatch_forget_dev(dev);
5525
1da177e4 5526 rebroadcast_time = warning_time = jiffies;
29b4433d
ED
5527 refcnt = netdev_refcnt_read(dev);
5528
5529 while (refcnt != 0) {
1da177e4 5530 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6756ae4b 5531 rtnl_lock();
1da177e4
LT
5532
5533 /* Rebroadcast unregister notification */
056925ab 5534 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
1da177e4 5535
748e2d93 5536 __rtnl_unlock();
0115e8e3 5537 rcu_barrier();
748e2d93
ED
5538 rtnl_lock();
5539
0115e8e3 5540 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
1da177e4
LT
5541 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5542 &dev->state)) {
5543 /* We must not have linkwatch events
5544 * pending on unregister. If this
5545 * happens, we simply run the queue
5546 * unscheduled, resulting in a noop
5547 * for this device.
5548 */
5549 linkwatch_run_queue();
5550 }
5551
6756ae4b 5552 __rtnl_unlock();
1da177e4
LT
5553
5554 rebroadcast_time = jiffies;
5555 }
5556
5557 msleep(250);
5558
29b4433d
ED
5559 refcnt = netdev_refcnt_read(dev);
5560
1da177e4 5561 if (time_after(jiffies, warning_time + 10 * HZ)) {
7b6cd1ce
JP
5562 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5563 dev->name, refcnt);
1da177e4
LT
5564 warning_time = jiffies;
5565 }
5566 }
5567}
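/*
 * Illustrative sketch (hypothetical subsystem): anyone keeping a long-lived
 * dev_hold() reference should drop it from its netdevice notifier on
 * NETDEV_UNREGISTER, otherwise the wait loop above spins and warns every
 * 10 seconds. How the device pointer is obtained from 'ptr' differs between
 * kernel versions (directly, or via netdev_notifier_info_to_dev()); the
 * direct cast below is an assumption.
 */
static struct net_device *foo_cached_dev_example;	/* holds a reference */

static int foo_netdev_event_example(struct notifier_block *nb,
				    unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;	/* assumption: legacy notifier convention */

	if (event == NETDEV_UNREGISTER && dev == foo_cached_dev_example) {
		dev_put(foo_cached_dev_example);
		foo_cached_dev_example = NULL;
	}
	return NOTIFY_DONE;
}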
5568
5569/* The sequence is:
5570 *
5571 * rtnl_lock();
5572 * ...
5573 * register_netdevice(x1);
5574 * register_netdevice(x2);
5575 * ...
5576 * unregister_netdevice(y1);
5577 * unregister_netdevice(y2);
5578 * ...
5579 * rtnl_unlock();
5580 * free_netdev(y1);
5581 * free_netdev(y2);
5582 *
58ec3b4d 5583 * We are invoked by rtnl_unlock().
1da177e4 5584 * This allows us to deal with problems:
b17a7c17 5585 * 1) We can delete sysfs objects which invoke hotplug
1da177e4
LT
5586 * without deadlocking with linkwatch via keventd.
5587 * 2) Since we run with the RTNL semaphore not held, we can sleep
5588 * safely in order to wait for the netdev refcnt to drop to zero.
58ec3b4d
HX
5589 *
5590 * We must not return until all unregister events added during
5591 * the interval the lock was held have been completed.
1da177e4 5592 */
1da177e4
LT
5593void netdev_run_todo(void)
5594{
626ab0e6 5595 struct list_head list;
1da177e4 5596
1da177e4 5597 /* Snapshot list, allow later requests */
626ab0e6 5598 list_replace_init(&net_todo_list, &list);
58ec3b4d
HX
5599
5600 __rtnl_unlock();
626ab0e6 5601
0115e8e3
ED
5602
5603 /* Wait for rcu callbacks to finish before next phase */
850a545b
EB
5604 if (!list_empty(&list))
5605 rcu_barrier();
5606
1da177e4
LT
5607 while (!list_empty(&list)) {
5608 struct net_device *dev
e5e26d75 5609 = list_first_entry(&list, struct net_device, todo_list);
1da177e4
LT
5610 list_del(&dev->todo_list);
5611
748e2d93 5612 rtnl_lock();
0115e8e3 5613 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
748e2d93 5614 __rtnl_unlock();
0115e8e3 5615
b17a7c17 5616 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7b6cd1ce 5617 pr_err("network todo '%s' but state %d\n",
b17a7c17
SH
5618 dev->name, dev->reg_state);
5619 dump_stack();
5620 continue;
5621 }
1da177e4 5622
b17a7c17 5623 dev->reg_state = NETREG_UNREGISTERED;
1da177e4 5624
152102c7 5625 on_each_cpu(flush_backlog, dev, 1);
6e583ce5 5626
b17a7c17 5627 netdev_wait_allrefs(dev);
1da177e4 5628
b17a7c17 5629 /* paranoia */
29b4433d 5630 BUG_ON(netdev_refcnt_read(dev));
33d480ce
ED
5631 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5632 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
547b792c 5633 WARN_ON(dev->dn_ptr);
1da177e4 5634
b17a7c17
SH
5635 if (dev->destructor)
5636 dev->destructor(dev);
9093bbb2
SH
5637
5638 /* Free network device */
5639 kobject_put(&dev->dev.kobj);
1da177e4 5640 }
1da177e4
LT
5641}
5642
3cfde79c
BH
5643/* Convert net_device_stats to rtnl_link_stats64. They have the same
5644 * fields in the same order, with only the type differing.
5645 */
77a1abf5
ED
5646void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5647 const struct net_device_stats *netdev_stats)
3cfde79c
BH
5648{
5649#if BITS_PER_LONG == 64
77a1abf5
ED
5650 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5651 memcpy(stats64, netdev_stats, sizeof(*stats64));
3cfde79c
BH
5652#else
5653 size_t i, n = sizeof(*stats64) / sizeof(u64);
5654 const unsigned long *src = (const unsigned long *)netdev_stats;
5655 u64 *dst = (u64 *)stats64;
5656
5657 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5658 sizeof(*stats64) / sizeof(u64));
5659 for (i = 0; i < n; i++)
5660 dst[i] = src[i];
5661#endif
5662}
77a1abf5 5663EXPORT_SYMBOL(netdev_stats_to_stats64);
3cfde79c 5664
eeda3fd6
SH
5665/**
5666 * dev_get_stats - get network device statistics
5667 * @dev: device to get statistics from
28172739 5668 * @storage: place to store stats
eeda3fd6 5669 *
d7753516
BH
5670 * Get network statistics from device. Return @storage.
5671 * The device driver may provide its own method by setting
5672 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5673 * otherwise the internal statistics structure is used.
eeda3fd6 5674 */
d7753516
BH
5675struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5676 struct rtnl_link_stats64 *storage)
7004bf25 5677{
eeda3fd6
SH
5678 const struct net_device_ops *ops = dev->netdev_ops;
5679
28172739
ED
5680 if (ops->ndo_get_stats64) {
5681 memset(storage, 0, sizeof(*storage));
caf586e5
ED
5682 ops->ndo_get_stats64(dev, storage);
5683 } else if (ops->ndo_get_stats) {
3cfde79c 5684 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
caf586e5
ED
5685 } else {
5686 netdev_stats_to_stats64(storage, &dev->stats);
28172739 5687 }
caf586e5 5688 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
28172739 5689 return storage;
c45d286e 5690}
eeda3fd6 5691EXPORT_SYMBOL(dev_get_stats);
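/*
 * Illustrative sketch (hypothetical "foo" driver): a driver-supplied
 * ndo_get_stats64() as consumed by dev_get_stats() above. The counters in
 * foo_priv_example are assumptions; note that dev_get_stats() has already
 * zeroed *storage before this callback runs.
 */
struct foo_priv_example {
	u64 rx_packets;
	u64 tx_packets;
};

static struct rtnl_link_stats64 *
foo_get_stats64_example(struct net_device *dev,
			struct rtnl_link_stats64 *storage)
{
	struct foo_priv_example *priv = netdev_priv(dev);

	storage->rx_packets = priv->rx_packets;
	storage->tx_packets = priv->tx_packets;
	return storage;
}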
c45d286e 5692
24824a09 5693struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
dc2b4847 5694{
24824a09 5695 struct netdev_queue *queue = dev_ingress_queue(dev);
dc2b4847 5696
24824a09
ED
5697#ifdef CONFIG_NET_CLS_ACT
5698 if (queue)
5699 return queue;
5700 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5701 if (!queue)
5702 return NULL;
5703 netdev_init_one_queue(dev, queue, NULL);
24824a09
ED
5704 queue->qdisc = &noop_qdisc;
5705 queue->qdisc_sleeping = &noop_qdisc;
5706 rcu_assign_pointer(dev->ingress_queue, queue);
5707#endif
5708 return queue;
bb949fbd
DM
5709}
5710
2c60db03
ED
5711static const struct ethtool_ops default_ethtool_ops;
5712
d07d7507
SG
5713void netdev_set_default_ethtool_ops(struct net_device *dev,
5714 const struct ethtool_ops *ops)
5715{
5716 if (dev->ethtool_ops == &default_ethtool_ops)
5717 dev->ethtool_ops = ops;
5718}
5719EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
5720
1da177e4 5721/**
36909ea4 5722 * alloc_netdev_mqs - allocate network device
1da177e4
LT
5723 * @sizeof_priv: size of private data to allocate space for
5724 * @name: device name format string
5725 * @setup: callback to initialize device
36909ea4
TH
5726 * @txqs: the number of TX subqueues to allocate
5727 * @rxqs: the number of RX subqueues to allocate
1da177e4
LT
5728 *
5729 * Allocates a struct net_device with private data area for driver use
f25f4e44 5730 * and performs basic initialization. Also allocates subqueue structs
36909ea4 5731 * for each queue on the device.
1da177e4 5732 */
36909ea4
TH
5733struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5734 void (*setup)(struct net_device *),
5735 unsigned int txqs, unsigned int rxqs)
1da177e4 5736{
1da177e4 5737 struct net_device *dev;
7943986c 5738 size_t alloc_size;
1ce8e7b5 5739 struct net_device *p;
1da177e4 5740
b6fe17d6
SH
5741 BUG_ON(strlen(name) >= sizeof(dev->name));
5742
36909ea4 5743 if (txqs < 1) {
7b6cd1ce 5744 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
55513fb4
TH
5745 return NULL;
5746 }
5747
36909ea4
TH
5748#ifdef CONFIG_RPS
5749 if (rxqs < 1) {
7b6cd1ce 5750 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
36909ea4
TH
5751 return NULL;
5752 }
5753#endif
5754
fd2ea0a7 5755 alloc_size = sizeof(struct net_device);
d1643d24
AD
5756 if (sizeof_priv) {
5757 /* ensure 32-byte alignment of private area */
1ce8e7b5 5758 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
d1643d24
AD
5759 alloc_size += sizeof_priv;
5760 }
5761 /* ensure 32-byte alignment of whole construct */
1ce8e7b5 5762 alloc_size += NETDEV_ALIGN - 1;
1da177e4 5763
31380de9 5764 p = kzalloc(alloc_size, GFP_KERNEL);
62b5942a 5765 if (!p)
1da177e4 5766 return NULL;
1da177e4 5767
1ce8e7b5 5768 dev = PTR_ALIGN(p, NETDEV_ALIGN);
1da177e4 5769 dev->padded = (char *)dev - (char *)p;
ab9c73cc 5770
29b4433d
ED
5771 dev->pcpu_refcnt = alloc_percpu(int);
5772 if (!dev->pcpu_refcnt)
e6484930 5773 goto free_p;
ab9c73cc 5774
ab9c73cc 5775 if (dev_addr_init(dev))
29b4433d 5776 goto free_pcpu;
ab9c73cc 5777
22bedad3 5778 dev_mc_init(dev);
a748ee24 5779 dev_uc_init(dev);
ccffad25 5780
c346dca1 5781 dev_net_set(dev, &init_net);
1da177e4 5782
8d3bdbd5 5783 dev->gso_max_size = GSO_MAX_SIZE;
30b678d8 5784 dev->gso_max_segs = GSO_MAX_SEGS;
8d3bdbd5 5785
8d3bdbd5
DM
5786 INIT_LIST_HEAD(&dev->napi_list);
5787 INIT_LIST_HEAD(&dev->unreg_list);
5788 INIT_LIST_HEAD(&dev->link_watch_list);
9ff162a8 5789 INIT_LIST_HEAD(&dev->upper_dev_list);
8d3bdbd5
DM
5790 dev->priv_flags = IFF_XMIT_DST_RELEASE;
5791 setup(dev);
5792
36909ea4
TH
5793 dev->num_tx_queues = txqs;
5794 dev->real_num_tx_queues = txqs;
ed9af2e8 5795 if (netif_alloc_netdev_queues(dev))
8d3bdbd5 5796 goto free_all;
e8a0464c 5797
df334545 5798#ifdef CONFIG_RPS
36909ea4
TH
5799 dev->num_rx_queues = rxqs;
5800 dev->real_num_rx_queues = rxqs;
fe822240 5801 if (netif_alloc_rx_queues(dev))
8d3bdbd5 5802 goto free_all;
df334545 5803#endif
0a9627f2 5804
1da177e4 5805 strcpy(dev->name, name);
cbda10fa 5806 dev->group = INIT_NETDEV_GROUP;
2c60db03
ED
5807 if (!dev->ethtool_ops)
5808 dev->ethtool_ops = &default_ethtool_ops;
1da177e4 5809 return dev;
ab9c73cc 5810
8d3bdbd5
DM
5811free_all:
5812 free_netdev(dev);
5813 return NULL;
5814
29b4433d
ED
5815free_pcpu:
5816 free_percpu(dev->pcpu_refcnt);
60877a32 5817 netif_free_tx_queues(dev);
fe822240
TH
5818#ifdef CONFIG_RPS
5819 kfree(dev->_rx);
5820#endif
5821
ab9c73cc
JP
5822free_p:
5823 kfree(p);
5824 return NULL;
1da177e4 5825}
36909ea4 5826EXPORT_SYMBOL(alloc_netdev_mqs);
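/*
 * Illustrative sketch (hypothetical): allocating an Ethernet-style device
 * with 8 TX and 8 RX queues and no private area. ether_setup() is the
 * usual setup callback for Ethernet devices, and a "foo%d" name lets the
 * core pick a free instance number at register time.
 */
static struct net_device *foo_alloc_example(void)
{
	return alloc_netdev_mqs(0, "foo%d", ether_setup, 8, 8);
}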
1da177e4
LT
5827
5828/**
5829 * free_netdev - free network device
5830 * @dev: device
5831 *
4ec93edb
YH
5832 * This function does the last stage of destroying an allocated device
5833 * interface. The reference to the device object is released.
1da177e4
LT
5834 * If this is the last reference then it will be freed.
5835 */
5836void free_netdev(struct net_device *dev)
5837{
d565b0a1
HX
5838 struct napi_struct *p, *n;
5839
f3005d7f
DL
5840 release_net(dev_net(dev));
5841
60877a32 5842 netif_free_tx_queues(dev);
fe822240
TH
5843#ifdef CONFIG_RPS
5844 kfree(dev->_rx);
5845#endif
e8a0464c 5846
33d480ce 5847 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
24824a09 5848
f001fde5
JP
5849 /* Flush device addresses */
5850 dev_addr_flush(dev);
5851
d565b0a1
HX
5852 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5853 netif_napi_del(p);
5854
29b4433d
ED
5855 free_percpu(dev->pcpu_refcnt);
5856 dev->pcpu_refcnt = NULL;
5857
3041a069 5858 /* Compatibility with error handling in drivers */
1da177e4
LT
5859 if (dev->reg_state == NETREG_UNINITIALIZED) {
5860 kfree((char *)dev - dev->padded);
5861 return;
5862 }
5863
5864 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5865 dev->reg_state = NETREG_RELEASED;
5866
43cb76d9
GKH
5867 /* will free via device release */
5868 put_device(&dev->dev);
1da177e4 5869}
d1b19dff 5870EXPORT_SYMBOL(free_netdev);
4ec93edb 5871
f0db275a
SH
5872/**
5873 * synchronize_net - Synchronize with packet receive processing
5874 *
5875 * Wait for packets currently being received to be done.
5876 * Does not block later packets from starting.
5877 */
4ec93edb 5878void synchronize_net(void)
1da177e4
LT
5879{
5880 might_sleep();
be3fc413
ED
5881 if (rtnl_is_locked())
5882 synchronize_rcu_expedited();
5883 else
5884 synchronize_rcu();
1da177e4 5885}
d1b19dff 5886EXPORT_SYMBOL(synchronize_net);
1da177e4
LT
5887
5888/**
44a0873d 5889 * unregister_netdevice_queue - remove device from the kernel
1da177e4 5890 * @dev: device
44a0873d 5891 * @head: list
6ebfbc06 5892 *
1da177e4 5893 * This function shuts down a device interface and removes it
d59b54b1 5894 * from the kernel tables.
44a0873d 5895 * If @head is not NULL, the device is queued to be unregistered later.
1da177e4
LT
5896 *
5897 * Callers must hold the rtnl semaphore. You may want
5898 * unregister_netdev() instead of this.
5899 */
5900
44a0873d 5901void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
1da177e4 5902{
a6620712
HX
5903 ASSERT_RTNL();
5904
44a0873d 5905 if (head) {
9fdce099 5906 list_move_tail(&dev->unreg_list, head);
44a0873d
ED
5907 } else {
5908 rollback_registered(dev);
5909 /* Finish processing unregister after unlock */
5910 net_set_todo(dev);
5911 }
1da177e4 5912}
44a0873d 5913EXPORT_SYMBOL(unregister_netdevice_queue);
1da177e4 5914
9b5e383c
ED
5915/**
5916 * unregister_netdevice_many - unregister many devices
5917 * @head: list of devices
9b5e383c
ED
5918 */
5919void unregister_netdevice_many(struct list_head *head)
5920{
5921 struct net_device *dev;
5922
5923 if (!list_empty(head)) {
5924 rollback_registered_many(head);
5925 list_for_each_entry(dev, head, unreg_list)
5926 net_set_todo(dev);
5927 }
5928}
63c8099d 5929EXPORT_SYMBOL(unregister_netdevice_many);
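/*
 * Illustrative sketch (hypothetical): tearing down several devices in one
 * RTNL critical section by queueing each with unregister_netdevice_queue()
 * and flushing the list with unregister_netdevice_many().
 */
static void foo_unregister_batch_example(struct net_device *devs[], int n)
{
	LIST_HEAD(kill_list);
	int i;

	rtnl_lock();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}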
9b5e383c 5930
1da177e4
LT
5931/**
5932 * unregister_netdev - remove device from the kernel
5933 * @dev: device
5934 *
5935 * This function shuts down a device interface and removes it
d59b54b1 5936 * from the kernel tables.
1da177e4
LT
5937 *
5938 * This is just a wrapper for unregister_netdevice that takes
5939 * the rtnl semaphore. In general you want to use this and not
5940 * unregister_netdevice.
5941 */
5942void unregister_netdev(struct net_device *dev)
5943{
5944 rtnl_lock();
5945 unregister_netdevice(dev);
5946 rtnl_unlock();
5947}
1da177e4
LT
5948EXPORT_SYMBOL(unregister_netdev);
5949
ce286d32
EB
5950/**
5951 * dev_change_net_namespace - move device to a different network namespace
5952 * @dev: device
5953 * @net: network namespace
5954 * @pat: If not NULL name pattern to try if the current device name
5955 * is already taken in the destination network namespace.
5956 *
5957 * This function shuts down a device interface and moves it
5958 * to a new network namespace. On success 0 is returned, on
5959 * a failure a negative errno code is returned.
5960 *
5961 * Callers must hold the rtnl semaphore.
5962 */
5963
5964int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5965{
ce286d32
EB
5966 int err;
5967
5968 ASSERT_RTNL();
5969
5970 /* Don't allow namespace local devices to be moved. */
5971 err = -EINVAL;
5972 if (dev->features & NETIF_F_NETNS_LOCAL)
5973 goto out;
5974
5975 /* Ensure the device has been registered */
ce286d32
EB
5976 if (dev->reg_state != NETREG_REGISTERED)
5977 goto out;
5978
5979 /* Get out if there is nothing to do */
5980 err = 0;
878628fb 5981 if (net_eq(dev_net(dev), net))
ce286d32
EB
5982 goto out;
5983
5984 /* Pick the destination device name, and ensure
5985 * we can use it in the destination network namespace.
5986 */
5987 err = -EEXIST;
d9031024 5988 if (__dev_get_by_name(net, dev->name)) {
ce286d32
EB
5989 /* We get here if we can't use the current device name */
5990 if (!pat)
5991 goto out;
828de4f6 5992 if (dev_get_valid_name(net, dev, pat) < 0)
ce286d32
EB
5993 goto out;
5994 }
5995
5996 /*
5997 * And now a mini version of register_netdevice and unregister_netdevice.
5998 */
5999
6000 /* If the device is running, close it first. */
9b772652 6001 dev_close(dev);
ce286d32
EB
6002
6003 /* And unlink it from device chain */
6004 err = -ENODEV;
6005 unlist_netdevice(dev);
6006
6007 synchronize_net();
6008
6009 /* Shutdown queueing discipline. */
6010 dev_shutdown(dev);
6011
6012 /* Notify protocols that we are about to destroy
6013 this device. They should clean all the things.
3b27e105
DL
6014
6015 Note that dev->reg_state stays at NETREG_REGISTERED.
6016 This is wanted because this way 8021q and macvlan know
6017 the device is just moving and can keep their slaves up.
ce286d32
EB
6018 */
6019 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6549dd43
G
6020 rcu_barrier();
6021 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
d2237d35 6022 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
ce286d32
EB
6023
6024 /*
6025 * Flush the unicast and multicast chains
6026 */
a748ee24 6027 dev_uc_flush(dev);
22bedad3 6028 dev_mc_flush(dev);
ce286d32 6029
4e66ae2e
SH
6030 /* Send a netdev-removed uevent to the old namespace */
6031 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6032
ce286d32 6033 /* Actually switch the network namespace */
c346dca1 6034 dev_net_set(dev, net);
ce286d32 6035
ce286d32
EB
6036 /* If there is an ifindex conflict assign a new one */
6037 if (__dev_get_by_index(net, dev->ifindex)) {
6038 int iflink = (dev->iflink == dev->ifindex);
6039 dev->ifindex = dev_new_index(net);
6040 if (iflink)
6041 dev->iflink = dev->ifindex;
6042 }
6043
4e66ae2e
SH
6044 /* Send a netdev-add uevent to the new namespace */
6045 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6046
8b41d188 6047 /* Fixup kobjects */
a1b3f594 6048 err = device_rename(&dev->dev, dev->name);
8b41d188 6049 WARN_ON(err);
ce286d32
EB
6050
6051 /* Add the device back in the hashes */
6052 list_netdevice(dev);
6053
6054 /* Notify protocols, that a new device appeared. */
6055 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6056
d90a909e
EB
6057 /*
6058 * Prevent userspace races by waiting until the network
6059 * device is fully setup before sending notifications.
6060 */
6061 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6062
ce286d32
EB
6063 synchronize_net();
6064 err = 0;
6065out:
6066 return err;
6067}
463d0183 6068EXPORT_SYMBOL_GPL(dev_change_net_namespace);
ce286d32 6069
1da177e4
LT
6070static int dev_cpu_callback(struct notifier_block *nfb,
6071 unsigned long action,
6072 void *ocpu)
6073{
6074 struct sk_buff **list_skb;
1da177e4
LT
6075 struct sk_buff *skb;
6076 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6077 struct softnet_data *sd, *oldsd;
6078
8bb78442 6079 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1da177e4
LT
6080 return NOTIFY_OK;
6081
6082 local_irq_disable();
6083 cpu = smp_processor_id();
6084 sd = &per_cpu(softnet_data, cpu);
6085 oldsd = &per_cpu(softnet_data, oldcpu);
6086
6087 /* Find end of our completion_queue. */
6088 list_skb = &sd->completion_queue;
6089 while (*list_skb)
6090 list_skb = &(*list_skb)->next;
6091 /* Append completion queue from offline CPU. */
6092 *list_skb = oldsd->completion_queue;
6093 oldsd->completion_queue = NULL;
6094
1da177e4 6095 /* Append output queue from offline CPU. */
a9cbd588
CG
6096 if (oldsd->output_queue) {
6097 *sd->output_queue_tailp = oldsd->output_queue;
6098 sd->output_queue_tailp = oldsd->output_queue_tailp;
6099 oldsd->output_queue = NULL;
6100 oldsd->output_queue_tailp = &oldsd->output_queue;
6101 }
264524d5
HC
6102 /* Append NAPI poll list from offline CPU. */
6103 if (!list_empty(&oldsd->poll_list)) {
6104 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6105 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6106 }
1da177e4
LT
6107
6108 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6109 local_irq_enable();
6110
6111 /* Process offline CPU's input_pkt_queue */
76cc8b13 6112 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
1da177e4 6113 netif_rx(skb);
76cc8b13 6114 input_queue_head_incr(oldsd);
fec5e652 6115 }
76cc8b13 6116 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6e7676c1 6117 netif_rx(skb);
76cc8b13
TH
6118 input_queue_head_incr(oldsd);
6119 }
1da177e4
LT
6120
6121 return NOTIFY_OK;
6122}
1da177e4
LT
6123
6124
7f353bf2 6125/**
b63365a2
HX
6126 * netdev_increment_features - increment feature set by one
6127 * @all: current feature set
6128 * @one: new feature set
6129 * @mask: mask feature set
7f353bf2
HX
6130 *
6131 * Computes a new feature set after adding a device with feature set
b63365a2
HX
6132 * @one to the master device with current feature set @all. Will not
6133 * enable anything that is off in @mask. Returns the new feature set.
7f353bf2 6134 */
c8f44aff
MM
6135netdev_features_t netdev_increment_features(netdev_features_t all,
6136 netdev_features_t one, netdev_features_t mask)
b63365a2 6137{
1742f183
MM
6138 if (mask & NETIF_F_GEN_CSUM)
6139 mask |= NETIF_F_ALL_CSUM;
6140 mask |= NETIF_F_VLAN_CHALLENGED;
7f353bf2 6141
1742f183
MM
6142 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6143 all &= one | ~NETIF_F_ALL_FOR_ALL;
c6e1a0d1 6144
1742f183
MM
6145 /* If one device supports hw checksumming, set for all. */
6146 if (all & NETIF_F_GEN_CSUM)
6147 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7f353bf2
HX
6148
6149 return all;
6150}
b63365a2 6151EXPORT_SYMBOL(netdev_increment_features);
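/*
 * Illustrative sketch (hypothetical master/slave recompute): folding the
 * feature sets of several lower devices into one with the helper above,
 * in the style of bonding/bridging feature recomputation. Starting from
 * an all-ones set and the choice of mask are assumptions for illustration.
 */
static netdev_features_t
foo_compute_features_example(struct net_device *slaves[], int n,
			     netdev_features_t mask)
{
	netdev_features_t all = ~(netdev_features_t)0;
	int i;

	for (i = 0; i < n; i++)
		all = netdev_increment_features(all, slaves[i]->features, mask);
	return all;
}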
7f353bf2 6152
430f03cd 6153static struct hlist_head * __net_init netdev_create_hash(void)
30d97d35
PE
6154{
6155 int i;
6156 struct hlist_head *hash;
6157
6158 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6159 if (hash != NULL)
6160 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6161 INIT_HLIST_HEAD(&hash[i]);
6162
6163 return hash;
6164}
6165
881d966b 6166/* Initialize per network namespace state */
4665079c 6167static int __net_init netdev_init(struct net *net)
881d966b 6168{
734b6541
RM
6169 if (net != &init_net)
6170 INIT_LIST_HEAD(&net->dev_base_head);
881d966b 6171
30d97d35
PE
6172 net->dev_name_head = netdev_create_hash();
6173 if (net->dev_name_head == NULL)
6174 goto err_name;
881d966b 6175
30d97d35
PE
6176 net->dev_index_head = netdev_create_hash();
6177 if (net->dev_index_head == NULL)
6178 goto err_idx;
881d966b
EB
6179
6180 return 0;
30d97d35
PE
6181
6182err_idx:
6183 kfree(net->dev_name_head);
6184err_name:
6185 return -ENOMEM;
881d966b
EB
6186}
6187
f0db275a
SH
6188/**
6189 * netdev_drivername - network driver for the device
6190 * @dev: network device
f0db275a
SH
6191 *
6192 * Determine network driver for device.
6193 */
3019de12 6194const char *netdev_drivername(const struct net_device *dev)
6579e57b 6195{
cf04a4c7
SH
6196 const struct device_driver *driver;
6197 const struct device *parent;
3019de12 6198 const char *empty = "";
6579e57b
AV
6199
6200 parent = dev->dev.parent;
6579e57b 6201 if (!parent)
3019de12 6202 return empty;
6579e57b
AV
6203
6204 driver = parent->driver;
6205 if (driver && driver->name)
3019de12
DM
6206 return driver->name;
6207 return empty;
6579e57b
AV
6208}
6209
b004ff49 6210static int __netdev_printk(const char *level, const struct net_device *dev,
256df2f3
JP
6211 struct va_format *vaf)
6212{
6213 int r;
6214
b004ff49 6215 if (dev && dev->dev.parent) {
666f355f
JP
6216 r = dev_printk_emit(level[1] - '0',
6217 dev->dev.parent,
6218 "%s %s %s: %pV",
6219 dev_driver_string(dev->dev.parent),
6220 dev_name(dev->dev.parent),
6221 netdev_name(dev), vaf);
b004ff49 6222 } else if (dev) {
256df2f3 6223 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
b004ff49 6224 } else {
256df2f3 6225 r = printk("%s(NULL net_device): %pV", level, vaf);
b004ff49 6226 }
256df2f3
JP
6227
6228 return r;
6229}
6230
6231int netdev_printk(const char *level, const struct net_device *dev,
6232 const char *format, ...)
6233{
6234 struct va_format vaf;
6235 va_list args;
6236 int r;
6237
6238 va_start(args, format);
6239
6240 vaf.fmt = format;
6241 vaf.va = &args;
6242
6243 r = __netdev_printk(level, dev, &vaf);
b004ff49 6244
256df2f3
JP
6245 va_end(args);
6246
6247 return r;
6248}
6249EXPORT_SYMBOL(netdev_printk);
6250
6251#define define_netdev_printk_level(func, level) \
6252int func(const struct net_device *dev, const char *fmt, ...) \
6253{ \
6254 int r; \
6255 struct va_format vaf; \
6256 va_list args; \
6257 \
6258 va_start(args, fmt); \
6259 \
6260 vaf.fmt = fmt; \
6261 vaf.va = &args; \
6262 \
6263 r = __netdev_printk(level, dev, &vaf); \
b004ff49 6264 \
256df2f3
JP
6265 va_end(args); \
6266 \
6267 return r; \
6268} \
6269EXPORT_SYMBOL(func);
6270
6271define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6272define_netdev_printk_level(netdev_alert, KERN_ALERT);
6273define_netdev_printk_level(netdev_crit, KERN_CRIT);
6274define_netdev_printk_level(netdev_err, KERN_ERR);
6275define_netdev_printk_level(netdev_warn, KERN_WARNING);
6276define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6277define_netdev_printk_level(netdev_info, KERN_INFO);
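/*
 * Illustrative sketch: the level helpers generated above are used like
 * dev_err()/dev_info(), but tagged with the network device. The "foo"
 * function and messages are made up for illustration.
 */
static void foo_report_link_example(struct net_device *dev, bool up)
{
	if (up)
		netdev_info(dev, "link is up\n");
	else
		netdev_warn(dev, "link is down\n");
}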
6278
4665079c 6279static void __net_exit netdev_exit(struct net *net)
881d966b
EB
6280{
6281 kfree(net->dev_name_head);
6282 kfree(net->dev_index_head);
6283}
6284
022cbae6 6285static struct pernet_operations __net_initdata netdev_net_ops = {
881d966b
EB
6286 .init = netdev_init,
6287 .exit = netdev_exit,
6288};
6289
4665079c 6290static void __net_exit default_device_exit(struct net *net)
ce286d32 6291{
e008b5fc 6292 struct net_device *dev, *aux;
ce286d32 6293 /*
e008b5fc 6294 * Push all migratable network devices back to the
ce286d32
EB
6295 * initial network namespace
6296 */
6297 rtnl_lock();
e008b5fc 6298 for_each_netdev_safe(net, dev, aux) {
ce286d32 6299 int err;
aca51397 6300 char fb_name[IFNAMSIZ];
ce286d32
EB
6301
6302 /* Ignore unmoveable devices (i.e. loopback) */
6303 if (dev->features & NETIF_F_NETNS_LOCAL)
6304 continue;
6305
e008b5fc
EB
6306 /* Leave virtual devices for the generic cleanup */
6307 if (dev->rtnl_link_ops)
6308 continue;
d0c082ce 6309
25985edc 6310 /* Push remaining network devices to init_net */
aca51397
PE
6311 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6312 err = dev_change_net_namespace(dev, &init_net, fb_name);
ce286d32 6313 if (err) {
7b6cd1ce
JP
6314 pr_emerg("%s: failed to move %s to init_net: %d\n",
6315 __func__, dev->name, err);
aca51397 6316 BUG();
ce286d32
EB
6317 }
6318 }
6319 rtnl_unlock();
6320}
6321
04dc7f6b
EB
6322static void __net_exit default_device_exit_batch(struct list_head *net_list)
6323{
6324 /* At exit all network devices must be removed from a network
b595076a 6325 * namespace. Do this in the reverse order of registration.
04dc7f6b
EB
6326 * Do this across as many network namespaces as possible to
6327 * improve batching efficiency.
6328 */
6329 struct net_device *dev;
6330 struct net *net;
6331 LIST_HEAD(dev_kill_list);
6332
6333 rtnl_lock();
6334 list_for_each_entry(net, net_list, exit_list) {
6335 for_each_netdev_reverse(net, dev) {
6336 if (dev->rtnl_link_ops)
6337 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6338 else
6339 unregister_netdevice_queue(dev, &dev_kill_list);
6340 }
6341 }
6342 unregister_netdevice_many(&dev_kill_list);
ceaaec98 6343 list_del(&dev_kill_list);
04dc7f6b
EB
6344 rtnl_unlock();
6345}
6346
022cbae6 6347static struct pernet_operations __net_initdata default_device_ops = {
ce286d32 6348 .exit = default_device_exit,
04dc7f6b 6349 .exit_batch = default_device_exit_batch,
ce286d32
EB
6350};
6351
1da177e4
LT
6352/*
6353 * Initialize the DEV module. At boot time this walks the device list and
6354 * unhooks any devices that fail to initialise (normally hardware not
6355 * present) and leaves us with a valid list of present and active devices.
6356 *
6357 */
6358
6359/*
6360 * This is called single threaded during boot, so no need
6361 * to take the rtnl semaphore.
6362 */
6363static int __init net_dev_init(void)
6364{
6365 int i, rc = -ENOMEM;
6366
6367 BUG_ON(!dev_boot_phase);
6368
1da177e4
LT
6369 if (dev_proc_init())
6370 goto out;
6371
8b41d188 6372 if (netdev_kobject_init())
1da177e4
LT
6373 goto out;
6374
6375 INIT_LIST_HEAD(&ptype_all);
82d8a867 6376 for (i = 0; i < PTYPE_HASH_SIZE; i++)
1da177e4
LT
6377 INIT_LIST_HEAD(&ptype_base[i]);
6378
62532da9
VY
6379 INIT_LIST_HEAD(&offload_base);
6380
881d966b
EB
6381 if (register_pernet_subsys(&netdev_net_ops))
6382 goto out;
1da177e4
LT
6383
6384 /*
6385 * Initialise the packet receive queues.
6386 */
6387
6f912042 6388 for_each_possible_cpu(i) {
e36fa2f7 6389 struct softnet_data *sd = &per_cpu(softnet_data, i);
1da177e4 6390
dee42870 6391 memset(sd, 0, sizeof(*sd));
e36fa2f7 6392 skb_queue_head_init(&sd->input_pkt_queue);
6e7676c1 6393 skb_queue_head_init(&sd->process_queue);
e36fa2f7
ED
6394 sd->completion_queue = NULL;
6395 INIT_LIST_HEAD(&sd->poll_list);
a9cbd588
CG
6396 sd->output_queue = NULL;
6397 sd->output_queue_tailp = &sd->output_queue;
df334545 6398#ifdef CONFIG_RPS
e36fa2f7
ED
6399 sd->csd.func = rps_trigger_softirq;
6400 sd->csd.info = sd;
6401 sd->csd.flags = 0;
6402 sd->cpu = i;
1e94d72f 6403#endif
0a9627f2 6404
e36fa2f7
ED
6405 sd->backlog.poll = process_backlog;
6406 sd->backlog.weight = weight_p;
6407 sd->backlog.gro_list = NULL;
6408 sd->backlog.gro_count = 0;
99bbc707
WB
6409
6410#ifdef CONFIG_NET_FLOW_LIMIT
6411 sd->flow_limit = NULL;
6412#endif
1da177e4
LT
6413 }
6414
1da177e4
LT
6415 dev_boot_phase = 0;
6416
505d4f73
EB
6417 /* The loopback device is special: if any other network device
6418 * is present in a network namespace, the loopback device must
6419 * be present. Since we now dynamically allocate and free the
6420 * loopback device, ensure this invariant is maintained by
6421 * keeping the loopback device as the first device on the
6422 * list of network devices, ensuring the loopback device
6423 * is the first device that appears and the last network device
6424 * that disappears.
6425 */
6426 if (register_pernet_device(&loopback_net_ops))
6427 goto out;
6428
6429 if (register_pernet_device(&default_device_ops))
6430 goto out;
6431
962cf36c
CM
6432 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6433 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
1da177e4
LT
6434
6435 hotcpu_notifier(dev_cpu_callback, 0);
6436 dst_init();
1da177e4
LT
6437 rc = 0;
6438out:
6439 return rc;
6440}
6441
6442subsys_initcall(net_dev_init);