1 /*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/pkt_sched.h>
103 #include <net/checksum.h>
104 #include <net/xfrm.h>
105 #include <linux/highmem.h>
106 #include <linux/init.h>
107 #include <linux/module.h>
108 #include <linux/netpoll.h>
109 #include <linux/rcupdate.h>
110 #include <linux/delay.h>
111 #include <net/iw_handler.h>
112 #include <asm/current.h>
113 #include <linux/audit.h>
114 #include <linux/dmaengine.h>
115 #include <linux/err.h>
116 #include <linux/ctype.h>
117 #include <linux/if_arp.h>
118 #include <linux/if_vlan.h>
119 #include <linux/ip.h>
120 #include <net/ip.h>
121 #include <net/mpls.h>
122 #include <linux/ipv6.h>
123 #include <linux/in.h>
124 #include <linux/jhash.h>
125 #include <linux/random.h>
126 #include <trace/events/napi.h>
127 #include <trace/events/net.h>
128 #include <trace/events/skb.h>
129 #include <linux/pci.h>
130 #include <linux/inetdevice.h>
131 #include <linux/cpu_rmap.h>
132 #include <linux/static_key.h>
133 #include <linux/hashtable.h>
134 #include <linux/vmalloc.h>
135 #include <linux/if_macvlan.h>
136 #include <linux/errqueue.h>
137
138 #include "net-sysfs.h"
139
140 /* Instead of increasing this, you should create a hash table. */
141 #define MAX_GRO_SKBS 8
142
143 /* This should be increased if a protocol with a bigger head is added. */
144 #define GRO_MAX_HEAD (MAX_HEADER + 128)
145
146 static DEFINE_SPINLOCK(ptype_lock);
147 static DEFINE_SPINLOCK(offload_lock);
148 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
149 struct list_head ptype_all __read_mostly; /* Taps */
150 static struct list_head offload_base __read_mostly;
151
152 static int netif_rx_internal(struct sk_buff *skb);
153 static int call_netdevice_notifiers_info(unsigned long val,
154 struct net_device *dev,
155 struct netdev_notifier_info *info);
156
157 /*
158 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
159 * semaphore.
160 *
161 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
162 *
163 * Writers must hold the rtnl semaphore while they loop through the
164 * dev_base_head list, and hold dev_base_lock for writing when they do the
165 * actual updates. This allows pure readers to access the list even
166 * while a writer is preparing to update it.
167 *
168 * To put it another way, dev_base_lock is held for writing only to
169 * protect against pure readers; the rtnl semaphore provides the
170 * protection against other writers.
171 *
172 * See, for example usages, register_netdevice() and
173 * unregister_netdevice(), which must be called with the rtnl
174 * semaphore held.
175 */
176 DEFINE_RWLOCK(dev_base_lock);
177 EXPORT_SYMBOL(dev_base_lock);
178
179 /* protects napi_hash addition/deletion and napi_gen_id */
180 static DEFINE_SPINLOCK(napi_hash_lock);
181
182 static unsigned int napi_gen_id;
183 static DEFINE_HASHTABLE(napi_hash, 8);
184
185 static seqcount_t devnet_rename_seq;
186
187 static inline void dev_base_seq_inc(struct net *net)
188 {
189 while (++net->dev_base_seq == 0);
190 }
191
192 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
193 {
194 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
195
196 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
197 }
198
199 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
200 {
201 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
202 }
203
204 static inline void rps_lock(struct softnet_data *sd)
205 {
206 #ifdef CONFIG_RPS
207 spin_lock(&sd->input_pkt_queue.lock);
208 #endif
209 }
210
211 static inline void rps_unlock(struct softnet_data *sd)
212 {
213 #ifdef CONFIG_RPS
214 spin_unlock(&sd->input_pkt_queue.lock);
215 #endif
216 }
217
218 /* Device list insertion */
219 static void list_netdevice(struct net_device *dev)
220 {
221 struct net *net = dev_net(dev);
222
223 ASSERT_RTNL();
224
225 write_lock_bh(&dev_base_lock);
226 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
227 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
228 hlist_add_head_rcu(&dev->index_hlist,
229 dev_index_hash(net, dev->ifindex));
230 write_unlock_bh(&dev_base_lock);
231
232 dev_base_seq_inc(net);
233 }
234
235 /* Device list removal
236 * caller must respect a RCU grace period before freeing/reusing dev
237 */
238 static void unlist_netdevice(struct net_device *dev)
239 {
240 ASSERT_RTNL();
241
242 /* Unlink dev from the device chain */
243 write_lock_bh(&dev_base_lock);
244 list_del_rcu(&dev->dev_list);
245 hlist_del_rcu(&dev->name_hlist);
246 hlist_del_rcu(&dev->index_hlist);
247 write_unlock_bh(&dev_base_lock);
248
249 dev_base_seq_inc(dev_net(dev));
250 }
251
252 /*
253 * Our notifier list
254 */
255
256 static RAW_NOTIFIER_HEAD(netdev_chain);
257
258 /*
259 * Device drivers call our routines to queue packets here. We empty the
260 * queue in the local softnet handler.
261 */
262
263 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
264 EXPORT_PER_CPU_SYMBOL(softnet_data);
265
266 #ifdef CONFIG_LOCKDEP
267 /*
268 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
269 * according to dev->type
270 */
271 static const unsigned short netdev_lock_type[] =
272 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
273 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
274 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
275 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
276 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
277 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
278 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
279 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
280 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
281 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
282 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
283 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
284 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
285 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
286 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
287
288 static const char *const netdev_lock_name[] =
289 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
290 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
291 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
292 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
293 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
294 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
295 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
296 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
297 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
298 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
299 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
300 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
301 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
302 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
303 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
304
305 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
306 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
307
308 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
309 {
310 int i;
311
312 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
313 if (netdev_lock_type[i] == dev_type)
314 return i;
315 /* the last key is used by default */
316 return ARRAY_SIZE(netdev_lock_type) - 1;
317 }
318
319 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
320 unsigned short dev_type)
321 {
322 int i;
323
324 i = netdev_lock_pos(dev_type);
325 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
326 netdev_lock_name[i]);
327 }
328
329 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
330 {
331 int i;
332
333 i = netdev_lock_pos(dev->type);
334 lockdep_set_class_and_name(&dev->addr_list_lock,
335 &netdev_addr_lock_key[i],
336 netdev_lock_name[i]);
337 }
338 #else
339 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
340 unsigned short dev_type)
341 {
342 }
343 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
344 {
345 }
346 #endif
347
348 /*******************************************************************************
349
350 Protocol management and registration routines
351
352 *******************************************************************************/
353
354 /*
355 * Add a protocol ID to the list. Now that the input handler is
356 * smarter we can dispense with all the messy stuff that used to be
357 * here.
358 *
359 * BEWARE!!! Protocol handlers, mangling input packets,
360 * MUST BE last in hash buckets and checking protocol handlers
361 * MUST start from promiscuous ptype_all chain in net_bh.
362 * It is true now, do not change it.
363 * Explanation follows: if protocol handler, mangling packet, will
364 * be the first on list, it is not able to sense, that packet
365 * is cloned and should be copied-on-write, so that it will
366 * change it and subsequent readers will get broken packet.
367 * --ANK (980803)
368 */
369
370 static inline struct list_head *ptype_head(const struct packet_type *pt)
371 {
372 if (pt->type == htons(ETH_P_ALL))
373 return &ptype_all;
374 else
375 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
376 }
377
378 /**
379 * dev_add_pack - add packet handler
380 * @pt: packet type declaration
381 *
382 * Add a protocol handler to the networking stack. The passed &packet_type
383 * is linked into kernel lists and may not be freed until it has been
384 * removed from the kernel lists.
385 *
386 * This call does not sleep, therefore it cannot
387 * guarantee that all CPUs that are in the middle of receiving packets
388 * will see the new packet type (until the next received packet).
389 */
390
391 void dev_add_pack(struct packet_type *pt)
392 {
393 struct list_head *head = ptype_head(pt);
394
395 spin_lock(&ptype_lock);
396 list_add_rcu(&pt->list, head);
397 spin_unlock(&ptype_lock);
398 }
399 EXPORT_SYMBOL(dev_add_pack);
400
401 /**
402 * __dev_remove_pack - remove packet handler
403 * @pt: packet type declaration
404 *
405 * Remove a protocol handler that was previously added to the kernel
406 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
407 * from the kernel lists and can be freed or reused once this function
408 * returns.
409 *
410 * The packet type might still be in use by receivers
411 * and must not be freed until after all the CPUs have gone
412 * through a quiescent state.
413 */
414 void __dev_remove_pack(struct packet_type *pt)
415 {
416 struct list_head *head = ptype_head(pt);
417 struct packet_type *pt1;
418
419 spin_lock(&ptype_lock);
420
421 list_for_each_entry(pt1, head, list) {
422 if (pt == pt1) {
423 list_del_rcu(&pt->list);
424 goto out;
425 }
426 }
427
428 pr_warn("dev_remove_pack: %p not found\n", pt);
429 out:
430 spin_unlock(&ptype_lock);
431 }
432 EXPORT_SYMBOL(__dev_remove_pack);
433
434 /**
435 * dev_remove_pack - remove packet handler
436 * @pt: packet type declaration
437 *
438 * Remove a protocol handler that was previously added to the kernel
439 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
440 * from the kernel lists and can be freed or reused once this function
441 * returns.
442 *
443 * This call sleeps to guarantee that no CPU is looking at the packet
444 * type after return.
445 */
446 void dev_remove_pack(struct packet_type *pt)
447 {
448 __dev_remove_pack(pt);
449
450 synchronize_net();
451 }
452 EXPORT_SYMBOL(dev_remove_pack);
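
/* Illustrative sketch of a minimal tap built on dev_add_pack() and
 * dev_remove_pack(); the names my_tap_rcv and my_tap are hypothetical and
 * shown only to make the API usage concrete.
 *
 *	static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *			      struct packet_type *pt,
 *			      struct net_device *orig_dev)
 *	{
 *		// look at the packet here; the tap does not own the data
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_tap __read_mostly = {
 *		.type = htons(ETH_P_ALL),	// receive every protocol
 *		.func = my_tap_rcv,
 *	};
 *
 *	dev_add_pack(&my_tap);
 *	...
 *	dev_remove_pack(&my_tap);	// sleeps until no CPU still uses my_tap
 */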
453
454
455 /**
456 * dev_add_offload - register offload handlers
457 * @po: protocol offload declaration
458 *
459 * Add protocol offload handlers to the networking stack. The passed
460 * &proto_offload is linked into kernel lists and may not be freed until
461 * it has been removed from the kernel lists.
462 *
463 * This call does not sleep, therefore it cannot
464 * guarantee that all CPUs that are in the middle of receiving packets
465 * will see the new offload handlers (until the next received packet).
466 */
467 void dev_add_offload(struct packet_offload *po)
468 {
469 struct list_head *head = &offload_base;
470
471 spin_lock(&offload_lock);
472 list_add_rcu(&po->list, head);
473 spin_unlock(&offload_lock);
474 }
475 EXPORT_SYMBOL(dev_add_offload);
476
477 /**
478 * __dev_remove_offload - remove offload handler
479 * @po: packet offload declaration
480 *
481 * Remove a protocol offload handler that was previously added to the
482 * kernel offload handlers by dev_add_offload(). The passed &offload_type
483 * is removed from the kernel lists and can be freed or reused once this
484 * function returns.
485 *
486 * The packet type might still be in use by receivers
487 * and must not be freed until after all the CPUs have gone
488 * through a quiescent state.
489 */
490 static void __dev_remove_offload(struct packet_offload *po)
491 {
492 struct list_head *head = &offload_base;
493 struct packet_offload *po1;
494
495 spin_lock(&offload_lock);
496
497 list_for_each_entry(po1, head, list) {
498 if (po == po1) {
499 list_del_rcu(&po->list);
500 goto out;
501 }
502 }
503
504 pr_warn("dev_remove_offload: %p not found\n", po);
505 out:
506 spin_unlock(&offload_lock);
507 }
508
509 /**
510 * dev_remove_offload - remove packet offload handler
511 * @po: packet offload declaration
512 *
513 * Remove a packet offload handler that was previously added to the kernel
514 * offload handlers by dev_add_offload(). The passed &offload_type is
515 * removed from the kernel lists and can be freed or reused once this
516 * function returns.
517 *
518 * This call sleeps to guarantee that no CPU is looking at the packet
519 * type after return.
520 */
521 void dev_remove_offload(struct packet_offload *po)
522 {
523 __dev_remove_offload(po);
524
525 synchronize_net();
526 }
527 EXPORT_SYMBOL(dev_remove_offload);
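
/* Illustrative sketch of registering offload callbacks, in the style of the
 * IPv4 offload in net/ipv4/af_inet.c; the ethertype MY_ETH_P_PROTO and the
 * my_* callbacks are hypothetical placeholders.
 *
 *	static struct packet_offload my_offload __read_mostly = {
 *		.type = htons(MY_ETH_P_PROTO),	// hypothetical ethertype
 *		.callbacks = {
 *			.gso_segment = my_gso_segment,
 *			.gro_receive = my_gro_receive,
 *			.gro_complete = my_gro_complete,
 *		},
 *	};
 *
 *	dev_add_offload(&my_offload);
 *	...
 *	dev_remove_offload(&my_offload);
 */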
528
529 /******************************************************************************
530
531 Device Boot-time Settings Routines
532
533 *******************************************************************************/
534
535 /* Boot time configuration table */
536 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
537
538 /**
539 * netdev_boot_setup_add - add new setup entry
540 * @name: name of the device
541 * @map: configured settings for the device
542 *
543 * Adds new setup entry to the dev_boot_setup list. The function
544 * returns 0 on error and 1 on success. This is a generic routine for
545 * all netdevices.
546 */
547 static int netdev_boot_setup_add(char *name, struct ifmap *map)
548 {
549 struct netdev_boot_setup *s;
550 int i;
551
552 s = dev_boot_setup;
553 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
554 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
555 memset(s[i].name, 0, sizeof(s[i].name));
556 strlcpy(s[i].name, name, IFNAMSIZ);
557 memcpy(&s[i].map, map, sizeof(s[i].map));
558 break;
559 }
560 }
561
562 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
563 }
564
565 /**
566 * netdev_boot_setup_check - check boot time settings
567 * @dev: the netdevice
568 *
569 * Check boot time settings for the device.
570 * The settings found are applied to the device so they can be used
571 * later during device probing.
572 * Returns 0 if no settings are found, 1 if they are.
573 */
574 int netdev_boot_setup_check(struct net_device *dev)
575 {
576 struct netdev_boot_setup *s = dev_boot_setup;
577 int i;
578
579 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
580 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
581 !strcmp(dev->name, s[i].name)) {
582 dev->irq = s[i].map.irq;
583 dev->base_addr = s[i].map.base_addr;
584 dev->mem_start = s[i].map.mem_start;
585 dev->mem_end = s[i].map.mem_end;
586 return 1;
587 }
588 }
589 return 0;
590 }
591 EXPORT_SYMBOL(netdev_boot_setup_check);
592
593
594 /**
595 * netdev_boot_base - get address from boot time settings
596 * @prefix: prefix for network device
597 * @unit: id for network device
598 *
599 * Check boot time settings for the base address of device.
600 * The found settings are set for the device to be used
601 * later in the device probing.
602 * Returns 0 if no settings found.
603 */
604 unsigned long netdev_boot_base(const char *prefix, int unit)
605 {
606 const struct netdev_boot_setup *s = dev_boot_setup;
607 char name[IFNAMSIZ];
608 int i;
609
610 sprintf(name, "%s%d", prefix, unit);
611
612 /*
613 * If device already registered then return base of 1
614 * to indicate not to probe for this interface
615 */
616 if (__dev_get_by_name(&init_net, name))
617 return 1;
618
619 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
620 if (!strcmp(name, s[i].name))
621 return s[i].map.base_addr;
622 return 0;
623 }
624
625 /*
626 * Saves at boot time configured settings for any netdevice.
627 */
628 int __init netdev_boot_setup(char *str)
629 {
630 int ints[5];
631 struct ifmap map;
632
633 str = get_options(str, ARRAY_SIZE(ints), ints);
634 if (!str || !*str)
635 return 0;
636
637 /* Save settings */
638 memset(&map, 0, sizeof(map));
639 if (ints[0] > 0)
640 map.irq = ints[1];
641 if (ints[0] > 1)
642 map.base_addr = ints[2];
643 if (ints[0] > 2)
644 map.mem_start = ints[3];
645 if (ints[0] > 3)
646 map.mem_end = ints[4];
647
648 /* Add new entry to the list */
649 return netdev_boot_setup_add(str, &map);
650 }
651
652 __setup("netdev=", netdev_boot_setup);
653
654 /*******************************************************************************
655
656 Device Interface Subroutines
657
658 *******************************************************************************/
659
660 /**
661 * __dev_get_by_name - find a device by its name
662 * @net: the applicable net namespace
663 * @name: name to find
664 *
665 * Find an interface by name. Must be called under RTNL semaphore
666 * or @dev_base_lock. If the name is found a pointer to the device
667 * is returned. If the name is not found then %NULL is returned. The
668 * reference counters are not incremented so the caller must be
669 * careful with locks.
670 */
671
672 struct net_device *__dev_get_by_name(struct net *net, const char *name)
673 {
674 struct net_device *dev;
675 struct hlist_head *head = dev_name_hash(net, name);
676
677 hlist_for_each_entry(dev, head, name_hlist)
678 if (!strncmp(dev->name, name, IFNAMSIZ))
679 return dev;
680
681 return NULL;
682 }
683 EXPORT_SYMBOL(__dev_get_by_name);
684
685 /**
686 * dev_get_by_name_rcu - find a device by its name
687 * @net: the applicable net namespace
688 * @name: name to find
689 *
690 * Find an interface by name.
691 * If the name is found a pointer to the device is returned.
692 * If the name is not found then %NULL is returned.
693 * The reference counters are not incremented so the caller must be
694 * careful with locks. The caller must hold RCU lock.
695 */
696
697 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
698 {
699 struct net_device *dev;
700 struct hlist_head *head = dev_name_hash(net, name);
701
702 hlist_for_each_entry_rcu(dev, head, name_hlist)
703 if (!strncmp(dev->name, name, IFNAMSIZ))
704 return dev;
705
706 return NULL;
707 }
708 EXPORT_SYMBOL(dev_get_by_name_rcu);
709
710 /**
711 * dev_get_by_name - find a device by its name
712 * @net: the applicable net namespace
713 * @name: name to find
714 *
715 * Find an interface by name. This can be called from any
716 * context and does its own locking. The returned handle has
717 * the usage count incremented and the caller must use dev_put() to
718 * release it when it is no longer needed. %NULL is returned if no
719 * matching device is found.
720 */
721
722 struct net_device *dev_get_by_name(struct net *net, const char *name)
723 {
724 struct net_device *dev;
725
726 rcu_read_lock();
727 dev = dev_get_by_name_rcu(net, name);
728 if (dev)
729 dev_hold(dev);
730 rcu_read_unlock();
731 return dev;
732 }
733 EXPORT_SYMBOL(dev_get_by_name);
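
/* Illustrative sketch of the two lookup patterns described above; "eth0" is
 * just a placeholder name.
 *
 *	// Takes a reference; drop it with dev_put() when done.
 *	struct net_device *dev = dev_get_by_name(net, "eth0");
 *	if (dev) {
 *		...
 *		dev_put(dev);
 *	}
 *
 *	// No reference taken; the pointer is only valid inside the
 *	// RCU read-side section, and the caller must not sleep there.
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(net, "eth0");
 *	if (dev)
 *		...
 *	rcu_read_unlock();
 */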
734
735 /**
736 * __dev_get_by_index - find a device by its ifindex
737 * @net: the applicable net namespace
738 * @ifindex: index of device
739 *
740 * Search for an interface by index. Returns %NULL if the device
741 * is not found or a pointer to the device. The device has not
742 * had its reference counter increased so the caller must be careful
743 * about locking. The caller must hold either the RTNL semaphore
744 * or @dev_base_lock.
745 */
746
747 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
748 {
749 struct net_device *dev;
750 struct hlist_head *head = dev_index_hash(net, ifindex);
751
752 hlist_for_each_entry(dev, head, index_hlist)
753 if (dev->ifindex == ifindex)
754 return dev;
755
756 return NULL;
757 }
758 EXPORT_SYMBOL(__dev_get_by_index);
759
760 /**
761 * dev_get_by_index_rcu - find a device by its ifindex
762 * @net: the applicable net namespace
763 * @ifindex: index of device
764 *
765 * Search for an interface by index. Returns %NULL if the device
766 * is not found or a pointer to the device. The device has not
767 * had its reference counter increased so the caller must be careful
768 * about locking. The caller must hold RCU lock.
769 */
770
771 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
772 {
773 struct net_device *dev;
774 struct hlist_head *head = dev_index_hash(net, ifindex);
775
776 hlist_for_each_entry_rcu(dev, head, index_hlist)
777 if (dev->ifindex == ifindex)
778 return dev;
779
780 return NULL;
781 }
782 EXPORT_SYMBOL(dev_get_by_index_rcu);
783
784
785 /**
786 * dev_get_by_index - find a device by its ifindex
787 * @net: the applicable net namespace
788 * @ifindex: index of device
789 *
790 * Search for an interface by index. Returns NULL if the device
791 * is not found or a pointer to the device. The device returned has
792 * had a reference added and the pointer is safe until the user calls
793 * dev_put to indicate they have finished with it.
794 */
795
796 struct net_device *dev_get_by_index(struct net *net, int ifindex)
797 {
798 struct net_device *dev;
799
800 rcu_read_lock();
801 dev = dev_get_by_index_rcu(net, ifindex);
802 if (dev)
803 dev_hold(dev);
804 rcu_read_unlock();
805 return dev;
806 }
807 EXPORT_SYMBOL(dev_get_by_index);
808
809 /**
810 * netdev_get_name - get a netdevice name, knowing its ifindex.
811 * @net: network namespace
812 * @name: a pointer to the buffer where the name will be stored.
813 * @ifindex: the ifindex of the interface to get the name from.
814 *
815 * The use of raw_seqcount_begin() and cond_resched() before
816 * retrying is required as we want to give the writers a chance
817 * to complete when CONFIG_PREEMPT is not set.
818 */
819 int netdev_get_name(struct net *net, char *name, int ifindex)
820 {
821 struct net_device *dev;
822 unsigned int seq;
823
824 retry:
825 seq = raw_seqcount_begin(&devnet_rename_seq);
826 rcu_read_lock();
827 dev = dev_get_by_index_rcu(net, ifindex);
828 if (!dev) {
829 rcu_read_unlock();
830 return -ENODEV;
831 }
832
833 strcpy(name, dev->name);
834 rcu_read_unlock();
835 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
836 cond_resched();
837 goto retry;
838 }
839
840 return 0;
841 }
842
843 /**
844 * dev_getbyhwaddr_rcu - find a device by its hardware address
845 * @net: the applicable net namespace
846 * @type: media type of device
847 * @ha: hardware address
848 *
849 * Search for an interface by MAC address. Returns NULL if the device
850 * is not found or a pointer to the device.
851 * The caller must hold RCU or RTNL.
852 * The returned device has not had its ref count increased
853 * and the caller must therefore be careful about locking
854 *
855 */
856
857 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
858 const char *ha)
859 {
860 struct net_device *dev;
861
862 for_each_netdev_rcu(net, dev)
863 if (dev->type == type &&
864 !memcmp(dev->dev_addr, ha, dev->addr_len))
865 return dev;
866
867 return NULL;
868 }
869 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
870
871 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
872 {
873 struct net_device *dev;
874
875 ASSERT_RTNL();
876 for_each_netdev(net, dev)
877 if (dev->type == type)
878 return dev;
879
880 return NULL;
881 }
882 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
883
884 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
885 {
886 struct net_device *dev, *ret = NULL;
887
888 rcu_read_lock();
889 for_each_netdev_rcu(net, dev)
890 if (dev->type == type) {
891 dev_hold(dev);
892 ret = dev;
893 break;
894 }
895 rcu_read_unlock();
896 return ret;
897 }
898 EXPORT_SYMBOL(dev_getfirstbyhwtype);
899
900 /**
901 * __dev_get_by_flags - find any device with given flags
902 * @net: the applicable net namespace
903 * @if_flags: IFF_* values
904 * @mask: bitmask of bits in if_flags to check
905 *
906 * Search for any interface with the given flags. Returns NULL if a device
907 * is not found or a pointer to the device. Must be called inside
908 * rtnl_lock(), and result refcount is unchanged.
909 */
910
911 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
912 unsigned short mask)
913 {
914 struct net_device *dev, *ret;
915
916 ASSERT_RTNL();
917
918 ret = NULL;
919 for_each_netdev(net, dev) {
920 if (((dev->flags ^ if_flags) & mask) == 0) {
921 ret = dev;
922 break;
923 }
924 }
925 return ret;
926 }
927 EXPORT_SYMBOL(__dev_get_by_flags);
928
929 /**
930 * dev_valid_name - check if name is okay for network device
931 * @name: name string
932 *
933 * Network device names need to be valid file names
934 * to allow sysfs to work. We also disallow any kind of
935 * whitespace.
936 */
937 bool dev_valid_name(const char *name)
938 {
939 if (*name == '\0')
940 return false;
941 if (strlen(name) >= IFNAMSIZ)
942 return false;
943 if (!strcmp(name, ".") || !strcmp(name, ".."))
944 return false;
945
946 while (*name) {
947 if (*name == '/' || isspace(*name))
948 return false;
949 name++;
950 }
951 return true;
952 }
953 EXPORT_SYMBOL(dev_valid_name);
954
955 /**
956 * __dev_alloc_name - allocate a name for a device
957 * @net: network namespace to allocate the device name in
958 * @name: name format string
959 * @buf: scratch buffer and result name string
960 *
961 * Passed a format string, e.g. "lt%d", it will try to find a suitable
962 * id. It scans the list of devices to build up a free map, then chooses
963 * the first empty slot. The caller must hold the dev_base or rtnl lock
964 * while allocating the name and adding the device in order to avoid
965 * duplicates.
966 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
967 * Returns the number of the unit assigned or a negative errno code.
968 */
969
970 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
971 {
972 int i = 0;
973 const char *p;
974 const int max_netdevices = 8*PAGE_SIZE;
975 unsigned long *inuse;
976 struct net_device *d;
977
978 p = strnchr(name, IFNAMSIZ-1, '%');
979 if (p) {
980 /*
981 * Verify the string as this thing may have come from
982 * the user. There must be exactly one "%d" and no other "%"
983 * characters.
984 */
985 if (p[1] != 'd' || strchr(p + 2, '%'))
986 return -EINVAL;
987
988 /* Use one page as a bit array of possible slots */
989 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
990 if (!inuse)
991 return -ENOMEM;
992
993 for_each_netdev(net, d) {
994 if (!sscanf(d->name, name, &i))
995 continue;
996 if (i < 0 || i >= max_netdevices)
997 continue;
998
999 /* avoid cases where sscanf is not exact inverse of printf */
1000 snprintf(buf, IFNAMSIZ, name, i);
1001 if (!strncmp(buf, d->name, IFNAMSIZ))
1002 set_bit(i, inuse);
1003 }
1004
1005 i = find_first_zero_bit(inuse, max_netdevices);
1006 free_page((unsigned long) inuse);
1007 }
1008
1009 if (buf != name)
1010 snprintf(buf, IFNAMSIZ, name, i);
1011 if (!__dev_get_by_name(net, buf))
1012 return i;
1013
1014 /* It is possible to run out of possible slots
1015 * when the name is long and there isn't enough space left
1016 * for the digits, or if all bits are used.
1017 */
1018 return -ENFILE;
1019 }
1020
1021 /**
1022 * dev_alloc_name - allocate a name for a device
1023 * @dev: device
1024 * @name: name format string
1025 *
1026 * Passed a format string, e.g. "lt%d", it will try to find a suitable
1027 * id. It scans the list of devices to build up a free map, then chooses
1028 * the first empty slot. The caller must hold the dev_base or rtnl lock
1029 * while allocating the name and adding the device in order to avoid
1030 * duplicates.
1031 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1032 * Returns the number of the unit assigned or a negative errno code.
1033 */
1034
1035 int dev_alloc_name(struct net_device *dev, const char *name)
1036 {
1037 char buf[IFNAMSIZ];
1038 struct net *net;
1039 int ret;
1040
1041 BUG_ON(!dev_net(dev));
1042 net = dev_net(dev);
1043 ret = __dev_alloc_name(net, name, buf);
1044 if (ret >= 0)
1045 strlcpy(dev->name, buf, IFNAMSIZ);
1046 return ret;
1047 }
1048 EXPORT_SYMBOL(dev_alloc_name);
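
/* Illustrative note: e.g. dev_alloc_name(dev, "eth%d") picks the lowest free
 * "eth<N>" in dev's namespace, copies it into dev->name and returns the unit
 * number, or a negative errno on failure.
 */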
1049
1050 static int dev_alloc_name_ns(struct net *net,
1051 struct net_device *dev,
1052 const char *name)
1053 {
1054 char buf[IFNAMSIZ];
1055 int ret;
1056
1057 ret = __dev_alloc_name(net, name, buf);
1058 if (ret >= 0)
1059 strlcpy(dev->name, buf, IFNAMSIZ);
1060 return ret;
1061 }
1062
1063 static int dev_get_valid_name(struct net *net,
1064 struct net_device *dev,
1065 const char *name)
1066 {
1067 BUG_ON(!net);
1068
1069 if (!dev_valid_name(name))
1070 return -EINVAL;
1071
1072 if (strchr(name, '%'))
1073 return dev_alloc_name_ns(net, dev, name);
1074 else if (__dev_get_by_name(net, name))
1075 return -EEXIST;
1076 else if (dev->name != name)
1077 strlcpy(dev->name, name, IFNAMSIZ);
1078
1079 return 0;
1080 }
1081
1082 /**
1083 * dev_change_name - change name of a device
1084 * @dev: device
1085 * @newname: name (or format string) must be at least IFNAMSIZ
1086 *
1087 * Change name of a device. A format string such as "eth%d"
1088 * can be passed for wildcarding.
1089 */
1090 int dev_change_name(struct net_device *dev, const char *newname)
1091 {
1092 unsigned char old_assign_type;
1093 char oldname[IFNAMSIZ];
1094 int err = 0;
1095 int ret;
1096 struct net *net;
1097
1098 ASSERT_RTNL();
1099 BUG_ON(!dev_net(dev));
1100
1101 net = dev_net(dev);
1102 if (dev->flags & IFF_UP)
1103 return -EBUSY;
1104
1105 write_seqcount_begin(&devnet_rename_seq);
1106
1107 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1108 write_seqcount_end(&devnet_rename_seq);
1109 return 0;
1110 }
1111
1112 memcpy(oldname, dev->name, IFNAMSIZ);
1113
1114 err = dev_get_valid_name(net, dev, newname);
1115 if (err < 0) {
1116 write_seqcount_end(&devnet_rename_seq);
1117 return err;
1118 }
1119
1120 if (oldname[0] && !strchr(oldname, '%'))
1121 netdev_info(dev, "renamed from %s\n", oldname);
1122
1123 old_assign_type = dev->name_assign_type;
1124 dev->name_assign_type = NET_NAME_RENAMED;
1125
1126 rollback:
1127 ret = device_rename(&dev->dev, dev->name);
1128 if (ret) {
1129 memcpy(dev->name, oldname, IFNAMSIZ);
1130 dev->name_assign_type = old_assign_type;
1131 write_seqcount_end(&devnet_rename_seq);
1132 return ret;
1133 }
1134
1135 write_seqcount_end(&devnet_rename_seq);
1136
1137 netdev_adjacent_rename_links(dev, oldname);
1138
1139 write_lock_bh(&dev_base_lock);
1140 hlist_del_rcu(&dev->name_hlist);
1141 write_unlock_bh(&dev_base_lock);
1142
1143 synchronize_rcu();
1144
1145 write_lock_bh(&dev_base_lock);
1146 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1147 write_unlock_bh(&dev_base_lock);
1148
1149 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1150 ret = notifier_to_errno(ret);
1151
1152 if (ret) {
1153 /* err >= 0 after dev_alloc_name() or stores the first errno */
1154 if (err >= 0) {
1155 err = ret;
1156 write_seqcount_begin(&devnet_rename_seq);
1157 memcpy(dev->name, oldname, IFNAMSIZ);
1158 memcpy(oldname, newname, IFNAMSIZ);
1159 dev->name_assign_type = old_assign_type;
1160 old_assign_type = NET_NAME_RENAMED;
1161 goto rollback;
1162 } else {
1163 pr_err("%s: name change rollback failed: %d\n",
1164 dev->name, ret);
1165 }
1166 }
1167
1168 return err;
1169 }
1170
1171 /**
1172 * dev_set_alias - change ifalias of a device
1173 * @dev: device
1174 * @alias: name up to IFALIASZ
1175 * @len: limit of bytes to copy from info
1176 *
1177 * Set ifalias for a device.
1178 */
1179 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1180 {
1181 char *new_ifalias;
1182
1183 ASSERT_RTNL();
1184
1185 if (len >= IFALIASZ)
1186 return -EINVAL;
1187
1188 if (!len) {
1189 kfree(dev->ifalias);
1190 dev->ifalias = NULL;
1191 return 0;
1192 }
1193
1194 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1195 if (!new_ifalias)
1196 return -ENOMEM;
1197 dev->ifalias = new_ifalias;
1198
1199 strlcpy(dev->ifalias, alias, len+1);
1200 return len;
1201 }
1202
1203
1204 /**
1205 * netdev_features_change - device changes features
1206 * @dev: device to cause notification
1207 *
1208 * Called to indicate a device has changed features.
1209 */
1210 void netdev_features_change(struct net_device *dev)
1211 {
1212 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1213 }
1214 EXPORT_SYMBOL(netdev_features_change);
1215
1216 /**
1217 * netdev_state_change - device changes state
1218 * @dev: device to cause notification
1219 *
1220 * Called to indicate a device has changed state. This function calls
1221 * the notifier chains for netdev_chain and sends a NEWLINK message
1222 * to the routing socket.
1223 */
1224 void netdev_state_change(struct net_device *dev)
1225 {
1226 if (dev->flags & IFF_UP) {
1227 struct netdev_notifier_change_info change_info;
1228
1229 change_info.flags_changed = 0;
1230 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1231 &change_info.info);
1232 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1233 }
1234 }
1235 EXPORT_SYMBOL(netdev_state_change);
1236
1237 /**
1238 * netdev_notify_peers - notify network peers about existence of @dev
1239 * @dev: network device
1240 *
1241 * Generate traffic such that interested network peers are aware of
1242 * @dev, such as by generating a gratuitous ARP. This may be used when
1243 * a device wants to inform the rest of the network about some sort of
1244 * reconfiguration such as a failover event or virtual machine
1245 * migration.
1246 */
1247 void netdev_notify_peers(struct net_device *dev)
1248 {
1249 rtnl_lock();
1250 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1251 rtnl_unlock();
1252 }
1253 EXPORT_SYMBOL(netdev_notify_peers);
1254
1255 static int __dev_open(struct net_device *dev)
1256 {
1257 const struct net_device_ops *ops = dev->netdev_ops;
1258 int ret;
1259
1260 ASSERT_RTNL();
1261
1262 if (!netif_device_present(dev))
1263 return -ENODEV;
1264
1265 /* Block netpoll from trying to do any rx path servicing.
1266 * If we don't do this there is a chance ndo_poll_controller
1267 * or ndo_poll may be running while we open the device
1268 */
1269 netpoll_poll_disable(dev);
1270
1271 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1272 ret = notifier_to_errno(ret);
1273 if (ret)
1274 return ret;
1275
1276 set_bit(__LINK_STATE_START, &dev->state);
1277
1278 if (ops->ndo_validate_addr)
1279 ret = ops->ndo_validate_addr(dev);
1280
1281 if (!ret && ops->ndo_open)
1282 ret = ops->ndo_open(dev);
1283
1284 netpoll_poll_enable(dev);
1285
1286 if (ret)
1287 clear_bit(__LINK_STATE_START, &dev->state);
1288 else {
1289 dev->flags |= IFF_UP;
1290 dev_set_rx_mode(dev);
1291 dev_activate(dev);
1292 add_device_randomness(dev->dev_addr, dev->addr_len);
1293 }
1294
1295 return ret;
1296 }
1297
1298 /**
1299 * dev_open - prepare an interface for use.
1300 * @dev: device to open
1301 *
1302 * Takes a device from down to up state. The device's private open
1303 * function is invoked and then the multicast lists are loaded. Finally
1304 * the device is moved into the up state and a %NETDEV_UP message is
1305 * sent to the netdev notifier chain.
1306 *
1307 * Calling this function on an active interface is a nop. On a failure
1308 * a negative errno code is returned.
1309 */
1310 int dev_open(struct net_device *dev)
1311 {
1312 int ret;
1313
1314 if (dev->flags & IFF_UP)
1315 return 0;
1316
1317 ret = __dev_open(dev);
1318 if (ret < 0)
1319 return ret;
1320
1321 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1322 call_netdevice_notifiers(NETDEV_UP, dev);
1323
1324 return ret;
1325 }
1326 EXPORT_SYMBOL(dev_open);
1327
1328 static int __dev_close_many(struct list_head *head)
1329 {
1330 struct net_device *dev;
1331
1332 ASSERT_RTNL();
1333 might_sleep();
1334
1335 list_for_each_entry(dev, head, close_list) {
1336 /* Temporarily disable netpoll until the interface is down */
1337 netpoll_poll_disable(dev);
1338
1339 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1340
1341 clear_bit(__LINK_STATE_START, &dev->state);
1342
1343 /* Synchronize to scheduled poll. We cannot touch poll list, it
1344 * can be even on different cpu. So just clear netif_running().
1345 *
1346 * dev->stop() will invoke napi_disable() on all of its
1347 * napi_struct instances on this device.
1348 */
1349 smp_mb__after_atomic(); /* Commit netif_running(). */
1350 }
1351
1352 dev_deactivate_many(head);
1353
1354 list_for_each_entry(dev, head, close_list) {
1355 const struct net_device_ops *ops = dev->netdev_ops;
1356
1357 /*
1358 * Call the device specific close. This cannot fail.
1359 * Only if device is UP
1360 *
1361 * We allow it to be called even after a DETACH hot-plug
1362 * event.
1363 */
1364 if (ops->ndo_stop)
1365 ops->ndo_stop(dev);
1366
1367 dev->flags &= ~IFF_UP;
1368 netpoll_poll_enable(dev);
1369 }
1370
1371 return 0;
1372 }
1373
1374 static int __dev_close(struct net_device *dev)
1375 {
1376 int retval;
1377 LIST_HEAD(single);
1378
1379 list_add(&dev->close_list, &single);
1380 retval = __dev_close_many(&single);
1381 list_del(&single);
1382
1383 return retval;
1384 }
1385
1386 static int dev_close_many(struct list_head *head)
1387 {
1388 struct net_device *dev, *tmp;
1389
1390 /* Remove the devices that don't need to be closed */
1391 list_for_each_entry_safe(dev, tmp, head, close_list)
1392 if (!(dev->flags & IFF_UP))
1393 list_del_init(&dev->close_list);
1394
1395 __dev_close_many(head);
1396
1397 list_for_each_entry_safe(dev, tmp, head, close_list) {
1398 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1399 call_netdevice_notifiers(NETDEV_DOWN, dev);
1400 list_del_init(&dev->close_list);
1401 }
1402
1403 return 0;
1404 }
1405
1406 /**
1407 * dev_close - shutdown an interface.
1408 * @dev: device to shutdown
1409 *
1410 * This function moves an active device into down state. A
1411 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1412 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1413 * chain.
1414 */
1415 int dev_close(struct net_device *dev)
1416 {
1417 if (dev->flags & IFF_UP) {
1418 LIST_HEAD(single);
1419
1420 list_add(&dev->close_list, &single);
1421 dev_close_many(&single);
1422 list_del(&single);
1423 }
1424 return 0;
1425 }
1426 EXPORT_SYMBOL(dev_close);
1427
1428
1429 /**
1430 * dev_disable_lro - disable Large Receive Offload on a device
1431 * @dev: device
1432 *
1433 * Disable Large Receive Offload (LRO) on a net device. Must be
1434 * called under RTNL. This is needed if received packets may be
1435 * forwarded to another interface.
1436 */
1437 void dev_disable_lro(struct net_device *dev)
1438 {
1439 /*
1440 * If we're trying to disable lro on a vlan device
1441 * use the underlying physical device instead
1442 */
1443 if (is_vlan_dev(dev))
1444 dev = vlan_dev_real_dev(dev);
1445
1446 /* the same for macvlan devices */
1447 if (netif_is_macvlan(dev))
1448 dev = macvlan_dev_real_dev(dev);
1449
1450 dev->wanted_features &= ~NETIF_F_LRO;
1451 netdev_update_features(dev);
1452
1453 if (unlikely(dev->features & NETIF_F_LRO))
1454 netdev_WARN(dev, "failed to disable LRO!\n");
1455 }
1456 EXPORT_SYMBOL(dev_disable_lro);
1457
1458 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1459 struct net_device *dev)
1460 {
1461 struct netdev_notifier_info info;
1462
1463 netdev_notifier_info_init(&info, dev);
1464 return nb->notifier_call(nb, val, &info);
1465 }
1466
1467 static int dev_boot_phase = 1;
1468
1469 /**
1470 * register_netdevice_notifier - register a network notifier block
1471 * @nb: notifier
1472 *
1473 * Register a notifier to be called when network device events occur.
1474 * The notifier passed is linked into the kernel structures and must
1475 * not be reused until it has been unregistered. A negative errno code
1476 * is returned on a failure.
1477 *
1478 * When registered, all registration and up events are replayed
1479 * to the new notifier so that it gets a race-free
1480 * view of the network device list.
1481 */
1482
1483 int register_netdevice_notifier(struct notifier_block *nb)
1484 {
1485 struct net_device *dev;
1486 struct net_device *last;
1487 struct net *net;
1488 int err;
1489
1490 rtnl_lock();
1491 err = raw_notifier_chain_register(&netdev_chain, nb);
1492 if (err)
1493 goto unlock;
1494 if (dev_boot_phase)
1495 goto unlock;
1496 for_each_net(net) {
1497 for_each_netdev(net, dev) {
1498 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1499 err = notifier_to_errno(err);
1500 if (err)
1501 goto rollback;
1502
1503 if (!(dev->flags & IFF_UP))
1504 continue;
1505
1506 call_netdevice_notifier(nb, NETDEV_UP, dev);
1507 }
1508 }
1509
1510 unlock:
1511 rtnl_unlock();
1512 return err;
1513
1514 rollback:
1515 last = dev;
1516 for_each_net(net) {
1517 for_each_netdev(net, dev) {
1518 if (dev == last)
1519 goto outroll;
1520
1521 if (dev->flags & IFF_UP) {
1522 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1523 dev);
1524 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1525 }
1526 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1527 }
1528 }
1529
1530 outroll:
1531 raw_notifier_chain_unregister(&netdev_chain, nb);
1532 goto unlock;
1533 }
1534 EXPORT_SYMBOL(register_netdevice_notifier);
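
/* Illustrative sketch of a minimal notifier; the names my_netdev_event and
 * my_netdev_nb are hypothetical.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 *	...
 *	unregister_netdevice_notifier(&my_netdev_nb);
 *
 * As documented above, REGISTER and UP events for devices that already exist
 * are replayed to the new notifier at registration time.
 */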
1535
1536 /**
1537 * unregister_netdevice_notifier - unregister a network notifier block
1538 * @nb: notifier
1539 *
1540 * Unregister a notifier previously registered by
1541 * register_netdevice_notifier(). The notifier is unlinked from the
1542 * kernel structures and may then be reused. A negative errno code
1543 * is returned on a failure.
1544 *
1545 * After unregistering unregister and down device events are synthesized
1546 * for all devices on the device list to the removed notifier to remove
1547 * the need for special case cleanup code.
1548 */
1549
1550 int unregister_netdevice_notifier(struct notifier_block *nb)
1551 {
1552 struct net_device *dev;
1553 struct net *net;
1554 int err;
1555
1556 rtnl_lock();
1557 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1558 if (err)
1559 goto unlock;
1560
1561 for_each_net(net) {
1562 for_each_netdev(net, dev) {
1563 if (dev->flags & IFF_UP) {
1564 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1565 dev);
1566 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1567 }
1568 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1569 }
1570 }
1571 unlock:
1572 rtnl_unlock();
1573 return err;
1574 }
1575 EXPORT_SYMBOL(unregister_netdevice_notifier);
1576
1577 /**
1578 * call_netdevice_notifiers_info - call all network notifier blocks
1579 * @val: value passed unmodified to notifier function
1580 * @dev: net_device pointer passed unmodified to notifier function
1581 * @info: notifier information data
1582 *
1583 * Call all network notifier blocks. Parameters and return value
1584 * are as for raw_notifier_call_chain().
1585 */
1586
1587 static int call_netdevice_notifiers_info(unsigned long val,
1588 struct net_device *dev,
1589 struct netdev_notifier_info *info)
1590 {
1591 ASSERT_RTNL();
1592 netdev_notifier_info_init(info, dev);
1593 return raw_notifier_call_chain(&netdev_chain, val, info);
1594 }
1595
1596 /**
1597 * call_netdevice_notifiers - call all network notifier blocks
1598 * @val: value passed unmodified to notifier function
1599 * @dev: net_device pointer passed unmodified to notifier function
1600 *
1601 * Call all network notifier blocks. Parameters and return value
1602 * are as for raw_notifier_call_chain().
1603 */
1604
1605 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1606 {
1607 struct netdev_notifier_info info;
1608
1609 return call_netdevice_notifiers_info(val, dev, &info);
1610 }
1611 EXPORT_SYMBOL(call_netdevice_notifiers);
1612
1613 static struct static_key netstamp_needed __read_mostly;
1614 #ifdef HAVE_JUMP_LABEL
1615 /* We are not allowed to call static_key_slow_dec() from irq context
1616 * If net_disable_timestamp() is called from irq context, defer the
1617 * static_key_slow_dec() calls.
1618 */
1619 static atomic_t netstamp_needed_deferred;
1620 #endif
1621
1622 void net_enable_timestamp(void)
1623 {
1624 #ifdef HAVE_JUMP_LABEL
1625 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1626
1627 if (deferred) {
1628 while (--deferred)
1629 static_key_slow_dec(&netstamp_needed);
1630 return;
1631 }
1632 #endif
1633 static_key_slow_inc(&netstamp_needed);
1634 }
1635 EXPORT_SYMBOL(net_enable_timestamp);
1636
1637 void net_disable_timestamp(void)
1638 {
1639 #ifdef HAVE_JUMP_LABEL
1640 if (in_interrupt()) {
1641 atomic_inc(&netstamp_needed_deferred);
1642 return;
1643 }
1644 #endif
1645 static_key_slow_dec(&netstamp_needed);
1646 }
1647 EXPORT_SYMBOL(net_disable_timestamp);
1648
1649 static inline void net_timestamp_set(struct sk_buff *skb)
1650 {
1651 skb->tstamp.tv64 = 0;
1652 if (static_key_false(&netstamp_needed))
1653 __net_timestamp(skb);
1654 }
1655
1656 #define net_timestamp_check(COND, SKB) \
1657 if (static_key_false(&netstamp_needed)) { \
1658 if ((COND) && !(SKB)->tstamp.tv64) \
1659 __net_timestamp(SKB); \
1660 } \
1661
1662 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1663 {
1664 unsigned int len;
1665
1666 if (!(dev->flags & IFF_UP))
1667 return false;
1668
1669 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1670 if (skb->len <= len)
1671 return true;
1672
1673 /* if TSO is enabled, we don't care about the length as the packet
1674 * could be forwarded without being segmented before
1675 */
1676 if (skb_is_gso(skb))
1677 return true;
1678
1679 return false;
1680 }
1681 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1682
1683 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1684 {
1685 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1686 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1687 atomic_long_inc(&dev->rx_dropped);
1688 kfree_skb(skb);
1689 return NET_RX_DROP;
1690 }
1691 }
1692
1693 if (unlikely(!is_skb_forwardable(dev, skb))) {
1694 atomic_long_inc(&dev->rx_dropped);
1695 kfree_skb(skb);
1696 return NET_RX_DROP;
1697 }
1698
1699 skb_scrub_packet(skb, true);
1700 skb->protocol = eth_type_trans(skb, dev);
1701
1702 return 0;
1703 }
1704 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1705
1706 /**
1707 * dev_forward_skb - loopback an skb to another netif
1708 *
1709 * @dev: destination network device
1710 * @skb: buffer to forward
1711 *
1712 * return values:
1713 * NET_RX_SUCCESS (no congestion)
1714 * NET_RX_DROP (packet was dropped, but freed)
1715 *
1716 * dev_forward_skb can be used for injecting an skb from the
1717 * start_xmit function of one device into the receive queue
1718 * of another device.
1719 *
1720 * The receiving device may be in another namespace, so
1721 * we have to clear all information in the skb that could
1722 * impact namespace isolation.
1723 */
1724 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1725 {
1726 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1727 }
1728 EXPORT_SYMBOL_GPL(dev_forward_skb);
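
/* Illustrative note: virtual drivers such as veth use this from their
 * ndo_start_xmit to hand the skb to a peer device:
 *
 *	if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS)
 *		... update tx counters ...
 *	else
 *		... count the drop; the skb has already been freed ...
 */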
1729
1730 static inline int deliver_skb(struct sk_buff *skb,
1731 struct packet_type *pt_prev,
1732 struct net_device *orig_dev)
1733 {
1734 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1735 return -ENOMEM;
1736 atomic_inc(&skb->users);
1737 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1738 }
1739
1740 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1741 {
1742 if (!ptype->af_packet_priv || !skb->sk)
1743 return false;
1744
1745 if (ptype->id_match)
1746 return ptype->id_match(ptype, skb->sk);
1747 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1748 return true;
1749
1750 return false;
1751 }
1752
1753 /*
1754 * Support routine. Sends outgoing frames to any network
1755 * taps currently in use.
1756 */
1757
1758 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1759 {
1760 struct packet_type *ptype;
1761 struct sk_buff *skb2 = NULL;
1762 struct packet_type *pt_prev = NULL;
1763
1764 rcu_read_lock();
1765 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1766 /* Never send packets back to the socket
1767 * they originated from - MvS (miquels@drinkel.ow.org)
1768 */
1769 if ((ptype->dev == dev || !ptype->dev) &&
1770 (!skb_loop_sk(ptype, skb))) {
1771 if (pt_prev) {
1772 deliver_skb(skb2, pt_prev, skb->dev);
1773 pt_prev = ptype;
1774 continue;
1775 }
1776
1777 skb2 = skb_clone(skb, GFP_ATOMIC);
1778 if (!skb2)
1779 break;
1780
1781 net_timestamp_set(skb2);
1782
1783 /* skb->nh should be correctly
1784 set by sender, so that the second statement is
1785 just protection against buggy protocols.
1786 */
1787 skb_reset_mac_header(skb2);
1788
1789 if (skb_network_header(skb2) < skb2->data ||
1790 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1791 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1792 ntohs(skb2->protocol),
1793 dev->name);
1794 skb_reset_network_header(skb2);
1795 }
1796
1797 skb2->transport_header = skb2->network_header;
1798 skb2->pkt_type = PACKET_OUTGOING;
1799 pt_prev = ptype;
1800 }
1801 }
1802 if (pt_prev)
1803 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1804 rcu_read_unlock();
1805 }
1806
1807 /**
1808 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1809 * @dev: Network device
1810 * @txq: number of queues available
1811 *
1812 * If real_num_tx_queues is changed the tc mappings may no longer be
1813 * valid. To resolve this verify that each tc mapping remains valid and,
1814 * if not, zero the mapping. With no priorities mapping to this
1815 * offset/count pair it will no longer be used. In the worst case, if TC0
1816 * is invalid nothing can be done, so priority mappings are disabled. It is
1817 * expected that drivers will fix this mapping if they can before
1818 * calling netif_set_real_num_tx_queues.
1819 */
1820 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1821 {
1822 int i;
1823 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1824
1825 /* If TC0 is invalidated disable TC mapping */
1826 if (tc->offset + tc->count > txq) {
1827 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1828 dev->num_tc = 0;
1829 return;
1830 }
1831
1832 /* Invalidated prio to tc mappings set to TC0 */
1833 for (i = 1; i < TC_BITMASK + 1; i++) {
1834 int q = netdev_get_prio_tc_map(dev, i);
1835
1836 tc = &dev->tc_to_txq[q];
1837 if (tc->offset + tc->count > txq) {
1838 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1839 i, q);
1840 netdev_set_prio_tc_map(dev, i, 0);
1841 }
1842 }
1843 }
1844
1845 #ifdef CONFIG_XPS
1846 static DEFINE_MUTEX(xps_map_mutex);
1847 #define xmap_dereference(P) \
1848 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1849
1850 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1851 int cpu, u16 index)
1852 {
1853 struct xps_map *map = NULL;
1854 int pos;
1855
1856 if (dev_maps)
1857 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1858
1859 for (pos = 0; map && pos < map->len; pos++) {
1860 if (map->queues[pos] == index) {
1861 if (map->len > 1) {
1862 map->queues[pos] = map->queues[--map->len];
1863 } else {
1864 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1865 kfree_rcu(map, rcu);
1866 map = NULL;
1867 }
1868 break;
1869 }
1870 }
1871
1872 return map;
1873 }
1874
1875 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1876 {
1877 struct xps_dev_maps *dev_maps;
1878 int cpu, i;
1879 bool active = false;
1880
1881 mutex_lock(&xps_map_mutex);
1882 dev_maps = xmap_dereference(dev->xps_maps);
1883
1884 if (!dev_maps)
1885 goto out_no_maps;
1886
1887 for_each_possible_cpu(cpu) {
1888 for (i = index; i < dev->num_tx_queues; i++) {
1889 if (!remove_xps_queue(dev_maps, cpu, i))
1890 break;
1891 }
1892 if (i == dev->num_tx_queues)
1893 active = true;
1894 }
1895
1896 if (!active) {
1897 RCU_INIT_POINTER(dev->xps_maps, NULL);
1898 kfree_rcu(dev_maps, rcu);
1899 }
1900
1901 for (i = index; i < dev->num_tx_queues; i++)
1902 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1903 NUMA_NO_NODE);
1904
1905 out_no_maps:
1906 mutex_unlock(&xps_map_mutex);
1907 }
1908
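/* Make sure @cpu's map can take tx queue @index: return the existing
 * map when the queue is already present or there is spare capacity,
 * otherwise allocate a bigger copy (doubling alloc_len, or
 * XPS_MIN_MAP_ALLOC for a first map) on @cpu's NUMA node.  The caller
 * adds the queue itself.  Returns NULL on allocation failure.
 */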
1909 static struct xps_map *expand_xps_map(struct xps_map *map,
1910 int cpu, u16 index)
1911 {
1912 struct xps_map *new_map;
1913 int alloc_len = XPS_MIN_MAP_ALLOC;
1914 int i, pos;
1915
1916 for (pos = 0; map && pos < map->len; pos++) {
1917 if (map->queues[pos] != index)
1918 continue;
1919 return map;
1920 }
1921
1922 /* Need to add queue to this CPU's existing map */
1923 if (map) {
1924 if (pos < map->alloc_len)
1925 return map;
1926
1927 alloc_len = map->alloc_len * 2;
1928 }
1929
1930 /* Need to allocate a new map to store the queue for this CPU */
1931 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1932 cpu_to_node(cpu));
1933 if (!new_map)
1934 return NULL;
1935
1936 for (i = 0; i < pos; i++)
1937 new_map->queues[i] = map->queues[i];
1938 new_map->alloc_len = alloc_len;
1939 new_map->len = pos;
1940
1941 return new_map;
1942 }
1943
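/**
 * netif_set_xps_queue - set the XPS CPU mask of a transmit queue
 * @dev: network device
 * @mask: CPUs that should select tx queue @index when transmitting
 * @index: index of the tx queue being configured
 *
 * Rebuilds dev->xps_maps under xps_map_mutex: the queue is added to the
 * map of every online CPU in @mask, removed from CPUs no longer in the
 * mask, and the queue's NUMA node hint is refreshed.
 * Returns 0 on success or -ENOMEM on allocation failure.
 */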
1944 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1945 u16 index)
1946 {
1947 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1948 struct xps_map *map, *new_map;
1949 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1950 int cpu, numa_node_id = -2;
1951 bool active = false;
1952
1953 mutex_lock(&xps_map_mutex);
1954
1955 dev_maps = xmap_dereference(dev->xps_maps);
1956
1957 /* allocate memory for queue storage */
1958 for_each_online_cpu(cpu) {
1959 if (!cpumask_test_cpu(cpu, mask))
1960 continue;
1961
1962 if (!new_dev_maps)
1963 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1964 if (!new_dev_maps) {
1965 mutex_unlock(&xps_map_mutex);
1966 return -ENOMEM;
1967 }
1968
1969 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1970 NULL;
1971
1972 map = expand_xps_map(map, cpu, index);
1973 if (!map)
1974 goto error;
1975
1976 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1977 }
1978
1979 if (!new_dev_maps)
1980 goto out_no_new_maps;
1981
1982 for_each_possible_cpu(cpu) {
1983 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1984 /* add queue to CPU maps */
1985 int pos = 0;
1986
1987 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1988 while ((pos < map->len) && (map->queues[pos] != index))
1989 pos++;
1990
1991 if (pos == map->len)
1992 map->queues[map->len++] = index;
1993 #ifdef CONFIG_NUMA
1994 if (numa_node_id == -2)
1995 numa_node_id = cpu_to_node(cpu);
1996 else if (numa_node_id != cpu_to_node(cpu))
1997 numa_node_id = -1;
1998 #endif
1999 } else if (dev_maps) {
2000 /* fill in the new device map from the old device map */
2001 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2002 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2003 }
2004
2005 }
2006
2007 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2008
2009 /* Cleanup old maps */
2010 if (dev_maps) {
2011 for_each_possible_cpu(cpu) {
2012 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2013 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2014 if (map && map != new_map)
2015 kfree_rcu(map, rcu);
2016 }
2017
2018 kfree_rcu(dev_maps, rcu);
2019 }
2020
2021 dev_maps = new_dev_maps;
2022 active = true;
2023
2024 out_no_new_maps:
2025 /* update Tx queue numa node */
2026 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2027 (numa_node_id >= 0) ? numa_node_id :
2028 NUMA_NO_NODE);
2029
2030 if (!dev_maps)
2031 goto out_no_maps;
2032
2033 /* removes queue from unused CPUs */
2034 for_each_possible_cpu(cpu) {
2035 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2036 continue;
2037
2038 if (remove_xps_queue(dev_maps, cpu, index))
2039 active = true;
2040 }
2041
2042 /* free map if not active */
2043 if (!active) {
2044 RCU_INIT_POINTER(dev->xps_maps, NULL);
2045 kfree_rcu(dev_maps, rcu);
2046 }
2047
2048 out_no_maps:
2049 mutex_unlock(&xps_map_mutex);
2050
2051 return 0;
2052 error:
2053 /* remove any maps that we added */
2054 for_each_possible_cpu(cpu) {
2055 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2056 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2057 NULL;
2058 if (new_map && new_map != map)
2059 kfree(new_map);
2060 }
2061
2062 mutex_unlock(&xps_map_mutex);
2063
2064 kfree(new_dev_maps);
2065 return -ENOMEM;
2066 }
2067 EXPORT_SYMBOL(netif_set_xps_queue);
2068
2069 #endif
2070 /*
2071  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2072  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2073 */
2074 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2075 {
2076 int rc;
2077
2078 if (txq < 1 || txq > dev->num_tx_queues)
2079 return -EINVAL;
2080
2081 if (dev->reg_state == NETREG_REGISTERED ||
2082 dev->reg_state == NETREG_UNREGISTERING) {
2083 ASSERT_RTNL();
2084
2085 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2086 txq);
2087 if (rc)
2088 return rc;
2089
2090 if (dev->num_tc)
2091 netif_setup_tc(dev, txq);
2092
2093 if (txq < dev->real_num_tx_queues) {
2094 qdisc_reset_all_tx_gt(dev, txq);
2095 #ifdef CONFIG_XPS
2096 netif_reset_xps_queues_gt(dev, txq);
2097 #endif
2098 }
2099 }
2100
2101 dev->real_num_tx_queues = txq;
2102 return 0;
2103 }
2104 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2105
2106 #ifdef CONFIG_SYSFS
2107 /**
2108 * netif_set_real_num_rx_queues - set actual number of RX queues used
2109 * @dev: Network device
2110 * @rxq: Actual number of RX queues
2111 *
2112 * This must be called either with the rtnl_lock held or before
2113 * registration of the net device. Returns 0 on success, or a
2114 * negative error code. If called before registration, it always
2115 * succeeds.
2116 */
2117 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2118 {
2119 int rc;
2120
2121 if (rxq < 1 || rxq > dev->num_rx_queues)
2122 return -EINVAL;
2123
2124 if (dev->reg_state == NETREG_REGISTERED) {
2125 ASSERT_RTNL();
2126
2127 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2128 rxq);
2129 if (rc)
2130 return rc;
2131 }
2132
2133 dev->real_num_rx_queues = rxq;
2134 return 0;
2135 }
2136 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2137 #endif
2138
2139 /**
2140 * netif_get_num_default_rss_queues - default number of RSS queues
2141 *
2142 * This routine should set an upper limit on the number of RSS queues
2143 * used by default by multiqueue devices.
2144 */
2145 int netif_get_num_default_rss_queues(void)
2146 {
2147 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2148 }
2149 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2150
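/* Append @q to this CPU's softnet output queue and raise NET_TX_SOFTIRQ
 * so that net_tx_action() will run the qdisc; interrupts are disabled
 * around the per-CPU list update.
 */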
2151 static inline void __netif_reschedule(struct Qdisc *q)
2152 {
2153 struct softnet_data *sd;
2154 unsigned long flags;
2155
2156 local_irq_save(flags);
2157 sd = this_cpu_ptr(&softnet_data);
2158 q->next_sched = NULL;
2159 *sd->output_queue_tailp = q;
2160 sd->output_queue_tailp = &q->next_sched;
2161 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2162 local_irq_restore(flags);
2163 }
2164
2165 void __netif_schedule(struct Qdisc *q)
2166 {
2167 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2168 __netif_reschedule(q);
2169 }
2170 EXPORT_SYMBOL(__netif_schedule);
2171
2172 struct dev_kfree_skb_cb {
2173 enum skb_free_reason reason;
2174 };
2175
2176 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2177 {
2178 return (struct dev_kfree_skb_cb *)skb->cb;
2179 }
2180
2181 void netif_schedule_queue(struct netdev_queue *txq)
2182 {
2183 rcu_read_lock();
2184 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2185 struct Qdisc *q = rcu_dereference(txq->qdisc);
2186
2187 __netif_schedule(q);
2188 }
2189 rcu_read_unlock();
2190 }
2191 EXPORT_SYMBOL(netif_schedule_queue);
2192
2193 /**
2194 * netif_wake_subqueue - allow sending packets on subqueue
2195 * @dev: network device
2196 * @queue_index: sub queue index
2197 *
2198 * Resume individual transmit queue of a device with multiple transmit queues.
2199 */
2200 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2201 {
2202 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2203
2204 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2205 struct Qdisc *q;
2206
2207 rcu_read_lock();
2208 q = rcu_dereference(txq->qdisc);
2209 __netif_schedule(q);
2210 rcu_read_unlock();
2211 }
2212 }
2213 EXPORT_SYMBOL(netif_wake_subqueue);
2214
2215 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2216 {
2217 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2218 struct Qdisc *q;
2219
2220 rcu_read_lock();
2221 q = rcu_dereference(dev_queue->qdisc);
2222 __netif_schedule(q);
2223 rcu_read_unlock();
2224 }
2225 }
2226 EXPORT_SYMBOL(netif_tx_wake_queue);
2227
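/* Free an skb from hard-irq context or with interrupts disabled: drop
 * our reference and, if it was the last one, record @reason and chain
 * the skb onto this CPU's completion queue so that the NET_TX softirq
 * performs the actual free.
 */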
2228 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2229 {
2230 unsigned long flags;
2231
2232 if (likely(atomic_read(&skb->users) == 1)) {
2233 smp_rmb();
2234 atomic_set(&skb->users, 0);
2235 } else if (likely(!atomic_dec_and_test(&skb->users))) {
2236 return;
2237 }
2238 get_kfree_skb_cb(skb)->reason = reason;
2239 local_irq_save(flags);
2240 skb->next = __this_cpu_read(softnet_data.completion_queue);
2241 __this_cpu_write(softnet_data.completion_queue, skb);
2242 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2243 local_irq_restore(flags);
2244 }
2245 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2246
2247 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2248 {
2249 if (in_irq() || irqs_disabled())
2250 __dev_kfree_skb_irq(skb, reason);
2251 else
2252 dev_kfree_skb(skb);
2253 }
2254 EXPORT_SYMBOL(__dev_kfree_skb_any);
2255
2256
2257 /**
2258 * netif_device_detach - mark device as removed
2259 * @dev: network device
2260 *
2261 * Mark device as removed from system and therefore no longer available.
2262 */
2263 void netif_device_detach(struct net_device *dev)
2264 {
2265 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2266 netif_running(dev)) {
2267 netif_tx_stop_all_queues(dev);
2268 }
2269 }
2270 EXPORT_SYMBOL(netif_device_detach);
2271
2272 /**
2273 * netif_device_attach - mark device as attached
2274 * @dev: network device
2275 *
2276  * Mark device as attached to the system and restart it if needed.
2277 */
2278 void netif_device_attach(struct net_device *dev)
2279 {
2280 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2281 netif_running(dev)) {
2282 netif_tx_wake_all_queues(dev);
2283 __netdev_watchdog_up(dev);
2284 }
2285 }
2286 EXPORT_SYMBOL(netif_device_attach);
2287
2288 static void skb_warn_bad_offload(const struct sk_buff *skb)
2289 {
2290 static const netdev_features_t null_features = 0;
2291 struct net_device *dev = skb->dev;
2292 const char *driver = "";
2293
2294 if (!net_ratelimit())
2295 return;
2296
2297 if (dev && dev->dev.parent)
2298 driver = dev_driver_string(dev->dev.parent);
2299
2300 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2301 "gso_type=%d ip_summed=%d\n",
2302 driver, dev ? &dev->features : &null_features,
2303 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2304 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2305 skb_shinfo(skb)->gso_type, skb->ip_summed);
2306 }
2307
2308 /*
2309 * Invalidate hardware checksum when packet is to be mangled, and
2310 * complete checksum manually on outgoing path.
2311 */
2312 int skb_checksum_help(struct sk_buff *skb)
2313 {
2314 __wsum csum;
2315 int ret = 0, offset;
2316
2317 if (skb->ip_summed == CHECKSUM_COMPLETE)
2318 goto out_set_summed;
2319
2320 if (unlikely(skb_shinfo(skb)->gso_size)) {
2321 skb_warn_bad_offload(skb);
2322 return -EINVAL;
2323 }
2324
2325 /* Before computing a checksum, we should make sure no frag could
2326  * be modified by an external entity: the checksum could be wrong.
2327 */
2328 if (skb_has_shared_frag(skb)) {
2329 ret = __skb_linearize(skb);
2330 if (ret)
2331 goto out;
2332 }
2333
2334 offset = skb_checksum_start_offset(skb);
2335 BUG_ON(offset >= skb_headlen(skb));
2336 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2337
2338 offset += skb->csum_offset;
2339 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2340
2341 if (skb_cloned(skb) &&
2342 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2343 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2344 if (ret)
2345 goto out;
2346 }
2347
2348 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2349 out_set_summed:
2350 skb->ip_summed = CHECKSUM_NONE;
2351 out:
2352 return ret;
2353 }
2354 EXPORT_SYMBOL(skb_checksum_help);
2355
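/* Return the Ethertype of @skb's network-layer payload, looking through
 * an inner Ethernet header for ETH_P_TEB tunnels and through any stack
 * of 802.1Q/802.1ad VLAN tags.  *depth is set to the offset (from
 * skb->data) at which the network header starts.  Returns 0 if the
 * headers cannot be pulled.
 */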
2356 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2357 {
2358 unsigned int vlan_depth = skb->mac_len;
2359 __be16 type = skb->protocol;
2360
2361 /* Tunnel gso handlers can set protocol to ethernet. */
2362 if (type == htons(ETH_P_TEB)) {
2363 struct ethhdr *eth;
2364
2365 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2366 return 0;
2367
2368 eth = (struct ethhdr *)skb_mac_header(skb);
2369 type = eth->h_proto;
2370 }
2371
2372 /* if skb->protocol is 802.1Q/AD then the header should already be
2373 * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2374 * ETH_HLEN otherwise
2375 */
2376 if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2377 if (vlan_depth) {
2378 if (WARN_ON(vlan_depth < VLAN_HLEN))
2379 return 0;
2380 vlan_depth -= VLAN_HLEN;
2381 } else {
2382 vlan_depth = ETH_HLEN;
2383 }
2384 do {
2385 struct vlan_hdr *vh;
2386
2387 if (unlikely(!pskb_may_pull(skb,
2388 vlan_depth + VLAN_HLEN)))
2389 return 0;
2390
2391 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2392 type = vh->h_vlan_encapsulated_proto;
2393 vlan_depth += VLAN_HLEN;
2394 } while (type == htons(ETH_P_8021Q) ||
2395 type == htons(ETH_P_8021AD));
2396 }
2397
2398 *depth = vlan_depth;
2399
2400 return type;
2401 }
2402
2403 /**
2404 * skb_mac_gso_segment - mac layer segmentation handler.
2405 * @skb: buffer to segment
2406 * @features: features for the output path (see dev->features)
2407 */
2408 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2409 netdev_features_t features)
2410 {
2411 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2412 struct packet_offload *ptype;
2413 int vlan_depth = skb->mac_len;
2414 __be16 type = skb_network_protocol(skb, &vlan_depth);
2415
2416 if (unlikely(!type))
2417 return ERR_PTR(-EINVAL);
2418
2419 __skb_pull(skb, vlan_depth);
2420
2421 rcu_read_lock();
2422 list_for_each_entry_rcu(ptype, &offload_base, list) {
2423 if (ptype->type == type && ptype->callbacks.gso_segment) {
2424 segs = ptype->callbacks.gso_segment(skb, features);
2425 break;
2426 }
2427 }
2428 rcu_read_unlock();
2429
2430 __skb_push(skb, skb->data - skb_mac_header(skb));
2431
2432 return segs;
2433 }
2434 EXPORT_SYMBOL(skb_mac_gso_segment);
2435
2436
2437 /* openvswitch calls this on rx path, so we need a different check.
2438 */
2439 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2440 {
2441 if (tx_path)
2442 return skb->ip_summed != CHECKSUM_PARTIAL;
2443 else
2444 return skb->ip_summed == CHECKSUM_NONE;
2445 }
2446
2447 /**
2448 * __skb_gso_segment - Perform segmentation on skb.
2449 * @skb: buffer to segment
2450 * @features: features for the output path (see dev->features)
2451 * @tx_path: whether it is called in TX path
2452 *
2453 * This function segments the given skb and returns a list of segments.
2454 *
2455 * It may return NULL if the skb requires no segmentation. This is
2456 * only possible when GSO is used for verifying header integrity.
2457 */
2458 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2459 netdev_features_t features, bool tx_path)
2460 {
2461 if (unlikely(skb_needs_check(skb, tx_path))) {
2462 int err;
2463
2464 skb_warn_bad_offload(skb);
2465
2466 err = skb_cow_head(skb, 0);
2467 if (err < 0)
2468 return ERR_PTR(err);
2469 }
2470
2471 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2472 SKB_GSO_CB(skb)->encap_level = 0;
2473
2474 skb_reset_mac_header(skb);
2475 skb_reset_mac_len(skb);
2476
2477 return skb_mac_gso_segment(skb, features);
2478 }
2479 EXPORT_SYMBOL(__skb_gso_segment);
2480
2481 /* Take action when hardware reception checksum errors are detected. */
2482 #ifdef CONFIG_BUG
2483 void netdev_rx_csum_fault(struct net_device *dev)
2484 {
2485 if (net_ratelimit()) {
2486 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2487 dump_stack();
2488 }
2489 }
2490 EXPORT_SYMBOL(netdev_rx_csum_fault);
2491 #endif
2492
2493 /* Actually, we should eliminate this check as soon as we know that:
2494  * 1. An IOMMU is present and can map all of the memory.
2495 * 2. No high memory really exists on this machine.
2496 */
2497
2498 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2499 {
2500 #ifdef CONFIG_HIGHMEM
2501 int i;
2502 if (!(dev->features & NETIF_F_HIGHDMA)) {
2503 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2504 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2505 if (PageHighMem(skb_frag_page(frag)))
2506 return 1;
2507 }
2508 }
2509
2510 if (PCI_DMA_BUS_IS_PHYS) {
2511 struct device *pdev = dev->dev.parent;
2512
2513 if (!pdev)
2514 return 0;
2515 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2516 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2517 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2518 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2519 return 1;
2520 }
2521 }
2522 #endif
2523 return 0;
2524 }
2525
2526 /* For MPLS offload requests, verify we are testing hardware MPLS features
2527 * instead of standard features for the netdev.
2528 */
2529 #ifdef CONFIG_NET_MPLS_GSO
2530 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2531 netdev_features_t features,
2532 __be16 type)
2533 {
2534 if (eth_p_mpls(type))
2535 features &= skb->dev->mpls_features;
2536
2537 return features;
2538 }
2539 #else
2540 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2541 netdev_features_t features,
2542 __be16 type)
2543 {
2544 return features;
2545 }
2546 #endif
2547
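/* Mask out offload features the device cannot apply to this particular
 * skb: restrict to the MPLS feature set for MPLS traffic, drop checksum
 * offload when the protocol's checksum cannot be computed in hardware,
 * and drop scatter/gather when a fragment sits in memory the device
 * cannot DMA to.
 */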
2548 static netdev_features_t harmonize_features(struct sk_buff *skb,
2549 netdev_features_t features)
2550 {
2551 int tmp;
2552 __be16 type;
2553
2554 type = skb_network_protocol(skb, &tmp);
2555 features = net_mpls_features(skb, features, type);
2556
2557 if (skb->ip_summed != CHECKSUM_NONE &&
2558 !can_checksum_protocol(features, type)) {
2559 features &= ~NETIF_F_ALL_CSUM;
2560 } else if (illegal_highdma(skb->dev, skb)) {
2561 features &= ~NETIF_F_SG;
2562 }
2563
2564 return features;
2565 }
2566
2567 netdev_features_t netif_skb_features(struct sk_buff *skb)
2568 {
2569 const struct net_device *dev = skb->dev;
2570 netdev_features_t features = dev->features;
2571 u16 gso_segs = skb_shinfo(skb)->gso_segs;
2572 __be16 protocol = skb->protocol;
2573
2574 if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2575 features &= ~NETIF_F_GSO_MASK;
2576
2577 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2578 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2579 protocol = veh->h_vlan_encapsulated_proto;
2580 } else if (!vlan_tx_tag_present(skb)) {
2581 return harmonize_features(skb, features);
2582 }
2583
2584 features = netdev_intersect_features(features,
2585 dev->vlan_features |
2586 NETIF_F_HW_VLAN_CTAG_TX |
2587 NETIF_F_HW_VLAN_STAG_TX);
2588
2589 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2590 features = netdev_intersect_features(features,
2591 NETIF_F_SG |
2592 NETIF_F_HIGHDMA |
2593 NETIF_F_FRAGLIST |
2594 NETIF_F_GEN_CSUM |
2595 NETIF_F_HW_VLAN_CTAG_TX |
2596 NETIF_F_HW_VLAN_STAG_TX);
2597
2598 return harmonize_features(skb, features);
2599 }
2600 EXPORT_SYMBOL(netif_skb_features);
2601
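/* Hand a single skb to the driver: deliver a copy to any taps on
 * ptype_all first, then trace the transmission and pass the skb to the
 * device through netdev_start_xmit().
 */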
2602 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2603 struct netdev_queue *txq, bool more)
2604 {
2605 unsigned int len;
2606 int rc;
2607
2608 if (!list_empty(&ptype_all))
2609 dev_queue_xmit_nit(skb, dev);
2610
2611 len = skb->len;
2612 trace_net_dev_start_xmit(skb, dev);
2613 rc = netdev_start_xmit(skb, dev, txq, more);
2614 trace_net_dev_xmit(skb, rc, dev, len);
2615
2616 return rc;
2617 }
2618
2619 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2620 struct netdev_queue *txq, int *ret)
2621 {
2622 struct sk_buff *skb = first;
2623 int rc = NETDEV_TX_OK;
2624
2625 while (skb) {
2626 struct sk_buff *next = skb->next;
2627
2628 skb->next = NULL;
2629 rc = xmit_one(skb, dev, txq, next != NULL);
2630 if (unlikely(!dev_xmit_complete(rc))) {
2631 skb->next = next;
2632 goto out;
2633 }
2634
2635 skb = next;
2636 if (netif_xmit_stopped(txq) && skb) {
2637 rc = NETDEV_TX_BUSY;
2638 break;
2639 }
2640 }
2641
2642 out:
2643 *ret = rc;
2644 return skb;
2645 }
2646
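/* If the skb carries a VLAN tag that the device cannot insert in
 * hardware for this protocol, push the tag into the packet data here
 * and clear vlan_tci.
 */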
2647 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2648 netdev_features_t features)
2649 {
2650 if (vlan_tx_tag_present(skb) &&
2651 !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2652 skb = __vlan_put_tag(skb, skb->vlan_proto,
2653 vlan_tx_tag_get(skb));
2654 if (skb)
2655 skb->vlan_tci = 0;
2656 }
2657 return skb;
2658 }
2659
2660 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2661 {
2662 netdev_features_t features;
2663
2664 if (skb->next)
2665 return skb;
2666
2667 features = netif_skb_features(skb);
2668 skb = validate_xmit_vlan(skb, features);
2669 if (unlikely(!skb))
2670 goto out_null;
2671
2672 /* If this is an encapsulation offload request, verify we are testing
2673 * hardware encapsulation features instead of standard
2674 * features for the netdev
2675 */
2676 if (skb->encapsulation)
2677 features &= dev->hw_enc_features;
2678
2679 if (netif_needs_gso(dev, skb, features)) {
2680 struct sk_buff *segs;
2681
2682 segs = skb_gso_segment(skb, features);
2683 if (IS_ERR(segs)) {
2684 segs = NULL;
2685 } else if (segs) {
2686 consume_skb(skb);
2687 skb = segs;
2688 }
2689 } else {
2690 if (skb_needs_linearize(skb, features) &&
2691 __skb_linearize(skb))
2692 goto out_kfree_skb;
2693
2694 /* If packet is not checksummed and device does not
2695 * support checksumming for this protocol, complete
2696 * checksumming here.
2697 */
2698 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2699 if (skb->encapsulation)
2700 skb_set_inner_transport_header(skb,
2701 skb_checksum_start_offset(skb));
2702 else
2703 skb_set_transport_header(skb,
2704 skb_checksum_start_offset(skb));
2705 if (!(features & NETIF_F_ALL_CSUM) &&
2706 skb_checksum_help(skb))
2707 goto out_kfree_skb;
2708 }
2709 }
2710
2711 return skb;
2712
2713 out_kfree_skb:
2714 kfree_skb(skb);
2715 out_null:
2716 return NULL;
2717 }
2718
2719 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2720 {
2721 struct sk_buff *next, *head = NULL, *tail;
2722
2723 for (; skb != NULL; skb = next) {
2724 next = skb->next;
2725 skb->next = NULL;
2726
2727 /* in case skb won't be segmented, point to itself */
2728 skb->prev = skb;
2729
2730 skb = validate_xmit_skb(skb, dev);
2731 if (!skb)
2732 continue;
2733
2734 if (!head)
2735 head = skb;
2736 else
2737 tail->next = skb;
2738 /* If skb was segmented, skb->prev points to
2739 * the last segment. If not, it still contains skb.
2740 */
2741 tail = skb->prev;
2742 }
2743 return head;
2744 }
2745
2746 static void qdisc_pkt_len_init(struct sk_buff *skb)
2747 {
2748 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2749
2750 qdisc_skb_cb(skb)->pkt_len = skb->len;
2751
2752 /* To get a more precise estimate of bytes sent on the wire,
2753  * we add the header size of all segments to pkt_len
2754 */
2755 if (shinfo->gso_size) {
2756 unsigned int hdr_len;
2757 u16 gso_segs = shinfo->gso_segs;
2758
2759 /* mac layer + network layer */
2760 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2761
2762 /* + transport layer */
2763 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2764 hdr_len += tcp_hdrlen(skb);
2765 else
2766 hdr_len += sizeof(struct udphdr);
2767
2768 if (shinfo->gso_type & SKB_GSO_DODGY)
2769 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2770 shinfo->gso_size);
2771
2772 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2773 }
2774 }
2775
2776 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2777 struct net_device *dev,
2778 struct netdev_queue *txq)
2779 {
2780 spinlock_t *root_lock = qdisc_lock(q);
2781 bool contended;
2782 int rc;
2783
2784 qdisc_pkt_len_init(skb);
2785 qdisc_calculate_pkt_len(skb, q);
2786 /*
2787 * Heuristic to force contended enqueues to serialize on a
2788  * separate lock before trying to get the qdisc main lock.
2789  * This permits the __QDISC___STATE_RUNNING owner to get the lock more
2790 * often and dequeue packets faster.
2791 */
2792 contended = qdisc_is_running(q);
2793 if (unlikely(contended))
2794 spin_lock(&q->busylock);
2795
2796 spin_lock(root_lock);
2797 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2798 kfree_skb(skb);
2799 rc = NET_XMIT_DROP;
2800 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2801 qdisc_run_begin(q)) {
2802 /*
2803 * This is a work-conserving queue; there are no old skbs
2804 * waiting to be sent out; and the qdisc is not running -
2805 * xmit the skb directly.
2806 */
2807
2808 qdisc_bstats_update(q, skb);
2809
2810 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2811 if (unlikely(contended)) {
2812 spin_unlock(&q->busylock);
2813 contended = false;
2814 }
2815 __qdisc_run(q);
2816 } else
2817 qdisc_run_end(q);
2818
2819 rc = NET_XMIT_SUCCESS;
2820 } else {
2821 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2822 if (qdisc_run_begin(q)) {
2823 if (unlikely(contended)) {
2824 spin_unlock(&q->busylock);
2825 contended = false;
2826 }
2827 __qdisc_run(q);
2828 }
2829 }
2830 spin_unlock(root_lock);
2831 if (unlikely(contended))
2832 spin_unlock(&q->busylock);
2833 return rc;
2834 }
2835
2836 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2837 static void skb_update_prio(struct sk_buff *skb)
2838 {
2839 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2840
2841 if (!skb->priority && skb->sk && map) {
2842 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2843
2844 if (prioidx < map->priomap_len)
2845 skb->priority = map->priomap[prioidx];
2846 }
2847 }
2848 #else
2849 #define skb_update_prio(skb)
2850 #endif
2851
2852 static DEFINE_PER_CPU(int, xmit_recursion);
2853 #define RECURSION_LIMIT 10
2854
2855 /**
2856 * dev_loopback_xmit - loop back @skb
2857 * @skb: buffer to transmit
2858 */
2859 int dev_loopback_xmit(struct sk_buff *skb)
2860 {
2861 skb_reset_mac_header(skb);
2862 __skb_pull(skb, skb_network_offset(skb));
2863 skb->pkt_type = PACKET_LOOPBACK;
2864 skb->ip_summed = CHECKSUM_UNNECESSARY;
2865 WARN_ON(!skb_dst(skb));
2866 skb_dst_force(skb);
2867 netif_rx_ni(skb);
2868 return 0;
2869 }
2870 EXPORT_SYMBOL(dev_loopback_xmit);
2871
2872 /**
2873 * __dev_queue_xmit - transmit a buffer
2874 * @skb: buffer to transmit
2875 * @accel_priv: private data used for L2 forwarding offload
2876 *
2877 * Queue a buffer for transmission to a network device. The caller must
2878 * have set the device and priority and built the buffer before calling
2879 * this function. The function can be called from an interrupt.
2880 *
2881 * A negative errno code is returned on a failure. A success does not
2882 * guarantee the frame will be transmitted as it may be dropped due
2883 * to congestion or traffic shaping.
2884 *
2885 * -----------------------------------------------------------------------------------
2886 * I notice this method can also return errors from the queue disciplines,
2887 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2888 * be positive.
2889 *
2890 * Regardless of the return value, the skb is consumed, so it is currently
2891 * difficult to retry a send to this method. (You can bump the ref count
2892 * before sending to hold a reference for retry if you are careful.)
2893 *
2894 * When calling this method, interrupts MUST be enabled. This is because
2895 * the BH enable code must have IRQs enabled so that it will not deadlock.
2896 * --BLG
2897 */
2898 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2899 {
2900 struct net_device *dev = skb->dev;
2901 struct netdev_queue *txq;
2902 struct Qdisc *q;
2903 int rc = -ENOMEM;
2904
2905 skb_reset_mac_header(skb);
2906
2907 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2908 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2909
2910 /* Disable soft irqs for various locks below. Also
2911 * stops preemption for RCU.
2912 */
2913 rcu_read_lock_bh();
2914
2915 skb_update_prio(skb);
2916
2917 /* If device/qdisc don't need skb->dst, release it right now while
2918  * it's hot in this CPU's cache.
2919 */
2920 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2921 skb_dst_drop(skb);
2922 else
2923 skb_dst_force(skb);
2924
2925 txq = netdev_pick_tx(dev, skb, accel_priv);
2926 q = rcu_dereference_bh(txq->qdisc);
2927
2928 #ifdef CONFIG_NET_CLS_ACT
2929 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2930 #endif
2931 trace_net_dev_queue(skb);
2932 if (q->enqueue) {
2933 rc = __dev_xmit_skb(skb, q, dev, txq);
2934 goto out;
2935 }
2936
2937 /* The device has no queue. Common case for software devices:
2938    loopback, all sorts of tunnels...
2939
2940    Really, it is unlikely that netif_tx_lock protection is necessary
2941    here. (e.g. loopback and IP tunnels are clean, ignoring statistics
2942    counters.)
2943    However, it is possible that they rely on the protection
2944    we provide here.
2945
2946    Check this and shoot the lock. It is not prone to deadlocks.
2947    Or shoot the noqueue qdisc instead, it is even simpler 8)
2948 */
2949 if (dev->flags & IFF_UP) {
2950 int cpu = smp_processor_id(); /* ok because BHs are off */
2951
2952 if (txq->xmit_lock_owner != cpu) {
2953
2954 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2955 goto recursion_alert;
2956
2957 skb = validate_xmit_skb(skb, dev);
2958 if (!skb)
2959 goto drop;
2960
2961 HARD_TX_LOCK(dev, txq, cpu);
2962
2963 if (!netif_xmit_stopped(txq)) {
2964 __this_cpu_inc(xmit_recursion);
2965 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
2966 __this_cpu_dec(xmit_recursion);
2967 if (dev_xmit_complete(rc)) {
2968 HARD_TX_UNLOCK(dev, txq);
2969 goto out;
2970 }
2971 }
2972 HARD_TX_UNLOCK(dev, txq);
2973 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2974 dev->name);
2975 } else {
2976 /* Recursion is detected! It is possible,
2977 * unfortunately
2978 */
2979 recursion_alert:
2980 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2981 dev->name);
2982 }
2983 }
2984
2985 rc = -ENETDOWN;
2986 drop:
2987 rcu_read_unlock_bh();
2988
2989 atomic_long_inc(&dev->tx_dropped);
2990 kfree_skb_list(skb);
2991 return rc;
2992 out:
2993 rcu_read_unlock_bh();
2994 return rc;
2995 }
2996
2997 int dev_queue_xmit(struct sk_buff *skb)
2998 {
2999 return __dev_queue_xmit(skb, NULL);
3000 }
3001 EXPORT_SYMBOL(dev_queue_xmit);
3002
3003 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3004 {
3005 return __dev_queue_xmit(skb, accel_priv);
3006 }
3007 EXPORT_SYMBOL(dev_queue_xmit_accel);
3008
3009
3010 /*=======================================================================
3011 Receiver routines
3012 =======================================================================*/
3013
3014 int netdev_max_backlog __read_mostly = 1000;
3015 EXPORT_SYMBOL(netdev_max_backlog);
3016
3017 int netdev_tstamp_prequeue __read_mostly = 1;
3018 int netdev_budget __read_mostly = 300;
3019 int weight_p __read_mostly = 64; /* old backlog weight */
3020
3021 /* Called with irq disabled */
3022 static inline void ____napi_schedule(struct softnet_data *sd,
3023 struct napi_struct *napi)
3024 {
3025 list_add_tail(&napi->poll_list, &sd->poll_list);
3026 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3027 }
3028
3029 #ifdef CONFIG_RPS
3030
3031 /* One global table that all flow-based protocols share. */
3032 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3033 EXPORT_SYMBOL(rps_sock_flow_table);
3034
3035 struct static_key rps_needed __read_mostly;
3036
3037 static struct rps_dev_flow *
3038 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3039 struct rps_dev_flow *rflow, u16 next_cpu)
3040 {
3041 if (next_cpu != RPS_NO_CPU) {
3042 #ifdef CONFIG_RFS_ACCEL
3043 struct netdev_rx_queue *rxqueue;
3044 struct rps_dev_flow_table *flow_table;
3045 struct rps_dev_flow *old_rflow;
3046 u32 flow_id;
3047 u16 rxq_index;
3048 int rc;
3049
3050 /* Should we steer this flow to a different hardware queue? */
3051 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3052 !(dev->features & NETIF_F_NTUPLE))
3053 goto out;
3054 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3055 if (rxq_index == skb_get_rx_queue(skb))
3056 goto out;
3057
3058 rxqueue = dev->_rx + rxq_index;
3059 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3060 if (!flow_table)
3061 goto out;
3062 flow_id = skb_get_hash(skb) & flow_table->mask;
3063 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3064 rxq_index, flow_id);
3065 if (rc < 0)
3066 goto out;
3067 old_rflow = rflow;
3068 rflow = &flow_table->flows[flow_id];
3069 rflow->filter = rc;
3070 if (old_rflow->filter == rflow->filter)
3071 old_rflow->filter = RPS_NO_FILTER;
3072 out:
3073 #endif
3074 rflow->last_qtail =
3075 per_cpu(softnet_data, next_cpu).input_queue_head;
3076 }
3077
3078 rflow->cpu = next_cpu;
3079 return rflow;
3080 }
3081
3082 /*
3083 * get_rps_cpu is called from netif_receive_skb and returns the target
3084 * CPU from the RPS map of the receiving queue for a given skb.
3085 * rcu_read_lock must be held on entry.
3086 */
3087 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3088 struct rps_dev_flow **rflowp)
3089 {
3090 struct netdev_rx_queue *rxqueue;
3091 struct rps_map *map;
3092 struct rps_dev_flow_table *flow_table;
3093 struct rps_sock_flow_table *sock_flow_table;
3094 int cpu = -1;
3095 u16 tcpu;
3096 u32 hash;
3097
3098 if (skb_rx_queue_recorded(skb)) {
3099 u16 index = skb_get_rx_queue(skb);
3100 if (unlikely(index >= dev->real_num_rx_queues)) {
3101 WARN_ONCE(dev->real_num_rx_queues > 1,
3102 "%s received packet on queue %u, but number "
3103 "of RX queues is %u\n",
3104 dev->name, index, dev->real_num_rx_queues);
3105 goto done;
3106 }
3107 rxqueue = dev->_rx + index;
3108 } else
3109 rxqueue = dev->_rx;
3110
3111 map = rcu_dereference(rxqueue->rps_map);
3112 if (map) {
3113 if (map->len == 1 &&
3114 !rcu_access_pointer(rxqueue->rps_flow_table)) {
3115 tcpu = map->cpus[0];
3116 if (cpu_online(tcpu))
3117 cpu = tcpu;
3118 goto done;
3119 }
3120 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3121 goto done;
3122 }
3123
3124 skb_reset_network_header(skb);
3125 hash = skb_get_hash(skb);
3126 if (!hash)
3127 goto done;
3128
3129 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3130 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3131 if (flow_table && sock_flow_table) {
3132 u16 next_cpu;
3133 struct rps_dev_flow *rflow;
3134
3135 rflow = &flow_table->flows[hash & flow_table->mask];
3136 tcpu = rflow->cpu;
3137
3138 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3139
3140 /*
3141 * If the desired CPU (where last recvmsg was done) is
3142 * different from current CPU (one in the rx-queue flow
3143 * table entry), switch if one of the following holds:
3144 * - Current CPU is unset (equal to RPS_NO_CPU).
3145 * - Current CPU is offline.
3146 * - The current CPU's queue tail has advanced beyond the
3147 * last packet that was enqueued using this table entry.
3148 * This guarantees that all previous packets for the flow
3149 * have been dequeued, thus preserving in order delivery.
3150 */
3151 if (unlikely(tcpu != next_cpu) &&
3152 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3153 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3154 rflow->last_qtail)) >= 0)) {
3155 tcpu = next_cpu;
3156 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3157 }
3158
3159 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3160 *rflowp = rflow;
3161 cpu = tcpu;
3162 goto done;
3163 }
3164 }
3165
3166 if (map) {
3167 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3168 if (cpu_online(tcpu)) {
3169 cpu = tcpu;
3170 goto done;
3171 }
3172 }
3173
3174 done:
3175 return cpu;
3176 }
3177
3178 #ifdef CONFIG_RFS_ACCEL
3179
3180 /**
3181 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3182 * @dev: Device on which the filter was set
3183 * @rxq_index: RX queue index
3184 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3185 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3186 *
3187 * Drivers that implement ndo_rx_flow_steer() should periodically call
3188 * this function for each installed filter and remove the filters for
3189 * which it returns %true.
3190 */
3191 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3192 u32 flow_id, u16 filter_id)
3193 {
3194 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3195 struct rps_dev_flow_table *flow_table;
3196 struct rps_dev_flow *rflow;
3197 bool expire = true;
3198 int cpu;
3199
3200 rcu_read_lock();
3201 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3202 if (flow_table && flow_id <= flow_table->mask) {
3203 rflow = &flow_table->flows[flow_id];
3204 cpu = ACCESS_ONCE(rflow->cpu);
3205 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3206 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3207 rflow->last_qtail) <
3208 (int)(10 * flow_table->mask)))
3209 expire = false;
3210 }
3211 rcu_read_unlock();
3212 return expire;
3213 }
3214 EXPORT_SYMBOL(rps_may_expire_flow);
3215
3216 #endif /* CONFIG_RFS_ACCEL */
3217
3218 /* Called from hardirq (IPI) context */
3219 static void rps_trigger_softirq(void *data)
3220 {
3221 struct softnet_data *sd = data;
3222
3223 ____napi_schedule(sd, &sd->backlog);
3224 sd->received_rps++;
3225 }
3226
3227 #endif /* CONFIG_RPS */
3228
3229 /*
3230  * Check if this softnet_data structure belongs to another CPU.
3231  * If yes, queue it to our IPI list and return 1;
3232  * if no, return 0.
3233 */
3234 static int rps_ipi_queued(struct softnet_data *sd)
3235 {
3236 #ifdef CONFIG_RPS
3237 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3238
3239 if (sd != mysd) {
3240 sd->rps_ipi_next = mysd->rps_ipi_list;
3241 mysd->rps_ipi_list = sd;
3242
3243 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3244 return 1;
3245 }
3246 #endif /* CONFIG_RPS */
3247 return 0;
3248 }
3249
3250 #ifdef CONFIG_NET_FLOW_LIMIT
3251 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3252 #endif
3253
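/* Flow-limit heuristic (CONFIG_NET_FLOW_LIMIT): once the backlog is more
 * than half full, sample the flow hash of arriving packets and return
 * true - i.e. drop - when a single flow accounts for more than half of
 * the recent history, so one heavy flow cannot monopolise the backlog.
 */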
3254 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3255 {
3256 #ifdef CONFIG_NET_FLOW_LIMIT
3257 struct sd_flow_limit *fl;
3258 struct softnet_data *sd;
3259 unsigned int old_flow, new_flow;
3260
3261 if (qlen < (netdev_max_backlog >> 1))
3262 return false;
3263
3264 sd = this_cpu_ptr(&softnet_data);
3265
3266 rcu_read_lock();
3267 fl = rcu_dereference(sd->flow_limit);
3268 if (fl) {
3269 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3270 old_flow = fl->history[fl->history_head];
3271 fl->history[fl->history_head] = new_flow;
3272
3273 fl->history_head++;
3274 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3275
3276 if (likely(fl->buckets[old_flow]))
3277 fl->buckets[old_flow]--;
3278
3279 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3280 fl->count++;
3281 rcu_read_unlock();
3282 return true;
3283 }
3284 }
3285 rcu_read_unlock();
3286 #endif
3287 return false;
3288 }
3289
3290 /*
3291 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3292 * queue (may be a remote CPU queue).
3293 */
3294 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3295 unsigned int *qtail)
3296 {
3297 struct softnet_data *sd;
3298 unsigned long flags;
3299 unsigned int qlen;
3300
3301 sd = &per_cpu(softnet_data, cpu);
3302
3303 local_irq_save(flags);
3304
3305 rps_lock(sd);
3306 qlen = skb_queue_len(&sd->input_pkt_queue);
3307 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3308 if (skb_queue_len(&sd->input_pkt_queue)) {
3309 enqueue:
3310 __skb_queue_tail(&sd->input_pkt_queue, skb);
3311 input_queue_tail_incr_save(sd, qtail);
3312 rps_unlock(sd);
3313 local_irq_restore(flags);
3314 return NET_RX_SUCCESS;
3315 }
3316
3317 /* Schedule NAPI for backlog device
3318  * We can use a non-atomic operation since we own the queue lock
3319 */
3320 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3321 if (!rps_ipi_queued(sd))
3322 ____napi_schedule(sd, &sd->backlog);
3323 }
3324 goto enqueue;
3325 }
3326
3327 sd->dropped++;
3328 rps_unlock(sd);
3329
3330 local_irq_restore(flags);
3331
3332 atomic_long_inc(&skb->dev->rx_dropped);
3333 kfree_skb(skb);
3334 return NET_RX_DROP;
3335 }
3336
3337 static int netif_rx_internal(struct sk_buff *skb)
3338 {
3339 int ret;
3340
3341 net_timestamp_check(netdev_tstamp_prequeue, skb);
3342
3343 trace_netif_rx(skb);
3344 #ifdef CONFIG_RPS
3345 if (static_key_false(&rps_needed)) {
3346 struct rps_dev_flow voidflow, *rflow = &voidflow;
3347 int cpu;
3348
3349 preempt_disable();
3350 rcu_read_lock();
3351
3352 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3353 if (cpu < 0)
3354 cpu = smp_processor_id();
3355
3356 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3357
3358 rcu_read_unlock();
3359 preempt_enable();
3360 } else
3361 #endif
3362 {
3363 unsigned int qtail;
3364 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3365 put_cpu();
3366 }
3367 return ret;
3368 }
3369
3370 /**
3371 * netif_rx - post buffer to the network code
3372 * @skb: buffer to post
3373 *
3374 * This function receives a packet from a device driver and queues it for
3375 * the upper (protocol) levels to process. It always succeeds. The buffer
3376 * may be dropped during processing for congestion control or by the
3377 * protocol layers.
3378 *
3379 * return values:
3380 * NET_RX_SUCCESS (no congestion)
3381 * NET_RX_DROP (packet was dropped)
3382 *
3383 */
3384
3385 int netif_rx(struct sk_buff *skb)
3386 {
3387 trace_netif_rx_entry(skb);
3388
3389 return netif_rx_internal(skb);
3390 }
3391 EXPORT_SYMBOL(netif_rx);
3392
3393 int netif_rx_ni(struct sk_buff *skb)
3394 {
3395 int err;
3396
3397 trace_netif_rx_ni_entry(skb);
3398
3399 preempt_disable();
3400 err = netif_rx_internal(skb);
3401 if (local_softirq_pending())
3402 do_softirq();
3403 preempt_enable();
3404
3405 return err;
3406 }
3407 EXPORT_SYMBOL(netif_rx_ni);
3408
3409 static void net_tx_action(struct softirq_action *h)
3410 {
3411 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3412
3413 if (sd->completion_queue) {
3414 struct sk_buff *clist;
3415
3416 local_irq_disable();
3417 clist = sd->completion_queue;
3418 sd->completion_queue = NULL;
3419 local_irq_enable();
3420
3421 while (clist) {
3422 struct sk_buff *skb = clist;
3423 clist = clist->next;
3424
3425 WARN_ON(atomic_read(&skb->users));
3426 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3427 trace_consume_skb(skb);
3428 else
3429 trace_kfree_skb(skb, net_tx_action);
3430 __kfree_skb(skb);
3431 }
3432 }
3433
3434 if (sd->output_queue) {
3435 struct Qdisc *head;
3436
3437 local_irq_disable();
3438 head = sd->output_queue;
3439 sd->output_queue = NULL;
3440 sd->output_queue_tailp = &sd->output_queue;
3441 local_irq_enable();
3442
3443 while (head) {
3444 struct Qdisc *q = head;
3445 spinlock_t *root_lock;
3446
3447 head = head->next_sched;
3448
3449 root_lock = qdisc_lock(q);
3450 if (spin_trylock(root_lock)) {
3451 smp_mb__before_atomic();
3452 clear_bit(__QDISC_STATE_SCHED,
3453 &q->state);
3454 qdisc_run(q);
3455 spin_unlock(root_lock);
3456 } else {
3457 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3458 &q->state)) {
3459 __netif_reschedule(q);
3460 } else {
3461 smp_mb__before_atomic();
3462 clear_bit(__QDISC_STATE_SCHED,
3463 &q->state);
3464 }
3465 }
3466 }
3467 }
3468 }
3469
3470 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3471 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3472 /* This hook is defined here for ATM LANE */
3473 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3474 unsigned char *addr) __read_mostly;
3475 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3476 #endif
3477
3478 #ifdef CONFIG_NET_CLS_ACT
3479 /* TODO: Maybe we should just force sch_ingress to be compiled in
3480  * when CONFIG_NET_CLS_ACT is? Otherwise we currently pay for a few
3481  * useless instructions (a compare and two extra stores) when it is
3482  * not enabled but CONFIG_NET_CLS_ACT is.
3483  * NOTE: This doesn't stop any functionality; if you don't have
3484  * the ingress scheduler, you just can't add policies on ingress.
3485 *
3486 */
3487 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3488 {
3489 struct net_device *dev = skb->dev;
3490 u32 ttl = G_TC_RTTL(skb->tc_verd);
3491 int result = TC_ACT_OK;
3492 struct Qdisc *q;
3493
3494 if (unlikely(MAX_RED_LOOP < ttl++)) {
3495 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3496 skb->skb_iif, dev->ifindex);
3497 return TC_ACT_SHOT;
3498 }
3499
3500 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3501 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3502
3503 q = rcu_dereference(rxq->qdisc);
3504 if (q != &noop_qdisc) {
3505 spin_lock(qdisc_lock(q));
3506 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3507 result = qdisc_enqueue_root(skb, q);
3508 spin_unlock(qdisc_lock(q));
3509 }
3510
3511 return result;
3512 }
3513
3514 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3515 struct packet_type **pt_prev,
3516 int *ret, struct net_device *orig_dev)
3517 {
3518 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3519
3520 if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3521 goto out;
3522
3523 if (*pt_prev) {
3524 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3525 *pt_prev = NULL;
3526 }
3527
3528 switch (ing_filter(skb, rxq)) {
3529 case TC_ACT_SHOT:
3530 case TC_ACT_STOLEN:
3531 kfree_skb(skb);
3532 return NULL;
3533 }
3534
3535 out:
3536 skb->tc_verd = 0;
3537 return skb;
3538 }
3539 #endif
3540
3541 /**
3542 * netdev_rx_handler_register - register receive handler
3543 * @dev: device to register a handler for
3544 * @rx_handler: receive handler to register
3545 * @rx_handler_data: data pointer that is used by rx handler
3546 *
3547 * Register a receive handler for a device. This handler will then be
3548 * called from __netif_receive_skb. A negative errno code is returned
3549 * on a failure.
3550 *
3551 * The caller must hold the rtnl_mutex.
3552 *
3553 * For a general description of rx_handler, see enum rx_handler_result.
3554 */
3555 int netdev_rx_handler_register(struct net_device *dev,
3556 rx_handler_func_t *rx_handler,
3557 void *rx_handler_data)
3558 {
3559 ASSERT_RTNL();
3560
3561 if (dev->rx_handler)
3562 return -EBUSY;
3563
3564 /* Note: rx_handler_data must be set before rx_handler */
3565 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3566 rcu_assign_pointer(dev->rx_handler, rx_handler);
3567
3568 return 0;
3569 }
3570 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3571
3572 /**
3573 * netdev_rx_handler_unregister - unregister receive handler
3574 * @dev: device to unregister a handler from
3575 *
3576 * Unregister a receive handler from a device.
3577 *
3578 * The caller must hold the rtnl_mutex.
3579 */
3580 void netdev_rx_handler_unregister(struct net_device *dev)
3581 {
3582
3583 ASSERT_RTNL();
3584 RCU_INIT_POINTER(dev->rx_handler, NULL);
3585 /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
3586  * section is guaranteed to see a non-NULL rx_handler_data
3587  * as well.
3588 */
3589 synchronize_net();
3590 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3591 }
3592 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3593
3594 /*
3595 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3596 * the special handling of PFMEMALLOC skbs.
3597 */
3598 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3599 {
3600 switch (skb->protocol) {
3601 case htons(ETH_P_ARP):
3602 case htons(ETH_P_IP):
3603 case htons(ETH_P_IPV6):
3604 case htons(ETH_P_8021Q):
3605 case htons(ETH_P_8021AD):
3606 return true;
3607 default:
3608 return false;
3609 }
3610 }
3611
3612 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3613 {
3614 struct packet_type *ptype, *pt_prev;
3615 rx_handler_func_t *rx_handler;
3616 struct net_device *orig_dev;
3617 struct net_device *null_or_dev;
3618 bool deliver_exact = false;
3619 int ret = NET_RX_DROP;
3620 __be16 type;
3621
3622 net_timestamp_check(!netdev_tstamp_prequeue, skb);
3623
3624 trace_netif_receive_skb(skb);
3625
3626 orig_dev = skb->dev;
3627
3628 skb_reset_network_header(skb);
3629 if (!skb_transport_header_was_set(skb))
3630 skb_reset_transport_header(skb);
3631 skb_reset_mac_len(skb);
3632
3633 pt_prev = NULL;
3634
3635 rcu_read_lock();
3636
3637 another_round:
3638 skb->skb_iif = skb->dev->ifindex;
3639
3640 __this_cpu_inc(softnet_data.processed);
3641
3642 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3643 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3644 skb = skb_vlan_untag(skb);
3645 if (unlikely(!skb))
3646 goto unlock;
3647 }
3648
3649 #ifdef CONFIG_NET_CLS_ACT
3650 if (skb->tc_verd & TC_NCLS) {
3651 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3652 goto ncls;
3653 }
3654 #endif
3655
3656 if (pfmemalloc)
3657 goto skip_taps;
3658
3659 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3660 if (!ptype->dev || ptype->dev == skb->dev) {
3661 if (pt_prev)
3662 ret = deliver_skb(skb, pt_prev, orig_dev);
3663 pt_prev = ptype;
3664 }
3665 }
3666
3667 skip_taps:
3668 #ifdef CONFIG_NET_CLS_ACT
3669 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3670 if (!skb)
3671 goto unlock;
3672 ncls:
3673 #endif
3674
3675 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3676 goto drop;
3677
3678 if (vlan_tx_tag_present(skb)) {
3679 if (pt_prev) {
3680 ret = deliver_skb(skb, pt_prev, orig_dev);
3681 pt_prev = NULL;
3682 }
3683 if (vlan_do_receive(&skb))
3684 goto another_round;
3685 else if (unlikely(!skb))
3686 goto unlock;
3687 }
3688
3689 rx_handler = rcu_dereference(skb->dev->rx_handler);
3690 if (rx_handler) {
3691 if (pt_prev) {
3692 ret = deliver_skb(skb, pt_prev, orig_dev);
3693 pt_prev = NULL;
3694 }
3695 switch (rx_handler(&skb)) {
3696 case RX_HANDLER_CONSUMED:
3697 ret = NET_RX_SUCCESS;
3698 goto unlock;
3699 case RX_HANDLER_ANOTHER:
3700 goto another_round;
3701 case RX_HANDLER_EXACT:
3702 deliver_exact = true;
3703 case RX_HANDLER_PASS:
3704 break;
3705 default:
3706 BUG();
3707 }
3708 }
3709
3710 if (unlikely(vlan_tx_tag_present(skb))) {
3711 if (vlan_tx_tag_get_id(skb))
3712 skb->pkt_type = PACKET_OTHERHOST;
3713 /* Note: we might in the future use prio bits
3714 * and set skb->priority like in vlan_do_receive()
3715 * For the time being, just ignore Priority Code Point
3716 */
3717 skb->vlan_tci = 0;
3718 }
3719
3720 /* deliver only exact match when indicated */
3721 null_or_dev = deliver_exact ? skb->dev : NULL;
3722
3723 type = skb->protocol;
3724 list_for_each_entry_rcu(ptype,
3725 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3726 if (ptype->type == type &&
3727 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3728 ptype->dev == orig_dev)) {
3729 if (pt_prev)
3730 ret = deliver_skb(skb, pt_prev, orig_dev);
3731 pt_prev = ptype;
3732 }
3733 }
3734
3735 if (pt_prev) {
3736 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3737 goto drop;
3738 else
3739 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3740 } else {
3741 drop:
3742 atomic_long_inc(&skb->dev->rx_dropped);
3743 kfree_skb(skb);
3744 /* Jamal, now you will not be able to escape explaining
3745  * to me how you were going to use this. :-)
3746 */
3747 ret = NET_RX_DROP;
3748 }
3749
3750 unlock:
3751 rcu_read_unlock();
3752 return ret;
3753 }
3754
3755 static int __netif_receive_skb(struct sk_buff *skb)
3756 {
3757 int ret;
3758
3759 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3760 unsigned long pflags = current->flags;
3761
3762 /*
3763 * PFMEMALLOC skbs are special, they should
3764 * - be delivered to SOCK_MEMALLOC sockets only
3765 * - stay away from userspace
3766 * - have bounded memory usage
3767 *
3768 * Use PF_MEMALLOC as this saves us from propagating the allocation
3769 * context down to all allocation sites.
3770 */
3771 current->flags |= PF_MEMALLOC;
3772 ret = __netif_receive_skb_core(skb, true);
3773 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3774 } else
3775 ret = __netif_receive_skb_core(skb, false);
3776
3777 return ret;
3778 }
3779
3780 static int netif_receive_skb_internal(struct sk_buff *skb)
3781 {
3782 net_timestamp_check(netdev_tstamp_prequeue, skb);
3783
3784 if (skb_defer_rx_timestamp(skb))
3785 return NET_RX_SUCCESS;
3786
3787 #ifdef CONFIG_RPS
3788 if (static_key_false(&rps_needed)) {
3789 struct rps_dev_flow voidflow, *rflow = &voidflow;
3790 int cpu, ret;
3791
3792 rcu_read_lock();
3793
3794 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3795
3796 if (cpu >= 0) {
3797 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3798 rcu_read_unlock();
3799 return ret;
3800 }
3801 rcu_read_unlock();
3802 }
3803 #endif
3804 return __netif_receive_skb(skb);
3805 }
3806
3807 /**
3808 * netif_receive_skb - process receive buffer from network
3809 * @skb: buffer to process
3810 *
3811 * netif_receive_skb() is the main receive data processing function.
3812 * It always succeeds. The buffer may be dropped during processing
3813 * for congestion control or by the protocol layers.
3814 *
3815 * This function may only be called from softirq context and interrupts
3816 * should be enabled.
3817 *
3818 * Return values (usually ignored):
3819 * NET_RX_SUCCESS: no congestion
3820 * NET_RX_DROP: packet was dropped
3821 */
3822 int netif_receive_skb(struct sk_buff *skb)
3823 {
3824 trace_netif_receive_skb_entry(skb);
3825
3826 return netif_receive_skb_internal(skb);
3827 }
3828 EXPORT_SYMBOL(netif_receive_skb);
3829
3830 /* Network device is going away, flush any packets still pending.
3831 * Called with irqs disabled.
3832 */
3833 static void flush_backlog(void *arg)
3834 {
3835 struct net_device *dev = arg;
3836 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3837 struct sk_buff *skb, *tmp;
3838
3839 rps_lock(sd);
3840 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3841 if (skb->dev == dev) {
3842 __skb_unlink(skb, &sd->input_pkt_queue);
3843 kfree_skb(skb);
3844 input_queue_head_incr(sd);
3845 }
3846 }
3847 rps_unlock(sd);
3848
3849 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3850 if (skb->dev == dev) {
3851 __skb_unlink(skb, &sd->process_queue);
3852 kfree_skb(skb);
3853 input_queue_head_incr(sd);
3854 }
3855 }
3856 }
3857
3858 static int napi_gro_complete(struct sk_buff *skb)
3859 {
3860 struct packet_offload *ptype;
3861 __be16 type = skb->protocol;
3862 struct list_head *head = &offload_base;
3863 int err = -ENOENT;
3864
3865 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3866
3867 if (NAPI_GRO_CB(skb)->count == 1) {
3868 skb_shinfo(skb)->gso_size = 0;
3869 goto out;
3870 }
3871
3872 rcu_read_lock();
3873 list_for_each_entry_rcu(ptype, head, list) {
3874 if (ptype->type != type || !ptype->callbacks.gro_complete)
3875 continue;
3876
3877 err = ptype->callbacks.gro_complete(skb, 0);
3878 break;
3879 }
3880 rcu_read_unlock();
3881
3882 if (err) {
3883 WARN_ON(&ptype->list == head);
3884 kfree_skb(skb);
3885 return NET_RX_SUCCESS;
3886 }
3887
3888 out:
3889 return netif_receive_skb_internal(skb);
3890 }
3891
3892 /* napi->gro_list contains packets ordered by age.
3893  * The youngest packets are at its head.
3894 * Complete skbs in reverse order to reduce latencies.
3895 */
3896 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3897 {
3898 struct sk_buff *skb, *prev = NULL;
3899
3900 /* scan list and build reverse chain */
3901 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3902 skb->prev = prev;
3903 prev = skb;
3904 }
3905
3906 for (skb = prev; skb; skb = prev) {
3907 skb->next = NULL;
3908
3909 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3910 return;
3911
3912 prev = skb->prev;
3913 napi_gro_complete(skb);
3914 napi->gro_count--;
3915 }
3916
3917 napi->gro_list = NULL;
3918 }
3919 EXPORT_SYMBOL(napi_gro_flush);
3920
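/* Walk napi->gro_list and mark which held packets belong to the same
 * flow as @skb by comparing the flow hash, the incoming device, the
 * VLAN tag and the MAC header.
 */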
3921 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3922 {
3923 struct sk_buff *p;
3924 unsigned int maclen = skb->dev->hard_header_len;
3925 u32 hash = skb_get_hash_raw(skb);
3926
3927 for (p = napi->gro_list; p; p = p->next) {
3928 unsigned long diffs;
3929
3930 NAPI_GRO_CB(p)->flush = 0;
3931
3932 if (hash != skb_get_hash_raw(p)) {
3933 NAPI_GRO_CB(p)->same_flow = 0;
3934 continue;
3935 }
3936
3937 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3938 diffs |= p->vlan_tci ^ skb->vlan_tci;
3939 if (maclen == ETH_HLEN)
3940 diffs |= compare_ether_header(skb_mac_header(p),
3941 skb_mac_header(skb));
3942 else if (!diffs)
3943 diffs = memcmp(skb_mac_header(p),
3944 skb_mac_header(skb),
3945 maclen);
3946 NAPI_GRO_CB(p)->same_flow = !diffs;
3947 }
3948 }
3949
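/* Reset the GRO control block offsets.  When the skb has an empty
 * linear area (the MAC header starts at the tail) and its first page
 * fragment is in lowmem, expose that fragment through frag0 so header
 * accesses during GRO can avoid pulling data into the linear area.
 */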
3950 static void skb_gro_reset_offset(struct sk_buff *skb)
3951 {
3952 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3953 const skb_frag_t *frag0 = &pinfo->frags[0];
3954
3955 NAPI_GRO_CB(skb)->data_offset = 0;
3956 NAPI_GRO_CB(skb)->frag0 = NULL;
3957 NAPI_GRO_CB(skb)->frag0_len = 0;
3958
3959 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3960 pinfo->nr_frags &&
3961 !PageHighMem(skb_frag_page(frag0))) {
3962 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3963 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3964 }
3965 }
3966
3967 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3968 {
3969 struct skb_shared_info *pinfo = skb_shinfo(skb);
3970
3971 BUG_ON(skb->end - skb->tail < grow);
3972
3973 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3974
3975 skb->data_len -= grow;
3976 skb->tail += grow;
3977
3978 pinfo->frags[0].page_offset += grow;
3979 skb_frag_size_sub(&pinfo->frags[0], grow);
3980
3981 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3982 skb_frag_unref(skb, 0);
3983 memmove(pinfo->frags, pinfo->frags + 1,
3984 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3985 }
3986 }
3987
3988 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3989 {
3990 struct sk_buff **pp = NULL;
3991 struct packet_offload *ptype;
3992 __be16 type = skb->protocol;
3993 struct list_head *head = &offload_base;
3994 int same_flow;
3995 enum gro_result ret;
3996 int grow;
3997
3998 if (!(skb->dev->features & NETIF_F_GRO))
3999 goto normal;
4000
4001 if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4002 goto normal;
4003
4004 gro_list_prepare(napi, skb);
4005
4006 rcu_read_lock();
4007 list_for_each_entry_rcu(ptype, head, list) {
4008 if (ptype->type != type || !ptype->callbacks.gro_receive)
4009 continue;
4010
4011 skb_set_network_header(skb, skb_gro_offset(skb));
4012 skb_reset_mac_len(skb);
4013 NAPI_GRO_CB(skb)->same_flow = 0;
4014 NAPI_GRO_CB(skb)->flush = 0;
4015 NAPI_GRO_CB(skb)->free = 0;
4016 NAPI_GRO_CB(skb)->udp_mark = 0;
4017
4018 /* Setup for GRO checksum validation */
4019 switch (skb->ip_summed) {
4020 case CHECKSUM_COMPLETE:
4021 NAPI_GRO_CB(skb)->csum = skb->csum;
4022 NAPI_GRO_CB(skb)->csum_valid = 1;
4023 NAPI_GRO_CB(skb)->csum_cnt = 0;
4024 break;
4025 case CHECKSUM_UNNECESSARY:
4026 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4027 NAPI_GRO_CB(skb)->csum_valid = 0;
4028 break;
4029 default:
4030 NAPI_GRO_CB(skb)->csum_cnt = 0;
4031 NAPI_GRO_CB(skb)->csum_valid = 0;
4032 }
4033
4034 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4035 break;
4036 }
4037 rcu_read_unlock();
4038
4039 if (&ptype->list == head)
4040 goto normal;
4041
4042 same_flow = NAPI_GRO_CB(skb)->same_flow;
4043 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4044
4045 if (pp) {
4046 struct sk_buff *nskb = *pp;
4047
4048 *pp = nskb->next;
4049 nskb->next = NULL;
4050 napi_gro_complete(nskb);
4051 napi->gro_count--;
4052 }
4053
4054 if (same_flow)
4055 goto ok;
4056
4057 if (NAPI_GRO_CB(skb)->flush)
4058 goto normal;
4059
4060 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4061 struct sk_buff *nskb = napi->gro_list;
4062
4063 /* locate the end of the list to select the 'oldest' flow */
4064 while (nskb->next) {
4065 pp = &nskb->next;
4066 nskb = *pp;
4067 }
4068 *pp = NULL;
4069 nskb->next = NULL;
4070 napi_gro_complete(nskb);
4071 } else {
4072 napi->gro_count++;
4073 }
4074 NAPI_GRO_CB(skb)->count = 1;
4075 NAPI_GRO_CB(skb)->age = jiffies;
4076 NAPI_GRO_CB(skb)->last = skb;
4077 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4078 skb->next = napi->gro_list;
4079 napi->gro_list = skb;
4080 ret = GRO_HELD;
4081
4082 pull:
4083 grow = skb_gro_offset(skb) - skb_headlen(skb);
4084 if (grow > 0)
4085 gro_pull_from_frag0(skb, grow);
4086 ok:
4087 return ret;
4088
4089 normal:
4090 ret = GRO_NORMAL;
4091 goto pull;
4092 }
4093
4094 struct packet_offload *gro_find_receive_by_type(__be16 type)
4095 {
4096 struct list_head *offload_head = &offload_base;
4097 struct packet_offload *ptype;
4098
4099 list_for_each_entry_rcu(ptype, offload_head, list) {
4100 if (ptype->type != type || !ptype->callbacks.gro_receive)
4101 continue;
4102 return ptype;
4103 }
4104 return NULL;
4105 }
4106 EXPORT_SYMBOL(gro_find_receive_by_type);
4107
4108 struct packet_offload *gro_find_complete_by_type(__be16 type)
4109 {
4110 struct list_head *offload_head = &offload_base;
4111 struct packet_offload *ptype;
4112
4113 list_for_each_entry_rcu(ptype, offload_head, list) {
4114 if (ptype->type != type || !ptype->callbacks.gro_complete)
4115 continue;
4116 return ptype;
4117 }
4118 return NULL;
4119 }
4120 EXPORT_SYMBOL(gro_find_complete_by_type);
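
/* Illustrative sketch (not part of dev.c): an encapsulation offload can use
 * gro_find_receive_by_type()/gro_find_complete_by_type() to hand the inner
 * frame to the offload handler registered for the inner protocol.  The
 * "myencap" name, the fixed 4-byte header and the assumed IPv4 payload are
 * hypothetical; a real handler would also validate the header with
 * skb_gro_header_fast()/skb_gro_header_slow() before pulling it.
 */
#if 0
static struct sk_buff **myencap_gro_receive(struct sk_buff **head,
                                            struct sk_buff *skb)
{
        struct sk_buff **pp = NULL;
        struct packet_offload *ptype;
        __be16 inner_proto = htons(ETH_P_IP);   /* assumed inner protocol */

        /* skip our (hypothetical) 4-byte encapsulation header */
        skb_gro_pull(skb, 4);

        rcu_read_lock();
        ptype = gro_find_receive_by_type(inner_proto);
        if (ptype)
                pp = ptype->callbacks.gro_receive(head, skb);
        rcu_read_unlock();

        return pp;
}
#endif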
4121
4122 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4123 {
4124 switch (ret) {
4125 case GRO_NORMAL:
4126 if (netif_receive_skb_internal(skb))
4127 ret = GRO_DROP;
4128 break;
4129
4130 case GRO_DROP:
4131 kfree_skb(skb);
4132 break;
4133
4134 case GRO_MERGED_FREE:
4135 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4136 kmem_cache_free(skbuff_head_cache, skb);
4137 else
4138 __kfree_skb(skb);
4139 break;
4140
4141 case GRO_HELD:
4142 case GRO_MERGED:
4143 break;
4144 }
4145
4146 return ret;
4147 }
4148
4149 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4150 {
4151 trace_napi_gro_receive_entry(skb);
4152
4153 skb_gro_reset_offset(skb);
4154
4155 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4156 }
4157 EXPORT_SYMBOL(napi_gro_receive);
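
/* Illustrative sketch (not part of dev.c): a typical driver RX path builds an
 * skb per received frame and hands it to GRO from its NAPI poll loop.
 * "mynic_priv" and "mynic_build_rx_skb" are hypothetical driver helpers.
 */
#if 0
static int mynic_rx(struct mynic_priv *priv, int budget)
{
        int done = 0;

        while (done < budget) {
                struct sk_buff *skb = mynic_build_rx_skb(priv);

                if (!skb)
                        break;
                skb->protocol = eth_type_trans(skb, priv->netdev);
                napi_gro_receive(&priv->napi, skb);
                done++;
        }
        return done;
}
#endif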
4158
4159 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4160 {
4161 if (unlikely(skb->pfmemalloc)) {
4162 consume_skb(skb);
4163 return;
4164 }
4165 __skb_pull(skb, skb_headlen(skb));
4166 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4167 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4168 skb->vlan_tci = 0;
4169 skb->dev = napi->dev;
4170 skb->skb_iif = 0;
4171 skb->encapsulation = 0;
4172 skb_shinfo(skb)->gso_type = 0;
4173 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4174
4175 napi->skb = skb;
4176 }
4177
4178 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4179 {
4180 struct sk_buff *skb = napi->skb;
4181
4182 if (!skb) {
4183 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
4184 napi->skb = skb;
4185 }
4186 return skb;
4187 }
4188 EXPORT_SYMBOL(napi_get_frags);
4189
4190 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4191 struct sk_buff *skb,
4192 gro_result_t ret)
4193 {
4194 switch (ret) {
4195 case GRO_NORMAL:
4196 case GRO_HELD:
4197 __skb_push(skb, ETH_HLEN);
4198 skb->protocol = eth_type_trans(skb, skb->dev);
4199 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4200 ret = GRO_DROP;
4201 break;
4202
4203 case GRO_DROP:
4204 case GRO_MERGED_FREE:
4205 napi_reuse_skb(napi, skb);
4206 break;
4207
4208 case GRO_MERGED:
4209 break;
4210 }
4211
4212 return ret;
4213 }
4214
4215 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4216 * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
4217 * we copy the ethernet header into skb->data to have a common layout.
4218 */
4219 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4220 {
4221 struct sk_buff *skb = napi->skb;
4222 const struct ethhdr *eth;
4223 unsigned int hlen = sizeof(*eth);
4224
4225 napi->skb = NULL;
4226
4227 skb_reset_mac_header(skb);
4228 skb_gro_reset_offset(skb);
4229
4230 eth = skb_gro_header_fast(skb, 0);
4231 if (unlikely(skb_gro_header_hard(skb, hlen))) {
4232 eth = skb_gro_header_slow(skb, hlen, 0);
4233 if (unlikely(!eth)) {
4234 napi_reuse_skb(napi, skb);
4235 return NULL;
4236 }
4237 } else {
4238 gro_pull_from_frag0(skb, hlen);
4239 NAPI_GRO_CB(skb)->frag0 += hlen;
4240 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4241 }
4242 __skb_pull(skb, hlen);
4243
4244 /*
4245 * This works because the only protocols we care about don't require
4246 * special handling.
4247 * We'll fix it up properly in napi_frags_finish()
4248 */
4249 skb->protocol = eth->h_proto;
4250
4251 return skb;
4252 }
4253
4254 gro_result_t napi_gro_frags(struct napi_struct *napi)
4255 {
4256 struct sk_buff *skb = napi_frags_skb(napi);
4257
4258 if (!skb)
4259 return GRO_DROP;
4260
4261 trace_napi_gro_frags_entry(skb);
4262
4263 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4264 }
4265 EXPORT_SYMBOL(napi_gro_frags);
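
/* Illustrative sketch (not part of dev.c): drivers that receive directly into
 * pages can let GRO pull the ethernet header for them by pairing
 * napi_get_frags() with napi_gro_frags().  The page/offset/length values and
 * the truesize accounting below are hypothetical and would normally come from
 * the device's RX descriptor handling.
 */
#if 0
static void mynic_rx_frag(struct napi_struct *napi, struct page *page,
                          unsigned int offset, unsigned int len)
{
        struct sk_buff *skb = napi_get_frags(napi);

        if (!skb)
                return;         /* allocation failure, drop the frame */

        skb_fill_page_desc(skb, 0, page, offset, len);
        skb->len += len;
        skb->data_len += len;
        skb->truesize += PAGE_SIZE;

        /* GRO copies the ethernet header out of frag0 in napi_frags_skb() */
        napi_gro_frags(napi);
}
#endif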
4266
4267 /* Compute the checksum from gro_offset and return the folded value
4268 * after adding in any pseudo checksum.
4269 */
4270 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4271 {
4272 __wsum wsum;
4273 __sum16 sum;
4274
4275 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4276
4277 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4278 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4279 if (likely(!sum)) {
4280 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4281 !skb->csum_complete_sw)
4282 netdev_rx_csum_fault(skb->dev);
4283 }
4284
4285 NAPI_GRO_CB(skb)->csum = wsum;
4286 NAPI_GRO_CB(skb)->csum_valid = 1;
4287
4288 return sum;
4289 }
4290 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4291
4292 /*
4293 * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
4294 * Note: called with local irq disabled, but exits with local irq enabled.
4295 */
4296 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4297 {
4298 #ifdef CONFIG_RPS
4299 struct softnet_data *remsd = sd->rps_ipi_list;
4300
4301 if (remsd) {
4302 sd->rps_ipi_list = NULL;
4303
4304 local_irq_enable();
4305
4306 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4307 while (remsd) {
4308 struct softnet_data *next = remsd->rps_ipi_next;
4309
4310 if (cpu_online(remsd->cpu))
4311 smp_call_function_single_async(remsd->cpu,
4312 &remsd->csd);
4313 remsd = next;
4314 }
4315 } else
4316 #endif
4317 local_irq_enable();
4318 }
4319
4320 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4321 {
4322 #ifdef CONFIG_RPS
4323 return sd->rps_ipi_list != NULL;
4324 #else
4325 return false;
4326 #endif
4327 }
4328
4329 static int process_backlog(struct napi_struct *napi, int quota)
4330 {
4331 int work = 0;
4332 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4333
4334 /* Check if we have pending IPIs; it's better to send them now
4335 * rather than waiting for net_rx_action() to end.
4336 */
4337 if (sd_has_rps_ipi_waiting(sd)) {
4338 local_irq_disable();
4339 net_rps_action_and_irq_enable(sd);
4340 }
4341
4342 napi->weight = weight_p;
4343 local_irq_disable();
4344 while (1) {
4345 struct sk_buff *skb;
4346
4347 while ((skb = __skb_dequeue(&sd->process_queue))) {
4348 local_irq_enable();
4349 __netif_receive_skb(skb);
4350 local_irq_disable();
4351 input_queue_head_incr(sd);
4352 if (++work >= quota) {
4353 local_irq_enable();
4354 return work;
4355 }
4356 }
4357
4358 rps_lock(sd);
4359 if (skb_queue_empty(&sd->input_pkt_queue)) {
4360 /*
4361 * Inline a custom version of __napi_complete().
4362 * Only the current cpu owns and manipulates this napi,
4363 * and NAPI_STATE_SCHED is the only possible flag set
4364 * on backlog.
4365 * We can use a plain write instead of clear_bit(),
4366 * and we don't need an smp_mb() memory barrier.
4367 */
4368 napi->state = 0;
4369 rps_unlock(sd);
4370
4371 break;
4372 }
4373
4374 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4375 &sd->process_queue);
4376 rps_unlock(sd);
4377 }
4378 local_irq_enable();
4379
4380 return work;
4381 }
4382
4383 /**
4384 * __napi_schedule - schedule for receive
4385 * @n: entry to schedule
4386 *
4387 * The entry's receive function will be scheduled to run.
4388 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4389 */
4390 void __napi_schedule(struct napi_struct *n)
4391 {
4392 unsigned long flags;
4393
4394 local_irq_save(flags);
4395 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4396 local_irq_restore(flags);
4397 }
4398 EXPORT_SYMBOL(__napi_schedule);
4399
4400 /**
4401 * __napi_schedule_irqoff - schedule for receive
4402 * @n: entry to schedule
4403 *
4404 * Variant of __napi_schedule() assuming hard irqs are masked
4405 */
4406 void __napi_schedule_irqoff(struct napi_struct *n)
4407 {
4408 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4409 }
4410 EXPORT_SYMBOL(__napi_schedule_irqoff);
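
/* Illustrative sketch (not part of dev.c): a hard interrupt handler, where
 * irqs are already masked, can pair napi_schedule_prep() with
 * __napi_schedule_irqoff() to defer RX processing to NAPI.  "mynic_priv" and
 * "mynic_mask_rx_irq" are hypothetical.
 */
#if 0
static irqreturn_t mynic_interrupt(int irq, void *data)
{
        struct mynic_priv *priv = data;

        if (napi_schedule_prep(&priv->napi)) {
                mynic_mask_rx_irq(priv);        /* quiesce device interrupts */
                __napi_schedule_irqoff(&priv->napi);
        }
        return IRQ_HANDLED;
}
#endif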
4411
4412 void __napi_complete(struct napi_struct *n)
4413 {
4414 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4415 BUG_ON(n->gro_list);
4416
4417 list_del_init(&n->poll_list);
4418 smp_mb__before_atomic();
4419 clear_bit(NAPI_STATE_SCHED, &n->state);
4420 }
4421 EXPORT_SYMBOL(__napi_complete);
4422
4423 void napi_complete(struct napi_struct *n)
4424 {
4425 unsigned long flags;
4426
4427 /*
4428 * Don't let napi dequeue from the cpu poll list,
4429 * just in case it's running on a different cpu.
4430 */
4431 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4432 return;
4433
4434 napi_gro_flush(n, false);
4435
4436 if (likely(list_empty(&n->poll_list))) {
4437 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4438 } else {
4439 /* If n->poll_list is not empty, we need to mask irqs */
4440 local_irq_save(flags);
4441 __napi_complete(n);
4442 local_irq_restore(flags);
4443 }
4444 }
4445 EXPORT_SYMBOL(napi_complete);
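
/* Illustrative sketch (not part of dev.c): a poll routine only calls
 * napi_complete() when it used less than its budget, and then re-enables
 * device interrupts.  "mynic_rx" and "mynic_unmask_rx_irq" are hypothetical.
 */
#if 0
static int mynic_poll(struct napi_struct *napi, int budget)
{
        struct mynic_priv *priv = container_of(napi, struct mynic_priv, napi);
        int work = mynic_rx(priv, budget);

        if (work < budget) {
                napi_complete(napi);
                mynic_unmask_rx_irq(priv);
        }
        return work;
}
#endif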
4446
4447 /* must be called under rcu_read_lock(), as we dont take a reference */
4448 struct napi_struct *napi_by_id(unsigned int napi_id)
4449 {
4450 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4451 struct napi_struct *napi;
4452
4453 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4454 if (napi->napi_id == napi_id)
4455 return napi;
4456
4457 return NULL;
4458 }
4459 EXPORT_SYMBOL_GPL(napi_by_id);
4460
4461 void napi_hash_add(struct napi_struct *napi)
4462 {
4463 if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4464
4465 spin_lock(&napi_hash_lock);
4466
4467 /* 0 is not a valid id, and we also skip an id that is already taken;
4468 * we expect both events to be extremely rare.
4469 */
4470 napi->napi_id = 0;
4471 while (!napi->napi_id) {
4472 napi->napi_id = ++napi_gen_id;
4473 if (napi_by_id(napi->napi_id))
4474 napi->napi_id = 0;
4475 }
4476
4477 hlist_add_head_rcu(&napi->napi_hash_node,
4478 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4479
4480 spin_unlock(&napi_hash_lock);
4481 }
4482 }
4483 EXPORT_SYMBOL_GPL(napi_hash_add);
4484
4485 /* Warning: the caller is responsible for making sure an rcu grace period
4486 * has elapsed before freeing the memory containing @napi.
4487 */
4488 void napi_hash_del(struct napi_struct *napi)
4489 {
4490 spin_lock(&napi_hash_lock);
4491
4492 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4493 hlist_del_rcu(&napi->napi_hash_node);
4494
4495 spin_unlock(&napi_hash_lock);
4496 }
4497 EXPORT_SYMBOL_GPL(napi_hash_del);
4498
4499 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4500 int (*poll)(struct napi_struct *, int), int weight)
4501 {
4502 INIT_LIST_HEAD(&napi->poll_list);
4503 napi->gro_count = 0;
4504 napi->gro_list = NULL;
4505 napi->skb = NULL;
4506 napi->poll = poll;
4507 if (weight > NAPI_POLL_WEIGHT)
4508 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4509 weight, dev->name);
4510 napi->weight = weight;
4511 list_add(&napi->dev_list, &dev->napi_list);
4512 napi->dev = dev;
4513 #ifdef CONFIG_NETPOLL
4514 spin_lock_init(&napi->poll_lock);
4515 napi->poll_owner = -1;
4516 #endif
4517 set_bit(NAPI_STATE_SCHED, &napi->state);
4518 }
4519 EXPORT_SYMBOL(netif_napi_add);
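
/* Illustrative sketch (not part of dev.c): a driver registers its NAPI
 * context once at probe time and enables it from ndo_open.  The structure
 * layout and the "mynic_poll" callback are hypothetical.
 */
#if 0
struct mynic_priv {
        struct net_device *netdev;
        struct napi_struct napi;
};

static void mynic_setup_napi(struct mynic_priv *priv)
{
        netif_napi_add(priv->netdev, &priv->napi, mynic_poll,
                       NAPI_POLL_WEIGHT);
}

static int mynic_open(struct net_device *netdev)
{
        struct mynic_priv *priv = netdev_priv(netdev);

        napi_enable(&priv->napi);
        return 0;
}
#endif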
4520
4521 void netif_napi_del(struct napi_struct *napi)
4522 {
4523 list_del_init(&napi->dev_list);
4524 napi_free_frags(napi);
4525
4526 kfree_skb_list(napi->gro_list);
4527 napi->gro_list = NULL;
4528 napi->gro_count = 0;
4529 }
4530 EXPORT_SYMBOL(netif_napi_del);
4531
4532 static void net_rx_action(struct softirq_action *h)
4533 {
4534 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4535 unsigned long time_limit = jiffies + 2;
4536 int budget = netdev_budget;
4537 LIST_HEAD(list);
4538 LIST_HEAD(repoll);
4539 void *have;
4540
4541 local_irq_disable();
4542 list_splice_init(&sd->poll_list, &list);
4543 local_irq_enable();
4544
4545 while (!list_empty(&list)) {
4546 struct napi_struct *n;
4547 int work, weight;
4548
4549 /* If softirq window is exhausted then punt.
4550 * Allow this to run for 2 jiffies, which allows
4551 * an average latency of 1.5/HZ.
4552 */
4553 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4554 goto softnet_break;
4555
4556
4557 n = list_first_entry(&list, struct napi_struct, poll_list);
4558 list_del_init(&n->poll_list);
4559
4560 have = netpoll_poll_lock(n);
4561
4562 weight = n->weight;
4563
4564 /* This NAPI_STATE_SCHED test is for avoiding a race
4565 * with netpoll's poll_napi(). Only the entity which
4566 * obtains the lock and sees NAPI_STATE_SCHED set will
4567 * actually make the ->poll() call. Therefore we avoid
4568 * accidentally calling ->poll() when NAPI is not scheduled.
4569 */
4570 work = 0;
4571 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4572 work = n->poll(n, weight);
4573 trace_napi_poll(n);
4574 }
4575
4576 WARN_ON_ONCE(work > weight);
4577
4578 budget -= work;
4579
4580 /* Drivers must not modify the NAPI state if they
4581 * consume the entire weight. In such cases this code
4582 * still "owns" the NAPI instance and therefore can
4583 * move the instance around on the list at-will.
4584 */
4585 if (unlikely(work == weight)) {
4586 if (unlikely(napi_disable_pending(n))) {
4587 napi_complete(n);
4588 } else {
4589 if (n->gro_list) {
4590 /* Flush too old packets.
4591 * If HZ < 1000, flush all packets.
4592 */
4593 napi_gro_flush(n, HZ >= 1000);
4594 }
4595 list_add_tail(&n->poll_list, &repoll);
4596 }
4597 }
4598
4599 netpoll_poll_unlock(have);
4600 }
4601
4602 if (!sd_has_rps_ipi_waiting(sd) &&
4603 list_empty(&list) &&
4604 list_empty(&repoll))
4605 return;
4606 out:
4607 local_irq_disable();
4608
4609 list_splice_tail_init(&sd->poll_list, &list);
4610 list_splice_tail(&repoll, &list);
4611 list_splice(&list, &sd->poll_list);
4612 if (!list_empty(&sd->poll_list))
4613 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4614
4615 net_rps_action_and_irq_enable(sd);
4616
4617 return;
4618
4619 softnet_break:
4620 sd->time_squeeze++;
4621 goto out;
4622 }
4623
4624 struct netdev_adjacent {
4625 struct net_device *dev;
4626
4627 /* upper master flag, there can only be one master device per list */
4628 bool master;
4629
4630 /* counter for the number of times this device was added to us */
4631 u16 ref_nr;
4632
4633 /* private field for the users */
4634 void *private;
4635
4636 struct list_head list;
4637 struct rcu_head rcu;
4638 };
4639
4640 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4641 struct net_device *adj_dev,
4642 struct list_head *adj_list)
4643 {
4644 struct netdev_adjacent *adj;
4645
4646 list_for_each_entry(adj, adj_list, list) {
4647 if (adj->dev == adj_dev)
4648 return adj;
4649 }
4650 return NULL;
4651 }
4652
4653 /**
4654 * netdev_has_upper_dev - Check if device is linked to an upper device
4655 * @dev: device
4656 * @upper_dev: upper device to check
4657 *
4658 * Find out if a device is linked to the specified upper device and return true
4659 * in case it is. Note that this checks only the immediate upper device,
4660 * not the complete stack of devices. The caller must hold the RTNL lock.
4661 */
4662 bool netdev_has_upper_dev(struct net_device *dev,
4663 struct net_device *upper_dev)
4664 {
4665 ASSERT_RTNL();
4666
4667 return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4668 }
4669 EXPORT_SYMBOL(netdev_has_upper_dev);
4670
4671 /**
4672 * netdev_has_any_upper_dev - Check if device is linked to some device
4673 * @dev: device
4674 *
4675 * Find out if a device is linked to an upper device and return true in case
4676 * it is. The caller must hold the RTNL lock.
4677 */
4678 static bool netdev_has_any_upper_dev(struct net_device *dev)
4679 {
4680 ASSERT_RTNL();
4681
4682 return !list_empty(&dev->all_adj_list.upper);
4683 }
4684
4685 /**
4686 * netdev_master_upper_dev_get - Get master upper device
4687 * @dev: device
4688 *
4689 * Find a master upper device and return pointer to it or NULL in case
4690 * it's not there. The caller must hold the RTNL lock.
4691 */
4692 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4693 {
4694 struct netdev_adjacent *upper;
4695
4696 ASSERT_RTNL();
4697
4698 if (list_empty(&dev->adj_list.upper))
4699 return NULL;
4700
4701 upper = list_first_entry(&dev->adj_list.upper,
4702 struct netdev_adjacent, list);
4703 if (likely(upper->master))
4704 return upper->dev;
4705 return NULL;
4706 }
4707 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4708
4709 void *netdev_adjacent_get_private(struct list_head *adj_list)
4710 {
4711 struct netdev_adjacent *adj;
4712
4713 adj = list_entry(adj_list, struct netdev_adjacent, list);
4714
4715 return adj->private;
4716 }
4717 EXPORT_SYMBOL(netdev_adjacent_get_private);
4718
4719 /**
4720 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4721 * @dev: device
4722 * @iter: list_head ** of the current position
4723 *
4724 * Gets the next device from the dev's upper list, starting from iter
4725 * position. The caller must hold RCU read lock.
4726 */
4727 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4728 struct list_head **iter)
4729 {
4730 struct netdev_adjacent *upper;
4731
4732 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4733
4734 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4735
4736 if (&upper->list == &dev->adj_list.upper)
4737 return NULL;
4738
4739 *iter = &upper->list;
4740
4741 return upper->dev;
4742 }
4743 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4744
4745 /**
4746 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4747 * @dev: device
4748 * @iter: list_head ** of the current position
4749 *
4750 * Gets the next device from the dev's upper list, starting from iter
4751 * position. The caller must hold RCU read lock.
4752 */
4753 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4754 struct list_head **iter)
4755 {
4756 struct netdev_adjacent *upper;
4757
4758 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4759
4760 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4761
4762 if (&upper->list == &dev->all_adj_list.upper)
4763 return NULL;
4764
4765 *iter = &upper->list;
4766
4767 return upper->dev;
4768 }
4769 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4770
4771 /**
4772 * netdev_lower_get_next_private - Get the next ->private from the
4773 * lower neighbour list
4774 * @dev: device
4775 * @iter: list_head ** of the current position
4776 *
4777 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4778 * list, starting from iter position. The caller must either hold the
4779 * RTNL lock or its own locking that guarantees that the neighbour lower
4780 * list will remain unchanged.
4781 */
4782 void *netdev_lower_get_next_private(struct net_device *dev,
4783 struct list_head **iter)
4784 {
4785 struct netdev_adjacent *lower;
4786
4787 lower = list_entry(*iter, struct netdev_adjacent, list);
4788
4789 if (&lower->list == &dev->adj_list.lower)
4790 return NULL;
4791
4792 *iter = lower->list.next;
4793
4794 return lower->private;
4795 }
4796 EXPORT_SYMBOL(netdev_lower_get_next_private);
4797
4798 /**
4799 * netdev_lower_get_next_private_rcu - Get the next ->private from the
4800 * lower neighbour list, RCU
4801 * variant
4802 * @dev: device
4803 * @iter: list_head ** of the current position
4804 *
4805 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4806 * list, starting from iter position. The caller must hold RCU read lock.
4807 */
4808 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4809 struct list_head **iter)
4810 {
4811 struct netdev_adjacent *lower;
4812
4813 WARN_ON_ONCE(!rcu_read_lock_held());
4814
4815 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4816
4817 if (&lower->list == &dev->adj_list.lower)
4818 return NULL;
4819
4820 *iter = &lower->list;
4821
4822 return lower->private;
4823 }
4824 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4825
4826 /**
4827 * netdev_lower_get_next - Get the next device from the lower neighbour
4828 * list
4829 * @dev: device
4830 * @iter: list_head ** of the current position
4831 *
4832 * Gets the next netdev_adjacent from the dev's lower neighbour
4833 * list, starting from iter position. The caller must hold the RTNL lock or
4834 * its own locking that guarantees that the neighbour lower
4835 * list will remain unchanged.
4836 */
4837 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4838 {
4839 struct netdev_adjacent *lower;
4840
4841 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4842
4843 if (&lower->list == &dev->adj_list.lower)
4844 return NULL;
4845
4846 *iter = &lower->list;
4847
4848 return lower->dev;
4849 }
4850 EXPORT_SYMBOL(netdev_lower_get_next);
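
/* Illustrative sketch (not part of dev.c): the iterators above are normally
 * used through the netdev_for_each_lower_dev() helper, e.g. to walk the
 * ports of a master device under RTNL.  The per-port action is hypothetical.
 */
#if 0
static void mymaster_for_each_port(struct net_device *master)
{
        struct net_device *lower;
        struct list_head *iter;

        ASSERT_RTNL();
        netdev_for_each_lower_dev(master, lower, iter)
                netdev_info(lower, "port of %s\n", master->name);
}
#endif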
4851
4852 /**
4853 * netdev_lower_get_first_private_rcu - Get the first ->private from the
4854 * lower neighbour list, RCU
4855 * variant
4856 * @dev: device
4857 *
4858 * Gets the first netdev_adjacent->private from the dev's lower neighbour
4859 * list. The caller must hold RCU read lock.
4860 */
4861 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4862 {
4863 struct netdev_adjacent *lower;
4864
4865 lower = list_first_or_null_rcu(&dev->adj_list.lower,
4866 struct netdev_adjacent, list);
4867 if (lower)
4868 return lower->private;
4869 return NULL;
4870 }
4871 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4872
4873 /**
4874 * netdev_master_upper_dev_get_rcu - Get master upper device
4875 * @dev: device
4876 *
4877 * Find a master upper device and return pointer to it or NULL in case
4878 * it's not there. The caller must hold the RCU read lock.
4879 */
4880 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4881 {
4882 struct netdev_adjacent *upper;
4883
4884 upper = list_first_or_null_rcu(&dev->adj_list.upper,
4885 struct netdev_adjacent, list);
4886 if (upper && likely(upper->master))
4887 return upper->dev;
4888 return NULL;
4889 }
4890 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4891
4892 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4893 struct net_device *adj_dev,
4894 struct list_head *dev_list)
4895 {
4896 char linkname[IFNAMSIZ+7];
4897 sprintf(linkname, dev_list == &dev->adj_list.upper ?
4898 "upper_%s" : "lower_%s", adj_dev->name);
4899 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4900 linkname);
4901 }
4902 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4903 char *name,
4904 struct list_head *dev_list)
4905 {
4906 char linkname[IFNAMSIZ+7];
4907 sprintf(linkname, dev_list == &dev->adj_list.upper ?
4908 "upper_%s" : "lower_%s", name);
4909 sysfs_remove_link(&(dev->dev.kobj), linkname);
4910 }
4911
4912 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4913 struct net_device *adj_dev,
4914 struct list_head *dev_list)
4915 {
4916 return (dev_list == &dev->adj_list.upper ||
4917 dev_list == &dev->adj_list.lower) &&
4918 net_eq(dev_net(dev), dev_net(adj_dev));
4919 }
4920
4921 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4922 struct net_device *adj_dev,
4923 struct list_head *dev_list,
4924 void *private, bool master)
4925 {
4926 struct netdev_adjacent *adj;
4927 int ret;
4928
4929 adj = __netdev_find_adj(dev, adj_dev, dev_list);
4930
4931 if (adj) {
4932 adj->ref_nr++;
4933 return 0;
4934 }
4935
4936 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4937 if (!adj)
4938 return -ENOMEM;
4939
4940 adj->dev = adj_dev;
4941 adj->master = master;
4942 adj->ref_nr = 1;
4943 adj->private = private;
4944 dev_hold(adj_dev);
4945
4946 pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4947 adj_dev->name, dev->name, adj_dev->name);
4948
4949 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
4950 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4951 if (ret)
4952 goto free_adj;
4953 }
4954
4955 /* Ensure that master link is always the first item in list. */
4956 if (master) {
4957 ret = sysfs_create_link(&(dev->dev.kobj),
4958 &(adj_dev->dev.kobj), "master");
4959 if (ret)
4960 goto remove_symlinks;
4961
4962 list_add_rcu(&adj->list, dev_list);
4963 } else {
4964 list_add_tail_rcu(&adj->list, dev_list);
4965 }
4966
4967 return 0;
4968
4969 remove_symlinks:
4970 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
4971 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4972 free_adj:
4973 kfree(adj);
4974 dev_put(adj_dev);
4975
4976 return ret;
4977 }
4978
4979 static void __netdev_adjacent_dev_remove(struct net_device *dev,
4980 struct net_device *adj_dev,
4981 struct list_head *dev_list)
4982 {
4983 struct netdev_adjacent *adj;
4984
4985 adj = __netdev_find_adj(dev, adj_dev, dev_list);
4986
4987 if (!adj) {
4988 pr_err("tried to remove device %s from %s\n",
4989 dev->name, adj_dev->name);
4990 BUG();
4991 }
4992
4993 if (adj->ref_nr > 1) {
4994 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4995 adj->ref_nr-1);
4996 adj->ref_nr--;
4997 return;
4998 }
4999
5000 if (adj->master)
5001 sysfs_remove_link(&(dev->dev.kobj), "master");
5002
5003 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5004 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5005
5006 list_del_rcu(&adj->list);
5007 pr_debug("dev_put for %s, because link removed from %s to %s\n",
5008 adj_dev->name, dev->name, adj_dev->name);
5009 dev_put(adj_dev);
5010 kfree_rcu(adj, rcu);
5011 }
5012
5013 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5014 struct net_device *upper_dev,
5015 struct list_head *up_list,
5016 struct list_head *down_list,
5017 void *private, bool master)
5018 {
5019 int ret;
5020
5021 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5022 master);
5023 if (ret)
5024 return ret;
5025
5026 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5027 false);
5028 if (ret) {
5029 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5030 return ret;
5031 }
5032
5033 return 0;
5034 }
5035
5036 static int __netdev_adjacent_dev_link(struct net_device *dev,
5037 struct net_device *upper_dev)
5038 {
5039 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5040 &dev->all_adj_list.upper,
5041 &upper_dev->all_adj_list.lower,
5042 NULL, false);
5043 }
5044
5045 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5046 struct net_device *upper_dev,
5047 struct list_head *up_list,
5048 struct list_head *down_list)
5049 {
5050 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5051 __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5052 }
5053
5054 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5055 struct net_device *upper_dev)
5056 {
5057 __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5058 &dev->all_adj_list.upper,
5059 &upper_dev->all_adj_list.lower);
5060 }
5061
5062 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5063 struct net_device *upper_dev,
5064 void *private, bool master)
5065 {
5066 int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5067
5068 if (ret)
5069 return ret;
5070
5071 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5072 &dev->adj_list.upper,
5073 &upper_dev->adj_list.lower,
5074 private, master);
5075 if (ret) {
5076 __netdev_adjacent_dev_unlink(dev, upper_dev);
5077 return ret;
5078 }
5079
5080 return 0;
5081 }
5082
5083 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5084 struct net_device *upper_dev)
5085 {
5086 __netdev_adjacent_dev_unlink(dev, upper_dev);
5087 __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5088 &dev->adj_list.upper,
5089 &upper_dev->adj_list.lower);
5090 }
5091
5092 static int __netdev_upper_dev_link(struct net_device *dev,
5093 struct net_device *upper_dev, bool master,
5094 void *private)
5095 {
5096 struct netdev_adjacent *i, *j, *to_i, *to_j;
5097 int ret = 0;
5098
5099 ASSERT_RTNL();
5100
5101 if (dev == upper_dev)
5102 return -EBUSY;
5103
5104 /* To prevent loops, check that dev is not an upper device of upper_dev. */
5105 if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5106 return -EBUSY;
5107
5108 if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
5109 return -EEXIST;
5110
5111 if (master && netdev_master_upper_dev_get(dev))
5112 return -EBUSY;
5113
5114 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5115 master);
5116 if (ret)
5117 return ret;
5118
5119 /* Now that we linked these devs, make all the upper_dev's
5120 * all_adj_list.upper visible to every dev's all_adj_list.lower and
5121 * vice versa, and don't forget the devices themselves. All of these
5122 * links are non-neighbours.
5123 */
5124 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5125 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5126 pr_debug("Interlinking %s with %s, non-neighbour\n",
5127 i->dev->name, j->dev->name);
5128 ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5129 if (ret)
5130 goto rollback_mesh;
5131 }
5132 }
5133
5134 /* add dev to every upper_dev's upper device */
5135 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5136 pr_debug("linking %s's upper device %s with %s\n",
5137 upper_dev->name, i->dev->name, dev->name);
5138 ret = __netdev_adjacent_dev_link(dev, i->dev);
5139 if (ret)
5140 goto rollback_upper_mesh;
5141 }
5142
5143 /* add upper_dev to every dev's lower device */
5144 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5145 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5146 i->dev->name, upper_dev->name);
5147 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5148 if (ret)
5149 goto rollback_lower_mesh;
5150 }
5151
5152 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5153 return 0;
5154
5155 rollback_lower_mesh:
5156 to_i = i;
5157 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5158 if (i == to_i)
5159 break;
5160 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5161 }
5162
5163 i = NULL;
5164
5165 rollback_upper_mesh:
5166 to_i = i;
5167 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5168 if (i == to_i)
5169 break;
5170 __netdev_adjacent_dev_unlink(dev, i->dev);
5171 }
5172
5173 i = j = NULL;
5174
5175 rollback_mesh:
5176 to_i = i;
5177 to_j = j;
5178 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5179 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5180 if (i == to_i && j == to_j)
5181 break;
5182 __netdev_adjacent_dev_unlink(i->dev, j->dev);
5183 }
5184 if (i == to_i)
5185 break;
5186 }
5187
5188 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5189
5190 return ret;
5191 }
5192
5193 /**
5194 * netdev_upper_dev_link - Add a link to the upper device
5195 * @dev: device
5196 * @upper_dev: new upper device
5197 *
5198 * Adds a link to a device which is upper to this one. The caller must hold
5199 * the RTNL lock. On a failure a negative errno code is returned.
5200 * On success the reference counts are adjusted and the function
5201 * returns zero.
5202 */
5203 int netdev_upper_dev_link(struct net_device *dev,
5204 struct net_device *upper_dev)
5205 {
5206 return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5207 }
5208 EXPORT_SYMBOL(netdev_upper_dev_link);
5209
5210 /**
5211 * netdev_master_upper_dev_link - Add a master link to the upper device
5212 * @dev: device
5213 * @upper_dev: new upper device
5214 *
5215 * Adds a link to a device which is upper to this one. In this case, only
5216 * one master upper device can be linked, although other non-master devices
5217 * might be linked as well. The caller must hold the RTNL lock.
5218 * On a failure a negative errno code is returned. On success the reference
5219 * counts are adjusted and the function returns zero.
5220 */
5221 int netdev_master_upper_dev_link(struct net_device *dev,
5222 struct net_device *upper_dev)
5223 {
5224 return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5225 }
5226 EXPORT_SYMBOL(netdev_master_upper_dev_link);
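
/* Illustrative sketch (not part of dev.c): a bonding/team-style master driver
 * links a slave with netdev_master_upper_dev_link() while holding RTNL, and
 * tears the link down again with netdev_upper_dev_unlink() on release.  The
 * surrounding enslave logic is hypothetical.
 */
#if 0
static int mymaster_enslave(struct net_device *master,
                            struct net_device *slave)
{
        int err;

        ASSERT_RTNL();
        err = netdev_master_upper_dev_link(slave, master);
        if (err)
                return err;

        /* ... configure the slave; on a later failure or on release: ... */
        /* netdev_upper_dev_unlink(slave, master); */
        return 0;
}
#endif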
5227
5228 int netdev_master_upper_dev_link_private(struct net_device *dev,
5229 struct net_device *upper_dev,
5230 void *private)
5231 {
5232 return __netdev_upper_dev_link(dev, upper_dev, true, private);
5233 }
5234 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5235
5236 /**
5237 * netdev_upper_dev_unlink - Removes a link to upper device
5238 * @dev: device
5239 * @upper_dev: upper device to remove
5240 *
5241 * Removes a link to a device which is upper to this one. The caller must hold
5242 * the RTNL lock.
5243 */
5244 void netdev_upper_dev_unlink(struct net_device *dev,
5245 struct net_device *upper_dev)
5246 {
5247 struct netdev_adjacent *i, *j;
5248 ASSERT_RTNL();
5249
5250 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5251
5252 /* Here is the tricky part. We must remove all dev's lower
5253 * devices from all upper_dev's upper devices and vice
5254 * versa, to maintain the graph relationship.
5255 */
5256 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5257 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5258 __netdev_adjacent_dev_unlink(i->dev, j->dev);
5259
5260 /* also remove the devices themselves from the lower/upper device
5261 * lists
5262 */
5263 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5264 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5265
5266 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5267 __netdev_adjacent_dev_unlink(dev, i->dev);
5268
5269 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5270 }
5271 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5272
5273 void netdev_adjacent_add_links(struct net_device *dev)
5274 {
5275 struct netdev_adjacent *iter;
5276
5277 struct net *net = dev_net(dev);
5278
5279 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5280 if (!net_eq(net,dev_net(iter->dev)))
5281 continue;
5282 netdev_adjacent_sysfs_add(iter->dev, dev,
5283 &iter->dev->adj_list.lower);
5284 netdev_adjacent_sysfs_add(dev, iter->dev,
5285 &dev->adj_list.upper);
5286 }
5287
5288 list_for_each_entry(iter, &dev->adj_list.lower, list) {
5289 if (!net_eq(net,dev_net(iter->dev)))
5290 continue;
5291 netdev_adjacent_sysfs_add(iter->dev, dev,
5292 &iter->dev->adj_list.upper);
5293 netdev_adjacent_sysfs_add(dev, iter->dev,
5294 &dev->adj_list.lower);
5295 }
5296 }
5297
5298 void netdev_adjacent_del_links(struct net_device *dev)
5299 {
5300 struct netdev_adjacent *iter;
5301
5302 struct net *net = dev_net(dev);
5303
5304 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5305 if (!net_eq(net,dev_net(iter->dev)))
5306 continue;
5307 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5308 &iter->dev->adj_list.lower);
5309 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5310 &dev->adj_list.upper);
5311 }
5312
5313 list_for_each_entry(iter, &dev->adj_list.lower, list) {
5314 if (!net_eq(net,dev_net(iter->dev)))
5315 continue;
5316 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5317 &iter->dev->adj_list.upper);
5318 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5319 &dev->adj_list.lower);
5320 }
5321 }
5322
5323 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5324 {
5325 struct netdev_adjacent *iter;
5326
5327 struct net *net = dev_net(dev);
5328
5329 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5330 if (!net_eq(net,dev_net(iter->dev)))
5331 continue;
5332 netdev_adjacent_sysfs_del(iter->dev, oldname,
5333 &iter->dev->adj_list.lower);
5334 netdev_adjacent_sysfs_add(iter->dev, dev,
5335 &iter->dev->adj_list.lower);
5336 }
5337
5338 list_for_each_entry(iter, &dev->adj_list.lower, list) {
5339 if (!net_eq(net,dev_net(iter->dev)))
5340 continue;
5341 netdev_adjacent_sysfs_del(iter->dev, oldname,
5342 &iter->dev->adj_list.upper);
5343 netdev_adjacent_sysfs_add(iter->dev, dev,
5344 &iter->dev->adj_list.upper);
5345 }
5346 }
5347
5348 void *netdev_lower_dev_get_private(struct net_device *dev,
5349 struct net_device *lower_dev)
5350 {
5351 struct netdev_adjacent *lower;
5352
5353 if (!lower_dev)
5354 return NULL;
5355 lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5356 if (!lower)
5357 return NULL;
5358
5359 return lower->private;
5360 }
5361 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5362
5363
5364 int dev_get_nest_level(struct net_device *dev,
5365 bool (*type_check)(struct net_device *dev))
5366 {
5367 struct net_device *lower = NULL;
5368 struct list_head *iter;
5369 int max_nest = -1;
5370 int nest;
5371
5372 ASSERT_RTNL();
5373
5374 netdev_for_each_lower_dev(dev, lower, iter) {
5375 nest = dev_get_nest_level(lower, type_check);
5376 if (max_nest < nest)
5377 max_nest = nest;
5378 }
5379
5380 if (type_check(dev))
5381 max_nest++;
5382
5383 return max_nest;
5384 }
5385 EXPORT_SYMBOL(dev_get_nest_level);
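
/* Illustrative sketch (not part of dev.c): stacking drivers use
 * dev_get_nest_level() with a type predicate to pick a lockdep nesting depth
 * for their stacked instances.  "is_mydev", "mydev_netdev_ops" and the use of
 * the returned level are hypothetical.
 */
#if 0
static bool is_mydev(struct net_device *dev)
{
        return dev->netdev_ops == &mydev_netdev_ops;
}

static void mydev_note_nest_level(struct net_device *dev)
{
        int nest_level = dev_get_nest_level(dev, is_mydev);

        netdev_dbg(dev, "nest level %d\n", nest_level);
        /* e.g. use nest_level as the lockdep subclass for per-dev locks */
}
#endif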
5386
5387 static void dev_change_rx_flags(struct net_device *dev, int flags)
5388 {
5389 const struct net_device_ops *ops = dev->netdev_ops;
5390
5391 if (ops->ndo_change_rx_flags)
5392 ops->ndo_change_rx_flags(dev, flags);
5393 }
5394
5395 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5396 {
5397 unsigned int old_flags = dev->flags;
5398 kuid_t uid;
5399 kgid_t gid;
5400
5401 ASSERT_RTNL();
5402
5403 dev->flags |= IFF_PROMISC;
5404 dev->promiscuity += inc;
5405 if (dev->promiscuity == 0) {
5406 /*
5407 * Avoid overflow.
5408 * If inc causes an overflow, leave promiscuity untouched and return an error.
5409 */
5410 if (inc < 0)
5411 dev->flags &= ~IFF_PROMISC;
5412 else {
5413 dev->promiscuity -= inc;
5414 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5415 dev->name);
5416 return -EOVERFLOW;
5417 }
5418 }
5419 if (dev->flags != old_flags) {
5420 pr_info("device %s %s promiscuous mode\n",
5421 dev->name,
5422 dev->flags & IFF_PROMISC ? "entered" : "left");
5423 if (audit_enabled) {
5424 current_uid_gid(&uid, &gid);
5425 audit_log(current->audit_context, GFP_ATOMIC,
5426 AUDIT_ANOM_PROMISCUOUS,
5427 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5428 dev->name, (dev->flags & IFF_PROMISC),
5429 (old_flags & IFF_PROMISC),
5430 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5431 from_kuid(&init_user_ns, uid),
5432 from_kgid(&init_user_ns, gid),
5433 audit_get_sessionid(current));
5434 }
5435
5436 dev_change_rx_flags(dev, IFF_PROMISC);
5437 }
5438 if (notify)
5439 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5440 return 0;
5441 }
5442
5443 /**
5444 * dev_set_promiscuity - update promiscuity count on a device
5445 * @dev: device
5446 * @inc: modifier
5447 *
5448 * Add or remove promiscuity from a device. While the count in the device
5449 * remains above zero the interface remains promiscuous. Once it hits zero
5450 * the device reverts back to normal filtering operation. A negative inc
5451 * value is used to drop promiscuity on the device.
5452 * Return 0 if successful or a negative errno code on error.
5453 */
5454 int dev_set_promiscuity(struct net_device *dev, int inc)
5455 {
5456 unsigned int old_flags = dev->flags;
5457 int err;
5458
5459 err = __dev_set_promiscuity(dev, inc, true);
5460 if (err < 0)
5461 return err;
5462 if (dev->flags != old_flags)
5463 dev_set_rx_mode(dev);
5464 return err;
5465 }
5466 EXPORT_SYMBOL(dev_set_promiscuity);
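
/* Illustrative sketch (not part of dev.c): packet-capture style users bump
 * the promiscuity count while they need to see all traffic and drop it again
 * when done, always under RTNL.  The helper names are hypothetical.
 */
#if 0
static int capture_start(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_promiscuity(dev, 1);
        rtnl_unlock();
        return err;
}

static void capture_stop(struct net_device *dev)
{
        rtnl_lock();
        dev_set_promiscuity(dev, -1);
        rtnl_unlock();
}
#endif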
5467
5468 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5469 {
5470 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5471
5472 ASSERT_RTNL();
5473
5474 dev->flags |= IFF_ALLMULTI;
5475 dev->allmulti += inc;
5476 if (dev->allmulti == 0) {
5477 /*
5478 * Avoid overflow.
5479 * If inc causes overflow, untouch allmulti and return error.
5480 */
5481 if (inc < 0)
5482 dev->flags &= ~IFF_ALLMULTI;
5483 else {
5484 dev->allmulti -= inc;
5485 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5486 dev->name);
5487 return -EOVERFLOW;
5488 }
5489 }
5490 if (dev->flags ^ old_flags) {
5491 dev_change_rx_flags(dev, IFF_ALLMULTI);
5492 dev_set_rx_mode(dev);
5493 if (notify)
5494 __dev_notify_flags(dev, old_flags,
5495 dev->gflags ^ old_gflags);
5496 }
5497 return 0;
5498 }
5499
5500 /**
5501 * dev_set_allmulti - update allmulti count on a device
5502 * @dev: device
5503 * @inc: modifier
5504 *
5505 * Add or remove reception of all multicast frames to a device. While the
5506 * count in the device remains above zero the interface remains listening
5507 * to all multicast frames. Once it hits zero the device reverts back to normal
5508 * filtering operation. A negative @inc value is used to drop the counter
5509 * when releasing a resource needing all multicasts.
5510 * Return 0 if successful or a negative errno code on error.
5511 */
5512
5513 int dev_set_allmulti(struct net_device *dev, int inc)
5514 {
5515 return __dev_set_allmulti(dev, inc, true);
5516 }
5517 EXPORT_SYMBOL(dev_set_allmulti);
5518
5519 /*
5520 * Upload unicast and multicast address lists to device and
5521 * configure RX filtering. When the device doesn't support unicast
5522 * filtering it is put in promiscuous mode while unicast addresses
5523 * are present.
5524 */
5525 void __dev_set_rx_mode(struct net_device *dev)
5526 {
5527 const struct net_device_ops *ops = dev->netdev_ops;
5528
5529 /* dev_open will call this function so the list will stay sane. */
5530 if (!(dev->flags&IFF_UP))
5531 return;
5532
5533 if (!netif_device_present(dev))
5534 return;
5535
5536 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5537 /* Unicast address changes may only happen under the rtnl,
5538 * therefore calling __dev_set_promiscuity here is safe.
5539 */
5540 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5541 __dev_set_promiscuity(dev, 1, false);
5542 dev->uc_promisc = true;
5543 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5544 __dev_set_promiscuity(dev, -1, false);
5545 dev->uc_promisc = false;
5546 }
5547 }
5548
5549 if (ops->ndo_set_rx_mode)
5550 ops->ndo_set_rx_mode(dev);
5551 }
5552
5553 void dev_set_rx_mode(struct net_device *dev)
5554 {
5555 netif_addr_lock_bh(dev);
5556 __dev_set_rx_mode(dev);
5557 netif_addr_unlock_bh(dev);
5558 }
5559
5560 /**
5561 * dev_get_flags - get flags reported to userspace
5562 * @dev: device
5563 *
5564 * Get the combination of flag bits exported through APIs to userspace.
5565 */
5566 unsigned int dev_get_flags(const struct net_device *dev)
5567 {
5568 unsigned int flags;
5569
5570 flags = (dev->flags & ~(IFF_PROMISC |
5571 IFF_ALLMULTI |
5572 IFF_RUNNING |
5573 IFF_LOWER_UP |
5574 IFF_DORMANT)) |
5575 (dev->gflags & (IFF_PROMISC |
5576 IFF_ALLMULTI));
5577
5578 if (netif_running(dev)) {
5579 if (netif_oper_up(dev))
5580 flags |= IFF_RUNNING;
5581 if (netif_carrier_ok(dev))
5582 flags |= IFF_LOWER_UP;
5583 if (netif_dormant(dev))
5584 flags |= IFF_DORMANT;
5585 }
5586
5587 return flags;
5588 }
5589 EXPORT_SYMBOL(dev_get_flags);
5590
5591 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5592 {
5593 unsigned int old_flags = dev->flags;
5594 int ret;
5595
5596 ASSERT_RTNL();
5597
5598 /*
5599 * Set the flags on our device.
5600 */
5601
5602 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5603 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5604 IFF_AUTOMEDIA)) |
5605 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5606 IFF_ALLMULTI));
5607
5608 /*
5609 * Load in the correct multicast list now that the flags have changed.
5610 */
5611
5612 if ((old_flags ^ flags) & IFF_MULTICAST)
5613 dev_change_rx_flags(dev, IFF_MULTICAST);
5614
5615 dev_set_rx_mode(dev);
5616
5617 /*
5618 * Have we downed the interface? We handle IFF_UP ourselves
5619 * according to user attempts to set it, rather than blindly
5620 * setting it.
5621 */
5622
5623 ret = 0;
5624 if ((old_flags ^ flags) & IFF_UP)
5625 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5626
5627 if ((flags ^ dev->gflags) & IFF_PROMISC) {
5628 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5629 unsigned int old_flags = dev->flags;
5630
5631 dev->gflags ^= IFF_PROMISC;
5632
5633 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5634 if (dev->flags != old_flags)
5635 dev_set_rx_mode(dev);
5636 }
5637
5638 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5639 is important. Some (broken) drivers set IFF_PROMISC when
5640 IFF_ALLMULTI is requested, without asking us and without reporting it.
5641 */
5642 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5643 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5644
5645 dev->gflags ^= IFF_ALLMULTI;
5646 __dev_set_allmulti(dev, inc, false);
5647 }
5648
5649 return ret;
5650 }
5651
5652 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5653 unsigned int gchanges)
5654 {
5655 unsigned int changes = dev->flags ^ old_flags;
5656
5657 if (gchanges)
5658 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5659
5660 if (changes & IFF_UP) {
5661 if (dev->flags & IFF_UP)
5662 call_netdevice_notifiers(NETDEV_UP, dev);
5663 else
5664 call_netdevice_notifiers(NETDEV_DOWN, dev);
5665 }
5666
5667 if (dev->flags & IFF_UP &&
5668 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5669 struct netdev_notifier_change_info change_info;
5670
5671 change_info.flags_changed = changes;
5672 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5673 &change_info.info);
5674 }
5675 }
5676
5677 /**
5678 * dev_change_flags - change device settings
5679 * @dev: device
5680 * @flags: device state flags
5681 *
5682 * Change settings on a device based on the state flags. The flags are
5683 * in the userspace exported format.
5684 */
5685 int dev_change_flags(struct net_device *dev, unsigned int flags)
5686 {
5687 int ret;
5688 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5689
5690 ret = __dev_change_flags(dev, flags);
5691 if (ret < 0)
5692 return ret;
5693
5694 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5695 __dev_notify_flags(dev, old_flags, changes);
5696 return ret;
5697 }
5698 EXPORT_SYMBOL(dev_change_flags);
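
/* Illustrative sketch (not part of dev.c): bringing an interface up from
 * kernel code mirrors what "ip link set ... up" does: read the
 * userspace-visible flags and set IFF_UP, all under RTNL.
 */
#if 0
static int bring_up(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
        rtnl_unlock();
        return err;
}
#endif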
5699
5700 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5701 {
5702 const struct net_device_ops *ops = dev->netdev_ops;
5703
5704 if (ops->ndo_change_mtu)
5705 return ops->ndo_change_mtu(dev, new_mtu);
5706
5707 dev->mtu = new_mtu;
5708 return 0;
5709 }
5710
5711 /**
5712 * dev_set_mtu - Change maximum transfer unit
5713 * @dev: device
5714 * @new_mtu: new transfer unit
5715 *
5716 * Change the maximum transfer size of the network device.
5717 */
5718 int dev_set_mtu(struct net_device *dev, int new_mtu)
5719 {
5720 int err, orig_mtu;
5721
5722 if (new_mtu == dev->mtu)
5723 return 0;
5724
5725 /* MTU must be positive. */
5726 if (new_mtu < 0)
5727 return -EINVAL;
5728
5729 if (!netif_device_present(dev))
5730 return -ENODEV;
5731
5732 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5733 err = notifier_to_errno(err);
5734 if (err)
5735 return err;
5736
5737 orig_mtu = dev->mtu;
5738 err = __dev_set_mtu(dev, new_mtu);
5739
5740 if (!err) {
5741 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5742 err = notifier_to_errno(err);
5743 if (err) {
5744 /* set the mtu back and notify everyone again,
5745 * so that they have a chance to revert the change.
5746 */
5747 __dev_set_mtu(dev, orig_mtu);
5748 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5749 }
5750 }
5751 return err;
5752 }
5753 EXPORT_SYMBOL(dev_set_mtu);
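
/* Illustrative sketch (not part of dev.c): changing the MTU from kernel code
 * must be done under RTNL; the 9000-byte jumbo value is only an example and
 * may be rejected by the driver's ndo_change_mtu().
 */
#if 0
static int set_jumbo_mtu(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_set_mtu(dev, 9000);
        rtnl_unlock();
        return err;
}
#endif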
5754
5755 /**
5756 * dev_set_group - Change group this device belongs to
5757 * @dev: device
5758 * @new_group: group this device should belong to
5759 */
5760 void dev_set_group(struct net_device *dev, int new_group)
5761 {
5762 dev->group = new_group;
5763 }
5764 EXPORT_SYMBOL(dev_set_group);
5765
5766 /**
5767 * dev_set_mac_address - Change Media Access Control Address
5768 * @dev: device
5769 * @sa: new address
5770 *
5771 * Change the hardware (MAC) address of the device
5772 */
5773 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5774 {
5775 const struct net_device_ops *ops = dev->netdev_ops;
5776 int err;
5777
5778 if (!ops->ndo_set_mac_address)
5779 return -EOPNOTSUPP;
5780 if (sa->sa_family != dev->type)
5781 return -EINVAL;
5782 if (!netif_device_present(dev))
5783 return -ENODEV;
5784 err = ops->ndo_set_mac_address(dev, sa);
5785 if (err)
5786 return err;
5787 dev->addr_assign_type = NET_ADDR_SET;
5788 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5789 add_device_randomness(dev->dev_addr, dev->addr_len);
5790 return 0;
5791 }
5792 EXPORT_SYMBOL(dev_set_mac_address);
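
/* Illustrative sketch (not part of dev.c): callers build a struct sockaddr
 * whose family matches dev->type and hold RTNL.  The locally administered
 * address bytes below are an arbitrary example.
 */
#if 0
static int set_example_mac(struct net_device *dev)
{
        static const u8 addr[ETH_ALEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
        struct sockaddr sa;
        int err;

        sa.sa_family = dev->type;
        memcpy(sa.sa_data, addr, ETH_ALEN);

        rtnl_lock();
        err = dev_set_mac_address(dev, &sa);
        rtnl_unlock();
        return err;
}
#endif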
5793
5794 /**
5795 * dev_change_carrier - Change device carrier
5796 * @dev: device
5797 * @new_carrier: new value
5798 *
5799 * Change device carrier
5800 */
5801 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5802 {
5803 const struct net_device_ops *ops = dev->netdev_ops;
5804
5805 if (!ops->ndo_change_carrier)
5806 return -EOPNOTSUPP;
5807 if (!netif_device_present(dev))
5808 return -ENODEV;
5809 return ops->ndo_change_carrier(dev, new_carrier);
5810 }
5811 EXPORT_SYMBOL(dev_change_carrier);
5812
5813 /**
5814 * dev_get_phys_port_id - Get device physical port ID
5815 * @dev: device
5816 * @ppid: port ID
5817 *
5818 * Get device physical port ID
5819 */
5820 int dev_get_phys_port_id(struct net_device *dev,
5821 struct netdev_phys_port_id *ppid)
5822 {
5823 const struct net_device_ops *ops = dev->netdev_ops;
5824
5825 if (!ops->ndo_get_phys_port_id)
5826 return -EOPNOTSUPP;
5827 return ops->ndo_get_phys_port_id(dev, ppid);
5828 }
5829 EXPORT_SYMBOL(dev_get_phys_port_id);
5830
5831 /**
5832 * dev_new_index - allocate an ifindex
5833 * @net: the applicable net namespace
5834 *
5835 * Returns a suitable unique value for a new device interface
5836 * number. The caller must hold the rtnl semaphore or the
5837 * dev_base_lock to be sure it remains unique.
5838 */
5839 static int dev_new_index(struct net *net)
5840 {
5841 int ifindex = net->ifindex;
5842 for (;;) {
5843 if (++ifindex <= 0)
5844 ifindex = 1;
5845 if (!__dev_get_by_index(net, ifindex))
5846 return net->ifindex = ifindex;
5847 }
5848 }
5849
5850 /* Delayed registration/unregistration */
5851 static LIST_HEAD(net_todo_list);
5852 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5853
5854 static void net_set_todo(struct net_device *dev)
5855 {
5856 list_add_tail(&dev->todo_list, &net_todo_list);
5857 dev_net(dev)->dev_unreg_count++;
5858 }
5859
5860 static void rollback_registered_many(struct list_head *head)
5861 {
5862 struct net_device *dev, *tmp;
5863 LIST_HEAD(close_head);
5864
5865 BUG_ON(dev_boot_phase);
5866 ASSERT_RTNL();
5867
5868 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5869 /* Some devices call us without having been registered,
5870 * to unwind a failed initialization. Remove those
5871 * devices and proceed with the remaining ones.
5872 */
5873 if (dev->reg_state == NETREG_UNINITIALIZED) {
5874 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5875 dev->name, dev);
5876
5877 WARN_ON(1);
5878 list_del(&dev->unreg_list);
5879 continue;
5880 }
5881 dev->dismantle = true;
5882 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5883 }
5884
5885 /* If device is running, close it first. */
5886 list_for_each_entry(dev, head, unreg_list)
5887 list_add_tail(&dev->close_list, &close_head);
5888 dev_close_many(&close_head);
5889
5890 list_for_each_entry(dev, head, unreg_list) {
5891 /* And unlink it from device chain. */
5892 unlist_netdevice(dev);
5893
5894 dev->reg_state = NETREG_UNREGISTERING;
5895 }
5896
5897 synchronize_net();
5898
5899 list_for_each_entry(dev, head, unreg_list) {
5900 /* Shutdown queueing discipline. */
5901 dev_shutdown(dev);
5902
5903
5904 /* Notify protocols that we are about to destroy
5905 this device; they should clean up all of their state.
5906 */
5907 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5908
5909 /*
5910 * Flush the unicast and multicast chains
5911 */
5912 dev_uc_flush(dev);
5913 dev_mc_flush(dev);
5914
5915 if (dev->netdev_ops->ndo_uninit)
5916 dev->netdev_ops->ndo_uninit(dev);
5917
5918 if (!dev->rtnl_link_ops ||
5919 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5920 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5921
5922 /* Notifier chain MUST detach us all upper devices. */
5923 WARN_ON(netdev_has_any_upper_dev(dev));
5924
5925 /* Remove entries from kobject tree */
5926 netdev_unregister_kobject(dev);
5927 #ifdef CONFIG_XPS
5928 /* Remove XPS queueing entries */
5929 netif_reset_xps_queues_gt(dev, 0);
5930 #endif
5931 }
5932
5933 synchronize_net();
5934
5935 list_for_each_entry(dev, head, unreg_list)
5936 dev_put(dev);
5937 }
5938
5939 static void rollback_registered(struct net_device *dev)
5940 {
5941 LIST_HEAD(single);
5942
5943 list_add(&dev->unreg_list, &single);
5944 rollback_registered_many(&single);
5945 list_del(&single);
5946 }
5947
5948 static netdev_features_t netdev_fix_features(struct net_device *dev,
5949 netdev_features_t features)
5950 {
5951 /* Fix illegal checksum combinations */
5952 if ((features & NETIF_F_HW_CSUM) &&
5953 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5954 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5955 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5956 }
5957
5958 /* TSO requires that SG is present as well. */
5959 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5960 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5961 features &= ~NETIF_F_ALL_TSO;
5962 }
5963
5964 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5965 !(features & NETIF_F_IP_CSUM)) {
5966 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5967 features &= ~NETIF_F_TSO;
5968 features &= ~NETIF_F_TSO_ECN;
5969 }
5970
5971 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5972 !(features & NETIF_F_IPV6_CSUM)) {
5973 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5974 features &= ~NETIF_F_TSO6;
5975 }
5976
5977 /* TSO ECN requires that TSO is present as well. */
5978 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5979 features &= ~NETIF_F_TSO_ECN;
5980
5981 /* Software GSO depends on SG. */
5982 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5983 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5984 features &= ~NETIF_F_GSO;
5985 }
5986
5987 /* UFO needs SG and checksumming */
5988 if (features & NETIF_F_UFO) {
5989 /* maybe split UFO into V4 and V6? */
5990 if (!((features & NETIF_F_GEN_CSUM) ||
5991 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5992 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5993 netdev_dbg(dev,
5994 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5995 features &= ~NETIF_F_UFO;
5996 }
5997
5998 if (!(features & NETIF_F_SG)) {
5999 netdev_dbg(dev,
6000 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6001 features &= ~NETIF_F_UFO;
6002 }
6003 }
6004
6005 #ifdef CONFIG_NET_RX_BUSY_POLL
6006 if (dev->netdev_ops->ndo_busy_poll)
6007 features |= NETIF_F_BUSY_POLL;
6008 else
6009 #endif
6010 features &= ~NETIF_F_BUSY_POLL;
6011
6012 return features;
6013 }
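/* Editor's note: illustrative sketch, not part of dev.c. A driver can apply
 * device-specific constraints in its ndo_fix_features() hook, which
 * __netdev_update_features() below runs before the generic fixups in
 * netdev_fix_features(). The foo_* name is hypothetical.
 */
#if 0
static netdev_features_t foo_fix_features(struct net_device *dev,
					  netdev_features_t features)
{
	/* Hypothetical hardware limitation: TSO only works together
	 * with scatter/gather on this device.
	 */
	if (!(features & NETIF_F_SG))
		features &= ~NETIF_F_ALL_TSO;

	return features;
}
#endif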
6014
6015 int __netdev_update_features(struct net_device *dev)
6016 {
6017 netdev_features_t features;
6018 int err = 0;
6019
6020 ASSERT_RTNL();
6021
6022 features = netdev_get_wanted_features(dev);
6023
6024 if (dev->netdev_ops->ndo_fix_features)
6025 features = dev->netdev_ops->ndo_fix_features(dev, features);
6026
6027 /* driver might be less strict about feature dependencies */
6028 features = netdev_fix_features(dev, features);
6029
6030 if (dev->features == features)
6031 return 0;
6032
6033 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6034 &dev->features, &features);
6035
6036 if (dev->netdev_ops->ndo_set_features)
6037 err = dev->netdev_ops->ndo_set_features(dev, features);
6038
6039 if (unlikely(err < 0)) {
6040 netdev_err(dev,
6041 "set_features() failed (%d); wanted %pNF, left %pNF\n",
6042 err, &features, &dev->features);
6043 return -1;
6044 }
6045
6046 if (!err)
6047 dev->features = features;
6048
6049 return 1;
6050 }
6051
6052 /**
6053 * netdev_update_features - recalculate device features
6054 * @dev: the device to check
6055 *
6056 * Recalculate dev->features set and send notifications if it
6057 * has changed. Should be called after driver or hardware dependent
6058 * conditions might have changed that influence the features.
6059 */
6060 void netdev_update_features(struct net_device *dev)
6061 {
6062 if (__netdev_update_features(dev))
6063 netdev_features_change(dev);
6064 }
6065 EXPORT_SYMBOL(netdev_update_features);
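/* Editor's note: illustrative sketch, not part of dev.c. A driver calls
 * netdev_update_features() under RTNL whenever a condition that feeds into
 * its feature set has changed; here a hypothetical MTU-dependent offload.
 * The foo_* name is hypothetical.
 */
#if 0
static int foo_change_mtu(struct net_device *dev, int new_mtu)
{
	dev->mtu = new_mtu;

	/* Re-run ndo_fix_features()/netdev_fix_features() and notify
	 * user space only if the effective feature set changed.
	 */
	netdev_update_features(dev);

	return 0;
}
#endif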
6066
6067 /**
6068 * netdev_change_features - recalculate device features
6069 * @dev: the device to check
6070 *
6071 * Recalculate dev->features set and send notifications even
6072 * if they have not changed. Should be called instead of
6073 * netdev_update_features() if also dev->vlan_features might
6074 * have changed to allow the changes to be propagated to stacked
6075 * VLAN devices.
6076 */
6077 void netdev_change_features(struct net_device *dev)
6078 {
6079 __netdev_update_features(dev);
6080 netdev_features_change(dev);
6081 }
6082 EXPORT_SYMBOL(netdev_change_features);
6083
6084 /**
6085 * netif_stacked_transfer_operstate - transfer operstate
6086 * @rootdev: the root or lower level device to transfer state from
6087 * @dev: the device to transfer operstate to
6088 *
6089 * Transfer operational state from root to device. This is normally
6090 * called when a stacking relationship exists between the root
6091 * device and the device (a leaf device).
6092 */
6093 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6094 struct net_device *dev)
6095 {
6096 if (rootdev->operstate == IF_OPER_DORMANT)
6097 netif_dormant_on(dev);
6098 else
6099 netif_dormant_off(dev);
6100
6101 if (netif_carrier_ok(rootdev)) {
6102 if (!netif_carrier_ok(dev))
6103 netif_carrier_on(dev);
6104 } else {
6105 if (netif_carrier_ok(dev))
6106 netif_carrier_off(dev);
6107 }
6108 }
6109 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
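/* Editor's note: illustrative sketch, not part of dev.c. A stacked device
 * (VLAN/macvlan style) typically propagates the lower device's state from
 * a netdevice notifier. foo_lookup_upper() is a hypothetical helper that
 * maps a lower device to the stacked device on top of it.
 */
#if 0
static int foo_device_event(struct notifier_block *unused,
			    unsigned long event, void *ptr)
{
	struct net_device *lower_dev = netdev_notifier_info_to_dev(ptr);
	struct net_device *upper_dev;

	if (event == NETDEV_CHANGE) {
		upper_dev = foo_lookup_upper(lower_dev);
		if (upper_dev)
			/* carrier and dormant state follow the lower device */
			netif_stacked_transfer_operstate(lower_dev, upper_dev);
	}

	return NOTIFY_DONE;
}
#endif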
6110
6111 #ifdef CONFIG_SYSFS
6112 static int netif_alloc_rx_queues(struct net_device *dev)
6113 {
6114 unsigned int i, count = dev->num_rx_queues;
6115 struct netdev_rx_queue *rx;
6116
6117 BUG_ON(count < 1);
6118
6119 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
6120 if (!rx)
6121 return -ENOMEM;
6122
6123 dev->_rx = rx;
6124
6125 for (i = 0; i < count; i++)
6126 rx[i].dev = dev;
6127 return 0;
6128 }
6129 #endif
6130
6131 static void netdev_init_one_queue(struct net_device *dev,
6132 struct netdev_queue *queue, void *_unused)
6133 {
6134 /* Initialize queue lock */
6135 spin_lock_init(&queue->_xmit_lock);
6136 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6137 queue->xmit_lock_owner = -1;
6138 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6139 queue->dev = dev;
6140 #ifdef CONFIG_BQL
6141 dql_init(&queue->dql, HZ);
6142 #endif
6143 }
6144
6145 static void netif_free_tx_queues(struct net_device *dev)
6146 {
6147 kvfree(dev->_tx);
6148 }
6149
6150 static int netif_alloc_netdev_queues(struct net_device *dev)
6151 {
6152 unsigned int count = dev->num_tx_queues;
6153 struct netdev_queue *tx;
6154 size_t sz = count * sizeof(*tx);
6155
6156 BUG_ON(count < 1 || count > 0xffff);
6157
6158 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6159 if (!tx) {
6160 tx = vzalloc(sz);
6161 if (!tx)
6162 return -ENOMEM;
6163 }
6164 dev->_tx = tx;
6165
6166 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6167 spin_lock_init(&dev->tx_global_lock);
6168
6169 return 0;
6170 }
6171
6172 /**
6173 * register_netdevice - register a network device
6174 * @dev: device to register
6175 *
6176 * Take a completed network device structure and add it to the kernel
6177 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6178 * chain. 0 is returned on success. A negative errno code is returned
6179 * on a failure to set up the device, or if the name is a duplicate.
6180 *
6181 * Callers must hold the rtnl semaphore. You may want
6182 * register_netdev() instead of this.
6183 *
6184 * BUGS:
6185 * The locking appears insufficient to guarantee two parallel registers
6186 * will not get the same name.
6187 */
6188
6189 int register_netdevice(struct net_device *dev)
6190 {
6191 int ret;
6192 struct net *net = dev_net(dev);
6193
6194 BUG_ON(dev_boot_phase);
6195 ASSERT_RTNL();
6196
6197 might_sleep();
6198
6199 /* When net_devices are persistent, this will be fatal. */
6200 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6201 BUG_ON(!net);
6202
6203 spin_lock_init(&dev->addr_list_lock);
6204 netdev_set_addr_lockdep_class(dev);
6205
6206 dev->iflink = -1;
6207
6208 ret = dev_get_valid_name(net, dev, dev->name);
6209 if (ret < 0)
6210 goto out;
6211
6212 /* Init, if this function is available */
6213 if (dev->netdev_ops->ndo_init) {
6214 ret = dev->netdev_ops->ndo_init(dev);
6215 if (ret) {
6216 if (ret > 0)
6217 ret = -EIO;
6218 goto out;
6219 }
6220 }
6221
6222 if (((dev->hw_features | dev->features) &
6223 NETIF_F_HW_VLAN_CTAG_FILTER) &&
6224 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6225 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6226 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6227 ret = -EINVAL;
6228 goto err_uninit;
6229 }
6230
6231 ret = -EBUSY;
6232 if (!dev->ifindex)
6233 dev->ifindex = dev_new_index(net);
6234 else if (__dev_get_by_index(net, dev->ifindex))
6235 goto err_uninit;
6236
6237 if (dev->iflink == -1)
6238 dev->iflink = dev->ifindex;
6239
6240 /* Transfer changeable features to wanted_features and enable
6241 * software offloads (GSO and GRO).
6242 */
6243 dev->hw_features |= NETIF_F_SOFT_FEATURES;
6244 dev->features |= NETIF_F_SOFT_FEATURES;
6245 dev->wanted_features = dev->features & dev->hw_features;
6246
6247 if (!(dev->flags & IFF_LOOPBACK)) {
6248 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6249 }
6250
6251 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6252 */
6253 dev->vlan_features |= NETIF_F_HIGHDMA;
6254
6255 /* Make NETIF_F_SG inheritable to tunnel devices.
6256 */
6257 dev->hw_enc_features |= NETIF_F_SG;
6258
6259 /* Make NETIF_F_SG inheritable to MPLS.
6260 */
6261 dev->mpls_features |= NETIF_F_SG;
6262
6263 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6264 ret = notifier_to_errno(ret);
6265 if (ret)
6266 goto err_uninit;
6267
6268 ret = netdev_register_kobject(dev);
6269 if (ret)
6270 goto err_uninit;
6271 dev->reg_state = NETREG_REGISTERED;
6272
6273 __netdev_update_features(dev);
6274
6275 /*
6276 * Default initial state at registration is that the
6277 * device is present.
6278 */
6279
6280 set_bit(__LINK_STATE_PRESENT, &dev->state);
6281
6282 linkwatch_init_dev(dev);
6283
6284 dev_init_scheduler(dev);
6285 dev_hold(dev);
6286 list_netdevice(dev);
6287 add_device_randomness(dev->dev_addr, dev->addr_len);
6288
6289 /* If the device has a permanent device address, the driver
6290 * should set dev_addr, and addr_assign_type should remain
6291 * NET_ADDR_PERM (the default value).
6292 */
6293 if (dev->addr_assign_type == NET_ADDR_PERM)
6294 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6295
6296 /* Notify protocols that a new device appeared. */
6297 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6298 ret = notifier_to_errno(ret);
6299 if (ret) {
6300 rollback_registered(dev);
6301 dev->reg_state = NETREG_UNREGISTERED;
6302 }
6303 /*
6304 * Prevent userspace races by waiting until the network
6305 * device is fully set up before sending notifications.
6306 */
6307 if (!dev->rtnl_link_ops ||
6308 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6309 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6310
6311 out:
6312 return ret;
6313
6314 err_uninit:
6315 if (dev->netdev_ops->ndo_uninit)
6316 dev->netdev_ops->ndo_uninit(dev);
6317 goto out;
6318 }
6319 EXPORT_SYMBOL(register_netdevice);
6320
6321 /**
6322 * init_dummy_netdev - init a dummy network device for NAPI
6323 * @dev: device to init
6324 *
6325 * This takes a network device structure and initializes the minimum
6326 * number of fields so it can be used to schedule NAPI polls without
6327 * registering a full blown interface. This is to be used by drivers
6328 * that need to tie several hardware interfaces to a single NAPI
6329 * poll scheduler due to HW limitations.
6330 */
6331 int init_dummy_netdev(struct net_device *dev)
6332 {
6333 /* Clear everything. Note we don't initialize spinlocks
6334 * as they aren't supposed to be taken by any of the
6335 * NAPI code and this dummy netdev is supposed to be
6336 * only ever used for NAPI polls.
6337 */
6338 memset(dev, 0, sizeof(struct net_device));
6339
6340 /* make sure we BUG if trying to hit standard
6341 * register/unregister code path
6342 */
6343 dev->reg_state = NETREG_DUMMY;
6344
6345 /* NAPI wants this */
6346 INIT_LIST_HEAD(&dev->napi_list);
6347
6348 /* a dummy interface is started by default */
6349 set_bit(__LINK_STATE_PRESENT, &dev->state);
6350 set_bit(__LINK_STATE_START, &dev->state);
6351
6352 /* Note: We don't allocate pcpu_refcnt for dummy devices,
6353 * because users of this 'device' don't need to change
6354 * its refcount.
6355 */
6356
6357 return 0;
6358 }
6359 EXPORT_SYMBOL_GPL(init_dummy_netdev);
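/* Editor's note: illustrative sketch, not part of dev.c. A driver that has
 * to multiplex several hardware channels onto one poll loop can host its
 * NAPI context on an embedded dummy netdev that is never registered.
 * foo_priv and foo_poll are hypothetical.
 */
#if 0
struct foo_priv {
	struct net_device napi_dev;	/* dummy device, never registered */
	struct napi_struct napi;
};

static void foo_setup_napi(struct foo_priv *priv)
{
	init_dummy_netdev(&priv->napi_dev);
	netif_napi_add(&priv->napi_dev, &priv->napi, foo_poll,
		       NAPI_POLL_WEIGHT);
	napi_enable(&priv->napi);
}
#endif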
6360
6361
6362 /**
6363 * register_netdev - register a network device
6364 * @dev: device to register
6365 *
6366 * Take a completed network device structure and add it to the kernel
6367 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6368 * chain. 0 is returned on success. A negative errno code is returned
6369 * on a failure to set up the device, or if the name is a duplicate.
6370 *
6371 * This is a wrapper around register_netdevice that takes the rtnl semaphore
6372 * and expands the device name if you passed a format string to
6373 * alloc_netdev.
6374 */
6375 int register_netdev(struct net_device *dev)
6376 {
6377 int err;
6378
6379 rtnl_lock();
6380 err = register_netdevice(dev);
6381 rtnl_unlock();
6382 return err;
6383 }
6384 EXPORT_SYMBOL(register_netdev);
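/* Editor's note: illustrative sketch, not part of dev.c. The usual probe
 * pattern around register_netdev(); foo_priv, foo_netdev_ops and the
 * reduced error handling are hypothetical.
 */
#if 0
static int foo_probe(struct device *parent)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(sizeof(struct foo_priv));
	if (!dev)
		return -ENOMEM;

	dev->netdev_ops = &foo_netdev_ops;
	SET_NETDEV_DEV(dev, parent);

	err = register_netdev(dev);	/* takes and releases rtnl_lock */
	if (err) {
		free_netdev(dev);
		return err;
	}

	return 0;
}
#endif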
6385
6386 int netdev_refcnt_read(const struct net_device *dev)
6387 {
6388 int i, refcnt = 0;
6389
6390 for_each_possible_cpu(i)
6391 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6392 return refcnt;
6393 }
6394 EXPORT_SYMBOL(netdev_refcnt_read);
6395
6396 /**
6397 * netdev_wait_allrefs - wait until all references are gone.
6398 * @dev: target net_device
6399 *
6400 * This is called when unregistering network devices.
6401 *
6402 * Any protocol or device that holds a reference should register
6403 * for netdevice notification, and clean up and put back the
6404 * reference when it receives an UNREGISTER event.
6405 * We can get stuck here if buggy protocols don't correctly
6406 * call dev_put.
6407 */
6408 static void netdev_wait_allrefs(struct net_device *dev)
6409 {
6410 unsigned long rebroadcast_time, warning_time;
6411 int refcnt;
6412
6413 linkwatch_forget_dev(dev);
6414
6415 rebroadcast_time = warning_time = jiffies;
6416 refcnt = netdev_refcnt_read(dev);
6417
6418 while (refcnt != 0) {
6419 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6420 rtnl_lock();
6421
6422 /* Rebroadcast unregister notification */
6423 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6424
6425 __rtnl_unlock();
6426 rcu_barrier();
6427 rtnl_lock();
6428
6429 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6430 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6431 &dev->state)) {
6432 /* We must not have linkwatch events
6433 * pending on unregister. If this
6434 * happens, we simply run the queue
6435 * unscheduled, resulting in a noop
6436 * for this device.
6437 */
6438 linkwatch_run_queue();
6439 }
6440
6441 __rtnl_unlock();
6442
6443 rebroadcast_time = jiffies;
6444 }
6445
6446 msleep(250);
6447
6448 refcnt = netdev_refcnt_read(dev);
6449
6450 if (time_after(jiffies, warning_time + 10 * HZ)) {
6451 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6452 dev->name, refcnt);
6453 warning_time = jiffies;
6454 }
6455 }
6456 }
6457
6458 /* The sequence is:
6459 *
6460 * rtnl_lock();
6461 * ...
6462 * register_netdevice(x1);
6463 * register_netdevice(x2);
6464 * ...
6465 * unregister_netdevice(y1);
6466 * unregister_netdevice(y2);
6467 * ...
6468 * rtnl_unlock();
6469 * free_netdev(y1);
6470 * free_netdev(y2);
6471 *
6472 * We are invoked by rtnl_unlock().
6473 * This allows us to deal with problems:
6474 * 1) We can delete sysfs objects which invoke hotplug
6475 * without deadlocking with linkwatch via keventd.
6476 * 2) Since we run with the RTNL semaphore not held, we can sleep
6477 * safely in order to wait for the netdev refcnt to drop to zero.
6478 *
6479 * We must not return until all unregister events added during
6480 * the interval the lock was held have been completed.
6481 */
6482 void netdev_run_todo(void)
6483 {
6484 struct list_head list;
6485
6486 /* Snapshot list, allow later requests */
6487 list_replace_init(&net_todo_list, &list);
6488
6489 __rtnl_unlock();
6490
6491
6492 /* Wait for rcu callbacks to finish before next phase */
6493 if (!list_empty(&list))
6494 rcu_barrier();
6495
6496 while (!list_empty(&list)) {
6497 struct net_device *dev
6498 = list_first_entry(&list, struct net_device, todo_list);
6499 list_del(&dev->todo_list);
6500
6501 rtnl_lock();
6502 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6503 __rtnl_unlock();
6504
6505 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6506 pr_err("network todo '%s' but state %d\n",
6507 dev->name, dev->reg_state);
6508 dump_stack();
6509 continue;
6510 }
6511
6512 dev->reg_state = NETREG_UNREGISTERED;
6513
6514 on_each_cpu(flush_backlog, dev, 1);
6515
6516 netdev_wait_allrefs(dev);
6517
6518 /* paranoia */
6519 BUG_ON(netdev_refcnt_read(dev));
6520 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6521 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6522 WARN_ON(dev->dn_ptr);
6523
6524 if (dev->destructor)
6525 dev->destructor(dev);
6526
6527 /* Report a network device has been unregistered */
6528 rtnl_lock();
6529 dev_net(dev)->dev_unreg_count--;
6530 __rtnl_unlock();
6531 wake_up(&netdev_unregistering_wq);
6532
6533 /* Free network device */
6534 kobject_put(&dev->dev.kobj);
6535 }
6536 }
6537
6538 /* Convert net_device_stats to rtnl_link_stats64. They have the same
6539 * fields in the same order, with only the type differing.
6540 */
6541 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6542 const struct net_device_stats *netdev_stats)
6543 {
6544 #if BITS_PER_LONG == 64
6545 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6546 memcpy(stats64, netdev_stats, sizeof(*stats64));
6547 #else
6548 size_t i, n = sizeof(*stats64) / sizeof(u64);
6549 const unsigned long *src = (const unsigned long *)netdev_stats;
6550 u64 *dst = (u64 *)stats64;
6551
6552 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6553 sizeof(*stats64) / sizeof(u64));
6554 for (i = 0; i < n; i++)
6555 dst[i] = src[i];
6556 #endif
6557 }
6558 EXPORT_SYMBOL(netdev_stats_to_stats64);
6559
6560 /**
6561 * dev_get_stats - get network device statistics
6562 * @dev: device to get statistics from
6563 * @storage: place to store stats
6564 *
6565 * Get network statistics from device. Return @storage.
6566 * The device driver may provide its own method by setting
6567 * dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
6568 * otherwise the internal statistics structure is used.
6569 */
6570 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6571 struct rtnl_link_stats64 *storage)
6572 {
6573 const struct net_device_ops *ops = dev->netdev_ops;
6574
6575 if (ops->ndo_get_stats64) {
6576 memset(storage, 0, sizeof(*storage));
6577 ops->ndo_get_stats64(dev, storage);
6578 } else if (ops->ndo_get_stats) {
6579 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6580 } else {
6581 netdev_stats_to_stats64(storage, &dev->stats);
6582 }
6583 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6584 storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6585 return storage;
6586 }
6587 EXPORT_SYMBOL(dev_get_stats);
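/* Editor's note: illustrative sketch, not part of dev.c. A driver keeping
 * its own 64-bit counters provides ndo_get_stats64 and fills the storage
 * that dev_get_stats() pre-zeroes above; the foo_priv counters are
 * hypothetical.
 */
#if 0
static struct rtnl_link_stats64 *foo_get_stats64(struct net_device *dev,
						 struct rtnl_link_stats64 *storage)
{
	struct foo_priv *priv = netdev_priv(dev);

	storage->rx_packets = priv->rx_packets;
	storage->rx_bytes = priv->rx_bytes;
	storage->tx_packets = priv->tx_packets;
	storage->tx_bytes = priv->tx_bytes;

	return storage;
}
#endif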
6588
6589 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6590 {
6591 struct netdev_queue *queue = dev_ingress_queue(dev);
6592
6593 #ifdef CONFIG_NET_CLS_ACT
6594 if (queue)
6595 return queue;
6596 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6597 if (!queue)
6598 return NULL;
6599 netdev_init_one_queue(dev, queue, NULL);
6600 queue->qdisc = &noop_qdisc;
6601 queue->qdisc_sleeping = &noop_qdisc;
6602 rcu_assign_pointer(dev->ingress_queue, queue);
6603 #endif
6604 return queue;
6605 }
6606
6607 static const struct ethtool_ops default_ethtool_ops;
6608
6609 void netdev_set_default_ethtool_ops(struct net_device *dev,
6610 const struct ethtool_ops *ops)
6611 {
6612 if (dev->ethtool_ops == &default_ethtool_ops)
6613 dev->ethtool_ops = ops;
6614 }
6615 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6616
6617 void netdev_freemem(struct net_device *dev)
6618 {
6619 char *addr = (char *)dev - dev->padded;
6620
6621 kvfree(addr);
6622 }
6623
6624 /**
6625 * alloc_netdev_mqs - allocate network device
6626 * @sizeof_priv: size of private data to allocate space for
6627 * @name: device name format string
6628 * @name_assign_type: origin of device name
6629 * @setup: callback to initialize device
6630 * @txqs: the number of TX subqueues to allocate
6631 * @rxqs: the number of RX subqueues to allocate
6632 *
6633 * Allocates a struct net_device with private data area for driver use
6634 * and performs basic initialization. Also allocates subqueue structs
6635 * for each queue on the device.
6636 */
6637 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6638 unsigned char name_assign_type,
6639 void (*setup)(struct net_device *),
6640 unsigned int txqs, unsigned int rxqs)
6641 {
6642 struct net_device *dev;
6643 size_t alloc_size;
6644 struct net_device *p;
6645
6646 BUG_ON(strlen(name) >= sizeof(dev->name));
6647
6648 if (txqs < 1) {
6649 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6650 return NULL;
6651 }
6652
6653 #ifdef CONFIG_SYSFS
6654 if (rxqs < 1) {
6655 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6656 return NULL;
6657 }
6658 #endif
6659
6660 alloc_size = sizeof(struct net_device);
6661 if (sizeof_priv) {
6662 /* ensure 32-byte alignment of private area */
6663 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6664 alloc_size += sizeof_priv;
6665 }
6666 /* ensure 32-byte alignment of whole construct */
6667 alloc_size += NETDEV_ALIGN - 1;
6668
6669 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6670 if (!p)
6671 p = vzalloc(alloc_size);
6672 if (!p)
6673 return NULL;
6674
6675 dev = PTR_ALIGN(p, NETDEV_ALIGN);
6676 dev->padded = (char *)dev - (char *)p;
6677
6678 dev->pcpu_refcnt = alloc_percpu(int);
6679 if (!dev->pcpu_refcnt)
6680 goto free_dev;
6681
6682 if (dev_addr_init(dev))
6683 goto free_pcpu;
6684
6685 dev_mc_init(dev);
6686 dev_uc_init(dev);
6687
6688 dev_net_set(dev, &init_net);
6689
6690 dev->gso_max_size = GSO_MAX_SIZE;
6691 dev->gso_max_segs = GSO_MAX_SEGS;
6692 dev->gso_min_segs = 0;
6693
6694 INIT_LIST_HEAD(&dev->napi_list);
6695 INIT_LIST_HEAD(&dev->unreg_list);
6696 INIT_LIST_HEAD(&dev->close_list);
6697 INIT_LIST_HEAD(&dev->link_watch_list);
6698 INIT_LIST_HEAD(&dev->adj_list.upper);
6699 INIT_LIST_HEAD(&dev->adj_list.lower);
6700 INIT_LIST_HEAD(&dev->all_adj_list.upper);
6701 INIT_LIST_HEAD(&dev->all_adj_list.lower);
6702 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6703 setup(dev);
6704
6705 dev->num_tx_queues = txqs;
6706 dev->real_num_tx_queues = txqs;
6707 if (netif_alloc_netdev_queues(dev))
6708 goto free_all;
6709
6710 #ifdef CONFIG_SYSFS
6711 dev->num_rx_queues = rxqs;
6712 dev->real_num_rx_queues = rxqs;
6713 if (netif_alloc_rx_queues(dev))
6714 goto free_all;
6715 #endif
6716
6717 strcpy(dev->name, name);
6718 dev->name_assign_type = name_assign_type;
6719 dev->group = INIT_NETDEV_GROUP;
6720 if (!dev->ethtool_ops)
6721 dev->ethtool_ops = &default_ethtool_ops;
6722 return dev;
6723
6724 free_all:
6725 free_netdev(dev);
6726 return NULL;
6727
6728 free_pcpu:
6729 free_percpu(dev->pcpu_refcnt);
6730 free_dev:
6731 netdev_freemem(dev);
6732 return NULL;
6733 }
6734 EXPORT_SYMBOL(alloc_netdev_mqs);
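/* Editor's note: illustrative sketch, not part of dev.c. A multiqueue
 * Ethernet-like device would be allocated roughly as below; the
 * alloc_netdev()/alloc_etherdev() helpers are thin wrappers around
 * alloc_netdev_mqs() with one queue of each kind. foo_priv, the "foo%d"
 * template and the queue counts are hypothetical.
 */
#if 0
static struct net_device *foo_alloc(void)
{
	return alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
				NET_NAME_UNKNOWN, ether_setup,
				8 /* txqs */, 8 /* rxqs */);
}
#endif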
6735
6736 /**
6737 * free_netdev - free network device
6738 * @dev: device
6739 *
6740 * This function does the last stage of destroying an allocated device
6741 * interface. The reference to the device object is released.
6742 * If this is the last reference then it will be freed.
6743 */
6744 void free_netdev(struct net_device *dev)
6745 {
6746 struct napi_struct *p, *n;
6747
6748 release_net(dev_net(dev));
6749
6750 netif_free_tx_queues(dev);
6751 #ifdef CONFIG_SYSFS
6752 kfree(dev->_rx);
6753 #endif
6754
6755 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6756
6757 /* Flush device addresses */
6758 dev_addr_flush(dev);
6759
6760 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6761 netif_napi_del(p);
6762
6763 free_percpu(dev->pcpu_refcnt);
6764 dev->pcpu_refcnt = NULL;
6765
6766 /* Compatibility with error handling in drivers */
6767 if (dev->reg_state == NETREG_UNINITIALIZED) {
6768 netdev_freemem(dev);
6769 return;
6770 }
6771
6772 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6773 dev->reg_state = NETREG_RELEASED;
6774
6775 /* will free via device release */
6776 put_device(&dev->dev);
6777 }
6778 EXPORT_SYMBOL(free_netdev);
6779
6780 /**
6781 * synchronize_net - Synchronize with packet receive processing
6782 *
6783 * Wait for packets currently being received to be done.
6784 * Does not block later packets from starting.
6785 */
6786 void synchronize_net(void)
6787 {
6788 might_sleep();
6789 if (rtnl_is_locked())
6790 synchronize_rcu_expedited();
6791 else
6792 synchronize_rcu();
6793 }
6794 EXPORT_SYMBOL(synchronize_net);
6795
6796 /**
6797 * unregister_netdevice_queue - remove device from the kernel
6798 * @dev: device
6799 * @head: list
6800 *
6801 * This function shuts down a device interface and removes it
6802 * from the kernel tables.
6803 * If @head is not NULL, the device is queued to be unregistered later.
6804 *
6805 * Callers must hold the rtnl semaphore. You may want
6806 * unregister_netdev() instead of this.
6807 */
6808
6809 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6810 {
6811 ASSERT_RTNL();
6812
6813 if (head) {
6814 list_move_tail(&dev->unreg_list, head);
6815 } else {
6816 rollback_registered(dev);
6817 /* Finish processing unregister after unlock */
6818 net_set_todo(dev);
6819 }
6820 }
6821 EXPORT_SYMBOL(unregister_netdevice_queue);
6822
6823 /**
6824 * unregister_netdevice_many - unregister many devices
6825 * @head: list of devices
6826 *
6827 * Note: As most callers use a stack-allocated list_head,
6828 * we force a list_del() to make sure the stack won't be corrupted later.
6829 */
6830 void unregister_netdevice_many(struct list_head *head)
6831 {
6832 struct net_device *dev;
6833
6834 if (!list_empty(head)) {
6835 rollback_registered_many(head);
6836 list_for_each_entry(dev, head, unreg_list)
6837 net_set_todo(dev);
6838 list_del(head);
6839 }
6840 }
6841 EXPORT_SYMBOL(unregister_netdevice_many);
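/* Editor's note: illustrative sketch, not part of dev.c. Queueing several
 * devices and unregistering them in one batch amortizes the
 * synchronize_net() calls in rollback_registered_many(). The
 * foo_netdev_ops test used to pick out this driver's devices is
 * hypothetical.
 */
#if 0
static void foo_destroy_all(struct net *net)
{
	struct net_device *dev, *aux;
	LIST_HEAD(kill_list);

	rtnl_lock();
	for_each_netdev_safe(net, dev, aux)
		if (dev->netdev_ops == &foo_netdev_ops)
			unregister_netdevice_queue(dev, &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}
#endif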
6842
6843 /**
6844 * unregister_netdev - remove device from the kernel
6845 * @dev: device
6846 *
6847 * This function shuts down a device interface and removes it
6848 * from the kernel tables.
6849 *
6850 * This is just a wrapper for unregister_netdevice that takes
6851 * the rtnl semaphore. In general you want to use this and not
6852 * unregister_netdevice.
6853 */
6854 void unregister_netdev(struct net_device *dev)
6855 {
6856 rtnl_lock();
6857 unregister_netdevice(dev);
6858 rtnl_unlock();
6859 }
6860 EXPORT_SYMBOL(unregister_netdev);
6861
6862 /**
6863 * dev_change_net_namespace - move device to a different network namespace
6864 * @dev: device
6865 * @net: network namespace
6866 * @pat: If not NULL name pattern to try if the current device name
6867 * is already taken in the destination network namespace.
6868 *
6869 * This function shuts down a device interface and moves it
6870 * to a new network namespace. On success 0 is returned, on
6871 * a failure a negative errno code is returned.
6872 *
6873 * Callers must hold the rtnl semaphore.
6874 */
6875
6876 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6877 {
6878 int err;
6879
6880 ASSERT_RTNL();
6881
6882 /* Don't allow namespace local devices to be moved. */
6883 err = -EINVAL;
6884 if (dev->features & NETIF_F_NETNS_LOCAL)
6885 goto out;
6886
6887 /* Ensure the device has been registered */
6888 if (dev->reg_state != NETREG_REGISTERED)
6889 goto out;
6890
6891 /* Get out if there is nothing to do */
6892 err = 0;
6893 if (net_eq(dev_net(dev), net))
6894 goto out;
6895
6896 /* Pick the destination device name, and ensure
6897 * we can use it in the destination network namespace.
6898 */
6899 err = -EEXIST;
6900 if (__dev_get_by_name(net, dev->name)) {
6901 /* We get here if we can't use the current device name */
6902 if (!pat)
6903 goto out;
6904 if (dev_get_valid_name(net, dev, pat) < 0)
6905 goto out;
6906 }
6907
6908 /*
6909 * And now a mini version of register_netdevice and unregister_netdevice.
6910 */
6911
6912 /* If device is running close it first. */
6913 dev_close(dev);
6914
6915 /* And unlink it from device chain */
6916 err = -ENODEV;
6917 unlist_netdevice(dev);
6918
6919 synchronize_net();
6920
6921 /* Shutdown queueing discipline. */
6922 dev_shutdown(dev);
6923
6924 /* Notify protocols that we are about to destroy
6925 this device. They should clean up all their state.
6926
6927 Note that dev->reg_state stays at NETREG_REGISTERED.
6928 This is wanted because this way 8021q and macvlan know
6929 the device is just moving and can keep their slaves up.
6930 */
6931 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6932 rcu_barrier();
6933 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6934 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6935
6936 /*
6937 * Flush the unicast and multicast chains
6938 */
6939 dev_uc_flush(dev);
6940 dev_mc_flush(dev);
6941
6942 /* Send a netdev-removed uevent to the old namespace */
6943 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6944 netdev_adjacent_del_links(dev);
6945
6946 /* Actually switch the network namespace */
6947 dev_net_set(dev, net);
6948
6949 /* If there is an ifindex conflict assign a new one */
6950 if (__dev_get_by_index(net, dev->ifindex)) {
6951 int iflink = (dev->iflink == dev->ifindex);
6952 dev->ifindex = dev_new_index(net);
6953 if (iflink)
6954 dev->iflink = dev->ifindex;
6955 }
6956
6957 /* Send a netdev-add uevent to the new namespace */
6958 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6959 netdev_adjacent_add_links(dev);
6960
6961 /* Fixup kobjects */
6962 err = device_rename(&dev->dev, dev->name);
6963 WARN_ON(err);
6964
6965 /* Add the device back in the hashes */
6966 list_netdevice(dev);
6967
6968 /* Notify protocols that a new device appeared. */
6969 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6970
6971 /*
6972 * Prevent userspace races by waiting until the network
6973 * device is fully set up before sending notifications.
6974 */
6975 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6976
6977 synchronize_net();
6978 err = 0;
6979 out:
6980 return err;
6981 }
6982 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6983
6984 static int dev_cpu_callback(struct notifier_block *nfb,
6985 unsigned long action,
6986 void *ocpu)
6987 {
6988 struct sk_buff **list_skb;
6989 struct sk_buff *skb;
6990 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6991 struct softnet_data *sd, *oldsd;
6992
6993 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6994 return NOTIFY_OK;
6995
6996 local_irq_disable();
6997 cpu = smp_processor_id();
6998 sd = &per_cpu(softnet_data, cpu);
6999 oldsd = &per_cpu(softnet_data, oldcpu);
7000
7001 /* Find end of our completion_queue. */
7002 list_skb = &sd->completion_queue;
7003 while (*list_skb)
7004 list_skb = &(*list_skb)->next;
7005 /* Append completion queue from offline CPU. */
7006 *list_skb = oldsd->completion_queue;
7007 oldsd->completion_queue = NULL;
7008
7009 /* Append output queue from offline CPU. */
7010 if (oldsd->output_queue) {
7011 *sd->output_queue_tailp = oldsd->output_queue;
7012 sd->output_queue_tailp = oldsd->output_queue_tailp;
7013 oldsd->output_queue = NULL;
7014 oldsd->output_queue_tailp = &oldsd->output_queue;
7015 }
7016 /* Append NAPI poll list from offline CPU. */
7017 if (!list_empty(&oldsd->poll_list)) {
7018 list_splice_init(&oldsd->poll_list, &sd->poll_list);
7019 raise_softirq_irqoff(NET_RX_SOFTIRQ);
7020 }
7021
7022 raise_softirq_irqoff(NET_TX_SOFTIRQ);
7023 local_irq_enable();
7024
7025 /* Process offline CPU's input_pkt_queue */
7026 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7027 netif_rx_internal(skb);
7028 input_queue_head_incr(oldsd);
7029 }
7030 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
7031 netif_rx_internal(skb);
7032 input_queue_head_incr(oldsd);
7033 }
7034
7035 return NOTIFY_OK;
7036 }
7037
7038
7039 /**
7040 * netdev_increment_features - increment feature set by one
7041 * @all: current feature set
7042 * @one: new feature set
7043 * @mask: mask feature set
7044 *
7045 * Computes a new feature set after adding a device with feature set
7046 * @one to the master device with current feature set @all. Will not
7047 * enable anything that is off in @mask. Returns the new feature set.
7048 */
7049 netdev_features_t netdev_increment_features(netdev_features_t all,
7050 netdev_features_t one, netdev_features_t mask)
7051 {
7052 if (mask & NETIF_F_GEN_CSUM)
7053 mask |= NETIF_F_ALL_CSUM;
7054 mask |= NETIF_F_VLAN_CHALLENGED;
7055
7056 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7057 all &= one | ~NETIF_F_ALL_FOR_ALL;
7058
7059 /* If one device supports hw checksumming, set for all. */
7060 if (all & NETIF_F_GEN_CSUM)
7061 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7062
7063 return all;
7064 }
7065 EXPORT_SYMBOL(netdev_increment_features);
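/* Editor's note: simplified illustrative sketch, not part of dev.c. An
 * aggregating device (bonding/bridge style) folds each lower device into
 * its feature set with netdev_increment_features(). The foo_master/
 * foo_slave structures and the choice of starting set and mask are
 * hypothetical.
 */
#if 0
static void foo_compute_features(struct foo_master *master)
{
	netdev_features_t features = NETIF_F_ALL_FOR_ALL;
	struct foo_slave *slave;

	list_for_each_entry(slave, &master->slave_list, list)
		features = netdev_increment_features(features,
						     slave->dev->features,
						     master->dev->hw_features);

	master->dev->features = features;
	netdev_features_change(master->dev);
}
#endif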
7066
7067 static struct hlist_head * __net_init netdev_create_hash(void)
7068 {
7069 int i;
7070 struct hlist_head *hash;
7071
7072 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7073 if (hash != NULL)
7074 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7075 INIT_HLIST_HEAD(&hash[i]);
7076
7077 return hash;
7078 }
7079
7080 /* Initialize per network namespace state */
7081 static int __net_init netdev_init(struct net *net)
7082 {
7083 if (net != &init_net)
7084 INIT_LIST_HEAD(&net->dev_base_head);
7085
7086 net->dev_name_head = netdev_create_hash();
7087 if (net->dev_name_head == NULL)
7088 goto err_name;
7089
7090 net->dev_index_head = netdev_create_hash();
7091 if (net->dev_index_head == NULL)
7092 goto err_idx;
7093
7094 return 0;
7095
7096 err_idx:
7097 kfree(net->dev_name_head);
7098 err_name:
7099 return -ENOMEM;
7100 }
7101
7102 /**
7103 * netdev_drivername - network driver for the device
7104 * @dev: network device
7105 *
7106 * Determine network driver for device.
7107 */
7108 const char *netdev_drivername(const struct net_device *dev)
7109 {
7110 const struct device_driver *driver;
7111 const struct device *parent;
7112 const char *empty = "";
7113
7114 parent = dev->dev.parent;
7115 if (!parent)
7116 return empty;
7117
7118 driver = parent->driver;
7119 if (driver && driver->name)
7120 return driver->name;
7121 return empty;
7122 }
7123
7124 static void __netdev_printk(const char *level, const struct net_device *dev,
7125 struct va_format *vaf)
7126 {
7127 if (dev && dev->dev.parent) {
7128 dev_printk_emit(level[1] - '0',
7129 dev->dev.parent,
7130 "%s %s %s%s: %pV",
7131 dev_driver_string(dev->dev.parent),
7132 dev_name(dev->dev.parent),
7133 netdev_name(dev), netdev_reg_state(dev),
7134 vaf);
7135 } else if (dev) {
7136 printk("%s%s%s: %pV",
7137 level, netdev_name(dev), netdev_reg_state(dev), vaf);
7138 } else {
7139 printk("%s(NULL net_device): %pV", level, vaf);
7140 }
7141 }
7142
7143 void netdev_printk(const char *level, const struct net_device *dev,
7144 const char *format, ...)
7145 {
7146 struct va_format vaf;
7147 va_list args;
7148
7149 va_start(args, format);
7150
7151 vaf.fmt = format;
7152 vaf.va = &args;
7153
7154 __netdev_printk(level, dev, &vaf);
7155
7156 va_end(args);
7157 }
7158 EXPORT_SYMBOL(netdev_printk);
7159
7160 #define define_netdev_printk_level(func, level) \
7161 void func(const struct net_device *dev, const char *fmt, ...) \
7162 { \
7163 struct va_format vaf; \
7164 va_list args; \
7165 \
7166 va_start(args, fmt); \
7167 \
7168 vaf.fmt = fmt; \
7169 vaf.va = &args; \
7170 \
7171 __netdev_printk(level, dev, &vaf); \
7172 \
7173 va_end(args); \
7174 } \
7175 EXPORT_SYMBOL(func);
7176
7177 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7178 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7179 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7180 define_netdev_printk_level(netdev_err, KERN_ERR);
7181 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7182 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7183 define_netdev_printk_level(netdev_info, KERN_INFO);
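/* Editor's note: illustrative sketch, not part of dev.c. Drivers use the
 * wrappers generated above so that messages carry the driver, bus and
 * interface name prefix; foo_dev, speed and err are hypothetical.
 */
#if 0
static void foo_report_link(struct net_device *foo_dev, unsigned int speed,
			    int err)
{
	if (err)
		netdev_err(foo_dev, "link setup failed: %d\n", err);
	else
		netdev_info(foo_dev, "link up, %u Mbps\n", speed);
}
#endif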
7184
7185 static void __net_exit netdev_exit(struct net *net)
7186 {
7187 kfree(net->dev_name_head);
7188 kfree(net->dev_index_head);
7189 }
7190
7191 static struct pernet_operations __net_initdata netdev_net_ops = {
7192 .init = netdev_init,
7193 .exit = netdev_exit,
7194 };
7195
7196 static void __net_exit default_device_exit(struct net *net)
7197 {
7198 struct net_device *dev, *aux;
7199 /*
7200 * Push all migratable network devices back to the
7201 * initial network namespace
7202 */
7203 rtnl_lock();
7204 for_each_netdev_safe(net, dev, aux) {
7205 int err;
7206 char fb_name[IFNAMSIZ];
7207
7208 /* Ignore unmoveable devices (i.e. loopback) */
7209 if (dev->features & NETIF_F_NETNS_LOCAL)
7210 continue;
7211
7212 /* Leave virtual devices for the generic cleanup */
7213 if (dev->rtnl_link_ops)
7214 continue;
7215
7216 /* Push remaining network devices to init_net */
7217 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7218 err = dev_change_net_namespace(dev, &init_net, fb_name);
7219 if (err) {
7220 pr_emerg("%s: failed to move %s to init_net: %d\n",
7221 __func__, dev->name, err);
7222 BUG();
7223 }
7224 }
7225 rtnl_unlock();
7226 }
7227
7228 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7229 {
7230 /* Return with the rtnl_lock held when there are no network
7231 * devices unregistering in any network namespace in net_list.
7232 */
7233 struct net *net;
7234 bool unregistering;
7235 DEFINE_WAIT(wait);
7236
7237 for (;;) {
7238 prepare_to_wait(&netdev_unregistering_wq, &wait,
7239 TASK_UNINTERRUPTIBLE);
7240 unregistering = false;
7241 rtnl_lock();
7242 list_for_each_entry(net, net_list, exit_list) {
7243 if (net->dev_unreg_count > 0) {
7244 unregistering = true;
7245 break;
7246 }
7247 }
7248 if (!unregistering)
7249 break;
7250 __rtnl_unlock();
7251 schedule();
7252 }
7253 finish_wait(&netdev_unregistering_wq, &wait);
7254 }
7255
7256 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7257 {
7258 /* At exit all network devices must be removed from a network
7259 * namespace. Do this in the reverse order of registration.
7260 * Do this across as many network namespaces as possible to
7261 * improve batching efficiency.
7262 */
7263 struct net_device *dev;
7264 struct net *net;
7265 LIST_HEAD(dev_kill_list);
7266
7267 /* To prevent network device cleanup code from dereferencing
7268 * loopback devices or network devices that have been freed,
7269 * wait here for all pending unregistrations to complete
7270 * before unregistering the loopback device and allowing the
7271 * network namespace to be freed.
7272 *
7273 * The netdev todo list containing all network devices
7274 * unregistrations that happen in default_device_exit_batch
7275 * will run in the rtnl_unlock() at the end of
7276 * default_device_exit_batch.
7277 */
7278 rtnl_lock_unregistering(net_list);
7279 list_for_each_entry(net, net_list, exit_list) {
7280 for_each_netdev_reverse(net, dev) {
7281 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7282 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7283 else
7284 unregister_netdevice_queue(dev, &dev_kill_list);
7285 }
7286 }
7287 unregister_netdevice_many(&dev_kill_list);
7288 rtnl_unlock();
7289 }
7290
7291 static struct pernet_operations __net_initdata default_device_ops = {
7292 .exit = default_device_exit,
7293 .exit_batch = default_device_exit_batch,
7294 };
7295
7296 /*
7297 * Initialize the DEV module. At boot time this walks the device list and
7298 * unhooks any devices that fail to initialise (normally hardware not
7299 * present) and leaves us with a valid list of present and active devices.
7300 *
7301 */
7302
7303 /*
7304 * This is called single threaded during boot, so no need
7305 * to take the rtnl semaphore.
7306 */
7307 static int __init net_dev_init(void)
7308 {
7309 int i, rc = -ENOMEM;
7310
7311 BUG_ON(!dev_boot_phase);
7312
7313 if (dev_proc_init())
7314 goto out;
7315
7316 if (netdev_kobject_init())
7317 goto out;
7318
7319 INIT_LIST_HEAD(&ptype_all);
7320 for (i = 0; i < PTYPE_HASH_SIZE; i++)
7321 INIT_LIST_HEAD(&ptype_base[i]);
7322
7323 INIT_LIST_HEAD(&offload_base);
7324
7325 if (register_pernet_subsys(&netdev_net_ops))
7326 goto out;
7327
7328 /*
7329 * Initialise the packet receive queues.
7330 */
7331
7332 for_each_possible_cpu(i) {
7333 struct softnet_data *sd = &per_cpu(softnet_data, i);
7334
7335 skb_queue_head_init(&sd->input_pkt_queue);
7336 skb_queue_head_init(&sd->process_queue);
7337 INIT_LIST_HEAD(&sd->poll_list);
7338 sd->output_queue_tailp = &sd->output_queue;
7339 #ifdef CONFIG_RPS
7340 sd->csd.func = rps_trigger_softirq;
7341 sd->csd.info = sd;
7342 sd->cpu = i;
7343 #endif
7344
7345 sd->backlog.poll = process_backlog;
7346 sd->backlog.weight = weight_p;
7347 }
7348
7349 dev_boot_phase = 0;
7350
7351 /* The loopback device is special: if any other network device
7352 * is present in a network namespace, the loopback device must
7353 * be present too. Since we now dynamically allocate and free the
7354 * loopback device, ensure this invariant is maintained by
7355 * keeping the loopback device the first device on the
7356 * list of network devices, so that it is the first device
7357 * that appears and the last network device
7358 * that disappears.
7359 */
7360 if (register_pernet_device(&loopback_net_ops))
7361 goto out;
7362
7363 if (register_pernet_device(&default_device_ops))
7364 goto out;
7365
7366 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7367 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7368
7369 hotcpu_notifier(dev_cpu_callback, 0);
7370 dst_init();
7371 rc = 0;
7372 out:
7373 return rc;
7374 }
7375
7376 subsys_initcall(net_dev_init);