net: remove delay at device dismantle
[deliverable/linux.git] / net / core / dev.c
1 /*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/proc_fs.h>
101 #include <linux/seq_file.h>
102 #include <linux/stat.h>
103 #include <net/dst.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <net/xfrm.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/kmod.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 #include <linux/if_vlan.h>
123 #include <linux/ip.h>
124 #include <net/ip.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 #include <trace/events/napi.h>
130 #include <trace/events/net.h>
131 #include <trace/events/skb.h>
132 #include <linux/pci.h>
133 #include <linux/inetdevice.h>
134 #include <linux/cpu_rmap.h>
135 #include <linux/net_tstamp.h>
136 #include <linux/static_key.h>
137 #include <net/flow_keys.h>
138
139 #include "net-sysfs.h"
140
141 /* Instead of increasing this, you should create a hash table. */
142 #define MAX_GRO_SKBS 8
143
144 /* This should be increased if a protocol with a bigger head is added. */
145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
146
147 /*
148 * The list of packet types we will receive (as opposed to discard)
149 * and the routines to invoke.
150 *
151 * Why 16. Because with 16 the only overlap we get on a hash of the
152 * low nibble of the protocol value is RARP/SNAP/X.25.
153 *
154 * NOTE: That is no longer true with the addition of VLAN tags. Not
155 * sure which should go first, but I bet it won't make much
156 * difference if we are running VLANs. The good news is that
157 * this protocol won't be in the list unless compiled in, so
158 * the average user (w/out VLANs) will not be adversely affected.
159 * --BLG
160 *
161 * 0800 IP
162 * 8100 802.1Q VLAN
163 * 0001 802.3
164 * 0002 AX.25
165 * 0004 802.2
166 * 8035 RARP
167 * 0005 SNAP
168 * 0805 X.25
169 * 0806 ARP
170 * 8137 IPX
171 * 0009 Localtalk
172 * 86DD IPv6
173 */
174
175 #define PTYPE_HASH_SIZE (16)
176 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
177
178 static DEFINE_SPINLOCK(ptype_lock);
179 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
180 static struct list_head ptype_all __read_mostly; /* Taps */
181
182 /*
183 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
184 * semaphore.
185 *
186 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
187 *
188 * Writers must hold the rtnl semaphore while they loop through the
189 * dev_base_head list, and hold dev_base_lock for writing when they do the
190 * actual updates. This allows pure readers to access the list even
191 * while a writer is preparing to update it.
192 *
193 * To put it another way, dev_base_lock is held for writing only to
194 * protect against pure readers; the rtnl semaphore provides the
195 * protection against other writers.
196 *
197 * See, for example usages, register_netdevice() and
198 * unregister_netdevice(), which must be called with the rtnl
199 * semaphore held.
200 */
201 DEFINE_RWLOCK(dev_base_lock);
202 EXPORT_SYMBOL(dev_base_lock);
203
204 static inline void dev_base_seq_inc(struct net *net)
205 {
206 while (++net->dev_base_seq == 0);
207 }
208
209 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
210 {
211 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
212
213 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
214 }
215
216 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
217 {
218 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
219 }
220
/*
 * Take the lock of @sd's input packet queue.
 * Compiles to nothing unless CONFIG_RPS is set.
 */
static inline void rps_lock(struct softnet_data *sd)
{
#if defined(CONFIG_RPS)
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}
227
/*
 * Release the lock of @sd's input packet queue taken by rps_lock().
 * Compiles to nothing unless CONFIG_RPS is set.
 */
static inline void rps_unlock(struct softnet_data *sd)
{
#if defined(CONFIG_RPS)
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}
234
235 /* Device list insertion */
236 static int list_netdevice(struct net_device *dev)
237 {
238 struct net *net = dev_net(dev);
239
240 ASSERT_RTNL();
241
242 write_lock_bh(&dev_base_lock);
243 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
244 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
245 hlist_add_head_rcu(&dev->index_hlist,
246 dev_index_hash(net, dev->ifindex));
247 write_unlock_bh(&dev_base_lock);
248
249 dev_base_seq_inc(net);
250
251 return 0;
252 }
253
254 /* Device list removal
255 * caller must respect a RCU grace period before freeing/reusing dev
256 */
257 static void unlist_netdevice(struct net_device *dev)
258 {
259 ASSERT_RTNL();
260
261 /* Unlink dev from the device chain */
262 write_lock_bh(&dev_base_lock);
263 list_del_rcu(&dev->dev_list);
264 hlist_del_rcu(&dev->name_hlist);
265 hlist_del_rcu(&dev->index_hlist);
266 write_unlock_bh(&dev_base_lock);
267
268 dev_base_seq_inc(dev_net(dev));
269 }
270
271 /*
272 * Our notifier list
273 */
274
275 static RAW_NOTIFIER_HEAD(netdev_chain);
276
277 /*
278 * Device drivers call our routines to queue packets here. We empty the
279 * queue in the local softnet handler.
280 */
281
282 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
283 EXPORT_PER_CPU_SYMBOL(softnet_data);
284
285 #ifdef CONFIG_LOCKDEP
286 /*
287 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
288 * according to dev->type
289 */
290 static const unsigned short netdev_lock_type[] =
291 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
292 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
293 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
294 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
295 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
296 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
297 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
298 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
299 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
300 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
301 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
302 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
303 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
304 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
305 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
306
307 static const char *const netdev_lock_name[] =
308 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
309 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
310 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
311 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
312 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
313 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
314 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
315 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
316 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
317 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
318 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
319 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
320 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
321 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
322 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
323
324 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
325 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
326
327 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
328 {
329 int i;
330
331 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
332 if (netdev_lock_type[i] == dev_type)
333 return i;
334 /* the last key is used by default */
335 return ARRAY_SIZE(netdev_lock_type) - 1;
336 }
337
338 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
339 unsigned short dev_type)
340 {
341 int i;
342
343 i = netdev_lock_pos(dev_type);
344 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
345 netdev_lock_name[i]);
346 }
347
348 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
349 {
350 int i;
351
352 i = netdev_lock_pos(dev->type);
353 lockdep_set_class_and_name(&dev->addr_list_lock,
354 &netdev_addr_lock_key[i],
355 netdev_lock_name[i]);
356 }
357 #else
358 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
359 unsigned short dev_type)
360 {
361 }
362 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
363 {
364 }
365 #endif
366
367 /*******************************************************************************
368
369 Protocol management and registration routines
370
371 *******************************************************************************/
372
373 /*
374 * Add a protocol ID to the list. Now that the input handler is
375 * smarter we can dispense with all the messy stuff that used to be
376 * here.
377 *
378 * BEWARE!!! Protocol handlers, mangling input packets,
379 * MUST BE last in hash buckets and checking protocol handlers
380 * MUST start from promiscuous ptype_all chain in net_bh.
381 * It is true now, do not change it.
382 * Explanation follows: if protocol handler, mangling packet, will
383 * be the first on list, it is not able to sense, that packet
384 * is cloned and should be copied-on-write, so that it will
385 * change it and subsequent readers will get broken packet.
386 * --ANK (980803)
387 */
388
389 static inline struct list_head *ptype_head(const struct packet_type *pt)
390 {
391 if (pt->type == htons(ETH_P_ALL))
392 return &ptype_all;
393 else
394 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
395 }
396
397 /**
398 * dev_add_pack - add packet handler
399 * @pt: packet type declaration
400 *
401 * Add a protocol handler to the networking stack. The passed &packet_type
402 * is linked into kernel lists and may not be freed until it has been
403 * removed from the kernel lists.
404 *
405 * This call does not sleep therefore it can not
406 * guarantee all CPU's that are in middle of receiving packets
407 * will see the new packet type (until the next received packet).
408 */
409
410 void dev_add_pack(struct packet_type *pt)
411 {
412 struct list_head *head = ptype_head(pt);
413
414 spin_lock(&ptype_lock);
415 list_add_rcu(&pt->list, head);
416 spin_unlock(&ptype_lock);
417 }
418 EXPORT_SYMBOL(dev_add_pack);
419
420 /**
421 * __dev_remove_pack - remove packet handler
422 * @pt: packet type declaration
423 *
424 * Remove a protocol handler that was previously added to the kernel
425 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
426 * from the kernel lists and can be freed or reused once this function
427 * returns.
428 *
429 * The packet type might still be in use by receivers
430 * and must not be freed until after all the CPU's have gone
431 * through a quiescent state.
432 */
433 void __dev_remove_pack(struct packet_type *pt)
434 {
435 struct list_head *head = ptype_head(pt);
436 struct packet_type *pt1;
437
438 spin_lock(&ptype_lock);
439
440 list_for_each_entry(pt1, head, list) {
441 if (pt == pt1) {
442 list_del_rcu(&pt->list);
443 goto out;
444 }
445 }
446
447 pr_warn("dev_remove_pack: %p not found\n", pt);
448 out:
449 spin_unlock(&ptype_lock);
450 }
451 EXPORT_SYMBOL(__dev_remove_pack);
452
/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Unlink a protocol handler that was previously registered with
 *	dev_add_pack(), then wait via synchronize_net().  Once this
 *	function returns, @pt can be freed or reused.
 *
 *	This call sleeps to guarantee that no CPU is looking at the
 *	packet type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	/* Unlink first ... */
	__dev_remove_pack(pt);

	/* ... then wait until no receiver can still see it */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
472
473 /******************************************************************************
474
475 Device Boot-time Settings Routines
476
477 *******************************************************************************/
478
479 /* Boot time configuration table */
480 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
481
482 /**
483 * netdev_boot_setup_add - add new setup entry
484 * @name: name of the device
485 * @map: configured settings for the device
486 *
487 * Adds new setup entry to the dev_boot_setup list. The function
488 * returns 0 on error and 1 on success. This is a generic routine to
489 * all netdevices.
490 */
491 static int netdev_boot_setup_add(char *name, struct ifmap *map)
492 {
493 struct netdev_boot_setup *s;
494 int i;
495
496 s = dev_boot_setup;
497 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
498 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
499 memset(s[i].name, 0, sizeof(s[i].name));
500 strlcpy(s[i].name, name, IFNAMSIZ);
501 memcpy(&s[i].map, map, sizeof(s[i].map));
502 break;
503 }
504 }
505
506 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
507 }
508
509 /**
510 * netdev_boot_setup_check - check boot time settings
511 * @dev: the netdevice
512 *
513 * Check boot time settings for the device.
514 * The found settings are set for the device to be used
515 * later in the device probing.
516 * Returns 0 if no settings found, 1 if they are.
517 */
518 int netdev_boot_setup_check(struct net_device *dev)
519 {
520 struct netdev_boot_setup *s = dev_boot_setup;
521 int i;
522
523 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
524 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
525 !strcmp(dev->name, s[i].name)) {
526 dev->irq = s[i].map.irq;
527 dev->base_addr = s[i].map.base_addr;
528 dev->mem_start = s[i].map.mem_start;
529 dev->mem_end = s[i].map.mem_end;
530 return 1;
531 }
532 }
533 return 0;
534 }
535 EXPORT_SYMBOL(netdev_boot_setup_check);
536
537
538 /**
539 * netdev_boot_base - get address from boot time settings
540 * @prefix: prefix for network device
541 * @unit: id for network device
542 *
543 * Check boot time settings for the base address of device.
544 * The found settings are set for the device to be used
545 * later in the device probing.
546 * Returns 0 if no settings found.
547 */
548 unsigned long netdev_boot_base(const char *prefix, int unit)
549 {
550 const struct netdev_boot_setup *s = dev_boot_setup;
551 char name[IFNAMSIZ];
552 int i;
553
554 sprintf(name, "%s%d", prefix, unit);
555
556 /*
557 * If device already registered then return base of 1
558 * to indicate not to probe for this interface
559 */
560 if (__dev_get_by_name(&init_net, name))
561 return 1;
562
563 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
564 if (!strcmp(name, s[i].name))
565 return s[i].map.base_addr;
566 return 0;
567 }
568
569 /*
570 * Saves at boot time configured settings for any netdevice.
571 */
572 int __init netdev_boot_setup(char *str)
573 {
574 int ints[5];
575 struct ifmap map;
576
577 str = get_options(str, ARRAY_SIZE(ints), ints);
578 if (!str || !*str)
579 return 0;
580
581 /* Save settings */
582 memset(&map, 0, sizeof(map));
583 if (ints[0] > 0)
584 map.irq = ints[1];
585 if (ints[0] > 1)
586 map.base_addr = ints[2];
587 if (ints[0] > 2)
588 map.mem_start = ints[3];
589 if (ints[0] > 3)
590 map.mem_end = ints[4];
591
592 /* Add new entry to the list */
593 return netdev_boot_setup_add(str, &map);
594 }
595
596 __setup("netdev=", netdev_boot_setup);
597
598 /*******************************************************************************
599
600 Device Interface Subroutines
601
602 *******************************************************************************/
603
604 /**
605 * __dev_get_by_name - find a device by its name
606 * @net: the applicable net namespace
607 * @name: name to find
608 *
609 * Find an interface by name. Must be called under RTNL semaphore
610 * or @dev_base_lock. If the name is found a pointer to the device
611 * is returned. If the name is not found then %NULL is returned. The
612 * reference counters are not incremented so the caller must be
613 * careful with locks.
614 */
615
616 struct net_device *__dev_get_by_name(struct net *net, const char *name)
617 {
618 struct hlist_node *p;
619 struct net_device *dev;
620 struct hlist_head *head = dev_name_hash(net, name);
621
622 hlist_for_each_entry(dev, p, head, name_hlist)
623 if (!strncmp(dev->name, name, IFNAMSIZ))
624 return dev;
625
626 return NULL;
627 }
628 EXPORT_SYMBOL(__dev_get_by_name);
629
630 /**
631 * dev_get_by_name_rcu - find a device by its name
632 * @net: the applicable net namespace
633 * @name: name to find
634 *
635 * Find an interface by name.
636 * If the name is found a pointer to the device is returned.
637 * If the name is not found then %NULL is returned.
638 * The reference counters are not incremented so the caller must be
639 * careful with locks. The caller must hold RCU lock.
640 */
641
642 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
643 {
644 struct hlist_node *p;
645 struct net_device *dev;
646 struct hlist_head *head = dev_name_hash(net, name);
647
648 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
649 if (!strncmp(dev->name, name, IFNAMSIZ))
650 return dev;
651
652 return NULL;
653 }
654 EXPORT_SYMBOL(dev_get_by_name_rcu);
655
656 /**
657 * dev_get_by_name - find a device by its name
658 * @net: the applicable net namespace
659 * @name: name to find
660 *
661 * Find an interface by name. This can be called from any
662 * context and does its own locking. The returned handle has
663 * the usage count incremented and the caller must use dev_put() to
664 * release it when it is no longer needed. %NULL is returned if no
665 * matching device is found.
666 */
667
668 struct net_device *dev_get_by_name(struct net *net, const char *name)
669 {
670 struct net_device *dev;
671
672 rcu_read_lock();
673 dev = dev_get_by_name_rcu(net, name);
674 if (dev)
675 dev_hold(dev);
676 rcu_read_unlock();
677 return dev;
678 }
679 EXPORT_SYMBOL(dev_get_by_name);
680
681 /**
682 * __dev_get_by_index - find a device by its ifindex
683 * @net: the applicable net namespace
684 * @ifindex: index of device
685 *
686 * Search for an interface by index. Returns %NULL if the device
687 * is not found or a pointer to the device. The device has not
688 * had its reference counter increased so the caller must be careful
689 * about locking. The caller must hold either the RTNL semaphore
690 * or @dev_base_lock.
691 */
692
693 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
694 {
695 struct hlist_node *p;
696 struct net_device *dev;
697 struct hlist_head *head = dev_index_hash(net, ifindex);
698
699 hlist_for_each_entry(dev, p, head, index_hlist)
700 if (dev->ifindex == ifindex)
701 return dev;
702
703 return NULL;
704 }
705 EXPORT_SYMBOL(__dev_get_by_index);
706
707 /**
708 * dev_get_by_index_rcu - find a device by its ifindex
709 * @net: the applicable net namespace
710 * @ifindex: index of device
711 *
712 * Search for an interface by index. Returns %NULL if the device
713 * is not found or a pointer to the device. The device has not
714 * had its reference counter increased so the caller must be careful
715 * about locking. The caller must hold RCU lock.
716 */
717
718 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
719 {
720 struct hlist_node *p;
721 struct net_device *dev;
722 struct hlist_head *head = dev_index_hash(net, ifindex);
723
724 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
725 if (dev->ifindex == ifindex)
726 return dev;
727
728 return NULL;
729 }
730 EXPORT_SYMBOL(dev_get_by_index_rcu);
731
732
733 /**
734 * dev_get_by_index - find a device by its ifindex
735 * @net: the applicable net namespace
736 * @ifindex: index of device
737 *
738 * Search for an interface by index. Returns NULL if the device
739 * is not found or a pointer to the device. The device returned has
740 * had a reference added and the pointer is safe until the user calls
741 * dev_put to indicate they have finished with it.
742 */
743
744 struct net_device *dev_get_by_index(struct net *net, int ifindex)
745 {
746 struct net_device *dev;
747
748 rcu_read_lock();
749 dev = dev_get_by_index_rcu(net, ifindex);
750 if (dev)
751 dev_hold(dev);
752 rcu_read_unlock();
753 return dev;
754 }
755 EXPORT_SYMBOL(dev_get_by_index);
756
757 /**
758 * dev_getbyhwaddr_rcu - find a device by its hardware address
759 * @net: the applicable net namespace
760 * @type: media type of device
761 * @ha: hardware address
762 *
763 * Search for an interface by MAC address. Returns NULL if the device
764 * is not found or a pointer to the device.
765 * The caller must hold RCU or RTNL.
766 * The returned device has not had its ref count increased
767 * and the caller must therefore be careful about locking
768 *
769 */
770
771 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
772 const char *ha)
773 {
774 struct net_device *dev;
775
776 for_each_netdev_rcu(net, dev)
777 if (dev->type == type &&
778 !memcmp(dev->dev_addr, ha, dev->addr_len))
779 return dev;
780
781 return NULL;
782 }
783 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
784
785 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
786 {
787 struct net_device *dev;
788
789 ASSERT_RTNL();
790 for_each_netdev(net, dev)
791 if (dev->type == type)
792 return dev;
793
794 return NULL;
795 }
796 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
797
798 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
799 {
800 struct net_device *dev, *ret = NULL;
801
802 rcu_read_lock();
803 for_each_netdev_rcu(net, dev)
804 if (dev->type == type) {
805 dev_hold(dev);
806 ret = dev;
807 break;
808 }
809 rcu_read_unlock();
810 return ret;
811 }
812 EXPORT_SYMBOL(dev_getfirstbyhwtype);
813
814 /**
815 * dev_get_by_flags_rcu - find any device with given flags
816 * @net: the applicable net namespace
817 * @if_flags: IFF_* values
818 * @mask: bitmask of bits in if_flags to check
819 *
820 * Search for any interface with the given flags. Returns NULL if a device
821 * is not found or a pointer to the device. Must be called inside
822 * rcu_read_lock(), and result refcount is unchanged.
823 */
824
825 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
826 unsigned short mask)
827 {
828 struct net_device *dev, *ret;
829
830 ret = NULL;
831 for_each_netdev_rcu(net, dev) {
832 if (((dev->flags ^ if_flags) & mask) == 0) {
833 ret = dev;
834 break;
835 }
836 }
837 return ret;
838 }
839 EXPORT_SYMBOL(dev_get_by_flags_rcu);
840
/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to allow
 *	sysfs to work.  We also disallow any kind of whitespace.
 *
 *	Returns true if @name is non-empty, shorter than IFNAMSIZ, is
 *	neither "." nor "..", and contains no '/' or whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	/*
	 * Bounded length check: never read past IFNAMSIZ bytes, so a
	 * buffer without a terminating NUL is rejected instead of
	 * being over-read.
	 */
	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);
866
867 /**
868 * __dev_alloc_name - allocate a name for a device
869 * @net: network namespace to allocate the device name in
870 * @name: name format string
871 * @buf: scratch buffer and result name string
872 *
873 * Passed a format string - eg "lt%d" it will try and find a suitable
874 * id. It scans list of devices to build up a free map, then chooses
875 * the first empty slot. The caller must hold the dev_base or rtnl lock
876 * while allocating the name and adding the device in order to avoid
877 * duplicates.
878 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
879 * Returns the number of the unit assigned or a negative errno code.
880 */
881
882 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
883 {
884 int i = 0;
885 const char *p;
886 const int max_netdevices = 8*PAGE_SIZE;
887 unsigned long *inuse;
888 struct net_device *d;
889
890 p = strnchr(name, IFNAMSIZ-1, '%');
891 if (p) {
892 /*
893 * Verify the string as this thing may have come from
894 * the user. There must be either one "%d" and no other "%"
895 * characters.
896 */
897 if (p[1] != 'd' || strchr(p + 2, '%'))
898 return -EINVAL;
899
900 /* Use one page as a bit array of possible slots */
901 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
902 if (!inuse)
903 return -ENOMEM;
904
905 for_each_netdev(net, d) {
906 if (!sscanf(d->name, name, &i))
907 continue;
908 if (i < 0 || i >= max_netdevices)
909 continue;
910
911 /* avoid cases where sscanf is not exact inverse of printf */
912 snprintf(buf, IFNAMSIZ, name, i);
913 if (!strncmp(buf, d->name, IFNAMSIZ))
914 set_bit(i, inuse);
915 }
916
917 i = find_first_zero_bit(inuse, max_netdevices);
918 free_page((unsigned long) inuse);
919 }
920
921 if (buf != name)
922 snprintf(buf, IFNAMSIZ, name, i);
923 if (!__dev_get_by_name(net, buf))
924 return i;
925
926 /* It is possible to run out of possible slots
927 * when the name is long and there isn't enough space left
928 * for the digits, or if all bits are used.
929 */
930 return -ENFILE;
931 }
932
933 /**
934 * dev_alloc_name - allocate a name for a device
935 * @dev: device
936 * @name: name format string
937 *
938 * Passed a format string - eg "lt%d" it will try and find a suitable
939 * id. It scans list of devices to build up a free map, then chooses
940 * the first empty slot. The caller must hold the dev_base or rtnl lock
941 * while allocating the name and adding the device in order to avoid
942 * duplicates.
943 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
944 * Returns the number of the unit assigned or a negative errno code.
945 */
946
947 int dev_alloc_name(struct net_device *dev, const char *name)
948 {
949 char buf[IFNAMSIZ];
950 struct net *net;
951 int ret;
952
953 BUG_ON(!dev_net(dev));
954 net = dev_net(dev);
955 ret = __dev_alloc_name(net, name, buf);
956 if (ret >= 0)
957 strlcpy(dev->name, buf, IFNAMSIZ);
958 return ret;
959 }
960 EXPORT_SYMBOL(dev_alloc_name);
961
962 static int dev_get_valid_name(struct net_device *dev, const char *name)
963 {
964 struct net *net;
965
966 BUG_ON(!dev_net(dev));
967 net = dev_net(dev);
968
969 if (!dev_valid_name(name))
970 return -EINVAL;
971
972 if (strchr(name, '%'))
973 return dev_alloc_name(dev, name);
974 else if (__dev_get_by_name(net, name))
975 return -EEXIST;
976 else if (dev->name != name)
977 strlcpy(dev->name, name, IFNAMSIZ);
978
979 return 0;
980 }
981
982 /**
983 * dev_change_name - change name of a device
984 * @dev: device
985 * @newname: name (or format string) must be at least IFNAMSIZ
986 *
987 * Change name of a device, can pass format strings "eth%d".
988 * for wildcarding.
989 */
990 int dev_change_name(struct net_device *dev, const char *newname)
991 {
992 char oldname[IFNAMSIZ];
993 int err = 0;
994 int ret;
995 struct net *net;
996
997 ASSERT_RTNL();
998 BUG_ON(!dev_net(dev));
999
1000 net = dev_net(dev);
1001 if (dev->flags & IFF_UP)
1002 return -EBUSY;
1003
1004 if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1005 return 0;
1006
1007 memcpy(oldname, dev->name, IFNAMSIZ);
1008
1009 err = dev_get_valid_name(dev, newname);
1010 if (err < 0)
1011 return err;
1012
1013 rollback:
1014 ret = device_rename(&dev->dev, dev->name);
1015 if (ret) {
1016 memcpy(dev->name, oldname, IFNAMSIZ);
1017 return ret;
1018 }
1019
1020 write_lock_bh(&dev_base_lock);
1021 hlist_del_rcu(&dev->name_hlist);
1022 write_unlock_bh(&dev_base_lock);
1023
1024 synchronize_rcu();
1025
1026 write_lock_bh(&dev_base_lock);
1027 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1028 write_unlock_bh(&dev_base_lock);
1029
1030 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1031 ret = notifier_to_errno(ret);
1032
1033 if (ret) {
1034 /* err >= 0 after dev_alloc_name() or stores the first errno */
1035 if (err >= 0) {
1036 err = ret;
1037 memcpy(dev->name, oldname, IFNAMSIZ);
1038 goto rollback;
1039 } else {
1040 pr_err("%s: name change rollback failed: %d\n",
1041 dev->name, ret);
1042 }
1043 }
1044
1045 return err;
1046 }
1047
1048 /**
1049 * dev_set_alias - change ifalias of a device
1050 * @dev: device
1051 * @alias: name up to IFALIASZ
1052 * @len: limit of bytes to copy from info
1053 *
1054 * Set ifalias for a device,
1055 */
1056 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1057 {
1058 char *new_ifalias;
1059
1060 ASSERT_RTNL();
1061
1062 if (len >= IFALIASZ)
1063 return -EINVAL;
1064
1065 if (!len) {
1066 if (dev->ifalias) {
1067 kfree(dev->ifalias);
1068 dev->ifalias = NULL;
1069 }
1070 return 0;
1071 }
1072
1073 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1074 if (!new_ifalias)
1075 return -ENOMEM;
1076 dev->ifalias = new_ifalias;
1077
1078 strlcpy(dev->ifalias, alias, len+1);
1079 return len;
1080 }
1081
1082
1083 /**
1084 * netdev_features_change - device changes features
1085 * @dev: device to cause notification
1086 *
1087 * Called to indicate a device has changed features.
1088 */
1089 void netdev_features_change(struct net_device *dev)
1090 {
1091 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1092 }
1093 EXPORT_SYMBOL(netdev_features_change);
1094
1095 /**
1096 * netdev_state_change - device changes state
1097 * @dev: device to cause notification
1098 *
1099 * Called to indicate a device has changed state. This function calls
1100 * the notifier chains for netdev_chain and sends a NEWLINK message
1101 * to the routing socket.
1102 */
1103 void netdev_state_change(struct net_device *dev)
1104 {
1105 if (dev->flags & IFF_UP) {
1106 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1107 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1108 }
1109 }
1110 EXPORT_SYMBOL(netdev_state_change);
1111
1112 /**
1113 * netdev_notify_peers - notify network peers about existence of @dev
1114 * @dev: network device
1115 *
1116 * Generate traffic such that interested network peers are aware of
1117 * @dev, such as by generating a gratuitous ARP. This may be used when
1118 * a device wants to inform the rest of the network about some sort of
1119 * reconfiguration such as a failover event or virtual machine
1120 * migration.
1121 */
1122 void netdev_notify_peers(struct net_device *dev)
1123 {
1124 rtnl_lock();
1125 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1126 rtnl_unlock();
1127 }
1128 EXPORT_SYMBOL(netdev_notify_peers);
1129
1130 /**
1131 * dev_load - load a network module
1132 * @net: the applicable net namespace
1133 * @name: name of interface
1134 *
1135 * If a network interface is not present and the process has suitable
1136 * privileges this function loads the module. If module loading is not
1137 * available in this kernel then it becomes a nop.
1138 */
1139
1140 void dev_load(struct net *net, const char *name)
1141 {
1142 struct net_device *dev;
1143 int no_module;
1144
1145 rcu_read_lock();
1146 dev = dev_get_by_name_rcu(net, name);
1147 rcu_read_unlock();
1148
1149 no_module = !dev;
1150 if (no_module && capable(CAP_NET_ADMIN))
1151 no_module = request_module("netdev-%s", name);
1152 if (no_module && capable(CAP_SYS_MODULE)) {
1153 if (!request_module("%s", name))
1154 pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1155 name);
1156 }
1157 }
1158 EXPORT_SYMBOL(dev_load);
1159
1160 static int __dev_open(struct net_device *dev)
1161 {
1162 const struct net_device_ops *ops = dev->netdev_ops;
1163 int ret;
1164
1165 ASSERT_RTNL();
1166
1167 if (!netif_device_present(dev))
1168 return -ENODEV;
1169
1170 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1171 ret = notifier_to_errno(ret);
1172 if (ret)
1173 return ret;
1174
1175 set_bit(__LINK_STATE_START, &dev->state);
1176
1177 if (ops->ndo_validate_addr)
1178 ret = ops->ndo_validate_addr(dev);
1179
1180 if (!ret && ops->ndo_open)
1181 ret = ops->ndo_open(dev);
1182
1183 if (ret)
1184 clear_bit(__LINK_STATE_START, &dev->state);
1185 else {
1186 dev->flags |= IFF_UP;
1187 net_dmaengine_get();
1188 dev_set_rx_mode(dev);
1189 dev_activate(dev);
1190 add_device_randomness(dev->dev_addr, dev->addr_len);
1191 }
1192
1193 return ret;
1194 }
1195
1196 /**
1197 * dev_open - prepare an interface for use.
1198 * @dev: device to open
1199 *
1200 * Takes a device from down to up state. The device's private open
1201 * function is invoked and then the multicast lists are loaded. Finally
1202 * the device is moved into the up state and a %NETDEV_UP message is
1203 * sent to the netdev notifier chain.
1204 *
1205 * Calling this function on an active interface is a nop. On a failure
1206 * a negative errno code is returned.
1207 */
1208 int dev_open(struct net_device *dev)
1209 {
1210 int ret;
1211
1212 if (dev->flags & IFF_UP)
1213 return 0;
1214
1215 ret = __dev_open(dev);
1216 if (ret < 0)
1217 return ret;
1218
1219 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1220 call_netdevice_notifiers(NETDEV_UP, dev);
1221
1222 return ret;
1223 }
1224 EXPORT_SYMBOL(dev_open);
1225
1226 static int __dev_close_many(struct list_head *head)
1227 {
1228 struct net_device *dev;
1229
1230 ASSERT_RTNL();
1231 might_sleep();
1232
1233 list_for_each_entry(dev, head, unreg_list) {
1234 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1235
1236 clear_bit(__LINK_STATE_START, &dev->state);
1237
1238 /* Synchronize to scheduled poll. We cannot touch poll list, it
1239 * can be even on different cpu. So just clear netif_running().
1240 *
1241 * dev->stop() will invoke napi_disable() on all of it's
1242 * napi_struct instances on this device.
1243 */
1244 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1245 }
1246
1247 dev_deactivate_many(head);
1248
1249 list_for_each_entry(dev, head, unreg_list) {
1250 const struct net_device_ops *ops = dev->netdev_ops;
1251
1252 /*
1253 * Call the device specific close. This cannot fail.
1254 * Only if device is UP
1255 *
1256 * We allow it to be called even after a DETACH hot-plug
1257 * event.
1258 */
1259 if (ops->ndo_stop)
1260 ops->ndo_stop(dev);
1261
1262 dev->flags &= ~IFF_UP;
1263 net_dmaengine_put();
1264 }
1265
1266 return 0;
1267 }
1268
1269 static int __dev_close(struct net_device *dev)
1270 {
1271 int retval;
1272 LIST_HEAD(single);
1273
1274 list_add(&dev->unreg_list, &single);
1275 retval = __dev_close_many(&single);
1276 list_del(&single);
1277 return retval;
1278 }
1279
1280 static int dev_close_many(struct list_head *head)
1281 {
1282 struct net_device *dev, *tmp;
1283 LIST_HEAD(tmp_list);
1284
1285 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1286 if (!(dev->flags & IFF_UP))
1287 list_move(&dev->unreg_list, &tmp_list);
1288
1289 __dev_close_many(head);
1290
1291 list_for_each_entry(dev, head, unreg_list) {
1292 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1293 call_netdevice_notifiers(NETDEV_DOWN, dev);
1294 }
1295
1296 /* rollback_registered_many needs the complete original list */
1297 list_splice(&tmp_list, head);
1298 return 0;
1299 }
1300
1301 /**
1302 * dev_close - shutdown an interface.
1303 * @dev: device to shutdown
1304 *
1305 * This function moves an active device into down state. A
1306 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1307 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1308 * chain.
1309 */
1310 int dev_close(struct net_device *dev)
1311 {
1312 if (dev->flags & IFF_UP) {
1313 LIST_HEAD(single);
1314
1315 list_add(&dev->unreg_list, &single);
1316 dev_close_many(&single);
1317 list_del(&single);
1318 }
1319 return 0;
1320 }
1321 EXPORT_SYMBOL(dev_close);
1322
1323
1324 /**
1325 * dev_disable_lro - disable Large Receive Offload on a device
1326 * @dev: device
1327 *
1328 * Disable Large Receive Offload (LRO) on a net device. Must be
1329 * called under RTNL. This is needed if received packets may be
1330 * forwarded to another interface.
1331 */
1332 void dev_disable_lro(struct net_device *dev)
1333 {
1334 /*
1335 * If we're trying to disable lro on a vlan device
1336 * use the underlying physical device instead
1337 */
1338 if (is_vlan_dev(dev))
1339 dev = vlan_dev_real_dev(dev);
1340
1341 dev->wanted_features &= ~NETIF_F_LRO;
1342 netdev_update_features(dev);
1343
1344 if (unlikely(dev->features & NETIF_F_LRO))
1345 netdev_WARN(dev, "failed to disable LRO!\n");
1346 }
1347 EXPORT_SYMBOL(dev_disable_lro);
1348
1349
/*
 * Starts out non-zero.  While it is non-zero,
 * register_netdevice_notifier() adds a notifier to the chain but does
 * not replay NETDEV_REGISTER / NETDEV_UP events for existing devices.
 */
1350 static int dev_boot_phase = 1;
1351
1352 /**
1353 * register_netdevice_notifier - register a network notifier block
1354 * @nb: notifier
1355 *
1356 * Register a notifier to be called when network device events occur.
1357 * The notifier passed is linked into the kernel structures and must
1358 * not be reused until it has been unregistered. A negative errno code
1359 * is returned on a failure.
1360 *
1361 * When registered all registration and up events are replayed
1362 * to the new notifier to allow device to have a race free
1363 * view of the network device list.
1364 */
1365
1366 int register_netdevice_notifier(struct notifier_block *nb)
1367 {
1368 struct net_device *dev;
1369 struct net_device *last;
1370 struct net *net;
1371 int err;
1372
1373 rtnl_lock();
1374 err = raw_notifier_chain_register(&netdev_chain, nb);
1375 if (err)
1376 goto unlock;
1377 if (dev_boot_phase)
1378 goto unlock;
1379 for_each_net(net) {
1380 for_each_netdev(net, dev) {
1381 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1382 err = notifier_to_errno(err);
1383 if (err)
1384 goto rollback;
1385
1386 if (!(dev->flags & IFF_UP))
1387 continue;
1388
1389 nb->notifier_call(nb, NETDEV_UP, dev);
1390 }
1391 }
1392
1393 unlock:
1394 rtnl_unlock();
1395 return err;
1396
1397 rollback:
1398 last = dev;
1399 for_each_net(net) {
1400 for_each_netdev(net, dev) {
1401 if (dev == last)
1402 goto outroll;
1403
1404 if (dev->flags & IFF_UP) {
1405 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1406 nb->notifier_call(nb, NETDEV_DOWN, dev);
1407 }
1408 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1409 }
1410 }
1411
1412 outroll:
1413 raw_notifier_chain_unregister(&netdev_chain, nb);
1414 goto unlock;
1415 }
1416 EXPORT_SYMBOL(register_netdevice_notifier);
1417
1418 /**
1419 * unregister_netdevice_notifier - unregister a network notifier block
1420 * @nb: notifier
1421 *
1422 * Unregister a notifier previously registered by
1423 * register_netdevice_notifier(). The notifier is unlinked into the
1424 * kernel structures and may then be reused. A negative errno code
1425 * is returned on a failure.
1426 *
1427 * After unregistering unregister and down device events are synthesized
1428 * for all devices on the device list to the removed notifier to remove
1429 * the need for special case cleanup code.
1430 */
1431
1432 int unregister_netdevice_notifier(struct notifier_block *nb)
1433 {
1434 struct net_device *dev;
1435 struct net *net;
1436 int err;
1437
1438 rtnl_lock();
1439 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1440 if (err)
1441 goto unlock;
1442
1443 for_each_net(net) {
1444 for_each_netdev(net, dev) {
1445 if (dev->flags & IFF_UP) {
1446 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1447 nb->notifier_call(nb, NETDEV_DOWN, dev);
1448 }
1449 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1450 }
1451 }
1452 unlock:
1453 rtnl_unlock();
1454 return err;
1455 }
1456 EXPORT_SYMBOL(unregister_netdevice_notifier);
1457
1458 /**
1459 * call_netdevice_notifiers - call all network notifier blocks
1460 * @val: value passed unmodified to notifier function
1461 * @dev: net_device pointer passed unmodified to notifier function
1462 *
1463 * Call all network notifier blocks. Parameters and return value
1464 * are as for raw_notifier_call_chain().
1465 */
1466
1467 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1468 {
1469 if (val != NETDEV_UNREGISTER_FINAL)
1470 ASSERT_RTNL();
1471 return raw_notifier_call_chain(&netdev_chain, val, dev);
1472 }
1473 EXPORT_SYMBOL(call_netdevice_notifiers);
1474
/*
 * Static key tested by net_timestamp_set() and net_timestamp_check()
 * before they stamp an skb.  net_enable_timestamp() increments it and
 * net_disable_timestamp() decrements it.
 */
1475 static struct static_key netstamp_needed __read_mostly;
1476 #ifdef HAVE_JUMP_LABEL
1477 /* We are not allowed to call static_key_slow_dec() from irq context
1478 * If net_disable_timestamp() is called from irq context, defer the
1479 * static_key_slow_dec() calls.
1480 */
/* Deferred decrements; read and reset to zero by net_enable_timestamp(). */
1481 static atomic_t netstamp_needed_deferred;
1482 #endif
1483
1484 void net_enable_timestamp(void)
1485 {
1486 #ifdef HAVE_JUMP_LABEL
1487 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1488
1489 if (deferred) {
1490 while (--deferred)
1491 static_key_slow_dec(&netstamp_needed);
1492 return;
1493 }
1494 #endif
1495 WARN_ON(in_interrupt());
1496 static_key_slow_inc(&netstamp_needed);
1497 }
1498 EXPORT_SYMBOL(net_enable_timestamp);
1499
1500 void net_disable_timestamp(void)
1501 {
1502 #ifdef HAVE_JUMP_LABEL
1503 if (in_interrupt()) {
1504 atomic_inc(&netstamp_needed_deferred);
1505 return;
1506 }
1507 #endif
1508 static_key_slow_dec(&netstamp_needed);
1509 }
1510 EXPORT_SYMBOL(net_disable_timestamp);
1511
1512 static inline void net_timestamp_set(struct sk_buff *skb)
1513 {
1514 skb->tstamp.tv64 = 0;
1515 if (static_key_false(&netstamp_needed))
1516 __net_timestamp(skb);
1517 }
1518
/*
 * Stamp @SKB through __net_timestamp() when the netstamp_needed static
 * key is enabled, @COND is true and the skb has no timestamp yet
 * (tstamp.tv64 == 0).
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement: a bare "if" here would break when the macro is
 * used as the unbraced body of an if that has an else branch.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1524
1525 static int net_hwtstamp_validate(struct ifreq *ifr)
1526 {
1527 struct hwtstamp_config cfg;
1528 enum hwtstamp_tx_types tx_type;
1529 enum hwtstamp_rx_filters rx_filter;
1530 int tx_type_valid = 0;
1531 int rx_filter_valid = 0;
1532
1533 if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1534 return -EFAULT;
1535
1536 if (cfg.flags) /* reserved for future extensions */
1537 return -EINVAL;
1538
1539 tx_type = cfg.tx_type;
1540 rx_filter = cfg.rx_filter;
1541
1542 switch (tx_type) {
1543 case HWTSTAMP_TX_OFF:
1544 case HWTSTAMP_TX_ON:
1545 case HWTSTAMP_TX_ONESTEP_SYNC:
1546 tx_type_valid = 1;
1547 break;
1548 }
1549
1550 switch (rx_filter) {
1551 case HWTSTAMP_FILTER_NONE:
1552 case HWTSTAMP_FILTER_ALL:
1553 case HWTSTAMP_FILTER_SOME:
1554 case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1555 case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1556 case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1557 case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1558 case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1559 case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1560 case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1561 case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1562 case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1563 case HWTSTAMP_FILTER_PTP_V2_EVENT:
1564 case HWTSTAMP_FILTER_PTP_V2_SYNC:
1565 case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1566 rx_filter_valid = 1;
1567 break;
1568 }
1569
1570 if (!tx_type_valid || !rx_filter_valid)
1571 return -ERANGE;
1572
1573 return 0;
1574 }
1575
1576 static inline bool is_skb_forwardable(struct net_device *dev,
1577 struct sk_buff *skb)
1578 {
1579 unsigned int len;
1580
1581 if (!(dev->flags & IFF_UP))
1582 return false;
1583
1584 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1585 if (skb->len <= len)
1586 return true;
1587
1588 /* if TSO is enabled, we don't care about the length as the packet
1589 * could be forwarded without being segmented before
1590 */
1591 if (skb_is_gso(skb))
1592 return true;
1593
1594 return false;
1595 }
1596
1597 /**
1598 * dev_forward_skb - loopback an skb to another netif
1599 *
1600 * @dev: destination network device
1601 * @skb: buffer to forward
1602 *
1603 * return values:
1604 * NET_RX_SUCCESS (no congestion)
1605 * NET_RX_DROP (packet was dropped, but freed)
1606 *
1607 * dev_forward_skb can be used for injecting an skb from the
1608 * start_xmit function of one device into the receive queue
1609 * of another device.
1610 *
1611 * The receiving device may be in another namespace, so
1612 * we have to clear all information in the skb that could
1613 * impact namespace isolation.
1614 */
1615 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1616 {
1617 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1618 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1619 atomic_long_inc(&dev->rx_dropped);
1620 kfree_skb(skb);
1621 return NET_RX_DROP;
1622 }
1623 }
1624
1625 skb_orphan(skb);
1626 nf_reset(skb);
1627
1628 if (unlikely(!is_skb_forwardable(dev, skb))) {
1629 atomic_long_inc(&dev->rx_dropped);
1630 kfree_skb(skb);
1631 return NET_RX_DROP;
1632 }
1633 skb->skb_iif = 0;
1634 skb->dev = dev;
1635 skb_dst_drop(skb);
1636 skb->tstamp.tv64 = 0;
1637 skb->pkt_type = PACKET_HOST;
1638 skb->protocol = eth_type_trans(skb, dev);
1639 skb->mark = 0;
1640 secpath_reset(skb);
1641 nf_reset(skb);
1642 return netif_rx(skb);
1643 }
1644 EXPORT_SYMBOL_GPL(dev_forward_skb);
1645
1646 static inline int deliver_skb(struct sk_buff *skb,
1647 struct packet_type *pt_prev,
1648 struct net_device *orig_dev)
1649 {
1650 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1651 return -ENOMEM;
1652 atomic_inc(&skb->users);
1653 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1654 }
1655
1656 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1657 {
1658 if (ptype->af_packet_priv == NULL)
1659 return false;
1660
1661 if (ptype->id_match)
1662 return ptype->id_match(ptype, skb->sk);
1663 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1664 return true;
1665
1666 return false;
1667 }
1668
1669 /*
1670 * Support routine. Sends outgoing frames to any network
1671 * taps currently in use.
1672 */
1673
1674 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1675 {
1676 struct packet_type *ptype;
1677 struct sk_buff *skb2 = NULL;
1678 struct packet_type *pt_prev = NULL;
1679
1680 rcu_read_lock();
1681 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1682 /* Never send packets back to the socket
1683 * they originated from - MvS (miquels@drinkel.ow.org)
1684 */
1685 if ((ptype->dev == dev || !ptype->dev) &&
1686 (!skb_loop_sk(ptype, skb))) {
1687 if (pt_prev) {
1688 deliver_skb(skb2, pt_prev, skb->dev);
1689 pt_prev = ptype;
1690 continue;
1691 }
1692
1693 skb2 = skb_clone(skb, GFP_ATOMIC);
1694 if (!skb2)
1695 break;
1696
1697 net_timestamp_set(skb2);
1698
1699 /* skb->nh should be correctly
1700 set by sender, so that the second statement is
1701 just protection against buggy protocols.
1702 */
1703 skb_reset_mac_header(skb2);
1704
1705 if (skb_network_header(skb2) < skb2->data ||
1706 skb2->network_header > skb2->tail) {
1707 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1708 ntohs(skb2->protocol),
1709 dev->name);
1710 skb_reset_network_header(skb2);
1711 }
1712
1713 skb2->transport_header = skb2->network_header;
1714 skb2->pkt_type = PACKET_OUTGOING;
1715 pt_prev = ptype;
1716 }
1717 }
1718 if (pt_prev)
1719 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1720 rcu_read_unlock();
1721 }
1722
1723 /**
1724 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1725 * @dev: Network device
1726 * @txq: number of queues available
1727 *
1728 * If real_num_tx_queues is changed the tc mappings may no longer be
1729 * valid. To resolve this verify the tc mapping remains valid and if
1730 * not NULL the mapping. With no priorities mapping to this
1731 * offset/count pair it will no longer be used. In the worst case TC0
1732 * is invalid nothing can be done so disable priority mappings. If is
1733 * expected that drivers will fix this mapping if they can before
1734 * calling netif_set_real_num_tx_queues.
1735 */
1736 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1737 {
1738 int i;
1739 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1740
1741 /* If TC0 is invalidated disable TC mapping */
1742 if (tc->offset + tc->count > txq) {
1743 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1744 dev->num_tc = 0;
1745 return;
1746 }
1747
1748 /* Invalidated prio to tc mappings set to TC0 */
1749 for (i = 1; i < TC_BITMASK + 1; i++) {
1750 int q = netdev_get_prio_tc_map(dev, i);
1751
1752 tc = &dev->tc_to_txq[q];
1753 if (tc->offset + tc->count > txq) {
1754 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1755 i, q);
1756 netdev_set_prio_tc_map(dev, i, 0);
1757 }
1758 }
1759 }
1760
1761 /*
1762 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1763 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1764 */
1765 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1766 {
1767 int rc;
1768
1769 if (txq < 1 || txq > dev->num_tx_queues)
1770 return -EINVAL;
1771
1772 if (dev->reg_state == NETREG_REGISTERED ||
1773 dev->reg_state == NETREG_UNREGISTERING) {
1774 ASSERT_RTNL();
1775
1776 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1777 txq);
1778 if (rc)
1779 return rc;
1780
1781 if (dev->num_tc)
1782 netif_setup_tc(dev, txq);
1783
1784 if (txq < dev->real_num_tx_queues)
1785 qdisc_reset_all_tx_gt(dev, txq);
1786 }
1787
1788 dev->real_num_tx_queues = txq;
1789 return 0;
1790 }
1791 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1792
#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
 *	negative error code.  The queue kobjects are only updated for a
 *	registered device, so before registration the only possible
 *	failure is -EINVAL for an @rxq that is zero or exceeds
 *	dev->num_rx_queues.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	int err;

	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		ASSERT_RTNL();

		err = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						   rxq);
		if (err)
			return err;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1825
1826 /**
1827 * netif_get_num_default_rss_queues - default number of RSS queues
1828 *
1829 * This routine should set an upper limit on the number of RSS queues
1830 * used by default by multiqueue devices.
1831 */
1832 int netif_get_num_default_rss_queues(void)
1833 {
1834 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
1835 }
1836 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
1837
1838 static inline void __netif_reschedule(struct Qdisc *q)
1839 {
1840 struct softnet_data *sd;
1841 unsigned long flags;
1842
1843 local_irq_save(flags);
1844 sd = &__get_cpu_var(softnet_data);
1845 q->next_sched = NULL;
1846 *sd->output_queue_tailp = q;
1847 sd->output_queue_tailp = &q->next_sched;
1848 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1849 local_irq_restore(flags);
1850 }
1851
1852 void __netif_schedule(struct Qdisc *q)
1853 {
1854 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1855 __netif_reschedule(q);
1856 }
1857 EXPORT_SYMBOL(__netif_schedule);
1858
1859 void dev_kfree_skb_irq(struct sk_buff *skb)
1860 {
1861 if (atomic_dec_and_test(&skb->users)) {
1862 struct softnet_data *sd;
1863 unsigned long flags;
1864
1865 local_irq_save(flags);
1866 sd = &__get_cpu_var(softnet_data);
1867 skb->next = sd->completion_queue;
1868 sd->completion_queue = skb;
1869 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1870 local_irq_restore(flags);
1871 }
1872 }
1873 EXPORT_SYMBOL(dev_kfree_skb_irq);
1874
/*
 * Free @skb from any context: uses dev_kfree_skb_irq() when running in
 * hard interrupt context or with interrupts disabled, dev_kfree_skb()
 * otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled()) {
		dev_kfree_skb_irq(skb);
		return;
	}

	dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1883
1884
1885 /**
1886 * netif_device_detach - mark device as removed
1887 * @dev: network device
1888 *
1889 * Mark device as removed from system and therefore no longer available.
1890 */
1891 void netif_device_detach(struct net_device *dev)
1892 {
1893 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1894 netif_running(dev)) {
1895 netif_tx_stop_all_queues(dev);
1896 }
1897 }
1898 EXPORT_SYMBOL(netif_device_detach);
1899
1900 /**
1901 * netif_device_attach - mark device as attached
1902 * @dev: network device
1903 *
1904 * Mark device as attached from system and restart if needed.
1905 */
1906 void netif_device_attach(struct net_device *dev)
1907 {
1908 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1909 netif_running(dev)) {
1910 netif_tx_wake_all_queues(dev);
1911 __netdev_watchdog_up(dev);
1912 }
1913 }
1914 EXPORT_SYMBOL(netif_device_attach);
1915
1916 static void skb_warn_bad_offload(const struct sk_buff *skb)
1917 {
1918 static const netdev_features_t null_features = 0;
1919 struct net_device *dev = skb->dev;
1920 const char *driver = "";
1921
1922 if (dev && dev->dev.parent)
1923 driver = dev_driver_string(dev->dev.parent);
1924
1925 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
1926 "gso_type=%d ip_summed=%d\n",
1927 driver, dev ? &dev->features : &null_features,
1928 skb->sk ? &skb->sk->sk_route_caps : &null_features,
1929 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
1930 skb_shinfo(skb)->gso_type, skb->ip_summed);
1931 }
1932
1933 /*
1934 * Invalidate hardware checksum when packet is to be mangled, and
1935 * complete checksum manually on outgoing path.
1936 */
1937 int skb_checksum_help(struct sk_buff *skb)
1938 {
1939 __wsum csum;
1940 int ret = 0, offset;
1941
1942 if (skb->ip_summed == CHECKSUM_COMPLETE)
1943 goto out_set_summed;
1944
1945 if (unlikely(skb_shinfo(skb)->gso_size)) {
1946 skb_warn_bad_offload(skb);
1947 return -EINVAL;
1948 }
1949
1950 offset = skb_checksum_start_offset(skb);
1951 BUG_ON(offset >= skb_headlen(skb));
1952 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1953
1954 offset += skb->csum_offset;
1955 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1956
1957 if (skb_cloned(skb) &&
1958 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1959 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1960 if (ret)
1961 goto out;
1962 }
1963
1964 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1965 out_set_summed:
1966 skb->ip_summed = CHECKSUM_NONE;
1967 out:
1968 return ret;
1969 }
1970 EXPORT_SYMBOL(skb_checksum_help);
1971
1972 /**
1973 * skb_gso_segment - Perform segmentation on skb.
1974 * @skb: buffer to segment
1975 * @features: features for the output path (see dev->features)
1976 *
1977 * This function segments the given skb and returns a list of segments.
1978 *
1979 * It may return NULL if the skb requires no segmentation. This is
1980 * only possible when GSO is used for verifying header integrity.
1981 */
1982 struct sk_buff *skb_gso_segment(struct sk_buff *skb,
1983 netdev_features_t features)
1984 {
1985 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1986 struct packet_type *ptype;
1987 __be16 type = skb->protocol;
1988 int vlan_depth = ETH_HLEN;
1989 int err;
1990
1991 while (type == htons(ETH_P_8021Q)) {
1992 struct vlan_hdr *vh;
1993
1994 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1995 return ERR_PTR(-EINVAL);
1996
1997 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1998 type = vh->h_vlan_encapsulated_proto;
1999 vlan_depth += VLAN_HLEN;
2000 }
2001
2002 skb_reset_mac_header(skb);
2003 skb->mac_len = skb->network_header - skb->mac_header;
2004 __skb_pull(skb, skb->mac_len);
2005
2006 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2007 skb_warn_bad_offload(skb);
2008
2009 if (skb_header_cloned(skb) &&
2010 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2011 return ERR_PTR(err);
2012 }
2013
2014 rcu_read_lock();
2015 list_for_each_entry_rcu(ptype,
2016 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2017 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
2018 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2019 err = ptype->gso_send_check(skb);
2020 segs = ERR_PTR(err);
2021 if (err || skb_gso_ok(skb, features))
2022 break;
2023 __skb_push(skb, (skb->data -
2024 skb_network_header(skb)));
2025 }
2026 segs = ptype->gso_segment(skb, features);
2027 break;
2028 }
2029 }
2030 rcu_read_unlock();
2031
2032 __skb_push(skb, skb->data - skb_mac_header(skb));
2033
2034 return segs;
2035 }
2036 EXPORT_SYMBOL(skb_gso_segment);
2037
/*
 * Take action when hardware reception checksum errors are detected:
 * log the device name ("<unknown>" for a NULL @dev) and dump the stack,
 * subject to net_ratelimit().
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2049
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

/*
 * Returns 1 when @skb has a page fragment that @dev cannot DMA from,
 * 0 otherwise.  Without CONFIG_HIGHMEM this always returns 0.
 *
 * A fragment is rejected when it lives in high memory and the device
 * lacks NETIF_F_HIGHDMA, or, when PCI_DMA_BUS_IS_PHYS, when the parent
 * device has no dma_mask or the fragment's page lies beyond it.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int idx;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
			skb_frag_t *fragment = &skb_shinfo(skb)->frags[idx];

			if (PageHighMem(skb_frag_page(fragment)))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *parent = dev->dev.parent;

		/* No parent device: nothing to check against. */
		if (!parent)
			return 0;

		for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
			skb_frag_t *fragment = &skb_shinfo(skb)->frags[idx];
			dma_addr_t phys = page_to_phys(skb_frag_page(fragment));

			if (!parent->dma_mask ||
			    phys + PAGE_SIZE - 1 > *parent->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2082
/*
 * State kept in skb->cb for an skb that dev_gso_segment() has segmented:
 * the skb's original destructor, saved there before skb->destructor is
 * pointed at dev_gso_skb_destructor(), which calls it once the segment
 * list has been freed.
 */
2083 struct dev_gso_cb {
2084 void (*destructor)(struct sk_buff *skb);
2085 };
2086
/* Access the control buffer of @skb (skb->cb) as a struct dev_gso_cb. */
2087 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2088
2089 static void dev_gso_skb_destructor(struct sk_buff *skb)
2090 {
2091 struct dev_gso_cb *cb;
2092
2093 do {
2094 struct sk_buff *nskb = skb->next;
2095
2096 skb->next = nskb->next;
2097 nskb->next = NULL;
2098 kfree_skb(nskb);
2099 } while (skb->next);
2100
2101 cb = DEV_GSO_CB(skb);
2102 if (cb->destructor)
2103 cb->destructor(skb);
2104 }
2105
2106 /**
2107 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2108 * @skb: buffer to segment
2109 * @features: device features as applicable to this skb
2110 *
2111 * This function segments the given skb and stores the list of segments
2112 * in skb->next.
2113 */
2114 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2115 {
2116 struct sk_buff *segs;
2117
2118 segs = skb_gso_segment(skb, features);
2119
2120 /* Verifying header integrity only. */
2121 if (!segs)
2122 return 0;
2123
2124 if (IS_ERR(segs))
2125 return PTR_ERR(segs);
2126
2127 skb->next = segs;
2128 DEV_GSO_CB(skb)->destructor = skb->destructor;
2129 skb->destructor = dev_gso_skb_destructor;
2130
2131 return 0;
2132 }
2133
2134 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2135 {
2136 return ((features & NETIF_F_GEN_CSUM) ||
2137 ((features & NETIF_F_V4_CSUM) &&
2138 protocol == htons(ETH_P_IP)) ||
2139 ((features & NETIF_F_V6_CSUM) &&
2140 protocol == htons(ETH_P_IPV6)) ||
2141 ((features & NETIF_F_FCOE_CRC) &&
2142 protocol == htons(ETH_P_FCOE)));
2143 }
2144
2145 static netdev_features_t harmonize_features(struct sk_buff *skb,
2146 __be16 protocol, netdev_features_t features)
2147 {
2148 if (!can_checksum_protocol(features, protocol)) {
2149 features &= ~NETIF_F_ALL_CSUM;
2150 features &= ~NETIF_F_SG;
2151 } else if (illegal_highdma(skb->dev, skb)) {
2152 features &= ~NETIF_F_SG;
2153 }
2154
2155 return features;
2156 }
2157
2158 netdev_features_t netif_skb_features(struct sk_buff *skb)
2159 {
2160 __be16 protocol = skb->protocol;
2161 netdev_features_t features = skb->dev->features;
2162
2163 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2164 features &= ~NETIF_F_GSO_MASK;
2165
2166 if (protocol == htons(ETH_P_8021Q)) {
2167 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2168 protocol = veh->h_vlan_encapsulated_proto;
2169 } else if (!vlan_tx_tag_present(skb)) {
2170 return harmonize_features(skb, protocol, features);
2171 }
2172
2173 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2174
2175 if (protocol != htons(ETH_P_8021Q)) {
2176 return harmonize_features(skb, protocol, features);
2177 } else {
2178 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2179 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2180 return harmonize_features(skb, protocol, features);
2181 }
2182 }
2183 EXPORT_SYMBOL(netif_skb_features);
2184
2185 /*
2186 * Returns true if either:
2187 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2188 * 2. skb is fragmented and the device does not support SG, or if
2189 * at least one of fragments is in highmem and device does not
2190 * support DMA from it.
2191 */
2192 static inline int skb_needs_linearize(struct sk_buff *skb,
2193 int features)
2194 {
2195 return skb_is_nonlinear(skb) &&
2196 ((skb_has_frag_list(skb) &&
2197 !(features & NETIF_F_FRAGLIST)) ||
2198 (skb_shinfo(skb)->nr_frags &&
2199 !(features & NETIF_F_SG)));
2200 }
2201
/*
 * dev_hard_start_xmit - hand @skb (or its pending GSO segments) to the driver
 * @skb: buffer to transmit; skb->next != NULL means a GSO segment list is
 *	 already chained on it
 * @dev: device whose ndo_start_xmit() is called
 * @txq: transmit queue being used (updated via txq_trans_update())
 *
 * Returns the ndo_start_xmit() result, NETDEV_TX_BUSY when the queue got
 * stopped while segments remain, or the initial NETDEV_TX_OK when the skb
 * was dropped locally (VLAN tag insertion, segmentation, linearize or
 * checksum failure).
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc = NETDEV_TX_OK;
	unsigned int skb_len;

	if (likely(!skb->next)) {
		netdev_features_t features;

		/*
		 * If device doesn't need skb->dst, release it right now while
		 * it's hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(skb);

		/* Give a copy to the ptype_all taps, if any are registered. */
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		features = netif_skb_features(skb);

		/* Device cannot insert the tag itself: put it in the payload. */
		if (vlan_tx_tag_present(skb) &&
		    !(features & NETIF_F_HW_VLAN_TX)) {
			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
			/* No skb left to free here; rc is still NETDEV_TX_OK. */
			if (unlikely(!skb))
				goto out;

			skb->vlan_tci = 0;
		}

		if (netif_needs_gso(skb, features)) {
			if (unlikely(dev_gso_segment(skb, features)))
				goto out_kfree_skb;
			/* Segments were chained: send them one by one below. */
			if (skb->next)
				goto gso;
		} else {
			if (skb_needs_linearize(skb, features) &&
			    __skb_linearize(skb))
				goto out_kfree_skb;

			/* If packet is not checksummed and device does not
			 * support checksumming for this protocol, complete
			 * checksumming here.
			 */
			if (skb->ip_summed == CHECKSUM_PARTIAL) {
				skb_set_transport_header(skb,
					skb_checksum_start_offset(skb));
				if (!(features & NETIF_F_ALL_CSUM) &&
				     skb_checksum_help(skb))
					goto out_kfree_skb;
			}
		}

		/* Sample the length first: skb is not touched after xmit. */
		skb_len = skb->len;
		rc = ops->ndo_start_xmit(skb, dev);
		trace_net_dev_xmit(skb, rc, dev, skb_len);
		if (rc == NETDEV_TX_OK)
			txq_trans_update(txq);
		return rc;
	}

gso:
	do {
		/* Unlink the first pending segment from the list head @skb. */
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;

		/*
		 * If device doesn't need nskb->dst, release it right now while
		 * it's hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(nskb);

		skb_len = nskb->len;
		rc = ops->ndo_start_xmit(nskb, dev);
		trace_net_dev_xmit(nskb, rc, dev, skb_len);
		if (unlikely(rc != NETDEV_TX_OK)) {
			/* Codes outside NETDEV_TX_MASK: give up on the rest. */
			if (rc & ~NETDEV_TX_MASK)
				goto out_kfree_gso_skb;
			/* Otherwise relink the segment so it can be retried. */
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		txq_trans_update(txq);
		if (unlikely(netif_xmit_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

out_kfree_gso_skb:
	/* All segments gone: restore the destructor saved by dev_gso_segment(). */
	if (likely(skb->next == NULL))
		skb->destructor = DEV_GSO_CB(skb)->destructor;
out_kfree_skb:
	kfree_skb(skb);
out:
	return rc;
}
2301
2302 static u32 hashrnd __read_mostly;
2303
2304 /*
2305 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2306 * to be used as a distribution range.
2307 */
2308 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2309 unsigned int num_tx_queues)
2310 {
2311 u32 hash;
2312 u16 qoffset = 0;
2313 u16 qcount = num_tx_queues;
2314
2315 if (skb_rx_queue_recorded(skb)) {
2316 hash = skb_get_rx_queue(skb);
2317 while (unlikely(hash >= num_tx_queues))
2318 hash -= num_tx_queues;
2319 return hash;
2320 }
2321
2322 if (dev->num_tc) {
2323 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2324 qoffset = dev->tc_to_txq[tc].offset;
2325 qcount = dev->tc_to_txq[tc].count;
2326 }
2327
2328 if (skb->sk && skb->sk->sk_hash)
2329 hash = skb->sk->sk_hash;
2330 else
2331 hash = (__force u16) skb->protocol;
2332 hash = jhash_1word(hash, hashrnd);
2333
2334 return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2335 }
2336 EXPORT_SYMBOL(__skb_tx_hash);
2337
2338 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2339 {
2340 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2341 net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2342 dev->name, queue_index,
2343 dev->real_num_tx_queues);
2344 return 0;
2345 }
2346 return queue_index;
2347 }
2348
/*
 * Look up a transmit queue for @skb in the XPS map of the current CPU.
 *
 * Returns the queue index, or -1 when XPS is compiled out, no map exists for
 * this CPU, or the selected queue is not below dev->real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map;
	int queue_index = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
	if (!cpu_map)
		goto unlock;

	if (cpu_map->len == 1) {
		queue_index = cpu_map->queues[0];
	} else {
		/* Several candidates: spread flows over them by hash. */
		u32 hash;

		if (skb->sk && skb->sk->sk_hash)
			hash = skb->sk->sk_hash;
		else
			hash = (__force u16) skb->protocol ^
			    skb->rxhash;
		hash = jhash_1word(hash, hashrnd);
		queue_index = cpu_map->queues[
		    ((u64)hash * cpu_map->len) >> 32];
	}

	if (unlikely(queue_index >= dev->real_num_tx_queues))
		queue_index = -1;

unlock:
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}
2386
2387 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2388 struct sk_buff *skb)
2389 {
2390 int queue_index;
2391 const struct net_device_ops *ops = dev->netdev_ops;
2392
2393 if (dev->real_num_tx_queues == 1)
2394 queue_index = 0;
2395 else if (ops->ndo_select_queue) {
2396 queue_index = ops->ndo_select_queue(dev, skb);
2397 queue_index = dev_cap_txqueue(dev, queue_index);
2398 } else {
2399 struct sock *sk = skb->sk;
2400 queue_index = sk_tx_queue_get(sk);
2401
2402 if (queue_index < 0 || skb->ooo_okay ||
2403 queue_index >= dev->real_num_tx_queues) {
2404 int old_index = queue_index;
2405
2406 queue_index = get_xps_queue(dev, skb);
2407 if (queue_index < 0)
2408 queue_index = skb_tx_hash(dev, skb);
2409
2410 if (queue_index != old_index && sk) {
2411 struct dst_entry *dst =
2412 rcu_dereference_check(sk->sk_dst_cache, 1);
2413
2414 if (dst && skb_dst(skb) == dst)
2415 sk_tx_queue_set(sk, queue_index);
2416 }
2417 }
2418 }
2419
2420 skb_set_queue_mapping(skb, queue_index);
2421 return netdev_get_tx_queue(dev, queue_index);
2422 }
2423
/*
 * __dev_xmit_skb - send @skb through queueing discipline @q
 * @skb: buffer to transmit
 * @q: qdisc attached to @txq
 * @dev: output device
 * @txq: transmit queue selected for @skb
 *
 * Under the qdisc root lock, either drops the skb (qdisc deactivated),
 * transmits it directly (bypass-capable, empty, idle qdisc) or enqueues it
 * and runs the qdisc if nobody else is running it.
 *
 * Returns NET_XMIT_DROP, NET_XMIT_SUCCESS or the masked result of
 * q->enqueue().
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	bool contended;
	int rc;

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
	 * and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */
		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
			skb_dst_force(skb);

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
			/* Release busylock before the (longer) qdisc run. */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		skb_dst_force(skb);
		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	/* Still holding busylock if it was not dropped on a run path above. */
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
2486
2487 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2488 static void skb_update_prio(struct sk_buff *skb)
2489 {
2490 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2491
2492 if (!skb->priority && skb->sk && map) {
2493 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2494
2495 if (prioidx < map->priomap_len)
2496 skb->priority = map->priomap[prioidx];
2497 }
2498 }
2499 #else
2500 #define skb_update_prio(skb)
2501 #endif
2502
/*
 * Per-cpu nesting depth of dev_hard_start_xmit() calls made from the
 * queueless path of dev_queue_xmit(); transmission is refused once the
 * depth exceeds RECURSION_LIMIT.
 */
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10
2505
2506 /**
2507 * dev_loopback_xmit - loop back @skb
2508 * @skb: buffer to transmit
2509 */
2510 int dev_loopback_xmit(struct sk_buff *skb)
2511 {
2512 skb_reset_mac_header(skb);
2513 __skb_pull(skb, skb_network_offset(skb));
2514 skb->pkt_type = PACKET_LOOPBACK;
2515 skb->ip_summed = CHECKSUM_UNNECESSARY;
2516 WARN_ON(!skb_dst(skb));
2517 skb_dst_force(skb);
2518 netif_rx_ni(skb);
2519 return 0;
2520 }
2521 EXPORT_SYMBOL(dev_loopback_xmit);
2522
/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      Note that this function can also return errors from the queue
 *      disciplines, including NET_XMIT_DROP, which is a positive value.
 *      So, errors can also be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	txq = dev_pick_tx(dev, skb);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue method: go through the queueing path. */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all sorts of tunnels...

	   It is unlikely that netif_tx_lock protection is really necessary
	   here (e.g. loopback and IP tunnels are clean, ignoring statistics
	   counters).
	   However, it is possible that they rely on the protection
	   provided by us here.

	   Check this and take the lock. It is not prone to deadlocks.
	   Or use the noqueue qdisc, which is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		/* Owner == this cpu means we re-entered while holding the lock. */
		if (txq->xmit_lock_owner != cpu) {

			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
				goto recursion_alert;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				__this_cpu_inc(xmit_recursion);
				rc = dev_hard_start_xmit(skb, dev, txq);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	/* Device down, queue stopped, xmit not completed, or recursion: drop. */
	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit);
2628
2629
2630 /*=======================================================================
2631 Receiver routines
2632 =======================================================================*/
2633
/* enqueue_to_backlog() drops packets once a backlog queue is longer than this */
int netdev_max_backlog __read_mostly = 1000;
/*
 * Passed to net_timestamp_check() in netif_rx(), and negated in
 * __netif_receive_skb(), so exactly one of the two sites is enabled.
 */
int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): not used in this part of the file; consumer not visible here */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
2638
/*
 * Append @napi to the poll list of @sd and raise NET_RX_SOFTIRQ.
 * Called with irq disabled.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
2646
2647 /*
2648 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2649 * and src/dst port numbers. Sets rxhash in skb to non-zero hash value
2650 * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb
2651 * if hash is a canonical 4-tuple hash over transport ports.
2652 */
2653 void __skb_get_rxhash(struct sk_buff *skb)
2654 {
2655 struct flow_keys keys;
2656 u32 hash;
2657
2658 if (!skb_flow_dissect(skb, &keys))
2659 return;
2660
2661 if (keys.ports) {
2662 if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
2663 swap(keys.port16[0], keys.port16[1]);
2664 skb->l4_rxhash = 1;
2665 }
2666
2667 /* get a consistent hash (same value on both flow directions) */
2668 if ((__force u32)keys.dst < (__force u32)keys.src)
2669 swap(keys.dst, keys.src);
2670
2671 hash = jhash_3words((__force u32)keys.dst,
2672 (__force u32)keys.src,
2673 (__force u32)keys.ports, hashrnd);
2674 if (!hash)
2675 hash = 1;
2676
2677 skb->rxhash = hash;
2678 }
2679 EXPORT_SYMBOL(__skb_get_rxhash);
2680
2681 #ifdef CONFIG_RPS
2682
/*
 * One global table that all flow-based protocols share.
 * get_rps_cpu() indexes it by rxhash to find the desired CPU of a flow.
 */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);

/* Static key tested in netif_rx() before taking the RPS path */
struct static_key rps_needed __read_mostly;
2688
/*
 * set_rps_cpu - move a flow to @next_cpu
 * @dev: receiving device
 * @skb: packet of the flow
 * @rflow: current flow table entry of the flow
 * @next_cpu: new target CPU, or RPS_NO_CPU
 *
 * With CONFIG_RFS_ACCEL the driver may additionally be asked, through
 * ndo_rx_flow_steer(), to steer the flow to the hardware queue that
 * cpu_rmap_lookup_index() associates with @next_cpu; in that case the entry
 * of that queue's flow table is used from then on.
 *
 * Returns the flow entry that now describes the flow (possibly different
 * from @rflow), with ->cpu set to @next_cpu.
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	    struct rps_dev_flow *rflow, u16 next_cpu)
{
	if (next_cpu != RPS_NO_CPU) {
#ifdef CONFIG_RFS_ACCEL
		struct netdev_rx_queue *rxqueue;
		struct rps_dev_flow_table *flow_table;
		struct rps_dev_flow *old_rflow;
		u32 flow_id;
		u16 rxq_index;
		int rc;

		/* Should we steer this flow to a different hardware queue? */
		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
		    !(dev->features & NETIF_F_NTUPLE))
			goto out;
		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		/* Already arriving on the right queue: nothing to steer. */
		if (rxq_index == skb_get_rx_queue(skb))
			goto out;

		rxqueue = dev->_rx + rxq_index;
		flow_table = rcu_dereference(rxqueue->rps_flow_table);
		if (!flow_table)
			goto out;
		flow_id = skb->rxhash & flow_table->mask;
		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
							rxq_index, flow_id);
		if (rc < 0)
			goto out;
		/* A non-negative return value is recorded as the filter id. */
		old_rflow = rflow;
		rflow = &flow_table->flows[flow_id];
		rflow->filter = rc;
		if (old_rflow->filter == rflow->filter)
			old_rflow->filter = RPS_NO_FILTER;
	out:
#endif
		/* Remember where the new CPU's queue head is right now. */
		rflow->last_qtail =
			per_cpu(softnet_data, next_cpu).input_queue_head;
	}

	rflow->cpu = next_cpu;
	return rflow;
}
2733
2734 /*
2735 * get_rps_cpu is called from netif_receive_skb and returns the target
2736 * CPU from the RPS map of the receiving queue for a given skb.
2737 * rcu_read_lock must be held on entry.
2738 */
2739 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2740 struct rps_dev_flow **rflowp)
2741 {
2742 struct netdev_rx_queue *rxqueue;
2743 struct rps_map *map;
2744 struct rps_dev_flow_table *flow_table;
2745 struct rps_sock_flow_table *sock_flow_table;
2746 int cpu = -1;
2747 u16 tcpu;
2748
2749 if (skb_rx_queue_recorded(skb)) {
2750 u16 index = skb_get_rx_queue(skb);
2751 if (unlikely(index >= dev->real_num_rx_queues)) {
2752 WARN_ONCE(dev->real_num_rx_queues > 1,
2753 "%s received packet on queue %u, but number "
2754 "of RX queues is %u\n",
2755 dev->name, index, dev->real_num_rx_queues);
2756 goto done;
2757 }
2758 rxqueue = dev->_rx + index;
2759 } else
2760 rxqueue = dev->_rx;
2761
2762 map = rcu_dereference(rxqueue->rps_map);
2763 if (map) {
2764 if (map->len == 1 &&
2765 !rcu_access_pointer(rxqueue->rps_flow_table)) {
2766 tcpu = map->cpus[0];
2767 if (cpu_online(tcpu))
2768 cpu = tcpu;
2769 goto done;
2770 }
2771 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2772 goto done;
2773 }
2774
2775 skb_reset_network_header(skb);
2776 if (!skb_get_rxhash(skb))
2777 goto done;
2778
2779 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2780 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2781 if (flow_table && sock_flow_table) {
2782 u16 next_cpu;
2783 struct rps_dev_flow *rflow;
2784
2785 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2786 tcpu = rflow->cpu;
2787
2788 next_cpu = sock_flow_table->ents[skb->rxhash &
2789 sock_flow_table->mask];
2790
2791 /*
2792 * If the desired CPU (where last recvmsg was done) is
2793 * different from current CPU (one in the rx-queue flow
2794 * table entry), switch if one of the following holds:
2795 * - Current CPU is unset (equal to RPS_NO_CPU).
2796 * - Current CPU is offline.
2797 * - The current CPU's queue tail has advanced beyond the
2798 * last packet that was enqueued using this table entry.
2799 * This guarantees that all previous packets for the flow
2800 * have been dequeued, thus preserving in order delivery.
2801 */
2802 if (unlikely(tcpu != next_cpu) &&
2803 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2804 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2805 rflow->last_qtail)) >= 0))
2806 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2807
2808 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2809 *rflowp = rflow;
2810 cpu = tcpu;
2811 goto done;
2812 }
2813 }
2814
2815 if (map) {
2816 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2817
2818 if (cpu_online(tcpu)) {
2819 cpu = tcpu;
2820 goto done;
2821 }
2822 }
2823
2824 done:
2825 return cpu;
2826 }
2827
2828 #ifdef CONFIG_RFS_ACCEL
2829
2830 /**
2831 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2832 * @dev: Device on which the filter was set
2833 * @rxq_index: RX queue index
2834 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2835 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2836 *
2837 * Drivers that implement ndo_rx_flow_steer() should periodically call
2838 * this function for each installed filter and remove the filters for
2839 * which it returns %true.
2840 */
2841 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2842 u32 flow_id, u16 filter_id)
2843 {
2844 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2845 struct rps_dev_flow_table *flow_table;
2846 struct rps_dev_flow *rflow;
2847 bool expire = true;
2848 int cpu;
2849
2850 rcu_read_lock();
2851 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2852 if (flow_table && flow_id <= flow_table->mask) {
2853 rflow = &flow_table->flows[flow_id];
2854 cpu = ACCESS_ONCE(rflow->cpu);
2855 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2856 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2857 rflow->last_qtail) <
2858 (int)(10 * flow_table->mask)))
2859 expire = false;
2860 }
2861 rcu_read_unlock();
2862 return expire;
2863 }
2864 EXPORT_SYMBOL(rps_may_expire_flow);
2865
2866 #endif /* CONFIG_RFS_ACCEL */
2867
/*
 * Called from hardirq (IPI) context.
 * @data is the softnet_data whose backlog NAPI gets scheduled; the
 * received_rps counter of that structure is bumped as well.
 */
static void rps_trigger_softirq(void *data)
{
	struct softnet_data *sd = data;

	____napi_schedule(sd, &sd->backlog);
	sd->received_rps++;
}
2876
2877 #endif /* CONFIG_RPS */
2878
/*
 * Check whether @sd belongs to another cpu.
 * If so, chain it on this cpu's rps_ipi_list, raise NET_RX_SOFTIRQ and
 * return 1.  Return 0 when @sd is this cpu's own structure or when RPS is
 * not configured.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = &__get_cpu_var(softnet_data);

	if (sd == local_sd)
		return 0;

	sd->rps_ipi_next = local_sd->rps_ipi_list;
	local_sd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2899
2900 /*
2901 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2902 * queue (may be a remote CPU queue).
2903 */
2904 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2905 unsigned int *qtail)
2906 {
2907 struct softnet_data *sd;
2908 unsigned long flags;
2909
2910 sd = &per_cpu(softnet_data, cpu);
2911
2912 local_irq_save(flags);
2913
2914 rps_lock(sd);
2915 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2916 if (skb_queue_len(&sd->input_pkt_queue)) {
2917 enqueue:
2918 __skb_queue_tail(&sd->input_pkt_queue, skb);
2919 input_queue_tail_incr_save(sd, qtail);
2920 rps_unlock(sd);
2921 local_irq_restore(flags);
2922 return NET_RX_SUCCESS;
2923 }
2924
2925 /* Schedule NAPI for backlog device
2926 * We can use non atomic operation since we own the queue lock
2927 */
2928 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2929 if (!rps_ipi_queued(sd))
2930 ____napi_schedule(sd, &sd->backlog);
2931 }
2932 goto enqueue;
2933 }
2934
2935 sd->dropped++;
2936 rps_unlock(sd);
2937
2938 local_irq_restore(flags);
2939
2940 atomic_long_inc(&skb->dev->rx_dropped);
2941 kfree_skb(skb);
2942 return NET_RX_DROP;
2943 }
2944
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 *	may be dropped during processing for congestion control or by the
 *	protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	int ret;

	/* if netpoll wants it, pretend we never saw it */
	if (netpoll_rx(skb))
		return NET_RX_DROP;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	trace_netif_rx(skb);
#ifdef CONFIG_RPS
	if (static_key_false(&rps_needed)) {
		/* voidflow absorbs the qtail when no real flow entry is used */
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu;

		preempt_disable();
		rcu_read_lock();

		/* No steering target: queue on the current cpu. */
		cpu = get_rps_cpu(skb->dev, skb, &rflow);
		if (cpu < 0)
			cpu = smp_processor_id();

		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

		rcu_read_unlock();
		preempt_enable();
	} else
#endif
	{
		/* qtail is required by the callee but not used here */
		unsigned int qtail;
		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
		put_cpu();
	}
	return ret;
}
EXPORT_SYMBOL(netif_rx);
2997
/*
 * netif_rx_ni - netif_rx() wrapper that also runs pending softirqs
 * @skb: buffer to post
 *
 * Calls netif_rx() with preemption disabled and, if any softirq is pending
 * afterwards, runs do_softirq() before re-enabling preemption.
 * Returns the netif_rx() result.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3011
/*
 * net_tx_action - softirq handler working on this cpu's softnet_data
 * @h: softirq action (unused)
 *
 * Frees every skb found on the completion queue, then runs every qdisc
 * found on the output queue (or reschedules it when its root lock is busy).
 */
static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* Detach the whole list with irqs off, then walk it. */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;
			clist = clist->next;

			WARN_ON(atomic_read(&skb->users));
			trace_kfree_skb(skb, net_tx_action);
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		/* Detach the list and reset the tail pointer to empty. */
		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			root_lock = qdisc_lock(q);
			if (spin_trylock(root_lock)) {
				smp_mb__before_clear_bit();
				clear_bit(__QDISC_STATE_SCHED,
					  &q->state);
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				/*
				 * Lock is busy: try again later, unless the
				 * qdisc was deactivated, in which case only
				 * the SCHED bit is cleared.
				 */
				if (!test_bit(__QDISC_STATE_DEACTIVATED,
					      &q->state)) {
					__netif_reschedule(q);
				} else {
					smp_mb__before_clear_bit();
					clear_bit(__QDISC_STATE_SCHED,
						  &q->state);
				}
			}
		}
	}
}
3069
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/*
 * This hook is defined here for ATM LANE.
 * Only built when both the bridge and ATM LANE are enabled (built-in or
 * modular); it is not called anywhere in this part of the file.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3077
3078 #ifdef CONFIG_NET_CLS_ACT
3079 /* TODO: Maybe we should just force sch_ingress to be compiled in
3080 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3081 * a compare and 2 stores extra right now if we dont have it on
3082 * but have CONFIG_NET_CLS_ACT
3083 * NOTE: This doesn't stop any functionality; if you dont have
3084 * the ingress scheduler, you just can't add policies on ingress.
3085 *
3086 */
3087 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3088 {
3089 struct net_device *dev = skb->dev;
3090 u32 ttl = G_TC_RTTL(skb->tc_verd);
3091 int result = TC_ACT_OK;
3092 struct Qdisc *q;
3093
3094 if (unlikely(MAX_RED_LOOP < ttl++)) {
3095 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3096 skb->skb_iif, dev->ifindex);
3097 return TC_ACT_SHOT;
3098 }
3099
3100 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3101 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3102
3103 q = rxq->qdisc;
3104 if (q != &noop_qdisc) {
3105 spin_lock(qdisc_lock(q));
3106 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3107 result = qdisc_enqueue_root(skb, q);
3108 spin_unlock(qdisc_lock(q));
3109 }
3110
3111 return result;
3112 }
3113
3114 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3115 struct packet_type **pt_prev,
3116 int *ret, struct net_device *orig_dev)
3117 {
3118 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3119
3120 if (!rxq || rxq->qdisc == &noop_qdisc)
3121 goto out;
3122
3123 if (*pt_prev) {
3124 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3125 *pt_prev = NULL;
3126 }
3127
3128 switch (ing_filter(skb, rxq)) {
3129 case TC_ACT_SHOT:
3130 case TC_ACT_STOLEN:
3131 kfree_skb(skb);
3132 return NULL;
3133 }
3134
3135 out:
3136 skb->tc_verd = 0;
3137 return skb;
3138 }
3139 #endif
3140
3141 /**
3142 * netdev_rx_handler_register - register receive handler
3143 * @dev: device to register a handler for
3144 * @rx_handler: receive handler to register
3145 * @rx_handler_data: data pointer that is used by rx handler
3146 *
3147 * Register a receive hander for a device. This handler will then be
3148 * called from __netif_receive_skb. A negative errno code is returned
3149 * on a failure.
3150 *
3151 * The caller must hold the rtnl_mutex.
3152 *
3153 * For a general description of rx_handler, see enum rx_handler_result.
3154 */
3155 int netdev_rx_handler_register(struct net_device *dev,
3156 rx_handler_func_t *rx_handler,
3157 void *rx_handler_data)
3158 {
3159 ASSERT_RTNL();
3160
3161 if (dev->rx_handler)
3162 return -EBUSY;
3163
3164 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3165 rcu_assign_pointer(dev->rx_handler, rx_handler);
3166
3167 return 0;
3168 }
3169 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3170
3171 /**
3172 * netdev_rx_handler_unregister - unregister receive handler
3173 * @dev: device to unregister a handler from
3174 *
3175 * Unregister a receive hander from a device.
3176 *
3177 * The caller must hold the rtnl_mutex.
3178 */
3179 void netdev_rx_handler_unregister(struct net_device *dev)
3180 {
3181
3182 ASSERT_RTNL();
3183 RCU_INIT_POINTER(dev->rx_handler, NULL);
3184 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3185 }
3186 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3187
3188 /*
3189 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3190 * the special handling of PFMEMALLOC skbs.
3191 */
3192 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3193 {
3194 switch (skb->protocol) {
3195 case __constant_htons(ETH_P_ARP):
3196 case __constant_htons(ETH_P_IP):
3197 case __constant_htons(ETH_P_IPV6):
3198 case __constant_htons(ETH_P_8021Q):
3199 return true;
3200 default:
3201 return false;
3202 }
3203 }
3204
/*
 * __netif_receive_skb - core of the receive path for one packet
 * @skb: received buffer
 *
 * Under rcu_read_lock(), in order: untags 802.1Q frames, delivers to the
 * ptype_all taps, runs ingress filtering (CONFIG_NET_CLS_ACT), VLAN
 * reception and the device rx_handler, and finally delivers to the
 * protocol handlers hashed in ptype_base.  Delivery is deferred by one
 * handler through pt_prev, so the last matching handler is called directly
 * through its ->func instead of through deliver_skb().
 *
 * Returns the result of the last delivery, -ENOMEM if skb_orphan_frags()
 * fails, or NET_RX_DROP when nobody took the packet.
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	struct net_device *null_or_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;
	/* Saved so that PF_MEMALLOC can be restored on exit. */
	unsigned long pflags = current->flags;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	/*
	 * PFMEMALLOC skbs are special, they should
	 * - be delivered to SOCK_MEMALLOC sockets only
	 * - stay away from userspace
	 * - have bounded memory usage
	 *
	 * Use PF_MEMALLOC as this saves us from propagating the allocation
	 * context down to all allocation sites.
	 */
	if (sk_memalloc_socks() && skb_pfmemalloc(skb))
		current->flags |= PF_MEMALLOC;

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		goto out;

	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

	rcu_read_lock();

	/* Re-entered whenever VLAN code or an rx_handler asks for another pass. */
another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
		skb = vlan_untag(skb);
		if (unlikely(!skb))
			goto unlock;
	}

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and skip the taps and ingress filtering. */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	/* PFMEMALLOC skbs are not shown to the ptype_all taps. */
	if (sk_memalloc_socks() && skb_pfmemalloc(skb))
		goto skip_taps;

	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

skip_taps:
#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto unlock;
ncls:
#endif

	if (sk_memalloc_socks() && skb_pfmemalloc(skb)
				&& !skb_pfmemalloc_protocol(skb))
		goto drop;

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (vlan_tx_tag_present(skb)) {
		/* Flush the pending handler before the skb may change device. */
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb, !rx_handler))
			goto another_round;
		else if (unlikely(!skb))
			goto unlock;
	}

	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			goto unlock;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			/* fall through */
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	/* deliver only exact match when indicated */
	null_or_dev = deliver_exact ? skb->dev : NULL;

	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
		     ptype->dev == orig_dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		/* Last handler: called directly rather than via deliver_skb(). */
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			ret = -ENOMEM;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

unlock:
	rcu_read_unlock();
out:
	tsk_restore_flags(current, pflags, PF_MEMALLOC);
	return ret;
}
3354
3355 /**
3356 * netif_receive_skb - process receive buffer from network
3357 * @skb: buffer to process
3358 *
3359 * netif_receive_skb() is the main receive data processing function.
3360 * It always succeeds. The buffer may be dropped during processing
3361 * for congestion control or by the protocol layers.
3362 *
3363 * This function may only be called from softirq context and interrupts
3364 * should be enabled.
3365 *
3366 * Return values (usually ignored):
3367 * NET_RX_SUCCESS: no congestion
3368 * NET_RX_DROP: packet was dropped
3369 */
3370 int netif_receive_skb(struct sk_buff *skb)
3371 {
3372 net_timestamp_check(netdev_tstamp_prequeue, skb);
3373
3374 if (skb_defer_rx_timestamp(skb))
3375 return NET_RX_SUCCESS;
3376
3377 #ifdef CONFIG_RPS
3378 if (static_key_false(&rps_needed)) {
3379 struct rps_dev_flow voidflow, *rflow = &voidflow;
3380 int cpu, ret;
3381
3382 rcu_read_lock();
3383
3384 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3385
3386 if (cpu >= 0) {
3387 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3388 rcu_read_unlock();
3389 return ret;
3390 }
3391 rcu_read_unlock();
3392 }
3393 #endif
3394 return __netif_receive_skb(skb);
3395 }
3396 EXPORT_SYMBOL(netif_receive_skb);
3397
3398 /* Network device is going away, flush any packets still pending
3399 * Called with irqs disabled.
3400 */
3401 static void flush_backlog(void *arg)
3402 {
3403 struct net_device *dev = arg;
3404 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3405 struct sk_buff *skb, *tmp;
3406
3407 rps_lock(sd);
3408 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3409 if (skb->dev == dev) {
3410 __skb_unlink(skb, &sd->input_pkt_queue);
3411 kfree_skb(skb);
3412 input_queue_head_incr(sd);
3413 }
3414 }
3415 rps_unlock(sd);
3416
3417 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3418 if (skb->dev == dev) {
3419 __skb_unlink(skb, &sd->process_queue);
3420 kfree_skb(skb);
3421 input_queue_head_incr(sd);
3422 }
3423 }
3424 }
3425
3426 static int napi_gro_complete(struct sk_buff *skb)
3427 {
3428 struct packet_type *ptype;
3429 __be16 type = skb->protocol;
3430 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3431 int err = -ENOENT;
3432
3433 if (NAPI_GRO_CB(skb)->count == 1) {
3434 skb_shinfo(skb)->gso_size = 0;
3435 goto out;
3436 }
3437
3438 rcu_read_lock();
3439 list_for_each_entry_rcu(ptype, head, list) {
3440 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3441 continue;
3442
3443 err = ptype->gro_complete(skb);
3444 break;
3445 }
3446 rcu_read_unlock();
3447
3448 if (err) {
3449 WARN_ON(&ptype->list == head);
3450 kfree_skb(skb);
3451 return NET_RX_SUCCESS;
3452 }
3453
3454 out:
3455 return netif_receive_skb(skb);
3456 }
3457
3458 inline void napi_gro_flush(struct napi_struct *napi)
3459 {
3460 struct sk_buff *skb, *next;
3461
3462 for (skb = napi->gro_list; skb; skb = next) {
3463 next = skb->next;
3464 skb->next = NULL;
3465 napi_gro_complete(skb);
3466 }
3467
3468 napi->gro_count = 0;
3469 napi->gro_list = NULL;
3470 }
3471 EXPORT_SYMBOL(napi_gro_flush);
3472
/*
 * dev_gro_receive - offer @skb to the GRO handler of its protocol
 * @napi: NAPI instance whose gro_list holds the candidate packets
 * @skb: newly received buffer
 *
 * Returns GRO_NORMAL when @skb must take the regular receive path,
 * GRO_HELD when it was put on @napi->gro_list, and GRO_MERGED or
 * GRO_MERGED_FREE (depending on NAPI_GRO_CB(skb)->free) when the
 * handler flagged it as belonging to an already held flow.
 *
 * Every outcome except the same-flow one passes through the "pull"
 * section, which makes sure the linear area covers the headers up to
 * skb_gro_offset().
 */
enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
	int same_flow;
	int mac_len;
	enum gro_result ret;

	/* No GRO when the device has it off or netpoll rx is active. */
	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
		goto normal;

	/* GSO packets and packets with a frag list are not aggregated. */
	if (skb_is_gso(skb) || skb_has_frag_list(skb))
		goto normal;

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		/* Only device-independent handlers with gro_receive qualify. */
		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
			continue;

		skb_set_network_header(skb, skb_gro_offset(skb));
		mac_len = skb->network_header - skb->mac_header;
		skb->mac_len = mac_len;
		/* Reset the per-packet GRO state before calling the handler. */
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = 0;
		NAPI_GRO_CB(skb)->free = 0;

		pp = ptype->gro_receive(&napi->gro_list, skb);
		break;
	}
	rcu_read_unlock();

	/* The walk ran off the end of the list: no handler was found. */
	if (&ptype->list == head)
		goto normal;

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

	/*
	 * A non-NULL return points at the list slot of a held packet that
	 * must be completed now: unlink it and pass it up.
	 */
	if (pp) {
		struct sk_buff *nskb = *pp;

		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		napi->gro_count--;
	}

	if (same_flow)
		goto ok;

	/* Handler asked for a flush, or the hold list is already full. */
	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
		goto normal;

	/* Start holding this packet at the front of the GRO list. */
	napi->gro_count++;
	NAPI_GRO_CB(skb)->count = 1;
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	skb->next = napi->gro_list;
	napi->gro_list = skb;
	ret = GRO_HELD;

pull:
	/*
	 * The linear area is shorter than the GRO offset: copy the missing
	 * bytes from frag0 into the tail and shrink frags[0] accordingly.
	 */
	if (skb_headlen(skb) < skb_gro_offset(skb)) {
		int grow = skb_gro_offset(skb) - skb_headlen(skb);

		BUG_ON(skb->end - skb->tail < grow);

		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

		skb->tail += grow;
		skb->data_len -= grow;

		skb_shinfo(skb)->frags[0].page_offset += grow;
		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);

		/* frags[0] is now empty: drop it and shift the rest down. */
		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
			skb_frag_unref(skb, 0);
			memmove(skb_shinfo(skb)->frags,
				skb_shinfo(skb)->frags + 1,
				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
		}
	}

ok:
	return ret;

normal:
	ret = GRO_NORMAL;
	goto pull;
}
EXPORT_SYMBOL(dev_gro_receive);
3564
3565 static inline gro_result_t
3566 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3567 {
3568 struct sk_buff *p;
3569 unsigned int maclen = skb->dev->hard_header_len;
3570
3571 for (p = napi->gro_list; p; p = p->next) {
3572 unsigned long diffs;
3573
3574 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3575 diffs |= p->vlan_tci ^ skb->vlan_tci;
3576 if (maclen == ETH_HLEN)
3577 diffs |= compare_ether_header(skb_mac_header(p),
3578 skb_gro_mac_header(skb));
3579 else if (!diffs)
3580 diffs = memcmp(skb_mac_header(p),
3581 skb_gro_mac_header(skb),
3582 maclen);
3583 NAPI_GRO_CB(p)->same_flow = !diffs;
3584 NAPI_GRO_CB(p)->flush = 0;
3585 }
3586
3587 return dev_gro_receive(napi, skb);
3588 }
3589
3590 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3591 {
3592 switch (ret) {
3593 case GRO_NORMAL:
3594 if (netif_receive_skb(skb))
3595 ret = GRO_DROP;
3596 break;
3597
3598 case GRO_DROP:
3599 kfree_skb(skb);
3600 break;
3601
3602 case GRO_MERGED_FREE:
3603 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3604 kmem_cache_free(skbuff_head_cache, skb);
3605 else
3606 __kfree_skb(skb);
3607 break;
3608
3609 case GRO_HELD:
3610 case GRO_MERGED:
3611 break;
3612 }
3613
3614 return ret;
3615 }
3616 EXPORT_SYMBOL(napi_skb_finish);
3617
3618 void skb_gro_reset_offset(struct sk_buff *skb)
3619 {
3620 NAPI_GRO_CB(skb)->data_offset = 0;
3621 NAPI_GRO_CB(skb)->frag0 = NULL;
3622 NAPI_GRO_CB(skb)->frag0_len = 0;
3623
3624 if (skb->mac_header == skb->tail &&
3625 !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3626 NAPI_GRO_CB(skb)->frag0 =
3627 skb_frag_address(&skb_shinfo(skb)->frags[0]);
3628 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3629 }
3630 }
3631 EXPORT_SYMBOL(skb_gro_reset_offset);
3632
3633 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3634 {
3635 skb_gro_reset_offset(skb);
3636
3637 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3638 }
3639 EXPORT_SYMBOL(napi_gro_receive);
3640
3641 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3642 {
3643 __skb_pull(skb, skb_headlen(skb));
3644 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3645 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3646 skb->vlan_tci = 0;
3647 skb->dev = napi->dev;
3648 skb->skb_iif = 0;
3649
3650 napi->skb = skb;
3651 }
3652
3653 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3654 {
3655 struct sk_buff *skb = napi->skb;
3656
3657 if (!skb) {
3658 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3659 if (skb)
3660 napi->skb = skb;
3661 }
3662 return skb;
3663 }
3664 EXPORT_SYMBOL(napi_get_frags);
3665
3666 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3667 gro_result_t ret)
3668 {
3669 switch (ret) {
3670 case GRO_NORMAL:
3671 case GRO_HELD:
3672 skb->protocol = eth_type_trans(skb, skb->dev);
3673
3674 if (ret == GRO_HELD)
3675 skb_gro_pull(skb, -ETH_HLEN);
3676 else if (netif_receive_skb(skb))
3677 ret = GRO_DROP;
3678 break;
3679
3680 case GRO_DROP:
3681 case GRO_MERGED_FREE:
3682 napi_reuse_skb(napi, skb);
3683 break;
3684
3685 case GRO_MERGED:
3686 break;
3687 }
3688
3689 return ret;
3690 }
3691 EXPORT_SYMBOL(napi_frags_finish);
3692
3693 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3694 {
3695 struct sk_buff *skb = napi->skb;
3696 struct ethhdr *eth;
3697 unsigned int hlen;
3698 unsigned int off;
3699
3700 napi->skb = NULL;
3701
3702 skb_reset_mac_header(skb);
3703 skb_gro_reset_offset(skb);
3704
3705 off = skb_gro_offset(skb);
3706 hlen = off + sizeof(*eth);
3707 eth = skb_gro_header_fast(skb, off);
3708 if (skb_gro_header_hard(skb, hlen)) {
3709 eth = skb_gro_header_slow(skb, hlen, off);
3710 if (unlikely(!eth)) {
3711 napi_reuse_skb(napi, skb);
3712 skb = NULL;
3713 goto out;
3714 }
3715 }
3716
3717 skb_gro_pull(skb, sizeof(*eth));
3718
3719 /*
3720 * This works because the only protocols we care about don't require
3721 * special handling. We'll fix it up properly at the end.
3722 */
3723 skb->protocol = eth->h_proto;
3724
3725 out:
3726 return skb;
3727 }
3728
3729 gro_result_t napi_gro_frags(struct napi_struct *napi)
3730 {
3731 struct sk_buff *skb = napi_frags_skb(napi);
3732
3733 if (!skb)
3734 return GRO_DROP;
3735
3736 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3737 }
3738 EXPORT_SYMBOL(napi_gro_frags);
3739
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: entered with local irqs disabled, returns with them enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (!pending) {
		local_irq_enable();
		return;
	}

	/* Take over the whole list before irqs come back on. */
	sd->rps_ipi_list = NULL;
	local_irq_enable();

	/* Send pending IPI's to kick RPS processing on remote cpus. */
	while (pending) {
		struct softnet_data *target = pending;

		/* Fetch the successor before the IPI is issued. */
		pending = target->rps_ipi_next;

		if (cpu_online(target->cpu))
			__smp_call_function_single(target->cpu,
						   &target->csd, 0);
	}
#else
	local_irq_enable();
#endif
}
3767
/*
 * process_backlog - NAPI poll handler of the per-cpu backlog
 * @napi: the backlog napi_struct embedded in a softnet_data
 * @quota: maximum number of packets to process in this call
 *
 * Feeds packets from sd->process_queue to __netif_receive_skb() and
 * refills that queue from sd->input_pkt_queue when it runs empty.
 * Returns the number of packets processed.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

#ifdef CONFIG_RPS
	/* Check if we have pending ipi, its better to send them now,
	 * not waiting net_rx_action() end.
	 */
	if (sd->rps_ipi_list) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}
#endif
	napi->weight = weight_p;
	local_irq_disable();
	while (work < quota) {
		struct sk_buff *skb;
		unsigned int qlen;

		/*
		 * Dequeue with irqs off, but deliver each packet with
		 * irqs on.
		 */
		while ((skb = __skb_dequeue(&sd->process_queue))) {
			local_irq_enable();
			__netif_receive_skb(skb);
			local_irq_disable();
			input_queue_head_incr(sd);
			/* Quota used up: leave with irqs enabled again. */
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}

		/* process_queue is empty: move over whatever was queued. */
		rps_lock(sd);
		qlen = skb_queue_len(&sd->input_pkt_queue);
		if (qlen)
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);

		if (qlen < quota - work) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
			 * we can use a plain write instead of clear_bit(),
			 * and we dont need an smp_mb() memory barrier.
			 */
			list_del(&napi->poll_list);
			napi->state = 0;

			/* Shrink the quota so the loop ends after these packets. */
			quota = work + qlen;
		}
		rps_unlock(sd);
	}
	local_irq_enable();

	return work;
}
3824
3825 /**
3826 * __napi_schedule - schedule for receive
3827 * @n: entry to schedule
3828 *
3829 * The entry's receive function will be scheduled to run
3830 */
3831 void __napi_schedule(struct napi_struct *n)
3832 {
3833 unsigned long flags;
3834
3835 local_irq_save(flags);
3836 ____napi_schedule(&__get_cpu_var(softnet_data), n);
3837 local_irq_restore(flags);
3838 }
3839 EXPORT_SYMBOL(__napi_schedule);
3840
3841 void __napi_complete(struct napi_struct *n)
3842 {
3843 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3844 BUG_ON(n->gro_list);
3845
3846 list_del(&n->poll_list);
3847 smp_mb__before_clear_bit();
3848 clear_bit(NAPI_STATE_SCHED, &n->state);
3849 }
3850 EXPORT_SYMBOL(__napi_complete);
3851
3852 void napi_complete(struct napi_struct *n)
3853 {
3854 unsigned long flags;
3855
3856 /*
3857 * don't let napi dequeue from the cpu poll list
3858 * just in case its running on a different cpu
3859 */
3860 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3861 return;
3862
3863 napi_gro_flush(n);
3864 local_irq_save(flags);
3865 __napi_complete(n);
3866 local_irq_restore(flags);
3867 }
3868 EXPORT_SYMBOL(napi_complete);
3869
3870 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3871 int (*poll)(struct napi_struct *, int), int weight)
3872 {
3873 INIT_LIST_HEAD(&napi->poll_list);
3874 napi->gro_count = 0;
3875 napi->gro_list = NULL;
3876 napi->skb = NULL;
3877 napi->poll = poll;
3878 napi->weight = weight;
3879 list_add(&napi->dev_list, &dev->napi_list);
3880 napi->dev = dev;
3881 #ifdef CONFIG_NETPOLL
3882 spin_lock_init(&napi->poll_lock);
3883 napi->poll_owner = -1;
3884 #endif
3885 set_bit(NAPI_STATE_SCHED, &napi->state);
3886 }
3887 EXPORT_SYMBOL(netif_napi_add);
3888
3889 void netif_napi_del(struct napi_struct *napi)
3890 {
3891 struct sk_buff *skb, *next;
3892
3893 list_del_init(&napi->dev_list);
3894 napi_free_frags(napi);
3895
3896 for (skb = napi->gro_list; skb; skb = next) {
3897 next = skb->next;
3898 skb->next = NULL;
3899 kfree_skb(skb);
3900 }
3901
3902 napi->gro_list = NULL;
3903 napi->gro_count = 0;
3904 }
3905 EXPORT_SYMBOL(netif_napi_del);
3906
/*
 * net_rx_action - softirq handler polling the NAPI instances queued on
 * this cpu's softnet_data poll list
 * @h: softirq action (not used in the body)
 *
 * Polls the head of sd->poll_list until the list is empty, the packet
 * budget (netdev_budget) is used up, or the time limit has passed.  In
 * the latter two cases time_squeeze is bumped and NET_RX_SOFTIRQ is
 * raised again.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	void *have;

	local_irq_disable();

	while (!list_empty(&sd->poll_list)) {
		struct napi_struct *n;
		int work, weight;

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies, which gives
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this
		 * access is safe because interrupts can only add new
		 * entries to the tail of this list, and only ->poll()
		 * calls can remove this head entry from the list.
		 */
		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi().  Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
			work = n->poll(n, weight);
			trace_napi_poll(n);
		}

		/* A poll handler must not report more work than its weight. */
		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n))) {
				/* napi_complete() is called with irqs enabled. */
				local_irq_enable();
				napi_complete(n);
				local_irq_disable();
			} else
				list_move_tail(&n->poll_list, &sd->poll_list);
		}

		netpoll_poll_unlock(have);
	}
out:
	/* Sends pending RPS IPIs, if any, and re-enables local irqs. */
	net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	dma_issue_pending_all();
#endif

	return;

softnet_break:
	/* Out of budget or time: count the squeeze and ask to run again. */
	sd->time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}
3992
3993 static gifconf_func_t *gifconf_list[NPROTO];
3994
3995 /**
3996 * register_gifconf - register a SIOCGIF handler
3997 * @family: Address family
3998 * @gifconf: Function handler
3999 *
4000 * Register protocol dependent address dumping routines. The handler
4001 * that is passed must not be freed or reused until it has been replaced
4002 * by another handler.
4003 */
4004 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
4005 {
4006 if (family >= NPROTO)
4007 return -EINVAL;
4008 gifconf_list[family] = gifconf;
4009 return 0;
4010 }
4011 EXPORT_SYMBOL(register_gifconf);
4012
4013
4014 /*
4015 * Map an interface index to its name (SIOCGIFNAME)
4016 */
4017
4018 /*
4019 * We need this ioctl for efficient implementation of the
4020 * if_indextoname() function required by the IPv6 API. Without
4021 * it, we would have to search all the interfaces to find a
4022 * match. --pb
4023 */
4024
4025 static int dev_ifname(struct net *net, struct ifreq __user *arg)
4026 {
4027 struct net_device *dev;
4028 struct ifreq ifr;
4029
4030 /*
4031 * Fetch the caller's info block.
4032 */
4033
4034 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4035 return -EFAULT;
4036
4037 rcu_read_lock();
4038 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4039 if (!dev) {
4040 rcu_read_unlock();
4041 return -ENODEV;
4042 }
4043
4044 strcpy(ifr.ifr_name, dev->name);
4045 rcu_read_unlock();
4046
4047 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4048 return -EFAULT;
4049 return 0;
4050 }
4051
4052 /*
4053 * Perform a SIOCGIFCONF call. This structure will change
4054 * size eventually, and there is nothing I can do about it.
4055 * Thus we will need a 'compatibility mode'.
4056 */
4057
4058 static int dev_ifconf(struct net *net, char __user *arg)
4059 {
4060 struct ifconf ifc;
4061 struct net_device *dev;
4062 char __user *pos;
4063 int len;
4064 int total;
4065 int i;
4066
4067 /*
4068 * Fetch the caller's info block.
4069 */
4070
4071 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4072 return -EFAULT;
4073
4074 pos = ifc.ifc_buf;
4075 len = ifc.ifc_len;
4076
4077 /*
4078 * Loop over the interfaces, and write an info block for each.
4079 */
4080
4081 total = 0;
4082 for_each_netdev(net, dev) {
4083 for (i = 0; i < NPROTO; i++) {
4084 if (gifconf_list[i]) {
4085 int done;
4086 if (!pos)
4087 done = gifconf_list[i](dev, NULL, 0);
4088 else
4089 done = gifconf_list[i](dev, pos + total,
4090 len - total);
4091 if (done < 0)
4092 return -EFAULT;
4093 total += done;
4094 }
4095 }
4096 }
4097
4098 /*
4099 * All done. Write the updated control block back to the caller.
4100 */
4101 ifc.ifc_len = total;
4102
4103 /*
4104 * Both BSD and Solaris return 0 here, so we do too.
4105 */
4106 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4107 }
4108
4109 #ifdef CONFIG_PROC_FS
4110
/*
 * A seq_file position packs a hash bucket number into the high bits
 * and an offset into the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (31 - NETDEV_HASHBITS)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) (((b) << BUCKET_SPACE) | (o))
4116
4117 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4118 {
4119 struct net *net = seq_file_net(seq);
4120 struct net_device *dev;
4121 struct hlist_node *p;
4122 struct hlist_head *h;
4123 unsigned int count = 0, offset = get_offset(*pos);
4124
4125 h = &net->dev_name_head[get_bucket(*pos)];
4126 hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4127 if (++count == offset)
4128 return dev;
4129 }
4130
4131 return NULL;
4132 }
4133
4134 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4135 {
4136 struct net_device *dev;
4137 unsigned int bucket;
4138
4139 do {
4140 dev = dev_from_same_bucket(seq, pos);
4141 if (dev)
4142 return dev;
4143
4144 bucket = get_bucket(*pos) + 1;
4145 *pos = set_bucket_offset(bucket, 1);
4146 } while (bucket < NETDEV_HASHENTRIES);
4147
4148 return NULL;
4149 }
4150
4151 /*
4152 * This is invoked by the /proc filesystem handler to display a device
4153 * in detail.
4154 */
4155 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4156 __acquires(RCU)
4157 {
4158 rcu_read_lock();
4159 if (!*pos)
4160 return SEQ_START_TOKEN;
4161
4162 if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4163 return NULL;
4164
4165 return dev_from_bucket(seq, pos);
4166 }
4167
4168 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4169 {
4170 ++*pos;
4171 return dev_from_bucket(seq, pos);
4172 }
4173
4174 void dev_seq_stop(struct seq_file *seq, void *v)
4175 __releases(RCU)
4176 {
4177 rcu_read_unlock();
4178 }
4179
4180 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4181 {
4182 struct rtnl_link_stats64 temp;
4183 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4184
4185 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4186 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4187 dev->name, stats->rx_bytes, stats->rx_packets,
4188 stats->rx_errors,
4189 stats->rx_dropped + stats->rx_missed_errors,
4190 stats->rx_fifo_errors,
4191 stats->rx_length_errors + stats->rx_over_errors +
4192 stats->rx_crc_errors + stats->rx_frame_errors,
4193 stats->rx_compressed, stats->multicast,
4194 stats->tx_bytes, stats->tx_packets,
4195 stats->tx_errors, stats->tx_dropped,
4196 stats->tx_fifo_errors, stats->collisions,
4197 stats->tx_carrier_errors +
4198 stats->tx_aborted_errors +
4199 stats->tx_window_errors +
4200 stats->tx_heartbeat_errors,
4201 stats->tx_compressed);
4202 }
4203
4204 /*
4205 * Called from the PROCfs module. This now uses the new arbitrary sized
4206 * /proc/net interface to create /proc/net/dev
4207 */
4208 static int dev_seq_show(struct seq_file *seq, void *v)
4209 {
4210 if (v == SEQ_START_TOKEN)
4211 seq_puts(seq, "Inter-| Receive "
4212 " | Transmit\n"
4213 " face |bytes packets errs drop fifo frame "
4214 "compressed multicast|bytes packets errs "
4215 "drop fifo colls carrier compressed\n");
4216 else
4217 dev_seq_printf_stats(seq, v);
4218 return 0;
4219 }
4220
4221 static struct softnet_data *softnet_get_online(loff_t *pos)
4222 {
4223 struct softnet_data *sd = NULL;
4224
4225 while (*pos < nr_cpu_ids)
4226 if (cpu_online(*pos)) {
4227 sd = &per_cpu(softnet_data, *pos);
4228 break;
4229 } else
4230 ++*pos;
4231 return sd;
4232 }
4233
4234 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4235 {
4236 return softnet_get_online(pos);
4237 }
4238
4239 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4240 {
4241 ++*pos;
4242 return softnet_get_online(pos);
4243 }
4244
/* seq_file stop callback: there is nothing to release. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4248
4249 static int softnet_seq_show(struct seq_file *seq, void *v)
4250 {
4251 struct softnet_data *sd = v;
4252
4253 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4254 sd->processed, sd->dropped, sd->time_squeeze, 0,
4255 0, 0, 0, 0, /* was fastroute */
4256 sd->cpu_collision, sd->received_rps);
4257 return 0;
4258 }
4259
4260 static const struct seq_operations dev_seq_ops = {
4261 .start = dev_seq_start,
4262 .next = dev_seq_next,
4263 .stop = dev_seq_stop,
4264 .show = dev_seq_show,
4265 };
4266
4267 static int dev_seq_open(struct inode *inode, struct file *file)
4268 {
4269 return seq_open_net(inode, file, &dev_seq_ops,
4270 sizeof(struct seq_net_private));
4271 }
4272
4273 static const struct file_operations dev_seq_fops = {
4274 .owner = THIS_MODULE,
4275 .open = dev_seq_open,
4276 .read = seq_read,
4277 .llseek = seq_lseek,
4278 .release = seq_release_net,
4279 };
4280
4281 static const struct seq_operations softnet_seq_ops = {
4282 .start = softnet_seq_start,
4283 .next = softnet_seq_next,
4284 .stop = softnet_seq_stop,
4285 .show = softnet_seq_show,
4286 };
4287
4288 static int softnet_seq_open(struct inode *inode, struct file *file)
4289 {
4290 return seq_open(file, &softnet_seq_ops);
4291 }
4292
4293 static const struct file_operations softnet_seq_fops = {
4294 .owner = THIS_MODULE,
4295 .open = softnet_seq_open,
4296 .read = seq_read,
4297 .llseek = seq_lseek,
4298 .release = seq_release,
4299 };
4300
4301 static void *ptype_get_idx(loff_t pos)
4302 {
4303 struct packet_type *pt = NULL;
4304 loff_t i = 0;
4305 int t;
4306
4307 list_for_each_entry_rcu(pt, &ptype_all, list) {
4308 if (i == pos)
4309 return pt;
4310 ++i;
4311 }
4312
4313 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4314 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4315 if (i == pos)
4316 return pt;
4317 ++i;
4318 }
4319 }
4320 return NULL;
4321 }
4322
4323 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4324 __acquires(RCU)
4325 {
4326 rcu_read_lock();
4327 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4328 }
4329
4330 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4331 {
4332 struct packet_type *pt;
4333 struct list_head *nxt;
4334 int hash;
4335
4336 ++*pos;
4337 if (v == SEQ_START_TOKEN)
4338 return ptype_get_idx(0);
4339
4340 pt = v;
4341 nxt = pt->list.next;
4342 if (pt->type == htons(ETH_P_ALL)) {
4343 if (nxt != &ptype_all)
4344 goto found;
4345 hash = 0;
4346 nxt = ptype_base[0].next;
4347 } else
4348 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4349
4350 while (nxt == &ptype_base[hash]) {
4351 if (++hash >= PTYPE_HASH_SIZE)
4352 return NULL;
4353 nxt = ptype_base[hash].next;
4354 }
4355 found:
4356 return list_entry(nxt, struct packet_type, list);
4357 }
4358
4359 static void ptype_seq_stop(struct seq_file *seq, void *v)
4360 __releases(RCU)
4361 {
4362 rcu_read_unlock();
4363 }
4364
4365 static int ptype_seq_show(struct seq_file *seq, void *v)
4366 {
4367 struct packet_type *pt = v;
4368
4369 if (v == SEQ_START_TOKEN)
4370 seq_puts(seq, "Type Device Function\n");
4371 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4372 if (pt->type == htons(ETH_P_ALL))
4373 seq_puts(seq, "ALL ");
4374 else
4375 seq_printf(seq, "%04x", ntohs(pt->type));
4376
4377 seq_printf(seq, " %-8s %pF\n",
4378 pt->dev ? pt->dev->name : "", pt->func);
4379 }
4380
4381 return 0;
4382 }
4383
4384 static const struct seq_operations ptype_seq_ops = {
4385 .start = ptype_seq_start,
4386 .next = ptype_seq_next,
4387 .stop = ptype_seq_stop,
4388 .show = ptype_seq_show,
4389 };
4390
4391 static int ptype_seq_open(struct inode *inode, struct file *file)
4392 {
4393 return seq_open_net(inode, file, &ptype_seq_ops,
4394 sizeof(struct seq_net_private));
4395 }
4396
4397 static const struct file_operations ptype_seq_fops = {
4398 .owner = THIS_MODULE,
4399 .open = ptype_seq_open,
4400 .read = seq_read,
4401 .llseek = seq_lseek,
4402 .release = seq_release_net,
4403 };
4404
4405
4406 static int __net_init dev_proc_net_init(struct net *net)
4407 {
4408 int rc = -ENOMEM;
4409
4410 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4411 goto out;
4412 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4413 goto out_dev;
4414 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4415 goto out_softnet;
4416
4417 if (wext_proc_init(net))
4418 goto out_ptype;
4419 rc = 0;
4420 out:
4421 return rc;
4422 out_ptype:
4423 proc_net_remove(net, "ptype");
4424 out_softnet:
4425 proc_net_remove(net, "softnet_stat");
4426 out_dev:
4427 proc_net_remove(net, "dev");
4428 goto out;
4429 }
4430
4431 static void __net_exit dev_proc_net_exit(struct net *net)
4432 {
4433 wext_proc_exit(net);
4434
4435 proc_net_remove(net, "ptype");
4436 proc_net_remove(net, "softnet_stat");
4437 proc_net_remove(net, "dev");
4438 }
4439
4440 static struct pernet_operations __net_initdata dev_proc_ops = {
4441 .init = dev_proc_net_init,
4442 .exit = dev_proc_net_exit,
4443 };
4444
4445 static int __init dev_proc_init(void)
4446 {
4447 return register_pernet_subsys(&dev_proc_ops);
4448 }
4449 #else
4450 #define dev_proc_init() 0
4451 #endif /* CONFIG_PROC_FS */
4452
4453
4454 /**
4455 * netdev_set_master - set up master pointer
4456 * @slave: slave device
4457 * @master: new master device
4458 *
4459 * Changes the master device of the slave. Pass %NULL to break the
4460 * bonding. The caller must hold the RTNL semaphore. On a failure
4461 * a negative errno code is returned. On success the reference counts
4462 * are adjusted and the function returns zero.
4463 */
4464 int netdev_set_master(struct net_device *slave, struct net_device *master)
4465 {
4466 struct net_device *old = slave->master;
4467
4468 ASSERT_RTNL();
4469
4470 if (master) {
4471 if (old)
4472 return -EBUSY;
4473 dev_hold(master);
4474 }
4475
4476 slave->master = master;
4477
4478 if (old)
4479 dev_put(old);
4480 return 0;
4481 }
4482 EXPORT_SYMBOL(netdev_set_master);
4483
4484 /**
4485 * netdev_set_bond_master - set up bonding master/slave pair
4486 * @slave: slave device
4487 * @master: new master device
4488 *
4489 * Changes the master device of the slave. Pass %NULL to break the
4490 * bonding. The caller must hold the RTNL semaphore. On a failure
4491 * a negative errno code is returned. On success %RTM_NEWLINK is sent
4492 * to the routing socket and the function returns zero.
4493 */
4494 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4495 {
4496 int err;
4497
4498 ASSERT_RTNL();
4499
4500 err = netdev_set_master(slave, master);
4501 if (err)
4502 return err;
4503 if (master)
4504 slave->flags |= IFF_SLAVE;
4505 else
4506 slave->flags &= ~IFF_SLAVE;
4507
4508 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4509 return 0;
4510 }
4511 EXPORT_SYMBOL(netdev_set_bond_master);
4512
4513 static void dev_change_rx_flags(struct net_device *dev, int flags)
4514 {
4515 const struct net_device_ops *ops = dev->netdev_ops;
4516
4517 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4518 ops->ndo_change_rx_flags(dev, flags);
4519 }
4520
4521 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4522 {
4523 unsigned int old_flags = dev->flags;
4524 uid_t uid;
4525 gid_t gid;
4526
4527 ASSERT_RTNL();
4528
4529 dev->flags |= IFF_PROMISC;
4530 dev->promiscuity += inc;
4531 if (dev->promiscuity == 0) {
4532 /*
4533 * Avoid overflow.
4534 * If inc causes overflow, untouch promisc and return error.
4535 */
4536 if (inc < 0)
4537 dev->flags &= ~IFF_PROMISC;
4538 else {
4539 dev->promiscuity -= inc;
4540 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4541 dev->name);
4542 return -EOVERFLOW;
4543 }
4544 }
4545 if (dev->flags != old_flags) {
4546 pr_info("device %s %s promiscuous mode\n",
4547 dev->name,
4548 dev->flags & IFF_PROMISC ? "entered" : "left");
4549 if (audit_enabled) {
4550 current_uid_gid(&uid, &gid);
4551 audit_log(current->audit_context, GFP_ATOMIC,
4552 AUDIT_ANOM_PROMISCUOUS,
4553 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4554 dev->name, (dev->flags & IFF_PROMISC),
4555 (old_flags & IFF_PROMISC),
4556 audit_get_loginuid(current),
4557 uid, gid,
4558 audit_get_sessionid(current));
4559 }
4560
4561 dev_change_rx_flags(dev, IFF_PROMISC);
4562 }
4563 return 0;
4564 }
4565
4566 /**
4567 * dev_set_promiscuity - update promiscuity count on a device
4568 * @dev: device
4569 * @inc: modifier
4570 *
4571 * Add or remove promiscuity from a device. While the count in the device
4572 * remains above zero the interface remains promiscuous. Once it hits zero
4573 * the device reverts back to normal filtering operation. A negative inc
4574 * value is used to drop promiscuity on the device.
4575 * Return 0 if successful or a negative errno code on error.
4576 */
4577 int dev_set_promiscuity(struct net_device *dev, int inc)
4578 {
4579 unsigned int old_flags = dev->flags;
4580 int err;
4581
4582 err = __dev_set_promiscuity(dev, inc);
4583 if (err < 0)
4584 return err;
4585 if (dev->flags != old_flags)
4586 dev_set_rx_mode(dev);
4587 return err;
4588 }
4589 EXPORT_SYMBOL(dev_set_promiscuity);
4590
4591 /**
4592 * dev_set_allmulti - update allmulti count on a device
4593 * @dev: device
4594 * @inc: modifier
4595 *
4596 * Add or remove reception of all multicast frames to a device. While the
4597 * count in the device remains above zero the interface remains listening
4598 * to all interfaces. Once it hits zero the device reverts back to normal
4599 * filtering operation. A negative @inc value is used to drop the counter
4600 * when releasing a resource needing all multicasts.
4601 * Return 0 if successful or a negative errno code on error.
4602 */
4603
4604 int dev_set_allmulti(struct net_device *dev, int inc)
4605 {
4606 unsigned int old_flags = dev->flags;
4607
4608 ASSERT_RTNL();
4609
4610 dev->flags |= IFF_ALLMULTI;
4611 dev->allmulti += inc;
4612 if (dev->allmulti == 0) {
4613 /*
4614 * Avoid overflow.
4615 * If inc causes overflow, untouch allmulti and return error.
4616 */
4617 if (inc < 0)
4618 dev->flags &= ~IFF_ALLMULTI;
4619 else {
4620 dev->allmulti -= inc;
4621 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4622 dev->name);
4623 return -EOVERFLOW;
4624 }
4625 }
4626 if (dev->flags ^ old_flags) {
4627 dev_change_rx_flags(dev, IFF_ALLMULTI);
4628 dev_set_rx_mode(dev);
4629 }
4630 return 0;
4631 }
4632 EXPORT_SYMBOL(dev_set_allmulti);
4633
4634 /*
4635 * Upload unicast and multicast address lists to device and
4636 * configure RX filtering. When the device doesn't support unicast
4637 * filtering it is put in promiscuous mode while unicast addresses
4638 * are present.
4639 */
4640 void __dev_set_rx_mode(struct net_device *dev)
4641 {
4642 const struct net_device_ops *ops = dev->netdev_ops;
4643
4644 /* dev_open will call this function so the list will stay sane. */
4645 if (!(dev->flags&IFF_UP))
4646 return;
4647
4648 if (!netif_device_present(dev))
4649 return;
4650
4651 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4652 /* Unicast addresses changes may only happen under the rtnl,
4653 * therefore calling __dev_set_promiscuity here is safe.
4654 */
4655 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4656 __dev_set_promiscuity(dev, 1);
4657 dev->uc_promisc = true;
4658 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4659 __dev_set_promiscuity(dev, -1);
4660 dev->uc_promisc = false;
4661 }
4662 }
4663
4664 if (ops->ndo_set_rx_mode)
4665 ops->ndo_set_rx_mode(dev);
4666 }
4667
/*
 *	Locked wrapper: run __dev_set_rx_mode() on @dev between
 *	netif_addr_lock_bh() and netif_addr_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
4674
4675 /**
4676 * dev_get_flags - get flags reported to userspace
4677 * @dev: device
4678 *
4679 * Get the combination of flag bits exported through APIs to userspace.
4680 */
4681 unsigned int dev_get_flags(const struct net_device *dev)
4682 {
4683 unsigned int flags;
4684
4685 flags = (dev->flags & ~(IFF_PROMISC |
4686 IFF_ALLMULTI |
4687 IFF_RUNNING |
4688 IFF_LOWER_UP |
4689 IFF_DORMANT)) |
4690 (dev->gflags & (IFF_PROMISC |
4691 IFF_ALLMULTI));
4692
4693 if (netif_running(dev)) {
4694 if (netif_oper_up(dev))
4695 flags |= IFF_RUNNING;
4696 if (netif_carrier_ok(dev))
4697 flags |= IFF_LOWER_UP;
4698 if (netif_dormant(dev))
4699 flags |= IFF_DORMANT;
4700 }
4701
4702 return flags;
4703 }
4704 EXPORT_SYMBOL(dev_get_flags);
4705
4706 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4707 {
4708 unsigned int old_flags = dev->flags;
4709 int ret;
4710
4711 ASSERT_RTNL();
4712
4713 /*
4714 * Set the flags on our device.
4715 */
4716
4717 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4718 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4719 IFF_AUTOMEDIA)) |
4720 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4721 IFF_ALLMULTI));
4722
4723 /*
4724 * Load in the correct multicast list now the flags have changed.
4725 */
4726
4727 if ((old_flags ^ flags) & IFF_MULTICAST)
4728 dev_change_rx_flags(dev, IFF_MULTICAST);
4729
4730 dev_set_rx_mode(dev);
4731
4732 /*
4733 * Have we downed the interface. We handle IFF_UP ourselves
4734 * according to user attempts to set it, rather than blindly
4735 * setting it.
4736 */
4737
4738 ret = 0;
4739 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4740 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4741
4742 if (!ret)
4743 dev_set_rx_mode(dev);
4744 }
4745
4746 if ((flags ^ dev->gflags) & IFF_PROMISC) {
4747 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4748
4749 dev->gflags ^= IFF_PROMISC;
4750 dev_set_promiscuity(dev, inc);
4751 }
4752
4753 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4754 is important. Some (broken) drivers set IFF_PROMISC, when
4755 IFF_ALLMULTI is requested not asking us and not reporting.
4756 */
4757 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4758 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4759
4760 dev->gflags ^= IFF_ALLMULTI;
4761 dev_set_allmulti(dev, inc);
4762 }
4763
4764 return ret;
4765 }
4766
4767 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4768 {
4769 unsigned int changes = dev->flags ^ old_flags;
4770
4771 if (changes & IFF_UP) {
4772 if (dev->flags & IFF_UP)
4773 call_netdevice_notifiers(NETDEV_UP, dev);
4774 else
4775 call_netdevice_notifiers(NETDEV_DOWN, dev);
4776 }
4777
4778 if (dev->flags & IFF_UP &&
4779 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4780 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4781 }
4782
4783 /**
4784 * dev_change_flags - change device settings
4785 * @dev: device
4786 * @flags: device state flags
4787 *
4788 * Change settings on device based state flags. The flags are
4789 * in the userspace exported format.
4790 */
4791 int dev_change_flags(struct net_device *dev, unsigned int flags)
4792 {
4793 int ret;
4794 unsigned int changes, old_flags = dev->flags;
4795
4796 ret = __dev_change_flags(dev, flags);
4797 if (ret < 0)
4798 return ret;
4799
4800 changes = old_flags ^ dev->flags;
4801 if (changes)
4802 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4803
4804 __dev_notify_flags(dev, old_flags);
4805 return ret;
4806 }
4807 EXPORT_SYMBOL(dev_change_flags);
4808
4809 /**
4810 * dev_set_mtu - Change maximum transfer unit
4811 * @dev: device
4812 * @new_mtu: new transfer unit
4813 *
4814 * Change the maximum transfer size of the network device.
4815 */
4816 int dev_set_mtu(struct net_device *dev, int new_mtu)
4817 {
4818 const struct net_device_ops *ops = dev->netdev_ops;
4819 int err;
4820
4821 if (new_mtu == dev->mtu)
4822 return 0;
4823
4824 /* MTU must be positive. */
4825 if (new_mtu < 0)
4826 return -EINVAL;
4827
4828 if (!netif_device_present(dev))
4829 return -ENODEV;
4830
4831 err = 0;
4832 if (ops->ndo_change_mtu)
4833 err = ops->ndo_change_mtu(dev, new_mtu);
4834 else
4835 dev->mtu = new_mtu;
4836
4837 if (!err && dev->flags & IFF_UP)
4838 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4839 return err;
4840 }
4841 EXPORT_SYMBOL(dev_set_mtu);
4842
4843 /**
4844 * dev_set_group - Change group this device belongs to
4845 * @dev: device
4846 * @new_group: group this device should belong to
4847 */
4848 void dev_set_group(struct net_device *dev, int new_group)
4849 {
4850 dev->group = new_group;
4851 }
4852 EXPORT_SYMBOL(dev_set_group);
4853
4854 /**
4855 * dev_set_mac_address - Change Media Access Control Address
4856 * @dev: device
4857 * @sa: new address
4858 *
4859 * Change the hardware (MAC) address of the device
4860 */
4861 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4862 {
4863 const struct net_device_ops *ops = dev->netdev_ops;
4864 int err;
4865
4866 if (!ops->ndo_set_mac_address)
4867 return -EOPNOTSUPP;
4868 if (sa->sa_family != dev->type)
4869 return -EINVAL;
4870 if (!netif_device_present(dev))
4871 return -ENODEV;
4872 err = ops->ndo_set_mac_address(dev, sa);
4873 if (!err)
4874 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4875 add_device_randomness(dev->dev_addr, dev->addr_len);
4876 return err;
4877 }
4878 EXPORT_SYMBOL(dev_set_mac_address);
4879
4880 /*
4881 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4882 */
4883 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4884 {
4885 int err;
4886 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4887
4888 if (!dev)
4889 return -ENODEV;
4890
4891 switch (cmd) {
4892 case SIOCGIFFLAGS: /* Get interface flags */
4893 ifr->ifr_flags = (short) dev_get_flags(dev);
4894 return 0;
4895
4896 case SIOCGIFMETRIC: /* Get the metric on the interface
4897 (currently unused) */
4898 ifr->ifr_metric = 0;
4899 return 0;
4900
4901 case SIOCGIFMTU: /* Get the MTU of a device */
4902 ifr->ifr_mtu = dev->mtu;
4903 return 0;
4904
4905 case SIOCGIFHWADDR:
4906 if (!dev->addr_len)
4907 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4908 else
4909 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4910 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4911 ifr->ifr_hwaddr.sa_family = dev->type;
4912 return 0;
4913
4914 case SIOCGIFSLAVE:
4915 err = -EINVAL;
4916 break;
4917
4918 case SIOCGIFMAP:
4919 ifr->ifr_map.mem_start = dev->mem_start;
4920 ifr->ifr_map.mem_end = dev->mem_end;
4921 ifr->ifr_map.base_addr = dev->base_addr;
4922 ifr->ifr_map.irq = dev->irq;
4923 ifr->ifr_map.dma = dev->dma;
4924 ifr->ifr_map.port = dev->if_port;
4925 return 0;
4926
4927 case SIOCGIFINDEX:
4928 ifr->ifr_ifindex = dev->ifindex;
4929 return 0;
4930
4931 case SIOCGIFTXQLEN:
4932 ifr->ifr_qlen = dev->tx_queue_len;
4933 return 0;
4934
4935 default:
4936 /* dev_ioctl() should ensure this case
4937 * is never reached
4938 */
4939 WARN_ON(1);
4940 err = -ENOTTY;
4941 break;
4942
4943 }
4944 return err;
4945 }
4946
4947 /*
4948 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4949 */
4950 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4951 {
4952 int err;
4953 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4954 const struct net_device_ops *ops;
4955
4956 if (!dev)
4957 return -ENODEV;
4958
4959 ops = dev->netdev_ops;
4960
4961 switch (cmd) {
4962 case SIOCSIFFLAGS: /* Set interface flags */
4963 return dev_change_flags(dev, ifr->ifr_flags);
4964
4965 case SIOCSIFMETRIC: /* Set the metric on the interface
4966 (currently unused) */
4967 return -EOPNOTSUPP;
4968
4969 case SIOCSIFMTU: /* Set the MTU of a device */
4970 return dev_set_mtu(dev, ifr->ifr_mtu);
4971
4972 case SIOCSIFHWADDR:
4973 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4974
4975 case SIOCSIFHWBROADCAST:
4976 if (ifr->ifr_hwaddr.sa_family != dev->type)
4977 return -EINVAL;
4978 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4979 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4980 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4981 return 0;
4982
4983 case SIOCSIFMAP:
4984 if (ops->ndo_set_config) {
4985 if (!netif_device_present(dev))
4986 return -ENODEV;
4987 return ops->ndo_set_config(dev, &ifr->ifr_map);
4988 }
4989 return -EOPNOTSUPP;
4990
4991 case SIOCADDMULTI:
4992 if (!ops->ndo_set_rx_mode ||
4993 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4994 return -EINVAL;
4995 if (!netif_device_present(dev))
4996 return -ENODEV;
4997 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4998
4999 case SIOCDELMULTI:
5000 if (!ops->ndo_set_rx_mode ||
5001 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5002 return -EINVAL;
5003 if (!netif_device_present(dev))
5004 return -ENODEV;
5005 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5006
5007 case SIOCSIFTXQLEN:
5008 if (ifr->ifr_qlen < 0)
5009 return -EINVAL;
5010 dev->tx_queue_len = ifr->ifr_qlen;
5011 return 0;
5012
5013 case SIOCSIFNAME:
5014 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5015 return dev_change_name(dev, ifr->ifr_newname);
5016
5017 case SIOCSHWTSTAMP:
5018 err = net_hwtstamp_validate(ifr);
5019 if (err)
5020 return err;
5021 /* fall through */
5022
5023 /*
5024 * Unknown or private ioctl
5025 */
5026 default:
5027 if ((cmd >= SIOCDEVPRIVATE &&
5028 cmd <= SIOCDEVPRIVATE + 15) ||
5029 cmd == SIOCBONDENSLAVE ||
5030 cmd == SIOCBONDRELEASE ||
5031 cmd == SIOCBONDSETHWADDR ||
5032 cmd == SIOCBONDSLAVEINFOQUERY ||
5033 cmd == SIOCBONDINFOQUERY ||
5034 cmd == SIOCBONDCHANGEACTIVE ||
5035 cmd == SIOCGMIIPHY ||
5036 cmd == SIOCGMIIREG ||
5037 cmd == SIOCSMIIREG ||
5038 cmd == SIOCBRADDIF ||
5039 cmd == SIOCBRDELIF ||
5040 cmd == SIOCSHWTSTAMP ||
5041 cmd == SIOCWANDEV) {
5042 err = -EOPNOTSUPP;
5043 if (ops->ndo_do_ioctl) {
5044 if (netif_device_present(dev))
5045 err = ops->ndo_do_ioctl(dev, ifr, cmd);
5046 else
5047 err = -ENODEV;
5048 }
5049 } else
5050 err = -EINVAL;
5051
5052 }
5053 return err;
5054 }
5055
5056 /*
5057 * This function handles all "interface"-type I/O control requests. The actual
5058 * 'doing' part of this is dev_ifsioc above.
5059 */
5060
5061 /**
5062 * dev_ioctl - network device ioctl
5063 * @net: the applicable net namespace
5064 * @cmd: command to issue
5065 * @arg: pointer to a struct ifreq in user space
5066 *
5067 * Issue ioctl functions to devices. This is normally called by the
5068 * user space syscall interfaces but can sometimes be useful for
5069 * other purposes. The return value is the return from the syscall if
5070 * positive or a negative errno code on error.
5071 */
5072
5073 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5074 {
5075 struct ifreq ifr;
5076 int ret;
5077 char *colon;
5078
5079 /* One special case: SIOCGIFCONF takes ifconf argument
5080 and requires shared lock, because it sleeps writing
5081 to user space.
5082 */
5083
5084 if (cmd == SIOCGIFCONF) {
5085 rtnl_lock();
5086 ret = dev_ifconf(net, (char __user *) arg);
5087 rtnl_unlock();
5088 return ret;
5089 }
5090 if (cmd == SIOCGIFNAME)
5091 return dev_ifname(net, (struct ifreq __user *)arg);
5092
5093 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5094 return -EFAULT;
5095
5096 ifr.ifr_name[IFNAMSIZ-1] = 0;
5097
5098 colon = strchr(ifr.ifr_name, ':');
5099 if (colon)
5100 *colon = 0;
5101
5102 /*
5103 * See which interface the caller is talking about.
5104 */
5105
5106 switch (cmd) {
5107 /*
5108 * These ioctl calls:
5109 * - can be done by all.
5110 * - atomic and do not require locking.
5111 * - return a value
5112 */
5113 case SIOCGIFFLAGS:
5114 case SIOCGIFMETRIC:
5115 case SIOCGIFMTU:
5116 case SIOCGIFHWADDR:
5117 case SIOCGIFSLAVE:
5118 case SIOCGIFMAP:
5119 case SIOCGIFINDEX:
5120 case SIOCGIFTXQLEN:
5121 dev_load(net, ifr.ifr_name);
5122 rcu_read_lock();
5123 ret = dev_ifsioc_locked(net, &ifr, cmd);
5124 rcu_read_unlock();
5125 if (!ret) {
5126 if (colon)
5127 *colon = ':';
5128 if (copy_to_user(arg, &ifr,
5129 sizeof(struct ifreq)))
5130 ret = -EFAULT;
5131 }
5132 return ret;
5133
5134 case SIOCETHTOOL:
5135 dev_load(net, ifr.ifr_name);
5136 rtnl_lock();
5137 ret = dev_ethtool(net, &ifr);
5138 rtnl_unlock();
5139 if (!ret) {
5140 if (colon)
5141 *colon = ':';
5142 if (copy_to_user(arg, &ifr,
5143 sizeof(struct ifreq)))
5144 ret = -EFAULT;
5145 }
5146 return ret;
5147
5148 /*
5149 * These ioctl calls:
5150 * - require superuser power.
5151 * - require strict serialization.
5152 * - return a value
5153 */
5154 case SIOCGMIIPHY:
5155 case SIOCGMIIREG:
5156 case SIOCSIFNAME:
5157 if (!capable(CAP_NET_ADMIN))
5158 return -EPERM;
5159 dev_load(net, ifr.ifr_name);
5160 rtnl_lock();
5161 ret = dev_ifsioc(net, &ifr, cmd);
5162 rtnl_unlock();
5163 if (!ret) {
5164 if (colon)
5165 *colon = ':';
5166 if (copy_to_user(arg, &ifr,
5167 sizeof(struct ifreq)))
5168 ret = -EFAULT;
5169 }
5170 return ret;
5171
5172 /*
5173 * These ioctl calls:
5174 * - require superuser power.
5175 * - require strict serialization.
5176 * - do not return a value
5177 */
5178 case SIOCSIFFLAGS:
5179 case SIOCSIFMETRIC:
5180 case SIOCSIFMTU:
5181 case SIOCSIFMAP:
5182 case SIOCSIFHWADDR:
5183 case SIOCSIFSLAVE:
5184 case SIOCADDMULTI:
5185 case SIOCDELMULTI:
5186 case SIOCSIFHWBROADCAST:
5187 case SIOCSIFTXQLEN:
5188 case SIOCSMIIREG:
5189 case SIOCBONDENSLAVE:
5190 case SIOCBONDRELEASE:
5191 case SIOCBONDSETHWADDR:
5192 case SIOCBONDCHANGEACTIVE:
5193 case SIOCBRADDIF:
5194 case SIOCBRDELIF:
5195 case SIOCSHWTSTAMP:
5196 if (!capable(CAP_NET_ADMIN))
5197 return -EPERM;
5198 /* fall through */
5199 case SIOCBONDSLAVEINFOQUERY:
5200 case SIOCBONDINFOQUERY:
5201 dev_load(net, ifr.ifr_name);
5202 rtnl_lock();
5203 ret = dev_ifsioc(net, &ifr, cmd);
5204 rtnl_unlock();
5205 return ret;
5206
5207 case SIOCGIFMEM:
5208 /* Get the per device memory space. We can add this but
5209 * currently do not support it */
5210 case SIOCSIFMEM:
5211 /* Set the per device memory buffer space.
5212 * Not applicable in our case */
5213 case SIOCSIFLINK:
5214 return -ENOTTY;
5215
5216 /*
5217 * Unknown or private ioctl.
5218 */
5219 default:
5220 if (cmd == SIOCWANDEV ||
5221 (cmd >= SIOCDEVPRIVATE &&
5222 cmd <= SIOCDEVPRIVATE + 15)) {
5223 dev_load(net, ifr.ifr_name);
5224 rtnl_lock();
5225 ret = dev_ifsioc(net, &ifr, cmd);
5226 rtnl_unlock();
5227 if (!ret && copy_to_user(arg, &ifr,
5228 sizeof(struct ifreq)))
5229 ret = -EFAULT;
5230 return ret;
5231 }
5232 /* Take care of Wireless Extensions */
5233 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5234 return wext_handle_ioctl(net, &ifr, cmd, arg);
5235 return -ENOTTY;
5236 }
5237 }
5238
5239
5240 /**
5241 * dev_new_index - allocate an ifindex
5242 * @net: the applicable net namespace
5243 *
5244 * Returns a suitable unique value for a new device interface
5245 * number. The caller must hold the rtnl semaphore or the
5246 * dev_base_lock to be sure it remains unique.
5247 */
5248 static int dev_new_index(struct net *net)
5249 {
5250 int ifindex = net->ifindex;
5251 for (;;) {
5252 if (++ifindex <= 0)
5253 ifindex = 1;
5254 if (!__dev_get_by_index(net, ifindex))
5255 return net->ifindex = ifindex;
5256 }
5257 }
5258
5259 /* Delayed registration/unregisteration */
5260 static LIST_HEAD(net_todo_list);
5261
5262 static void net_set_todo(struct net_device *dev)
5263 {
5264 list_add_tail(&dev->todo_list, &net_todo_list);
5265 }
5266
5267 static void rollback_registered_many(struct list_head *head)
5268 {
5269 struct net_device *dev, *tmp;
5270
5271 BUG_ON(dev_boot_phase);
5272 ASSERT_RTNL();
5273
5274 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5275 /* Some devices call without registering
5276 * for initialization unwind. Remove those
5277 * devices and proceed with the remaining.
5278 */
5279 if (dev->reg_state == NETREG_UNINITIALIZED) {
5280 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5281 dev->name, dev);
5282
5283 WARN_ON(1);
5284 list_del(&dev->unreg_list);
5285 continue;
5286 }
5287 dev->dismantle = true;
5288 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5289 }
5290
5291 /* If device is running, close it first. */
5292 dev_close_many(head);
5293
5294 list_for_each_entry(dev, head, unreg_list) {
5295 /* And unlink it from device chain. */
5296 unlist_netdevice(dev);
5297
5298 dev->reg_state = NETREG_UNREGISTERING;
5299 }
5300
5301 synchronize_net();
5302
5303 list_for_each_entry(dev, head, unreg_list) {
5304 /* Shutdown queueing discipline. */
5305 dev_shutdown(dev);
5306
5307
5308 /* Notify protocols, that we are about to destroy
5309 this device. They should clean all the things.
5310 */
5311 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5312
5313 if (!dev->rtnl_link_ops ||
5314 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5315 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5316
5317 /*
5318 * Flush the unicast and multicast chains
5319 */
5320 dev_uc_flush(dev);
5321 dev_mc_flush(dev);
5322
5323 if (dev->netdev_ops->ndo_uninit)
5324 dev->netdev_ops->ndo_uninit(dev);
5325
5326 /* Notifier chain MUST detach us from master device. */
5327 WARN_ON(dev->master);
5328
5329 /* Remove entries from kobject tree */
5330 netdev_unregister_kobject(dev);
5331 }
5332
5333 synchronize_net();
5334
5335 list_for_each_entry(dev, head, unreg_list)
5336 dev_put(dev);
5337 }
5338
5339 static void rollback_registered(struct net_device *dev)
5340 {
5341 LIST_HEAD(single);
5342
5343 list_add(&dev->unreg_list, &single);
5344 rollback_registered_many(&single);
5345 list_del(&single);
5346 }
5347
5348 static netdev_features_t netdev_fix_features(struct net_device *dev,
5349 netdev_features_t features)
5350 {
5351 /* Fix illegal checksum combinations */
5352 if ((features & NETIF_F_HW_CSUM) &&
5353 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5354 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5355 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5356 }
5357
5358 /* Fix illegal SG+CSUM combinations. */
5359 if ((features & NETIF_F_SG) &&
5360 !(features & NETIF_F_ALL_CSUM)) {
5361 netdev_dbg(dev,
5362 "Dropping NETIF_F_SG since no checksum feature.\n");
5363 features &= ~NETIF_F_SG;
5364 }
5365
5366 /* TSO requires that SG is present as well. */
5367 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5368 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5369 features &= ~NETIF_F_ALL_TSO;
5370 }
5371
5372 /* TSO ECN requires that TSO is present as well. */
5373 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5374 features &= ~NETIF_F_TSO_ECN;
5375
5376 /* Software GSO depends on SG. */
5377 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5378 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5379 features &= ~NETIF_F_GSO;
5380 }
5381
5382 /* UFO needs SG and checksumming */
5383 if (features & NETIF_F_UFO) {
5384 /* maybe split UFO into V4 and V6? */
5385 if (!((features & NETIF_F_GEN_CSUM) ||
5386 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5387 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5388 netdev_dbg(dev,
5389 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5390 features &= ~NETIF_F_UFO;
5391 }
5392
5393 if (!(features & NETIF_F_SG)) {
5394 netdev_dbg(dev,
5395 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5396 features &= ~NETIF_F_UFO;
5397 }
5398 }
5399
5400 return features;
5401 }
5402
5403 int __netdev_update_features(struct net_device *dev)
5404 {
5405 netdev_features_t features;
5406 int err = 0;
5407
5408 ASSERT_RTNL();
5409
5410 features = netdev_get_wanted_features(dev);
5411
5412 if (dev->netdev_ops->ndo_fix_features)
5413 features = dev->netdev_ops->ndo_fix_features(dev, features);
5414
5415 /* driver might be less strict about feature dependencies */
5416 features = netdev_fix_features(dev, features);
5417
5418 if (dev->features == features)
5419 return 0;
5420
5421 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5422 &dev->features, &features);
5423
5424 if (dev->netdev_ops->ndo_set_features)
5425 err = dev->netdev_ops->ndo_set_features(dev, features);
5426
5427 if (unlikely(err < 0)) {
5428 netdev_err(dev,
5429 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5430 err, &features, &dev->features);
5431 return -1;
5432 }
5433
5434 if (!err)
5435 dev->features = features;
5436
5437 return 1;
5438 }
5439
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if it
 *	has changed. Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 *
 *	Any non-zero result of __netdev_update_features() triggers the
 *	notification.
 */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	if (changed)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5454
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: we always notify. */
	__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5471
5472 /**
5473 * netif_stacked_transfer_operstate - transfer operstate
5474 * @rootdev: the root or lower level device to transfer state from
5475 * @dev: the device to transfer operstate to
5476 *
5477 * Transfer operational state from root to device. This is normally
5478 * called when a stacking relationship exists between the root
5479 * device and the device(a leaf device).
5480 */
5481 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5482 struct net_device *dev)
5483 {
5484 if (rootdev->operstate == IF_OPER_DORMANT)
5485 netif_dormant_on(dev);
5486 else
5487 netif_dormant_off(dev);
5488
5489 if (netif_carrier_ok(rootdev)) {
5490 if (!netif_carrier_ok(dev))
5491 netif_carrier_on(dev);
5492 } else {
5493 if (netif_carrier_ok(dev))
5494 netif_carrier_off(dev);
5495 }
5496 }
5497 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5498
#ifdef CONFIG_RPS
/*
 *	Allocate the zeroed array of dev->num_rx_queues receive queues,
 *	store it in dev->_rx and point every queue back at @dev.
 *	Returns 0 or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nqueues = dev->num_rx_queues;
	struct netdev_rx_queue *queues, *rxq;

	BUG_ON(nqueues < 1);

	queues = kcalloc(nqueues, sizeof(*queues), GFP_KERNEL);
	if (!queues) {
		pr_err("netdev: Unable to allocate %u rx queues\n", nqueues);
		return -ENOMEM;
	}
	dev->_rx = queues;

	for (rxq = queues; rxq < queues + nqueues; rxq++)
		rxq->dev = dev;

	return 0;
}
#endif
5519
5520 static void netdev_init_one_queue(struct net_device *dev,
5521 struct netdev_queue *queue, void *_unused)
5522 {
5523 /* Initialize queue lock */
5524 spin_lock_init(&queue->_xmit_lock);
5525 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5526 queue->xmit_lock_owner = -1;
5527 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5528 queue->dev = dev;
5529 #ifdef CONFIG_BQL
5530 dql_init(&queue->dql, HZ);
5531 #endif
5532 }
5533
5534 static int netif_alloc_netdev_queues(struct net_device *dev)
5535 {
5536 unsigned int count = dev->num_tx_queues;
5537 struct netdev_queue *tx;
5538
5539 BUG_ON(count < 1);
5540
5541 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5542 if (!tx) {
5543 pr_err("netdev: Unable to allocate %u tx queues\n", count);
5544 return -ENOMEM;
5545 }
5546 dev->_tx = tx;
5547
5548 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5549 spin_lock_init(&dev->tx_global_lock);
5550
5551 return 0;
5552 }
5553
5554 /**
5555 * register_netdevice - register a network device
5556 * @dev: device to register
5557 *
5558 * Take a completed network device structure and add it to the kernel
5559 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5560 * chain. 0 is returned on success. A negative errno code is returned
5561 * on a failure to set up the device, or if the name is a duplicate.
5562 *
5563 * Callers must hold the rtnl semaphore. You may want
5564 * register_netdev() instead of this.
5565 *
5566 * BUGS:
5567 * The locking appears insufficient to guarantee two parallel registers
5568 * will not get the same name.
5569 */
5570
5571 int register_netdevice(struct net_device *dev)
5572 {
5573 int ret;
5574 struct net *net = dev_net(dev);
5575
5576 BUG_ON(dev_boot_phase);
5577 ASSERT_RTNL();
5578
5579 might_sleep();
5580
5581 /* When net_device's are persistent, this will be fatal. */
5582 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5583 BUG_ON(!net);
5584
5585 spin_lock_init(&dev->addr_list_lock);
5586 netdev_set_addr_lockdep_class(dev);
5587
5588 dev->iflink = -1;
5589
5590 ret = dev_get_valid_name(dev, dev->name);
5591 if (ret < 0)
5592 goto out;
5593
5594 /* Init, if this function is available */
5595 if (dev->netdev_ops->ndo_init) {
5596 ret = dev->netdev_ops->ndo_init(dev);
5597 if (ret) {
5598 if (ret > 0)
5599 ret = -EIO;
5600 goto out;
5601 }
5602 }
5603
5604 ret = -EBUSY;
5605 if (!dev->ifindex)
5606 dev->ifindex = dev_new_index(net);
5607 else if (__dev_get_by_index(net, dev->ifindex))
5608 goto err_uninit;
5609
5610 if (dev->iflink == -1)
5611 dev->iflink = dev->ifindex;
5612
5613 /* Transfer changeable features to wanted_features and enable
5614 * software offloads (GSO and GRO).
5615 */
5616 dev->hw_features |= NETIF_F_SOFT_FEATURES;
5617 dev->features |= NETIF_F_SOFT_FEATURES;
5618 dev->wanted_features = dev->features & dev->hw_features;
5619
5620 /* Turn on no cache copy if HW is doing checksum */
5621 if (!(dev->flags & IFF_LOOPBACK)) {
5622 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5623 if (dev->features & NETIF_F_ALL_CSUM) {
5624 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5625 dev->features |= NETIF_F_NOCACHE_COPY;
5626 }
5627 }
5628
5629 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5630 */
5631 dev->vlan_features |= NETIF_F_HIGHDMA;
5632
5633 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5634 ret = notifier_to_errno(ret);
5635 if (ret)
5636 goto err_uninit;
5637
5638 ret = netdev_register_kobject(dev);
5639 if (ret)
5640 goto err_uninit;
5641 dev->reg_state = NETREG_REGISTERED;
5642
5643 __netdev_update_features(dev);
5644
5645 /*
5646 * Default initial state at registry is that the
5647 * device is present.
5648 */
5649
5650 set_bit(__LINK_STATE_PRESENT, &dev->state);
5651
5652 dev_init_scheduler(dev);
5653 dev_hold(dev);
5654 list_netdevice(dev);
5655 add_device_randomness(dev->dev_addr, dev->addr_len);
5656
5657 /* Notify protocols, that a new device appeared. */
5658 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5659 ret = notifier_to_errno(ret);
5660 if (ret) {
5661 rollback_registered(dev);
5662 dev->reg_state = NETREG_UNREGISTERED;
5663 }
5664 /*
5665 * Prevent userspace races by waiting until the network
5666 * device is fully setup before sending notifications.
5667 */
5668 if (!dev->rtnl_link_ops ||
5669 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5670 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5671
5672 out:
5673 return ret;
5674
5675 err_uninit:
5676 if (dev->netdev_ops->ndo_uninit)
5677 dev->netdev_ops->ndo_uninit(dev);
5678 goto out;
5679 }
5680 EXPORT_SYMBOL(register_netdevice);
5681
5682 /**
5683 * init_dummy_netdev - init a dummy network device for NAPI
5684 * @dev: device to init
5685 *
5686 * This takes a network device structure and initialize the minimum
5687 * amount of fields so it can be used to schedule NAPI polls without
5688 * registering a full blown interface. This is to be used by drivers
5689 * that need to tie several hardware interfaces to a single NAPI
5690 * poll scheduler due to HW limitations.
5691 */
5692 int init_dummy_netdev(struct net_device *dev)
5693 {
5694 /* Clear everything. Note we don't initialize spinlocks
5695 * are they aren't supposed to be taken by any of the
5696 * NAPI code and this dummy netdev is supposed to be
5697 * only ever used for NAPI polls
5698 */
5699 memset(dev, 0, sizeof(struct net_device));
5700
5701 /* make sure we BUG if trying to hit standard
5702 * register/unregister code path
5703 */
5704 dev->reg_state = NETREG_DUMMY;
5705
5706 /* NAPI wants this */
5707 INIT_LIST_HEAD(&dev->napi_list);
5708
5709 /* a dummy interface is started by default */
5710 set_bit(__LINK_STATE_PRESENT, &dev->state);
5711 set_bit(__LINK_STATE_START, &dev->state);
5712
5713 /* Note : We dont allocate pcpu_refcnt for dummy devices,
5714 * because users of this 'device' dont need to change
5715 * its refcount.
5716 */
5717
5718 return 0;
5719 }
5720 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5721
5722
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Locked convenience wrapper around register_netdevice(): the rtnl
 *	semaphore is taken for the duration of the call and dropped
 *	afterwards, and the result of register_netdevice() is handed back
 *	unchanged.
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *	The device name is expanded if a format string was passed to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
5745 EXPORT_SYMBOL(register_netdev);
5746
5747 int netdev_refcnt_read(const struct net_device *dev)
5748 {
5749 int i, refcnt = 0;
5750
5751 for_each_possible_cpu(i)
5752 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5753 return refcnt;
5754 }
5755 EXPORT_SYMBOL(netdev_refcnt_read);
5756
/**
 * netdev_wait_allrefs - wait until all references are gone.
 * @dev: target net_device
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 *
 * The loop re-reads the summed per-cpu refcount after sleeping 250ms.
 * When more than one second has passed since the last rebroadcast it
 * re-sends NETDEV_UNREGISTER and NETDEV_UNREGISTER_FINAL under the RTNL,
 * and when more than ten seconds have passed since the last warning it
 * logs the outstanding usage count.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;
	int refcnt;

	linkwatch_forget_dev(dev);

	/* Both timers start now; refcnt is sampled once before looping. */
	rebroadcast_time = warning_time = jiffies;
	refcnt = netdev_refcnt_read(dev);

	while (refcnt != 0) {
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
			/* Let pending RCU callbacks complete before the
			 * FINAL event is delivered.
			 */
			rcu_barrier();
			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			/* NOTE(review): __rtnl_unlock() rather than
			 * rtnl_unlock() - presumably to avoid re-entering
			 * netdev_run_todo(), which is our caller; confirm.
			 */
			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		msleep(250);

		refcnt = netdev_refcnt_read(dev);

		/* Rate-limit the complaint to once per ten seconds. */
		if (time_after(jiffies, warning_time + 10 * HZ)) {
			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
				 dev->name, refcnt);
			warning_time = jiffies;
		}
	}
}
5814
5815 /* The sequence is:
5816 *
5817 * rtnl_lock();
5818 * ...
5819 * register_netdevice(x1);
5820 * register_netdevice(x2);
5821 * ...
5822 * unregister_netdevice(y1);
5823 * unregister_netdevice(y2);
5824 * ...
5825 * rtnl_unlock();
5826 * free_netdev(y1);
5827 * free_netdev(y2);
5828 *
5829 * We are invoked by rtnl_unlock().
5830 * This allows us to deal with problems:
5831 * 1) We can delete sysfs objects which invoke hotplug
5832 * without deadlocking with linkwatch via keventd.
5833 * 2) Since we run with the RTNL semaphore not held, we can sleep
5834 * safely in order to wait for the netdev refcnt to drop to zero.
5835 *
5836 * We must not return until all unregister events added during
5837 * the interval the lock was held have been completed.
5838 */
5839 void netdev_run_todo(void)
5840 {
5841 struct list_head list;
5842
5843 /* Snapshot list, allow later requests */
5844 list_replace_init(&net_todo_list, &list);
5845
5846 __rtnl_unlock();
5847
5848
5849 /* Wait for rcu callbacks to finish before next phase */
5850 if (!list_empty(&list))
5851 rcu_barrier();
5852
5853 while (!list_empty(&list)) {
5854 struct net_device *dev
5855 = list_first_entry(&list, struct net_device, todo_list);
5856 list_del(&dev->todo_list);
5857
5858 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5859
5860 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5861 pr_err("network todo '%s' but state %d\n",
5862 dev->name, dev->reg_state);
5863 dump_stack();
5864 continue;
5865 }
5866
5867 dev->reg_state = NETREG_UNREGISTERED;
5868
5869 on_each_cpu(flush_backlog, dev, 1);
5870
5871 netdev_wait_allrefs(dev);
5872
5873 /* paranoia */
5874 BUG_ON(netdev_refcnt_read(dev));
5875 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5876 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5877 WARN_ON(dev->dn_ptr);
5878
5879 if (dev->destructor)
5880 dev->destructor(dev);
5881
5882 /* Free network device */
5883 kobject_put(&dev->dev.kobj);
5884 }
5885 }
5886
5887 /* Convert net_device_stats to rtnl_link_stats64. They have the same
5888 * fields in the same order, with only the type differing.
5889 */
5890 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5891 const struct net_device_stats *netdev_stats)
5892 {
5893 #if BITS_PER_LONG == 64
5894 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5895 memcpy(stats64, netdev_stats, sizeof(*stats64));
5896 #else
5897 size_t i, n = sizeof(*stats64) / sizeof(u64);
5898 const unsigned long *src = (const unsigned long *)netdev_stats;
5899 u64 *dst = (u64 *)stats64;
5900
5901 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5902 sizeof(*stats64) / sizeof(u64));
5903 for (i = 0; i < n; i++)
5904 dst[i] = src[i];
5905 #endif
5906 }
5907 EXPORT_SYMBOL(netdev_stats_to_stats64);
5908
5909 /**
5910 * dev_get_stats - get network device statistics
5911 * @dev: device to get statistics from
5912 * @storage: place to store stats
5913 *
5914 * Get network statistics from device. Return @storage.
5915 * The device driver may provide its own method by setting
5916 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5917 * otherwise the internal statistics structure is used.
5918 */
5919 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5920 struct rtnl_link_stats64 *storage)
5921 {
5922 const struct net_device_ops *ops = dev->netdev_ops;
5923
5924 if (ops->ndo_get_stats64) {
5925 memset(storage, 0, sizeof(*storage));
5926 ops->ndo_get_stats64(dev, storage);
5927 } else if (ops->ndo_get_stats) {
5928 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5929 } else {
5930 netdev_stats_to_stats64(storage, &dev->stats);
5931 }
5932 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5933 return storage;
5934 }
5935 EXPORT_SYMBOL(dev_get_stats);
5936
/* Return the ingress queue of @dev, creating it on first use when
 * CONFIG_NET_CLS_ACT is enabled. A freshly created queue is attached
 * to the noop qdisc and published through dev->ingress_queue.
 * Without CONFIG_NET_CLS_ACT the current value of dev_ingress_queue()
 * is returned as is. Returns NULL if the allocation fails.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			ingress->qdisc = &noop_qdisc;
			ingress->qdisc_sleeping = &noop_qdisc;
			/* Publish only after the queue is fully set up. */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
5954
/**
 *	alloc_netdev_mqs - allocate network device
 *	@sizeof_priv:	size of private data to allocate space for
 *	@name:		device name format string
 *	@setup:		callback to initialize device
 *	@txqs:		the number of TX subqueues to allocate
 *	@rxqs:		the number of RX subqueues to allocate
 *
 *	Allocates a struct net_device with private data area for driver use
 *	and performs basic initialization.  Also allocates subqueue structs
 *	for each queue on the device.
 *
 *	Returns the new device, or NULL if @txqs is zero, if @rxqs is zero
 *	(CONFIG_RPS only), or if any allocation fails. @name must be
 *	shorter than the device name buffer, otherwise this BUGs.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
		void (*setup)(struct net_device *),
		unsigned int txqs, unsigned int rxqs)
{
	struct net_device *dev;
	size_t alloc_size;
	struct net_device *p;

	BUG_ON(strlen(name) >= sizeof(dev->name));

	if (txqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
		return NULL;
	}

#ifdef CONFIG_RPS
	if (rxqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
		return NULL;
	}
#endif

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	p = kzalloc(alloc_size, GFP_KERNEL);
	if (!p) {
		pr_err("alloc_netdev: Unable to allocate device\n");
		return NULL;
	}

	/* dev is the aligned view of the raw allocation p; the offset is
	 * remembered in dev->padded so the raw pointer can be recovered
	 * when freeing.
	 */
	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	dev->pcpu_refcnt = alloc_percpu(int);
	if (!dev->pcpu_refcnt)
		goto free_p;

	if (dev_addr_init(dev))
		goto free_pcpu;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	dev->gso_max_size = GSO_MAX_SIZE;
	dev->gso_max_segs = GSO_MAX_SEGS;

	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	dev->priv_flags = IFF_XMIT_DST_RELEASE;
	/* Driver callback runs before the queue counts and the name are
	 * filled in below.
	 */
	setup(dev);

	dev->num_tx_queues = txqs;
	dev->real_num_tx_queues = txqs;
	if (netif_alloc_netdev_queues(dev))
		goto free_all;

#ifdef CONFIG_RPS
	dev->num_rx_queues = rxqs;
	dev->real_num_rx_queues = rxqs;
	if (netif_alloc_rx_queues(dev))
		goto free_all;
#endif

	/* Safe: the length was checked by the BUG_ON at function entry. */
	strcpy(dev->name, name);
	dev->group = INIT_NETDEV_GROUP;
	return dev;

free_all:
	/* Past setup(): hand the whole device to free_netdev(). */
	free_netdev(dev);
	return NULL;

free_pcpu:
	free_percpu(dev->pcpu_refcnt);
	/* NOTE(review): on the only path reaching this label the queue
	 * arrays have not been allocated yet, so these presumably free
	 * NULL pointers from the zeroed allocation - confirm.
	 */
	kfree(dev->_tx);
#ifdef CONFIG_RPS
	kfree(dev->_rx);
#endif

free_p:
	kfree(p);
	return NULL;
}
6059 EXPORT_SYMBOL(alloc_netdev_mqs);
6060
/**
 *	free_netdev - free network device
 *	@dev: device
 *
 *	This function does the last stage of destroying an allocated device
 *	interface. The reference to the device object is released.
 *	If this is the last reference then it will be freed.
 *
 *	A device that was never registered (NETREG_UNINITIALIZED) is freed
 *	directly; any other device must be in NETREG_UNREGISTERED state,
 *	otherwise this BUGs.
 */
void free_netdev(struct net_device *dev)
{
	struct napi_struct *p, *n;

	release_net(dev_net(dev));

	/* Queue arrays. */
	kfree(dev->_tx);
#ifdef CONFIG_RPS
	kfree(dev->_rx);
#endif

	kfree(rcu_dereference_protected(dev->ingress_queue, 1));

	/* Flush device addresses */
	dev_addr_flush(dev);

	/* Remove every NAPI context still linked to the device; the _safe
	 * iterator is needed because entries are deleted while walking.
	 */
	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
		netif_napi_del(p);

	free_percpu(dev->pcpu_refcnt);
	dev->pcpu_refcnt = NULL;

	/*  Compatibility with error handling in drivers */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		/* Undo the alignment offset recorded in dev->padded to get
		 * back the pointer that was originally allocated.
		 */
		kfree((char *)dev - dev->padded);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	dev->reg_state = NETREG_RELEASED;

	/* will free via device release */
	put_device(&dev->dev);
}
6103 EXPORT_SYMBOL(free_netdev);
6104
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	May sleep. An expedited RCU grace period is used when the RTNL
 *	is locked, a normal one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
6119 EXPORT_SYMBOL(synchronize_net);
6120
6121 /**
6122 * unregister_netdevice_queue - remove device from the kernel
6123 * @dev: device
6124 * @head: list
6125 *
6126 * This function shuts down a device interface and removes it
6127 * from the kernel tables.
6128 * If head not NULL, device is queued to be unregistered later.
6129 *
6130 * Callers must hold the rtnl semaphore. You may want
6131 * unregister_netdev() instead of this.
6132 */
6133
6134 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6135 {
6136 ASSERT_RTNL();
6137
6138 if (head) {
6139 list_move_tail(&dev->unreg_list, head);
6140 } else {
6141 rollback_registered(dev);
6142 /* Finish processing unregister after unlock */
6143 net_set_todo(dev);
6144 }
6145 }
6146 EXPORT_SYMBOL(unregister_netdevice_queue);
6147
6148 /**
6149 * unregister_netdevice_many - unregister many devices
6150 * @head: list of devices
6151 */
6152 void unregister_netdevice_many(struct list_head *head)
6153 {
6154 struct net_device *dev;
6155
6156 if (!list_empty(head)) {
6157 rollback_registered_many(head);
6158 list_for_each_entry(dev, head, unreg_list)
6159 net_set_todo(dev);
6160 }
6161 }
6162 EXPORT_SYMBOL(unregister_netdevice_many);
6163
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	/* The RTNL is held only around the unregister call itself. */
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
6181 EXPORT_SYMBOL(unregister_netdev);
6182
/**
 *	dev_change_net_namespace - move device to different nethost namespace
 *	@dev: device
 *	@net: network namespace
 *	@pat: If not NULL name pattern to try if the current device name
 *	      is already taken in the destination network namespace.
 *
 *	This function shuts down a device interface and moves it
 *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned.
 *
 *	Failure codes produced here: -EINVAL when the device is
 *	namespace-local or not registered, -EEXIST when the name is taken
 *	in @net and no usable replacement could be derived from @pat.
 *	Moving a device into the namespace it already lives in returns 0
 *	without doing anything.
 *
 *	Callers must hold the rtnl semaphore.
 */

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	int err;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registered */
	err = -EINVAL;
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(dev_net(dev), net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	if (__dev_get_by_name(net, dev->name)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		if (dev_get_valid_name(dev, pat) < 0)
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice unregister_netdevice.
	 * From here on there is no failure exit: the function always runs
	 * to the end and returns 0.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	err = -ENODEV;
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.

	   Note that dev->reg_state stays at NETREG_REGISTERED.
	   This is wanted because this way 8021q and macvlan know
	   the device is just moving and can keep their slaves up.

	   NOTE(review): netdev_wait_allrefs() follows NETDEV_UNREGISTER
	   with rcu_barrier() and NETDEV_UNREGISTER_FINAL; no FINAL event
	   is sent on this path - confirm whether that is intended.
	*/
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_uc_flush(dev);
	dev_mc_flush(dev);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);

	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex)) {
		/* Remember whether iflink simply mirrored ifindex so that
		 * it can follow the new index.
		 */
		int iflink = (dev->iflink == dev->ifindex);
		dev->ifindex = dev_new_index(net);
		if (iflink)
			dev->iflink = dev->ifindex;
	}

	/* Fixup kobjects */
	err = device_rename(&dev->dev, dev->name);
	/* A rename failure is only warned about; err is reset to 0 below. */
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);

	synchronize_net();
	err = 0;
out:
	return err;
}
6294 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6295
/*
 * CPU notifier callback. Acts only on CPU_DEAD and CPU_DEAD_FROZEN;
 * every other action returns NOTIFY_OK immediately. For a dead CPU
 * (number passed in @ocpu) its softnet_data work is moved to the CPU
 * running this callback: completion queue, output queue and NAPI poll
 * list are spliced over with interrupts disabled, then the packets left
 * on its process and input queues are re-injected through netif_rx().
 * Always returns NOTIFY_OK.
 */
static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		/* Reset the dead CPU's queue to the empty state. */
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU. */
	if (!list_empty(&oldsd->poll_list)) {
		list_splice_init(&oldsd->poll_list, &sd->poll_list);
		raise_softirq_irqoff(NET_RX_SOFTIRQ);
	}

	/* TX softirq is raised unconditionally. */
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's process_queue, then its input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}

	return NOTIFY_OK;
}
6349
6350
6351 /**
6352 * netdev_increment_features - increment feature set by one
6353 * @all: current feature set
6354 * @one: new feature set
6355 * @mask: mask feature set
6356 *
6357 * Computes a new feature set after adding a device with feature set
6358 * @one to the master device with current feature set @all. Will not
6359 * enable anything that is off in @mask. Returns the new feature set.
6360 */
6361 netdev_features_t netdev_increment_features(netdev_features_t all,
6362 netdev_features_t one, netdev_features_t mask)
6363 {
6364 if (mask & NETIF_F_GEN_CSUM)
6365 mask |= NETIF_F_ALL_CSUM;
6366 mask |= NETIF_F_VLAN_CHALLENGED;
6367
6368 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6369 all &= one | ~NETIF_F_ALL_FOR_ALL;
6370
6371 /* If one device supports hw checksumming, set for all. */
6372 if (all & NETIF_F_GEN_CSUM)
6373 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6374
6375 return all;
6376 }
6377 EXPORT_SYMBOL(netdev_increment_features);
6378
6379 static struct hlist_head *netdev_create_hash(void)
6380 {
6381 int i;
6382 struct hlist_head *hash;
6383
6384 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6385 if (hash != NULL)
6386 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6387 INIT_HLIST_HEAD(&hash[i]);
6388
6389 return hash;
6390 }
6391
6392 /* Initialize per network namespace state */
6393 static int __net_init netdev_init(struct net *net)
6394 {
6395 if (net != &init_net)
6396 INIT_LIST_HEAD(&net->dev_base_head);
6397
6398 net->dev_name_head = netdev_create_hash();
6399 if (net->dev_name_head == NULL)
6400 goto err_name;
6401
6402 net->dev_index_head = netdev_create_hash();
6403 if (net->dev_index_head == NULL)
6404 goto err_idx;
6405
6406 return 0;
6407
6408 err_idx:
6409 kfree(net->dev_name_head);
6410 err_name:
6411 return -ENOMEM;
6412 }
6413
6414 /**
6415 * netdev_drivername - network driver for the device
6416 * @dev: network device
6417 *
6418 * Determine network driver for device.
6419 */
6420 const char *netdev_drivername(const struct net_device *dev)
6421 {
6422 const struct device_driver *driver;
6423 const struct device *parent;
6424 const char *empty = "";
6425
6426 parent = dev->dev.parent;
6427 if (!parent)
6428 return empty;
6429
6430 driver = parent->driver;
6431 if (driver && driver->name)
6432 return driver->name;
6433 return empty;
6434 }
6435
6436 int __netdev_printk(const char *level, const struct net_device *dev,
6437 struct va_format *vaf)
6438 {
6439 int r;
6440
6441 if (dev && dev->dev.parent)
6442 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6443 netdev_name(dev), vaf);
6444 else if (dev)
6445 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6446 else
6447 r = printk("%s(NULL net_device): %pV", level, vaf);
6448
6449 return r;
6450 }
6451 EXPORT_SYMBOL(__netdev_printk);
6452
6453 int netdev_printk(const char *level, const struct net_device *dev,
6454 const char *format, ...)
6455 {
6456 struct va_format vaf;
6457 va_list args;
6458 int r;
6459
6460 va_start(args, format);
6461
6462 vaf.fmt = format;
6463 vaf.va = &args;
6464
6465 r = __netdev_printk(level, dev, &vaf);
6466 va_end(args);
6467
6468 return r;
6469 }
6470 EXPORT_SYMBOL(netdev_printk);
6471
/*
 * define_netdev_printk_level - generate a fixed-level netdev log helper
 * @func:  name of the function to define and export
 * @level: log level string passed on to __netdev_printk()
 *
 * Each expansion defines "int func(dev, fmt, ...)", which wraps its
 * variadic arguments in a va_format, forwards them to
 * __netdev_printk() at @level and returns that call's result.
 * The generated function is exported with EXPORT_SYMBOL.
 */
#define define_netdev_printk_level(func, level)			\
int func(const struct net_device *dev, const char *fmt, ...)	\
{								\
	int r;							\
	struct va_format vaf;					\
	va_list args;						\
								\
	va_start(args, fmt);					\
								\
	vaf.fmt = fmt;						\
	vaf.va = &args;						\
								\
	r = __netdev_printk(level, dev, &vaf);			\
	va_end(args);						\
								\
	return r;						\
}								\
EXPORT_SYMBOL(func);

/* One helper per kernel log level, from emergency down to info. */
define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
6498
/* Per network namespace teardown: release the name and ifindex hash
 * tables that netdev_init() allocated.
 */
static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}

/* Pernet hooks pairing netdev_init() with netdev_exit(). */
static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
6509
6510 static void __net_exit default_device_exit(struct net *net)
6511 {
6512 struct net_device *dev, *aux;
6513 /*
6514 * Push all migratable network devices back to the
6515 * initial network namespace
6516 */
6517 rtnl_lock();
6518 for_each_netdev_safe(net, dev, aux) {
6519 int err;
6520 char fb_name[IFNAMSIZ];
6521
6522 /* Ignore unmoveable devices (i.e. loopback) */
6523 if (dev->features & NETIF_F_NETNS_LOCAL)
6524 continue;
6525
6526 /* Leave virtual devices for the generic cleanup */
6527 if (dev->rtnl_link_ops)
6528 continue;
6529
6530 /* Push remaining network devices to init_net */
6531 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6532 err = dev_change_net_namespace(dev, &init_net, fb_name);
6533 if (err) {
6534 pr_emerg("%s: failed to move %s to init_net: %d\n",
6535 __func__, dev->name, err);
6536 BUG();
6537 }
6538 }
6539 rtnl_unlock();
6540 }
6541
/* Batched namespace exit hook: collect every remaining device of all
 * namespaces on @net_list onto one kill list and unregister them in a
 * single unregister_netdevice_many() call, all under the RTNL.
 */
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			/* Devices with link ops are removed through their
			 * own dellink handler; all others are queued
			 * directly.
			 */
			if (dev->rtnl_link_ops)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	/* Unlink the on-stack list head from the devices still chained
	 * to it before it goes out of scope.
	 */
	list_del(&dev_kill_list);
	rtnl_unlock();
}
6566
/* Pernet hooks that empty a namespace of devices on exit: the per-net
 * hook moves migratable devices to init_net, the batch hook unregisters
 * whatever is left.
 */
static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};
6571
/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 *       This is called single threaded during boot, so no need
 *       to take the rtnl semaphore.
 *
 *       Returns 0 on success. Every failing step jumps to "out" with
 *       rc still at -ENOMEM; steps that already succeeded are not
 *       undone by this function.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	/* Packet type handler lists start out empty. */
	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		memset(sd, 0, sizeof(*sd));
		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		sd->completion_queue = NULL;
		INIT_LIST_HEAD(&sd->poll_list);
		/* Empty output queue: the tail pointer refers to the head. */
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->csd.flags = 0;
		sd->cpu = i;
#endif

		/* Per-cpu backlog NAPI context. */
		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
		sd->backlog.gro_list = NULL;
		sd->backlog.gro_count = 0;
	}

	dev_boot_phase = 0;

	/* The loopback device is special if any other network devices
	 * is present in a network namespace the loopback device must
	 * be present. Since we now dynamically allocate and free the
	 * loopback device ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices.  Ensuring the loopback devices
	 * is the first device that appears and the last network device
	 * that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	/* dev_cpu_callback migrates a dead CPU's queues to a live one. */
	hotcpu_notifier(dev_cpu_callback, 0);
	dst_init();
	dev_mcast_init();
	rc = 0;
out:
	return rc;
}
6656
6657 subsys_initcall(net_dev_init);
6658
/* Late initcall that fills hashrnd with random bytes. Always returns 0. */
static int __init initialize_hashrnd(void)
{
	get_random_bytes(&hashrnd, sizeof(hashrnd));
	return 0;
}
6664
6665 late_initcall_sync(initialize_hashrnd);
6666
This page took 0.242388 seconds and 6 git commands to generate.