1 /*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/proc_fs.h>
101 #include <linux/seq_file.h>
102 #include <linux/stat.h>
103 #include <net/dst.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <net/xfrm.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/kmod.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 #include <linux/if_vlan.h>
123 #include <linux/ip.h>
124 #include <net/ip.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 #include <trace/events/napi.h>
130 #include <trace/events/net.h>
131 #include <trace/events/skb.h>
132 #include <linux/pci.h>
133 #include <linux/inetdevice.h>
134 #include <linux/cpu_rmap.h>
135 #include <linux/net_tstamp.h>
136 #include <linux/static_key.h>
137 #include <net/flow_keys.h>
138
139 #include "net-sysfs.h"
140
141 /* Instead of increasing this, you should create a hash table. */
142 #define MAX_GRO_SKBS 8
143
144 /* This should be increased if a protocol with a bigger head is added. */
145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
146
147 /*
148 * The list of packet types we will receive (as opposed to discard)
149 * and the routines to invoke.
150 *
 151  * Why 16? Because with 16 the only overlap we get on a hash of the
152 * low nibble of the protocol value is RARP/SNAP/X.25.
153 *
154 * NOTE: That is no longer true with the addition of VLAN tags. Not
155 * sure which should go first, but I bet it won't make much
156 * difference if we are running VLANs. The good news is that
157 * this protocol won't be in the list unless compiled in, so
158 * the average user (w/out VLANs) will not be adversely affected.
159 * --BLG
160 *
161 * 0800 IP
162 * 8100 802.1Q VLAN
163 * 0001 802.3
164 * 0002 AX.25
165 * 0004 802.2
166 * 8035 RARP
167 * 0005 SNAP
168 * 0805 X.25
169 * 0806 ARP
170 * 8137 IPX
171 * 0009 Localtalk
172 * 86DD IPv6
173 */
174
175 #define PTYPE_HASH_SIZE (16)
176 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
177
178 static DEFINE_SPINLOCK(ptype_lock);
179 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
180 static struct list_head ptype_all __read_mostly; /* Taps */
181
182 /*
183 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
184 * semaphore.
185 *
186 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
187 *
188 * Writers must hold the rtnl semaphore while they loop through the
189 * dev_base_head list, and hold dev_base_lock for writing when they do the
190 * actual updates. This allows pure readers to access the list even
191 * while a writer is preparing to update it.
192 *
193 * To put it another way, dev_base_lock is held for writing only to
194 * protect against pure readers; the rtnl semaphore provides the
195 * protection against other writers.
196 *
197 * See, for example usages, register_netdevice() and
198 * unregister_netdevice(), which must be called with the rtnl
199 * semaphore held.
200 */
201 DEFINE_RWLOCK(dev_base_lock);
202 EXPORT_SYMBOL(dev_base_lock);
203
204 static inline void dev_base_seq_inc(struct net *net)
205 {
206 while (++net->dev_base_seq == 0);
207 }
208
209 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
210 {
211 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
212
213 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
214 }
215
216 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
217 {
218 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
219 }
220
221 static inline void rps_lock(struct softnet_data *sd)
222 {
223 #ifdef CONFIG_RPS
224 spin_lock(&sd->input_pkt_queue.lock);
225 #endif
226 }
227
228 static inline void rps_unlock(struct softnet_data *sd)
229 {
230 #ifdef CONFIG_RPS
231 spin_unlock(&sd->input_pkt_queue.lock);
232 #endif
233 }
234
235 /* Device list insertion */
236 static int list_netdevice(struct net_device *dev)
237 {
238 struct net *net = dev_net(dev);
239
240 ASSERT_RTNL();
241
242 write_lock_bh(&dev_base_lock);
243 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
244 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
245 hlist_add_head_rcu(&dev->index_hlist,
246 dev_index_hash(net, dev->ifindex));
247 write_unlock_bh(&dev_base_lock);
248
249 dev_base_seq_inc(net);
250
251 return 0;
252 }
253
254 /* Device list removal
 255  * caller must respect an RCU grace period before freeing/reusing dev
256 */
257 static void unlist_netdevice(struct net_device *dev)
258 {
259 ASSERT_RTNL();
260
261 /* Unlink dev from the device chain */
262 write_lock_bh(&dev_base_lock);
263 list_del_rcu(&dev->dev_list);
264 hlist_del_rcu(&dev->name_hlist);
265 hlist_del_rcu(&dev->index_hlist);
266 write_unlock_bh(&dev_base_lock);
267
268 dev_base_seq_inc(dev_net(dev));
269 }
270
271 /*
272 * Our notifier list
273 */
274
275 static RAW_NOTIFIER_HEAD(netdev_chain);
276
277 /*
278 * Device drivers call our routines to queue packets here. We empty the
279 * queue in the local softnet handler.
280 */
281
282 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
283 EXPORT_PER_CPU_SYMBOL(softnet_data);
284
285 #ifdef CONFIG_LOCKDEP
286 /*
287 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
288 * according to dev->type
289 */
290 static const unsigned short netdev_lock_type[] =
291 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
292 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
293 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
294 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
295 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
296 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
297 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
298 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
299 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
300 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
301 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
302 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
303 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
304 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
305 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
306
307 static const char *const netdev_lock_name[] =
308 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
309 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
310 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
311 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
312 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
313 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
314 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
315 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
316 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
317 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
318 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
319 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
320 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
321 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
322 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
323
324 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
325 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
326
327 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
328 {
329 int i;
330
331 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
332 if (netdev_lock_type[i] == dev_type)
333 return i;
334 /* the last key is used by default */
335 return ARRAY_SIZE(netdev_lock_type) - 1;
336 }
337
338 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
339 unsigned short dev_type)
340 {
341 int i;
342
343 i = netdev_lock_pos(dev_type);
344 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
345 netdev_lock_name[i]);
346 }
347
348 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
349 {
350 int i;
351
352 i = netdev_lock_pos(dev->type);
353 lockdep_set_class_and_name(&dev->addr_list_lock,
354 &netdev_addr_lock_key[i],
355 netdev_lock_name[i]);
356 }
357 #else
358 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
359 unsigned short dev_type)
360 {
361 }
362 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
363 {
364 }
365 #endif
366
367 /*******************************************************************************
368
369 Protocol management and registration routines
370
371 *******************************************************************************/
372
373 /*
374 * Add a protocol ID to the list. Now that the input handler is
375 * smarter we can dispense with all the messy stuff that used to be
376 * here.
377 *
378 * BEWARE!!! Protocol handlers, mangling input packets,
379 * MUST BE last in hash buckets and checking protocol handlers
380 * MUST start from promiscuous ptype_all chain in net_bh.
381 * It is true now, do not change it.
 382  *      Explanation follows: if a protocol handler that mangles packets were
 383  *      the first on the list, it would not be able to sense that the packet
 384  *      is cloned and should be copied-on-write, so it would
 385  *      change it and subsequent readers would get a broken packet.
386 * --ANK (980803)
387 */
388
389 static inline struct list_head *ptype_head(const struct packet_type *pt)
390 {
391 if (pt->type == htons(ETH_P_ALL))
392 return &ptype_all;
393 else
394 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
395 }
396
397 /**
398 * dev_add_pack - add packet handler
399 * @pt: packet type declaration
400 *
401 * Add a protocol handler to the networking stack. The passed &packet_type
402 * is linked into kernel lists and may not be freed until it has been
403 * removed from the kernel lists.
404 *
 405  *      This call does not sleep, therefore it cannot
 406  *      guarantee that all CPUs that are in the middle of receiving packets
 407  *      will see the new packet type (until the next received packet).
408 */
409
410 void dev_add_pack(struct packet_type *pt)
411 {
412 struct list_head *head = ptype_head(pt);
413
414 spin_lock(&ptype_lock);
415 list_add_rcu(&pt->list, head);
416 spin_unlock(&ptype_lock);
417 }
418 EXPORT_SYMBOL(dev_add_pack);
419
420 /**
421 * __dev_remove_pack - remove packet handler
422 * @pt: packet type declaration
423 *
424 * Remove a protocol handler that was previously added to the kernel
425 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
426 * from the kernel lists and can be freed or reused once this function
427 * returns.
428 *
429 * The packet type might still be in use by receivers
 430  *      and must not be freed until after all the CPUs have gone
431 * through a quiescent state.
432 */
433 void __dev_remove_pack(struct packet_type *pt)
434 {
435 struct list_head *head = ptype_head(pt);
436 struct packet_type *pt1;
437
438 spin_lock(&ptype_lock);
439
440 list_for_each_entry(pt1, head, list) {
441 if (pt == pt1) {
442 list_del_rcu(&pt->list);
443 goto out;
444 }
445 }
446
447 pr_warn("dev_remove_pack: %p not found\n", pt);
448 out:
449 spin_unlock(&ptype_lock);
450 }
451 EXPORT_SYMBOL(__dev_remove_pack);
452
453 /**
454 * dev_remove_pack - remove packet handler
455 * @pt: packet type declaration
456 *
457 * Remove a protocol handler that was previously added to the kernel
458 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
459 * from the kernel lists and can be freed or reused once this function
460 * returns.
461 *
462 * This call sleeps to guarantee that no CPU is looking at the packet
463 * type after return.
464 */
465 void dev_remove_pack(struct packet_type *pt)
466 {
467 __dev_remove_pack(pt);
468
469 synchronize_net();
470 }
471 EXPORT_SYMBOL(dev_remove_pack);
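/*
 * Illustrative sketch (not part of dev.c): how a module might register a
 * tap for every frame with dev_add_pack() and tear it down again with
 * dev_remove_pack().  The handler and variable names are made up.
 */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* Taps see shared copies; drop our reference when done. */
	kfree_skb(skb);
	return 0;
}

static struct packet_type example_tap __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),		/* hashes to ptype_all */
	.func = example_tap_rcv,
};

/*
 *	dev_add_pack(&example_tap);
 *	...
 *	dev_remove_pack(&example_tap);	(sleeps until no CPU still sees it)
 */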
472
473 /******************************************************************************
474
475 Device Boot-time Settings Routines
476
477 *******************************************************************************/
478
479 /* Boot time configuration table */
480 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
481
482 /**
483 * netdev_boot_setup_add - add new setup entry
484 * @name: name of the device
485 * @map: configured settings for the device
486 *
487 * Adds new setup entry to the dev_boot_setup list. The function
 488  *      returns 0 on error and 1 on success. This is a generic routine for
 489  *      all netdevices.
490 */
491 static int netdev_boot_setup_add(char *name, struct ifmap *map)
492 {
493 struct netdev_boot_setup *s;
494 int i;
495
496 s = dev_boot_setup;
497 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
498 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
499 memset(s[i].name, 0, sizeof(s[i].name));
500 strlcpy(s[i].name, name, IFNAMSIZ);
501 memcpy(&s[i].map, map, sizeof(s[i].map));
502 break;
503 }
504 }
505
506 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
507 }
508
509 /**
510 * netdev_boot_setup_check - check boot time settings
511 * @dev: the netdevice
512 *
513 * Check boot time settings for the device.
514 * The found settings are set for the device to be used
515 * later in the device probing.
 516  *      Returns 0 if no settings are found, 1 if they are.
517 */
518 int netdev_boot_setup_check(struct net_device *dev)
519 {
520 struct netdev_boot_setup *s = dev_boot_setup;
521 int i;
522
523 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
524 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
525 !strcmp(dev->name, s[i].name)) {
526 dev->irq = s[i].map.irq;
527 dev->base_addr = s[i].map.base_addr;
528 dev->mem_start = s[i].map.mem_start;
529 dev->mem_end = s[i].map.mem_end;
530 return 1;
531 }
532 }
533 return 0;
534 }
535 EXPORT_SYMBOL(netdev_boot_setup_check);
536
537
538 /**
539 * netdev_boot_base - get address from boot time settings
540 * @prefix: prefix for network device
541 * @unit: id for network device
542 *
543 * Check boot time settings for the base address of device.
544 * The found settings are set for the device to be used
545 * later in the device probing.
546 * Returns 0 if no settings found.
547 */
548 unsigned long netdev_boot_base(const char *prefix, int unit)
549 {
550 const struct netdev_boot_setup *s = dev_boot_setup;
551 char name[IFNAMSIZ];
552 int i;
553
554 sprintf(name, "%s%d", prefix, unit);
555
556 /*
557 * If device already registered then return base of 1
558 * to indicate not to probe for this interface
559 */
560 if (__dev_get_by_name(&init_net, name))
561 return 1;
562
563 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
564 if (!strcmp(name, s[i].name))
565 return s[i].map.base_addr;
566 return 0;
567 }
568
569 /*
570 * Saves at boot time configured settings for any netdevice.
571 */
572 int __init netdev_boot_setup(char *str)
573 {
574 int ints[5];
575 struct ifmap map;
576
577 str = get_options(str, ARRAY_SIZE(ints), ints);
578 if (!str || !*str)
579 return 0;
580
581 /* Save settings */
582 memset(&map, 0, sizeof(map));
583 if (ints[0] > 0)
584 map.irq = ints[1];
585 if (ints[0] > 1)
586 map.base_addr = ints[2];
587 if (ints[0] > 2)
588 map.mem_start = ints[3];
589 if (ints[0] > 3)
590 map.mem_end = ints[4];
591
592 /* Add new entry to the list */
593 return netdev_boot_setup_add(str, &map);
594 }
595
596 __setup("netdev=", netdev_boot_setup);
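/*
 * Illustrative example of the command line syntax parsed above: up to four
 * integers (irq, base address, memory start, memory end) followed by the
 * interface name, e.g.
 *
 *	netdev=5,0x340,0,0,eth1
 *
 * The values shown here are arbitrary.
 */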
597
598 /*******************************************************************************
599
600 Device Interface Subroutines
601
602 *******************************************************************************/
603
604 /**
605 * __dev_get_by_name - find a device by its name
606 * @net: the applicable net namespace
607 * @name: name to find
608 *
609 * Find an interface by name. Must be called under RTNL semaphore
610 * or @dev_base_lock. If the name is found a pointer to the device
611 * is returned. If the name is not found then %NULL is returned. The
612 * reference counters are not incremented so the caller must be
613 * careful with locks.
614 */
615
616 struct net_device *__dev_get_by_name(struct net *net, const char *name)
617 {
618 struct hlist_node *p;
619 struct net_device *dev;
620 struct hlist_head *head = dev_name_hash(net, name);
621
622 hlist_for_each_entry(dev, p, head, name_hlist)
623 if (!strncmp(dev->name, name, IFNAMSIZ))
624 return dev;
625
626 return NULL;
627 }
628 EXPORT_SYMBOL(__dev_get_by_name);
629
630 /**
631 * dev_get_by_name_rcu - find a device by its name
632 * @net: the applicable net namespace
633 * @name: name to find
634 *
635 * Find an interface by name.
636 * If the name is found a pointer to the device is returned.
637 * If the name is not found then %NULL is returned.
638 * The reference counters are not incremented so the caller must be
639 * careful with locks. The caller must hold RCU lock.
640 */
641
642 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
643 {
644 struct hlist_node *p;
645 struct net_device *dev;
646 struct hlist_head *head = dev_name_hash(net, name);
647
648 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
649 if (!strncmp(dev->name, name, IFNAMSIZ))
650 return dev;
651
652 return NULL;
653 }
654 EXPORT_SYMBOL(dev_get_by_name_rcu);
655
656 /**
657 * dev_get_by_name - find a device by its name
658 * @net: the applicable net namespace
659 * @name: name to find
660 *
661 * Find an interface by name. This can be called from any
662 * context and does its own locking. The returned handle has
663 * the usage count incremented and the caller must use dev_put() to
664 * release it when it is no longer needed. %NULL is returned if no
665 * matching device is found.
666 */
667
668 struct net_device *dev_get_by_name(struct net *net, const char *name)
669 {
670 struct net_device *dev;
671
672 rcu_read_lock();
673 dev = dev_get_by_name_rcu(net, name);
674 if (dev)
675 dev_hold(dev);
676 rcu_read_unlock();
677 return dev;
678 }
679 EXPORT_SYMBOL(dev_get_by_name);
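/*
 * Illustrative sketch: dev_get_by_name() takes a reference, so the device
 * stays valid across a sleepable section; dev_put() releases it.  The
 * function and interface names are hypothetical.
 */
static int example_poke_eth0(struct net *net)
{
	struct net_device *dev = dev_get_by_name(net, "eth0");

	if (!dev)
		return -ENODEV;
	/* ... work that may sleep; the reference keeps dev alive ... */
	dev_put(dev);
	return 0;
}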
680
681 /**
682 * __dev_get_by_index - find a device by its ifindex
683 * @net: the applicable net namespace
684 * @ifindex: index of device
685 *
686 * Search for an interface by index. Returns %NULL if the device
687 * is not found or a pointer to the device. The device has not
688 * had its reference counter increased so the caller must be careful
689 * about locking. The caller must hold either the RTNL semaphore
690 * or @dev_base_lock.
691 */
692
693 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
694 {
695 struct hlist_node *p;
696 struct net_device *dev;
697 struct hlist_head *head = dev_index_hash(net, ifindex);
698
699 hlist_for_each_entry(dev, p, head, index_hlist)
700 if (dev->ifindex == ifindex)
701 return dev;
702
703 return NULL;
704 }
705 EXPORT_SYMBOL(__dev_get_by_index);
706
707 /**
708 * dev_get_by_index_rcu - find a device by its ifindex
709 * @net: the applicable net namespace
710 * @ifindex: index of device
711 *
712 * Search for an interface by index. Returns %NULL if the device
713 * is not found or a pointer to the device. The device has not
714 * had its reference counter increased so the caller must be careful
715 * about locking. The caller must hold RCU lock.
716 */
717
718 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
719 {
720 struct hlist_node *p;
721 struct net_device *dev;
722 struct hlist_head *head = dev_index_hash(net, ifindex);
723
724 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
725 if (dev->ifindex == ifindex)
726 return dev;
727
728 return NULL;
729 }
730 EXPORT_SYMBOL(dev_get_by_index_rcu);
731
732
733 /**
734 * dev_get_by_index - find a device by its ifindex
735 * @net: the applicable net namespace
736 * @ifindex: index of device
737 *
738 * Search for an interface by index. Returns NULL if the device
739 * is not found or a pointer to the device. The device returned has
740 * had a reference added and the pointer is safe until the user calls
741 * dev_put to indicate they have finished with it.
742 */
743
744 struct net_device *dev_get_by_index(struct net *net, int ifindex)
745 {
746 struct net_device *dev;
747
748 rcu_read_lock();
749 dev = dev_get_by_index_rcu(net, ifindex);
750 if (dev)
751 dev_hold(dev);
752 rcu_read_unlock();
753 return dev;
754 }
755 EXPORT_SYMBOL(dev_get_by_index);
756
757 /**
758 * dev_getbyhwaddr_rcu - find a device by its hardware address
759 * @net: the applicable net namespace
760 * @type: media type of device
761 * @ha: hardware address
762 *
763 * Search for an interface by MAC address. Returns NULL if the device
764 * is not found or a pointer to the device.
765 * The caller must hold RCU or RTNL.
766 * The returned device has not had its ref count increased
767 * and the caller must therefore be careful about locking
768 *
769 */
770
771 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
772 const char *ha)
773 {
774 struct net_device *dev;
775
776 for_each_netdev_rcu(net, dev)
777 if (dev->type == type &&
778 !memcmp(dev->dev_addr, ha, dev->addr_len))
779 return dev;
780
781 return NULL;
782 }
783 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
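/*
 * Illustrative sketch: dev_getbyhwaddr_rcu() must run inside rcu_read_lock()
 * (or under RTNL) and returns an unreferenced pointer, so take a reference
 * with dev_hold() before leaving the read-side section if the device is
 * needed afterwards.  The helper name is hypothetical.
 */
static struct net_device *example_find_by_mac(struct net *net, const char *mac)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;		/* caller must dev_put() when finished */
}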
784
785 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
786 {
787 struct net_device *dev;
788
789 ASSERT_RTNL();
790 for_each_netdev(net, dev)
791 if (dev->type == type)
792 return dev;
793
794 return NULL;
795 }
796 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
797
798 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
799 {
800 struct net_device *dev, *ret = NULL;
801
802 rcu_read_lock();
803 for_each_netdev_rcu(net, dev)
804 if (dev->type == type) {
805 dev_hold(dev);
806 ret = dev;
807 break;
808 }
809 rcu_read_unlock();
810 return ret;
811 }
812 EXPORT_SYMBOL(dev_getfirstbyhwtype);
813
814 /**
815 * dev_get_by_flags_rcu - find any device with given flags
816 * @net: the applicable net namespace
817 * @if_flags: IFF_* values
818 * @mask: bitmask of bits in if_flags to check
819 *
820 * Search for any interface with the given flags. Returns NULL if a device
821 * is not found or a pointer to the device. Must be called inside
822 * rcu_read_lock(), and result refcount is unchanged.
823 */
824
825 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
826 unsigned short mask)
827 {
828 struct net_device *dev, *ret;
829
830 ret = NULL;
831 for_each_netdev_rcu(net, dev) {
832 if (((dev->flags ^ if_flags) & mask) == 0) {
833 ret = dev;
834 break;
835 }
836 }
837 return ret;
838 }
839 EXPORT_SYMBOL(dev_get_by_flags_rcu);
840
841 /**
842 * dev_valid_name - check if name is okay for network device
843 * @name: name string
844 *
 845  *      Network device names need to be valid file names
 846  *      to allow sysfs to work.  We also disallow any kind of
847 * whitespace.
848 */
849 bool dev_valid_name(const char *name)
850 {
851 if (*name == '\0')
852 return false;
853 if (strlen(name) >= IFNAMSIZ)
854 return false;
855 if (!strcmp(name, ".") || !strcmp(name, ".."))
856 return false;
857
858 while (*name) {
859 if (*name == '/' || isspace(*name))
860 return false;
861 name++;
862 }
863 return true;
864 }
865 EXPORT_SYMBOL(dev_valid_name);
866
867 /**
868 * __dev_alloc_name - allocate a name for a device
869 * @net: network namespace to allocate the device name in
870 * @name: name format string
871 * @buf: scratch buffer and result name string
872 *
 873  *      Passed a format string - eg "lt%d" - it will try to find a suitable
874 * id. It scans list of devices to build up a free map, then chooses
875 * the first empty slot. The caller must hold the dev_base or rtnl lock
876 * while allocating the name and adding the device in order to avoid
877 * duplicates.
878 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
879 * Returns the number of the unit assigned or a negative errno code.
880 */
881
882 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
883 {
884 int i = 0;
885 const char *p;
886 const int max_netdevices = 8*PAGE_SIZE;
887 unsigned long *inuse;
888 struct net_device *d;
889
890 p = strnchr(name, IFNAMSIZ-1, '%');
891 if (p) {
892 /*
893 * Verify the string as this thing may have come from
894 * the user. There must be either one "%d" and no other "%"
895 * characters.
896 */
897 if (p[1] != 'd' || strchr(p + 2, '%'))
898 return -EINVAL;
899
900 /* Use one page as a bit array of possible slots */
901 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
902 if (!inuse)
903 return -ENOMEM;
904
905 for_each_netdev(net, d) {
906 if (!sscanf(d->name, name, &i))
907 continue;
908 if (i < 0 || i >= max_netdevices)
909 continue;
910
911 /* avoid cases where sscanf is not exact inverse of printf */
912 snprintf(buf, IFNAMSIZ, name, i);
913 if (!strncmp(buf, d->name, IFNAMSIZ))
914 set_bit(i, inuse);
915 }
916
917 i = find_first_zero_bit(inuse, max_netdevices);
918 free_page((unsigned long) inuse);
919 }
920
921 if (buf != name)
922 snprintf(buf, IFNAMSIZ, name, i);
923 if (!__dev_get_by_name(net, buf))
924 return i;
925
926 /* It is possible to run out of possible slots
927 * when the name is long and there isn't enough space left
928 * for the digits, or if all bits are used.
929 */
930 return -ENFILE;
931 }
932
933 /**
934 * dev_alloc_name - allocate a name for a device
935 * @dev: device
936 * @name: name format string
937 *
 938  *      Passed a format string - eg "lt%d" - it will try to find a suitable
939 * id. It scans list of devices to build up a free map, then chooses
940 * the first empty slot. The caller must hold the dev_base or rtnl lock
941 * while allocating the name and adding the device in order to avoid
942 * duplicates.
943 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
944 * Returns the number of the unit assigned or a negative errno code.
945 */
946
947 int dev_alloc_name(struct net_device *dev, const char *name)
948 {
949 char buf[IFNAMSIZ];
950 struct net *net;
951 int ret;
952
953 BUG_ON(!dev_net(dev));
954 net = dev_net(dev);
955 ret = __dev_alloc_name(net, name, buf);
956 if (ret >= 0)
957 strlcpy(dev->name, buf, IFNAMSIZ);
958 return ret;
959 }
960 EXPORT_SYMBOL(dev_alloc_name);
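/*
 * Illustrative sketch: a driver that registers devices with a templated
 * name picks the first free unit with dev_alloc_name() while holding RTNL.
 * The "foo%d" template and the function name are hypothetical.
 */
static int example_name_and_register(struct net_device *dev)
{
	int err;

	err = dev_alloc_name(dev, "foo%d");	/* e.g. becomes "foo0" */
	if (err < 0)
		return err;
	return register_netdevice(dev);		/* caller holds RTNL */
}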
961
962 static int dev_get_valid_name(struct net_device *dev, const char *name)
963 {
964 struct net *net;
965
966 BUG_ON(!dev_net(dev));
967 net = dev_net(dev);
968
969 if (!dev_valid_name(name))
970 return -EINVAL;
971
972 if (strchr(name, '%'))
973 return dev_alloc_name(dev, name);
974 else if (__dev_get_by_name(net, name))
975 return -EEXIST;
976 else if (dev->name != name)
977 strlcpy(dev->name, name, IFNAMSIZ);
978
979 return 0;
980 }
981
982 /**
983 * dev_change_name - change name of a device
984 * @dev: device
985 * @newname: name (or format string) must be at least IFNAMSIZ
986 *
 987  *      Change name of a device, can pass format strings "eth%d"
 988  *      for wildcarding.
989 */
990 int dev_change_name(struct net_device *dev, const char *newname)
991 {
992 char oldname[IFNAMSIZ];
993 int err = 0;
994 int ret;
995 struct net *net;
996
997 ASSERT_RTNL();
998 BUG_ON(!dev_net(dev));
999
1000 net = dev_net(dev);
1001 if (dev->flags & IFF_UP)
1002 return -EBUSY;
1003
1004 if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1005 return 0;
1006
1007 memcpy(oldname, dev->name, IFNAMSIZ);
1008
1009 err = dev_get_valid_name(dev, newname);
1010 if (err < 0)
1011 return err;
1012
1013 rollback:
1014 ret = device_rename(&dev->dev, dev->name);
1015 if (ret) {
1016 memcpy(dev->name, oldname, IFNAMSIZ);
1017 return ret;
1018 }
1019
1020 write_lock_bh(&dev_base_lock);
1021 hlist_del_rcu(&dev->name_hlist);
1022 write_unlock_bh(&dev_base_lock);
1023
1024 synchronize_rcu();
1025
1026 write_lock_bh(&dev_base_lock);
1027 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1028 write_unlock_bh(&dev_base_lock);
1029
1030 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1031 ret = notifier_to_errno(ret);
1032
1033 if (ret) {
1034 /* err >= 0 after dev_alloc_name() or stores the first errno */
1035 if (err >= 0) {
1036 err = ret;
1037 memcpy(dev->name, oldname, IFNAMSIZ);
1038 goto rollback;
1039 } else {
1040 pr_err("%s: name change rollback failed: %d\n",
1041 dev->name, ret);
1042 }
1043 }
1044
1045 return err;
1046 }
1047
1048 /**
1049 * dev_set_alias - change ifalias of a device
1050 * @dev: device
1051 * @alias: name up to IFALIASZ
1052 * @len: limit of bytes to copy from info
1053 *
1054 * Set ifalias for a device,
1055 */
1056 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1057 {
1058 char *new_ifalias;
1059
1060 ASSERT_RTNL();
1061
1062 if (len >= IFALIASZ)
1063 return -EINVAL;
1064
1065 if (!len) {
1066 if (dev->ifalias) {
1067 kfree(dev->ifalias);
1068 dev->ifalias = NULL;
1069 }
1070 return 0;
1071 }
1072
1073 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1074 if (!new_ifalias)
1075 return -ENOMEM;
1076 dev->ifalias = new_ifalias;
1077
1078 strlcpy(dev->ifalias, alias, len+1);
1079 return len;
1080 }
1081
1082
1083 /**
1084 * netdev_features_change - device changes features
1085 * @dev: device to cause notification
1086 *
1087 * Called to indicate a device has changed features.
1088 */
1089 void netdev_features_change(struct net_device *dev)
1090 {
1091 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1092 }
1093 EXPORT_SYMBOL(netdev_features_change);
1094
1095 /**
1096 * netdev_state_change - device changes state
1097 * @dev: device to cause notification
1098 *
1099 * Called to indicate a device has changed state. This function calls
1100 * the notifier chains for netdev_chain and sends a NEWLINK message
1101 * to the routing socket.
1102 */
1103 void netdev_state_change(struct net_device *dev)
1104 {
1105 if (dev->flags & IFF_UP) {
1106 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1107 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1108 }
1109 }
1110 EXPORT_SYMBOL(netdev_state_change);
1111
1112 /**
1113 * netdev_notify_peers - notify network peers about existence of @dev
1114 * @dev: network device
1115 *
1116 * Generate traffic such that interested network peers are aware of
1117 * @dev, such as by generating a gratuitous ARP. This may be used when
1118 * a device wants to inform the rest of the network about some sort of
1119 * reconfiguration such as a failover event or virtual machine
1120 * migration.
1121 */
1122 void netdev_notify_peers(struct net_device *dev)
1123 {
1124 rtnl_lock();
1125 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1126 rtnl_unlock();
1127 }
1128 EXPORT_SYMBOL(netdev_notify_peers);
1129
1130 /**
1131 * dev_load - load a network module
1132 * @net: the applicable net namespace
1133 * @name: name of interface
1134 *
1135 * If a network interface is not present and the process has suitable
1136 * privileges this function loads the module. If module loading is not
1137 * available in this kernel then it becomes a nop.
1138 */
1139
1140 void dev_load(struct net *net, const char *name)
1141 {
1142 struct net_device *dev;
1143 int no_module;
1144
1145 rcu_read_lock();
1146 dev = dev_get_by_name_rcu(net, name);
1147 rcu_read_unlock();
1148
1149 no_module = !dev;
1150 if (no_module && capable(CAP_NET_ADMIN))
1151 no_module = request_module("netdev-%s", name);
1152 if (no_module && capable(CAP_SYS_MODULE)) {
1153 if (!request_module("%s", name))
1154 pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1155 name);
1156 }
1157 }
1158 EXPORT_SYMBOL(dev_load);
1159
1160 static int __dev_open(struct net_device *dev)
1161 {
1162 const struct net_device_ops *ops = dev->netdev_ops;
1163 int ret;
1164
1165 ASSERT_RTNL();
1166
1167 if (!netif_device_present(dev))
1168 return -ENODEV;
1169
1170 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1171 ret = notifier_to_errno(ret);
1172 if (ret)
1173 return ret;
1174
1175 set_bit(__LINK_STATE_START, &dev->state);
1176
1177 if (ops->ndo_validate_addr)
1178 ret = ops->ndo_validate_addr(dev);
1179
1180 if (!ret && ops->ndo_open)
1181 ret = ops->ndo_open(dev);
1182
1183 if (ret)
1184 clear_bit(__LINK_STATE_START, &dev->state);
1185 else {
1186 dev->flags |= IFF_UP;
1187 net_dmaengine_get();
1188 dev_set_rx_mode(dev);
1189 dev_activate(dev);
1190 add_device_randomness(dev->dev_addr, dev->addr_len);
1191 }
1192
1193 return ret;
1194 }
1195
1196 /**
1197 * dev_open - prepare an interface for use.
1198 * @dev: device to open
1199 *
1200 * Takes a device from down to up state. The device's private open
1201 * function is invoked and then the multicast lists are loaded. Finally
1202 * the device is moved into the up state and a %NETDEV_UP message is
1203 * sent to the netdev notifier chain.
1204 *
1205 * Calling this function on an active interface is a nop. On a failure
1206 * a negative errno code is returned.
1207 */
1208 int dev_open(struct net_device *dev)
1209 {
1210 int ret;
1211
1212 if (dev->flags & IFF_UP)
1213 return 0;
1214
1215 ret = __dev_open(dev);
1216 if (ret < 0)
1217 return ret;
1218
1219 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1220 call_netdevice_notifiers(NETDEV_UP, dev);
1221
1222 return ret;
1223 }
1224 EXPORT_SYMBOL(dev_open);
1225
1226 static int __dev_close_many(struct list_head *head)
1227 {
1228 struct net_device *dev;
1229
1230 ASSERT_RTNL();
1231 might_sleep();
1232
1233 list_for_each_entry(dev, head, unreg_list) {
1234 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1235
1236 clear_bit(__LINK_STATE_START, &dev->state);
1237
1238 /* Synchronize to scheduled poll. We cannot touch poll list, it
1239 * can be even on different cpu. So just clear netif_running().
1240 *
1241                  * dev->stop() will invoke napi_disable() on all of its
1242 * napi_struct instances on this device.
1243 */
1244 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1245 }
1246
1247 dev_deactivate_many(head);
1248
1249 list_for_each_entry(dev, head, unreg_list) {
1250 const struct net_device_ops *ops = dev->netdev_ops;
1251
1252 /*
1253 * Call the device specific close. This cannot fail.
1254 * Only if device is UP
1255 *
1256 * We allow it to be called even after a DETACH hot-plug
1257 * event.
1258 */
1259 if (ops->ndo_stop)
1260 ops->ndo_stop(dev);
1261
1262 dev->flags &= ~IFF_UP;
1263 net_dmaengine_put();
1264 }
1265
1266 return 0;
1267 }
1268
1269 static int __dev_close(struct net_device *dev)
1270 {
1271 int retval;
1272 LIST_HEAD(single);
1273
1274 list_add(&dev->unreg_list, &single);
1275 retval = __dev_close_many(&single);
1276 list_del(&single);
1277 return retval;
1278 }
1279
1280 static int dev_close_many(struct list_head *head)
1281 {
1282 struct net_device *dev, *tmp;
1283 LIST_HEAD(tmp_list);
1284
1285 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1286 if (!(dev->flags & IFF_UP))
1287 list_move(&dev->unreg_list, &tmp_list);
1288
1289 __dev_close_many(head);
1290
1291 list_for_each_entry(dev, head, unreg_list) {
1292 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1293 call_netdevice_notifiers(NETDEV_DOWN, dev);
1294 }
1295
1296 /* rollback_registered_many needs the complete original list */
1297 list_splice(&tmp_list, head);
1298 return 0;
1299 }
1300
1301 /**
1302 * dev_close - shutdown an interface.
1303 * @dev: device to shutdown
1304 *
1305 * This function moves an active device into down state. A
1306 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1307 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1308 * chain.
1309 */
1310 int dev_close(struct net_device *dev)
1311 {
1312 if (dev->flags & IFF_UP) {
1313 LIST_HEAD(single);
1314
1315 list_add(&dev->unreg_list, &single);
1316 dev_close_many(&single);
1317 list_del(&single);
1318 }
1319 return 0;
1320 }
1321 EXPORT_SYMBOL(dev_close);
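/*
 * Illustrative sketch: administratively bouncing an interface from kernel
 * code.  Both dev_open() and dev_close() expect the RTNL semaphore to be
 * held.  The function name is made up.
 */
static int example_bounce(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev);
	if (!err)
		dev_close(dev);
	rtnl_unlock();
	return err;
}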
1322
1323
1324 /**
1325 * dev_disable_lro - disable Large Receive Offload on a device
1326 * @dev: device
1327 *
1328 * Disable Large Receive Offload (LRO) on a net device. Must be
1329 * called under RTNL. This is needed if received packets may be
1330 * forwarded to another interface.
1331 */
1332 void dev_disable_lro(struct net_device *dev)
1333 {
1334 /*
1335 * If we're trying to disable lro on a vlan device
1336 * use the underlying physical device instead
1337 */
1338 if (is_vlan_dev(dev))
1339 dev = vlan_dev_real_dev(dev);
1340
1341 dev->wanted_features &= ~NETIF_F_LRO;
1342 netdev_update_features(dev);
1343
1344 if (unlikely(dev->features & NETIF_F_LRO))
1345 netdev_WARN(dev, "failed to disable LRO!\n");
1346 }
1347 EXPORT_SYMBOL(dev_disable_lro);
1348
1349
1350 static int dev_boot_phase = 1;
1351
1352 /**
1353 * register_netdevice_notifier - register a network notifier block
1354 * @nb: notifier
1355 *
1356 * Register a notifier to be called when network device events occur.
1357 * The notifier passed is linked into the kernel structures and must
1358 * not be reused until it has been unregistered. A negative errno code
1359 * is returned on a failure.
1360 *
1361 * When registered all registration and up events are replayed
1362  *      to the new notifier to allow it to have a race-free
1363 * view of the network device list.
1364 */
1365
1366 int register_netdevice_notifier(struct notifier_block *nb)
1367 {
1368 struct net_device *dev;
1369 struct net_device *last;
1370 struct net *net;
1371 int err;
1372
1373 rtnl_lock();
1374 err = raw_notifier_chain_register(&netdev_chain, nb);
1375 if (err)
1376 goto unlock;
1377 if (dev_boot_phase)
1378 goto unlock;
1379 for_each_net(net) {
1380 for_each_netdev(net, dev) {
1381 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1382 err = notifier_to_errno(err);
1383 if (err)
1384 goto rollback;
1385
1386 if (!(dev->flags & IFF_UP))
1387 continue;
1388
1389 nb->notifier_call(nb, NETDEV_UP, dev);
1390 }
1391 }
1392
1393 unlock:
1394 rtnl_unlock();
1395 return err;
1396
1397 rollback:
1398 last = dev;
1399 for_each_net(net) {
1400 for_each_netdev(net, dev) {
1401 if (dev == last)
1402 goto outroll;
1403
1404 if (dev->flags & IFF_UP) {
1405 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1406 nb->notifier_call(nb, NETDEV_DOWN, dev);
1407 }
1408 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1409 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1410 }
1411 }
1412
1413 outroll:
1414 raw_notifier_chain_unregister(&netdev_chain, nb);
1415 goto unlock;
1416 }
1417 EXPORT_SYMBOL(register_netdevice_notifier);
1418
1419 /**
1420 * unregister_netdevice_notifier - unregister a network notifier block
1421 * @nb: notifier
1422 *
1423 * Unregister a notifier previously registered by
1424  *      register_netdevice_notifier(). The notifier is unlinked from the
1425 * kernel structures and may then be reused. A negative errno code
1426 * is returned on a failure.
1427 *
1428  *      After unregistering, unregister and down device events are synthesized
1429 * for all devices on the device list to the removed notifier to remove
1430 * the need for special case cleanup code.
1431 */
1432
1433 int unregister_netdevice_notifier(struct notifier_block *nb)
1434 {
1435 struct net_device *dev;
1436 struct net *net;
1437 int err;
1438
1439 rtnl_lock();
1440 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1441 if (err)
1442 goto unlock;
1443
1444 for_each_net(net) {
1445 for_each_netdev(net, dev) {
1446 if (dev->flags & IFF_UP) {
1447 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1448 nb->notifier_call(nb, NETDEV_DOWN, dev);
1449 }
1450 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1451 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1452 }
1453 }
1454 unlock:
1455 rtnl_unlock();
1456 return err;
1457 }
1458 EXPORT_SYMBOL(unregister_netdevice_notifier);
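/*
 * Illustrative sketch: a minimal notifier block reacting to device events.
 * In this kernel the notifier payload is the struct net_device itself.
 * The callback and block names are hypothetical.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	switch (event) {
	case NETDEV_UP:
		pr_info("%s is up\n", dev->name);
		break;
	case NETDEV_DOWN:
		pr_info("%s is down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
	.notifier_call = example_netdev_event,
};

/* register_netdevice_notifier(&example_netdev_nb); */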
1459
1460 /**
1461 * call_netdevice_notifiers - call all network notifier blocks
1462 * @val: value passed unmodified to notifier function
1463 * @dev: net_device pointer passed unmodified to notifier function
1464 *
1465 * Call all network notifier blocks. Parameters and return value
1466 * are as for raw_notifier_call_chain().
1467 */
1468
1469 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1470 {
1471 ASSERT_RTNL();
1472 return raw_notifier_call_chain(&netdev_chain, val, dev);
1473 }
1474 EXPORT_SYMBOL(call_netdevice_notifiers);
1475
1476 static struct static_key netstamp_needed __read_mostly;
1477 #ifdef HAVE_JUMP_LABEL
1478 /* We are not allowed to call static_key_slow_dec() from irq context
1479 * If net_disable_timestamp() is called from irq context, defer the
1480 * static_key_slow_dec() calls.
1481 */
1482 static atomic_t netstamp_needed_deferred;
1483 #endif
1484
1485 void net_enable_timestamp(void)
1486 {
1487 #ifdef HAVE_JUMP_LABEL
1488 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1489
1490 if (deferred) {
1491 while (--deferred)
1492 static_key_slow_dec(&netstamp_needed);
1493 return;
1494 }
1495 #endif
1496 WARN_ON(in_interrupt());
1497 static_key_slow_inc(&netstamp_needed);
1498 }
1499 EXPORT_SYMBOL(net_enable_timestamp);
1500
1501 void net_disable_timestamp(void)
1502 {
1503 #ifdef HAVE_JUMP_LABEL
1504 if (in_interrupt()) {
1505 atomic_inc(&netstamp_needed_deferred);
1506 return;
1507 }
1508 #endif
1509 static_key_slow_dec(&netstamp_needed);
1510 }
1511 EXPORT_SYMBOL(net_disable_timestamp);
1512
1513 static inline void net_timestamp_set(struct sk_buff *skb)
1514 {
1515 skb->tstamp.tv64 = 0;
1516 if (static_key_false(&netstamp_needed))
1517 __net_timestamp(skb);
1518 }
1519
1520 #define net_timestamp_check(COND, SKB) \
1521 if (static_key_false(&netstamp_needed)) { \
1522 if ((COND) && !(SKB)->tstamp.tv64) \
1523 __net_timestamp(SKB); \
1524 } \
1525
1526 static int net_hwtstamp_validate(struct ifreq *ifr)
1527 {
1528 struct hwtstamp_config cfg;
1529 enum hwtstamp_tx_types tx_type;
1530 enum hwtstamp_rx_filters rx_filter;
1531 int tx_type_valid = 0;
1532 int rx_filter_valid = 0;
1533
1534 if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1535 return -EFAULT;
1536
1537 if (cfg.flags) /* reserved for future extensions */
1538 return -EINVAL;
1539
1540 tx_type = cfg.tx_type;
1541 rx_filter = cfg.rx_filter;
1542
1543 switch (tx_type) {
1544 case HWTSTAMP_TX_OFF:
1545 case HWTSTAMP_TX_ON:
1546 case HWTSTAMP_TX_ONESTEP_SYNC:
1547 tx_type_valid = 1;
1548 break;
1549 }
1550
1551 switch (rx_filter) {
1552 case HWTSTAMP_FILTER_NONE:
1553 case HWTSTAMP_FILTER_ALL:
1554 case HWTSTAMP_FILTER_SOME:
1555 case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1556 case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1557 case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1558 case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1559 case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1560 case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1561 case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1562 case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1563 case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1564 case HWTSTAMP_FILTER_PTP_V2_EVENT:
1565 case HWTSTAMP_FILTER_PTP_V2_SYNC:
1566 case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1567 rx_filter_valid = 1;
1568 break;
1569 }
1570
1571 if (!tx_type_valid || !rx_filter_valid)
1572 return -ERANGE;
1573
1574 return 0;
1575 }
1576
1577 static inline bool is_skb_forwardable(struct net_device *dev,
1578 struct sk_buff *skb)
1579 {
1580 unsigned int len;
1581
1582 if (!(dev->flags & IFF_UP))
1583 return false;
1584
1585 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1586 if (skb->len <= len)
1587 return true;
1588
1589 /* if TSO is enabled, we don't care about the length as the packet
1590 * could be forwarded without being segmented before
1591 */
1592 if (skb_is_gso(skb))
1593 return true;
1594
1595 return false;
1596 }
1597
1598 /**
1599 * dev_forward_skb - loopback an skb to another netif
1600 *
1601 * @dev: destination network device
1602 * @skb: buffer to forward
1603 *
1604 * return values:
1605 * NET_RX_SUCCESS (no congestion)
1606 * NET_RX_DROP (packet was dropped, but freed)
1607 *
1608 * dev_forward_skb can be used for injecting an skb from the
1609 * start_xmit function of one device into the receive queue
1610 * of another device.
1611 *
1612 * The receiving device may be in another namespace, so
1613 * we have to clear all information in the skb that could
1614 * impact namespace isolation.
1615 */
1616 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1617 {
1618 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1619 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1620 atomic_long_inc(&dev->rx_dropped);
1621 kfree_skb(skb);
1622 return NET_RX_DROP;
1623 }
1624 }
1625
1626 skb_orphan(skb);
1627 nf_reset(skb);
1628
1629 if (unlikely(!is_skb_forwardable(dev, skb))) {
1630 atomic_long_inc(&dev->rx_dropped);
1631 kfree_skb(skb);
1632 return NET_RX_DROP;
1633 }
1634 skb->skb_iif = 0;
1635 skb->dev = dev;
1636 skb_dst_drop(skb);
1637 skb->tstamp.tv64 = 0;
1638 skb->pkt_type = PACKET_HOST;
1639 skb->protocol = eth_type_trans(skb, dev);
1640 skb->mark = 0;
1641 secpath_reset(skb);
1642 nf_reset(skb);
1643 return netif_rx(skb);
1644 }
1645 EXPORT_SYMBOL_GPL(dev_forward_skb);
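/*
 * Illustrative sketch: a pair-style driver (compare veth) handing a frame
 * it was asked to transmit straight to its peer's receive path.  Keeping
 * the peer pointer in ml_priv is an assumption of this sketch, not
 * something dev.c mandates.
 */
static netdev_tx_t example_pair_xmit(struct sk_buff *skb,
				     struct net_device *dev)
{
	struct net_device *peer = dev->ml_priv;	/* hypothetical peer */
	unsigned int len = skb->len;		/* skb may be freed below */

	if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS) {
		dev->stats.tx_packets++;
		dev->stats.tx_bytes += len;
	} else {
		dev->stats.tx_dropped++;
	}
	return NETDEV_TX_OK;
}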
1646
1647 static inline int deliver_skb(struct sk_buff *skb,
1648 struct packet_type *pt_prev,
1649 struct net_device *orig_dev)
1650 {
1651 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1652 return -ENOMEM;
1653 atomic_inc(&skb->users);
1654 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1655 }
1656
1657 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1658 {
1659 if (ptype->af_packet_priv == NULL)
1660 return false;
1661
1662 if (ptype->id_match)
1663 return ptype->id_match(ptype, skb->sk);
1664 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1665 return true;
1666
1667 return false;
1668 }
1669
1670 /*
1671 * Support routine. Sends outgoing frames to any network
1672 * taps currently in use.
1673 */
1674
1675 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1676 {
1677 struct packet_type *ptype;
1678 struct sk_buff *skb2 = NULL;
1679 struct packet_type *pt_prev = NULL;
1680
1681 rcu_read_lock();
1682 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1683 /* Never send packets back to the socket
1684 * they originated from - MvS (miquels@drinkel.ow.org)
1685 */
1686 if ((ptype->dev == dev || !ptype->dev) &&
1687 (!skb_loop_sk(ptype, skb))) {
1688 if (pt_prev) {
1689 deliver_skb(skb2, pt_prev, skb->dev);
1690 pt_prev = ptype;
1691 continue;
1692 }
1693
1694 skb2 = skb_clone(skb, GFP_ATOMIC);
1695 if (!skb2)
1696 break;
1697
1698 net_timestamp_set(skb2);
1699
1700 /* skb->nh should be correctly
1701 set by sender, so that the second statement is
1702 just protection against buggy protocols.
1703 */
1704 skb_reset_mac_header(skb2);
1705
1706 if (skb_network_header(skb2) < skb2->data ||
1707 skb2->network_header > skb2->tail) {
1708 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1709 ntohs(skb2->protocol),
1710 dev->name);
1711 skb_reset_network_header(skb2);
1712 }
1713
1714 skb2->transport_header = skb2->network_header;
1715 skb2->pkt_type = PACKET_OUTGOING;
1716 pt_prev = ptype;
1717 }
1718 }
1719 if (pt_prev)
1720 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1721 rcu_read_unlock();
1722 }
1723
1724 /**
1725 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1726 * @dev: Network device
1727 * @txq: number of queues available
1728 *
1729 * If real_num_tx_queues is changed the tc mappings may no longer be
1730 * valid. To resolve this verify the tc mapping remains valid and if
1731  * not, reset the mapping.  With no priorities mapping to this
1732  * offset/count pair it will no longer be used.  In the worst case, if TC0
1733  * is invalid, nothing can be done, so priority mappings are disabled.  It is
1734  * expected that drivers will fix this mapping if they can before
1735 * calling netif_set_real_num_tx_queues.
1736 */
1737 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1738 {
1739 int i;
1740 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1741
1742 /* If TC0 is invalidated disable TC mapping */
1743 if (tc->offset + tc->count > txq) {
1744 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1745 dev->num_tc = 0;
1746 return;
1747 }
1748
1749 /* Invalidated prio to tc mappings set to TC0 */
1750 for (i = 1; i < TC_BITMASK + 1; i++) {
1751 int q = netdev_get_prio_tc_map(dev, i);
1752
1753 tc = &dev->tc_to_txq[q];
1754 if (tc->offset + tc->count > txq) {
1755 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1756 i, q);
1757 netdev_set_prio_tc_map(dev, i, 0);
1758 }
1759 }
1760 }
1761
1762 /*
1763 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1764  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1765 */
1766 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1767 {
1768 int rc;
1769
1770 if (txq < 1 || txq > dev->num_tx_queues)
1771 return -EINVAL;
1772
1773 if (dev->reg_state == NETREG_REGISTERED ||
1774 dev->reg_state == NETREG_UNREGISTERING) {
1775 ASSERT_RTNL();
1776
1777 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1778 txq);
1779 if (rc)
1780 return rc;
1781
1782 if (dev->num_tc)
1783 netif_setup_tc(dev, txq);
1784
1785 if (txq < dev->real_num_tx_queues)
1786 qdisc_reset_all_tx_gt(dev, txq);
1787 }
1788
1789 dev->real_num_tx_queues = txq;
1790 return 0;
1791 }
1792 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1793
1794 #ifdef CONFIG_RPS
1795 /**
1796 * netif_set_real_num_rx_queues - set actual number of RX queues used
1797 * @dev: Network device
1798 * @rxq: Actual number of RX queues
1799 *
1800 * This must be called either with the rtnl_lock held or before
1801 * registration of the net device. Returns 0 on success, or a
1802 * negative error code. If called before registration, it always
1803 * succeeds.
1804 */
1805 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1806 {
1807 int rc;
1808
1809 if (rxq < 1 || rxq > dev->num_rx_queues)
1810 return -EINVAL;
1811
1812 if (dev->reg_state == NETREG_REGISTERED) {
1813 ASSERT_RTNL();
1814
1815 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1816 rxq);
1817 if (rc)
1818 return rc;
1819 }
1820
1821 dev->real_num_rx_queues = rxq;
1822 return 0;
1823 }
1824 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1825 #endif
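/*
 * Illustrative sketch: a multiqueue driver trimming the number of active
 * queues after discovering how many interrupt vectors it really obtained.
 * The function name and the nvec parameter are hypothetical.
 */
static int example_shrink_queues(struct net_device *dev, unsigned int nvec)
{
	int err;

	rtnl_lock();
	err = netif_set_real_num_tx_queues(dev, nvec);
	if (!err)
		err = netif_set_real_num_rx_queues(dev, nvec);
	rtnl_unlock();
	return err;
}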
1826
1827 /**
1828 * netif_get_num_default_rss_queues - default number of RSS queues
1829 *
1830 * This routine should set an upper limit on the number of RSS queues
1831 * used by default by multiqueue devices.
1832 */
1833 int netif_get_num_default_rss_queues(void)
1834 {
1835 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
1836 }
1837 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
1838
1839 static inline void __netif_reschedule(struct Qdisc *q)
1840 {
1841 struct softnet_data *sd;
1842 unsigned long flags;
1843
1844 local_irq_save(flags);
1845 sd = &__get_cpu_var(softnet_data);
1846 q->next_sched = NULL;
1847 *sd->output_queue_tailp = q;
1848 sd->output_queue_tailp = &q->next_sched;
1849 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1850 local_irq_restore(flags);
1851 }
1852
1853 void __netif_schedule(struct Qdisc *q)
1854 {
1855 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1856 __netif_reschedule(q);
1857 }
1858 EXPORT_SYMBOL(__netif_schedule);
1859
1860 void dev_kfree_skb_irq(struct sk_buff *skb)
1861 {
1862 if (atomic_dec_and_test(&skb->users)) {
1863 struct softnet_data *sd;
1864 unsigned long flags;
1865
1866 local_irq_save(flags);
1867 sd = &__get_cpu_var(softnet_data);
1868 skb->next = sd->completion_queue;
1869 sd->completion_queue = skb;
1870 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1871 local_irq_restore(flags);
1872 }
1873 }
1874 EXPORT_SYMBOL(dev_kfree_skb_irq);
1875
1876 void dev_kfree_skb_any(struct sk_buff *skb)
1877 {
1878 if (in_irq() || irqs_disabled())
1879 dev_kfree_skb_irq(skb);
1880 else
1881 dev_kfree_skb(skb);
1882 }
1883 EXPORT_SYMBOL(dev_kfree_skb_any);
1884
1885
1886 /**
1887 * netif_device_detach - mark device as removed
1888 * @dev: network device
1889 *
1890  * Mark the device as removed from the system and therefore no longer available.
1891 */
1892 void netif_device_detach(struct net_device *dev)
1893 {
1894 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1895 netif_running(dev)) {
1896 netif_tx_stop_all_queues(dev);
1897 }
1898 }
1899 EXPORT_SYMBOL(netif_device_detach);
1900
1901 /**
1902 * netif_device_attach - mark device as attached
1903 * @dev: network device
1904 *
1905  * Mark the device as attached to the system and restart it if needed.
1906 */
1907 void netif_device_attach(struct net_device *dev)
1908 {
1909 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1910 netif_running(dev)) {
1911 netif_tx_wake_all_queues(dev);
1912 __netdev_watchdog_up(dev);
1913 }
1914 }
1915 EXPORT_SYMBOL(netif_device_attach);
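/*
 * Illustrative sketch: typical suspend/resume hooks pair
 * netif_device_detach() with netif_device_attach().  The hardware power
 * calls are placeholders.
 */
static int example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);
	/* ... put the hardware to sleep ... */
	return 0;
}

static int example_resume(struct net_device *dev)
{
	/* ... wake the hardware back up ... */
	netif_device_attach(dev);
	return 0;
}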
1916
1917 static void skb_warn_bad_offload(const struct sk_buff *skb)
1918 {
1919 static const netdev_features_t null_features = 0;
1920 struct net_device *dev = skb->dev;
1921 const char *driver = "";
1922
1923 if (dev && dev->dev.parent)
1924 driver = dev_driver_string(dev->dev.parent);
1925
1926 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
1927 "gso_type=%d ip_summed=%d\n",
1928 driver, dev ? &dev->features : &null_features,
1929 skb->sk ? &skb->sk->sk_route_caps : &null_features,
1930 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
1931 skb_shinfo(skb)->gso_type, skb->ip_summed);
1932 }
1933
1934 /*
1935 * Invalidate hardware checksum when packet is to be mangled, and
1936 * complete checksum manually on outgoing path.
1937 */
1938 int skb_checksum_help(struct sk_buff *skb)
1939 {
1940 __wsum csum;
1941 int ret = 0, offset;
1942
1943 if (skb->ip_summed == CHECKSUM_COMPLETE)
1944 goto out_set_summed;
1945
1946 if (unlikely(skb_shinfo(skb)->gso_size)) {
1947 skb_warn_bad_offload(skb);
1948 return -EINVAL;
1949 }
1950
1951 offset = skb_checksum_start_offset(skb);
1952 BUG_ON(offset >= skb_headlen(skb));
1953 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1954
1955 offset += skb->csum_offset;
1956 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1957
1958 if (skb_cloned(skb) &&
1959 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1960 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1961 if (ret)
1962 goto out;
1963 }
1964
1965 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1966 out_set_summed:
1967 skb->ip_summed = CHECKSUM_NONE;
1968 out:
1969 return ret;
1970 }
1971 EXPORT_SYMBOL(skb_checksum_help);
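/*
 * Editor's note (illustrative sketch, not part of the kernel source):
 * this mirrors the fallback performed later in dev_hard_start_xmit():
 * if an skb still carries CHECKSUM_PARTIAL but the device cannot
 * checksum this protocol, the checksum is completed in software before
 * the skb is handed to the driver.  example_tx_fixup_csum() is a
 * hypothetical helper, not kernel API.
 *
 *	static int example_tx_fixup_csum(struct sk_buff *skb,
 *					 netdev_features_t features)
 *	{
 *		if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *		    !(features & NETIF_F_ALL_CSUM))
 *			return skb_checksum_help(skb);
 *		return 0;
 *	}
 */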
1972
1973 /**
1974 * skb_gso_segment - Perform segmentation on skb.
1975 * @skb: buffer to segment
1976 * @features: features for the output path (see dev->features)
1977 *
1978 * This function segments the given skb and returns a list of segments.
1979 *
1980 * It may return NULL if the skb requires no segmentation. This is
1981 * only possible when GSO is used for verifying header integrity.
1982 */
1983 struct sk_buff *skb_gso_segment(struct sk_buff *skb,
1984 netdev_features_t features)
1985 {
1986 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1987 struct packet_type *ptype;
1988 __be16 type = skb->protocol;
1989 int vlan_depth = ETH_HLEN;
1990 int err;
1991
1992 while (type == htons(ETH_P_8021Q)) {
1993 struct vlan_hdr *vh;
1994
1995 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1996 return ERR_PTR(-EINVAL);
1997
1998 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1999 type = vh->h_vlan_encapsulated_proto;
2000 vlan_depth += VLAN_HLEN;
2001 }
2002
2003 skb_reset_mac_header(skb);
2004 skb->mac_len = skb->network_header - skb->mac_header;
2005 __skb_pull(skb, skb->mac_len);
2006
2007 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2008 skb_warn_bad_offload(skb);
2009
2010 if (skb_header_cloned(skb) &&
2011 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2012 return ERR_PTR(err);
2013 }
2014
2015 rcu_read_lock();
2016 list_for_each_entry_rcu(ptype,
2017 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2018 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
2019 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2020 err = ptype->gso_send_check(skb);
2021 segs = ERR_PTR(err);
2022 if (err || skb_gso_ok(skb, features))
2023 break;
2024 __skb_push(skb, (skb->data -
2025 skb_network_header(skb)));
2026 }
2027 segs = ptype->gso_segment(skb, features);
2028 break;
2029 }
2030 }
2031 rcu_read_unlock();
2032
2033 __skb_push(skb, skb->data - skb_mac_header(skb));
2034
2035 return segs;
2036 }
2037 EXPORT_SYMBOL(skb_gso_segment);
2038
2039 /* Take action when hardware reception checksum errors are detected. */
2040 #ifdef CONFIG_BUG
2041 void netdev_rx_csum_fault(struct net_device *dev)
2042 {
2043 if (net_ratelimit()) {
2044 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2045 dump_stack();
2046 }
2047 }
2048 EXPORT_SYMBOL(netdev_rx_csum_fault);
2049 #endif
2050
2051 /* Actually, we should eliminate this check as soon as we know that:
2052 * 1. An IOMMU is present and allows mapping all the memory.
2053 * 2. No high memory really exists on this machine.
2054 */
2055
2056 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2057 {
2058 #ifdef CONFIG_HIGHMEM
2059 int i;
2060 if (!(dev->features & NETIF_F_HIGHDMA)) {
2061 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2062 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2063 if (PageHighMem(skb_frag_page(frag)))
2064 return 1;
2065 }
2066 }
2067
2068 if (PCI_DMA_BUS_IS_PHYS) {
2069 struct device *pdev = dev->dev.parent;
2070
2071 if (!pdev)
2072 return 0;
2073 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2074 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2075 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2076 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2077 return 1;
2078 }
2079 }
2080 #endif
2081 return 0;
2082 }
2083
2084 struct dev_gso_cb {
2085 void (*destructor)(struct sk_buff *skb);
2086 };
2087
2088 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2089
2090 static void dev_gso_skb_destructor(struct sk_buff *skb)
2091 {
2092 struct dev_gso_cb *cb;
2093
2094 do {
2095 struct sk_buff *nskb = skb->next;
2096
2097 skb->next = nskb->next;
2098 nskb->next = NULL;
2099 kfree_skb(nskb);
2100 } while (skb->next);
2101
2102 cb = DEV_GSO_CB(skb);
2103 if (cb->destructor)
2104 cb->destructor(skb);
2105 }
2106
2107 /**
2108 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2109 * @skb: buffer to segment
2110 * @features: device features as applicable to this skb
2111 *
2112 * This function segments the given skb and stores the list of segments
2113 * in skb->next.
2114 */
2115 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2116 {
2117 struct sk_buff *segs;
2118
2119 segs = skb_gso_segment(skb, features);
2120
2121 /* Verifying header integrity only. */
2122 if (!segs)
2123 return 0;
2124
2125 if (IS_ERR(segs))
2126 return PTR_ERR(segs);
2127
2128 skb->next = segs;
2129 DEV_GSO_CB(skb)->destructor = skb->destructor;
2130 skb->destructor = dev_gso_skb_destructor;
2131
2132 return 0;
2133 }
2134
2135 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2136 {
2137 return ((features & NETIF_F_GEN_CSUM) ||
2138 ((features & NETIF_F_V4_CSUM) &&
2139 protocol == htons(ETH_P_IP)) ||
2140 ((features & NETIF_F_V6_CSUM) &&
2141 protocol == htons(ETH_P_IPV6)) ||
2142 ((features & NETIF_F_FCOE_CRC) &&
2143 protocol == htons(ETH_P_FCOE)));
2144 }
2145
2146 static netdev_features_t harmonize_features(struct sk_buff *skb,
2147 __be16 protocol, netdev_features_t features)
2148 {
2149 if (!can_checksum_protocol(features, protocol)) {
2150 features &= ~NETIF_F_ALL_CSUM;
2151 features &= ~NETIF_F_SG;
2152 } else if (illegal_highdma(skb->dev, skb)) {
2153 features &= ~NETIF_F_SG;
2154 }
2155
2156 return features;
2157 }
2158
2159 netdev_features_t netif_skb_features(struct sk_buff *skb)
2160 {
2161 __be16 protocol = skb->protocol;
2162 netdev_features_t features = skb->dev->features;
2163
2164 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2165 features &= ~NETIF_F_GSO_MASK;
2166
2167 if (protocol == htons(ETH_P_8021Q)) {
2168 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2169 protocol = veh->h_vlan_encapsulated_proto;
2170 } else if (!vlan_tx_tag_present(skb)) {
2171 return harmonize_features(skb, protocol, features);
2172 }
2173
2174 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2175
2176 if (protocol != htons(ETH_P_8021Q)) {
2177 return harmonize_features(skb, protocol, features);
2178 } else {
2179 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2180 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2181 return harmonize_features(skb, protocol, features);
2182 }
2183 }
2184 EXPORT_SYMBOL(netif_skb_features);
2185
2186 /*
2187 * Returns true if either:
2188 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2189 * 2. skb is fragmented and the device does not support SG, or if
2190 * at least one of the fragments is in highmem and the device does
2191 * not support DMA from it.
2192 */
2193 static inline int skb_needs_linearize(struct sk_buff *skb,
2194 int features)
2195 {
2196 return skb_is_nonlinear(skb) &&
2197 ((skb_has_frag_list(skb) &&
2198 !(features & NETIF_F_FRAGLIST)) ||
2199 (skb_shinfo(skb)->nr_frags &&
2200 !(features & NETIF_F_SG)));
2201 }
2202
2203 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2204 struct netdev_queue *txq)
2205 {
2206 const struct net_device_ops *ops = dev->netdev_ops;
2207 int rc = NETDEV_TX_OK;
2208 unsigned int skb_len;
2209
2210 if (likely(!skb->next)) {
2211 netdev_features_t features;
2212
2213 /*
2214 * If the device doesn't need skb->dst, release it right now while
2215 * it's hot in this cpu cache
2216 */
2217 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2218 skb_dst_drop(skb);
2219
2220 if (!list_empty(&ptype_all))
2221 dev_queue_xmit_nit(skb, dev);
2222
2223 features = netif_skb_features(skb);
2224
2225 if (vlan_tx_tag_present(skb) &&
2226 !(features & NETIF_F_HW_VLAN_TX)) {
2227 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2228 if (unlikely(!skb))
2229 goto out;
2230
2231 skb->vlan_tci = 0;
2232 }
2233
2234 if (netif_needs_gso(skb, features)) {
2235 if (unlikely(dev_gso_segment(skb, features)))
2236 goto out_kfree_skb;
2237 if (skb->next)
2238 goto gso;
2239 } else {
2240 if (skb_needs_linearize(skb, features) &&
2241 __skb_linearize(skb))
2242 goto out_kfree_skb;
2243
2244 /* If packet is not checksummed and device does not
2245 * support checksumming for this protocol, complete
2246 * checksumming here.
2247 */
2248 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2249 skb_set_transport_header(skb,
2250 skb_checksum_start_offset(skb));
2251 if (!(features & NETIF_F_ALL_CSUM) &&
2252 skb_checksum_help(skb))
2253 goto out_kfree_skb;
2254 }
2255 }
2256
2257 skb_len = skb->len;
2258 rc = ops->ndo_start_xmit(skb, dev);
2259 trace_net_dev_xmit(skb, rc, dev, skb_len);
2260 if (rc == NETDEV_TX_OK)
2261 txq_trans_update(txq);
2262 return rc;
2263 }
2264
2265 gso:
2266 do {
2267 struct sk_buff *nskb = skb->next;
2268
2269 skb->next = nskb->next;
2270 nskb->next = NULL;
2271
2272 /*
2273 * If the device doesn't need nskb->dst, release it right now while
2274 * it's hot in this cpu cache
2275 */
2276 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2277 skb_dst_drop(nskb);
2278
2279 skb_len = nskb->len;
2280 rc = ops->ndo_start_xmit(nskb, dev);
2281 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2282 if (unlikely(rc != NETDEV_TX_OK)) {
2283 if (rc & ~NETDEV_TX_MASK)
2284 goto out_kfree_gso_skb;
2285 nskb->next = skb->next;
2286 skb->next = nskb;
2287 return rc;
2288 }
2289 txq_trans_update(txq);
2290 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2291 return NETDEV_TX_BUSY;
2292 } while (skb->next);
2293
2294 out_kfree_gso_skb:
2295 if (likely(skb->next == NULL))
2296 skb->destructor = DEV_GSO_CB(skb)->destructor;
2297 out_kfree_skb:
2298 kfree_skb(skb);
2299 out:
2300 return rc;
2301 }
2302
2303 static u32 hashrnd __read_mostly;
2304
2305 /*
2306 * Returns a Tx hash based on the given packet descriptor and the number of
2307 * Tx queues to be used as a distribution range.
2308 */
2309 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2310 unsigned int num_tx_queues)
2311 {
2312 u32 hash;
2313 u16 qoffset = 0;
2314 u16 qcount = num_tx_queues;
2315
2316 if (skb_rx_queue_recorded(skb)) {
2317 hash = skb_get_rx_queue(skb);
2318 while (unlikely(hash >= num_tx_queues))
2319 hash -= num_tx_queues;
2320 return hash;
2321 }
2322
2323 if (dev->num_tc) {
2324 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2325 qoffset = dev->tc_to_txq[tc].offset;
2326 qcount = dev->tc_to_txq[tc].count;
2327 }
2328
2329 if (skb->sk && skb->sk->sk_hash)
2330 hash = skb->sk->sk_hash;
2331 else
2332 hash = (__force u16) skb->protocol;
2333 hash = jhash_1word(hash, hashrnd);
2334
2335 return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2336 }
2337 EXPORT_SYMBOL(__skb_tx_hash);
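/*
 * Editor's note (worked example, not part of the kernel source):
 * the final line of __skb_tx_hash() maps a 32-bit hash uniformly onto
 * [qoffset, qoffset + qcount) without a modulo, using a scaled
 * multiply: queue = (((u64)hash * qcount) >> 32) + qoffset.  For
 * instance, with qcount = 8 and qoffset = 0, hash 0x00000000 selects
 * queue 0, hash 0x80000000 selects queue 4, and hash 0xffffffff
 * selects queue 7.
 */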
2338
2339 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2340 {
2341 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2342 net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2343 dev->name, queue_index,
2344 dev->real_num_tx_queues);
2345 return 0;
2346 }
2347 return queue_index;
2348 }
2349
2350 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2351 {
2352 #ifdef CONFIG_XPS
2353 struct xps_dev_maps *dev_maps;
2354 struct xps_map *map;
2355 int queue_index = -1;
2356
2357 rcu_read_lock();
2358 dev_maps = rcu_dereference(dev->xps_maps);
2359 if (dev_maps) {
2360 map = rcu_dereference(
2361 dev_maps->cpu_map[raw_smp_processor_id()]);
2362 if (map) {
2363 if (map->len == 1)
2364 queue_index = map->queues[0];
2365 else {
2366 u32 hash;
2367 if (skb->sk && skb->sk->sk_hash)
2368 hash = skb->sk->sk_hash;
2369 else
2370 hash = (__force u16) skb->protocol ^
2371 skb->rxhash;
2372 hash = jhash_1word(hash, hashrnd);
2373 queue_index = map->queues[
2374 ((u64)hash * map->len) >> 32];
2375 }
2376 if (unlikely(queue_index >= dev->real_num_tx_queues))
2377 queue_index = -1;
2378 }
2379 }
2380 rcu_read_unlock();
2381
2382 return queue_index;
2383 #else
2384 return -1;
2385 #endif
2386 }
2387
2388 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2389 struct sk_buff *skb)
2390 {
2391 int queue_index;
2392 const struct net_device_ops *ops = dev->netdev_ops;
2393
2394 if (dev->real_num_tx_queues == 1)
2395 queue_index = 0;
2396 else if (ops->ndo_select_queue) {
2397 queue_index = ops->ndo_select_queue(dev, skb);
2398 queue_index = dev_cap_txqueue(dev, queue_index);
2399 } else {
2400 struct sock *sk = skb->sk;
2401 queue_index = sk_tx_queue_get(sk);
2402
2403 if (queue_index < 0 || skb->ooo_okay ||
2404 queue_index >= dev->real_num_tx_queues) {
2405 int old_index = queue_index;
2406
2407 queue_index = get_xps_queue(dev, skb);
2408 if (queue_index < 0)
2409 queue_index = skb_tx_hash(dev, skb);
2410
2411 if (queue_index != old_index && sk) {
2412 struct dst_entry *dst =
2413 rcu_dereference_check(sk->sk_dst_cache, 1);
2414
2415 if (dst && skb_dst(skb) == dst)
2416 sk_tx_queue_set(sk, queue_index);
2417 }
2418 }
2419 }
2420
2421 skb_set_queue_mapping(skb, queue_index);
2422 return netdev_get_tx_queue(dev, queue_index);
2423 }
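/*
 * Editor's note (illustrative sketch, not part of the kernel source):
 * dev_pick_tx() above defers to a driver's ndo_select_queue() callback
 * when one is provided, clamping out-of-range results through
 * dev_cap_txqueue().  A hypothetical callback that steers control
 * traffic to the last queue and hashes everything else might look
 * roughly like this; example_select_queue() is an assumption.
 *
 *	static u16 example_select_queue(struct net_device *dev,
 *					struct sk_buff *skb)
 *	{
 *		if (skb->priority == TC_PRIO_CONTROL)
 *			return dev->real_num_tx_queues - 1;
 *
 *		return skb_tx_hash(dev, skb);
 *	}
 */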
2424
2425 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2426 struct net_device *dev,
2427 struct netdev_queue *txq)
2428 {
2429 spinlock_t *root_lock = qdisc_lock(q);
2430 bool contended;
2431 int rc;
2432
2433 qdisc_skb_cb(skb)->pkt_len = skb->len;
2434 qdisc_calculate_pkt_len(skb, q);
2435 /*
2436 * Heuristic to force contended enqueues to serialize on a
2437 * separate lock before trying to get the qdisc main lock.
2438 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2439 * and dequeue packets faster.
2440 */
2441 contended = qdisc_is_running(q);
2442 if (unlikely(contended))
2443 spin_lock(&q->busylock);
2444
2445 spin_lock(root_lock);
2446 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2447 kfree_skb(skb);
2448 rc = NET_XMIT_DROP;
2449 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2450 qdisc_run_begin(q)) {
2451 /*
2452 * This is a work-conserving queue; there are no old skbs
2453 * waiting to be sent out; and the qdisc is not running -
2454 * xmit the skb directly.
2455 */
2456 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2457 skb_dst_force(skb);
2458
2459 qdisc_bstats_update(q, skb);
2460
2461 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2462 if (unlikely(contended)) {
2463 spin_unlock(&q->busylock);
2464 contended = false;
2465 }
2466 __qdisc_run(q);
2467 } else
2468 qdisc_run_end(q);
2469
2470 rc = NET_XMIT_SUCCESS;
2471 } else {
2472 skb_dst_force(skb);
2473 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2474 if (qdisc_run_begin(q)) {
2475 if (unlikely(contended)) {
2476 spin_unlock(&q->busylock);
2477 contended = false;
2478 }
2479 __qdisc_run(q);
2480 }
2481 }
2482 spin_unlock(root_lock);
2483 if (unlikely(contended))
2484 spin_unlock(&q->busylock);
2485 return rc;
2486 }
2487
2488 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2489 static void skb_update_prio(struct sk_buff *skb)
2490 {
2491 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2492
2493 if (!skb->priority && skb->sk && map) {
2494 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2495
2496 if (prioidx < map->priomap_len)
2497 skb->priority = map->priomap[prioidx];
2498 }
2499 }
2500 #else
2501 #define skb_update_prio(skb)
2502 #endif
2503
2504 static DEFINE_PER_CPU(int, xmit_recursion);
2505 #define RECURSION_LIMIT 10
2506
2507 /**
2508 * dev_loopback_xmit - loop back @skb
2509 * @skb: buffer to transmit
2510 */
2511 int dev_loopback_xmit(struct sk_buff *skb)
2512 {
2513 skb_reset_mac_header(skb);
2514 __skb_pull(skb, skb_network_offset(skb));
2515 skb->pkt_type = PACKET_LOOPBACK;
2516 skb->ip_summed = CHECKSUM_UNNECESSARY;
2517 WARN_ON(!skb_dst(skb));
2518 skb_dst_force(skb);
2519 netif_rx_ni(skb);
2520 return 0;
2521 }
2522 EXPORT_SYMBOL(dev_loopback_xmit);
2523
2524 /**
2525 * dev_queue_xmit - transmit a buffer
2526 * @skb: buffer to transmit
2527 *
2528 * Queue a buffer for transmission to a network device. The caller must
2529 * have set the device and priority and built the buffer before calling
2530 * this function. The function can be called from an interrupt.
2531 *
2532 * A negative errno code is returned on a failure. A success does not
2533 * guarantee the frame will be transmitted as it may be dropped due
2534 * to congestion or traffic shaping.
2535 *
2536 * -----------------------------------------------------------------------------------
2537 * I notice this method can also return errors from the queue disciplines,
2538 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2539 * be positive.
2540 *
2541 * Regardless of the return value, the skb is consumed, so it is currently
2542 * difficult to retry a send to this method. (You can bump the ref count
2543 * before sending to hold a reference for retry if you are careful.)
2544 *
2545 * When calling this method, interrupts MUST be enabled. This is because
2546 * the BH enable code must have IRQs enabled so that it will not deadlock.
2547 * --BLG
2548 */
2549 int dev_queue_xmit(struct sk_buff *skb)
2550 {
2551 struct net_device *dev = skb->dev;
2552 struct netdev_queue *txq;
2553 struct Qdisc *q;
2554 int rc = -ENOMEM;
2555
2556 /* Disable soft irqs for various locks below. Also
2557 * stops preemption for RCU.
2558 */
2559 rcu_read_lock_bh();
2560
2561 skb_update_prio(skb);
2562
2563 txq = dev_pick_tx(dev, skb);
2564 q = rcu_dereference_bh(txq->qdisc);
2565
2566 #ifdef CONFIG_NET_CLS_ACT
2567 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2568 #endif
2569 trace_net_dev_queue(skb);
2570 if (q->enqueue) {
2571 rc = __dev_xmit_skb(skb, q, dev, txq);
2572 goto out;
2573 }
2574
2575 /* The device has no queue. This is the common case for software devices:
2576 loopback, all sorts of tunnels...
2577 
2578 Really, it is unlikely that netif_tx_lock protection is necessary
2579 here. (f.e. loopback and IP tunnels are clean, ignoring statistics
2580 counters.)
2581 However, it is possible that they rely on the protection
2582 made by us here.
2583 
2584 Check this and take the lock; it is not prone to deadlocks.
2585 Or take the noqueue qdisc path, which is even simpler 8)
2586 */
2587 if (dev->flags & IFF_UP) {
2588 int cpu = smp_processor_id(); /* ok because BHs are off */
2589
2590 if (txq->xmit_lock_owner != cpu) {
2591
2592 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2593 goto recursion_alert;
2594
2595 HARD_TX_LOCK(dev, txq, cpu);
2596
2597 if (!netif_xmit_stopped(txq)) {
2598 __this_cpu_inc(xmit_recursion);
2599 rc = dev_hard_start_xmit(skb, dev, txq);
2600 __this_cpu_dec(xmit_recursion);
2601 if (dev_xmit_complete(rc)) {
2602 HARD_TX_UNLOCK(dev, txq);
2603 goto out;
2604 }
2605 }
2606 HARD_TX_UNLOCK(dev, txq);
2607 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2608 dev->name);
2609 } else {
2610 /* Recursion is detected! It is possible,
2611 * unfortunately
2612 */
2613 recursion_alert:
2614 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2615 dev->name);
2616 }
2617 }
2618
2619 rc = -ENETDOWN;
2620 rcu_read_unlock_bh();
2621
2622 kfree_skb(skb);
2623 return rc;
2624 out:
2625 rcu_read_unlock_bh();
2626 return rc;
2627 }
2628 EXPORT_SYMBOL(dev_queue_xmit);
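/*
 * Editor's note (illustrative sketch, not part of the kernel source):
 * a caller of dev_queue_xmit() must have built the complete frame,
 * including the link-layer header, and set skb->dev beforehand, as the
 * comment above requires.  A minimal sender might look roughly like
 * this; example_send_frame() and the assumed IPv4 payload type are
 * assumptions.
 *
 *	static int example_send_frame(struct net_device *dev,
 *				      const void *frame, unsigned int len)
 *	{
 *		struct sk_buff *skb;
 *
 *		skb = alloc_skb(len + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
 *		if (!skb)
 *			return -ENOMEM;
 *
 *		skb_reserve(skb, LL_RESERVED_SPACE(dev));
 *		memcpy(skb_put(skb, len), frame, len);
 *		skb_reset_mac_header(skb);
 *		skb->dev = dev;
 *		skb->protocol = htons(ETH_P_IP);	// assumed payload type
 *
 *		// may also return positive NET_XMIT_* codes, as noted above
 *		return dev_queue_xmit(skb);
 *	}
 */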
2629
2630
2631 /*=======================================================================
2632 Receiver routines
2633 =======================================================================*/
2634
2635 int netdev_max_backlog __read_mostly = 1000;
2636 int netdev_tstamp_prequeue __read_mostly = 1;
2637 int netdev_budget __read_mostly = 300;
2638 int weight_p __read_mostly = 64; /* old backlog weight */
2639
2640 /* Called with irq disabled */
2641 static inline void ____napi_schedule(struct softnet_data *sd,
2642 struct napi_struct *napi)
2643 {
2644 list_add_tail(&napi->poll_list, &sd->poll_list);
2645 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2646 }
2647
2648 /*
2649 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2650 * and src/dst port numbers. Sets rxhash in skb to a non-zero hash value
2651 * on success; zero indicates no valid hash. Also sets l4_rxhash in skb
2652 * if the hash is a canonical 4-tuple hash over transport ports.
2653 */
2654 void __skb_get_rxhash(struct sk_buff *skb)
2655 {
2656 struct flow_keys keys;
2657 u32 hash;
2658
2659 if (!skb_flow_dissect(skb, &keys))
2660 return;
2661
2662 if (keys.ports) {
2663 if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
2664 swap(keys.port16[0], keys.port16[1]);
2665 skb->l4_rxhash = 1;
2666 }
2667
2668 /* get a consistent hash (same value on both flow directions) */
2669 if ((__force u32)keys.dst < (__force u32)keys.src)
2670 swap(keys.dst, keys.src);
2671
2672 hash = jhash_3words((__force u32)keys.dst,
2673 (__force u32)keys.src,
2674 (__force u32)keys.ports, hashrnd);
2675 if (!hash)
2676 hash = 1;
2677
2678 skb->rxhash = hash;
2679 }
2680 EXPORT_SYMBOL(__skb_get_rxhash);
2681
2682 #ifdef CONFIG_RPS
2683
2684 /* One global table that all flow-based protocols share. */
2685 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2686 EXPORT_SYMBOL(rps_sock_flow_table);
2687
2688 struct static_key rps_needed __read_mostly;
2689
2690 static struct rps_dev_flow *
2691 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2692 struct rps_dev_flow *rflow, u16 next_cpu)
2693 {
2694 if (next_cpu != RPS_NO_CPU) {
2695 #ifdef CONFIG_RFS_ACCEL
2696 struct netdev_rx_queue *rxqueue;
2697 struct rps_dev_flow_table *flow_table;
2698 struct rps_dev_flow *old_rflow;
2699 u32 flow_id;
2700 u16 rxq_index;
2701 int rc;
2702
2703 /* Should we steer this flow to a different hardware queue? */
2704 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2705 !(dev->features & NETIF_F_NTUPLE))
2706 goto out;
2707 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2708 if (rxq_index == skb_get_rx_queue(skb))
2709 goto out;
2710
2711 rxqueue = dev->_rx + rxq_index;
2712 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2713 if (!flow_table)
2714 goto out;
2715 flow_id = skb->rxhash & flow_table->mask;
2716 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2717 rxq_index, flow_id);
2718 if (rc < 0)
2719 goto out;
2720 old_rflow = rflow;
2721 rflow = &flow_table->flows[flow_id];
2722 rflow->filter = rc;
2723 if (old_rflow->filter == rflow->filter)
2724 old_rflow->filter = RPS_NO_FILTER;
2725 out:
2726 #endif
2727 rflow->last_qtail =
2728 per_cpu(softnet_data, next_cpu).input_queue_head;
2729 }
2730
2731 rflow->cpu = next_cpu;
2732 return rflow;
2733 }
2734
2735 /*
2736 * get_rps_cpu is called from netif_receive_skb and returns the target
2737 * CPU from the RPS map of the receiving queue for a given skb.
2738 * rcu_read_lock must be held on entry.
2739 */
2740 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2741 struct rps_dev_flow **rflowp)
2742 {
2743 struct netdev_rx_queue *rxqueue;
2744 struct rps_map *map;
2745 struct rps_dev_flow_table *flow_table;
2746 struct rps_sock_flow_table *sock_flow_table;
2747 int cpu = -1;
2748 u16 tcpu;
2749
2750 if (skb_rx_queue_recorded(skb)) {
2751 u16 index = skb_get_rx_queue(skb);
2752 if (unlikely(index >= dev->real_num_rx_queues)) {
2753 WARN_ONCE(dev->real_num_rx_queues > 1,
2754 "%s received packet on queue %u, but number "
2755 "of RX queues is %u\n",
2756 dev->name, index, dev->real_num_rx_queues);
2757 goto done;
2758 }
2759 rxqueue = dev->_rx + index;
2760 } else
2761 rxqueue = dev->_rx;
2762
2763 map = rcu_dereference(rxqueue->rps_map);
2764 if (map) {
2765 if (map->len == 1 &&
2766 !rcu_access_pointer(rxqueue->rps_flow_table)) {
2767 tcpu = map->cpus[0];
2768 if (cpu_online(tcpu))
2769 cpu = tcpu;
2770 goto done;
2771 }
2772 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2773 goto done;
2774 }
2775
2776 skb_reset_network_header(skb);
2777 if (!skb_get_rxhash(skb))
2778 goto done;
2779
2780 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2781 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2782 if (flow_table && sock_flow_table) {
2783 u16 next_cpu;
2784 struct rps_dev_flow *rflow;
2785
2786 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2787 tcpu = rflow->cpu;
2788
2789 next_cpu = sock_flow_table->ents[skb->rxhash &
2790 sock_flow_table->mask];
2791
2792 /*
2793 * If the desired CPU (where last recvmsg was done) is
2794 * different from current CPU (one in the rx-queue flow
2795 * table entry), switch if one of the following holds:
2796 * - Current CPU is unset (equal to RPS_NO_CPU).
2797 * - Current CPU is offline.
2798 * - The current CPU's queue tail has advanced beyond the
2799 * last packet that was enqueued using this table entry.
2800 * This guarantees that all previous packets for the flow
2801 * have been dequeued, thus preserving in-order delivery.
2802 */
2803 if (unlikely(tcpu != next_cpu) &&
2804 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2805 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2806 rflow->last_qtail)) >= 0))
2807 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2808
2809 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2810 *rflowp = rflow;
2811 cpu = tcpu;
2812 goto done;
2813 }
2814 }
2815
2816 if (map) {
2817 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2818
2819 if (cpu_online(tcpu)) {
2820 cpu = tcpu;
2821 goto done;
2822 }
2823 }
2824
2825 done:
2826 return cpu;
2827 }
2828
2829 #ifdef CONFIG_RFS_ACCEL
2830
2831 /**
2832 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2833 * @dev: Device on which the filter was set
2834 * @rxq_index: RX queue index
2835 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2836 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2837 *
2838 * Drivers that implement ndo_rx_flow_steer() should periodically call
2839 * this function for each installed filter and remove the filters for
2840 * which it returns %true.
2841 */
2842 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2843 u32 flow_id, u16 filter_id)
2844 {
2845 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2846 struct rps_dev_flow_table *flow_table;
2847 struct rps_dev_flow *rflow;
2848 bool expire = true;
2849 int cpu;
2850
2851 rcu_read_lock();
2852 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2853 if (flow_table && flow_id <= flow_table->mask) {
2854 rflow = &flow_table->flows[flow_id];
2855 cpu = ACCESS_ONCE(rflow->cpu);
2856 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2857 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2858 rflow->last_qtail) <
2859 (int)(10 * flow_table->mask)))
2860 expire = false;
2861 }
2862 rcu_read_unlock();
2863 return expire;
2864 }
2865 EXPORT_SYMBOL(rps_may_expire_flow);
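/*
 * Editor's note (illustrative sketch, not part of the kernel source):
 * a driver implementing ndo_rx_flow_steer() is expected to scan its
 * installed filters periodically and tear down the ones for which
 * rps_may_expire_flow() returns true.  EXAMPLE_NR_FILTERS, struct
 * example_filter, example_filters[] and example_hw_remove_filter()
 * are assumptions.
 *
 *	static void example_expire_rfs_filters(struct net_device *dev,
 *					       u16 rxq_index)
 *	{
 *		u32 flow_id;
 *
 *		for (flow_id = 0; flow_id < EXAMPLE_NR_FILTERS; flow_id++) {
 *			struct example_filter *f = &example_filters[flow_id];
 *
 *			if (!f->installed)
 *				continue;
 *			if (rps_may_expire_flow(dev, rxq_index, flow_id,
 *						f->filter_id)) {
 *				example_hw_remove_filter(dev, f);
 *				f->installed = false;
 *			}
 *		}
 *	}
 */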
2866
2867 #endif /* CONFIG_RFS_ACCEL */
2868
2869 /* Called from hardirq (IPI) context */
2870 static void rps_trigger_softirq(void *data)
2871 {
2872 struct softnet_data *sd = data;
2873
2874 ____napi_schedule(sd, &sd->backlog);
2875 sd->received_rps++;
2876 }
2877
2878 #endif /* CONFIG_RPS */
2879
2880 /*
2881 * Check if this softnet_data structure is another cpu's.
2882 * If yes, queue it to our IPI list and return 1;
2883 * if no, return 0.
2884 */
2885 static int rps_ipi_queued(struct softnet_data *sd)
2886 {
2887 #ifdef CONFIG_RPS
2888 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2889
2890 if (sd != mysd) {
2891 sd->rps_ipi_next = mysd->rps_ipi_list;
2892 mysd->rps_ipi_list = sd;
2893
2894 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2895 return 1;
2896 }
2897 #endif /* CONFIG_RPS */
2898 return 0;
2899 }
2900
2901 /*
2902 * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
2903 * queue (may be a remote CPU queue).
2904 */
2905 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2906 unsigned int *qtail)
2907 {
2908 struct softnet_data *sd;
2909 unsigned long flags;
2910
2911 sd = &per_cpu(softnet_data, cpu);
2912
2913 local_irq_save(flags);
2914
2915 rps_lock(sd);
2916 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2917 if (skb_queue_len(&sd->input_pkt_queue)) {
2918 enqueue:
2919 __skb_queue_tail(&sd->input_pkt_queue, skb);
2920 input_queue_tail_incr_save(sd, qtail);
2921 rps_unlock(sd);
2922 local_irq_restore(flags);
2923 return NET_RX_SUCCESS;
2924 }
2925
2926 /* Schedule NAPI for the backlog device.
2927 * We can use a non-atomic operation since we own the queue lock.
2928 */
2929 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2930 if (!rps_ipi_queued(sd))
2931 ____napi_schedule(sd, &sd->backlog);
2932 }
2933 goto enqueue;
2934 }
2935
2936 sd->dropped++;
2937 rps_unlock(sd);
2938
2939 local_irq_restore(flags);
2940
2941 atomic_long_inc(&skb->dev->rx_dropped);
2942 kfree_skb(skb);
2943 return NET_RX_DROP;
2944 }
2945
2946 /**
2947 * netif_rx - post buffer to the network code
2948 * @skb: buffer to post
2949 *
2950 * This function receives a packet from a device driver and queues it for
2951 * the upper (protocol) levels to process. It always succeeds. The buffer
2952 * may be dropped during processing for congestion control or by the
2953 * protocol layers.
2954 *
2955 * return values:
2956 * NET_RX_SUCCESS (no congestion)
2957 * NET_RX_DROP (packet was dropped)
2958 *
2959 */
2960
2961 int netif_rx(struct sk_buff *skb)
2962 {
2963 int ret;
2964
2965 /* if netpoll wants it, pretend we never saw it */
2966 if (netpoll_rx(skb))
2967 return NET_RX_DROP;
2968
2969 net_timestamp_check(netdev_tstamp_prequeue, skb);
2970
2971 trace_netif_rx(skb);
2972 #ifdef CONFIG_RPS
2973 if (static_key_false(&rps_needed)) {
2974 struct rps_dev_flow voidflow, *rflow = &voidflow;
2975 int cpu;
2976
2977 preempt_disable();
2978 rcu_read_lock();
2979
2980 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2981 if (cpu < 0)
2982 cpu = smp_processor_id();
2983
2984 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2985
2986 rcu_read_unlock();
2987 preempt_enable();
2988 } else
2989 #endif
2990 {
2991 unsigned int qtail;
2992 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2993 put_cpu();
2994 }
2995 return ret;
2996 }
2997 EXPORT_SYMBOL(netif_rx);
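/*
 * Editor's note (illustrative sketch, not part of the kernel source):
 * netif_rx() is the entry point for legacy, non-NAPI receive paths
 * (NAPI drivers instead call napi_gro_receive()/netif_receive_skb()
 * from their poll routine).  A minimal receive handler might look
 * roughly like this; example_rx() is an assumption.
 *
 *	static void example_rx(struct net_device *dev,
 *			       const void *data, unsigned int len)
 *	{
 *		struct sk_buff *skb;
 *
 *		skb = netdev_alloc_skb_ip_align(dev, len);
 *		if (!skb) {
 *			dev->stats.rx_dropped++;
 *			return;
 *		}
 *		memcpy(skb_put(skb, len), data, len);
 *		skb->protocol = eth_type_trans(skb, dev);
 *		netif_rx(skb);		// always "succeeds"; may still drop
 *	}
 */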
2998
2999 int netif_rx_ni(struct sk_buff *skb)
3000 {
3001 int err;
3002
3003 preempt_disable();
3004 err = netif_rx(skb);
3005 if (local_softirq_pending())
3006 do_softirq();
3007 preempt_enable();
3008
3009 return err;
3010 }
3011 EXPORT_SYMBOL(netif_rx_ni);
3012
3013 static void net_tx_action(struct softirq_action *h)
3014 {
3015 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3016
3017 if (sd->completion_queue) {
3018 struct sk_buff *clist;
3019
3020 local_irq_disable();
3021 clist = sd->completion_queue;
3022 sd->completion_queue = NULL;
3023 local_irq_enable();
3024
3025 while (clist) {
3026 struct sk_buff *skb = clist;
3027 clist = clist->next;
3028
3029 WARN_ON(atomic_read(&skb->users));
3030 trace_kfree_skb(skb, net_tx_action);
3031 __kfree_skb(skb);
3032 }
3033 }
3034
3035 if (sd->output_queue) {
3036 struct Qdisc *head;
3037
3038 local_irq_disable();
3039 head = sd->output_queue;
3040 sd->output_queue = NULL;
3041 sd->output_queue_tailp = &sd->output_queue;
3042 local_irq_enable();
3043
3044 while (head) {
3045 struct Qdisc *q = head;
3046 spinlock_t *root_lock;
3047
3048 head = head->next_sched;
3049
3050 root_lock = qdisc_lock(q);
3051 if (spin_trylock(root_lock)) {
3052 smp_mb__before_clear_bit();
3053 clear_bit(__QDISC_STATE_SCHED,
3054 &q->state);
3055 qdisc_run(q);
3056 spin_unlock(root_lock);
3057 } else {
3058 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3059 &q->state)) {
3060 __netif_reschedule(q);
3061 } else {
3062 smp_mb__before_clear_bit();
3063 clear_bit(__QDISC_STATE_SCHED,
3064 &q->state);
3065 }
3066 }
3067 }
3068 }
3069 }
3070
3071 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3072 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3073 /* This hook is defined here for ATM LANE */
3074 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3075 unsigned char *addr) __read_mostly;
3076 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3077 #endif
3078
3079 #ifdef CONFIG_NET_CLS_ACT
3080 /* TODO: Maybe we should just force sch_ingress to be compiled in
3081 * when CONFIG_NET_CLS_ACT is? Otherwise we pay a few useless
3082 * instructions right now (a compare and two extra stores) when
3083 * CONFIG_NET_CLS_ACT is on but the ingress scheduler is not.
3084 * NOTE: This doesn't stop any functionality; if you don't have
3085 * the ingress scheduler, you just can't add policies on ingress.
3086 *
3087 */
3088 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3089 {
3090 struct net_device *dev = skb->dev;
3091 u32 ttl = G_TC_RTTL(skb->tc_verd);
3092 int result = TC_ACT_OK;
3093 struct Qdisc *q;
3094
3095 if (unlikely(MAX_RED_LOOP < ttl++)) {
3096 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3097 skb->skb_iif, dev->ifindex);
3098 return TC_ACT_SHOT;
3099 }
3100
3101 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3102 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3103
3104 q = rxq->qdisc;
3105 if (q != &noop_qdisc) {
3106 spin_lock(qdisc_lock(q));
3107 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3108 result = qdisc_enqueue_root(skb, q);
3109 spin_unlock(qdisc_lock(q));
3110 }
3111
3112 return result;
3113 }
3114
3115 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3116 struct packet_type **pt_prev,
3117 int *ret, struct net_device *orig_dev)
3118 {
3119 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3120
3121 if (!rxq || rxq->qdisc == &noop_qdisc)
3122 goto out;
3123
3124 if (*pt_prev) {
3125 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3126 *pt_prev = NULL;
3127 }
3128
3129 switch (ing_filter(skb, rxq)) {
3130 case TC_ACT_SHOT:
3131 case TC_ACT_STOLEN:
3132 kfree_skb(skb);
3133 return NULL;
3134 }
3135
3136 out:
3137 skb->tc_verd = 0;
3138 return skb;
3139 }
3140 #endif
3141
3142 /**
3143 * netdev_rx_handler_register - register receive handler
3144 * @dev: device to register a handler for
3145 * @rx_handler: receive handler to register
3146 * @rx_handler_data: data pointer that is used by rx handler
3147 *
3148 * Register a receive handler for a device. This handler will then be
3149 * called from __netif_receive_skb. A negative errno code is returned
3150 * on a failure.
3151 *
3152 * The caller must hold the rtnl_mutex.
3153 *
3154 * For a general description of rx_handler, see enum rx_handler_result.
3155 */
3156 int netdev_rx_handler_register(struct net_device *dev,
3157 rx_handler_func_t *rx_handler,
3158 void *rx_handler_data)
3159 {
3160 ASSERT_RTNL();
3161
3162 if (dev->rx_handler)
3163 return -EBUSY;
3164
3165 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3166 rcu_assign_pointer(dev->rx_handler, rx_handler);
3167
3168 return 0;
3169 }
3170 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3171
3172 /**
3173 * netdev_rx_handler_unregister - unregister receive handler
3174 * @dev: device to unregister a handler from
3175 *
3176 * Unregister a receive handler from a device.
3177 *
3178 * The caller must hold the rtnl_mutex.
3179 */
3180 void netdev_rx_handler_unregister(struct net_device *dev)
3181 {
3182
3183 ASSERT_RTNL();
3184 RCU_INIT_POINTER(dev->rx_handler, NULL);
3185 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3186 }
3187 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
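/*
 * Editor's note (illustrative sketch, not part of the kernel source):
 * an rx_handler lets a virtual device (bridge, bond, team, ...) claim
 * frames arriving on one of its ports.  The sketch below shows the
 * shape of such a handler and its registration; struct example_port,
 * example_port_wants() and example_port_enqueue() are assumptions.
 *
 *	static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *		struct example_port *port =
 *			rcu_dereference(skb->dev->rx_handler_data);
 *
 *		if (!example_port_wants(port, skb))
 *			return RX_HANDLER_PASS;	// continue normal delivery
 *
 *		example_port_enqueue(port, skb);
 *		return RX_HANDLER_CONSUMED;	// skb now owned by the port
 *	}
 *
 *	// registration and teardown, both under rtnl_lock():
 *	//	err = netdev_rx_handler_register(dev, example_handle_frame,
 *	//					 port);
 *	//	netdev_rx_handler_unregister(dev);
 */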
3188
3189 /*
3190 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3191 * the special handling of PFMEMALLOC skbs.
3192 */
3193 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3194 {
3195 switch (skb->protocol) {
3196 case __constant_htons(ETH_P_ARP):
3197 case __constant_htons(ETH_P_IP):
3198 case __constant_htons(ETH_P_IPV6):
3199 case __constant_htons(ETH_P_8021Q):
3200 return true;
3201 default:
3202 return false;
3203 }
3204 }
3205
3206 static int __netif_receive_skb(struct sk_buff *skb)
3207 {
3208 struct packet_type *ptype, *pt_prev;
3209 rx_handler_func_t *rx_handler;
3210 struct net_device *orig_dev;
3211 struct net_device *null_or_dev;
3212 bool deliver_exact = false;
3213 int ret = NET_RX_DROP;
3214 __be16 type;
3215 unsigned long pflags = current->flags;
3216
3217 net_timestamp_check(!netdev_tstamp_prequeue, skb);
3218
3219 trace_netif_receive_skb(skb);
3220
3221 /*
3222 * PFMEMALLOC skbs are special, they should
3223 * - be delivered to SOCK_MEMALLOC sockets only
3224 * - stay away from userspace
3225 * - have bounded memory usage
3226 *
3227 * Use PF_MEMALLOC as this saves us from propagating the allocation
3228 * context down to all allocation sites.
3229 */
3230 if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3231 current->flags |= PF_MEMALLOC;
3232
3233 /* if we've gotten here through NAPI, check netpoll */
3234 if (netpoll_receive_skb(skb))
3235 goto out;
3236
3237 orig_dev = skb->dev;
3238
3239 skb_reset_network_header(skb);
3240 skb_reset_transport_header(skb);
3241 skb_reset_mac_len(skb);
3242
3243 pt_prev = NULL;
3244
3245 rcu_read_lock();
3246
3247 another_round:
3248 skb->skb_iif = skb->dev->ifindex;
3249
3250 __this_cpu_inc(softnet_data.processed);
3251
3252 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3253 skb = vlan_untag(skb);
3254 if (unlikely(!skb))
3255 goto unlock;
3256 }
3257
3258 #ifdef CONFIG_NET_CLS_ACT
3259 if (skb->tc_verd & TC_NCLS) {
3260 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3261 goto ncls;
3262 }
3263 #endif
3264
3265 if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3266 goto skip_taps;
3267
3268 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3269 if (!ptype->dev || ptype->dev == skb->dev) {
3270 if (pt_prev)
3271 ret = deliver_skb(skb, pt_prev, orig_dev);
3272 pt_prev = ptype;
3273 }
3274 }
3275
3276 skip_taps:
3277 #ifdef CONFIG_NET_CLS_ACT
3278 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3279 if (!skb)
3280 goto unlock;
3281 ncls:
3282 #endif
3283
3284 if (sk_memalloc_socks() && skb_pfmemalloc(skb)
3285 && !skb_pfmemalloc_protocol(skb))
3286 goto drop;
3287
3288 rx_handler = rcu_dereference(skb->dev->rx_handler);
3289 if (vlan_tx_tag_present(skb)) {
3290 if (pt_prev) {
3291 ret = deliver_skb(skb, pt_prev, orig_dev);
3292 pt_prev = NULL;
3293 }
3294 if (vlan_do_receive(&skb, !rx_handler))
3295 goto another_round;
3296 else if (unlikely(!skb))
3297 goto unlock;
3298 }
3299
3300 if (rx_handler) {
3301 if (pt_prev) {
3302 ret = deliver_skb(skb, pt_prev, orig_dev);
3303 pt_prev = NULL;
3304 }
3305 switch (rx_handler(&skb)) {
3306 case RX_HANDLER_CONSUMED:
3307 goto unlock;
3308 case RX_HANDLER_ANOTHER:
3309 goto another_round;
3310 case RX_HANDLER_EXACT:
3311 deliver_exact = true;
3312 case RX_HANDLER_PASS:
3313 break;
3314 default:
3315 BUG();
3316 }
3317 }
3318
3319 /* deliver only exact match when indicated */
3320 null_or_dev = deliver_exact ? skb->dev : NULL;
3321
3322 type = skb->protocol;
3323 list_for_each_entry_rcu(ptype,
3324 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3325 if (ptype->type == type &&
3326 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3327 ptype->dev == orig_dev)) {
3328 if (pt_prev)
3329 ret = deliver_skb(skb, pt_prev, orig_dev);
3330 pt_prev = ptype;
3331 }
3332 }
3333
3334 if (pt_prev) {
3335 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3336 ret = -ENOMEM;
3337 else
3338 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3339 } else {
3340 drop:
3341 atomic_long_inc(&skb->dev->rx_dropped);
3342 kfree_skb(skb);
3343 /* Jamal, now you will not be able to escape explaining
3344 * to me how you were going to use this. :-)
3345 */
3346 ret = NET_RX_DROP;
3347 }
3348
3349 unlock:
3350 rcu_read_unlock();
3351 out:
3352 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3353 return ret;
3354 }
3355
3356 /**
3357 * netif_receive_skb - process receive buffer from network
3358 * @skb: buffer to process
3359 *
3360 * netif_receive_skb() is the main receive data processing function.
3361 * It always succeeds. The buffer may be dropped during processing
3362 * for congestion control or by the protocol layers.
3363 *
3364 * This function may only be called from softirq context and interrupts
3365 * should be enabled.
3366 *
3367 * Return values (usually ignored):
3368 * NET_RX_SUCCESS: no congestion
3369 * NET_RX_DROP: packet was dropped
3370 */
3371 int netif_receive_skb(struct sk_buff *skb)
3372 {
3373 net_timestamp_check(netdev_tstamp_prequeue, skb);
3374
3375 if (skb_defer_rx_timestamp(skb))
3376 return NET_RX_SUCCESS;
3377
3378 #ifdef CONFIG_RPS
3379 if (static_key_false(&rps_needed)) {
3380 struct rps_dev_flow voidflow, *rflow = &voidflow;
3381 int cpu, ret;
3382
3383 rcu_read_lock();
3384
3385 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3386
3387 if (cpu >= 0) {
3388 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3389 rcu_read_unlock();
3390 return ret;
3391 }
3392 rcu_read_unlock();
3393 }
3394 #endif
3395 return __netif_receive_skb(skb);
3396 }
3397 EXPORT_SYMBOL(netif_receive_skb);
3398
3399 /* Network device is going away; flush any packets still pending.
3400 * Called with irqs disabled.
3401 */
3402 static void flush_backlog(void *arg)
3403 {
3404 struct net_device *dev = arg;
3405 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3406 struct sk_buff *skb, *tmp;
3407
3408 rps_lock(sd);
3409 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3410 if (skb->dev == dev) {
3411 __skb_unlink(skb, &sd->input_pkt_queue);
3412 kfree_skb(skb);
3413 input_queue_head_incr(sd);
3414 }
3415 }
3416 rps_unlock(sd);
3417
3418 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3419 if (skb->dev == dev) {
3420 __skb_unlink(skb, &sd->process_queue);
3421 kfree_skb(skb);
3422 input_queue_head_incr(sd);
3423 }
3424 }
3425 }
3426
3427 static int napi_gro_complete(struct sk_buff *skb)
3428 {
3429 struct packet_type *ptype;
3430 __be16 type = skb->protocol;
3431 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3432 int err = -ENOENT;
3433
3434 if (NAPI_GRO_CB(skb)->count == 1) {
3435 skb_shinfo(skb)->gso_size = 0;
3436 goto out;
3437 }
3438
3439 rcu_read_lock();
3440 list_for_each_entry_rcu(ptype, head, list) {
3441 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3442 continue;
3443
3444 err = ptype->gro_complete(skb);
3445 break;
3446 }
3447 rcu_read_unlock();
3448
3449 if (err) {
3450 WARN_ON(&ptype->list == head);
3451 kfree_skb(skb);
3452 return NET_RX_SUCCESS;
3453 }
3454
3455 out:
3456 return netif_receive_skb(skb);
3457 }
3458
3459 inline void napi_gro_flush(struct napi_struct *napi)
3460 {
3461 struct sk_buff *skb, *next;
3462
3463 for (skb = napi->gro_list; skb; skb = next) {
3464 next = skb->next;
3465 skb->next = NULL;
3466 napi_gro_complete(skb);
3467 }
3468
3469 napi->gro_count = 0;
3470 napi->gro_list = NULL;
3471 }
3472 EXPORT_SYMBOL(napi_gro_flush);
3473
3474 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3475 {
3476 struct sk_buff **pp = NULL;
3477 struct packet_type *ptype;
3478 __be16 type = skb->protocol;
3479 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3480 int same_flow;
3481 int mac_len;
3482 enum gro_result ret;
3483
3484 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3485 goto normal;
3486
3487 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3488 goto normal;
3489
3490 rcu_read_lock();
3491 list_for_each_entry_rcu(ptype, head, list) {
3492 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3493 continue;
3494
3495 skb_set_network_header(skb, skb_gro_offset(skb));
3496 mac_len = skb->network_header - skb->mac_header;
3497 skb->mac_len = mac_len;
3498 NAPI_GRO_CB(skb)->same_flow = 0;
3499 NAPI_GRO_CB(skb)->flush = 0;
3500 NAPI_GRO_CB(skb)->free = 0;
3501
3502 pp = ptype->gro_receive(&napi->gro_list, skb);
3503 break;
3504 }
3505 rcu_read_unlock();
3506
3507 if (&ptype->list == head)
3508 goto normal;
3509
3510 same_flow = NAPI_GRO_CB(skb)->same_flow;
3511 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3512
3513 if (pp) {
3514 struct sk_buff *nskb = *pp;
3515
3516 *pp = nskb->next;
3517 nskb->next = NULL;
3518 napi_gro_complete(nskb);
3519 napi->gro_count--;
3520 }
3521
3522 if (same_flow)
3523 goto ok;
3524
3525 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3526 goto normal;
3527
3528 napi->gro_count++;
3529 NAPI_GRO_CB(skb)->count = 1;
3530 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3531 skb->next = napi->gro_list;
3532 napi->gro_list = skb;
3533 ret = GRO_HELD;
3534
3535 pull:
3536 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3537 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3538
3539 BUG_ON(skb->end - skb->tail < grow);
3540
3541 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3542
3543 skb->tail += grow;
3544 skb->data_len -= grow;
3545
3546 skb_shinfo(skb)->frags[0].page_offset += grow;
3547 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3548
3549 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3550 skb_frag_unref(skb, 0);
3551 memmove(skb_shinfo(skb)->frags,
3552 skb_shinfo(skb)->frags + 1,
3553 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3554 }
3555 }
3556
3557 ok:
3558 return ret;
3559
3560 normal:
3561 ret = GRO_NORMAL;
3562 goto pull;
3563 }
3564 EXPORT_SYMBOL(dev_gro_receive);
3565
3566 static inline gro_result_t
3567 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3568 {
3569 struct sk_buff *p;
3570 unsigned int maclen = skb->dev->hard_header_len;
3571
3572 for (p = napi->gro_list; p; p = p->next) {
3573 unsigned long diffs;
3574
3575 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3576 diffs |= p->vlan_tci ^ skb->vlan_tci;
3577 if (maclen == ETH_HLEN)
3578 diffs |= compare_ether_header(skb_mac_header(p),
3579 skb_gro_mac_header(skb));
3580 else if (!diffs)
3581 diffs = memcmp(skb_mac_header(p),
3582 skb_gro_mac_header(skb),
3583 maclen);
3584 NAPI_GRO_CB(p)->same_flow = !diffs;
3585 NAPI_GRO_CB(p)->flush = 0;
3586 }
3587
3588 return dev_gro_receive(napi, skb);
3589 }
3590
3591 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3592 {
3593 switch (ret) {
3594 case GRO_NORMAL:
3595 if (netif_receive_skb(skb))
3596 ret = GRO_DROP;
3597 break;
3598
3599 case GRO_DROP:
3600 kfree_skb(skb);
3601 break;
3602
3603 case GRO_MERGED_FREE:
3604 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3605 kmem_cache_free(skbuff_head_cache, skb);
3606 else
3607 __kfree_skb(skb);
3608 break;
3609
3610 case GRO_HELD:
3611 case GRO_MERGED:
3612 break;
3613 }
3614
3615 return ret;
3616 }
3617 EXPORT_SYMBOL(napi_skb_finish);
3618
3619 void skb_gro_reset_offset(struct sk_buff *skb)
3620 {
3621 NAPI_GRO_CB(skb)->data_offset = 0;
3622 NAPI_GRO_CB(skb)->frag0 = NULL;
3623 NAPI_GRO_CB(skb)->frag0_len = 0;
3624
3625 if (skb->mac_header == skb->tail &&
3626 !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3627 NAPI_GRO_CB(skb)->frag0 =
3628 skb_frag_address(&skb_shinfo(skb)->frags[0]);
3629 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3630 }
3631 }
3632 EXPORT_SYMBOL(skb_gro_reset_offset);
3633
3634 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3635 {
3636 skb_gro_reset_offset(skb);
3637
3638 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3639 }
3640 EXPORT_SYMBOL(napi_gro_receive);
3641
3642 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3643 {
3644 __skb_pull(skb, skb_headlen(skb));
3645 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3646 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3647 skb->vlan_tci = 0;
3648 skb->dev = napi->dev;
3649 skb->skb_iif = 0;
3650
3651 napi->skb = skb;
3652 }
3653
3654 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3655 {
3656 struct sk_buff *skb = napi->skb;
3657
3658 if (!skb) {
3659 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3660 if (skb)
3661 napi->skb = skb;
3662 }
3663 return skb;
3664 }
3665 EXPORT_SYMBOL(napi_get_frags);
3666
3667 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3668 gro_result_t ret)
3669 {
3670 switch (ret) {
3671 case GRO_NORMAL:
3672 case GRO_HELD:
3673 skb->protocol = eth_type_trans(skb, skb->dev);
3674
3675 if (ret == GRO_HELD)
3676 skb_gro_pull(skb, -ETH_HLEN);
3677 else if (netif_receive_skb(skb))
3678 ret = GRO_DROP;
3679 break;
3680
3681 case GRO_DROP:
3682 case GRO_MERGED_FREE:
3683 napi_reuse_skb(napi, skb);
3684 break;
3685
3686 case GRO_MERGED:
3687 break;
3688 }
3689
3690 return ret;
3691 }
3692 EXPORT_SYMBOL(napi_frags_finish);
3693
3694 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3695 {
3696 struct sk_buff *skb = napi->skb;
3697 struct ethhdr *eth;
3698 unsigned int hlen;
3699 unsigned int off;
3700
3701 napi->skb = NULL;
3702
3703 skb_reset_mac_header(skb);
3704 skb_gro_reset_offset(skb);
3705
3706 off = skb_gro_offset(skb);
3707 hlen = off + sizeof(*eth);
3708 eth = skb_gro_header_fast(skb, off);
3709 if (skb_gro_header_hard(skb, hlen)) {
3710 eth = skb_gro_header_slow(skb, hlen, off);
3711 if (unlikely(!eth)) {
3712 napi_reuse_skb(napi, skb);
3713 skb = NULL;
3714 goto out;
3715 }
3716 }
3717
3718 skb_gro_pull(skb, sizeof(*eth));
3719
3720 /*
3721 * This works because the only protocols we care about don't require
3722 * special handling. We'll fix it up properly at the end.
3723 */
3724 skb->protocol = eth->h_proto;
3725
3726 out:
3727 return skb;
3728 }
3729
3730 gro_result_t napi_gro_frags(struct napi_struct *napi)
3731 {
3732 struct sk_buff *skb = napi_frags_skb(napi);
3733
3734 if (!skb)
3735 return GRO_DROP;
3736
3737 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3738 }
3739 EXPORT_SYMBOL(napi_gro_frags);
3740
3741 /*
3742 * net_rps_action sends any pending IPIs for RPS.
3743 * Note: called with local irq disabled, but exits with local irq enabled.
3744 */
3745 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3746 {
3747 #ifdef CONFIG_RPS
3748 struct softnet_data *remsd = sd->rps_ipi_list;
3749
3750 if (remsd) {
3751 sd->rps_ipi_list = NULL;
3752
3753 local_irq_enable();
3754
3755 /* Send pending IPIs to kick RPS processing on remote cpus. */
3756 while (remsd) {
3757 struct softnet_data *next = remsd->rps_ipi_next;
3758
3759 if (cpu_online(remsd->cpu))
3760 __smp_call_function_single(remsd->cpu,
3761 &remsd->csd, 0);
3762 remsd = next;
3763 }
3764 } else
3765 #endif
3766 local_irq_enable();
3767 }
3768
3769 static int process_backlog(struct napi_struct *napi, int quota)
3770 {
3771 int work = 0;
3772 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3773
3774 #ifdef CONFIG_RPS
3775 /* Check if we have pending IPIs; it's better to send them now
3776 * than to wait for net_rx_action() to end.
3777 */
3778 if (sd->rps_ipi_list) {
3779 local_irq_disable();
3780 net_rps_action_and_irq_enable(sd);
3781 }
3782 #endif
3783 napi->weight = weight_p;
3784 local_irq_disable();
3785 while (work < quota) {
3786 struct sk_buff *skb;
3787 unsigned int qlen;
3788
3789 while ((skb = __skb_dequeue(&sd->process_queue))) {
3790 local_irq_enable();
3791 __netif_receive_skb(skb);
3792 local_irq_disable();
3793 input_queue_head_incr(sd);
3794 if (++work >= quota) {
3795 local_irq_enable();
3796 return work;
3797 }
3798 }
3799
3800 rps_lock(sd);
3801 qlen = skb_queue_len(&sd->input_pkt_queue);
3802 if (qlen)
3803 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3804 &sd->process_queue);
3805
3806 if (qlen < quota - work) {
3807 /*
3808 * Inline a custom version of __napi_complete().
3809 * Only the current cpu owns and manipulates this napi,
3810 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3811 * We can use a plain write instead of clear_bit(),
3812 * and we don't need an smp_mb() memory barrier.
3813 */
3814 list_del(&napi->poll_list);
3815 napi->state = 0;
3816
3817 quota = work + qlen;
3818 }
3819 rps_unlock(sd);
3820 }
3821 local_irq_enable();
3822
3823 return work;
3824 }
3825
3826 /**
3827 * __napi_schedule - schedule for receive
3828 * @n: entry to schedule
3829 *
3830 * The entry's receive function will be scheduled to run
3831 */
3832 void __napi_schedule(struct napi_struct *n)
3833 {
3834 unsigned long flags;
3835
3836 local_irq_save(flags);
3837 ____napi_schedule(&__get_cpu_var(softnet_data), n);
3838 local_irq_restore(flags);
3839 }
3840 EXPORT_SYMBOL(__napi_schedule);
3841
3842 void __napi_complete(struct napi_struct *n)
3843 {
3844 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3845 BUG_ON(n->gro_list);
3846
3847 list_del(&n->poll_list);
3848 smp_mb__before_clear_bit();
3849 clear_bit(NAPI_STATE_SCHED, &n->state);
3850 }
3851 EXPORT_SYMBOL(__napi_complete);
3852
3853 void napi_complete(struct napi_struct *n)
3854 {
3855 unsigned long flags;
3856
3857 /*
3858 * Don't let napi dequeue from the cpu poll list,
3859 * just in case it's running on a different cpu.
3860 */
3861 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3862 return;
3863
3864 napi_gro_flush(n);
3865 local_irq_save(flags);
3866 __napi_complete(n);
3867 local_irq_restore(flags);
3868 }
3869 EXPORT_SYMBOL(napi_complete);
3870
3871 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3872 int (*poll)(struct napi_struct *, int), int weight)
3873 {
3874 INIT_LIST_HEAD(&napi->poll_list);
3875 napi->gro_count = 0;
3876 napi->gro_list = NULL;
3877 napi->skb = NULL;
3878 napi->poll = poll;
3879 napi->weight = weight;
3880 list_add(&napi->dev_list, &dev->napi_list);
3881 napi->dev = dev;
3882 #ifdef CONFIG_NETPOLL
3883 spin_lock_init(&napi->poll_lock);
3884 napi->poll_owner = -1;
3885 #endif
3886 set_bit(NAPI_STATE_SCHED, &napi->state);
3887 }
3888 EXPORT_SYMBOL(netif_napi_add);
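/*
 * Editor's note (illustrative sketch, not part of the kernel source):
 * the usual NAPI life cycle is netif_napi_add() at setup time,
 * napi_schedule() from the interrupt handler, and a poll routine that
 * calls napi_complete() once it finishes under budget.  struct
 * example_priv, example_rx_clean() and the IRQ enable/disable helpers
 * are assumptions.
 *
 *	static irqreturn_t example_isr(int irq, void *data)
 *	{
 *		struct example_priv *priv = data;
 *
 *		example_disable_rx_irq(priv);
 *		napi_schedule(&priv->napi);
 *		return IRQ_HANDLED;
 *	}
 *
 *	static int example_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct example_priv *priv =
 *			container_of(napi, struct example_priv, napi);
 *		int work;
 *
 *		// example_rx_clean() would pass frames up with
 *		// napi_gro_receive() or netif_receive_skb()
 *		work = example_rx_clean(priv, budget);
 *
 *		if (work < budget) {
 *			napi_complete(napi);
 *			example_enable_rx_irq(priv);
 *		}
 *		return work;
 *	}
 *
 *	// at setup time, typically in the probe routine:
 *	//	netif_napi_add(dev, &priv->napi, example_poll, 64);
 */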
3889
3890 void netif_napi_del(struct napi_struct *napi)
3891 {
3892 struct sk_buff *skb, *next;
3893
3894 list_del_init(&napi->dev_list);
3895 napi_free_frags(napi);
3896
3897 for (skb = napi->gro_list; skb; skb = next) {
3898 next = skb->next;
3899 skb->next = NULL;
3900 kfree_skb(skb);
3901 }
3902
3903 napi->gro_list = NULL;
3904 napi->gro_count = 0;
3905 }
3906 EXPORT_SYMBOL(netif_napi_del);
3907
3908 static void net_rx_action(struct softirq_action *h)
3909 {
3910 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3911 unsigned long time_limit = jiffies + 2;
3912 int budget = netdev_budget;
3913 void *have;
3914
3915 local_irq_disable();
3916
3917 while (!list_empty(&sd->poll_list)) {
3918 struct napi_struct *n;
3919 int work, weight;
3920
3921 /* If the softirq window is exhausted then punt.
3922 * Allow this to run for 2 jiffies, which allows
3923 * an average latency of 1.5/HZ.
3924 */
3925 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3926 goto softnet_break;
3927
3928 local_irq_enable();
3929
3930 /* Even though interrupts have been re-enabled, this
3931 * access is safe because interrupts can only add new
3932 * entries to the tail of this list, and only ->poll()
3933 * calls can remove this head entry from the list.
3934 */
3935 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3936
3937 have = netpoll_poll_lock(n);
3938
3939 weight = n->weight;
3940
3941 /* This NAPI_STATE_SCHED test is for avoiding a race
3942 * with netpoll's poll_napi(). Only the entity which
3943 * obtains the lock and sees NAPI_STATE_SCHED set will
3944 * actually make the ->poll() call. Therefore we avoid
3945 * accidentally calling ->poll() when NAPI is not scheduled.
3946 */
3947 work = 0;
3948 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3949 work = n->poll(n, weight);
3950 trace_napi_poll(n);
3951 }
3952
3953 WARN_ON_ONCE(work > weight);
3954
3955 budget -= work;
3956
3957 local_irq_disable();
3958
3959 /* Drivers must not modify the NAPI state if they
3960 * consume the entire weight. In such cases this code
3961 * still "owns" the NAPI instance and therefore can
3962 * move the instance around on the list at-will.
3963 */
3964 if (unlikely(work == weight)) {
3965 if (unlikely(napi_disable_pending(n))) {
3966 local_irq_enable();
3967 napi_complete(n);
3968 local_irq_disable();
3969 } else
3970 list_move_tail(&n->poll_list, &sd->poll_list);
3971 }
3972
3973 netpoll_poll_unlock(have);
3974 }
3975 out:
3976 net_rps_action_and_irq_enable(sd);
3977
3978 #ifdef CONFIG_NET_DMA
3979 /*
3980 * There may not be any more sk_buffs coming right now, so push
3981 * any pending DMA copies to hardware
3982 */
3983 dma_issue_pending_all();
3984 #endif
3985
3986 return;
3987
3988 softnet_break:
3989 sd->time_squeeze++;
3990 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3991 goto out;
3992 }
3993
3994 static gifconf_func_t *gifconf_list[NPROTO];
3995
3996 /**
3997 * register_gifconf - register a SIOCGIF handler
3998 * @family: Address family
3999 * @gifconf: Function handler
4000 *
4001 * Register protocol dependent address dumping routines. The handler
4002 * that is passed must not be freed or reused until it has been replaced
4003 * by another handler.
4004 */
4005 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
4006 {
4007 if (family >= NPROTO)
4008 return -EINVAL;
4009 gifconf_list[family] = gifconf;
4010 return 0;
4011 }
4012 EXPORT_SYMBOL(register_gifconf);
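/*
 * Illustrative sketch (not from the original file): an address family
 * registers its SIOCGIFCONF helper once at init time. example_gifconf and
 * the choice of PF_INET here are placeholders for illustration only.
 */
static inline int example_gifconf(struct net_device *dev, char __user *buf, int len)
{
	/* write this family's addresses for @dev into @buf, or report the
	 * space that would be needed when @buf is NULL (see dev_ifconf below)
	 */
	return 0;
}

static inline void example_gifconf_register(void)
{
	register_gifconf(PF_INET, example_gifconf);
}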
4013
4014
4015 /*
4016 * Map an interface index to its name (SIOCGIFNAME)
4017 */
4018
4019 /*
4020 * We need this ioctl for efficient implementation of the
4021 * if_indextoname() function required by the IPv6 API. Without
4022 * it, we would have to search all the interfaces to find a
4023 * match. --pb
4024 */
4025
4026 static int dev_ifname(struct net *net, struct ifreq __user *arg)
4027 {
4028 struct net_device *dev;
4029 struct ifreq ifr;
4030
4031 /*
4032 * Fetch the caller's info block.
4033 */
4034
4035 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4036 return -EFAULT;
4037
4038 rcu_read_lock();
4039 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4040 if (!dev) {
4041 rcu_read_unlock();
4042 return -ENODEV;
4043 }
4044
4045 strcpy(ifr.ifr_name, dev->name);
4046 rcu_read_unlock();
4047
4048 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4049 return -EFAULT;
4050 return 0;
4051 }
4052
4053 /*
4054 * Perform a SIOCGIFCONF call. This structure will change
4055 * size eventually, and there is nothing I can do about it.
4056 * Thus we will need a 'compatibility mode'.
4057 */
4058
4059 static int dev_ifconf(struct net *net, char __user *arg)
4060 {
4061 struct ifconf ifc;
4062 struct net_device *dev;
4063 char __user *pos;
4064 int len;
4065 int total;
4066 int i;
4067
4068 /*
4069 * Fetch the caller's info block.
4070 */
4071
4072 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4073 return -EFAULT;
4074
4075 pos = ifc.ifc_buf;
4076 len = ifc.ifc_len;
4077
4078 /*
4079 * Loop over the interfaces, and write an info block for each.
4080 */
4081
4082 total = 0;
4083 for_each_netdev(net, dev) {
4084 for (i = 0; i < NPROTO; i++) {
4085 if (gifconf_list[i]) {
4086 int done;
4087 if (!pos)
4088 done = gifconf_list[i](dev, NULL, 0);
4089 else
4090 done = gifconf_list[i](dev, pos + total,
4091 len - total);
4092 if (done < 0)
4093 return -EFAULT;
4094 total += done;
4095 }
4096 }
4097 }
4098
4099 /*
4100 * All done. Write the updated control block back to the caller.
4101 */
4102 ifc.ifc_len = total;
4103
4104 /*
4105 * Both BSD and Solaris return 0 here, so we do too.
4106 */
4107 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4108 }
4109
4110 #ifdef CONFIG_PROC_FS
4111
4112 #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4113
4114 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4115 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4116 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
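/*
 * Worked example (added for illustration): assuming the NETDEV_HASHBITS
 * value of 8 defined earlier in this file, BUCKET_SPACE is 32 - 8 - 1 = 23,
 * so the seq_file position packs the name-hash bucket in the high bits and a
 * 1-based offset within that bucket in the low 23 bits, e.g.
 * set_bucket_offset(3, 5) == (3 << 23) | 5, from which get_bucket() and
 * get_offset() recover 3 and 5 again.
 */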
4117
4118 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4119 {
4120 struct net *net = seq_file_net(seq);
4121 struct net_device *dev;
4122 struct hlist_node *p;
4123 struct hlist_head *h;
4124 unsigned int count = 0, offset = get_offset(*pos);
4125
4126 h = &net->dev_name_head[get_bucket(*pos)];
4127 hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4128 if (++count == offset)
4129 return dev;
4130 }
4131
4132 return NULL;
4133 }
4134
4135 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4136 {
4137 struct net_device *dev;
4138 unsigned int bucket;
4139
4140 do {
4141 dev = dev_from_same_bucket(seq, pos);
4142 if (dev)
4143 return dev;
4144
4145 bucket = get_bucket(*pos) + 1;
4146 *pos = set_bucket_offset(bucket, 1);
4147 } while (bucket < NETDEV_HASHENTRIES);
4148
4149 return NULL;
4150 }
4151
4152 /*
4153 * This is invoked by the /proc filesystem handler to display a device
4154 * in detail.
4155 */
4156 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4157 __acquires(RCU)
4158 {
4159 rcu_read_lock();
4160 if (!*pos)
4161 return SEQ_START_TOKEN;
4162
4163 if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4164 return NULL;
4165
4166 return dev_from_bucket(seq, pos);
4167 }
4168
4169 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4170 {
4171 ++*pos;
4172 return dev_from_bucket(seq, pos);
4173 }
4174
4175 void dev_seq_stop(struct seq_file *seq, void *v)
4176 __releases(RCU)
4177 {
4178 rcu_read_unlock();
4179 }
4180
4181 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4182 {
4183 struct rtnl_link_stats64 temp;
4184 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4185
4186 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4187 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4188 dev->name, stats->rx_bytes, stats->rx_packets,
4189 stats->rx_errors,
4190 stats->rx_dropped + stats->rx_missed_errors,
4191 stats->rx_fifo_errors,
4192 stats->rx_length_errors + stats->rx_over_errors +
4193 stats->rx_crc_errors + stats->rx_frame_errors,
4194 stats->rx_compressed, stats->multicast,
4195 stats->tx_bytes, stats->tx_packets,
4196 stats->tx_errors, stats->tx_dropped,
4197 stats->tx_fifo_errors, stats->collisions,
4198 stats->tx_carrier_errors +
4199 stats->tx_aborted_errors +
4200 stats->tx_window_errors +
4201 stats->tx_heartbeat_errors,
4202 stats->tx_compressed);
4203 }
4204
4205 /*
4206 * Called from the PROCfs module. This now uses the new arbitrary sized
4207 * /proc/net interface to create /proc/net/dev
4208 */
4209 static int dev_seq_show(struct seq_file *seq, void *v)
4210 {
4211 if (v == SEQ_START_TOKEN)
4212 seq_puts(seq, "Inter-| Receive "
4213 " | Transmit\n"
4214 " face |bytes packets errs drop fifo frame "
4215 "compressed multicast|bytes packets errs "
4216 "drop fifo colls carrier compressed\n");
4217 else
4218 dev_seq_printf_stats(seq, v);
4219 return 0;
4220 }
4221
4222 static struct softnet_data *softnet_get_online(loff_t *pos)
4223 {
4224 struct softnet_data *sd = NULL;
4225
4226 while (*pos < nr_cpu_ids)
4227 if (cpu_online(*pos)) {
4228 sd = &per_cpu(softnet_data, *pos);
4229 break;
4230 } else
4231 ++*pos;
4232 return sd;
4233 }
4234
4235 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4236 {
4237 return softnet_get_online(pos);
4238 }
4239
4240 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4241 {
4242 ++*pos;
4243 return softnet_get_online(pos);
4244 }
4245
4246 static void softnet_seq_stop(struct seq_file *seq, void *v)
4247 {
4248 }
4249
4250 static int softnet_seq_show(struct seq_file *seq, void *v)
4251 {
4252 struct softnet_data *sd = v;
4253
4254 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4255 sd->processed, sd->dropped, sd->time_squeeze, 0,
4256 0, 0, 0, 0, /* was fastroute */
4257 sd->cpu_collision, sd->received_rps);
4258 return 0;
4259 }
4260
4261 static const struct seq_operations dev_seq_ops = {
4262 .start = dev_seq_start,
4263 .next = dev_seq_next,
4264 .stop = dev_seq_stop,
4265 .show = dev_seq_show,
4266 };
4267
4268 static int dev_seq_open(struct inode *inode, struct file *file)
4269 {
4270 return seq_open_net(inode, file, &dev_seq_ops,
4271 sizeof(struct seq_net_private));
4272 }
4273
4274 static const struct file_operations dev_seq_fops = {
4275 .owner = THIS_MODULE,
4276 .open = dev_seq_open,
4277 .read = seq_read,
4278 .llseek = seq_lseek,
4279 .release = seq_release_net,
4280 };
4281
4282 static const struct seq_operations softnet_seq_ops = {
4283 .start = softnet_seq_start,
4284 .next = softnet_seq_next,
4285 .stop = softnet_seq_stop,
4286 .show = softnet_seq_show,
4287 };
4288
4289 static int softnet_seq_open(struct inode *inode, struct file *file)
4290 {
4291 return seq_open(file, &softnet_seq_ops);
4292 }
4293
4294 static const struct file_operations softnet_seq_fops = {
4295 .owner = THIS_MODULE,
4296 .open = softnet_seq_open,
4297 .read = seq_read,
4298 .llseek = seq_lseek,
4299 .release = seq_release,
4300 };
4301
4302 static void *ptype_get_idx(loff_t pos)
4303 {
4304 struct packet_type *pt = NULL;
4305 loff_t i = 0;
4306 int t;
4307
4308 list_for_each_entry_rcu(pt, &ptype_all, list) {
4309 if (i == pos)
4310 return pt;
4311 ++i;
4312 }
4313
4314 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4315 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4316 if (i == pos)
4317 return pt;
4318 ++i;
4319 }
4320 }
4321 return NULL;
4322 }
4323
4324 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4325 __acquires(RCU)
4326 {
4327 rcu_read_lock();
4328 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4329 }
4330
4331 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4332 {
4333 struct packet_type *pt;
4334 struct list_head *nxt;
4335 int hash;
4336
4337 ++*pos;
4338 if (v == SEQ_START_TOKEN)
4339 return ptype_get_idx(0);
4340
4341 pt = v;
4342 nxt = pt->list.next;
4343 if (pt->type == htons(ETH_P_ALL)) {
4344 if (nxt != &ptype_all)
4345 goto found;
4346 hash = 0;
4347 nxt = ptype_base[0].next;
4348 } else
4349 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4350
4351 while (nxt == &ptype_base[hash]) {
4352 if (++hash >= PTYPE_HASH_SIZE)
4353 return NULL;
4354 nxt = ptype_base[hash].next;
4355 }
4356 found:
4357 return list_entry(nxt, struct packet_type, list);
4358 }
4359
4360 static void ptype_seq_stop(struct seq_file *seq, void *v)
4361 __releases(RCU)
4362 {
4363 rcu_read_unlock();
4364 }
4365
4366 static int ptype_seq_show(struct seq_file *seq, void *v)
4367 {
4368 struct packet_type *pt = v;
4369
4370 if (v == SEQ_START_TOKEN)
4371 seq_puts(seq, "Type Device Function\n");
4372 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4373 if (pt->type == htons(ETH_P_ALL))
4374 seq_puts(seq, "ALL ");
4375 else
4376 seq_printf(seq, "%04x", ntohs(pt->type));
4377
4378 seq_printf(seq, " %-8s %pF\n",
4379 pt->dev ? pt->dev->name : "", pt->func);
4380 }
4381
4382 return 0;
4383 }
4384
4385 static const struct seq_operations ptype_seq_ops = {
4386 .start = ptype_seq_start,
4387 .next = ptype_seq_next,
4388 .stop = ptype_seq_stop,
4389 .show = ptype_seq_show,
4390 };
4391
4392 static int ptype_seq_open(struct inode *inode, struct file *file)
4393 {
4394 return seq_open_net(inode, file, &ptype_seq_ops,
4395 sizeof(struct seq_net_private));
4396 }
4397
4398 static const struct file_operations ptype_seq_fops = {
4399 .owner = THIS_MODULE,
4400 .open = ptype_seq_open,
4401 .read = seq_read,
4402 .llseek = seq_lseek,
4403 .release = seq_release_net,
4404 };
4405
4406
4407 static int __net_init dev_proc_net_init(struct net *net)
4408 {
4409 int rc = -ENOMEM;
4410
4411 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4412 goto out;
4413 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4414 goto out_dev;
4415 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4416 goto out_softnet;
4417
4418 if (wext_proc_init(net))
4419 goto out_ptype;
4420 rc = 0;
4421 out:
4422 return rc;
4423 out_ptype:
4424 proc_net_remove(net, "ptype");
4425 out_softnet:
4426 proc_net_remove(net, "softnet_stat");
4427 out_dev:
4428 proc_net_remove(net, "dev");
4429 goto out;
4430 }
4431
4432 static void __net_exit dev_proc_net_exit(struct net *net)
4433 {
4434 wext_proc_exit(net);
4435
4436 proc_net_remove(net, "ptype");
4437 proc_net_remove(net, "softnet_stat");
4438 proc_net_remove(net, "dev");
4439 }
4440
4441 static struct pernet_operations __net_initdata dev_proc_ops = {
4442 .init = dev_proc_net_init,
4443 .exit = dev_proc_net_exit,
4444 };
4445
4446 static int __init dev_proc_init(void)
4447 {
4448 return register_pernet_subsys(&dev_proc_ops);
4449 }
4450 #else
4451 #define dev_proc_init() 0
4452 #endif /* CONFIG_PROC_FS */
4453
4454
4455 /**
4456 * netdev_set_master - set up master pointer
4457 * @slave: slave device
4458 * @master: new master device
4459 *
4460 * Changes the master device of the slave. Pass %NULL to break the
4461 * bonding. The caller must hold the RTNL semaphore. On a failure
4462 * a negative errno code is returned. On success the reference counts
4463 * are adjusted and the function returns zero.
4464 */
4465 int netdev_set_master(struct net_device *slave, struct net_device *master)
4466 {
4467 struct net_device *old = slave->master;
4468
4469 ASSERT_RTNL();
4470
4471 if (master) {
4472 if (old)
4473 return -EBUSY;
4474 dev_hold(master);
4475 }
4476
4477 slave->master = master;
4478
4479 if (old)
4480 dev_put(old);
4481 return 0;
4482 }
4483 EXPORT_SYMBOL(netdev_set_master);
4484
4485 /**
4486 * netdev_set_bond_master - set up bonding master/slave pair
4487 * @slave: slave device
4488 * @master: new master device
4489 *
4490 * Changes the master device of the slave. Pass %NULL to break the
4491 * bonding. The caller must hold the RTNL semaphore. On a failure
4492 * a negative errno code is returned. On success %RTM_NEWLINK is sent
4493 * to the routing socket and the function returns zero.
4494 */
4495 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4496 {
4497 int err;
4498
4499 ASSERT_RTNL();
4500
4501 err = netdev_set_master(slave, master);
4502 if (err)
4503 return err;
4504 if (master)
4505 slave->flags |= IFF_SLAVE;
4506 else
4507 slave->flags &= ~IFF_SLAVE;
4508
4509 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4510 return 0;
4511 }
4512 EXPORT_SYMBOL(netdev_set_bond_master);
4513
4514 static void dev_change_rx_flags(struct net_device *dev, int flags)
4515 {
4516 const struct net_device_ops *ops = dev->netdev_ops;
4517
4518 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4519 ops->ndo_change_rx_flags(dev, flags);
4520 }
4521
4522 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4523 {
4524 unsigned int old_flags = dev->flags;
4525 uid_t uid;
4526 gid_t gid;
4527
4528 ASSERT_RTNL();
4529
4530 dev->flags |= IFF_PROMISC;
4531 dev->promiscuity += inc;
4532 if (dev->promiscuity == 0) {
4533 /*
4534 * Avoid overflow.
4535 * If inc causes overflow, leave promiscuity untouched and return an error.
4536 */
4537 if (inc < 0)
4538 dev->flags &= ~IFF_PROMISC;
4539 else {
4540 dev->promiscuity -= inc;
4541 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4542 dev->name);
4543 return -EOVERFLOW;
4544 }
4545 }
4546 if (dev->flags != old_flags) {
4547 pr_info("device %s %s promiscuous mode\n",
4548 dev->name,
4549 dev->flags & IFF_PROMISC ? "entered" : "left");
4550 if (audit_enabled) {
4551 current_uid_gid(&uid, &gid);
4552 audit_log(current->audit_context, GFP_ATOMIC,
4553 AUDIT_ANOM_PROMISCUOUS,
4554 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4555 dev->name, (dev->flags & IFF_PROMISC),
4556 (old_flags & IFF_PROMISC),
4557 audit_get_loginuid(current),
4558 uid, gid,
4559 audit_get_sessionid(current));
4560 }
4561
4562 dev_change_rx_flags(dev, IFF_PROMISC);
4563 }
4564 return 0;
4565 }
4566
4567 /**
4568 * dev_set_promiscuity - update promiscuity count on a device
4569 * @dev: device
4570 * @inc: modifier
4571 *
4572 * Add or remove promiscuity from a device. While the count in the device
4573 * remains above zero the interface remains promiscuous. Once it hits zero
4574 * the device reverts back to normal filtering operation. A negative inc
4575 * value is used to drop promiscuity on the device.
4576 * Return 0 if successful or a negative errno code on error.
4577 */
4578 int dev_set_promiscuity(struct net_device *dev, int inc)
4579 {
4580 unsigned int old_flags = dev->flags;
4581 int err;
4582
4583 err = __dev_set_promiscuity(dev, inc);
4584 if (err < 0)
4585 return err;
4586 if (dev->flags != old_flags)
4587 dev_set_rx_mode(dev);
4588 return err;
4589 }
4590 EXPORT_SYMBOL(dev_set_promiscuity);
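/*
 * Usage sketch (not part of the original file): a kernel user that needs to
 * see all traffic takes one promiscuity reference while active and drops it
 * again when done; dev_set_allmulti() below is used the same way for
 * all-multicast reception. The rtnl locking shown is required because
 * __dev_set_promiscuity() asserts the RTNL.
 */
static inline int example_start_sniffing(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);
	rtnl_unlock();
	return err;
}

static inline void example_stop_sniffing(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);
	rtnl_unlock();
}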
4591
4592 /**
4593 * dev_set_allmulti - update allmulti count on a device
4594 * @dev: device
4595 * @inc: modifier
4596 *
4597 * Add or remove reception of all multicast frames to a device. While the
4598 * count in the device remains above zero the interface remains listening
4599 * to all multicast frames. Once it hits zero the device reverts back to normal
4600 * filtering operation. A negative @inc value is used to drop the counter
4601 * when releasing a resource needing all multicasts.
4602 * Return 0 if successful or a negative errno code on error.
4603 */
4604
4605 int dev_set_allmulti(struct net_device *dev, int inc)
4606 {
4607 unsigned int old_flags = dev->flags;
4608
4609 ASSERT_RTNL();
4610
4611 dev->flags |= IFF_ALLMULTI;
4612 dev->allmulti += inc;
4613 if (dev->allmulti == 0) {
4614 /*
4615 * Avoid overflow.
4616 * If inc causes overflow, leave allmulti untouched and return an error.
4617 */
4618 if (inc < 0)
4619 dev->flags &= ~IFF_ALLMULTI;
4620 else {
4621 dev->allmulti -= inc;
4622 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4623 dev->name);
4624 return -EOVERFLOW;
4625 }
4626 }
4627 if (dev->flags ^ old_flags) {
4628 dev_change_rx_flags(dev, IFF_ALLMULTI);
4629 dev_set_rx_mode(dev);
4630 }
4631 return 0;
4632 }
4633 EXPORT_SYMBOL(dev_set_allmulti);
4634
4635 /*
4636 * Upload unicast and multicast address lists to device and
4637 * configure RX filtering. When the device doesn't support unicast
4638 * filtering it is put in promiscuous mode while unicast addresses
4639 * are present.
4640 */
4641 void __dev_set_rx_mode(struct net_device *dev)
4642 {
4643 const struct net_device_ops *ops = dev->netdev_ops;
4644
4645 /* dev_open will call this function so the list will stay sane. */
4646 if (!(dev->flags&IFF_UP))
4647 return;
4648
4649 if (!netif_device_present(dev))
4650 return;
4651
4652 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4653 /* Unicast addresses changes may only happen under the rtnl,
4654 * therefore calling __dev_set_promiscuity here is safe.
4655 */
4656 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4657 __dev_set_promiscuity(dev, 1);
4658 dev->uc_promisc = true;
4659 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4660 __dev_set_promiscuity(dev, -1);
4661 dev->uc_promisc = false;
4662 }
4663 }
4664
4665 if (ops->ndo_set_rx_mode)
4666 ops->ndo_set_rx_mode(dev);
4667 }
4668
4669 void dev_set_rx_mode(struct net_device *dev)
4670 {
4671 netif_addr_lock_bh(dev);
4672 __dev_set_rx_mode(dev);
4673 netif_addr_unlock_bh(dev);
4674 }
4675
4676 /**
4677 * dev_get_flags - get flags reported to userspace
4678 * @dev: device
4679 *
4680 * Get the combination of flag bits exported through APIs to userspace.
4681 */
4682 unsigned int dev_get_flags(const struct net_device *dev)
4683 {
4684 unsigned int flags;
4685
4686 flags = (dev->flags & ~(IFF_PROMISC |
4687 IFF_ALLMULTI |
4688 IFF_RUNNING |
4689 IFF_LOWER_UP |
4690 IFF_DORMANT)) |
4691 (dev->gflags & (IFF_PROMISC |
4692 IFF_ALLMULTI));
4693
4694 if (netif_running(dev)) {
4695 if (netif_oper_up(dev))
4696 flags |= IFF_RUNNING;
4697 if (netif_carrier_ok(dev))
4698 flags |= IFF_LOWER_UP;
4699 if (netif_dormant(dev))
4700 flags |= IFF_DORMANT;
4701 }
4702
4703 return flags;
4704 }
4705 EXPORT_SYMBOL(dev_get_flags);
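/*
 * Sketch (illustrative only): checking the userspace-visible flag
 * combination to see whether an interface is administratively up and
 * operationally running.
 */
static inline bool example_iface_usable(const struct net_device *dev)
{
	unsigned int flags = dev_get_flags(dev);

	return (flags & IFF_UP) && (flags & IFF_RUNNING);
}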
4706
4707 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4708 {
4709 unsigned int old_flags = dev->flags;
4710 int ret;
4711
4712 ASSERT_RTNL();
4713
4714 /*
4715 * Set the flags on our device.
4716 */
4717
4718 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4719 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4720 IFF_AUTOMEDIA)) |
4721 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4722 IFF_ALLMULTI));
4723
4724 /*
4725 * Load in the correct multicast list now the flags have changed.
4726 */
4727
4728 if ((old_flags ^ flags) & IFF_MULTICAST)
4729 dev_change_rx_flags(dev, IFF_MULTICAST);
4730
4731 dev_set_rx_mode(dev);
4732
4733 /*
4734 * Have we downed the interface? We handle IFF_UP ourselves
4735 * according to user attempts to set it, rather than blindly
4736 * setting it.
4737 */
4738
4739 ret = 0;
4740 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4741 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4742
4743 if (!ret)
4744 dev_set_rx_mode(dev);
4745 }
4746
4747 if ((flags ^ dev->gflags) & IFF_PROMISC) {
4748 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4749
4750 dev->gflags ^= IFF_PROMISC;
4751 dev_set_promiscuity(dev, inc);
4752 }
4753
4754 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4755 is important. Some (broken) drivers set IFF_PROMISC when
4756 IFF_ALLMULTI is requested, without asking us and without reporting it.
4757 */
4758 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4759 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4760
4761 dev->gflags ^= IFF_ALLMULTI;
4762 dev_set_allmulti(dev, inc);
4763 }
4764
4765 return ret;
4766 }
4767
4768 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4769 {
4770 unsigned int changes = dev->flags ^ old_flags;
4771
4772 if (changes & IFF_UP) {
4773 if (dev->flags & IFF_UP)
4774 call_netdevice_notifiers(NETDEV_UP, dev);
4775 else
4776 call_netdevice_notifiers(NETDEV_DOWN, dev);
4777 }
4778
4779 if (dev->flags & IFF_UP &&
4780 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4781 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4782 }
4783
4784 /**
4785 * dev_change_flags - change device settings
4786 * @dev: device
4787 * @flags: device state flags
4788 *
4789 * Change settings on device based state flags. The flags are
4790 * in the userspace exported format.
4791 */
4792 int dev_change_flags(struct net_device *dev, unsigned int flags)
4793 {
4794 int ret;
4795 unsigned int changes, old_flags = dev->flags;
4796
4797 ret = __dev_change_flags(dev, flags);
4798 if (ret < 0)
4799 return ret;
4800
4801 changes = old_flags ^ dev->flags;
4802 if (changes)
4803 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4804
4805 __dev_notify_flags(dev, old_flags);
4806 return ret;
4807 }
4808 EXPORT_SYMBOL(dev_change_flags);
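/*
 * Sketch (illustrative, not from the original file): bringing an interface
 * up from kernel code by OR-ing IFF_UP into the userspace-format flags,
 * mirroring what a SIOCSIFFLAGS request does. RTNL must be held because
 * __dev_change_flags() asserts it.
 */
static inline int example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
	rtnl_unlock();
	return err;
}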
4809
4810 /**
4811 * dev_set_mtu - Change maximum transfer unit
4812 * @dev: device
4813 * @new_mtu: new transfer unit
4814 *
4815 * Change the maximum transfer size of the network device.
4816 */
4817 int dev_set_mtu(struct net_device *dev, int new_mtu)
4818 {
4819 const struct net_device_ops *ops = dev->netdev_ops;
4820 int err;
4821
4822 if (new_mtu == dev->mtu)
4823 return 0;
4824
4825 /* MTU must be positive. */
4826 if (new_mtu < 0)
4827 return -EINVAL;
4828
4829 if (!netif_device_present(dev))
4830 return -ENODEV;
4831
4832 err = 0;
4833 if (ops->ndo_change_mtu)
4834 err = ops->ndo_change_mtu(dev, new_mtu);
4835 else
4836 dev->mtu = new_mtu;
4837
4838 if (!err && dev->flags & IFF_UP)
4839 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4840 return err;
4841 }
4842 EXPORT_SYMBOL(dev_set_mtu);
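/*
 * Usage sketch (illustrative): changing a device MTU from elsewhere in the
 * kernel; 9000 is an arbitrary example value. RTNL is held, as it is for
 * the SIOCSIFMTU path that normally reaches dev_set_mtu().
 */
static inline int example_enable_jumbo(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);
	rtnl_unlock();
	return err;
}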
4843
4844 /**
4845 * dev_set_group - Change group this device belongs to
4846 * @dev: device
4847 * @new_group: group this device should belong to
4848 */
4849 void dev_set_group(struct net_device *dev, int new_group)
4850 {
4851 dev->group = new_group;
4852 }
4853 EXPORT_SYMBOL(dev_set_group);
4854
4855 /**
4856 * dev_set_mac_address - Change Media Access Control Address
4857 * @dev: device
4858 * @sa: new address
4859 *
4860 * Change the hardware (MAC) address of the device
4861 */
4862 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4863 {
4864 const struct net_device_ops *ops = dev->netdev_ops;
4865 int err;
4866
4867 if (!ops->ndo_set_mac_address)
4868 return -EOPNOTSUPP;
4869 if (sa->sa_family != dev->type)
4870 return -EINVAL;
4871 if (!netif_device_present(dev))
4872 return -ENODEV;
4873 err = ops->ndo_set_mac_address(dev, sa);
4874 if (!err)
4875 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4876 add_device_randomness(dev->dev_addr, dev->addr_len);
4877 return err;
4878 }
4879 EXPORT_SYMBOL(dev_set_mac_address);
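/*
 * Sketch (illustrative, names assumed): building the struct sockaddr that
 * dev_set_mac_address() expects. This assumes dev->addr_len fits in
 * sa.sa_data, which holds for Ethernet-style devices.
 */
static inline int example_set_mac(struct net_device *dev, const u8 *addr)
{
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;	/* must match, see the check above */
	memcpy(sa.sa_data, addr, dev->addr_len);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}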
4880
4881 /*
4882 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4883 */
4884 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4885 {
4886 int err;
4887 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4888
4889 if (!dev)
4890 return -ENODEV;
4891
4892 switch (cmd) {
4893 case SIOCGIFFLAGS: /* Get interface flags */
4894 ifr->ifr_flags = (short) dev_get_flags(dev);
4895 return 0;
4896
4897 case SIOCGIFMETRIC: /* Get the metric on the interface
4898 (currently unused) */
4899 ifr->ifr_metric = 0;
4900 return 0;
4901
4902 case SIOCGIFMTU: /* Get the MTU of a device */
4903 ifr->ifr_mtu = dev->mtu;
4904 return 0;
4905
4906 case SIOCGIFHWADDR:
4907 if (!dev->addr_len)
4908 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4909 else
4910 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4911 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4912 ifr->ifr_hwaddr.sa_family = dev->type;
4913 return 0;
4914
4915 case SIOCGIFSLAVE:
4916 err = -EINVAL;
4917 break;
4918
4919 case SIOCGIFMAP:
4920 ifr->ifr_map.mem_start = dev->mem_start;
4921 ifr->ifr_map.mem_end = dev->mem_end;
4922 ifr->ifr_map.base_addr = dev->base_addr;
4923 ifr->ifr_map.irq = dev->irq;
4924 ifr->ifr_map.dma = dev->dma;
4925 ifr->ifr_map.port = dev->if_port;
4926 return 0;
4927
4928 case SIOCGIFINDEX:
4929 ifr->ifr_ifindex = dev->ifindex;
4930 return 0;
4931
4932 case SIOCGIFTXQLEN:
4933 ifr->ifr_qlen = dev->tx_queue_len;
4934 return 0;
4935
4936 default:
4937 /* dev_ioctl() should ensure this case
4938 * is never reached
4939 */
4940 WARN_ON(1);
4941 err = -ENOTTY;
4942 break;
4943
4944 }
4945 return err;
4946 }
4947
4948 /*
4949 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4950 */
4951 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4952 {
4953 int err;
4954 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4955 const struct net_device_ops *ops;
4956
4957 if (!dev)
4958 return -ENODEV;
4959
4960 ops = dev->netdev_ops;
4961
4962 switch (cmd) {
4963 case SIOCSIFFLAGS: /* Set interface flags */
4964 return dev_change_flags(dev, ifr->ifr_flags);
4965
4966 case SIOCSIFMETRIC: /* Set the metric on the interface
4967 (currently unused) */
4968 return -EOPNOTSUPP;
4969
4970 case SIOCSIFMTU: /* Set the MTU of a device */
4971 return dev_set_mtu(dev, ifr->ifr_mtu);
4972
4973 case SIOCSIFHWADDR:
4974 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4975
4976 case SIOCSIFHWBROADCAST:
4977 if (ifr->ifr_hwaddr.sa_family != dev->type)
4978 return -EINVAL;
4979 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4980 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4981 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4982 return 0;
4983
4984 case SIOCSIFMAP:
4985 if (ops->ndo_set_config) {
4986 if (!netif_device_present(dev))
4987 return -ENODEV;
4988 return ops->ndo_set_config(dev, &ifr->ifr_map);
4989 }
4990 return -EOPNOTSUPP;
4991
4992 case SIOCADDMULTI:
4993 if (!ops->ndo_set_rx_mode ||
4994 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4995 return -EINVAL;
4996 if (!netif_device_present(dev))
4997 return -ENODEV;
4998 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4999
5000 case SIOCDELMULTI:
5001 if (!ops->ndo_set_rx_mode ||
5002 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5003 return -EINVAL;
5004 if (!netif_device_present(dev))
5005 return -ENODEV;
5006 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5007
5008 case SIOCSIFTXQLEN:
5009 if (ifr->ifr_qlen < 0)
5010 return -EINVAL;
5011 dev->tx_queue_len = ifr->ifr_qlen;
5012 return 0;
5013
5014 case SIOCSIFNAME:
5015 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5016 return dev_change_name(dev, ifr->ifr_newname);
5017
5018 case SIOCSHWTSTAMP:
5019 err = net_hwtstamp_validate(ifr);
5020 if (err)
5021 return err;
5022 /* fall through */
5023
5024 /*
5025 * Unknown or private ioctl
5026 */
5027 default:
5028 if ((cmd >= SIOCDEVPRIVATE &&
5029 cmd <= SIOCDEVPRIVATE + 15) ||
5030 cmd == SIOCBONDENSLAVE ||
5031 cmd == SIOCBONDRELEASE ||
5032 cmd == SIOCBONDSETHWADDR ||
5033 cmd == SIOCBONDSLAVEINFOQUERY ||
5034 cmd == SIOCBONDINFOQUERY ||
5035 cmd == SIOCBONDCHANGEACTIVE ||
5036 cmd == SIOCGMIIPHY ||
5037 cmd == SIOCGMIIREG ||
5038 cmd == SIOCSMIIREG ||
5039 cmd == SIOCBRADDIF ||
5040 cmd == SIOCBRDELIF ||
5041 cmd == SIOCSHWTSTAMP ||
5042 cmd == SIOCWANDEV) {
5043 err = -EOPNOTSUPP;
5044 if (ops->ndo_do_ioctl) {
5045 if (netif_device_present(dev))
5046 err = ops->ndo_do_ioctl(dev, ifr, cmd);
5047 else
5048 err = -ENODEV;
5049 }
5050 } else
5051 err = -EINVAL;
5052
5053 }
5054 return err;
5055 }
5056
5057 /*
5058 * This function handles all "interface"-type I/O control requests. The actual
5059 * 'doing' part of this is dev_ifsioc above.
5060 */
5061
5062 /**
5063 * dev_ioctl - network device ioctl
5064 * @net: the applicable net namespace
5065 * @cmd: command to issue
5066 * @arg: pointer to a struct ifreq in user space
5067 *
5068 * Issue ioctl functions to devices. This is normally called by the
5069 * user space syscall interfaces but can sometimes be useful for
5070 * other purposes. The return value is the return from the syscall if
5071 * positive or a negative errno code on error.
5072 */
5073
5074 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5075 {
5076 struct ifreq ifr;
5077 int ret;
5078 char *colon;
5079
5080 /* One special case: SIOCGIFCONF takes ifconf argument
5081 and requires a shared lock, because it sleeps writing
5082 to user space.
5083 */
5084
5085 if (cmd == SIOCGIFCONF) {
5086 rtnl_lock();
5087 ret = dev_ifconf(net, (char __user *) arg);
5088 rtnl_unlock();
5089 return ret;
5090 }
5091 if (cmd == SIOCGIFNAME)
5092 return dev_ifname(net, (struct ifreq __user *)arg);
5093
5094 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5095 return -EFAULT;
5096
5097 ifr.ifr_name[IFNAMSIZ-1] = 0;
5098
5099 colon = strchr(ifr.ifr_name, ':');
5100 if (colon)
5101 *colon = 0;
5102
5103 /*
5104 * See which interface the caller is talking about.
5105 */
5106
5107 switch (cmd) {
5108 /*
5109 * These ioctl calls:
5110 * - can be done by all.
5111 * - atomic and do not require locking.
5112 * - return a value
5113 */
5114 case SIOCGIFFLAGS:
5115 case SIOCGIFMETRIC:
5116 case SIOCGIFMTU:
5117 case SIOCGIFHWADDR:
5118 case SIOCGIFSLAVE:
5119 case SIOCGIFMAP:
5120 case SIOCGIFINDEX:
5121 case SIOCGIFTXQLEN:
5122 dev_load(net, ifr.ifr_name);
5123 rcu_read_lock();
5124 ret = dev_ifsioc_locked(net, &ifr, cmd);
5125 rcu_read_unlock();
5126 if (!ret) {
5127 if (colon)
5128 *colon = ':';
5129 if (copy_to_user(arg, &ifr,
5130 sizeof(struct ifreq)))
5131 ret = -EFAULT;
5132 }
5133 return ret;
5134
5135 case SIOCETHTOOL:
5136 dev_load(net, ifr.ifr_name);
5137 rtnl_lock();
5138 ret = dev_ethtool(net, &ifr);
5139 rtnl_unlock();
5140 if (!ret) {
5141 if (colon)
5142 *colon = ':';
5143 if (copy_to_user(arg, &ifr,
5144 sizeof(struct ifreq)))
5145 ret = -EFAULT;
5146 }
5147 return ret;
5148
5149 /*
5150 * These ioctl calls:
5151 * - require superuser power.
5152 * - require strict serialization.
5153 * - return a value
5154 */
5155 case SIOCGMIIPHY:
5156 case SIOCGMIIREG:
5157 case SIOCSIFNAME:
5158 if (!capable(CAP_NET_ADMIN))
5159 return -EPERM;
5160 dev_load(net, ifr.ifr_name);
5161 rtnl_lock();
5162 ret = dev_ifsioc(net, &ifr, cmd);
5163 rtnl_unlock();
5164 if (!ret) {
5165 if (colon)
5166 *colon = ':';
5167 if (copy_to_user(arg, &ifr,
5168 sizeof(struct ifreq)))
5169 ret = -EFAULT;
5170 }
5171 return ret;
5172
5173 /*
5174 * These ioctl calls:
5175 * - require superuser power.
5176 * - require strict serialization.
5177 * - do not return a value
5178 */
5179 case SIOCSIFFLAGS:
5180 case SIOCSIFMETRIC:
5181 case SIOCSIFMTU:
5182 case SIOCSIFMAP:
5183 case SIOCSIFHWADDR:
5184 case SIOCSIFSLAVE:
5185 case SIOCADDMULTI:
5186 case SIOCDELMULTI:
5187 case SIOCSIFHWBROADCAST:
5188 case SIOCSIFTXQLEN:
5189 case SIOCSMIIREG:
5190 case SIOCBONDENSLAVE:
5191 case SIOCBONDRELEASE:
5192 case SIOCBONDSETHWADDR:
5193 case SIOCBONDCHANGEACTIVE:
5194 case SIOCBRADDIF:
5195 case SIOCBRDELIF:
5196 case SIOCSHWTSTAMP:
5197 if (!capable(CAP_NET_ADMIN))
5198 return -EPERM;
5199 /* fall through */
5200 case SIOCBONDSLAVEINFOQUERY:
5201 case SIOCBONDINFOQUERY:
5202 dev_load(net, ifr.ifr_name);
5203 rtnl_lock();
5204 ret = dev_ifsioc(net, &ifr, cmd);
5205 rtnl_unlock();
5206 return ret;
5207
5208 case SIOCGIFMEM:
5209 /* Get the per device memory space. We can add this but
5210 * currently do not support it */
5211 case SIOCSIFMEM:
5212 /* Set the per device memory buffer space.
5213 * Not applicable in our case */
5214 case SIOCSIFLINK:
5215 return -ENOTTY;
5216
5217 /*
5218 * Unknown or private ioctl.
5219 */
5220 default:
5221 if (cmd == SIOCWANDEV ||
5222 (cmd >= SIOCDEVPRIVATE &&
5223 cmd <= SIOCDEVPRIVATE + 15)) {
5224 dev_load(net, ifr.ifr_name);
5225 rtnl_lock();
5226 ret = dev_ifsioc(net, &ifr, cmd);
5227 rtnl_unlock();
5228 if (!ret && copy_to_user(arg, &ifr,
5229 sizeof(struct ifreq)))
5230 ret = -EFAULT;
5231 return ret;
5232 }
5233 /* Take care of Wireless Extensions */
5234 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5235 return wext_handle_ioctl(net, &ifr, cmd, arg);
5236 return -ENOTTY;
5237 }
5238 }
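/*
 * For reference (illustrative, userspace side, not part of the original
 * file): the requests dispatched above arrive via ioctl() on an ordinary
 * socket, e.g. reading an MTU roughly as follows ("eth0" and the datagram
 * socket are arbitrary example choices):
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ);
 *	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *		printf("mtu %d\n", ifr.ifr_mtu);
 */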
5239
5240
5241 /**
5242 * dev_new_index - allocate an ifindex
5243 * @net: the applicable net namespace
5244 *
5245 * Returns a suitable unique value for a new device interface
5246 * number. The caller must hold the rtnl semaphore or the
5247 * dev_base_lock to be sure it remains unique.
5248 */
5249 static int dev_new_index(struct net *net)
5250 {
5251 int ifindex = net->ifindex;
5252 for (;;) {
5253 if (++ifindex <= 0)
5254 ifindex = 1;
5255 if (!__dev_get_by_index(net, ifindex))
5256 return net->ifindex = ifindex;
5257 }
5258 }
5259
5260 /* Delayed registration/unregistration */
5261 static LIST_HEAD(net_todo_list);
5262
5263 static void net_set_todo(struct net_device *dev)
5264 {
5265 list_add_tail(&dev->todo_list, &net_todo_list);
5266 }
5267
5268 static void rollback_registered_many(struct list_head *head)
5269 {
5270 struct net_device *dev, *tmp;
5271
5272 BUG_ON(dev_boot_phase);
5273 ASSERT_RTNL();
5274
5275 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5276 /* Some devices call without registering
5277 * for initialization unwind. Remove those
5278 * devices and proceed with the remaining.
5279 */
5280 if (dev->reg_state == NETREG_UNINITIALIZED) {
5281 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5282 dev->name, dev);
5283
5284 WARN_ON(1);
5285 list_del(&dev->unreg_list);
5286 continue;
5287 }
5288 dev->dismantle = true;
5289 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5290 }
5291
5292 /* If device is running, close it first. */
5293 dev_close_many(head);
5294
5295 list_for_each_entry(dev, head, unreg_list) {
5296 /* And unlink it from device chain. */
5297 unlist_netdevice(dev);
5298
5299 dev->reg_state = NETREG_UNREGISTERING;
5300 }
5301
5302 synchronize_net();
5303
5304 list_for_each_entry(dev, head, unreg_list) {
5305 /* Shutdown queueing discipline. */
5306 dev_shutdown(dev);
5307
5308
5309 /* Notify protocols that we are about to destroy
5310 this device. They should clean up all of their state.
5311 */
5312 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5313
5314 if (!dev->rtnl_link_ops ||
5315 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5316 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5317
5318 /*
5319 * Flush the unicast and multicast chains
5320 */
5321 dev_uc_flush(dev);
5322 dev_mc_flush(dev);
5323
5324 if (dev->netdev_ops->ndo_uninit)
5325 dev->netdev_ops->ndo_uninit(dev);
5326
5327 /* Notifier chain MUST detach us from master device. */
5328 WARN_ON(dev->master);
5329
5330 /* Remove entries from kobject tree */
5331 netdev_unregister_kobject(dev);
5332 }
5333
5334 /* Process any work delayed until the end of the batch */
5335 dev = list_first_entry(head, struct net_device, unreg_list);
5336 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5337
5338 synchronize_net();
5339
5340 list_for_each_entry(dev, head, unreg_list)
5341 dev_put(dev);
5342 }
5343
5344 static void rollback_registered(struct net_device *dev)
5345 {
5346 LIST_HEAD(single);
5347
5348 list_add(&dev->unreg_list, &single);
5349 rollback_registered_many(&single);
5350 list_del(&single);
5351 }
5352
5353 static netdev_features_t netdev_fix_features(struct net_device *dev,
5354 netdev_features_t features)
5355 {
5356 /* Fix illegal checksum combinations */
5357 if ((features & NETIF_F_HW_CSUM) &&
5358 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5359 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5360 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5361 }
5362
5363 /* Fix illegal SG+CSUM combinations. */
5364 if ((features & NETIF_F_SG) &&
5365 !(features & NETIF_F_ALL_CSUM)) {
5366 netdev_dbg(dev,
5367 "Dropping NETIF_F_SG since no checksum feature.\n");
5368 features &= ~NETIF_F_SG;
5369 }
5370
5371 /* TSO requires that SG is present as well. */
5372 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5373 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5374 features &= ~NETIF_F_ALL_TSO;
5375 }
5376
5377 /* TSO ECN requires that TSO is present as well. */
5378 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5379 features &= ~NETIF_F_TSO_ECN;
5380
5381 /* Software GSO depends on SG. */
5382 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5383 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5384 features &= ~NETIF_F_GSO;
5385 }
5386
5387 /* UFO needs SG and checksumming */
5388 if (features & NETIF_F_UFO) {
5389 /* maybe split UFO into V4 and V6? */
5390 if (!((features & NETIF_F_GEN_CSUM) ||
5391 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5392 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5393 netdev_dbg(dev,
5394 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5395 features &= ~NETIF_F_UFO;
5396 }
5397
5398 if (!(features & NETIF_F_SG)) {
5399 netdev_dbg(dev,
5400 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5401 features &= ~NETIF_F_UFO;
5402 }
5403 }
5404
5405 return features;
5406 }
5407
5408 int __netdev_update_features(struct net_device *dev)
5409 {
5410 netdev_features_t features;
5411 int err = 0;
5412
5413 ASSERT_RTNL();
5414
5415 features = netdev_get_wanted_features(dev);
5416
5417 if (dev->netdev_ops->ndo_fix_features)
5418 features = dev->netdev_ops->ndo_fix_features(dev, features);
5419
5420 /* driver might be less strict about feature dependencies */
5421 features = netdev_fix_features(dev, features);
5422
5423 if (dev->features == features)
5424 return 0;
5425
5426 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5427 &dev->features, &features);
5428
5429 if (dev->netdev_ops->ndo_set_features)
5430 err = dev->netdev_ops->ndo_set_features(dev, features);
5431
5432 if (unlikely(err < 0)) {
5433 netdev_err(dev,
5434 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5435 err, &features, &dev->features);
5436 return -1;
5437 }
5438
5439 if (!err)
5440 dev->features = features;
5441
5442 return 1;
5443 }
5444
5445 /**
5446 * netdev_update_features - recalculate device features
5447 * @dev: the device to check
5448 *
5449 * Recalculate dev->features set and send notifications if it
5450 * has changed. Should be called after driver- or hardware-dependent
5451 * conditions that influence the features might have changed.
5452 */
5453 void netdev_update_features(struct net_device *dev)
5454 {
5455 if (__netdev_update_features(dev))
5456 netdev_features_change(dev);
5457 }
5458 EXPORT_SYMBOL(netdev_update_features);
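/*
 * Sketch (assumption, not from the original file): how a driver might react
 * to a runtime condition that invalidates an offload. It shrinks what it
 * advertises and lets the core revalidate dev->features; the paths that do
 * this (ethtool, ndo callbacks) already hold the RTNL that
 * __netdev_update_features() asserts.
 */
static inline void example_disable_tso(struct net_device *dev)
{
	dev->hw_features &= ~NETIF_F_ALL_TSO;
	dev->wanted_features &= ~NETIF_F_ALL_TSO;
	netdev_update_features(dev);	/* recompute and notify if changed */
}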
5459
5460 /**
5461 * netdev_change_features - recalculate device features
5462 * @dev: the device to check
5463 *
5464 * Recalculate dev->features set and send notifications even
5465 * if they have not changed. Should be called instead of
5466 * netdev_update_features() if dev->vlan_features might also
5467 * have changed, so the changes can be propagated to stacked
5468 * VLAN devices.
5469 */
5470 void netdev_change_features(struct net_device *dev)
5471 {
5472 __netdev_update_features(dev);
5473 netdev_features_change(dev);
5474 }
5475 EXPORT_SYMBOL(netdev_change_features);
5476
5477 /**
5478 * netif_stacked_transfer_operstate - transfer operstate
5479 * @rootdev: the root or lower level device to transfer state from
5480 * @dev: the device to transfer operstate to
5481 *
5482 * Transfer operational state from root to device. This is normally
5483 * called when a stacking relationship exists between the root
5484 * device and the device(a leaf device).
5485 */
5486 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5487 struct net_device *dev)
5488 {
5489 if (rootdev->operstate == IF_OPER_DORMANT)
5490 netif_dormant_on(dev);
5491 else
5492 netif_dormant_off(dev);
5493
5494 if (netif_carrier_ok(rootdev)) {
5495 if (!netif_carrier_ok(dev))
5496 netif_carrier_on(dev);
5497 } else {
5498 if (netif_carrier_ok(dev))
5499 netif_carrier_off(dev);
5500 }
5501 }
5502 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5503
5504 #ifdef CONFIG_RPS
5505 static int netif_alloc_rx_queues(struct net_device *dev)
5506 {
5507 unsigned int i, count = dev->num_rx_queues;
5508 struct netdev_rx_queue *rx;
5509
5510 BUG_ON(count < 1);
5511
5512 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5513 if (!rx) {
5514 pr_err("netdev: Unable to allocate %u rx queues\n", count);
5515 return -ENOMEM;
5516 }
5517 dev->_rx = rx;
5518
5519 for (i = 0; i < count; i++)
5520 rx[i].dev = dev;
5521 return 0;
5522 }
5523 #endif
5524
5525 static void netdev_init_one_queue(struct net_device *dev,
5526 struct netdev_queue *queue, void *_unused)
5527 {
5528 /* Initialize queue lock */
5529 spin_lock_init(&queue->_xmit_lock);
5530 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5531 queue->xmit_lock_owner = -1;
5532 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5533 queue->dev = dev;
5534 #ifdef CONFIG_BQL
5535 dql_init(&queue->dql, HZ);
5536 #endif
5537 }
5538
5539 static int netif_alloc_netdev_queues(struct net_device *dev)
5540 {
5541 unsigned int count = dev->num_tx_queues;
5542 struct netdev_queue *tx;
5543
5544 BUG_ON(count < 1);
5545
5546 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5547 if (!tx) {
5548 pr_err("netdev: Unable to allocate %u tx queues\n", count);
5549 return -ENOMEM;
5550 }
5551 dev->_tx = tx;
5552
5553 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5554 spin_lock_init(&dev->tx_global_lock);
5555
5556 return 0;
5557 }
5558
5559 /**
5560 * register_netdevice - register a network device
5561 * @dev: device to register
5562 *
5563 * Take a completed network device structure and add it to the kernel
5564 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5565 * chain. 0 is returned on success. A negative errno code is returned
5566 * on a failure to set up the device, or if the name is a duplicate.
5567 *
5568 * Callers must hold the rtnl semaphore. You may want
5569 * register_netdev() instead of this.
5570 *
5571 * BUGS:
5572 * The locking appears insufficient to guarantee two parallel registers
5573 * will not get the same name.
5574 */
5575
5576 int register_netdevice(struct net_device *dev)
5577 {
5578 int ret;
5579 struct net *net = dev_net(dev);
5580
5581 BUG_ON(dev_boot_phase);
5582 ASSERT_RTNL();
5583
5584 might_sleep();
5585
5586 /* When net_devices are persistent, this will be fatal. */
5587 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5588 BUG_ON(!net);
5589
5590 spin_lock_init(&dev->addr_list_lock);
5591 netdev_set_addr_lockdep_class(dev);
5592
5593 dev->iflink = -1;
5594
5595 ret = dev_get_valid_name(dev, dev->name);
5596 if (ret < 0)
5597 goto out;
5598
5599 /* Init, if this function is available */
5600 if (dev->netdev_ops->ndo_init) {
5601 ret = dev->netdev_ops->ndo_init(dev);
5602 if (ret) {
5603 if (ret > 0)
5604 ret = -EIO;
5605 goto out;
5606 }
5607 }
5608
5609 ret = -EBUSY;
5610 if (!dev->ifindex)
5611 dev->ifindex = dev_new_index(net);
5612 else if (__dev_get_by_index(net, dev->ifindex))
5613 goto err_uninit;
5614
5615 if (dev->iflink == -1)
5616 dev->iflink = dev->ifindex;
5617
5618 /* Transfer changeable features to wanted_features and enable
5619 * software offloads (GSO and GRO).
5620 */
5621 dev->hw_features |= NETIF_F_SOFT_FEATURES;
5622 dev->features |= NETIF_F_SOFT_FEATURES;
5623 dev->wanted_features = dev->features & dev->hw_features;
5624
5625 /* Turn on no cache copy if HW is doing checksum */
5626 if (!(dev->flags & IFF_LOOPBACK)) {
5627 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5628 if (dev->features & NETIF_F_ALL_CSUM) {
5629 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5630 dev->features |= NETIF_F_NOCACHE_COPY;
5631 }
5632 }
5633
5634 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5635 */
5636 dev->vlan_features |= NETIF_F_HIGHDMA;
5637
5638 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5639 ret = notifier_to_errno(ret);
5640 if (ret)
5641 goto err_uninit;
5642
5643 ret = netdev_register_kobject(dev);
5644 if (ret)
5645 goto err_uninit;
5646 dev->reg_state = NETREG_REGISTERED;
5647
5648 __netdev_update_features(dev);
5649
5650 /*
5651 * Default initial state at registration is that the
5652 * device is present.
5653 */
5654
5655 set_bit(__LINK_STATE_PRESENT, &dev->state);
5656
5657 dev_init_scheduler(dev);
5658 dev_hold(dev);
5659 list_netdevice(dev);
5660 add_device_randomness(dev->dev_addr, dev->addr_len);
5661
5662 /* Notify protocols that a new device appeared. */
5663 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5664 ret = notifier_to_errno(ret);
5665 if (ret) {
5666 rollback_registered(dev);
5667 dev->reg_state = NETREG_UNREGISTERED;
5668 }
5669 /*
5670 * Prevent userspace races by waiting until the network
5671 * device is fully setup before sending notifications.
5672 */
5673 if (!dev->rtnl_link_ops ||
5674 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5675 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5676
5677 out:
5678 return ret;
5679
5680 err_uninit:
5681 if (dev->netdev_ops->ndo_uninit)
5682 dev->netdev_ops->ndo_uninit(dev);
5683 goto out;
5684 }
5685 EXPORT_SYMBOL(register_netdevice);
5686
5687 /**
5688 * init_dummy_netdev - init a dummy network device for NAPI
5689 * @dev: device to init
5690 *
5691 * This takes a network device structure and initializes the minimum
5692 * number of fields so it can be used to schedule NAPI polls without
5693 * registering a full blown interface. This is to be used by drivers
5694 * that need to tie several hardware interfaces to a single NAPI
5695 * poll scheduler due to HW limitations.
5696 */
5697 int init_dummy_netdev(struct net_device *dev)
5698 {
5699 /* Clear everything. Note we don't initialize spinlocks
5700 * as they aren't supposed to be taken by any of the
5701 * NAPI code, and this dummy netdev is supposed to be
5702 * used only for NAPI polls.
5703 */
5704 memset(dev, 0, sizeof(struct net_device));
5705
5706 /* make sure we BUG if trying to hit standard
5707 * register/unregister code path
5708 */
5709 dev->reg_state = NETREG_DUMMY;
5710
5711 /* NAPI wants this */
5712 INIT_LIST_HEAD(&dev->napi_list);
5713
5714 /* a dummy interface is started by default */
5715 set_bit(__LINK_STATE_PRESENT, &dev->state);
5716 set_bit(__LINK_STATE_START, &dev->state);
5717
5718 /* Note : We don't allocate pcpu_refcnt for dummy devices,
5719 * because users of this 'device' don't need to change
5720 * its refcount.
5721 */
5722
5723 return 0;
5724 }
5725 EXPORT_SYMBOL_GPL(init_dummy_netdev);
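/*
 * Sketch (illustrative, names assumed): a driver whose hardware funnels
 * several interfaces through one interrupt can anchor its NAPI context on an
 * embedded dummy netdev instead of any of the real ones.
 */
struct example_hw {
	struct net_device napi_dev;	/* never registered, NAPI anchor only */
	struct napi_struct napi;
};

static inline void example_hw_napi_init(struct example_hw *hw,
					int (*poll)(struct napi_struct *, int))
{
	init_dummy_netdev(&hw->napi_dev);
	netif_napi_add(&hw->napi_dev, &hw->napi, poll, 64);
	napi_enable(&hw->napi);
}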
5726
5727
5728 /**
5729 * register_netdev - register a network device
5730 * @dev: device to register
5731 *
5732 * Take a completed network device structure and add it to the kernel
5733 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5734 * chain. 0 is returned on success. A negative errno code is returned
5735 * on a failure to set up the device, or if the name is a duplicate.
5736 *
5737 * This is a wrapper around register_netdevice that takes the rtnl semaphore
5738 * and expands the device name if you passed a format string to
5739 * alloc_netdev.
5740 */
5741 int register_netdev(struct net_device *dev)
5742 {
5743 int err;
5744
5745 rtnl_lock();
5746 err = register_netdevice(dev);
5747 rtnl_unlock();
5748 return err;
5749 }
5750 EXPORT_SYMBOL(register_netdev);
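/*
 * Sketch (illustrative, every example_* name is assumed): the usual driver
 * probe/remove pairing around register_netdev(). alloc_netdev() expands to
 * alloc_netdev_mqs() below; "example%d" shows the format-string name
 * expansion mentioned above.
 */
static inline void example_setup(struct net_device *dev)
{
	/* a real setup callback fills in dev->netdev_ops, MTU, flags, ... */
}

static inline struct net_device *example_probe(void)
{
	struct net_device *dev = alloc_netdev(0, "example%d", example_setup);

	if (!dev)
		return NULL;

	if (register_netdev(dev)) {	/* takes and drops the RTNL itself */
		free_netdev(dev);
		return NULL;
	}
	return dev;
}

static inline void example_remove(struct net_device *dev)
{
	unregister_netdev(dev);
	free_netdev(dev);
}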
5751
5752 int netdev_refcnt_read(const struct net_device *dev)
5753 {
5754 int i, refcnt = 0;
5755
5756 for_each_possible_cpu(i)
5757 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5758 return refcnt;
5759 }
5760 EXPORT_SYMBOL(netdev_refcnt_read);
5761
5762 /**
5763 * netdev_wait_allrefs - wait until all references are gone.
5764 * @dev: target net_device
5765 *
5766 * This is called when unregistering network devices.
5767 *
5768 * Any protocol or device that holds a reference should register
5769 * for netdevice notification, and cleanup and put back the
5770 * reference if they receive an UNREGISTER event.
5771 * We can get stuck here if buggy protocols don't correctly
5772 * call dev_put.
5773 */
5774 static void netdev_wait_allrefs(struct net_device *dev)
5775 {
5776 unsigned long rebroadcast_time, warning_time;
5777 int refcnt;
5778
5779 linkwatch_forget_dev(dev);
5780
5781 rebroadcast_time = warning_time = jiffies;
5782 refcnt = netdev_refcnt_read(dev);
5783
5784 while (refcnt != 0) {
5785 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5786 rtnl_lock();
5787
5788 /* Rebroadcast unregister notification */
5789 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5790 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5791 * should have already handled it the first time */
5792
5793 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5794 &dev->state)) {
5795 /* We must not have linkwatch events
5796 * pending on unregister. If this
5797 * happens, we simply run the queue
5798 * unscheduled, resulting in a noop
5799 * for this device.
5800 */
5801 linkwatch_run_queue();
5802 }
5803
5804 __rtnl_unlock();
5805
5806 rebroadcast_time = jiffies;
5807 }
5808
5809 msleep(250);
5810
5811 refcnt = netdev_refcnt_read(dev);
5812
5813 if (time_after(jiffies, warning_time + 10 * HZ)) {
5814 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5815 dev->name, refcnt);
5816 warning_time = jiffies;
5817 }
5818 }
5819 }
5820
5821 /* The sequence is:
5822 *
5823 * rtnl_lock();
5824 * ...
5825 * register_netdevice(x1);
5826 * register_netdevice(x2);
5827 * ...
5828 * unregister_netdevice(y1);
5829 * unregister_netdevice(y2);
5830 * ...
5831 * rtnl_unlock();
5832 * free_netdev(y1);
5833 * free_netdev(y2);
5834 *
5835 * We are invoked by rtnl_unlock().
5836 * This allows us to deal with problems:
5837 * 1) We can delete sysfs objects which invoke hotplug
5838 * without deadlocking with linkwatch via keventd.
5839 * 2) Since we run with the RTNL semaphore not held, we can sleep
5840 * safely in order to wait for the netdev refcnt to drop to zero.
5841 *
5842 * We must not return until all unregister events added during
5843 * the interval the lock was held have been completed.
5844 */
5845 void netdev_run_todo(void)
5846 {
5847 struct list_head list;
5848
5849 /* Snapshot list, allow later requests */
5850 list_replace_init(&net_todo_list, &list);
5851
5852 __rtnl_unlock();
5853
5854 /* Wait for rcu callbacks to finish before attempting to drain
5855 * the device list. This usually avoids a 250ms wait.
5856 */
5857 if (!list_empty(&list))
5858 rcu_barrier();
5859
5860 while (!list_empty(&list)) {
5861 struct net_device *dev
5862 = list_first_entry(&list, struct net_device, todo_list);
5863 list_del(&dev->todo_list);
5864
5865 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5866 pr_err("network todo '%s' but state %d\n",
5867 dev->name, dev->reg_state);
5868 dump_stack();
5869 continue;
5870 }
5871
5872 dev->reg_state = NETREG_UNREGISTERED;
5873
5874 on_each_cpu(flush_backlog, dev, 1);
5875
5876 netdev_wait_allrefs(dev);
5877
5878 /* paranoia */
5879 BUG_ON(netdev_refcnt_read(dev));
5880 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5881 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5882 WARN_ON(dev->dn_ptr);
5883
5884 if (dev->destructor)
5885 dev->destructor(dev);
5886
5887 /* Free network device */
5888 kobject_put(&dev->dev.kobj);
5889 }
5890 }
5891
5892 /* Convert net_device_stats to rtnl_link_stats64. They have the same
5893 * fields in the same order, with only the type differing.
5894 */
5895 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5896 const struct net_device_stats *netdev_stats)
5897 {
5898 #if BITS_PER_LONG == 64
5899 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5900 memcpy(stats64, netdev_stats, sizeof(*stats64));
5901 #else
5902 size_t i, n = sizeof(*stats64) / sizeof(u64);
5903 const unsigned long *src = (const unsigned long *)netdev_stats;
5904 u64 *dst = (u64 *)stats64;
5905
5906 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5907 sizeof(*stats64) / sizeof(u64));
5908 for (i = 0; i < n; i++)
5909 dst[i] = src[i];
5910 #endif
5911 }
5912 EXPORT_SYMBOL(netdev_stats_to_stats64);
5913
5914 /**
5915 * dev_get_stats - get network device statistics
5916 * @dev: device to get statistics from
5917 * @storage: place to store stats
5918 *
5919 * Get network statistics from device. Return @storage.
5920 * The device driver may provide its own method by setting
5921 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5922 * otherwise the internal statistics structure is used.
5923 */
5924 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5925 struct rtnl_link_stats64 *storage)
5926 {
5927 const struct net_device_ops *ops = dev->netdev_ops;
5928
5929 if (ops->ndo_get_stats64) {
5930 memset(storage, 0, sizeof(*storage));
5931 ops->ndo_get_stats64(dev, storage);
5932 } else if (ops->ndo_get_stats) {
5933 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5934 } else {
5935 netdev_stats_to_stats64(storage, &dev->stats);
5936 }
5937 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5938 return storage;
5939 }
5940 EXPORT_SYMBOL(dev_get_stats);
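/*
 * Usage sketch (illustrative): callers hand in scratch storage and use the
 * returned pointer, exactly as dev_seq_printf_stats() does above; this works
 * whether the driver provides ndo_get_stats64, ndo_get_stats, or neither.
 */
static inline u64 example_rx_bytes(struct net_device *dev)
{
	struct rtnl_link_stats64 temp;
	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);

	return stats->rx_bytes;
}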
5941
5942 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5943 {
5944 struct netdev_queue *queue = dev_ingress_queue(dev);
5945
5946 #ifdef CONFIG_NET_CLS_ACT
5947 if (queue)
5948 return queue;
5949 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5950 if (!queue)
5951 return NULL;
5952 netdev_init_one_queue(dev, queue, NULL);
5953 queue->qdisc = &noop_qdisc;
5954 queue->qdisc_sleeping = &noop_qdisc;
5955 rcu_assign_pointer(dev->ingress_queue, queue);
5956 #endif
5957 return queue;
5958 }
5959
5960 /**
5961 * alloc_netdev_mqs - allocate network device
5962 * @sizeof_priv: size of private data to allocate space for
5963 * @name: device name format string
5964 * @setup: callback to initialize device
5965 * @txqs: the number of TX subqueues to allocate
5966 * @rxqs: the number of RX subqueues to allocate
5967 *
5968 * Allocates a struct net_device with private data area for driver use
5969 * and performs basic initialization. Also allocates subqueue structs
5970 * for each queue on the device.
5971 */
5972 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5973 void (*setup)(struct net_device *),
5974 unsigned int txqs, unsigned int rxqs)
5975 {
5976 struct net_device *dev;
5977 size_t alloc_size;
5978 struct net_device *p;
5979
5980 BUG_ON(strlen(name) >= sizeof(dev->name));
5981
5982 if (txqs < 1) {
5983 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5984 return NULL;
5985 }
5986
5987 #ifdef CONFIG_RPS
5988 if (rxqs < 1) {
5989 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5990 return NULL;
5991 }
5992 #endif
5993
5994 alloc_size = sizeof(struct net_device);
5995 if (sizeof_priv) {
5996 /* ensure 32-byte alignment of private area */
5997 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5998 alloc_size += sizeof_priv;
5999 }
6000 /* ensure 32-byte alignment of whole construct */
6001 alloc_size += NETDEV_ALIGN - 1;
6002
6003 p = kzalloc(alloc_size, GFP_KERNEL);
6004 if (!p) {
6005 pr_err("alloc_netdev: Unable to allocate device\n");
6006 return NULL;
6007 }
6008
6009 dev = PTR_ALIGN(p, NETDEV_ALIGN);
6010 dev->padded = (char *)dev - (char *)p;
6011
6012 dev->pcpu_refcnt = alloc_percpu(int);
6013 if (!dev->pcpu_refcnt)
6014 goto free_p;
6015
6016 if (dev_addr_init(dev))
6017 goto free_pcpu;
6018
6019 dev_mc_init(dev);
6020 dev_uc_init(dev);
6021
6022 dev_net_set(dev, &init_net);
6023
6024 dev->gso_max_size = GSO_MAX_SIZE;
6025 dev->gso_max_segs = GSO_MAX_SEGS;
6026
6027 INIT_LIST_HEAD(&dev->napi_list);
6028 INIT_LIST_HEAD(&dev->unreg_list);
6029 INIT_LIST_HEAD(&dev->link_watch_list);
6030 dev->priv_flags = IFF_XMIT_DST_RELEASE;
6031 setup(dev);
6032
6033 dev->num_tx_queues = txqs;
6034 dev->real_num_tx_queues = txqs;
6035 if (netif_alloc_netdev_queues(dev))
6036 goto free_all;
6037
6038 #ifdef CONFIG_RPS
6039 dev->num_rx_queues = rxqs;
6040 dev->real_num_rx_queues = rxqs;
6041 if (netif_alloc_rx_queues(dev))
6042 goto free_all;
6043 #endif
6044
6045 strcpy(dev->name, name);
6046 dev->group = INIT_NETDEV_GROUP;
6047 return dev;
6048
6049 free_all:
6050 free_netdev(dev);
6051 return NULL;
6052
6053 free_pcpu:
6054 free_percpu(dev->pcpu_refcnt);
6055 kfree(dev->_tx);
6056 #ifdef CONFIG_RPS
6057 kfree(dev->_rx);
6058 #endif
6059
6060 free_p:
6061 kfree(p);
6062 return NULL;
6063 }
6064 EXPORT_SYMBOL(alloc_netdev_mqs);
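
/*
 * Usage sketch (illustrative only): a driver with a hypothetical private
 * structure "foo_priv" and setup callback "foo_setup" could allocate a
 * device with four TX and four RX queues as below; on success the private
 * area is reached via netdev_priv().
 *
 *	struct foo_priv { spinlock_t lock; };
 *
 *	static void foo_setup(struct net_device *dev)
 *	{
 *		ether_setup(dev);
 *	}
 *
 *	dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
 *			       foo_setup, 4, 4);
 *	if (!dev)
 *		return -ENOMEM;
 *	priv = netdev_priv(dev);
 */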
6065
6066 /**
6067 * free_netdev - free network device
6068 * @dev: device
6069 *
6070 * This function does the last stage of destroying an allocated device
6071 * interface. The reference to the device object is released.
6072 * If this is the last reference then it will be freed.
6073 */
6074 void free_netdev(struct net_device *dev)
6075 {
6076 struct napi_struct *p, *n;
6077
6078 release_net(dev_net(dev));
6079
6080 kfree(dev->_tx);
6081 #ifdef CONFIG_RPS
6082 kfree(dev->_rx);
6083 #endif
6084
6085 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6086
6087 /* Flush device addresses */
6088 dev_addr_flush(dev);
6089
6090 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6091 netif_napi_del(p);
6092
6093 free_percpu(dev->pcpu_refcnt);
6094 dev->pcpu_refcnt = NULL;
6095
6096 /* Compatibility with error handling in drivers */
6097 if (dev->reg_state == NETREG_UNINITIALIZED) {
6098 kfree((char *)dev - dev->padded);
6099 return;
6100 }
6101
6102 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6103 dev->reg_state = NETREG_RELEASED;
6104
6105 /* will free via device release */
6106 put_device(&dev->dev);
6107 }
6108 EXPORT_SYMBOL(free_netdev);
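
/*
 * Usage sketch (illustrative only): the usual error path in a probe routine
 * frees an allocated but never-registered device with free_netdev(); the
 * label name below is hypothetical.
 *
 *	err = register_netdev(dev);
 *	if (err)
 *		goto err_register;
 *	return 0;
 *
 * err_register:
 *	free_netdev(dev);
 *	return err;
 */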
6109
6110 /**
6111 * synchronize_net - Synchronize with packet receive processing
6112 *
6113 * Wait for packets currently being received to be done.
6114 * Does not block later packets from starting.
6115 */
6116 void synchronize_net(void)
6117 {
6118 might_sleep();
6119 if (rtnl_is_locked())
6120 synchronize_rcu_expedited();
6121 else
6122 synchronize_rcu();
6123 }
6124 EXPORT_SYMBOL(synchronize_net);
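
/*
 * Usage sketch (illustrative only): a writer that unlinks an object from an
 * RCU-protected receive path can call synchronize_net() before freeing it,
 * so that no CPU is still using the object from softirq context. The list
 * and "handler" below are hypothetical.
 *
 *	list_del_rcu(&handler->list);
 *	synchronize_net();
 *	kfree(handler);
 */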
6125
6126 /**
6127 * unregister_netdevice_queue - remove device from the kernel
6128 * @dev: device
6129 * @head: list
6130 *
6131 * This function shuts down a device interface and removes it
6132 * from the kernel tables.
6133 * If @head is not NULL, the device is queued to be unregistered later.
6134 *
6135 * Callers must hold the rtnl semaphore. You may want
6136 * unregister_netdev() instead of this.
6137 */
6138
6139 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6140 {
6141 ASSERT_RTNL();
6142
6143 if (head) {
6144 list_move_tail(&dev->unreg_list, head);
6145 } else {
6146 rollback_registered(dev);
6147 /* Finish processing unregister after unlock */
6148 net_set_todo(dev);
6149 }
6150 }
6151 EXPORT_SYMBOL(unregister_netdevice_queue);
6152
6153 /**
6154 * unregister_netdevice_many - unregister many devices
6155 * @head: list of devices
6156 */
6157 void unregister_netdevice_many(struct list_head *head)
6158 {
6159 struct net_device *dev;
6160
6161 if (!list_empty(head)) {
6162 rollback_registered_many(head);
6163 list_for_each_entry(dev, head, unreg_list)
6164 net_set_todo(dev);
6165 }
6166 }
6167 EXPORT_SYMBOL(unregister_netdevice_many);
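
/*
 * Usage sketch (illustrative only): callers tearing down several devices
 * under one rtnl_lock() can queue them with unregister_netdevice_queue()
 * and commit the whole batch with unregister_netdevice_many(); "a" and "b"
 * below are hypothetical devices.
 *
 *	LIST_HEAD(list);
 *
 *	rtnl_lock();
 *	unregister_netdevice_queue(a, &list);
 *	unregister_netdevice_queue(b, &list);
 *	unregister_netdevice_many(&list);
 *	rtnl_unlock();
 */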
6168
6169 /**
6170 * unregister_netdev - remove device from the kernel
6171 * @dev: device
6172 *
6173 * This function shuts down a device interface and removes it
6174 * from the kernel tables.
6175 *
6176 * This is just a wrapper for unregister_netdevice that takes
6177 * the rtnl semaphore. In general you want to use this and not
6178 * unregister_netdevice.
6179 */
6180 void unregister_netdev(struct net_device *dev)
6181 {
6182 rtnl_lock();
6183 unregister_netdevice(dev);
6184 rtnl_unlock();
6185 }
6186 EXPORT_SYMBOL(unregister_netdev);
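
/*
 * Usage sketch (illustrative only): a typical module exit path for a single
 * device uses unregister_netdev(), which takes the rtnl lock itself,
 * followed by free_netdev(); "foo_dev" and foo_exit() are hypothetical.
 *
 *	static void __exit foo_exit(void)
 *	{
 *		unregister_netdev(foo_dev);
 *		free_netdev(foo_dev);
 *	}
 */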
6187
6188 /**
6189 * dev_change_net_namespace - move device to a different network namespace
6190 * @dev: device
6191 * @net: network namespace
6192 * @pat: If not NULL name pattern to try if the current device name
6193 * is already taken in the destination network namespace.
6194 *
6195 * This function shuts down a device interface and moves it
6196 * to a new network namespace. On success 0 is returned, on
6197 * failure a negative errno code is returned.
6198 *
6199 * Callers must hold the rtnl semaphore.
6200 */
6201
6202 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6203 {
6204 int err;
6205
6206 ASSERT_RTNL();
6207
6208 /* Don't allow namespace local devices to be moved. */
6209 err = -EINVAL;
6210 if (dev->features & NETIF_F_NETNS_LOCAL)
6211 goto out;
6212
6213 /* Ensure the device has been registered */
6214 err = -EINVAL;
6215 if (dev->reg_state != NETREG_REGISTERED)
6216 goto out;
6217
6218 /* Get out if there is nothing to do */
6219 err = 0;
6220 if (net_eq(dev_net(dev), net))
6221 goto out;
6222
6223 /* Pick the destination device name, and ensure
6224 * we can use it in the destination network namespace.
6225 */
6226 err = -EEXIST;
6227 if (__dev_get_by_name(net, dev->name)) {
6228 /* We get here if we can't use the current device name */
6229 if (!pat)
6230 goto out;
6231 if (dev_get_valid_name(dev, pat) < 0)
6232 goto out;
6233 }
6234
6235 /*
6236 * And now a mini version of register_netdevice and unregister_netdevice.
6237 */
6238
6239 /* If device is running close it first. */
6240 dev_close(dev);
6241
6242 /* And unlink it from device chain */
6243 err = -ENODEV;
6244 unlist_netdevice(dev);
6245
6246 synchronize_net();
6247
6248 /* Shutdown queueing discipline. */
6249 dev_shutdown(dev);
6250
6251 /* Notify protocols that we are about to destroy
6252 this device. They should clean up all of their state.
6253 
6254 Note that dev->reg_state stays at NETREG_REGISTERED.
6255 This is intentional: it lets 8021q and macvlan know
6256 the device is just moving, so they can keep their slaves up.
6257 */
6258 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6259 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6260 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6261
6262 /*
6263 * Flush the unicast and multicast chains
6264 */
6265 dev_uc_flush(dev);
6266 dev_mc_flush(dev);
6267
6268 /* Actually switch the network namespace */
6269 dev_net_set(dev, net);
6270
6271 /* If there is an ifindex conflict assign a new one */
6272 if (__dev_get_by_index(net, dev->ifindex)) {
6273 int iflink = (dev->iflink == dev->ifindex);
6274 dev->ifindex = dev_new_index(net);
6275 if (iflink)
6276 dev->iflink = dev->ifindex;
6277 }
6278
6279 /* Fixup kobjects */
6280 err = device_rename(&dev->dev, dev->name);
6281 WARN_ON(err);
6282
6283 /* Add the device back in the hashes */
6284 list_netdevice(dev);
6285
6286 /* Notify protocols that a new device appeared. */
6287 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6288
6289 /*
6290 * Prevent userspace races by waiting until the network
6291 * device is fully setup before sending notifications.
6292 */
6293 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6294
6295 synchronize_net();
6296 err = 0;
6297 out:
6298 return err;
6299 }
6300 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
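
/*
 * Usage sketch (illustrative only): moving a device into another network
 * namespace must be done under rtnl_lock(); the "eth%d" pattern is only
 * used if the current name clashes in the target namespace. "net" is a
 * hypothetical struct net the caller already holds a reference on.
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, net, "eth%d");
 *	rtnl_unlock();
 */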
6301
6302 static int dev_cpu_callback(struct notifier_block *nfb,
6303 unsigned long action,
6304 void *ocpu)
6305 {
6306 struct sk_buff **list_skb;
6307 struct sk_buff *skb;
6308 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6309 struct softnet_data *sd, *oldsd;
6310
6311 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6312 return NOTIFY_OK;
6313
6314 local_irq_disable();
6315 cpu = smp_processor_id();
6316 sd = &per_cpu(softnet_data, cpu);
6317 oldsd = &per_cpu(softnet_data, oldcpu);
6318
6319 /* Find end of our completion_queue. */
6320 list_skb = &sd->completion_queue;
6321 while (*list_skb)
6322 list_skb = &(*list_skb)->next;
6323 /* Append completion queue from offline CPU. */
6324 *list_skb = oldsd->completion_queue;
6325 oldsd->completion_queue = NULL;
6326
6327 /* Append output queue from offline CPU. */
6328 if (oldsd->output_queue) {
6329 *sd->output_queue_tailp = oldsd->output_queue;
6330 sd->output_queue_tailp = oldsd->output_queue_tailp;
6331 oldsd->output_queue = NULL;
6332 oldsd->output_queue_tailp = &oldsd->output_queue;
6333 }
6334 /* Append NAPI poll list from offline CPU. */
6335 if (!list_empty(&oldsd->poll_list)) {
6336 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6337 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6338 }
6339
6340 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6341 local_irq_enable();
6342
6343 /* Process offline CPU's input_pkt_queue */
6344 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6345 netif_rx(skb);
6346 input_queue_head_incr(oldsd);
6347 }
6348 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6349 netif_rx(skb);
6350 input_queue_head_incr(oldsd);
6351 }
6352
6353 return NOTIFY_OK;
6354 }
6355
6356
6357 /**
6358 * netdev_increment_features - increment feature set by one
6359 * @all: current feature set
6360 * @one: new feature set
6361 * @mask: mask feature set
6362 *
6363 * Computes a new feature set after adding a device with feature set
6364 * @one to the master device with current feature set @all. Will not
6365 * enable anything that is off in @mask. Returns the new feature set.
6366 */
6367 netdev_features_t netdev_increment_features(netdev_features_t all,
6368 netdev_features_t one, netdev_features_t mask)
6369 {
6370 if (mask & NETIF_F_GEN_CSUM)
6371 mask |= NETIF_F_ALL_CSUM;
6372 mask |= NETIF_F_VLAN_CHALLENGED;
6373
6374 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6375 all &= one | ~NETIF_F_ALL_FOR_ALL;
6376
6377 /* If one device supports hw checksumming, set for all. */
6378 if (all & NETIF_F_GEN_CSUM)
6379 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6380
6381 return all;
6382 }
6383 EXPORT_SYMBOL(netdev_increment_features);
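
/*
 * Usage sketch (illustrative only): an aggregating driver such as bonding
 * can fold each slave's feature set into the master's; "features", "mask"
 * and the slave list below are hypothetical locals of such a driver.
 *
 *	netdev_features_t features = mask;
 *
 *	list_for_each_entry(slave, &master->slave_list, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     mask);
 */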
6384
6385 static struct hlist_head *netdev_create_hash(void)
6386 {
6387 int i;
6388 struct hlist_head *hash;
6389
6390 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6391 if (hash != NULL)
6392 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6393 INIT_HLIST_HEAD(&hash[i]);
6394
6395 return hash;
6396 }
6397
6398 /* Initialize per network namespace state */
6399 static int __net_init netdev_init(struct net *net)
6400 {
6401 if (net != &init_net)
6402 INIT_LIST_HEAD(&net->dev_base_head);
6403
6404 net->dev_name_head = netdev_create_hash();
6405 if (net->dev_name_head == NULL)
6406 goto err_name;
6407
6408 net->dev_index_head = netdev_create_hash();
6409 if (net->dev_index_head == NULL)
6410 goto err_idx;
6411
6412 return 0;
6413
6414 err_idx:
6415 kfree(net->dev_name_head);
6416 err_name:
6417 return -ENOMEM;
6418 }
6419
6420 /**
6421 * netdev_drivername - network driver for the device
6422 * @dev: network device
6423 *
6424 * Determine network driver for device.
6425 */
6426 const char *netdev_drivername(const struct net_device *dev)
6427 {
6428 const struct device_driver *driver;
6429 const struct device *parent;
6430 const char *empty = "";
6431
6432 parent = dev->dev.parent;
6433 if (!parent)
6434 return empty;
6435
6436 driver = parent->driver;
6437 if (driver && driver->name)
6438 return driver->name;
6439 return empty;
6440 }
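
/*
 * Usage sketch (illustrative only): netdev_drivername() never returns NULL,
 * so it can be fed straight into a format string, e.g. from a watchdog or
 * debug path.
 *
 *	pr_warn("%s: driver %s stopped responding\n",
 *		dev->name, netdev_drivername(dev));
 */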
6441
6442 int __netdev_printk(const char *level, const struct net_device *dev,
6443 struct va_format *vaf)
6444 {
6445 int r;
6446
6447 if (dev && dev->dev.parent)
6448 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6449 netdev_name(dev), vaf);
6450 else if (dev)
6451 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6452 else
6453 r = printk("%s(NULL net_device): %pV", level, vaf);
6454
6455 return r;
6456 }
6457 EXPORT_SYMBOL(__netdev_printk);
6458
6459 int netdev_printk(const char *level, const struct net_device *dev,
6460 const char *format, ...)
6461 {
6462 struct va_format vaf;
6463 va_list args;
6464 int r;
6465
6466 va_start(args, format);
6467
6468 vaf.fmt = format;
6469 vaf.va = &args;
6470
6471 r = __netdev_printk(level, dev, &vaf);
6472 va_end(args);
6473
6474 return r;
6475 }
6476 EXPORT_SYMBOL(netdev_printk);
6477
6478 #define define_netdev_printk_level(func, level) \
6479 int func(const struct net_device *dev, const char *fmt, ...) \
6480 { \
6481 int r; \
6482 struct va_format vaf; \
6483 va_list args; \
6484 \
6485 va_start(args, fmt); \
6486 \
6487 vaf.fmt = fmt; \
6488 vaf.va = &args; \
6489 \
6490 r = __netdev_printk(level, dev, &vaf); \
6491 va_end(args); \
6492 \
6493 return r; \
6494 } \
6495 EXPORT_SYMBOL(func);
6496
6497 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6498 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6499 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6500 define_netdev_printk_level(netdev_err, KERN_ERR);
6501 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6502 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6503 define_netdev_printk_level(netdev_info, KERN_INFO);
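
/*
 * Usage sketch (illustrative only): the generated helpers take a struct
 * net_device and a printf-style format, prefixing the message with the
 * device name (and the parent bus device when available); "err" and
 * "speed" below are hypothetical locals.
 *
 *	netdev_err(dev, "failed to map TX ring: %d\n", err);
 *	netdev_info(dev, "link up, %u Mbps\n", speed);
 */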
6504
6505 static void __net_exit netdev_exit(struct net *net)
6506 {
6507 kfree(net->dev_name_head);
6508 kfree(net->dev_index_head);
6509 }
6510
6511 static struct pernet_operations __net_initdata netdev_net_ops = {
6512 .init = netdev_init,
6513 .exit = netdev_exit,
6514 };
6515
6516 static void __net_exit default_device_exit(struct net *net)
6517 {
6518 struct net_device *dev, *aux;
6519 /*
6520 * Push all migratable network devices back to the
6521 * initial network namespace
6522 */
6523 rtnl_lock();
6524 for_each_netdev_safe(net, dev, aux) {
6525 int err;
6526 char fb_name[IFNAMSIZ];
6527
6528 /* Ignore unmovable devices (e.g. loopback) */
6529 if (dev->features & NETIF_F_NETNS_LOCAL)
6530 continue;
6531
6532 /* Leave virtual devices for the generic cleanup */
6533 if (dev->rtnl_link_ops)
6534 continue;
6535
6536 /* Push remaining network devices to init_net */
6537 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6538 err = dev_change_net_namespace(dev, &init_net, fb_name);
6539 if (err) {
6540 pr_emerg("%s: failed to move %s to init_net: %d\n",
6541 __func__, dev->name, err);
6542 BUG();
6543 }
6544 }
6545 rtnl_unlock();
6546 }
6547
6548 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6549 {
6550 /* At exit, all network devices must be removed from a network
6551 * namespace. Do this in the reverse order of registration.
6552 * Do this across as many network namespaces as possible to
6553 * improve batching efficiency.
6554 */
6555 struct net_device *dev;
6556 struct net *net;
6557 LIST_HEAD(dev_kill_list);
6558
6559 rtnl_lock();
6560 list_for_each_entry(net, net_list, exit_list) {
6561 for_each_netdev_reverse(net, dev) {
6562 if (dev->rtnl_link_ops)
6563 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6564 else
6565 unregister_netdevice_queue(dev, &dev_kill_list);
6566 }
6567 }
6568 unregister_netdevice_many(&dev_kill_list);
6569 list_del(&dev_kill_list);
6570 rtnl_unlock();
6571 }
6572
6573 static struct pernet_operations __net_initdata default_device_ops = {
6574 .exit = default_device_exit,
6575 .exit_batch = default_device_exit_batch,
6576 };
6577
6578 /*
6579 * Initialize the DEV module. At boot time this walks the device list and
6580 * unhooks any devices that fail to initialise (normally hardware not
6581 * present) and leaves us with a valid list of present and active devices.
6582 *
6583 */
6584
6585 /*
6586 * This is called single threaded during boot, so no need
6587 * to take the rtnl semaphore.
6588 */
6589 static int __init net_dev_init(void)
6590 {
6591 int i, rc = -ENOMEM;
6592
6593 BUG_ON(!dev_boot_phase);
6594
6595 if (dev_proc_init())
6596 goto out;
6597
6598 if (netdev_kobject_init())
6599 goto out;
6600
6601 INIT_LIST_HEAD(&ptype_all);
6602 for (i = 0; i < PTYPE_HASH_SIZE; i++)
6603 INIT_LIST_HEAD(&ptype_base[i]);
6604
6605 if (register_pernet_subsys(&netdev_net_ops))
6606 goto out;
6607
6608 /*
6609 * Initialise the packet receive queues.
6610 */
6611
6612 for_each_possible_cpu(i) {
6613 struct softnet_data *sd = &per_cpu(softnet_data, i);
6614
6615 memset(sd, 0, sizeof(*sd));
6616 skb_queue_head_init(&sd->input_pkt_queue);
6617 skb_queue_head_init(&sd->process_queue);
6618 sd->completion_queue = NULL;
6619 INIT_LIST_HEAD(&sd->poll_list);
6620 sd->output_queue = NULL;
6621 sd->output_queue_tailp = &sd->output_queue;
6622 #ifdef CONFIG_RPS
6623 sd->csd.func = rps_trigger_softirq;
6624 sd->csd.info = sd;
6625 sd->csd.flags = 0;
6626 sd->cpu = i;
6627 #endif
6628
6629 sd->backlog.poll = process_backlog;
6630 sd->backlog.weight = weight_p;
6631 sd->backlog.gro_list = NULL;
6632 sd->backlog.gro_count = 0;
6633 }
6634
6635 dev_boot_phase = 0;
6636
6637 /* The loopback device is special: if any other network device
6638 * is present in a network namespace, the loopback device must
6639 * be present. Since we now dynamically allocate and free the
6640 * loopback device, ensure this invariant is maintained by
6641 * keeping the loopback device as the first device on the
6642 * list of network devices, so that the loopback device
6643 * is the first device that appears and the last network device
6644 * that disappears.
6645 */
6646 if (register_pernet_device(&loopback_net_ops))
6647 goto out;
6648
6649 if (register_pernet_device(&default_device_ops))
6650 goto out;
6651
6652 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6653 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6654
6655 hotcpu_notifier(dev_cpu_callback, 0);
6656 dst_init();
6657 dev_mcast_init();
6658 rc = 0;
6659 out:
6660 return rc;
6661 }
6662
6663 subsys_initcall(net_dev_init);
6664
6665 static int __init initialize_hashrnd(void)
6666 {
6667 get_random_bytes(&hashrnd, sizeof(hashrnd));
6668 return 0;
6669 }
6670
6671 late_initcall_sync(initialize_hashrnd);
6672