1 /*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/hash.h>
83 #include <linux/slab.h>
84 #include <linux/sched.h>
85 #include <linux/mutex.h>
86 #include <linux/string.h>
87 #include <linux/mm.h>
88 #include <linux/socket.h>
89 #include <linux/sockios.h>
90 #include <linux/errno.h>
91 #include <linux/interrupt.h>
92 #include <linux/if_ether.h>
93 #include <linux/netdevice.h>
94 #include <linux/etherdevice.h>
95 #include <linux/ethtool.h>
96 #include <linux/notifier.h>
97 #include <linux/skbuff.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/stat.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/kmod.h>
111 #include <linux/module.h>
112 #include <linux/netpoll.h>
113 #include <linux/rcupdate.h>
114 #include <linux/delay.h>
115 #include <net/wext.h>
116 #include <net/iw_handler.h>
117 #include <asm/current.h>
118 #include <linux/audit.h>
119 #include <linux/dmaengine.h>
120 #include <linux/err.h>
121 #include <linux/ctype.h>
122 #include <linux/if_arp.h>
123 #include <linux/if_vlan.h>
124 #include <linux/ip.h>
125 #include <net/ip.h>
126 #include <linux/ipv6.h>
127 #include <linux/in.h>
128 #include <linux/jhash.h>
129 #include <linux/random.h>
130 #include <trace/events/napi.h>
131 #include <linux/pci.h>
132 #include <linux/inetdevice.h>
133
134 #include "net-sysfs.h"
135
136 /* Instead of increasing this, you should create a hash table. */
137 #define MAX_GRO_SKBS 8
138
139 /* This should be increased if a protocol with a bigger head is added. */
140 #define GRO_MAX_HEAD (MAX_HEADER + 128)
141
142 /*
143 * The list of packet types we will receive (as opposed to discard)
144 * and the routines to invoke.
145 *
 146  *      Why 16? Because with 16 the only overlap we get on a hash of the
147 * low nibble of the protocol value is RARP/SNAP/X.25.
148 *
149 * NOTE: That is no longer true with the addition of VLAN tags. Not
150 * sure which should go first, but I bet it won't make much
151 * difference if we are running VLANs. The good news is that
152 * this protocol won't be in the list unless compiled in, so
153 * the average user (w/out VLANs) will not be adversely affected.
154 * --BLG
155 *
156 * 0800 IP
157 * 8100 802.1Q VLAN
158 * 0001 802.3
159 * 0002 AX.25
160 * 0004 802.2
161 * 8035 RARP
162 * 0005 SNAP
163 * 0805 X.25
164 * 0806 ARP
165 * 8137 IPX
166 * 0009 Localtalk
167 * 86DD IPv6
168 */
169
170 #define PTYPE_HASH_SIZE (16)
171 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
172
173 static DEFINE_SPINLOCK(ptype_lock);
174 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
175 static struct list_head ptype_all __read_mostly; /* Taps */
176
177 /*
178 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
179 * semaphore.
180 *
181 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
182 *
183 * Writers must hold the rtnl semaphore while they loop through the
184 * dev_base_head list, and hold dev_base_lock for writing when they do the
185 * actual updates. This allows pure readers to access the list even
186 * while a writer is preparing to update it.
187 *
188 * To put it another way, dev_base_lock is held for writing only to
189 * protect against pure readers; the rtnl semaphore provides the
190 * protection against other writers.
191 *
192 * See, for example usages, register_netdevice() and
193 * unregister_netdevice(), which must be called with the rtnl
194 * semaphore held.
195 */
196 DEFINE_RWLOCK(dev_base_lock);
197 EXPORT_SYMBOL(dev_base_lock);
198
199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
200 {
201 unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
202 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
203 }
204
205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206 {
207 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
208 }
209
210 static inline void rps_lock(struct softnet_data *sd)
211 {
212 #ifdef CONFIG_RPS
213 spin_lock(&sd->input_pkt_queue.lock);
214 #endif
215 }
216
217 static inline void rps_unlock(struct softnet_data *sd)
218 {
219 #ifdef CONFIG_RPS
220 spin_unlock(&sd->input_pkt_queue.lock);
221 #endif
222 }
223
224 /* Device list insertion */
225 static int list_netdevice(struct net_device *dev)
226 {
227 struct net *net = dev_net(dev);
228
229 ASSERT_RTNL();
230
231 write_lock_bh(&dev_base_lock);
232 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
233 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
234 hlist_add_head_rcu(&dev->index_hlist,
235 dev_index_hash(net, dev->ifindex));
236 write_unlock_bh(&dev_base_lock);
237 return 0;
238 }
239
240 /* Device list removal
241 * caller must respect a RCU grace period before freeing/reusing dev
242 */
243 static void unlist_netdevice(struct net_device *dev)
244 {
245 ASSERT_RTNL();
246
247 /* Unlink dev from the device chain */
248 write_lock_bh(&dev_base_lock);
249 list_del_rcu(&dev->dev_list);
250 hlist_del_rcu(&dev->name_hlist);
251 hlist_del_rcu(&dev->index_hlist);
252 write_unlock_bh(&dev_base_lock);
253 }
254
255 /*
256 * Our notifier list
257 */
258
259 static RAW_NOTIFIER_HEAD(netdev_chain);
260
261 /*
262 * Device drivers call our routines to queue packets here. We empty the
263 * queue in the local softnet handler.
264 */
265
266 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
267 EXPORT_PER_CPU_SYMBOL(softnet_data);
268
269 #ifdef CONFIG_LOCKDEP
270 /*
271 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
272 * according to dev->type
273 */
274 static const unsigned short netdev_lock_type[] =
275 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
276 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
277 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
278 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
279 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
280 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
281 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
282 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
283 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
284 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
285 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
286 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
287 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
288 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
289 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
290 ARPHRD_VOID, ARPHRD_NONE};
291
292 static const char *const netdev_lock_name[] =
293 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
294 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
295 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
296 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
297 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
298 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
299 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
300 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
301 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
302 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
303 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
304 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
305 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
306 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
307 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
308 "_xmit_VOID", "_xmit_NONE"};
309
310 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
311 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
312
313 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
314 {
315 int i;
316
317 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
318 if (netdev_lock_type[i] == dev_type)
319 return i;
320 /* the last key is used by default */
321 return ARRAY_SIZE(netdev_lock_type) - 1;
322 }
323
324 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
325 unsigned short dev_type)
326 {
327 int i;
328
329 i = netdev_lock_pos(dev_type);
330 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
331 netdev_lock_name[i]);
332 }
333
334 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
335 {
336 int i;
337
338 i = netdev_lock_pos(dev->type);
339 lockdep_set_class_and_name(&dev->addr_list_lock,
340 &netdev_addr_lock_key[i],
341 netdev_lock_name[i]);
342 }
343 #else
344 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
345 unsigned short dev_type)
346 {
347 }
348 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
349 {
350 }
351 #endif
352
353 /*******************************************************************************
354
355 Protocol management and registration routines
356
357 *******************************************************************************/
358
359 /*
360 * Add a protocol ID to the list. Now that the input handler is
361 * smarter we can dispense with all the messy stuff that used to be
362 * here.
363 *
 364  *      BEWARE!!! Protocol handlers that mangle input packets
 365  *      MUST BE last in the hash buckets, and the walk over protocol handlers
 366  *      MUST start from the promiscuous ptype_all chain in net_bh.
 367  *      That is true today; do not change it.
 368  *      Explanation: if a packet-mangling protocol handler were first in
 369  *      the list, it could not tell that the packet is cloned and must be
 370  *      copied-on-write, so it would modify it in place and subsequent
 371  *      readers would get a broken packet.
372 * --ANK (980803)
373 */
374
375 static inline struct list_head *ptype_head(const struct packet_type *pt)
376 {
377 if (pt->type == htons(ETH_P_ALL))
378 return &ptype_all;
379 else
380 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
381 }
382
383 /**
384 * dev_add_pack - add packet handler
385 * @pt: packet type declaration
386 *
387 * Add a protocol handler to the networking stack. The passed &packet_type
388 * is linked into kernel lists and may not be freed until it has been
389 * removed from the kernel lists.
390 *
 391  *      This call does not sleep, therefore it cannot
 392  *      guarantee that CPUs that are in the middle of receiving packets
 393  *      will see the new packet type (until the next packet is received).
394 */
395
396 void dev_add_pack(struct packet_type *pt)
397 {
398 struct list_head *head = ptype_head(pt);
399
400 spin_lock(&ptype_lock);
401 list_add_rcu(&pt->list, head);
402 spin_unlock(&ptype_lock);
403 }
404 EXPORT_SYMBOL(dev_add_pack);
405
406 /**
407 * __dev_remove_pack - remove packet handler
408 * @pt: packet type declaration
409 *
410 * Remove a protocol handler that was previously added to the kernel
411 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
412 * from the kernel lists and can be freed or reused once this function
413 * returns.
414 *
415 * The packet type might still be in use by receivers
 416  *      and must not be freed until after all the CPUs have gone
417 * through a quiescent state.
418 */
419 void __dev_remove_pack(struct packet_type *pt)
420 {
421 struct list_head *head = ptype_head(pt);
422 struct packet_type *pt1;
423
424 spin_lock(&ptype_lock);
425
426 list_for_each_entry(pt1, head, list) {
427 if (pt == pt1) {
428 list_del_rcu(&pt->list);
429 goto out;
430 }
431 }
432
433 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
434 out:
435 spin_unlock(&ptype_lock);
436 }
437 EXPORT_SYMBOL(__dev_remove_pack);
438
439 /**
440 * dev_remove_pack - remove packet handler
441 * @pt: packet type declaration
442 *
443 * Remove a protocol handler that was previously added to the kernel
444 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
445 * from the kernel lists and can be freed or reused once this function
446 * returns.
447 *
448 * This call sleeps to guarantee that no CPU is looking at the packet
449 * type after return.
450 */
451 void dev_remove_pack(struct packet_type *pt)
452 {
453 __dev_remove_pack(pt);
454
455 synchronize_net();
456 }
457 EXPORT_SYMBOL(dev_remove_pack);
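/*
 * Illustrative sketch (not part of dev.c): a minimal tap that registers a
 * packet_type for every protocol with dev_add_pack() and later removes it
 * with dev_remove_pack(). The example_tap_rcv/example_tap names and the
 * module-init placement are assumptions for the example only.
 */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* We get a clone of every frame; inspect it here, then free it. */
	kfree_skb(skb);
	return 0;
}

static struct packet_type example_tap __read_mostly = {
	.type	= cpu_to_be16(ETH_P_ALL),	/* lands on the ptype_all list */
	.func	= example_tap_rcv,
};

/* Typically from module init/exit:
 *	dev_add_pack(&example_tap);
 *	...
 *	dev_remove_pack(&example_tap);	sleeps until no CPU still uses the entry
 */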
458
459 /******************************************************************************
460
461 Device Boot-time Settings Routines
462
463 *******************************************************************************/
464
465 /* Boot time configuration table */
466 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
467
468 /**
469 * netdev_boot_setup_add - add new setup entry
470 * @name: name of the device
471 * @map: configured settings for the device
472 *
 473  *      Adds a new setup entry to the dev_boot_setup list. The function
 474  *      returns 0 on error and 1 on success. This is a generic routine for
 475  *      all netdevices.
476 */
477 static int netdev_boot_setup_add(char *name, struct ifmap *map)
478 {
479 struct netdev_boot_setup *s;
480 int i;
481
482 s = dev_boot_setup;
483 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
484 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
485 memset(s[i].name, 0, sizeof(s[i].name));
486 strlcpy(s[i].name, name, IFNAMSIZ);
487 memcpy(&s[i].map, map, sizeof(s[i].map));
488 break;
489 }
490 }
491
492 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
493 }
494
495 /**
496 * netdev_boot_setup_check - check boot time settings
497 * @dev: the netdevice
498 *
499 * Check boot time settings for the device.
 500  *      Any settings found are applied to the device so they can be used
 501  *      later during device probing.
 502  *      Returns 0 if no settings are found, 1 if they are.
503 */
504 int netdev_boot_setup_check(struct net_device *dev)
505 {
506 struct netdev_boot_setup *s = dev_boot_setup;
507 int i;
508
509 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
510 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
511 !strcmp(dev->name, s[i].name)) {
512 dev->irq = s[i].map.irq;
513 dev->base_addr = s[i].map.base_addr;
514 dev->mem_start = s[i].map.mem_start;
515 dev->mem_end = s[i].map.mem_end;
516 return 1;
517 }
518 }
519 return 0;
520 }
521 EXPORT_SYMBOL(netdev_boot_setup_check);
522
523
524 /**
525 * netdev_boot_base - get address from boot time settings
526 * @prefix: prefix for network device
527 * @unit: id for network device
528 *
 529  *      Check boot time settings for the base address of the device.
 530  *      Any settings found are applied to the device so they can be used
 531  *      later during device probing.
 532  *      Returns 0 if no settings are found.
533 */
534 unsigned long netdev_boot_base(const char *prefix, int unit)
535 {
536 const struct netdev_boot_setup *s = dev_boot_setup;
537 char name[IFNAMSIZ];
538 int i;
539
540 sprintf(name, "%s%d", prefix, unit);
541
542 /*
543 * If device already registered then return base of 1
544 * to indicate not to probe for this interface
545 */
546 if (__dev_get_by_name(&init_net, name))
547 return 1;
548
549 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
550 if (!strcmp(name, s[i].name))
551 return s[i].map.base_addr;
552 return 0;
553 }
554
555 /*
 556  * Saves the settings configured at boot time for any netdevice.
557 */
558 int __init netdev_boot_setup(char *str)
559 {
560 int ints[5];
561 struct ifmap map;
562
563 str = get_options(str, ARRAY_SIZE(ints), ints);
564 if (!str || !*str)
565 return 0;
566
567 /* Save settings */
568 memset(&map, 0, sizeof(map));
569 if (ints[0] > 0)
570 map.irq = ints[1];
571 if (ints[0] > 1)
572 map.base_addr = ints[2];
573 if (ints[0] > 2)
574 map.mem_start = ints[3];
575 if (ints[0] > 3)
576 map.mem_end = ints[4];
577
578 /* Add new entry to the list */
579 return netdev_boot_setup_add(str, &map);
580 }
581
582 __setup("netdev=", netdev_boot_setup);
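/*
 * Example (illustrative values only): a boot command line entry that the
 * parser above would accept, pre-loading the IRQ, I/O base and a memory
 * window for a device named eth0 before any driver probes it:
 *
 *	netdev=9,0x300,0xd0000,0xd4000,eth0
 *
 * The four leading integers fill ifmap.irq, .base_addr, .mem_start and
 * .mem_end in that order; the remaining string becomes the entry name.
 */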
583
584 /*******************************************************************************
585
586 Device Interface Subroutines
587
588 *******************************************************************************/
589
590 /**
591 * __dev_get_by_name - find a device by its name
592 * @net: the applicable net namespace
593 * @name: name to find
594 *
595 * Find an interface by name. Must be called under RTNL semaphore
596 * or @dev_base_lock. If the name is found a pointer to the device
597 * is returned. If the name is not found then %NULL is returned. The
598 * reference counters are not incremented so the caller must be
599 * careful with locks.
600 */
601
602 struct net_device *__dev_get_by_name(struct net *net, const char *name)
603 {
604 struct hlist_node *p;
605 struct net_device *dev;
606 struct hlist_head *head = dev_name_hash(net, name);
607
608 hlist_for_each_entry(dev, p, head, name_hlist)
609 if (!strncmp(dev->name, name, IFNAMSIZ))
610 return dev;
611
612 return NULL;
613 }
614 EXPORT_SYMBOL(__dev_get_by_name);
615
616 /**
617 * dev_get_by_name_rcu - find a device by its name
618 * @net: the applicable net namespace
619 * @name: name to find
620 *
621 * Find an interface by name.
622 * If the name is found a pointer to the device is returned.
623 * If the name is not found then %NULL is returned.
624 * The reference counters are not incremented so the caller must be
625 * careful with locks. The caller must hold RCU lock.
626 */
627
628 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
629 {
630 struct hlist_node *p;
631 struct net_device *dev;
632 struct hlist_head *head = dev_name_hash(net, name);
633
634 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
635 if (!strncmp(dev->name, name, IFNAMSIZ))
636 return dev;
637
638 return NULL;
639 }
640 EXPORT_SYMBOL(dev_get_by_name_rcu);
641
642 /**
643 * dev_get_by_name - find a device by its name
644 * @net: the applicable net namespace
645 * @name: name to find
646 *
647 * Find an interface by name. This can be called from any
648 * context and does its own locking. The returned handle has
649 * the usage count incremented and the caller must use dev_put() to
650 * release it when it is no longer needed. %NULL is returned if no
651 * matching device is found.
652 */
653
654 struct net_device *dev_get_by_name(struct net *net, const char *name)
655 {
656 struct net_device *dev;
657
658 rcu_read_lock();
659 dev = dev_get_by_name_rcu(net, name);
660 if (dev)
661 dev_hold(dev);
662 rcu_read_unlock();
663 return dev;
664 }
665 EXPORT_SYMBOL(dev_get_by_name);
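/*
 * Illustrative sketch (not part of dev.c): typical process-context use of
 * dev_get_by_name(), pairing the implicit dev_hold() with dev_put(). The
 * helper name and the "eth0" device name are assumptions for the example.
 */
static void example_report_mtu(struct net *net)
{
	struct net_device *dev;

	dev = dev_get_by_name(net, "eth0");
	if (!dev)
		return;
	printk(KERN_INFO "%s: mtu %u\n", dev->name, dev->mtu);
	dev_put(dev);		/* release the reference taken above */
}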
666
667 /**
668 * __dev_get_by_index - find a device by its ifindex
669 * @net: the applicable net namespace
670 * @ifindex: index of device
671 *
672 * Search for an interface by index. Returns %NULL if the device
673 * is not found or a pointer to the device. The device has not
674 * had its reference counter increased so the caller must be careful
675 * about locking. The caller must hold either the RTNL semaphore
676 * or @dev_base_lock.
677 */
678
679 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
680 {
681 struct hlist_node *p;
682 struct net_device *dev;
683 struct hlist_head *head = dev_index_hash(net, ifindex);
684
685 hlist_for_each_entry(dev, p, head, index_hlist)
686 if (dev->ifindex == ifindex)
687 return dev;
688
689 return NULL;
690 }
691 EXPORT_SYMBOL(__dev_get_by_index);
692
693 /**
694 * dev_get_by_index_rcu - find a device by its ifindex
695 * @net: the applicable net namespace
696 * @ifindex: index of device
697 *
698 * Search for an interface by index. Returns %NULL if the device
699 * is not found or a pointer to the device. The device has not
700 * had its reference counter increased so the caller must be careful
701 * about locking. The caller must hold RCU lock.
702 */
703
704 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
705 {
706 struct hlist_node *p;
707 struct net_device *dev;
708 struct hlist_head *head = dev_index_hash(net, ifindex);
709
710 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
711 if (dev->ifindex == ifindex)
712 return dev;
713
714 return NULL;
715 }
716 EXPORT_SYMBOL(dev_get_by_index_rcu);
717
718
719 /**
720 * dev_get_by_index - find a device by its ifindex
721 * @net: the applicable net namespace
722 * @ifindex: index of device
723 *
724 * Search for an interface by index. Returns NULL if the device
725 * is not found or a pointer to the device. The device returned has
726 * had a reference added and the pointer is safe until the user calls
727 * dev_put to indicate they have finished with it.
728 */
729
730 struct net_device *dev_get_by_index(struct net *net, int ifindex)
731 {
732 struct net_device *dev;
733
734 rcu_read_lock();
735 dev = dev_get_by_index_rcu(net, ifindex);
736 if (dev)
737 dev_hold(dev);
738 rcu_read_unlock();
739 return dev;
740 }
741 EXPORT_SYMBOL(dev_get_by_index);
742
743 /**
744 * dev_getbyhwaddr - find a device by its hardware address
745 * @net: the applicable net namespace
746 * @type: media type of device
747 * @ha: hardware address
748 *
749 * Search for an interface by MAC address. Returns NULL if the device
750 * is not found or a pointer to the device. The caller must hold the
751 * rtnl semaphore. The returned device has not had its ref count increased
 752  *      and the caller must therefore be careful about locking.
753 *
754 * BUGS:
755 * If the API was consistent this would be __dev_get_by_hwaddr
756 */
757
758 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
759 {
760 struct net_device *dev;
761
762 ASSERT_RTNL();
763
764 for_each_netdev(net, dev)
765 if (dev->type == type &&
766 !memcmp(dev->dev_addr, ha, dev->addr_len))
767 return dev;
768
769 return NULL;
770 }
771 EXPORT_SYMBOL(dev_getbyhwaddr);
772
773 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
774 {
775 struct net_device *dev;
776
777 ASSERT_RTNL();
778 for_each_netdev(net, dev)
779 if (dev->type == type)
780 return dev;
781
782 return NULL;
783 }
784 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
785
786 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
787 {
788 struct net_device *dev, *ret = NULL;
789
790 rcu_read_lock();
791 for_each_netdev_rcu(net, dev)
792 if (dev->type == type) {
793 dev_hold(dev);
794 ret = dev;
795 break;
796 }
797 rcu_read_unlock();
798 return ret;
799 }
800 EXPORT_SYMBOL(dev_getfirstbyhwtype);
801
802 /**
803 * dev_get_by_flags_rcu - find any device with given flags
804 * @net: the applicable net namespace
805 * @if_flags: IFF_* values
806 * @mask: bitmask of bits in if_flags to check
807 *
808 * Search for any interface with the given flags. Returns NULL if a device
809 * is not found or a pointer to the device. Must be called inside
810 * rcu_read_lock(), and result refcount is unchanged.
811 */
812
813 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
814 unsigned short mask)
815 {
816 struct net_device *dev, *ret;
817
818 ret = NULL;
819 for_each_netdev_rcu(net, dev) {
820 if (((dev->flags ^ if_flags) & mask) == 0) {
821 ret = dev;
822 break;
823 }
824 }
825 return ret;
826 }
827 EXPORT_SYMBOL(dev_get_by_flags_rcu);
828
829 /**
830 * dev_valid_name - check if name is okay for network device
831 * @name: name string
832 *
 833  *      Network device names need to be valid file names
 834  *      to allow sysfs to work.  We also disallow any kind of
835 * whitespace.
836 */
837 int dev_valid_name(const char *name)
838 {
839 if (*name == '\0')
840 return 0;
841 if (strlen(name) >= IFNAMSIZ)
842 return 0;
843 if (!strcmp(name, ".") || !strcmp(name, ".."))
844 return 0;
845
846 while (*name) {
847 if (*name == '/' || isspace(*name))
848 return 0;
849 name++;
850 }
851 return 1;
852 }
853 EXPORT_SYMBOL(dev_valid_name);
854
855 /**
856 * __dev_alloc_name - allocate a name for a device
857 * @net: network namespace to allocate the device name in
858 * @name: name format string
859 * @buf: scratch buffer and result name string
860 *
 861  *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
 862  *      id. It scans the list of devices to build up a free map, then chooses
863 * the first empty slot. The caller must hold the dev_base or rtnl lock
864 * while allocating the name and adding the device in order to avoid
865 * duplicates.
866 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
867 * Returns the number of the unit assigned or a negative errno code.
868 */
869
870 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
871 {
872 int i = 0;
873 const char *p;
874 const int max_netdevices = 8*PAGE_SIZE;
875 unsigned long *inuse;
876 struct net_device *d;
877
878 p = strnchr(name, IFNAMSIZ-1, '%');
879 if (p) {
880 /*
881 * Verify the string as this thing may have come from
882 * the user. There must be either one "%d" and no other "%"
883 * characters.
884 */
885 if (p[1] != 'd' || strchr(p + 2, '%'))
886 return -EINVAL;
887
888 /* Use one page as a bit array of possible slots */
889 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
890 if (!inuse)
891 return -ENOMEM;
892
893 for_each_netdev(net, d) {
894 if (!sscanf(d->name, name, &i))
895 continue;
896 if (i < 0 || i >= max_netdevices)
897 continue;
898
899 /* avoid cases where sscanf is not exact inverse of printf */
900 snprintf(buf, IFNAMSIZ, name, i);
901 if (!strncmp(buf, d->name, IFNAMSIZ))
902 set_bit(i, inuse);
903 }
904
905 i = find_first_zero_bit(inuse, max_netdevices);
906 free_page((unsigned long) inuse);
907 }
908
909 if (buf != name)
910 snprintf(buf, IFNAMSIZ, name, i);
911 if (!__dev_get_by_name(net, buf))
912 return i;
913
914 /* It is possible to run out of possible slots
915 * when the name is long and there isn't enough space left
916 * for the digits, or if all bits are used.
917 */
918 return -ENFILE;
919 }
920
921 /**
922 * dev_alloc_name - allocate a name for a device
923 * @dev: device
924 * @name: name format string
925 *
 926  *      Passed a format string - e.g. "lt%d" - it will try to find a suitable
 927  *      id. It scans the list of devices to build up a free map, then chooses
928 * the first empty slot. The caller must hold the dev_base or rtnl lock
929 * while allocating the name and adding the device in order to avoid
930 * duplicates.
931 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
932 * Returns the number of the unit assigned or a negative errno code.
933 */
934
935 int dev_alloc_name(struct net_device *dev, const char *name)
936 {
937 char buf[IFNAMSIZ];
938 struct net *net;
939 int ret;
940
941 BUG_ON(!dev_net(dev));
942 net = dev_net(dev);
943 ret = __dev_alloc_name(net, name, buf);
944 if (ret >= 0)
945 strlcpy(dev->name, buf, IFNAMSIZ);
946 return ret;
947 }
948 EXPORT_SYMBOL(dev_alloc_name);
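/*
 * Illustrative sketch (not part of dev.c): a driver requesting the next free
 * "foo%d" name under RTNL before registration. The "foo%d" pattern and the
 * helper name are assumptions for the example only.
 */
static int example_name_device(struct net_device *dev)
{
	int unit;

	ASSERT_RTNL();
	unit = dev_alloc_name(dev, "foo%d");	/* e.g. picks "foo0" */
	if (unit < 0)
		return unit;			/* -EINVAL or -ENFILE */
	return 0;
}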
949
950 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
951 {
952 struct net *net;
953
954 BUG_ON(!dev_net(dev));
955 net = dev_net(dev);
956
957 if (!dev_valid_name(name))
958 return -EINVAL;
959
960 if (fmt && strchr(name, '%'))
961 return dev_alloc_name(dev, name);
962 else if (__dev_get_by_name(net, name))
963 return -EEXIST;
964 else if (dev->name != name)
965 strlcpy(dev->name, name, IFNAMSIZ);
966
967 return 0;
968 }
969
970 /**
971 * dev_change_name - change name of a device
972 * @dev: device
973 * @newname: name (or format string) must be at least IFNAMSIZ
974 *
 975  *      Change the name of a device; format strings such as "eth%d" can be
 976  *      passed for wildcarding.
977 */
978 int dev_change_name(struct net_device *dev, const char *newname)
979 {
980 char oldname[IFNAMSIZ];
981 int err = 0;
982 int ret;
983 struct net *net;
984
985 ASSERT_RTNL();
986 BUG_ON(!dev_net(dev));
987
988 net = dev_net(dev);
989 if (dev->flags & IFF_UP)
990 return -EBUSY;
991
992 if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
993 return 0;
994
995 memcpy(oldname, dev->name, IFNAMSIZ);
996
997 err = dev_get_valid_name(dev, newname, 1);
998 if (err < 0)
999 return err;
1000
1001 rollback:
1002 ret = device_rename(&dev->dev, dev->name);
1003 if (ret) {
1004 memcpy(dev->name, oldname, IFNAMSIZ);
1005 return ret;
1006 }
1007
1008 write_lock_bh(&dev_base_lock);
1009 hlist_del(&dev->name_hlist);
1010 write_unlock_bh(&dev_base_lock);
1011
1012 synchronize_rcu();
1013
1014 write_lock_bh(&dev_base_lock);
1015 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1016 write_unlock_bh(&dev_base_lock);
1017
1018 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1019 ret = notifier_to_errno(ret);
1020
1021 if (ret) {
1022 /* err >= 0 after dev_alloc_name() or stores the first errno */
1023 if (err >= 0) {
1024 err = ret;
1025 memcpy(dev->name, oldname, IFNAMSIZ);
1026 goto rollback;
1027 } else {
1028 printk(KERN_ERR
1029 "%s: name change rollback failed: %d.\n",
1030 dev->name, ret);
1031 }
1032 }
1033
1034 return err;
1035 }
1036
1037 /**
1038 * dev_set_alias - change ifalias of a device
1039 * @dev: device
1040 * @alias: name up to IFALIASZ
1041  *      @len: limit of bytes to copy from @alias
1042  *
1043  *      Set the ifalias for a device.
1044 */
1045 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1046 {
1047 ASSERT_RTNL();
1048
1049 if (len >= IFALIASZ)
1050 return -EINVAL;
1051
1052 if (!len) {
1053 if (dev->ifalias) {
1054 kfree(dev->ifalias);
1055 dev->ifalias = NULL;
1056 }
1057 return 0;
1058 }
1059
1060 dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1061 if (!dev->ifalias)
1062 return -ENOMEM;
1063
1064 strlcpy(dev->ifalias, alias, len+1);
1065 return len;
1066 }
1067
1068
1069 /**
1070 * netdev_features_change - device changes features
1071 * @dev: device to cause notification
1072 *
1073 * Called to indicate a device has changed features.
1074 */
1075 void netdev_features_change(struct net_device *dev)
1076 {
1077 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1078 }
1079 EXPORT_SYMBOL(netdev_features_change);
1080
1081 /**
1082 * netdev_state_change - device changes state
1083 * @dev: device to cause notification
1084 *
1085 * Called to indicate a device has changed state. This function calls
1086 * the notifier chains for netdev_chain and sends a NEWLINK message
1087 * to the routing socket.
1088 */
1089 void netdev_state_change(struct net_device *dev)
1090 {
1091 if (dev->flags & IFF_UP) {
1092 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1093 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1094 }
1095 }
1096 EXPORT_SYMBOL(netdev_state_change);
1097
1098 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1099 {
1100 return call_netdevice_notifiers(event, dev);
1101 }
1102 EXPORT_SYMBOL(netdev_bonding_change);
1103
1104 /**
1105 * dev_load - load a network module
1106 * @net: the applicable net namespace
1107 * @name: name of interface
1108 *
1109 * If a network interface is not present and the process has suitable
1110 * privileges this function loads the module. If module loading is not
1111 * available in this kernel then it becomes a nop.
1112 */
1113
1114 void dev_load(struct net *net, const char *name)
1115 {
1116 struct net_device *dev;
1117
1118 rcu_read_lock();
1119 dev = dev_get_by_name_rcu(net, name);
1120 rcu_read_unlock();
1121
1122 if (!dev && capable(CAP_NET_ADMIN))
1123 request_module("%s", name);
1124 }
1125 EXPORT_SYMBOL(dev_load);
1126
1127 static int __dev_open(struct net_device *dev)
1128 {
1129 const struct net_device_ops *ops = dev->netdev_ops;
1130 int ret;
1131
1132 ASSERT_RTNL();
1133
1134 /*
1135 * Is it even present?
1136 */
1137 if (!netif_device_present(dev))
1138 return -ENODEV;
1139
1140 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1141 ret = notifier_to_errno(ret);
1142 if (ret)
1143 return ret;
1144
1145 /*
1146 * Call device private open method
1147 */
1148 set_bit(__LINK_STATE_START, &dev->state);
1149
1150 if (ops->ndo_validate_addr)
1151 ret = ops->ndo_validate_addr(dev);
1152
1153 if (!ret && ops->ndo_open)
1154 ret = ops->ndo_open(dev);
1155
1156 /*
1157 * If it went open OK then:
1158 */
1159
1160 if (ret)
1161 clear_bit(__LINK_STATE_START, &dev->state);
1162 else {
1163 /*
1164 * Set the flags.
1165 */
1166 dev->flags |= IFF_UP;
1167
1168 /*
1169 * Enable NET_DMA
1170 */
1171 net_dmaengine_get();
1172
1173 /*
1174 * Initialize multicasting status
1175 */
1176 dev_set_rx_mode(dev);
1177
1178 /*
1179 * Wakeup transmit queue engine
1180 */
1181 dev_activate(dev);
1182 }
1183
1184 return ret;
1185 }
1186
1187 /**
1188 * dev_open - prepare an interface for use.
1189 * @dev: device to open
1190 *
1191 * Takes a device from down to up state. The device's private open
1192 * function is invoked and then the multicast lists are loaded. Finally
1193 * the device is moved into the up state and a %NETDEV_UP message is
1194 * sent to the netdev notifier chain.
1195 *
1196 * Calling this function on an active interface is a nop. On a failure
1197 * a negative errno code is returned.
1198 */
1199 int dev_open(struct net_device *dev)
1200 {
1201 int ret;
1202
1203 /*
1204 * Is it already up?
1205 */
1206 if (dev->flags & IFF_UP)
1207 return 0;
1208
1209 /*
1210 * Open device
1211 */
1212 ret = __dev_open(dev);
1213 if (ret < 0)
1214 return ret;
1215
1216 /*
1217 * ... and announce new interface.
1218 */
1219 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1220 call_netdevice_notifiers(NETDEV_UP, dev);
1221
1222 return ret;
1223 }
1224 EXPORT_SYMBOL(dev_open);
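/*
 * Illustrative sketch (not part of dev.c): bringing an interface up from
 * kernel code. dev_open() must run under RTNL because __dev_open() asserts
 * it; the helper name and "eth0" are assumptions for the example.
 */
static int example_bring_up(struct net *net)
{
	struct net_device *dev;
	int err = -ENODEV;

	rtnl_lock();
	dev = __dev_get_by_name(net, "eth0");
	if (dev)
		err = dev_open(dev);
	rtnl_unlock();
	return err;
}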
1225
1226 static int __dev_close(struct net_device *dev)
1227 {
1228 const struct net_device_ops *ops = dev->netdev_ops;
1229
1230 ASSERT_RTNL();
1231 might_sleep();
1232
1233 /*
1234  *      Tell people we are going down, so that they can
1235  *      prepare for it while the device is still operating.
1236 */
1237 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1238
1239 clear_bit(__LINK_STATE_START, &dev->state);
1240
1241          /* Synchronize to the scheduled poll. We cannot touch the poll list;
1242           * it may even be on a different cpu. So just clear netif_running().
1243           *
1244           * dev->stop() will invoke napi_disable() on all of its
1245           * napi_struct instances on this device.
1246 */
1247 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1248
1249 dev_deactivate(dev);
1250
1251 /*
1252 * Call the device specific close. This cannot fail.
1253 * Only if device is UP
1254 *
1255 * We allow it to be called even after a DETACH hot-plug
1256 * event.
1257 */
1258 if (ops->ndo_stop)
1259 ops->ndo_stop(dev);
1260
1261 /*
1262 * Device is now down.
1263 */
1264
1265 dev->flags &= ~IFF_UP;
1266
1267 /*
1268 * Shutdown NET_DMA
1269 */
1270 net_dmaengine_put();
1271
1272 return 0;
1273 }
1274
1275 /**
1276 * dev_close - shutdown an interface.
1277 * @dev: device to shutdown
1278 *
1279 * This function moves an active device into down state. A
1280 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1281 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1282 * chain.
1283 */
1284 int dev_close(struct net_device *dev)
1285 {
1286 if (!(dev->flags & IFF_UP))
1287 return 0;
1288
1289 __dev_close(dev);
1290
1291 /*
1292 * Tell people we are down
1293 */
1294 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1295 call_netdevice_notifiers(NETDEV_DOWN, dev);
1296
1297 return 0;
1298 }
1299 EXPORT_SYMBOL(dev_close);
1300
1301
1302 /**
1303 * dev_disable_lro - disable Large Receive Offload on a device
1304 * @dev: device
1305 *
1306 * Disable Large Receive Offload (LRO) on a net device. Must be
1307 * called under RTNL. This is needed if received packets may be
1308 * forwarded to another interface.
1309 */
1310 void dev_disable_lro(struct net_device *dev)
1311 {
1312 if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1313 dev->ethtool_ops->set_flags) {
1314 u32 flags = dev->ethtool_ops->get_flags(dev);
1315 if (flags & ETH_FLAG_LRO) {
1316 flags &= ~ETH_FLAG_LRO;
1317 dev->ethtool_ops->set_flags(dev, flags);
1318 }
1319 }
1320 WARN_ON(dev->features & NETIF_F_LRO);
1321 }
1322 EXPORT_SYMBOL(dev_disable_lro);
1323
1324
1325 static int dev_boot_phase = 1;
1326
1327 /*
1328 * Device change register/unregister. These are not inline or static
1329 * as we export them to the world.
1330 */
1331
1332 /**
1333 * register_netdevice_notifier - register a network notifier block
1334 * @nb: notifier
1335 *
1336 * Register a notifier to be called when network device events occur.
1337 * The notifier passed is linked into the kernel structures and must
1338 * not be reused until it has been unregistered. A negative errno code
1339 * is returned on a failure.
1340 *
1341  *      When registered, all registration and up events are replayed
1342  *      to the new notifier to give it a race-free
1343  *      view of the network device list.
1344 */
1345
1346 int register_netdevice_notifier(struct notifier_block *nb)
1347 {
1348 struct net_device *dev;
1349 struct net_device *last;
1350 struct net *net;
1351 int err;
1352
1353 rtnl_lock();
1354 err = raw_notifier_chain_register(&netdev_chain, nb);
1355 if (err)
1356 goto unlock;
1357 if (dev_boot_phase)
1358 goto unlock;
1359 for_each_net(net) {
1360 for_each_netdev(net, dev) {
1361 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1362 err = notifier_to_errno(err);
1363 if (err)
1364 goto rollback;
1365
1366 if (!(dev->flags & IFF_UP))
1367 continue;
1368
1369 nb->notifier_call(nb, NETDEV_UP, dev);
1370 }
1371 }
1372
1373 unlock:
1374 rtnl_unlock();
1375 return err;
1376
1377 rollback:
1378 last = dev;
1379 for_each_net(net) {
1380 for_each_netdev(net, dev) {
1381 if (dev == last)
1382 break;
1383
1384 if (dev->flags & IFF_UP) {
1385 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1386 nb->notifier_call(nb, NETDEV_DOWN, dev);
1387 }
1388 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1389 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1390 }
1391 }
1392
1393 raw_notifier_chain_unregister(&netdev_chain, nb);
1394 goto unlock;
1395 }
1396 EXPORT_SYMBOL(register_netdevice_notifier);
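/*
 * Illustrative sketch (not part of dev.c): a notifier block that logs
 * NETDEV_UP/NETDEV_DOWN events. The callback receives the net_device
 * pointer as the opaque argument, exactly as the replay loop above passes
 * it. All example_* names are assumptions.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_UP || event == NETDEV_DOWN)
		printk(KERN_INFO "%s is %s\n", dev->name,
		       event == NETDEV_UP ? "up" : "down");
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
	.notifier_call = example_netdev_event,
};

/* Typically from module init/exit:
 *	register_netdevice_notifier(&example_netdev_notifier);
 *	...
 *	unregister_netdevice_notifier(&example_netdev_notifier);
 */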
1397
1398 /**
1399 * unregister_netdevice_notifier - unregister a network notifier block
1400 * @nb: notifier
1401 *
1402 * Unregister a notifier previously registered by
1403  *      register_netdevice_notifier(). The notifier is unlinked from the
1404 * kernel structures and may then be reused. A negative errno code
1405 * is returned on a failure.
1406 */
1407
1408 int unregister_netdevice_notifier(struct notifier_block *nb)
1409 {
1410 int err;
1411
1412 rtnl_lock();
1413 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1414 rtnl_unlock();
1415 return err;
1416 }
1417 EXPORT_SYMBOL(unregister_netdevice_notifier);
1418
1419 /**
1420 * call_netdevice_notifiers - call all network notifier blocks
1421 * @val: value passed unmodified to notifier function
1422 * @dev: net_device pointer passed unmodified to notifier function
1423 *
1424 * Call all network notifier blocks. Parameters and return value
1425 * are as for raw_notifier_call_chain().
1426 */
1427
1428 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1429 {
1430 ASSERT_RTNL();
1431 return raw_notifier_call_chain(&netdev_chain, val, dev);
1432 }
1433
1434 /* When > 0 there are consumers of rx skb time stamps */
1435 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1436
1437 void net_enable_timestamp(void)
1438 {
1439 atomic_inc(&netstamp_needed);
1440 }
1441 EXPORT_SYMBOL(net_enable_timestamp);
1442
1443 void net_disable_timestamp(void)
1444 {
1445 atomic_dec(&netstamp_needed);
1446 }
1447 EXPORT_SYMBOL(net_disable_timestamp);
1448
1449 static inline void net_timestamp_set(struct sk_buff *skb)
1450 {
1451 if (atomic_read(&netstamp_needed))
1452 __net_timestamp(skb);
1453 else
1454 skb->tstamp.tv64 = 0;
1455 }
1456
1457 static inline void net_timestamp_check(struct sk_buff *skb)
1458 {
1459 if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1460 __net_timestamp(skb);
1461 }
1462
1463 /**
1464 * dev_forward_skb - loopback an skb to another netif
1465 *
1466 * @dev: destination network device
1467 * @skb: buffer to forward
1468 *
1469 * return values:
1470 * NET_RX_SUCCESS (no congestion)
1471 * NET_RX_DROP (packet was dropped, but freed)
1472 *
1473 * dev_forward_skb can be used for injecting an skb from the
1474 * start_xmit function of one device into the receive queue
1475 * of another device.
1476 *
1477 * The receiving device may be in another namespace, so
1478 * we have to clear all information in the skb that could
1479 * impact namespace isolation.
1480 */
1481 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1482 {
1483 skb_orphan(skb);
1484 nf_reset(skb);
1485
1486 if (!(dev->flags & IFF_UP) ||
1487 (skb->len > (dev->mtu + dev->hard_header_len))) {
1488 kfree_skb(skb);
1489 return NET_RX_DROP;
1490 }
1491 skb_set_dev(skb, dev);
1492 skb->tstamp.tv64 = 0;
1493 skb->pkt_type = PACKET_HOST;
1494 skb->protocol = eth_type_trans(skb, dev);
1495 return netif_rx(skb);
1496 }
1497 EXPORT_SYMBOL_GPL(dev_forward_skb);
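/*
 * Illustrative sketch (not part of dev.c): how a veth-style device pair
 * could hand frames to its peer from ndo_start_xmit using
 * dev_forward_skb(). struct example_priv and its peer pointer are
 * hypothetical driver state.
 */
struct example_priv {
	struct net_device *peer;
};

static netdev_tx_t example_start_xmit(struct sk_buff *skb,
				      struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	/* dev_forward_skb() consumes the skb even when it drops it */
	dev_forward_skb(priv->peer, skb);
	return NETDEV_TX_OK;
}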
1498
1499 /*
1500 * Support routine. Sends outgoing frames to any network
1501 * taps currently in use.
1502 */
1503
1504 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1505 {
1506 struct packet_type *ptype;
1507
1508 #ifdef CONFIG_NET_CLS_ACT
1509 if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1510 net_timestamp_set(skb);
1511 #else
1512 net_timestamp_set(skb);
1513 #endif
1514
1515 rcu_read_lock();
1516 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1517 /* Never send packets back to the socket
1518 * they originated from - MvS (miquels@drinkel.ow.org)
1519 */
1520 if ((ptype->dev == dev || !ptype->dev) &&
1521 (ptype->af_packet_priv == NULL ||
1522 (struct sock *)ptype->af_packet_priv != skb->sk)) {
1523 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1524 if (!skb2)
1525 break;
1526
1527 /* skb->nh should be correctly
1528 set by sender, so that the second statement is
1529 just protection against buggy protocols.
1530 */
1531 skb_reset_mac_header(skb2);
1532
1533 if (skb_network_header(skb2) < skb2->data ||
1534 skb2->network_header > skb2->tail) {
1535 if (net_ratelimit())
1536 printk(KERN_CRIT "protocol %04x is "
1537 "buggy, dev %s\n",
1538 ntohs(skb2->protocol),
1539 dev->name);
1540 skb_reset_network_header(skb2);
1541 }
1542
1543 skb2->transport_header = skb2->network_header;
1544 skb2->pkt_type = PACKET_OUTGOING;
1545 ptype->func(skb2, skb->dev, ptype, skb->dev);
1546 }
1547 }
1548 rcu_read_unlock();
1549 }
1550
1551 /*
1552 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1553  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1554 */
1555 void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1556 {
1557 unsigned int real_num = dev->real_num_tx_queues;
1558
1559 if (unlikely(txq > dev->num_tx_queues))
1560 ;
1561 else if (txq > real_num)
1562 dev->real_num_tx_queues = txq;
1563 else if (txq < real_num) {
1564 dev->real_num_tx_queues = txq;
1565 qdisc_reset_all_tx_gt(dev, txq);
1566 }
1567 }
1568 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1569
1570 #ifdef CONFIG_RPS
1571 /**
1572 * netif_set_real_num_rx_queues - set actual number of RX queues used
1573 * @dev: Network device
1574 * @rxq: Actual number of RX queues
1575 *
1576 * This must be called either with the rtnl_lock held or before
1577 * registration of the net device. Returns 0 on success, or a
1578 * negative error code. If called before registration, it also
1579 * sets the maximum number of queues, and always succeeds.
1580 */
1581 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1582 {
1583 int rc;
1584
1585 if (dev->reg_state == NETREG_REGISTERED) {
1586 ASSERT_RTNL();
1587
1588 if (rxq > dev->num_rx_queues)
1589 return -EINVAL;
1590
1591 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1592 rxq);
1593 if (rc)
1594 return rc;
1595 } else {
1596 dev->num_rx_queues = rxq;
1597 }
1598
1599 dev->real_num_rx_queues = rxq;
1600 return 0;
1601 }
1602 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1603 #endif
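/*
 * Illustrative sketch (not part of dev.c, assumes CONFIG_RPS for the RX
 * call): a multiqueue driver shrinking its active queue counts to two after
 * probing the hardware. The helper name and counts are assumptions.
 */
static int example_shrink_queues(struct net_device *dev)
{
	ASSERT_RTNL();
	netif_set_real_num_tx_queues(dev, 2);	/* flushes qdiscs beyond queue 1 */
	return netif_set_real_num_rx_queues(dev, 2);
}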
1604
1605 static inline void __netif_reschedule(struct Qdisc *q)
1606 {
1607 struct softnet_data *sd;
1608 unsigned long flags;
1609
1610 local_irq_save(flags);
1611 sd = &__get_cpu_var(softnet_data);
1612 q->next_sched = NULL;
1613 *sd->output_queue_tailp = q;
1614 sd->output_queue_tailp = &q->next_sched;
1615 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1616 local_irq_restore(flags);
1617 }
1618
1619 void __netif_schedule(struct Qdisc *q)
1620 {
1621 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1622 __netif_reschedule(q);
1623 }
1624 EXPORT_SYMBOL(__netif_schedule);
1625
1626 void dev_kfree_skb_irq(struct sk_buff *skb)
1627 {
1628 if (atomic_dec_and_test(&skb->users)) {
1629 struct softnet_data *sd;
1630 unsigned long flags;
1631
1632 local_irq_save(flags);
1633 sd = &__get_cpu_var(softnet_data);
1634 skb->next = sd->completion_queue;
1635 sd->completion_queue = skb;
1636 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1637 local_irq_restore(flags);
1638 }
1639 }
1640 EXPORT_SYMBOL(dev_kfree_skb_irq);
1641
1642 void dev_kfree_skb_any(struct sk_buff *skb)
1643 {
1644 if (in_irq() || irqs_disabled())
1645 dev_kfree_skb_irq(skb);
1646 else
1647 dev_kfree_skb(skb);
1648 }
1649 EXPORT_SYMBOL(dev_kfree_skb_any);
1650
1651
1652 /**
1653 * netif_device_detach - mark device as removed
1654 * @dev: network device
1655 *
1656 * Mark device as removed from system and therefore no longer available.
1657 */
1658 void netif_device_detach(struct net_device *dev)
1659 {
1660 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1661 netif_running(dev)) {
1662 netif_tx_stop_all_queues(dev);
1663 }
1664 }
1665 EXPORT_SYMBOL(netif_device_detach);
1666
1667 /**
1668 * netif_device_attach - mark device as attached
1669 * @dev: network device
1670 *
1671  *      Mark device as attached to the system and restart it if needed.
1672 */
1673 void netif_device_attach(struct net_device *dev)
1674 {
1675 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1676 netif_running(dev)) {
1677 netif_tx_wake_all_queues(dev);
1678 __netdev_watchdog_up(dev);
1679 }
1680 }
1681 EXPORT_SYMBOL(netif_device_attach);
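/*
 * Illustrative sketch (not part of dev.c): the usual suspend/resume pairing
 * of netif_device_detach()/netif_device_attach() in a driver. The helper
 * names are assumptions for the example only.
 */
static void example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stops all TX queues if running */
}

static void example_resume(struct net_device *dev)
{
	netif_device_attach(dev);	/* wakes queues and watchdog if running */
}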
1682
1683 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1684 {
1685 return ((features & NETIF_F_GEN_CSUM) ||
1686 ((features & NETIF_F_IP_CSUM) &&
1687 protocol == htons(ETH_P_IP)) ||
1688 ((features & NETIF_F_IPV6_CSUM) &&
1689 protocol == htons(ETH_P_IPV6)) ||
1690 ((features & NETIF_F_FCOE_CRC) &&
1691 protocol == htons(ETH_P_FCOE)));
1692 }
1693
1694 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1695 {
1696 if (can_checksum_protocol(dev->features, skb->protocol))
1697 return true;
1698
1699 if (skb->protocol == htons(ETH_P_8021Q)) {
1700 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1701 if (can_checksum_protocol(dev->features & dev->vlan_features,
1702 veh->h_vlan_encapsulated_proto))
1703 return true;
1704 }
1705
1706 return false;
1707 }
1708
1709 /**
1710  *      skb_set_dev - assign a new device to a buffer
1711 * @skb: buffer for the new device
1712 * @dev: network device
1713 *
1714 * If an skb is owned by a device already, we have to reset
1715 * all data private to the namespace a device belongs to
1716 * before assigning it a new device.
1717 */
1718 #ifdef CONFIG_NET_NS
1719 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1720 {
1721 skb_dst_drop(skb);
1722 if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1723 secpath_reset(skb);
1724 nf_reset(skb);
1725 skb_init_secmark(skb);
1726 skb->mark = 0;
1727 skb->priority = 0;
1728 skb->nf_trace = 0;
1729 skb->ipvs_property = 0;
1730 #ifdef CONFIG_NET_SCHED
1731 skb->tc_index = 0;
1732 #endif
1733 }
1734 skb->dev = dev;
1735 }
1736 EXPORT_SYMBOL(skb_set_dev);
1737 #endif /* CONFIG_NET_NS */
1738
1739 /*
1740 * Invalidate hardware checksum when packet is to be mangled, and
1741 * complete checksum manually on outgoing path.
1742 */
1743 int skb_checksum_help(struct sk_buff *skb)
1744 {
1745 __wsum csum;
1746 int ret = 0, offset;
1747
1748 if (skb->ip_summed == CHECKSUM_COMPLETE)
1749 goto out_set_summed;
1750
1751 if (unlikely(skb_shinfo(skb)->gso_size)) {
1752 /* Let GSO fix up the checksum. */
1753 goto out_set_summed;
1754 }
1755
1756 offset = skb->csum_start - skb_headroom(skb);
1757 BUG_ON(offset >= skb_headlen(skb));
1758 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1759
1760 offset += skb->csum_offset;
1761 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1762
1763 if (skb_cloned(skb) &&
1764 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1765 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1766 if (ret)
1767 goto out;
1768 }
1769
1770 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1771 out_set_summed:
1772 skb->ip_summed = CHECKSUM_NONE;
1773 out:
1774 return ret;
1775 }
1776 EXPORT_SYMBOL(skb_checksum_help);
1777
1778 /**
1779 * skb_gso_segment - Perform segmentation on skb.
1780 * @skb: buffer to segment
1781 * @features: features for the output path (see dev->features)
1782 *
1783 * This function segments the given skb and returns a list of segments.
1784 *
1785 * It may return NULL if the skb requires no segmentation. This is
1786 * only possible when GSO is used for verifying header integrity.
1787 */
1788 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1789 {
1790 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1791 struct packet_type *ptype;
1792 __be16 type = skb->protocol;
1793 int err;
1794
1795 skb_reset_mac_header(skb);
1796 skb->mac_len = skb->network_header - skb->mac_header;
1797 __skb_pull(skb, skb->mac_len);
1798
1799 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1800 struct net_device *dev = skb->dev;
1801 struct ethtool_drvinfo info = {};
1802
1803 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1804 dev->ethtool_ops->get_drvinfo(dev, &info);
1805
1806 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1807 "ip_summed=%d",
1808 info.driver, dev ? dev->features : 0L,
1809 skb->sk ? skb->sk->sk_route_caps : 0L,
1810 skb->len, skb->data_len, skb->ip_summed);
1811
1812 if (skb_header_cloned(skb) &&
1813 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1814 return ERR_PTR(err);
1815 }
1816
1817 rcu_read_lock();
1818 list_for_each_entry_rcu(ptype,
1819 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1820 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1821 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1822 err = ptype->gso_send_check(skb);
1823 segs = ERR_PTR(err);
1824 if (err || skb_gso_ok(skb, features))
1825 break;
1826 __skb_push(skb, (skb->data -
1827 skb_network_header(skb)));
1828 }
1829 segs = ptype->gso_segment(skb, features);
1830 break;
1831 }
1832 }
1833 rcu_read_unlock();
1834
1835 __skb_push(skb, skb->data - skb_mac_header(skb));
1836
1837 return segs;
1838 }
1839 EXPORT_SYMBOL(skb_gso_segment);
1840
1841 /* Take action when hardware reception checksum errors are detected. */
1842 #ifdef CONFIG_BUG
1843 void netdev_rx_csum_fault(struct net_device *dev)
1844 {
1845 if (net_ratelimit()) {
1846 printk(KERN_ERR "%s: hw csum failure.\n",
1847 dev ? dev->name : "<unknown>");
1848 dump_stack();
1849 }
1850 }
1851 EXPORT_SYMBOL(netdev_rx_csum_fault);
1852 #endif
1853
1854  /* Actually, we should eliminate this check as soon as we know that:
1855   * 1. An IOMMU is present and allows mapping all the memory.
1856 * 2. No high memory really exists on this machine.
1857 */
1858
1859 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1860 {
1861 #ifdef CONFIG_HIGHMEM
1862 int i;
1863 if (!(dev->features & NETIF_F_HIGHDMA)) {
1864 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1865 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1866 return 1;
1867 }
1868
1869 if (PCI_DMA_BUS_IS_PHYS) {
1870 struct device *pdev = dev->dev.parent;
1871
1872 if (!pdev)
1873 return 0;
1874 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1875 dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1876 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1877 return 1;
1878 }
1879 }
1880 #endif
1881 return 0;
1882 }
1883
1884 struct dev_gso_cb {
1885 void (*destructor)(struct sk_buff *skb);
1886 };
1887
1888 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1889
1890 static void dev_gso_skb_destructor(struct sk_buff *skb)
1891 {
1892 struct dev_gso_cb *cb;
1893
1894 do {
1895 struct sk_buff *nskb = skb->next;
1896
1897 skb->next = nskb->next;
1898 nskb->next = NULL;
1899 kfree_skb(nskb);
1900 } while (skb->next);
1901
1902 cb = DEV_GSO_CB(skb);
1903 if (cb->destructor)
1904 cb->destructor(skb);
1905 }
1906
1907 /**
1908 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1909 * @skb: buffer to segment
1910 *
1911 * This function segments the given skb and stores the list of segments
1912 * in skb->next.
1913 */
1914 static int dev_gso_segment(struct sk_buff *skb)
1915 {
1916 struct net_device *dev = skb->dev;
1917 struct sk_buff *segs;
1918 int features = dev->features & ~(illegal_highdma(dev, skb) ?
1919 NETIF_F_SG : 0);
1920
1921 segs = skb_gso_segment(skb, features);
1922
1923 /* Verifying header integrity only. */
1924 if (!segs)
1925 return 0;
1926
1927 if (IS_ERR(segs))
1928 return PTR_ERR(segs);
1929
1930 skb->next = segs;
1931 DEV_GSO_CB(skb)->destructor = skb->destructor;
1932 skb->destructor = dev_gso_skb_destructor;
1933
1934 return 0;
1935 }
1936
1937 /*
1938 * Try to orphan skb early, right before transmission by the device.
1939 * We cannot orphan skb if tx timestamp is requested or the sk-reference
1940  * is needed at the driver level for other reasons, e.g. see net/can/raw.c
1941 */
1942 static inline void skb_orphan_try(struct sk_buff *skb)
1943 {
1944 struct sock *sk = skb->sk;
1945
1946 if (sk && !skb_shinfo(skb)->tx_flags) {
1947                  /* skb_tx_hash() won't be able to get sk.
1948 * We copy sk_hash into skb->rxhash
1949 */
1950 if (!skb->rxhash)
1951 skb->rxhash = sk->sk_hash;
1952 skb_orphan(skb);
1953 }
1954 }
1955
1956 /*
1957 * Returns true if either:
1958 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
1959 * 2. skb is fragmented and the device does not support SG, or if
1960 * at least one of the fragments is in highmem and the device does
1961 * not support DMA from it.
1962 */
1963 static inline int skb_needs_linearize(struct sk_buff *skb,
1964 struct net_device *dev)
1965 {
1966 return skb_is_nonlinear(skb) &&
1967 ((skb_has_frag_list(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
1968 (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
1969 illegal_highdma(dev, skb))));
1970 }
1971
1972 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1973 struct netdev_queue *txq)
1974 {
1975 const struct net_device_ops *ops = dev->netdev_ops;
1976 int rc = NETDEV_TX_OK;
1977
1978 if (likely(!skb->next)) {
1979 if (!list_empty(&ptype_all))
1980 dev_queue_xmit_nit(skb, dev);
1981
1982 /*
1983 * If the device doesn't need skb->dst, release it right now while
1984 * it's still hot in this CPU's cache.
1985 */
1986 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1987 skb_dst_drop(skb);
1988
1989 skb_orphan_try(skb);
1990
1991 if (netif_needs_gso(dev, skb)) {
1992 if (unlikely(dev_gso_segment(skb)))
1993 goto out_kfree_skb;
1994 if (skb->next)
1995 goto gso;
1996 } else {
1997 if (skb_needs_linearize(skb, dev) &&
1998 __skb_linearize(skb))
1999 goto out_kfree_skb;
2000
2001 /* If packet is not checksummed and device does not
2002 * support checksumming for this protocol, complete
2003 * checksumming here.
2004 */
2005 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2006 skb_set_transport_header(skb, skb->csum_start -
2007 skb_headroom(skb));
2008 if (!dev_can_checksum(dev, skb) &&
2009 skb_checksum_help(skb))
2010 goto out_kfree_skb;
2011 }
2012 }
2013
2014 rc = ops->ndo_start_xmit(skb, dev);
2015 if (rc == NETDEV_TX_OK)
2016 txq_trans_update(txq);
2017 return rc;
2018 }
2019
2020 gso:
2021 do {
2022 struct sk_buff *nskb = skb->next;
2023
2024 skb->next = nskb->next;
2025 nskb->next = NULL;
2026
2027 /*
2028 * If the device doesn't need nskb->dst, release it right now while
2029 * it's still hot in this CPU's cache.
2030 */
2031 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2032 skb_dst_drop(nskb);
2033
2034 rc = ops->ndo_start_xmit(nskb, dev);
2035 if (unlikely(rc != NETDEV_TX_OK)) {
2036 if (rc & ~NETDEV_TX_MASK)
2037 goto out_kfree_gso_skb;
2038 nskb->next = skb->next;
2039 skb->next = nskb;
2040 return rc;
2041 }
2042 txq_trans_update(txq);
2043 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2044 return NETDEV_TX_BUSY;
2045 } while (skb->next);
2046
2047 out_kfree_gso_skb:
2048 if (likely(skb->next == NULL))
2049 skb->destructor = DEV_GSO_CB(skb)->destructor;
2050 out_kfree_skb:
2051 kfree_skb(skb);
2052 return rc;
2053 }
2054
2055 static u32 hashrnd __read_mostly;
2056
2057 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
2058 {
2059 u32 hash;
2060
2061 if (skb_rx_queue_recorded(skb)) {
2062 hash = skb_get_rx_queue(skb);
2063 while (unlikely(hash >= dev->real_num_tx_queues))
2064 hash -= dev->real_num_tx_queues;
2065 return hash;
2066 }
2067
2068 if (skb->sk && skb->sk->sk_hash)
2069 hash = skb->sk->sk_hash;
2070 else
2071 hash = (__force u16) skb->protocol ^ skb->rxhash;
2072 hash = jhash_1word(hash, hashrnd);
2073
2074 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
2075 }
2076 EXPORT_SYMBOL(skb_tx_hash);
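/*
 * The final multiply-and-shift above maps a 32-bit hash uniformly onto
 * [0, real_num_tx_queues) without a modulo operation: for example, with
 * 4 real TX queues and hash = 0x80000000,
 * ((u64)0x80000000 * 4) >> 32 == 2.
 */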
2077
2078 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2079 {
2080 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2081 if (net_ratelimit()) {
2082 pr_warning("%s selects TX queue %d, but "
2083 "real number of TX queues is %d\n",
2084 dev->name, queue_index, dev->real_num_tx_queues);
2085 }
2086 return 0;
2087 }
2088 return queue_index;
2089 }
2090
2091 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2092 struct sk_buff *skb)
2093 {
2094 int queue_index;
2095 const struct net_device_ops *ops = dev->netdev_ops;
2096
2097 if (ops->ndo_select_queue) {
2098 queue_index = ops->ndo_select_queue(dev, skb);
2099 queue_index = dev_cap_txqueue(dev, queue_index);
2100 } else {
2101 struct sock *sk = skb->sk;
2102 queue_index = sk_tx_queue_get(sk);
2103 if (queue_index < 0) {
2104
2105 queue_index = 0;
2106 if (dev->real_num_tx_queues > 1)
2107 queue_index = skb_tx_hash(dev, skb);
2108
2109 if (sk) {
2110 struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);
2111
2112 if (dst && skb_dst(skb) == dst)
2113 sk_tx_queue_set(sk, queue_index);
2114 }
2115 }
2116 }
2117
2118 skb_set_queue_mapping(skb, queue_index);
2119 return netdev_get_tx_queue(dev, queue_index);
2120 }
2121
2122 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2123 struct net_device *dev,
2124 struct netdev_queue *txq)
2125 {
2126 spinlock_t *root_lock = qdisc_lock(q);
2127 bool contended = qdisc_is_running(q);
2128 int rc;
2129
2130 /*
2131 * Heuristic to force contended enqueues to serialize on a
2132 * separate lock before trying to get the qdisc main lock.
2133 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2134 * and dequeue packets faster.
2135 */
2136 if (unlikely(contended))
2137 spin_lock(&q->busylock);
2138
2139 spin_lock(root_lock);
2140 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2141 kfree_skb(skb);
2142 rc = NET_XMIT_DROP;
2143 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2144 qdisc_run_begin(q)) {
2145 /*
2146 * This is a work-conserving queue; there are no old skbs
2147 * waiting to be sent out; and the qdisc is not running -
2148 * xmit the skb directly.
2149 */
2150 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2151 skb_dst_force(skb);
2152 __qdisc_update_bstats(q, skb->len);
2153 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2154 if (unlikely(contended)) {
2155 spin_unlock(&q->busylock);
2156 contended = false;
2157 }
2158 __qdisc_run(q);
2159 } else
2160 qdisc_run_end(q);
2161
2162 rc = NET_XMIT_SUCCESS;
2163 } else {
2164 skb_dst_force(skb);
2165 rc = qdisc_enqueue_root(skb, q);
2166 if (qdisc_run_begin(q)) {
2167 if (unlikely(contended)) {
2168 spin_unlock(&q->busylock);
2169 contended = false;
2170 }
2171 __qdisc_run(q);
2172 }
2173 }
2174 spin_unlock(root_lock);
2175 if (unlikely(contended))
2176 spin_unlock(&q->busylock);
2177 return rc;
2178 }
2179
2180 static DEFINE_PER_CPU(int, xmit_recursion);
2181 #define RECURSION_LIMIT 3
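/*
 * Stacked virtual devices (VLANs, tunnels, bonding and the like) may
 * re-enter dev_queue_xmit() from their own ndo_start_xmit().  The
 * per-CPU xmit_recursion counter above bounds that nesting so that a
 * misconfigured device loop is reported (see recursion_alert below)
 * instead of overflowing the stack.
 */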
2182
2183 /**
2184 * dev_queue_xmit - transmit a buffer
2185 * @skb: buffer to transmit
2186 *
2187 * Queue a buffer for transmission to a network device. The caller must
2188 * have set the device and priority and built the buffer before calling
2189 * this function. The function can be called from an interrupt.
2190 *
2191 * A negative errno code is returned on a failure. A success does not
2192 * guarantee the frame will be transmitted as it may be dropped due
2193 * to congestion or traffic shaping.
2194 *
2195 * -----------------------------------------------------------------------------------
2196 * I notice this method can also return errors from the queue disciplines,
2197 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2198 * be positive.
2199 *
2200 * Regardless of the return value, the skb is consumed, so it is currently
2201 * difficult to retry a send to this method. (You can bump the ref count
2202 * before sending to hold a reference for retry if you are careful.)
2203 *
2204 * When calling this method, interrupts MUST be enabled. This is because
2205 * the BH enable code must have IRQs enabled so that it will not deadlock.
2206 * --BLG
2207 */
2208 int dev_queue_xmit(struct sk_buff *skb)
2209 {
2210 struct net_device *dev = skb->dev;
2211 struct netdev_queue *txq;
2212 struct Qdisc *q;
2213 int rc = -ENOMEM;
2214
2215 /* Disable soft irqs for various locks below. Also
2216 * stops preemption for RCU.
2217 */
2218 rcu_read_lock_bh();
2219
2220 txq = dev_pick_tx(dev, skb);
2221 q = rcu_dereference_bh(txq->qdisc);
2222
2223 #ifdef CONFIG_NET_CLS_ACT
2224 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2225 #endif
2226 if (q->enqueue) {
2227 rc = __dev_xmit_skb(skb, q, dev, txq);
2228 goto out;
2229 }
2230
2231 /* The device has no queue. Common case for software devices:
2232 loopback, all the sorts of tunnels...
2233
2234 Really, it is unlikely that netif_tx_lock protection is necessary
2235 here. (f.e. loopback and IP tunnels are clean, ignoring statistics
2236 counters.)
2237 However, it is possible that they rely on the protection
2238 we provide here.
2239
2240 Check this and take the lock; it is not prone to deadlocks.
2241 Or simply use the noqueue qdisc, which is even simpler 8)
2242 */
2243 if (dev->flags & IFF_UP) {
2244 int cpu = smp_processor_id(); /* ok because BHs are off */
2245
2246 if (txq->xmit_lock_owner != cpu) {
2247
2248 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2249 goto recursion_alert;
2250
2251 HARD_TX_LOCK(dev, txq, cpu);
2252
2253 if (!netif_tx_queue_stopped(txq)) {
2254 __this_cpu_inc(xmit_recursion);
2255 rc = dev_hard_start_xmit(skb, dev, txq);
2256 __this_cpu_dec(xmit_recursion);
2257 if (dev_xmit_complete(rc)) {
2258 HARD_TX_UNLOCK(dev, txq);
2259 goto out;
2260 }
2261 }
2262 HARD_TX_UNLOCK(dev, txq);
2263 if (net_ratelimit())
2264 printk(KERN_CRIT "Virtual device %s asks to "
2265 "queue packet!\n", dev->name);
2266 } else {
2267 /* Recursion detected! It is possible,
2268 * unfortunately.
2269 */
2270 recursion_alert:
2271 if (net_ratelimit())
2272 printk(KERN_CRIT "Dead loop on virtual device "
2273 "%s, fix it urgently!\n", dev->name);
2274 }
2275 }
2276
2277 rc = -ENETDOWN;
2278 rcu_read_unlock_bh();
2279
2280 kfree_skb(skb);
2281 return rc;
2282 out:
2283 rcu_read_unlock_bh();
2284 return rc;
2285 }
2286 EXPORT_SYMBOL(dev_queue_xmit);
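/*
 * Minimal caller sketch (illustrative only, not a path in this file):
 * once an skb has been fully built and skb->dev set, the caller simply
 * hands it off and must not touch it afterwards:
 *
 *	skb->dev = dev;
 *	rc = dev_queue_xmit(skb);
 *
 * rc may be a negative errno or a positive NET_XMIT_* code; either way
 * the skb has been consumed, as the comment above explains.
 */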
2287
2288
2289 /*=======================================================================
2290 Receiver routines
2291 =======================================================================*/
2292
2293 int netdev_max_backlog __read_mostly = 1000;
2294 int netdev_tstamp_prequeue __read_mostly = 1;
2295 int netdev_budget __read_mostly = 300;
2296 int weight_p __read_mostly = 64; /* old backlog weight */
2297
2298 /* Called with irq disabled */
2299 static inline void ____napi_schedule(struct softnet_data *sd,
2300 struct napi_struct *napi)
2301 {
2302 list_add_tail(&napi->poll_list, &sd->poll_list);
2303 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2304 }
2305
2306 /*
2307 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2308 * and src/dst port numbers. Returns a non-zero hash number on success
2309 * and 0 on failure.
2310 */
2311 __u32 __skb_get_rxhash(struct sk_buff *skb)
2312 {
2313 int nhoff, hash = 0, poff;
2314 struct ipv6hdr *ip6;
2315 struct iphdr *ip;
2316 u8 ip_proto;
2317 u32 addr1, addr2, ihl;
2318 union {
2319 u32 v32;
2320 u16 v16[2];
2321 } ports;
2322
2323 nhoff = skb_network_offset(skb);
2324
2325 switch (skb->protocol) {
2326 case __constant_htons(ETH_P_IP):
2327 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2328 goto done;
2329
2330 ip = (struct iphdr *) (skb->data + nhoff);
2331 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2332 ip_proto = 0;
2333 else
2334 ip_proto = ip->protocol;
2335 addr1 = (__force u32) ip->saddr;
2336 addr2 = (__force u32) ip->daddr;
2337 ihl = ip->ihl;
2338 break;
2339 case __constant_htons(ETH_P_IPV6):
2340 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2341 goto done;
2342
2343 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2344 ip_proto = ip6->nexthdr;
2345 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2346 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2347 ihl = (40 >> 2);
2348 break;
2349 default:
2350 goto done;
2351 }
2352
2353 ports.v32 = 0;
2354 poff = proto_ports_offset(ip_proto);
2355 if (poff >= 0) {
2356 nhoff += ihl * 4 + poff;
2357 if (pskb_may_pull(skb, nhoff + 4)) {
2358 ports.v32 = * (__force u32 *) (skb->data + nhoff);
2359 if (ports.v16[1] < ports.v16[0])
2360 swap(ports.v16[0], ports.v16[1]);
2361 }
2362 }
2363
2364 /* get a consistent hash (same value on both flow directions) */
2365 if (addr2 < addr1)
2366 swap(addr1, addr2);
2367
2368 hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2369 if (!hash)
2370 hash = 1;
2371
2372 done:
2373 return hash;
2374 }
2375 EXPORT_SYMBOL(__skb_get_rxhash);
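/*
 * Because both the address pair and the port pair are put into a
 * canonical order before hashing, the two directions of a flow hash to
 * the same value: e.g. 10.0.0.1:1000 -> 10.0.0.2:2000 and
 * 10.0.0.2:2000 -> 10.0.0.1:1000 yield the same (addr1, addr2, ports)
 * tuple and therefore the same rxhash.
 */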
2376
2377 #ifdef CONFIG_RPS
2378
2379 /* One global table that all flow-based protocols share. */
2380 struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
2381 EXPORT_SYMBOL(rps_sock_flow_table);
2382
2383 /*
2384 * get_rps_cpu is called from netif_receive_skb and returns the target
2385 * CPU from the RPS map of the receiving queue for a given skb.
2386 * rcu_read_lock must be held on entry.
2387 */
2388 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2389 struct rps_dev_flow **rflowp)
2390 {
2391 struct netdev_rx_queue *rxqueue;
2392 struct rps_map *map = NULL;
2393 struct rps_dev_flow_table *flow_table;
2394 struct rps_sock_flow_table *sock_flow_table;
2395 int cpu = -1;
2396 u16 tcpu;
2397
2398 if (skb_rx_queue_recorded(skb)) {
2399 u16 index = skb_get_rx_queue(skb);
2400 if (unlikely(index >= dev->real_num_rx_queues)) {
2401 WARN_ONCE(dev->real_num_rx_queues > 1,
2402 "%s received packet on queue %u, but number "
2403 "of RX queues is %u\n",
2404 dev->name, index, dev->real_num_rx_queues);
2405 goto done;
2406 }
2407 rxqueue = dev->_rx + index;
2408 } else
2409 rxqueue = dev->_rx;
2410
2411 if (rxqueue->rps_map) {
2412 map = rcu_dereference(rxqueue->rps_map);
2413 if (map && map->len == 1) {
2414 tcpu = map->cpus[0];
2415 if (cpu_online(tcpu))
2416 cpu = tcpu;
2417 goto done;
2418 }
2419 } else if (!rxqueue->rps_flow_table) {
2420 goto done;
2421 }
2422
2423 skb_reset_network_header(skb);
2424 if (!skb_get_rxhash(skb))
2425 goto done;
2426
2427 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2428 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2429 if (flow_table && sock_flow_table) {
2430 u16 next_cpu;
2431 struct rps_dev_flow *rflow;
2432
2433 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2434 tcpu = rflow->cpu;
2435
2436 next_cpu = sock_flow_table->ents[skb->rxhash &
2437 sock_flow_table->mask];
2438
2439 /*
2440 * If the desired CPU (where last recvmsg was done) is
2441 * different from current CPU (one in the rx-queue flow
2442 * table entry), switch if one of the following holds:
2443 * - Current CPU is unset (equal to RPS_NO_CPU).
2444 * - Current CPU is offline.
2445 * - The current CPU's queue tail has advanced beyond the
2446 * last packet that was enqueued using this table entry.
2447 * This guarantees that all previous packets for the flow
2448 * have been dequeued, thus preserving in order delivery.
2449 */
2450 if (unlikely(tcpu != next_cpu) &&
2451 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2452 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2453 rflow->last_qtail)) >= 0)) {
2454 tcpu = rflow->cpu = next_cpu;
2455 if (tcpu != RPS_NO_CPU)
2456 rflow->last_qtail = per_cpu(softnet_data,
2457 tcpu).input_queue_head;
2458 }
2459 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2460 *rflowp = rflow;
2461 cpu = tcpu;
2462 goto done;
2463 }
2464 }
2465
2466 if (map) {
2467 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2468
2469 if (cpu_online(tcpu)) {
2470 cpu = tcpu;
2471 goto done;
2472 }
2473 }
2474
2475 done:
2476 return cpu;
2477 }
2478
2479 /* Called from hardirq (IPI) context */
2480 static void rps_trigger_softirq(void *data)
2481 {
2482 struct softnet_data *sd = data;
2483
2484 ____napi_schedule(sd, &sd->backlog);
2485 sd->received_rps++;
2486 }
2487
2488 #endif /* CONFIG_RPS */
2489
2490 /*
2491 * Check if this softnet_data structure belongs to another CPU.
2492 * If yes, queue it to our IPI list and return 1.
2493 * If no, return 0.
2494 */
2495 static int rps_ipi_queued(struct softnet_data *sd)
2496 {
2497 #ifdef CONFIG_RPS
2498 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2499
2500 if (sd != mysd) {
2501 sd->rps_ipi_next = mysd->rps_ipi_list;
2502 mysd->rps_ipi_list = sd;
2503
2504 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2505 return 1;
2506 }
2507 #endif /* CONFIG_RPS */
2508 return 0;
2509 }
2510
2511 /*
2512 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2513 * queue (may be a remote CPU queue).
2514 */
2515 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2516 unsigned int *qtail)
2517 {
2518 struct softnet_data *sd;
2519 unsigned long flags;
2520
2521 sd = &per_cpu(softnet_data, cpu);
2522
2523 local_irq_save(flags);
2524
2525 rps_lock(sd);
2526 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2527 if (skb_queue_len(&sd->input_pkt_queue)) {
2528 enqueue:
2529 __skb_queue_tail(&sd->input_pkt_queue, skb);
2530 input_queue_tail_incr_save(sd, qtail);
2531 rps_unlock(sd);
2532 local_irq_restore(flags);
2533 return NET_RX_SUCCESS;
2534 }
2535
2536 /* Schedule NAPI for backlog device
2537 * We can use a non-atomic operation since we own the queue lock.
2538 */
2539 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2540 if (!rps_ipi_queued(sd))
2541 ____napi_schedule(sd, &sd->backlog);
2542 }
2543 goto enqueue;
2544 }
2545
2546 sd->dropped++;
2547 rps_unlock(sd);
2548
2549 local_irq_restore(flags);
2550
2551 kfree_skb(skb);
2552 return NET_RX_DROP;
2553 }
2554
2555 /**
2556 * netif_rx - post buffer to the network code
2557 * @skb: buffer to post
2558 *
2559 * This function receives a packet from a device driver and queues it for
2560 * the upper (protocol) levels to process. It always succeeds. The buffer
2561 * may be dropped during processing for congestion control or by the
2562 * protocol layers.
2563 *
2564 * return values:
2565 * NET_RX_SUCCESS (no congestion)
2566 * NET_RX_DROP (packet was dropped)
2567 *
2568 */
2569
2570 int netif_rx(struct sk_buff *skb)
2571 {
2572 int ret;
2573
2574 /* if netpoll wants it, pretend we never saw it */
2575 if (netpoll_rx(skb))
2576 return NET_RX_DROP;
2577
2578 if (netdev_tstamp_prequeue)
2579 net_timestamp_check(skb);
2580
2581 #ifdef CONFIG_RPS
2582 {
2583 struct rps_dev_flow voidflow, *rflow = &voidflow;
2584 int cpu;
2585
2586 preempt_disable();
2587 rcu_read_lock();
2588
2589 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2590 if (cpu < 0)
2591 cpu = smp_processor_id();
2592
2593 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2594
2595 rcu_read_unlock();
2596 preempt_enable();
2597 }
2598 #else
2599 {
2600 unsigned int qtail;
2601 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2602 put_cpu();
2603 }
2604 #endif
2605 return ret;
2606 }
2607 EXPORT_SYMBOL(netif_rx);
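/*
 * Usage note (illustrative): interrupt-driven (non-NAPI) drivers call
 * netif_rx() from their receive interrupt once the skb is set up,
 * roughly:
 *
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 *
 * NAPI drivers should instead hand packets to netif_receive_skb() or
 * napi_gro_receive() from their poll callback.
 */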
2608
2609 int netif_rx_ni(struct sk_buff *skb)
2610 {
2611 int err;
2612
2613 preempt_disable();
2614 err = netif_rx(skb);
2615 if (local_softirq_pending())
2616 do_softirq();
2617 preempt_enable();
2618
2619 return err;
2620 }
2621 EXPORT_SYMBOL(netif_rx_ni);
2622
2623 static void net_tx_action(struct softirq_action *h)
2624 {
2625 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2626
2627 if (sd->completion_queue) {
2628 struct sk_buff *clist;
2629
2630 local_irq_disable();
2631 clist = sd->completion_queue;
2632 sd->completion_queue = NULL;
2633 local_irq_enable();
2634
2635 while (clist) {
2636 struct sk_buff *skb = clist;
2637 clist = clist->next;
2638
2639 WARN_ON(atomic_read(&skb->users));
2640 __kfree_skb(skb);
2641 }
2642 }
2643
2644 if (sd->output_queue) {
2645 struct Qdisc *head;
2646
2647 local_irq_disable();
2648 head = sd->output_queue;
2649 sd->output_queue = NULL;
2650 sd->output_queue_tailp = &sd->output_queue;
2651 local_irq_enable();
2652
2653 while (head) {
2654 struct Qdisc *q = head;
2655 spinlock_t *root_lock;
2656
2657 head = head->next_sched;
2658
2659 root_lock = qdisc_lock(q);
2660 if (spin_trylock(root_lock)) {
2661 smp_mb__before_clear_bit();
2662 clear_bit(__QDISC_STATE_SCHED,
2663 &q->state);
2664 qdisc_run(q);
2665 spin_unlock(root_lock);
2666 } else {
2667 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2668 &q->state)) {
2669 __netif_reschedule(q);
2670 } else {
2671 smp_mb__before_clear_bit();
2672 clear_bit(__QDISC_STATE_SCHED,
2673 &q->state);
2674 }
2675 }
2676 }
2677 }
2678 }
2679
2680 static inline int deliver_skb(struct sk_buff *skb,
2681 struct packet_type *pt_prev,
2682 struct net_device *orig_dev)
2683 {
2684 atomic_inc(&skb->users);
2685 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2686 }
2687
2688 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2689 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2690 /* This hook is defined here for ATM LANE */
2691 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2692 unsigned char *addr) __read_mostly;
2693 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2694 #endif
2695
2696 #ifdef CONFIG_NET_CLS_ACT
2697 /* TODO: Maybe we should just force sch_ingress to be compiled in
2698 * whenever CONFIG_NET_CLS_ACT is? Otherwise we pay for a few useless
2699 * instructions (a compare and two extra stores) when it is not built
2700 * in but CONFIG_NET_CLS_ACT is.
2701 * NOTE: This doesn't remove any functionality; if you don't have
2702 * the ingress scheduler, you just can't add policies on ingress.
2703 *
2704 */
2705 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2706 {
2707 struct net_device *dev = skb->dev;
2708 u32 ttl = G_TC_RTTL(skb->tc_verd);
2709 int result = TC_ACT_OK;
2710 struct Qdisc *q;
2711
2712 if (unlikely(MAX_RED_LOOP < ttl++)) {
2713 if (net_ratelimit())
2714 pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
2715 skb->skb_iif, dev->ifindex);
2716 return TC_ACT_SHOT;
2717 }
2718
2719 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2720 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2721
2722 q = rxq->qdisc;
2723 if (q != &noop_qdisc) {
2724 spin_lock(qdisc_lock(q));
2725 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2726 result = qdisc_enqueue_root(skb, q);
2727 spin_unlock(qdisc_lock(q));
2728 }
2729
2730 return result;
2731 }
2732
2733 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2734 struct packet_type **pt_prev,
2735 int *ret, struct net_device *orig_dev)
2736 {
2737 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
2738
2739 if (!rxq || rxq->qdisc == &noop_qdisc)
2740 goto out;
2741
2742 if (*pt_prev) {
2743 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2744 *pt_prev = NULL;
2745 }
2746
2747 switch (ing_filter(skb, rxq)) {
2748 case TC_ACT_SHOT:
2749 case TC_ACT_STOLEN:
2750 kfree_skb(skb);
2751 return NULL;
2752 }
2753
2754 out:
2755 skb->tc_verd = 0;
2756 return skb;
2757 }
2758 #endif
2759
2760 /*
2761 * netif_nit_deliver - deliver received packets to network taps
2762 * @skb: buffer
2763 *
2764 * This function is used to deliver incoming packets to network
2765 * taps. It should be used when the normal netif_receive_skb path
2766 * is bypassed, for example because of VLAN acceleration.
2767 */
2768 void netif_nit_deliver(struct sk_buff *skb)
2769 {
2770 struct packet_type *ptype;
2771
2772 if (list_empty(&ptype_all))
2773 return;
2774
2775 skb_reset_network_header(skb);
2776 skb_reset_transport_header(skb);
2777 skb->mac_len = skb->network_header - skb->mac_header;
2778
2779 rcu_read_lock();
2780 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2781 if (!ptype->dev || ptype->dev == skb->dev)
2782 deliver_skb(skb, ptype, skb->dev);
2783 }
2784 rcu_read_unlock();
2785 }
2786
2787 /**
2788 * netdev_rx_handler_register - register receive handler
2789 * @dev: device to register a handler for
2790 * @rx_handler: receive handler to register
2791 * @rx_handler_data: data pointer that is used by rx handler
2792 *
2793 * Register a receive handler for a device. This handler will then be
2794 * called from __netif_receive_skb. A negative errno code is returned
2795 * on a failure.
2796 *
2797 * The caller must hold the rtnl_mutex.
2798 */
2799 int netdev_rx_handler_register(struct net_device *dev,
2800 rx_handler_func_t *rx_handler,
2801 void *rx_handler_data)
2802 {
2803 ASSERT_RTNL();
2804
2805 if (dev->rx_handler)
2806 return -EBUSY;
2807
2808 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
2809 rcu_assign_pointer(dev->rx_handler, rx_handler);
2810
2811 return 0;
2812 }
2813 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
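/*
 * Illustrative registration sketch; my_rx_handler and my_priv are
 * hypothetical names, not part of this file:
 *
 *	rtnl_lock();
 *	err = netdev_rx_handler_register(dev, my_rx_handler, my_priv);
 *	rtnl_unlock();
 *
 * The handler is called from __netif_receive_skb() for every packet on
 * the device and returns either an skb (possibly with skb->dev changed)
 * for further processing, or NULL if it consumed the packet.
 */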
2814
2815 /**
2816 * netdev_rx_handler_unregister - unregister receive handler
2817 * @dev: device to unregister a handler from
2818 *
2819 * Unregister a receive handler from a device.
2820 *
2821 * The caller must hold the rtnl_mutex.
2822 */
2823 void netdev_rx_handler_unregister(struct net_device *dev)
2824 {
2825
2826 ASSERT_RTNL();
2827 rcu_assign_pointer(dev->rx_handler, NULL);
2828 rcu_assign_pointer(dev->rx_handler_data, NULL);
2829 }
2830 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2831
2832 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2833 struct net_device *master)
2834 {
2835 if (skb->pkt_type == PACKET_HOST) {
2836 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2837
2838 memcpy(dest, master->dev_addr, ETH_ALEN);
2839 }
2840 }
2841
2842 /* On bonding slaves other than the currently active slave, suppress
2843 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2844 * ARP on active-backup slaves with arp_validate enabled.
2845 */
2846 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2847 {
2848 struct net_device *dev = skb->dev;
2849
2850 if (master->priv_flags & IFF_MASTER_ARPMON)
2851 dev->last_rx = jiffies;
2852
2853 if ((master->priv_flags & IFF_MASTER_ALB) &&
2854 (master->priv_flags & IFF_BRIDGE_PORT)) {
2855 /* Undo the address mangling. The local destination address
2856 * will always be the one the master has. This provides the right
2857 * functionality in a bridge.
2858 */
2859 skb_bond_set_mac_by_master(skb, master);
2860 }
2861
2862 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2863 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2864 skb->protocol == __cpu_to_be16(ETH_P_ARP))
2865 return 0;
2866
2867 if (master->priv_flags & IFF_MASTER_ALB) {
2868 if (skb->pkt_type != PACKET_BROADCAST &&
2869 skb->pkt_type != PACKET_MULTICAST)
2870 return 0;
2871 }
2872 if (master->priv_flags & IFF_MASTER_8023AD &&
2873 skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2874 return 0;
2875
2876 return 1;
2877 }
2878 return 0;
2879 }
2880 EXPORT_SYMBOL(__skb_bond_should_drop);
2881
2882 static int __netif_receive_skb(struct sk_buff *skb)
2883 {
2884 struct packet_type *ptype, *pt_prev;
2885 rx_handler_func_t *rx_handler;
2886 struct net_device *orig_dev;
2887 struct net_device *master;
2888 struct net_device *null_or_orig;
2889 struct net_device *orig_or_bond;
2890 int ret = NET_RX_DROP;
2891 __be16 type;
2892
2893 if (!netdev_tstamp_prequeue)
2894 net_timestamp_check(skb);
2895
2896 if (vlan_tx_tag_present(skb))
2897 vlan_hwaccel_do_receive(skb);
2898
2899 /* if we've gotten here through NAPI, check netpoll */
2900 if (netpoll_receive_skb(skb))
2901 return NET_RX_DROP;
2902
2903 if (!skb->skb_iif)
2904 skb->skb_iif = skb->dev->ifindex;
2905
2906 /*
2907 * bonding note: skbs received on inactive slaves should only
2908 * be delivered to pkt handlers that are exact matches. Also
2909 * the deliver_no_wcard flag will be set. If packet handlers
2910 * are sensitive to duplicate packets these skbs will need to
2911 * be dropped at the handler. The vlan accel path may have
2912 * already set the deliver_no_wcard flag.
2913 */
2914 null_or_orig = NULL;
2915 orig_dev = skb->dev;
2916 master = ACCESS_ONCE(orig_dev->master);
2917 if (skb->deliver_no_wcard)
2918 null_or_orig = orig_dev;
2919 else if (master) {
2920 if (skb_bond_should_drop(skb, master)) {
2921 skb->deliver_no_wcard = 1;
2922 null_or_orig = orig_dev; /* deliver only exact match */
2923 } else
2924 skb->dev = master;
2925 }
2926
2927 __this_cpu_inc(softnet_data.processed);
2928 skb_reset_network_header(skb);
2929 skb_reset_transport_header(skb);
2930 skb->mac_len = skb->network_header - skb->mac_header;
2931
2932 pt_prev = NULL;
2933
2934 rcu_read_lock();
2935
2936 #ifdef CONFIG_NET_CLS_ACT
2937 if (skb->tc_verd & TC_NCLS) {
2938 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2939 goto ncls;
2940 }
2941 #endif
2942
2943 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2944 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2945 ptype->dev == orig_dev) {
2946 if (pt_prev)
2947 ret = deliver_skb(skb, pt_prev, orig_dev);
2948 pt_prev = ptype;
2949 }
2950 }
2951
2952 #ifdef CONFIG_NET_CLS_ACT
2953 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2954 if (!skb)
2955 goto out;
2956 ncls:
2957 #endif
2958
2959 /* Handle special case of bridge or macvlan */
2960 rx_handler = rcu_dereference(skb->dev->rx_handler);
2961 if (rx_handler) {
2962 if (pt_prev) {
2963 ret = deliver_skb(skb, pt_prev, orig_dev);
2964 pt_prev = NULL;
2965 }
2966 skb = rx_handler(skb);
2967 if (!skb)
2968 goto out;
2969 }
2970
2971 /*
2972 * Make sure frames received on VLAN interfaces stacked on
2973 * bonding interfaces still make their way to any base bonding
2974 * device that may have registered for a specific ptype. The
2975 * handler may have to adjust skb->dev and orig_dev.
2976 */
2977 orig_or_bond = orig_dev;
2978 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2979 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
2980 orig_or_bond = vlan_dev_real_dev(skb->dev);
2981 }
2982
2983 type = skb->protocol;
2984 list_for_each_entry_rcu(ptype,
2985 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2986 if (ptype->type == type && (ptype->dev == null_or_orig ||
2987 ptype->dev == skb->dev || ptype->dev == orig_dev ||
2988 ptype->dev == orig_or_bond)) {
2989 if (pt_prev)
2990 ret = deliver_skb(skb, pt_prev, orig_dev);
2991 pt_prev = ptype;
2992 }
2993 }
2994
2995 if (pt_prev) {
2996 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2997 } else {
2998 kfree_skb(skb);
2999 /* Jamal, now you will not be able to escape explaining
3000 * to me how you were going to use this. :-)
3001 */
3002 ret = NET_RX_DROP;
3003 }
3004
3005 out:
3006 rcu_read_unlock();
3007 return ret;
3008 }
3009
3010 /**
3011 * netif_receive_skb - process receive buffer from network
3012 * @skb: buffer to process
3013 *
3014 * netif_receive_skb() is the main receive data processing function.
3015 * It always succeeds. The buffer may be dropped during processing
3016 * for congestion control or by the protocol layers.
3017 *
3018 * This function may only be called from softirq context and interrupts
3019 * should be enabled.
3020 *
3021 * Return values (usually ignored):
3022 * NET_RX_SUCCESS: no congestion
3023 * NET_RX_DROP: packet was dropped
3024 */
3025 int netif_receive_skb(struct sk_buff *skb)
3026 {
3027 if (netdev_tstamp_prequeue)
3028 net_timestamp_check(skb);
3029
3030 if (skb_defer_rx_timestamp(skb))
3031 return NET_RX_SUCCESS;
3032
3033 #ifdef CONFIG_RPS
3034 {
3035 struct rps_dev_flow voidflow, *rflow = &voidflow;
3036 int cpu, ret;
3037
3038 rcu_read_lock();
3039
3040 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3041
3042 if (cpu >= 0) {
3043 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3044 rcu_read_unlock();
3045 } else {
3046 rcu_read_unlock();
3047 ret = __netif_receive_skb(skb);
3048 }
3049
3050 return ret;
3051 }
3052 #else
3053 return __netif_receive_skb(skb);
3054 #endif
3055 }
3056 EXPORT_SYMBOL(netif_receive_skb);
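/*
 * Note: netif_receive_skb() is the entry point for NAPI drivers, called
 * from their poll callback either directly or via napi_gro_receive(),
 * while interrupt-driven drivers use netif_rx() above, which only queues
 * the packet to a per-CPU backlog for later processing.
 */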
3057
3058 /* Network device is going away, flush any packets still pending
3059 * Called with irqs disabled.
3060 */
3061 static void flush_backlog(void *arg)
3062 {
3063 struct net_device *dev = arg;
3064 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3065 struct sk_buff *skb, *tmp;
3066
3067 rps_lock(sd);
3068 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3069 if (skb->dev == dev) {
3070 __skb_unlink(skb, &sd->input_pkt_queue);
3071 kfree_skb(skb);
3072 input_queue_head_incr(sd);
3073 }
3074 }
3075 rps_unlock(sd);
3076
3077 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3078 if (skb->dev == dev) {
3079 __skb_unlink(skb, &sd->process_queue);
3080 kfree_skb(skb);
3081 input_queue_head_incr(sd);
3082 }
3083 }
3084 }
3085
3086 static int napi_gro_complete(struct sk_buff *skb)
3087 {
3088 struct packet_type *ptype;
3089 __be16 type = skb->protocol;
3090 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3091 int err = -ENOENT;
3092
3093 if (NAPI_GRO_CB(skb)->count == 1) {
3094 skb_shinfo(skb)->gso_size = 0;
3095 goto out;
3096 }
3097
3098 rcu_read_lock();
3099 list_for_each_entry_rcu(ptype, head, list) {
3100 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3101 continue;
3102
3103 err = ptype->gro_complete(skb);
3104 break;
3105 }
3106 rcu_read_unlock();
3107
3108 if (err) {
3109 WARN_ON(&ptype->list == head);
3110 kfree_skb(skb);
3111 return NET_RX_SUCCESS;
3112 }
3113
3114 out:
3115 return netif_receive_skb(skb);
3116 }
3117
3118 inline void napi_gro_flush(struct napi_struct *napi)
3119 {
3120 struct sk_buff *skb, *next;
3121
3122 for (skb = napi->gro_list; skb; skb = next) {
3123 next = skb->next;
3124 skb->next = NULL;
3125 napi_gro_complete(skb);
3126 }
3127
3128 napi->gro_count = 0;
3129 napi->gro_list = NULL;
3130 }
3131 EXPORT_SYMBOL(napi_gro_flush);
3132
3133 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3134 {
3135 struct sk_buff **pp = NULL;
3136 struct packet_type *ptype;
3137 __be16 type = skb->protocol;
3138 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3139 int same_flow;
3140 int mac_len;
3141 enum gro_result ret;
3142
3143 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3144 goto normal;
3145
3146 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3147 goto normal;
3148
3149 rcu_read_lock();
3150 list_for_each_entry_rcu(ptype, head, list) {
3151 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3152 continue;
3153
3154 skb_set_network_header(skb, skb_gro_offset(skb));
3155 mac_len = skb->network_header - skb->mac_header;
3156 skb->mac_len = mac_len;
3157 NAPI_GRO_CB(skb)->same_flow = 0;
3158 NAPI_GRO_CB(skb)->flush = 0;
3159 NAPI_GRO_CB(skb)->free = 0;
3160
3161 pp = ptype->gro_receive(&napi->gro_list, skb);
3162 break;
3163 }
3164 rcu_read_unlock();
3165
3166 if (&ptype->list == head)
3167 goto normal;
3168
3169 same_flow = NAPI_GRO_CB(skb)->same_flow;
3170 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3171
3172 if (pp) {
3173 struct sk_buff *nskb = *pp;
3174
3175 *pp = nskb->next;
3176 nskb->next = NULL;
3177 napi_gro_complete(nskb);
3178 napi->gro_count--;
3179 }
3180
3181 if (same_flow)
3182 goto ok;
3183
3184 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3185 goto normal;
3186
3187 napi->gro_count++;
3188 NAPI_GRO_CB(skb)->count = 1;
3189 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3190 skb->next = napi->gro_list;
3191 napi->gro_list = skb;
3192 ret = GRO_HELD;
3193
3194 pull:
3195 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3196 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3197
3198 BUG_ON(skb->end - skb->tail < grow);
3199
3200 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3201
3202 skb->tail += grow;
3203 skb->data_len -= grow;
3204
3205 skb_shinfo(skb)->frags[0].page_offset += grow;
3206 skb_shinfo(skb)->frags[0].size -= grow;
3207
3208 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3209 put_page(skb_shinfo(skb)->frags[0].page);
3210 memmove(skb_shinfo(skb)->frags,
3211 skb_shinfo(skb)->frags + 1,
3212 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3213 }
3214 }
3215
3216 ok:
3217 return ret;
3218
3219 normal:
3220 ret = GRO_NORMAL;
3221 goto pull;
3222 }
3223 EXPORT_SYMBOL(dev_gro_receive);
3224
3225 static inline gro_result_t
3226 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3227 {
3228 struct sk_buff *p;
3229
3230 for (p = napi->gro_list; p; p = p->next) {
3231 unsigned long diffs;
3232
3233 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3234 diffs |= compare_ether_header(skb_mac_header(p),
3235 skb_gro_mac_header(skb));
3236 NAPI_GRO_CB(p)->same_flow = !diffs;
3237 NAPI_GRO_CB(p)->flush = 0;
3238 }
3239
3240 return dev_gro_receive(napi, skb);
3241 }
3242
3243 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3244 {
3245 switch (ret) {
3246 case GRO_NORMAL:
3247 if (netif_receive_skb(skb))
3248 ret = GRO_DROP;
3249 break;
3250
3251 case GRO_DROP:
3252 case GRO_MERGED_FREE:
3253 kfree_skb(skb);
3254 break;
3255
3256 case GRO_HELD:
3257 case GRO_MERGED:
3258 break;
3259 }
3260
3261 return ret;
3262 }
3263 EXPORT_SYMBOL(napi_skb_finish);
3264
3265 void skb_gro_reset_offset(struct sk_buff *skb)
3266 {
3267 NAPI_GRO_CB(skb)->data_offset = 0;
3268 NAPI_GRO_CB(skb)->frag0 = NULL;
3269 NAPI_GRO_CB(skb)->frag0_len = 0;
3270
3271 if (skb->mac_header == skb->tail &&
3272 !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3273 NAPI_GRO_CB(skb)->frag0 =
3274 page_address(skb_shinfo(skb)->frags[0].page) +
3275 skb_shinfo(skb)->frags[0].page_offset;
3276 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3277 }
3278 }
3279 EXPORT_SYMBOL(skb_gro_reset_offset);
3280
3281 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3282 {
3283 skb_gro_reset_offset(skb);
3284
3285 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3286 }
3287 EXPORT_SYMBOL(napi_gro_receive);
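/*
 * Illustrative poll-callback sketch feeding GRO; my_poll() and
 * my_rx_one() are hypothetical driver functions:
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		int work = 0;
 *
 *		while (work < budget) {
 *			struct sk_buff *skb = my_rx_one(napi->dev);
 *
 *			if (!skb)
 *				break;
 *			napi_gro_receive(napi, skb);
 *			work++;
 *		}
 *		if (work < budget)
 *			napi_complete(napi);
 *		return work;
 *	}
 */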
3288
3289 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3290 {
3291 __skb_pull(skb, skb_headlen(skb));
3292 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3293
3294 napi->skb = skb;
3295 }
3296 EXPORT_SYMBOL(napi_reuse_skb);
3297
3298 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3299 {
3300 struct sk_buff *skb = napi->skb;
3301
3302 if (!skb) {
3303 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3304 if (skb)
3305 napi->skb = skb;
3306 }
3307 return skb;
3308 }
3309 EXPORT_SYMBOL(napi_get_frags);
3310
3311 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3312 gro_result_t ret)
3313 {
3314 switch (ret) {
3315 case GRO_NORMAL:
3316 case GRO_HELD:
3317 skb->protocol = eth_type_trans(skb, skb->dev);
3318
3319 if (ret == GRO_HELD)
3320 skb_gro_pull(skb, -ETH_HLEN);
3321 else if (netif_receive_skb(skb))
3322 ret = GRO_DROP;
3323 break;
3324
3325 case GRO_DROP:
3326 case GRO_MERGED_FREE:
3327 napi_reuse_skb(napi, skb);
3328 break;
3329
3330 case GRO_MERGED:
3331 break;
3332 }
3333
3334 return ret;
3335 }
3336 EXPORT_SYMBOL(napi_frags_finish);
3337
3338 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3339 {
3340 struct sk_buff *skb = napi->skb;
3341 struct ethhdr *eth;
3342 unsigned int hlen;
3343 unsigned int off;
3344
3345 napi->skb = NULL;
3346
3347 skb_reset_mac_header(skb);
3348 skb_gro_reset_offset(skb);
3349
3350 off = skb_gro_offset(skb);
3351 hlen = off + sizeof(*eth);
3352 eth = skb_gro_header_fast(skb, off);
3353 if (skb_gro_header_hard(skb, hlen)) {
3354 eth = skb_gro_header_slow(skb, hlen, off);
3355 if (unlikely(!eth)) {
3356 napi_reuse_skb(napi, skb);
3357 skb = NULL;
3358 goto out;
3359 }
3360 }
3361
3362 skb_gro_pull(skb, sizeof(*eth));
3363
3364 /*
3365 * This works because the only protocols we care about don't require
3366 * special handling. We'll fix it up properly at the end.
3367 */
3368 skb->protocol = eth->h_proto;
3369
3370 out:
3371 return skb;
3372 }
3373 EXPORT_SYMBOL(napi_frags_skb);
3374
3375 gro_result_t napi_gro_frags(struct napi_struct *napi)
3376 {
3377 struct sk_buff *skb = napi_frags_skb(napi);
3378
3379 if (!skb)
3380 return GRO_DROP;
3381
3382 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3383 }
3384 EXPORT_SYMBOL(napi_gro_frags);
3385
3386 /*
3387 * net_rps_action sends any pending IPIs for RPS.
3388 * Note: called with local irq disabled, but exits with local irq enabled.
3389 */
3390 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3391 {
3392 #ifdef CONFIG_RPS
3393 struct softnet_data *remsd = sd->rps_ipi_list;
3394
3395 if (remsd) {
3396 sd->rps_ipi_list = NULL;
3397
3398 local_irq_enable();
3399
3400 /* Send pending IPI's to kick RPS processing on remote cpus. */
3401 while (remsd) {
3402 struct softnet_data *next = remsd->rps_ipi_next;
3403
3404 if (cpu_online(remsd->cpu))
3405 __smp_call_function_single(remsd->cpu,
3406 &remsd->csd, 0);
3407 remsd = next;
3408 }
3409 } else
3410 #endif
3411 local_irq_enable();
3412 }
3413
3414 static int process_backlog(struct napi_struct *napi, int quota)
3415 {
3416 int work = 0;
3417 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3418
3419 #ifdef CONFIG_RPS
3420 /* Check if we have pending IPIs; it's better to send them now
3421 * than to wait for net_rx_action() to finish.
3422 */
3423 if (sd->rps_ipi_list) {
3424 local_irq_disable();
3425 net_rps_action_and_irq_enable(sd);
3426 }
3427 #endif
3428 napi->weight = weight_p;
3429 local_irq_disable();
3430 while (work < quota) {
3431 struct sk_buff *skb;
3432 unsigned int qlen;
3433
3434 while ((skb = __skb_dequeue(&sd->process_queue))) {
3435 local_irq_enable();
3436 __netif_receive_skb(skb);
3437 local_irq_disable();
3438 input_queue_head_incr(sd);
3439 if (++work >= quota) {
3440 local_irq_enable();
3441 return work;
3442 }
3443 }
3444
3445 rps_lock(sd);
3446 qlen = skb_queue_len(&sd->input_pkt_queue);
3447 if (qlen)
3448 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3449 &sd->process_queue);
3450
3451 if (qlen < quota - work) {
3452 /*
3453 * Inline a custom version of __napi_complete().
3454 * Only the current CPU owns and manipulates this napi,
3455 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3456 * so we can use a plain write instead of clear_bit(),
3457 * and we don't need an smp_mb() memory barrier.
3458 */
3459 list_del(&napi->poll_list);
3460 napi->state = 0;
3461
3462 quota = work + qlen;
3463 }
3464 rps_unlock(sd);
3465 }
3466 local_irq_enable();
3467
3468 return work;
3469 }
3470
3471 /**
3472 * __napi_schedule - schedule for receive
3473 * @n: entry to schedule
3474 *
3475 * The entry's receive function will be scheduled to run
3476 */
3477 void __napi_schedule(struct napi_struct *n)
3478 {
3479 unsigned long flags;
3480
3481 local_irq_save(flags);
3482 ____napi_schedule(&__get_cpu_var(softnet_data), n);
3483 local_irq_restore(flags);
3484 }
3485 EXPORT_SYMBOL(__napi_schedule);
3486
3487 void __napi_complete(struct napi_struct *n)
3488 {
3489 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3490 BUG_ON(n->gro_list);
3491
3492 list_del(&n->poll_list);
3493 smp_mb__before_clear_bit();
3494 clear_bit(NAPI_STATE_SCHED, &n->state);
3495 }
3496 EXPORT_SYMBOL(__napi_complete);
3497
3498 void napi_complete(struct napi_struct *n)
3499 {
3500 unsigned long flags;
3501
3502 /*
3503 * Don't let napi dequeue from the CPU poll list,
3504 * just in case it's running on a different CPU.
3505 */
3506 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3507 return;
3508
3509 napi_gro_flush(n);
3510 local_irq_save(flags);
3511 __napi_complete(n);
3512 local_irq_restore(flags);
3513 }
3514 EXPORT_SYMBOL(napi_complete);
3515
3516 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3517 int (*poll)(struct napi_struct *, int), int weight)
3518 {
3519 INIT_LIST_HEAD(&napi->poll_list);
3520 napi->gro_count = 0;
3521 napi->gro_list = NULL;
3522 napi->skb = NULL;
3523 napi->poll = poll;
3524 napi->weight = weight;
3525 list_add(&napi->dev_list, &dev->napi_list);
3526 napi->dev = dev;
3527 #ifdef CONFIG_NETPOLL
3528 spin_lock_init(&napi->poll_lock);
3529 napi->poll_owner = -1;
3530 #endif
3531 set_bit(NAPI_STATE_SCHED, &napi->state);
3532 }
3533 EXPORT_SYMBOL(netif_napi_add);
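/*
 * Typical driver usage (illustrative): netif_napi_add() is called once
 * at probe/setup time, e.g.
 *
 *	netif_napi_add(dev, &priv->napi, my_poll, 64);
 *
 * where priv and my_poll are hypothetical driver-private names (see the
 * poll sketch near napi_gro_receive() above).  The driver then calls
 * napi_enable() when the interface is brought up and napi_schedule()
 * from its RX interrupt.
 */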
3534
3535 void netif_napi_del(struct napi_struct *napi)
3536 {
3537 struct sk_buff *skb, *next;
3538
3539 list_del_init(&napi->dev_list);
3540 napi_free_frags(napi);
3541
3542 for (skb = napi->gro_list; skb; skb = next) {
3543 next = skb->next;
3544 skb->next = NULL;
3545 kfree_skb(skb);
3546 }
3547
3548 napi->gro_list = NULL;
3549 napi->gro_count = 0;
3550 }
3551 EXPORT_SYMBOL(netif_napi_del);
3552
3553 static void net_rx_action(struct softirq_action *h)
3554 {
3555 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3556 unsigned long time_limit = jiffies + 2;
3557 int budget = netdev_budget;
3558 void *have;
3559
3560 local_irq_disable();
3561
3562 while (!list_empty(&sd->poll_list)) {
3563 struct napi_struct *n;
3564 int work, weight;
3565
3566 /* If the softirq window is exhausted then punt.
3567 * Allow this to run for 2 jiffies, which allows
3568 * an average latency of 1.5/HZ.
3569 */
3570 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3571 goto softnet_break;
3572
3573 local_irq_enable();
3574
3575 /* Even though interrupts have been re-enabled, this
3576 * access is safe because interrupts can only add new
3577 * entries to the tail of this list, and only ->poll()
3578 * calls can remove this head entry from the list.
3579 */
3580 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3581
3582 have = netpoll_poll_lock(n);
3583
3584 weight = n->weight;
3585
3586 /* This NAPI_STATE_SCHED test is for avoiding a race
3587 * with netpoll's poll_napi(). Only the entity which
3588 * obtains the lock and sees NAPI_STATE_SCHED set will
3589 * actually make the ->poll() call. Therefore we avoid
3590 * accidentally calling ->poll() when NAPI is not scheduled.
3591 */
3592 work = 0;
3593 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3594 work = n->poll(n, weight);
3595 trace_napi_poll(n);
3596 }
3597
3598 WARN_ON_ONCE(work > weight);
3599
3600 budget -= work;
3601
3602 local_irq_disable();
3603
3604 /* Drivers must not modify the NAPI state if they
3605 * consume the entire weight. In such cases this code
3606 * still "owns" the NAPI instance and therefore can
3607 * move the instance around on the list at-will.
3608 */
3609 if (unlikely(work == weight)) {
3610 if (unlikely(napi_disable_pending(n))) {
3611 local_irq_enable();
3612 napi_complete(n);
3613 local_irq_disable();
3614 } else
3615 list_move_tail(&n->poll_list, &sd->poll_list);
3616 }
3617
3618 netpoll_poll_unlock(have);
3619 }
3620 out:
3621 net_rps_action_and_irq_enable(sd);
3622
3623 #ifdef CONFIG_NET_DMA
3624 /*
3625 * There may not be any more sk_buffs coming right now, so push
3626 * any pending DMA copies to hardware
3627 */
3628 dma_issue_pending_all();
3629 #endif
3630
3631 return;
3632
3633 softnet_break:
3634 sd->time_squeeze++;
3635 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3636 goto out;
3637 }
3638
3639 static gifconf_func_t *gifconf_list[NPROTO];
3640
3641 /**
3642 * register_gifconf - register a SIOCGIF handler
3643 * @family: Address family
3644 * @gifconf: Function handler
3645 *
3646 * Register protocol dependent address dumping routines. The handler
3647 * that is passed must not be freed or reused until it has been replaced
3648 * by another handler.
3649 */
3650 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3651 {
3652 if (family >= NPROTO)
3653 return -EINVAL;
3654 gifconf_list[family] = gifconf;
3655 return 0;
3656 }
3657 EXPORT_SYMBOL(register_gifconf);
3658
3659
3660 /*
3661 * Map an interface index to its name (SIOCGIFNAME)
3662 */
3663
3664 /*
3665 * We need this ioctl for efficient implementation of the
3666 * if_indextoname() function required by the IPv6 API. Without
3667 * it, we would have to search all the interfaces to find a
3668 * match. --pb
3669 */
3670
3671 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3672 {
3673 struct net_device *dev;
3674 struct ifreq ifr;
3675
3676 /*
3677 * Fetch the caller's info block.
3678 */
3679
3680 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3681 return -EFAULT;
3682
3683 rcu_read_lock();
3684 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3685 if (!dev) {
3686 rcu_read_unlock();
3687 return -ENODEV;
3688 }
3689
3690 strcpy(ifr.ifr_name, dev->name);
3691 rcu_read_unlock();
3692
3693 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3694 return -EFAULT;
3695 return 0;
3696 }
3697
3698 /*
3699 * Perform a SIOCGIFCONF call. This structure will change
3700 * size eventually, and there is nothing I can do about it.
3701 * Thus we will need a 'compatibility mode'.
3702 */
3703
3704 static int dev_ifconf(struct net *net, char __user *arg)
3705 {
3706 struct ifconf ifc;
3707 struct net_device *dev;
3708 char __user *pos;
3709 int len;
3710 int total;
3711 int i;
3712
3713 /*
3714 * Fetch the caller's info block.
3715 */
3716
3717 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3718 return -EFAULT;
3719
3720 pos = ifc.ifc_buf;
3721 len = ifc.ifc_len;
3722
3723 /*
3724 * Loop over the interfaces, and write an info block for each.
3725 */
3726
3727 total = 0;
3728 for_each_netdev(net, dev) {
3729 for (i = 0; i < NPROTO; i++) {
3730 if (gifconf_list[i]) {
3731 int done;
3732 if (!pos)
3733 done = gifconf_list[i](dev, NULL, 0);
3734 else
3735 done = gifconf_list[i](dev, pos + total,
3736 len - total);
3737 if (done < 0)
3738 return -EFAULT;
3739 total += done;
3740 }
3741 }
3742 }
3743
3744 /*
3745 * All done. Write the updated control block back to the caller.
3746 */
3747 ifc.ifc_len = total;
3748
3749 /*
3750 * Both BSD and Solaris return 0 here, so we do too.
3751 */
3752 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3753 }
3754
3755 #ifdef CONFIG_PROC_FS
3756 /*
3757 * This is invoked by the /proc filesystem handler to display a device
3758 * in detail.
3759 */
3760 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3761 __acquires(RCU)
3762 {
3763 struct net *net = seq_file_net(seq);
3764 loff_t off;
3765 struct net_device *dev;
3766
3767 rcu_read_lock();
3768 if (!*pos)
3769 return SEQ_START_TOKEN;
3770
3771 off = 1;
3772 for_each_netdev_rcu(net, dev)
3773 if (off++ == *pos)
3774 return dev;
3775
3776 return NULL;
3777 }
3778
3779 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3780 {
3781 struct net_device *dev = (v == SEQ_START_TOKEN) ?
3782 first_net_device(seq_file_net(seq)) :
3783 next_net_device((struct net_device *)v);
3784
3785 ++*pos;
3786 return rcu_dereference(dev);
3787 }
3788
3789 void dev_seq_stop(struct seq_file *seq, void *v)
3790 __releases(RCU)
3791 {
3792 rcu_read_unlock();
3793 }
3794
3795 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3796 {
3797 struct rtnl_link_stats64 temp;
3798 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
3799
3800 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3801 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3802 dev->name, stats->rx_bytes, stats->rx_packets,
3803 stats->rx_errors,
3804 stats->rx_dropped + stats->rx_missed_errors,
3805 stats->rx_fifo_errors,
3806 stats->rx_length_errors + stats->rx_over_errors +
3807 stats->rx_crc_errors + stats->rx_frame_errors,
3808 stats->rx_compressed, stats->multicast,
3809 stats->tx_bytes, stats->tx_packets,
3810 stats->tx_errors, stats->tx_dropped,
3811 stats->tx_fifo_errors, stats->collisions,
3812 stats->tx_carrier_errors +
3813 stats->tx_aborted_errors +
3814 stats->tx_window_errors +
3815 stats->tx_heartbeat_errors,
3816 stats->tx_compressed);
3817 }
3818
3819 /*
3820 * Called from the PROCfs module. This now uses the new arbitrary-sized
3821 * /proc/net interface to create /proc/net/dev.
3822 */
3823 static int dev_seq_show(struct seq_file *seq, void *v)
3824 {
3825 if (v == SEQ_START_TOKEN)
3826 seq_puts(seq, "Inter-| Receive "
3827 " | Transmit\n"
3828 " face |bytes packets errs drop fifo frame "
3829 "compressed multicast|bytes packets errs "
3830 "drop fifo colls carrier compressed\n");
3831 else
3832 dev_seq_printf_stats(seq, v);
3833 return 0;
3834 }
3835
3836 static struct softnet_data *softnet_get_online(loff_t *pos)
3837 {
3838 struct softnet_data *sd = NULL;
3839
3840 while (*pos < nr_cpu_ids)
3841 if (cpu_online(*pos)) {
3842 sd = &per_cpu(softnet_data, *pos);
3843 break;
3844 } else
3845 ++*pos;
3846 return sd;
3847 }
3848
3849 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3850 {
3851 return softnet_get_online(pos);
3852 }
3853
3854 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3855 {
3856 ++*pos;
3857 return softnet_get_online(pos);
3858 }
3859
3860 static void softnet_seq_stop(struct seq_file *seq, void *v)
3861 {
3862 }
3863
3864 static int softnet_seq_show(struct seq_file *seq, void *v)
3865 {
3866 struct softnet_data *sd = v;
3867
3868 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3869 sd->processed, sd->dropped, sd->time_squeeze, 0,
3870 0, 0, 0, 0, /* was fastroute */
3871 sd->cpu_collision, sd->received_rps);
3872 return 0;
3873 }
3874
3875 static const struct seq_operations dev_seq_ops = {
3876 .start = dev_seq_start,
3877 .next = dev_seq_next,
3878 .stop = dev_seq_stop,
3879 .show = dev_seq_show,
3880 };
3881
3882 static int dev_seq_open(struct inode *inode, struct file *file)
3883 {
3884 return seq_open_net(inode, file, &dev_seq_ops,
3885 sizeof(struct seq_net_private));
3886 }
3887
3888 static const struct file_operations dev_seq_fops = {
3889 .owner = THIS_MODULE,
3890 .open = dev_seq_open,
3891 .read = seq_read,
3892 .llseek = seq_lseek,
3893 .release = seq_release_net,
3894 };
3895
3896 static const struct seq_operations softnet_seq_ops = {
3897 .start = softnet_seq_start,
3898 .next = softnet_seq_next,
3899 .stop = softnet_seq_stop,
3900 .show = softnet_seq_show,
3901 };
3902
3903 static int softnet_seq_open(struct inode *inode, struct file *file)
3904 {
3905 return seq_open(file, &softnet_seq_ops);
3906 }
3907
3908 static const struct file_operations softnet_seq_fops = {
3909 .owner = THIS_MODULE,
3910 .open = softnet_seq_open,
3911 .read = seq_read,
3912 .llseek = seq_lseek,
3913 .release = seq_release,
3914 };
3915
3916 static void *ptype_get_idx(loff_t pos)
3917 {
3918 struct packet_type *pt = NULL;
3919 loff_t i = 0;
3920 int t;
3921
3922 list_for_each_entry_rcu(pt, &ptype_all, list) {
3923 if (i == pos)
3924 return pt;
3925 ++i;
3926 }
3927
3928 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3929 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3930 if (i == pos)
3931 return pt;
3932 ++i;
3933 }
3934 }
3935 return NULL;
3936 }
3937
3938 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3939 __acquires(RCU)
3940 {
3941 rcu_read_lock();
3942 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3943 }
3944
3945 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3946 {
3947 struct packet_type *pt;
3948 struct list_head *nxt;
3949 int hash;
3950
3951 ++*pos;
3952 if (v == SEQ_START_TOKEN)
3953 return ptype_get_idx(0);
3954
3955 pt = v;
3956 nxt = pt->list.next;
3957 if (pt->type == htons(ETH_P_ALL)) {
3958 if (nxt != &ptype_all)
3959 goto found;
3960 hash = 0;
3961 nxt = ptype_base[0].next;
3962 } else
3963 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3964
3965 while (nxt == &ptype_base[hash]) {
3966 if (++hash >= PTYPE_HASH_SIZE)
3967 return NULL;
3968 nxt = ptype_base[hash].next;
3969 }
3970 found:
3971 return list_entry(nxt, struct packet_type, list);
3972 }
3973
3974 static void ptype_seq_stop(struct seq_file *seq, void *v)
3975 __releases(RCU)
3976 {
3977 rcu_read_unlock();
3978 }
3979
3980 static int ptype_seq_show(struct seq_file *seq, void *v)
3981 {
3982 struct packet_type *pt = v;
3983
3984 if (v == SEQ_START_TOKEN)
3985 seq_puts(seq, "Type Device Function\n");
3986 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3987 if (pt->type == htons(ETH_P_ALL))
3988 seq_puts(seq, "ALL ");
3989 else
3990 seq_printf(seq, "%04x", ntohs(pt->type));
3991
3992 seq_printf(seq, " %-8s %pF\n",
3993 pt->dev ? pt->dev->name : "", pt->func);
3994 }
3995
3996 return 0;
3997 }
3998
3999 static const struct seq_operations ptype_seq_ops = {
4000 .start = ptype_seq_start,
4001 .next = ptype_seq_next,
4002 .stop = ptype_seq_stop,
4003 .show = ptype_seq_show,
4004 };
4005
4006 static int ptype_seq_open(struct inode *inode, struct file *file)
4007 {
4008 return seq_open_net(inode, file, &ptype_seq_ops,
4009 sizeof(struct seq_net_private));
4010 }
4011
4012 static const struct file_operations ptype_seq_fops = {
4013 .owner = THIS_MODULE,
4014 .open = ptype_seq_open,
4015 .read = seq_read,
4016 .llseek = seq_lseek,
4017 .release = seq_release_net,
4018 };
4019
4020
4021 static int __net_init dev_proc_net_init(struct net *net)
4022 {
4023 int rc = -ENOMEM;
4024
4025 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4026 goto out;
4027 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4028 goto out_dev;
4029 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4030 goto out_softnet;
4031
4032 if (wext_proc_init(net))
4033 goto out_ptype;
4034 rc = 0;
4035 out:
4036 return rc;
4037 out_ptype:
4038 proc_net_remove(net, "ptype");
4039 out_softnet:
4040 proc_net_remove(net, "softnet_stat");
4041 out_dev:
4042 proc_net_remove(net, "dev");
4043 goto out;
4044 }
4045
4046 static void __net_exit dev_proc_net_exit(struct net *net)
4047 {
4048 wext_proc_exit(net);
4049
4050 proc_net_remove(net, "ptype");
4051 proc_net_remove(net, "softnet_stat");
4052 proc_net_remove(net, "dev");
4053 }
4054
4055 static struct pernet_operations __net_initdata dev_proc_ops = {
4056 .init = dev_proc_net_init,
4057 .exit = dev_proc_net_exit,
4058 };
4059
4060 static int __init dev_proc_init(void)
4061 {
4062 return register_pernet_subsys(&dev_proc_ops);
4063 }
4064 #else
4065 #define dev_proc_init() 0
4066 #endif /* CONFIG_PROC_FS */
4067
4068
4069 /**
4070 * netdev_set_master - set up master/slave pair
4071 * @slave: slave device
4072 * @master: new master device
4073 *
4074 * Changes the master device of the slave. Pass %NULL to break the
4075 * bonding. The caller must hold the RTNL semaphore. On a failure
4076 * a negative errno code is returned. On success the reference counts
4077 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4078 * function returns zero.
4079 */
4080 int netdev_set_master(struct net_device *slave, struct net_device *master)
4081 {
4082 struct net_device *old = slave->master;
4083
4084 ASSERT_RTNL();
4085
4086 if (master) {
4087 if (old)
4088 return -EBUSY;
4089 dev_hold(master);
4090 }
4091
4092 slave->master = master;
4093
4094 if (old) {
4095 synchronize_net();
4096 dev_put(old);
4097 }
4098 if (master)
4099 slave->flags |= IFF_SLAVE;
4100 else
4101 slave->flags &= ~IFF_SLAVE;
4102
4103 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4104 return 0;
4105 }
4106 EXPORT_SYMBOL(netdev_set_master);
4107
4108 static void dev_change_rx_flags(struct net_device *dev, int flags)
4109 {
4110 const struct net_device_ops *ops = dev->netdev_ops;
4111
4112 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4113 ops->ndo_change_rx_flags(dev, flags);
4114 }
4115
4116 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4117 {
4118 unsigned short old_flags = dev->flags;
4119 uid_t uid;
4120 gid_t gid;
4121
4122 ASSERT_RTNL();
4123
4124 dev->flags |= IFF_PROMISC;
4125 dev->promiscuity += inc;
4126 if (dev->promiscuity == 0) {
4127 /*
4128 * Avoid overflow.
4129 	 * If inc causes an overflow, leave promisc untouched and return an error.
4130 */
4131 if (inc < 0)
4132 dev->flags &= ~IFF_PROMISC;
4133 else {
4134 dev->promiscuity -= inc;
4135 printk(KERN_WARNING "%s: promiscuity touches roof, "
4136 "set promiscuity failed, promiscuity feature "
4137 "of device might be broken.\n", dev->name);
4138 return -EOVERFLOW;
4139 }
4140 }
4141 if (dev->flags != old_flags) {
4142 printk(KERN_INFO "device %s %s promiscuous mode\n",
4143 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4144 "left");
4145 if (audit_enabled) {
4146 current_uid_gid(&uid, &gid);
4147 audit_log(current->audit_context, GFP_ATOMIC,
4148 AUDIT_ANOM_PROMISCUOUS,
4149 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4150 dev->name, (dev->flags & IFF_PROMISC),
4151 (old_flags & IFF_PROMISC),
4152 audit_get_loginuid(current),
4153 uid, gid,
4154 audit_get_sessionid(current));
4155 }
4156
4157 dev_change_rx_flags(dev, IFF_PROMISC);
4158 }
4159 return 0;
4160 }
4161
4162 /**
4163 * dev_set_promiscuity - update promiscuity count on a device
4164 * @dev: device
4165 * @inc: modifier
4166 *
4167 * Add or remove promiscuity from a device. While the count in the device
4168 * remains above zero the interface remains promiscuous. Once it hits zero
4169 * the device reverts back to normal filtering operation. A negative inc
4170 * value is used to drop promiscuity on the device.
4171 * Return 0 if successful or a negative errno code on error.
4172 */
4173 int dev_set_promiscuity(struct net_device *dev, int inc)
4174 {
4175 unsigned short old_flags = dev->flags;
4176 int err;
4177
4178 err = __dev_set_promiscuity(dev, inc);
4179 if (err < 0)
4180 return err;
4181 if (dev->flags != old_flags)
4182 dev_set_rx_mode(dev);
4183 return err;
4184 }
4185 EXPORT_SYMBOL(dev_set_promiscuity);
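/*
 * Editorial example (not part of the original file): a minimal sketch of how
 * a capture-style user might toggle promiscuous mode.  dev_set_promiscuity()
 * must be called with the RTNL lock held; "example_tap_enable" is a
 * hypothetical helper name used only for illustration.
 *
 *	static int example_tap_enable(struct net_device *dev)
 *	{
 *		int err;
 *
 *		rtnl_lock();
 *		err = dev_set_promiscuity(dev, 1);
 *		rtnl_unlock();
 *		return err;
 *	}
 *
 * The matching disable path passes -1, so the device only drops out of
 * promiscuous mode once the last user has released its increment.
 */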
4186
4187 /**
4188 * dev_set_allmulti - update allmulti count on a device
4189 * @dev: device
4190 * @inc: modifier
4191 *
4192 * Add or remove reception of all multicast frames to a device. While the
4193 * count in the device remains above zero the interface remains listening
4194 	 * to all multicast frames. Once it hits zero the device reverts back to normal
4195 * filtering operation. A negative @inc value is used to drop the counter
4196 * when releasing a resource needing all multicasts.
4197 * Return 0 if successful or a negative errno code on error.
4198 */
4199
4200 int dev_set_allmulti(struct net_device *dev, int inc)
4201 {
4202 unsigned short old_flags = dev->flags;
4203
4204 ASSERT_RTNL();
4205
4206 dev->flags |= IFF_ALLMULTI;
4207 dev->allmulti += inc;
4208 if (dev->allmulti == 0) {
4209 /*
4210 * Avoid overflow.
4211 	 * If inc causes an overflow, leave allmulti untouched and return an error.
4212 */
4213 if (inc < 0)
4214 dev->flags &= ~IFF_ALLMULTI;
4215 else {
4216 dev->allmulti -= inc;
4217 printk(KERN_WARNING "%s: allmulti touches roof, "
4218 "set allmulti failed, allmulti feature of "
4219 "device might be broken.\n", dev->name);
4220 return -EOVERFLOW;
4221 }
4222 }
4223 if (dev->flags ^ old_flags) {
4224 dev_change_rx_flags(dev, IFF_ALLMULTI);
4225 dev_set_rx_mode(dev);
4226 }
4227 return 0;
4228 }
4229 EXPORT_SYMBOL(dev_set_allmulti);
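/*
 * Editorial example (not part of the original file): code that needs every
 * multicast frame for a while, e.g. while a routing daemon is attached,
 * brackets that period with a +1/-1 pair, again under the RTNL lock:
 *
 *	rtnl_lock();
 *	err = dev_set_allmulti(dev, 1);
 *	rtnl_unlock();
 *
 * and later dev_set_allmulti(dev, -1) on the same device.  Like the
 * promiscuity count, this is reference-like: IFF_ALLMULTI is only cleared
 * when the last caller drops its increment.
 */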
4230
4231 /*
4232 * Upload unicast and multicast address lists to device and
4233 * configure RX filtering. When the device doesn't support unicast
4234 * filtering it is put in promiscuous mode while unicast addresses
4235 * are present.
4236 */
4237 void __dev_set_rx_mode(struct net_device *dev)
4238 {
4239 const struct net_device_ops *ops = dev->netdev_ops;
4240
4241 /* dev_open will call this function so the list will stay sane. */
4242 if (!(dev->flags&IFF_UP))
4243 return;
4244
4245 if (!netif_device_present(dev))
4246 return;
4247
4248 if (ops->ndo_set_rx_mode)
4249 ops->ndo_set_rx_mode(dev);
4250 else {
4251 		/* Unicast address changes may only happen under the rtnl,
4252 * therefore calling __dev_set_promiscuity here is safe.
4253 */
4254 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4255 __dev_set_promiscuity(dev, 1);
4256 dev->uc_promisc = 1;
4257 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4258 __dev_set_promiscuity(dev, -1);
4259 dev->uc_promisc = 0;
4260 }
4261
4262 if (ops->ndo_set_multicast_list)
4263 ops->ndo_set_multicast_list(dev);
4264 }
4265 }
4266
4267 void dev_set_rx_mode(struct net_device *dev)
4268 {
4269 netif_addr_lock_bh(dev);
4270 __dev_set_rx_mode(dev);
4271 netif_addr_unlock_bh(dev);
4272 }
4273
4274 /**
4275 * dev_get_flags - get flags reported to userspace
4276 * @dev: device
4277 *
4278 * Get the combination of flag bits exported through APIs to userspace.
4279 */
4280 unsigned dev_get_flags(const struct net_device *dev)
4281 {
4282 unsigned flags;
4283
4284 flags = (dev->flags & ~(IFF_PROMISC |
4285 IFF_ALLMULTI |
4286 IFF_RUNNING |
4287 IFF_LOWER_UP |
4288 IFF_DORMANT)) |
4289 (dev->gflags & (IFF_PROMISC |
4290 IFF_ALLMULTI));
4291
4292 if (netif_running(dev)) {
4293 if (netif_oper_up(dev))
4294 flags |= IFF_RUNNING;
4295 if (netif_carrier_ok(dev))
4296 flags |= IFF_LOWER_UP;
4297 if (netif_dormant(dev))
4298 flags |= IFF_DORMANT;
4299 }
4300
4301 return flags;
4302 }
4303 EXPORT_SYMBOL(dev_get_flags);
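/*
 * Editorial example (not part of the original file): dev_get_flags() folds
 * the operational state into the flag word, so a caller can test
 * IFF_RUNNING/IFF_LOWER_UP instead of inspecting the operstate directly.
 * A minimal sketch, assuming the caller already holds a reference on dev:
 *
 *	unsigned flags = dev_get_flags(dev);
 *
 *	if ((flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING))
 *		netdev_info(dev, "interface is up and operational\n");
 */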
4304
4305 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4306 {
4307 int old_flags = dev->flags;
4308 int ret;
4309
4310 ASSERT_RTNL();
4311
4312 /*
4313 * Set the flags on our device.
4314 */
4315
4316 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4317 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4318 IFF_AUTOMEDIA)) |
4319 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4320 IFF_ALLMULTI));
4321
4322 /*
4323 * Load in the correct multicast list now the flags have changed.
4324 */
4325
4326 if ((old_flags ^ flags) & IFF_MULTICAST)
4327 dev_change_rx_flags(dev, IFF_MULTICAST);
4328
4329 dev_set_rx_mode(dev);
4330
4331 /*
4332 	 *	Have we downed the interface? We handle IFF_UP ourselves
4333 * according to user attempts to set it, rather than blindly
4334 * setting it.
4335 */
4336
4337 ret = 0;
4338 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4339 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4340
4341 if (!ret)
4342 dev_set_rx_mode(dev);
4343 }
4344
4345 if ((flags ^ dev->gflags) & IFF_PROMISC) {
4346 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4347
4348 dev->gflags ^= IFF_PROMISC;
4349 dev_set_promiscuity(dev, inc);
4350 }
4351
4352 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4353 	   is important. Some (broken) drivers set IFF_PROMISC when
4354 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
4355 */
4356 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4357 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4358
4359 dev->gflags ^= IFF_ALLMULTI;
4360 dev_set_allmulti(dev, inc);
4361 }
4362
4363 return ret;
4364 }
4365
4366 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4367 {
4368 unsigned int changes = dev->flags ^ old_flags;
4369
4370 if (changes & IFF_UP) {
4371 if (dev->flags & IFF_UP)
4372 call_netdevice_notifiers(NETDEV_UP, dev);
4373 else
4374 call_netdevice_notifiers(NETDEV_DOWN, dev);
4375 }
4376
4377 if (dev->flags & IFF_UP &&
4378 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4379 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4380 }
4381
4382 /**
4383 * dev_change_flags - change device settings
4384 * @dev: device
4385 * @flags: device state flags
4386 *
4387 * Change settings on device based state flags. The flags are
4388 * in the userspace exported format.
4389 */
4390 int dev_change_flags(struct net_device *dev, unsigned flags)
4391 {
4392 int ret, changes;
4393 int old_flags = dev->flags;
4394
4395 ret = __dev_change_flags(dev, flags);
4396 if (ret < 0)
4397 return ret;
4398
4399 changes = old_flags ^ dev->flags;
4400 if (changes)
4401 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4402
4403 __dev_notify_flags(dev, old_flags);
4404 return ret;
4405 }
4406 EXPORT_SYMBOL(dev_change_flags);
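/*
 * Editorial example (not part of the original file): the usual in-kernel
 * pattern for bringing an interface up is a read-modify-write of the flags
 * under the RTNL lock:
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	rtnl_unlock();
 *
 * dev_change_flags() takes care of calling __dev_open()/__dev_close() and
 * of sending RTM_NEWLINK when anything actually changed.
 */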
4407
4408 /**
4409 * dev_set_mtu - Change maximum transfer unit
4410 * @dev: device
4411 * @new_mtu: new transfer unit
4412 *
4413 * Change the maximum transfer size of the network device.
4414 */
4415 int dev_set_mtu(struct net_device *dev, int new_mtu)
4416 {
4417 const struct net_device_ops *ops = dev->netdev_ops;
4418 int err;
4419
4420 if (new_mtu == dev->mtu)
4421 return 0;
4422
4423 	/*	MTU must not be negative.	 */
4424 if (new_mtu < 0)
4425 return -EINVAL;
4426
4427 if (!netif_device_present(dev))
4428 return -ENODEV;
4429
4430 err = 0;
4431 if (ops->ndo_change_mtu)
4432 err = ops->ndo_change_mtu(dev, new_mtu);
4433 else
4434 dev->mtu = new_mtu;
4435
4436 if (!err && dev->flags & IFF_UP)
4437 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4438 return err;
4439 }
4440 EXPORT_SYMBOL(dev_set_mtu);
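/*
 * Editorial example (not part of the original file): a tunnel-style driver
 * that has to leave room for its own header could shrink the MTU of a lower
 * device like this.  A sketch only; "EXAMPLE_HDR_LEN" is a hypothetical
 * constant, and the call is conventionally made under the RTNL lock so the
 * NETDEV_CHANGEMTU notifiers run in the expected context.
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(lower_dev, lower_dev->mtu - EXAMPLE_HDR_LEN);
 *	rtnl_unlock();
 */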
4441
4442 /**
4443 * dev_set_mac_address - Change Media Access Control Address
4444 * @dev: device
4445 * @sa: new address
4446 *
4447 * Change the hardware (MAC) address of the device
4448 */
4449 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4450 {
4451 const struct net_device_ops *ops = dev->netdev_ops;
4452 int err;
4453
4454 if (!ops->ndo_set_mac_address)
4455 return -EOPNOTSUPP;
4456 if (sa->sa_family != dev->type)
4457 return -EINVAL;
4458 if (!netif_device_present(dev))
4459 return -ENODEV;
4460 err = ops->ndo_set_mac_address(dev, sa);
4461 if (!err)
4462 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4463 return err;
4464 }
4465 EXPORT_SYMBOL(dev_set_mac_address);
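/*
 * Editorial example (not part of the original file): callers hand in a
 * struct sockaddr whose family must match dev->type; for an Ethernet device
 * that is ARPHRD_ETHER with a 6-byte address.  "new_mac" is a hypothetical
 * u8[ETH_ALEN] buffer supplied by the caller.
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, ETH_ALEN);
 *	rtnl_lock();
 *	err = dev_set_mac_address(dev, &sa);
 *	rtnl_unlock();
 */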
4466
4467 /*
4468 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4469 */
4470 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4471 {
4472 int err;
4473 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4474
4475 if (!dev)
4476 return -ENODEV;
4477
4478 switch (cmd) {
4479 case SIOCGIFFLAGS: /* Get interface flags */
4480 ifr->ifr_flags = (short) dev_get_flags(dev);
4481 return 0;
4482
4483 case SIOCGIFMETRIC: /* Get the metric on the interface
4484 (currently unused) */
4485 ifr->ifr_metric = 0;
4486 return 0;
4487
4488 case SIOCGIFMTU: /* Get the MTU of a device */
4489 ifr->ifr_mtu = dev->mtu;
4490 return 0;
4491
4492 case SIOCGIFHWADDR:
4493 if (!dev->addr_len)
4494 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4495 else
4496 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4497 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4498 ifr->ifr_hwaddr.sa_family = dev->type;
4499 return 0;
4500
4501 case SIOCGIFSLAVE:
4502 err = -EINVAL;
4503 break;
4504
4505 case SIOCGIFMAP:
4506 ifr->ifr_map.mem_start = dev->mem_start;
4507 ifr->ifr_map.mem_end = dev->mem_end;
4508 ifr->ifr_map.base_addr = dev->base_addr;
4509 ifr->ifr_map.irq = dev->irq;
4510 ifr->ifr_map.dma = dev->dma;
4511 ifr->ifr_map.port = dev->if_port;
4512 return 0;
4513
4514 case SIOCGIFINDEX:
4515 ifr->ifr_ifindex = dev->ifindex;
4516 return 0;
4517
4518 case SIOCGIFTXQLEN:
4519 ifr->ifr_qlen = dev->tx_queue_len;
4520 return 0;
4521
4522 default:
4523 /* dev_ioctl() should ensure this case
4524 * is never reached
4525 */
4526 WARN_ON(1);
4527 err = -EINVAL;
4528 break;
4529
4530 }
4531 return err;
4532 }
4533
4534 /*
4535 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4536 */
4537 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4538 {
4539 int err;
4540 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4541 const struct net_device_ops *ops;
4542
4543 if (!dev)
4544 return -ENODEV;
4545
4546 ops = dev->netdev_ops;
4547
4548 switch (cmd) {
4549 case SIOCSIFFLAGS: /* Set interface flags */
4550 return dev_change_flags(dev, ifr->ifr_flags);
4551
4552 case SIOCSIFMETRIC: /* Set the metric on the interface
4553 (currently unused) */
4554 return -EOPNOTSUPP;
4555
4556 case SIOCSIFMTU: /* Set the MTU of a device */
4557 return dev_set_mtu(dev, ifr->ifr_mtu);
4558
4559 case SIOCSIFHWADDR:
4560 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4561
4562 case SIOCSIFHWBROADCAST:
4563 if (ifr->ifr_hwaddr.sa_family != dev->type)
4564 return -EINVAL;
4565 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4566 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4567 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4568 return 0;
4569
4570 case SIOCSIFMAP:
4571 if (ops->ndo_set_config) {
4572 if (!netif_device_present(dev))
4573 return -ENODEV;
4574 return ops->ndo_set_config(dev, &ifr->ifr_map);
4575 }
4576 return -EOPNOTSUPP;
4577
4578 case SIOCADDMULTI:
4579 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4580 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4581 return -EINVAL;
4582 if (!netif_device_present(dev))
4583 return -ENODEV;
4584 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4585
4586 case SIOCDELMULTI:
4587 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4588 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4589 return -EINVAL;
4590 if (!netif_device_present(dev))
4591 return -ENODEV;
4592 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4593
4594 case SIOCSIFTXQLEN:
4595 if (ifr->ifr_qlen < 0)
4596 return -EINVAL;
4597 dev->tx_queue_len = ifr->ifr_qlen;
4598 return 0;
4599
4600 case SIOCSIFNAME:
4601 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4602 return dev_change_name(dev, ifr->ifr_newname);
4603
4604 /*
4605 * Unknown or private ioctl
4606 */
4607 default:
4608 if ((cmd >= SIOCDEVPRIVATE &&
4609 cmd <= SIOCDEVPRIVATE + 15) ||
4610 cmd == SIOCBONDENSLAVE ||
4611 cmd == SIOCBONDRELEASE ||
4612 cmd == SIOCBONDSETHWADDR ||
4613 cmd == SIOCBONDSLAVEINFOQUERY ||
4614 cmd == SIOCBONDINFOQUERY ||
4615 cmd == SIOCBONDCHANGEACTIVE ||
4616 cmd == SIOCGMIIPHY ||
4617 cmd == SIOCGMIIREG ||
4618 cmd == SIOCSMIIREG ||
4619 cmd == SIOCBRADDIF ||
4620 cmd == SIOCBRDELIF ||
4621 cmd == SIOCSHWTSTAMP ||
4622 cmd == SIOCWANDEV) {
4623 err = -EOPNOTSUPP;
4624 if (ops->ndo_do_ioctl) {
4625 if (netif_device_present(dev))
4626 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4627 else
4628 err = -ENODEV;
4629 }
4630 } else
4631 err = -EINVAL;
4632
4633 }
4634 return err;
4635 }
4636
4637 /*
4638 * This function handles all "interface"-type I/O control requests. The actual
4639 * 'doing' part of this is dev_ifsioc above.
4640 */
4641
4642 /**
4643 * dev_ioctl - network device ioctl
4644 * @net: the applicable net namespace
4645 * @cmd: command to issue
4646 * @arg: pointer to a struct ifreq in user space
4647 *
4648 * Issue ioctl functions to devices. This is normally called by the
4649 * user space syscall interfaces but can sometimes be useful for
4650 * other purposes. The return value is the return from the syscall if
4651 * positive or a negative errno code on error.
4652 */
4653
4654 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4655 {
4656 struct ifreq ifr;
4657 int ret;
4658 char *colon;
4659
4660 /* One special case: SIOCGIFCONF takes ifconf argument
4661 and requires shared lock, because it sleeps writing
4662 to user space.
4663 */
4664
4665 if (cmd == SIOCGIFCONF) {
4666 rtnl_lock();
4667 ret = dev_ifconf(net, (char __user *) arg);
4668 rtnl_unlock();
4669 return ret;
4670 }
4671 if (cmd == SIOCGIFNAME)
4672 return dev_ifname(net, (struct ifreq __user *)arg);
4673
4674 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4675 return -EFAULT;
4676
4677 ifr.ifr_name[IFNAMSIZ-1] = 0;
4678
4679 colon = strchr(ifr.ifr_name, ':');
4680 if (colon)
4681 *colon = 0;
4682
4683 /*
4684 * See which interface the caller is talking about.
4685 */
4686
4687 switch (cmd) {
4688 /*
4689 * These ioctl calls:
4690 * - can be done by all.
4691 * - atomic and do not require locking.
4692 * - return a value
4693 */
4694 case SIOCGIFFLAGS:
4695 case SIOCGIFMETRIC:
4696 case SIOCGIFMTU:
4697 case SIOCGIFHWADDR:
4698 case SIOCGIFSLAVE:
4699 case SIOCGIFMAP:
4700 case SIOCGIFINDEX:
4701 case SIOCGIFTXQLEN:
4702 dev_load(net, ifr.ifr_name);
4703 rcu_read_lock();
4704 ret = dev_ifsioc_locked(net, &ifr, cmd);
4705 rcu_read_unlock();
4706 if (!ret) {
4707 if (colon)
4708 *colon = ':';
4709 if (copy_to_user(arg, &ifr,
4710 sizeof(struct ifreq)))
4711 ret = -EFAULT;
4712 }
4713 return ret;
4714
4715 case SIOCETHTOOL:
4716 dev_load(net, ifr.ifr_name);
4717 rtnl_lock();
4718 ret = dev_ethtool(net, &ifr);
4719 rtnl_unlock();
4720 if (!ret) {
4721 if (colon)
4722 *colon = ':';
4723 if (copy_to_user(arg, &ifr,
4724 sizeof(struct ifreq)))
4725 ret = -EFAULT;
4726 }
4727 return ret;
4728
4729 /*
4730 * These ioctl calls:
4731 * - require superuser power.
4732 * - require strict serialization.
4733 * - return a value
4734 */
4735 case SIOCGMIIPHY:
4736 case SIOCGMIIREG:
4737 case SIOCSIFNAME:
4738 if (!capable(CAP_NET_ADMIN))
4739 return -EPERM;
4740 dev_load(net, ifr.ifr_name);
4741 rtnl_lock();
4742 ret = dev_ifsioc(net, &ifr, cmd);
4743 rtnl_unlock();
4744 if (!ret) {
4745 if (colon)
4746 *colon = ':';
4747 if (copy_to_user(arg, &ifr,
4748 sizeof(struct ifreq)))
4749 ret = -EFAULT;
4750 }
4751 return ret;
4752
4753 /*
4754 * These ioctl calls:
4755 * - require superuser power.
4756 * - require strict serialization.
4757 * - do not return a value
4758 */
4759 case SIOCSIFFLAGS:
4760 case SIOCSIFMETRIC:
4761 case SIOCSIFMTU:
4762 case SIOCSIFMAP:
4763 case SIOCSIFHWADDR:
4764 case SIOCSIFSLAVE:
4765 case SIOCADDMULTI:
4766 case SIOCDELMULTI:
4767 case SIOCSIFHWBROADCAST:
4768 case SIOCSIFTXQLEN:
4769 case SIOCSMIIREG:
4770 case SIOCBONDENSLAVE:
4771 case SIOCBONDRELEASE:
4772 case SIOCBONDSETHWADDR:
4773 case SIOCBONDCHANGEACTIVE:
4774 case SIOCBRADDIF:
4775 case SIOCBRDELIF:
4776 case SIOCSHWTSTAMP:
4777 if (!capable(CAP_NET_ADMIN))
4778 return -EPERM;
4779 /* fall through */
4780 case SIOCBONDSLAVEINFOQUERY:
4781 case SIOCBONDINFOQUERY:
4782 dev_load(net, ifr.ifr_name);
4783 rtnl_lock();
4784 ret = dev_ifsioc(net, &ifr, cmd);
4785 rtnl_unlock();
4786 return ret;
4787
4788 case SIOCGIFMEM:
4789 /* Get the per device memory space. We can add this but
4790 * currently do not support it */
4791 case SIOCSIFMEM:
4792 /* Set the per device memory buffer space.
4793 * Not applicable in our case */
4794 case SIOCSIFLINK:
4795 return -EINVAL;
4796
4797 /*
4798 * Unknown or private ioctl.
4799 */
4800 default:
4801 if (cmd == SIOCWANDEV ||
4802 (cmd >= SIOCDEVPRIVATE &&
4803 cmd <= SIOCDEVPRIVATE + 15)) {
4804 dev_load(net, ifr.ifr_name);
4805 rtnl_lock();
4806 ret = dev_ifsioc(net, &ifr, cmd);
4807 rtnl_unlock();
4808 if (!ret && copy_to_user(arg, &ifr,
4809 sizeof(struct ifreq)))
4810 ret = -EFAULT;
4811 return ret;
4812 }
4813 /* Take care of Wireless Extensions */
4814 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4815 return wext_handle_ioctl(net, &ifr, cmd, arg);
4816 return -EINVAL;
4817 }
4818 }
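/*
 * Editorial example (not part of the original file): these requests normally
 * reach dev_ioctl() through an ioctl() on any socket from userspace.  A
 * minimal userspace sketch querying an interface MTU ("eth0" is just an
 * example name, error handling trimmed); it needs <sys/socket.h>,
 * <sys/ioctl.h>, <net/if.h>, <stdio.h> and <unistd.h>, and the kernel side
 * lands in dev_ifsioc_locked() above.
 *
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *	struct ifreq ifr;
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
 *	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *		printf("mtu=%d\n", ifr.ifr_mtu);
 *	close(fd);
 */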
4819
4820
4821 /**
4822 * dev_new_index - allocate an ifindex
4823 * @net: the applicable net namespace
4824 *
4825 * Returns a suitable unique value for a new device interface
4826 * number. The caller must hold the rtnl semaphore or the
4827 * dev_base_lock to be sure it remains unique.
4828 */
4829 static int dev_new_index(struct net *net)
4830 {
4831 static int ifindex;
4832 for (;;) {
4833 if (++ifindex <= 0)
4834 ifindex = 1;
4835 if (!__dev_get_by_index(net, ifindex))
4836 return ifindex;
4837 }
4838 }
4839
4840 /* Delayed registration/unregistration */
4841 static LIST_HEAD(net_todo_list);
4842
4843 static void net_set_todo(struct net_device *dev)
4844 {
4845 list_add_tail(&dev->todo_list, &net_todo_list);
4846 }
4847
4848 static void rollback_registered_many(struct list_head *head)
4849 {
4850 struct net_device *dev, *tmp;
4851
4852 BUG_ON(dev_boot_phase);
4853 ASSERT_RTNL();
4854
4855 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4856 		/* Some devices call this without ever having been registered,
4857 		 * as part of initialization unwind. Remove those
4858 		 * devices and proceed with the remaining ones.
4859 */
4860 if (dev->reg_state == NETREG_UNINITIALIZED) {
4861 pr_debug("unregister_netdevice: device %s/%p never "
4862 "was registered\n", dev->name, dev);
4863
4864 WARN_ON(1);
4865 list_del(&dev->unreg_list);
4866 continue;
4867 }
4868
4869 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4870
4871 /* If device is running, close it first. */
4872 dev_close(dev);
4873
4874 /* And unlink it from device chain. */
4875 unlist_netdevice(dev);
4876
4877 dev->reg_state = NETREG_UNREGISTERING;
4878 }
4879
4880 synchronize_net();
4881
4882 list_for_each_entry(dev, head, unreg_list) {
4883 /* Shutdown queueing discipline. */
4884 dev_shutdown(dev);
4885
4886
4887 		/* Notify protocols that we are about to destroy
4888 		   this device. They should clean up all of their state.
4889 */
4890 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4891
4892 if (!dev->rtnl_link_ops ||
4893 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4894 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4895
4896 /*
4897 * Flush the unicast and multicast chains
4898 */
4899 dev_uc_flush(dev);
4900 dev_mc_flush(dev);
4901
4902 if (dev->netdev_ops->ndo_uninit)
4903 dev->netdev_ops->ndo_uninit(dev);
4904
4905 /* Notifier chain MUST detach us from master device. */
4906 WARN_ON(dev->master);
4907
4908 /* Remove entries from kobject tree */
4909 netdev_unregister_kobject(dev);
4910 }
4911
4912 /* Process any work delayed until the end of the batch */
4913 dev = list_first_entry(head, struct net_device, unreg_list);
4914 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4915
4916 rcu_barrier();
4917
4918 list_for_each_entry(dev, head, unreg_list)
4919 dev_put(dev);
4920 }
4921
4922 static void rollback_registered(struct net_device *dev)
4923 {
4924 LIST_HEAD(single);
4925
4926 list_add(&dev->unreg_list, &single);
4927 rollback_registered_many(&single);
4928 }
4929
4930 static void __netdev_init_queue_locks_one(struct net_device *dev,
4931 struct netdev_queue *dev_queue,
4932 void *_unused)
4933 {
4934 spin_lock_init(&dev_queue->_xmit_lock);
4935 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4936 dev_queue->xmit_lock_owner = -1;
4937 }
4938
4939 static void netdev_init_queue_locks(struct net_device *dev)
4940 {
4941 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4942 }
4943
4944 unsigned long netdev_fix_features(unsigned long features, const char *name)
4945 {
4946 /* Fix illegal SG+CSUM combinations. */
4947 if ((features & NETIF_F_SG) &&
4948 !(features & NETIF_F_ALL_CSUM)) {
4949 if (name)
4950 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4951 "checksum feature.\n", name);
4952 features &= ~NETIF_F_SG;
4953 }
4954
4955 /* TSO requires that SG is present as well. */
4956 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4957 if (name)
4958 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4959 "SG feature.\n", name);
4960 features &= ~NETIF_F_TSO;
4961 }
4962
4963 if (features & NETIF_F_UFO) {
4964 if (!(features & NETIF_F_GEN_CSUM)) {
4965 if (name)
4966 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4967 "since no NETIF_F_HW_CSUM feature.\n",
4968 name);
4969 features &= ~NETIF_F_UFO;
4970 }
4971
4972 if (!(features & NETIF_F_SG)) {
4973 if (name)
4974 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4975 "since no NETIF_F_SG feature.\n", name);
4976 features &= ~NETIF_F_UFO;
4977 }
4978 }
4979
4980 return features;
4981 }
4982 EXPORT_SYMBOL(netdev_fix_features);
4983
4984 /**
4985 * netif_stacked_transfer_operstate - transfer operstate
4986 * @rootdev: the root or lower level device to transfer state from
4987 * @dev: the device to transfer operstate to
4988 *
4989 * Transfer operational state from root to device. This is normally
4990 * called when a stacking relationship exists between the root
4991  *	device and the device (a leaf device).
4992 */
4993 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4994 struct net_device *dev)
4995 {
4996 if (rootdev->operstate == IF_OPER_DORMANT)
4997 netif_dormant_on(dev);
4998 else
4999 netif_dormant_off(dev);
5000
5001 if (netif_carrier_ok(rootdev)) {
5002 if (!netif_carrier_ok(dev))
5003 netif_carrier_on(dev);
5004 } else {
5005 if (netif_carrier_ok(dev))
5006 netif_carrier_off(dev);
5007 }
5008 }
5009 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5010
5011 static int netif_alloc_rx_queues(struct net_device *dev)
5012 {
5013 #ifdef CONFIG_RPS
5014 unsigned int i, count = dev->num_rx_queues;
5015
5016 if (count) {
5017 struct netdev_rx_queue *rx;
5018
5019 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5020 if (!rx) {
5021 pr_err("netdev: Unable to allocate %u rx queues.\n",
5022 count);
5023 return -ENOMEM;
5024 }
5025 dev->_rx = rx;
5026 atomic_set(&rx->count, count);
5027
5028 /*
5029 		 * Set a pointer to the first element in the array, which holds the
5030 * reference count.
5031 */
5032 for (i = 0; i < count; i++)
5033 rx[i].first = rx;
5034 }
5035 #endif
5036 return 0;
5037 }
5038
5039 /**
5040 * register_netdevice - register a network device
5041 * @dev: device to register
5042 *
5043 * Take a completed network device structure and add it to the kernel
5044 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5045 * chain. 0 is returned on success. A negative errno code is returned
5046 * on a failure to set up the device, or if the name is a duplicate.
5047 *
5048 * Callers must hold the rtnl semaphore. You may want
5049 * register_netdev() instead of this.
5050 *
5051 * BUGS:
5052 * The locking appears insufficient to guarantee two parallel registers
5053 * will not get the same name.
5054 */
5055
5056 int register_netdevice(struct net_device *dev)
5057 {
5058 int ret;
5059 struct net *net = dev_net(dev);
5060
5061 BUG_ON(dev_boot_phase);
5062 ASSERT_RTNL();
5063
5064 might_sleep();
5065
5066 	/* When net_devices are persistent, this will be fatal. */
5067 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5068 BUG_ON(!net);
5069
5070 spin_lock_init(&dev->addr_list_lock);
5071 netdev_set_addr_lockdep_class(dev);
5072 netdev_init_queue_locks(dev);
5073
5074 dev->iflink = -1;
5075
5076 ret = netif_alloc_rx_queues(dev);
5077 if (ret)
5078 goto out;
5079
5080 /* Init, if this function is available */
5081 if (dev->netdev_ops->ndo_init) {
5082 ret = dev->netdev_ops->ndo_init(dev);
5083 if (ret) {
5084 if (ret > 0)
5085 ret = -EIO;
5086 goto out;
5087 }
5088 }
5089
5090 ret = dev_get_valid_name(dev, dev->name, 0);
5091 if (ret)
5092 goto err_uninit;
5093
5094 dev->ifindex = dev_new_index(net);
5095 if (dev->iflink == -1)
5096 dev->iflink = dev->ifindex;
5097
5098 /* Fix illegal checksum combinations */
5099 if ((dev->features & NETIF_F_HW_CSUM) &&
5100 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5101 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5102 dev->name);
5103 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5104 }
5105
5106 if ((dev->features & NETIF_F_NO_CSUM) &&
5107 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5108 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5109 dev->name);
5110 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5111 }
5112
5113 dev->features = netdev_fix_features(dev->features, dev->name);
5114
5115 /* Enable software GSO if SG is supported. */
5116 if (dev->features & NETIF_F_SG)
5117 dev->features |= NETIF_F_GSO;
5118
5119 /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5120 * vlan_dev_init() will do the dev->features check, so these features
5121 * are enabled only if supported by underlying device.
5122 */
5123 dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5124
5125 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5126 ret = notifier_to_errno(ret);
5127 if (ret)
5128 goto err_uninit;
5129
5130 ret = netdev_register_kobject(dev);
5131 if (ret)
5132 goto err_uninit;
5133 dev->reg_state = NETREG_REGISTERED;
5134
5135 /*
5136 	 *	Default initial state at registration is that the
5137 * device is present.
5138 */
5139
5140 set_bit(__LINK_STATE_PRESENT, &dev->state);
5141
5142 dev_init_scheduler(dev);
5143 dev_hold(dev);
5144 list_netdevice(dev);
5145
5146 /* Notify protocols, that a new device appeared. */
5147 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5148 ret = notifier_to_errno(ret);
5149 if (ret) {
5150 rollback_registered(dev);
5151 dev->reg_state = NETREG_UNREGISTERED;
5152 }
5153 /*
5154 * Prevent userspace races by waiting until the network
5155 * device is fully setup before sending notifications.
5156 */
5157 if (!dev->rtnl_link_ops ||
5158 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5159 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5160
5161 out:
5162 return ret;
5163
5164 err_uninit:
5165 if (dev->netdev_ops->ndo_uninit)
5166 dev->netdev_ops->ndo_uninit(dev);
5167 goto out;
5168 }
5169 EXPORT_SYMBOL(register_netdevice);
5170
5171 /**
5172 * init_dummy_netdev - init a dummy network device for NAPI
5173 * @dev: device to init
5174 *
5175  *	This takes a network device structure and initializes the minimum
5176 * amount of fields so it can be used to schedule NAPI polls without
5177 * registering a full blown interface. This is to be used by drivers
5178 * that need to tie several hardware interfaces to a single NAPI
5179 * poll scheduler due to HW limitations.
5180 */
5181 int init_dummy_netdev(struct net_device *dev)
5182 {
5183 /* Clear everything. Note we don't initialize spinlocks
5184 	 * as they aren't supposed to be taken by any of the
5185 * NAPI code and this dummy netdev is supposed to be
5186 * only ever used for NAPI polls
5187 */
5188 memset(dev, 0, sizeof(struct net_device));
5189
5190 /* make sure we BUG if trying to hit standard
5191 * register/unregister code path
5192 */
5193 dev->reg_state = NETREG_DUMMY;
5194
5195 /* initialize the ref count */
5196 atomic_set(&dev->refcnt, 1);
5197
5198 /* NAPI wants this */
5199 INIT_LIST_HEAD(&dev->napi_list);
5200
5201 /* a dummy interface is started by default */
5202 set_bit(__LINK_STATE_PRESENT, &dev->state);
5203 set_bit(__LINK_STATE_START, &dev->state);
5204
5205 return 0;
5206 }
5207 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5208
5209
5210 /**
5211 * register_netdev - register a network device
5212 * @dev: device to register
5213 *
5214 * Take a completed network device structure and add it to the kernel
5215 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5216 * chain. 0 is returned on success. A negative errno code is returned
5217 * on a failure to set up the device, or if the name is a duplicate.
5218 *
5219 * This is a wrapper around register_netdevice that takes the rtnl semaphore
5220 * and expands the device name if you passed a format string to
5221 * alloc_netdev.
5222 */
5223 int register_netdev(struct net_device *dev)
5224 {
5225 int err;
5226
5227 rtnl_lock();
5228
5229 /*
5230 * If the name is a format string the caller wants us to do a
5231 * name allocation.
5232 */
5233 if (strchr(dev->name, '%')) {
5234 err = dev_alloc_name(dev, dev->name);
5235 if (err < 0)
5236 goto out;
5237 }
5238
5239 err = register_netdevice(dev);
5240 out:
5241 rtnl_unlock();
5242 return err;
5243 }
5244 EXPORT_SYMBOL(register_netdev);
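/*
 * Editorial example (not part of the original file): a typical Ethernet
 * driver probe path pairs alloc_etherdev() (from <linux/etherdevice.h>) with
 * register_netdev() roughly as below.  "example_priv" and
 * "example_netdev_ops" are hypothetical; real drivers fill in many more
 * fields before registering.
 *
 *	struct net_device *ndev;
 *	struct example_priv *priv;
 *	int err;
 *
 *	ndev = alloc_etherdev(sizeof(*priv));
 *	if (!ndev)
 *		return -ENOMEM;
 *	priv = netdev_priv(ndev);
 *	ndev->netdev_ops = &example_netdev_ops;
 *
 *	err = register_netdev(ndev);
 *	if (err) {
 *		free_netdev(ndev);
 *		return err;
 *	}
 *
 * register_netdev() takes the RTNL lock itself, so it must not be called
 * with that lock already held; use register_netdevice() in that case.
 */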
5245
5246 /*
5247 * netdev_wait_allrefs - wait until all references are gone.
5248 *
5249 * This is called when unregistering network devices.
5250 *
5251 * Any protocol or device that holds a reference should register
5252 * for netdevice notification, and cleanup and put back the
5253 * reference if they receive an UNREGISTER event.
5254 * We can get stuck here if buggy protocols don't correctly
5255 * call dev_put.
5256 */
5257 static void netdev_wait_allrefs(struct net_device *dev)
5258 {
5259 unsigned long rebroadcast_time, warning_time;
5260
5261 linkwatch_forget_dev(dev);
5262
5263 rebroadcast_time = warning_time = jiffies;
5264 while (atomic_read(&dev->refcnt) != 0) {
5265 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5266 rtnl_lock();
5267
5268 /* Rebroadcast unregister notification */
5269 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5270 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5271 			 * should have already handled it the first time */
5272
5273 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5274 &dev->state)) {
5275 /* We must not have linkwatch events
5276 * pending on unregister. If this
5277 * happens, we simply run the queue
5278 * unscheduled, resulting in a noop
5279 * for this device.
5280 */
5281 linkwatch_run_queue();
5282 }
5283
5284 __rtnl_unlock();
5285
5286 rebroadcast_time = jiffies;
5287 }
5288
5289 msleep(250);
5290
5291 if (time_after(jiffies, warning_time + 10 * HZ)) {
5292 printk(KERN_EMERG "unregister_netdevice: "
5293 "waiting for %s to become free. Usage "
5294 "count = %d\n",
5295 dev->name, atomic_read(&dev->refcnt));
5296 warning_time = jiffies;
5297 }
5298 }
5299 }
5300
5301 /* The sequence is:
5302 *
5303 * rtnl_lock();
5304 * ...
5305 * register_netdevice(x1);
5306 * register_netdevice(x2);
5307 * ...
5308 * unregister_netdevice(y1);
5309 * unregister_netdevice(y2);
5310 * ...
5311 * rtnl_unlock();
5312 * free_netdev(y1);
5313 * free_netdev(y2);
5314 *
5315 * We are invoked by rtnl_unlock().
5316 * This allows us to deal with problems:
5317 * 1) We can delete sysfs objects which invoke hotplug
5318 * without deadlocking with linkwatch via keventd.
5319 * 2) Since we run with the RTNL semaphore not held, we can sleep
5320 * safely in order to wait for the netdev refcnt to drop to zero.
5321 *
5322 * We must not return until all unregister events added during
5323 * the interval the lock was held have been completed.
5324 */
5325 void netdev_run_todo(void)
5326 {
5327 struct list_head list;
5328
5329 /* Snapshot list, allow later requests */
5330 list_replace_init(&net_todo_list, &list);
5331
5332 __rtnl_unlock();
5333
5334 while (!list_empty(&list)) {
5335 struct net_device *dev
5336 = list_first_entry(&list, struct net_device, todo_list);
5337 list_del(&dev->todo_list);
5338
5339 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5340 printk(KERN_ERR "network todo '%s' but state %d\n",
5341 dev->name, dev->reg_state);
5342 dump_stack();
5343 continue;
5344 }
5345
5346 dev->reg_state = NETREG_UNREGISTERED;
5347
5348 on_each_cpu(flush_backlog, dev, 1);
5349
5350 netdev_wait_allrefs(dev);
5351
5352 /* paranoia */
5353 BUG_ON(atomic_read(&dev->refcnt));
5354 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5355 WARN_ON(dev->ip6_ptr);
5356 WARN_ON(dev->dn_ptr);
5357
5358 if (dev->destructor)
5359 dev->destructor(dev);
5360
5361 /* Free network device */
5362 kobject_put(&dev->dev.kobj);
5363 }
5364 }
5365
5366 /**
5367 * dev_txq_stats_fold - fold tx_queues stats
5368 * @dev: device to get statistics from
5369 * @stats: struct rtnl_link_stats64 to hold results
5370 */
5371 void dev_txq_stats_fold(const struct net_device *dev,
5372 struct rtnl_link_stats64 *stats)
5373 {
5374 u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5375 unsigned int i;
5376 struct netdev_queue *txq;
5377
5378 for (i = 0; i < dev->num_tx_queues; i++) {
5379 txq = netdev_get_tx_queue(dev, i);
5380 spin_lock_bh(&txq->_xmit_lock);
5381 tx_bytes += txq->tx_bytes;
5382 tx_packets += txq->tx_packets;
5383 tx_dropped += txq->tx_dropped;
5384 spin_unlock_bh(&txq->_xmit_lock);
5385 }
5386 if (tx_bytes || tx_packets || tx_dropped) {
5387 stats->tx_bytes = tx_bytes;
5388 stats->tx_packets = tx_packets;
5389 stats->tx_dropped = tx_dropped;
5390 }
5391 }
5392 EXPORT_SYMBOL(dev_txq_stats_fold);
5393
5394 /* Convert net_device_stats to rtnl_link_stats64. They have the same
5395 * fields in the same order, with only the type differing.
5396 */
5397 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5398 const struct net_device_stats *netdev_stats)
5399 {
5400 #if BITS_PER_LONG == 64
5401 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5402 memcpy(stats64, netdev_stats, sizeof(*stats64));
5403 #else
5404 size_t i, n = sizeof(*stats64) / sizeof(u64);
5405 const unsigned long *src = (const unsigned long *)netdev_stats;
5406 u64 *dst = (u64 *)stats64;
5407
5408 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5409 sizeof(*stats64) / sizeof(u64));
5410 for (i = 0; i < n; i++)
5411 dst[i] = src[i];
5412 #endif
5413 }
5414
5415 /**
5416 * dev_get_stats - get network device statistics
5417 * @dev: device to get statistics from
5418 * @storage: place to store stats
5419 *
5420 * Get network statistics from device. Return @storage.
5421 * The device driver may provide its own method by setting
5422 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5423 * otherwise the internal statistics structure is used.
5424 */
5425 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5426 struct rtnl_link_stats64 *storage)
5427 {
5428 const struct net_device_ops *ops = dev->netdev_ops;
5429
5430 if (ops->ndo_get_stats64) {
5431 memset(storage, 0, sizeof(*storage));
5432 return ops->ndo_get_stats64(dev, storage);
5433 }
5434 if (ops->ndo_get_stats) {
5435 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5436 return storage;
5437 }
5438 netdev_stats_to_stats64(storage, &dev->stats);
5439 dev_txq_stats_fold(dev, storage);
5440 return storage;
5441 }
5442 EXPORT_SYMBOL(dev_get_stats);
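/*
 * Editorial example (not part of the original file): callers provide the
 * storage and may use the returned pointer directly.  A minimal sketch,
 * assuming the caller holds a reference on dev; dev_get_stats() fills the
 * whole structure itself, so no memset is needed here.
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	netdev_info(dev, "rx %llu tx %llu packets\n",
 *		    (unsigned long long)stats.rx_packets,
 *		    (unsigned long long)stats.tx_packets);
 */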
5443
5444 static void netdev_init_one_queue(struct net_device *dev,
5445 struct netdev_queue *queue,
5446 void *_unused)
5447 {
5448 queue->dev = dev;
5449 }
5450
5451 static void netdev_init_queues(struct net_device *dev)
5452 {
5453 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5454 spin_lock_init(&dev->tx_global_lock);
5455 }
5456
5457 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5458 {
5459 struct netdev_queue *queue = dev_ingress_queue(dev);
5460
5461 #ifdef CONFIG_NET_CLS_ACT
5462 if (queue)
5463 return queue;
5464 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5465 if (!queue)
5466 return NULL;
5467 netdev_init_one_queue(dev, queue, NULL);
5468 __netdev_init_queue_locks_one(dev, queue, NULL);
5469 queue->qdisc = &noop_qdisc;
5470 queue->qdisc_sleeping = &noop_qdisc;
5471 rcu_assign_pointer(dev->ingress_queue, queue);
5472 #endif
5473 return queue;
5474 }
5475
5476 /**
5477 * alloc_netdev_mq - allocate network device
5478 * @sizeof_priv: size of private data to allocate space for
5479 * @name: device name format string
5480 * @setup: callback to initialize device
5481 * @queue_count: the number of subqueues to allocate
5482 *
5483 * Allocates a struct net_device with private data area for driver use
5484  *	and performs basic initialization. Also allocates subqueue structs
5485 * for each queue on the device at the end of the netdevice.
5486 */
5487 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5488 void (*setup)(struct net_device *), unsigned int queue_count)
5489 {
5490 struct netdev_queue *tx;
5491 struct net_device *dev;
5492 size_t alloc_size;
5493 struct net_device *p;
5494
5495 BUG_ON(strlen(name) >= sizeof(dev->name));
5496
5497 alloc_size = sizeof(struct net_device);
5498 if (sizeof_priv) {
5499 /* ensure 32-byte alignment of private area */
5500 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5501 alloc_size += sizeof_priv;
5502 }
5503 /* ensure 32-byte alignment of whole construct */
5504 alloc_size += NETDEV_ALIGN - 1;
5505
5506 p = kzalloc(alloc_size, GFP_KERNEL);
5507 if (!p) {
5508 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5509 return NULL;
5510 }
5511
5512 tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5513 if (!tx) {
5514 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5515 "tx qdiscs.\n");
5516 goto free_p;
5517 }
5518
5519
5520 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5521 dev->padded = (char *)dev - (char *)p;
5522
5523 if (dev_addr_init(dev))
5524 goto free_tx;
5525
5526 dev_mc_init(dev);
5527 dev_uc_init(dev);
5528
5529 dev_net_set(dev, &init_net);
5530
5531 dev->_tx = tx;
5532 dev->num_tx_queues = queue_count;
5533 dev->real_num_tx_queues = queue_count;
5534
5535 #ifdef CONFIG_RPS
5536 dev->num_rx_queues = queue_count;
5537 dev->real_num_rx_queues = queue_count;
5538 #endif
5539
5540 dev->gso_max_size = GSO_MAX_SIZE;
5541
5542 netdev_init_queues(dev);
5543
5544 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5545 dev->ethtool_ntuple_list.count = 0;
5546 INIT_LIST_HEAD(&dev->napi_list);
5547 INIT_LIST_HEAD(&dev->unreg_list);
5548 INIT_LIST_HEAD(&dev->link_watch_list);
5549 dev->priv_flags = IFF_XMIT_DST_RELEASE;
5550 setup(dev);
5551 strcpy(dev->name, name);
5552 return dev;
5553
5554 free_tx:
5555 kfree(tx);
5556 free_p:
5557 kfree(p);
5558 return NULL;
5559 }
5560 EXPORT_SYMBOL(alloc_netdev_mq);
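/*
 * Editorial example (not part of the original file): a driver for hardware
 * with four transmit queues might allocate its device like this, letting
 * ether_setup() (from <linux/etherdevice.h>) fill in the Ethernet defaults.
 * "example_priv" is a hypothetical private structure.
 *
 *	struct net_device *ndev;
 *
 *	ndev = alloc_netdev_mq(sizeof(struct example_priv), "eth%d",
 *			       ether_setup, 4);
 *	if (!ndev)
 *		return -ENOMEM;
 *
 * The "%d" in the name is expanded later by register_netdev(); the single
 * queue case is what the alloc_etherdev() convenience macro boils down to.
 */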
5561
5562 /**
5563 * free_netdev - free network device
5564 * @dev: device
5565 *
5566 * This function does the last stage of destroying an allocated device
5567 * interface. The reference to the device object is released.
5568 * If this is the last reference then it will be freed.
5569 */
5570 void free_netdev(struct net_device *dev)
5571 {
5572 struct napi_struct *p, *n;
5573
5574 release_net(dev_net(dev));
5575
5576 kfree(dev->_tx);
5577
5578 kfree(rcu_dereference_raw(dev->ingress_queue));
5579
5580 /* Flush device addresses */
5581 dev_addr_flush(dev);
5582
5583 /* Clear ethtool n-tuple list */
5584 ethtool_ntuple_flush(dev);
5585
5586 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5587 netif_napi_del(p);
5588
5589 /* Compatibility with error handling in drivers */
5590 if (dev->reg_state == NETREG_UNINITIALIZED) {
5591 kfree((char *)dev - dev->padded);
5592 return;
5593 }
5594
5595 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5596 dev->reg_state = NETREG_RELEASED;
5597
5598 /* will free via device release */
5599 put_device(&dev->dev);
5600 }
5601 EXPORT_SYMBOL(free_netdev);
5602
5603 /**
5604 * synchronize_net - Synchronize with packet receive processing
5605 *
5606 * Wait for packets currently being received to be done.
5607 * Does not block later packets from starting.
5608 */
5609 void synchronize_net(void)
5610 {
5611 might_sleep();
5612 synchronize_rcu();
5613 }
5614 EXPORT_SYMBOL(synchronize_net);
5615
5616 /**
5617 * unregister_netdevice_queue - remove device from the kernel
5618 * @dev: device
5619 * @head: list
5620 *
5621 * This function shuts down a device interface and removes it
5622 * from the kernel tables.
5623  *	If @head is not NULL, the device is queued to be unregistered later.
5624 *
5625 * Callers must hold the rtnl semaphore. You may want
5626 * unregister_netdev() instead of this.
5627 */
5628
5629 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5630 {
5631 ASSERT_RTNL();
5632
5633 if (head) {
5634 list_move_tail(&dev->unreg_list, head);
5635 } else {
5636 rollback_registered(dev);
5637 /* Finish processing unregister after unlock */
5638 net_set_todo(dev);
5639 }
5640 }
5641 EXPORT_SYMBOL(unregister_netdevice_queue);
5642
5643 /**
5644 * unregister_netdevice_many - unregister many devices
5645 * @head: list of devices
5646 */
5647 void unregister_netdevice_many(struct list_head *head)
5648 {
5649 struct net_device *dev;
5650
5651 if (!list_empty(head)) {
5652 rollback_registered_many(head);
5653 list_for_each_entry(dev, head, unreg_list)
5654 net_set_todo(dev);
5655 }
5656 }
5657 EXPORT_SYMBOL(unregister_netdevice_many);
5658
5659 /**
5660 * unregister_netdev - remove device from the kernel
5661 * @dev: device
5662 *
5663 * This function shuts down a device interface and removes it
5664 * from the kernel tables.
5665 *
5666 * This is just a wrapper for unregister_netdevice that takes
5667 * the rtnl semaphore. In general you want to use this and not
5668 * unregister_netdevice.
5669 */
5670 void unregister_netdev(struct net_device *dev)
5671 {
5672 rtnl_lock();
5673 unregister_netdevice(dev);
5674 rtnl_unlock();
5675 }
5676 EXPORT_SYMBOL(unregister_netdev);
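/*
 * Editorial example (not part of the original file): the teardown half of
 * the register_netdev() sketch earlier.  A driver remove/exit path normally
 * unregisters first (which waits for all outstanding references via
 * netdev_run_todo()) and only then frees the structure:
 *
 *	unregister_netdev(ndev);
 *	free_netdev(ndev);
 *
 * Calling free_netdev() on a still-registered device would trip the
 * BUG_ON(dev->reg_state != NETREG_UNREGISTERED) check above.
 */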
5677
5678 /**
5679  *	dev_change_net_namespace - move device to a different network namespace
5680 * @dev: device
5681 * @net: network namespace
5682 * @pat: If not NULL name pattern to try if the current device name
5683 * is already taken in the destination network namespace.
5684 *
5685 * This function shuts down a device interface and moves it
5686 * to a new network namespace. On success 0 is returned, on
5687  *	a failure a negative errno code is returned.
5688 *
5689 * Callers must hold the rtnl semaphore.
5690 */
5691
5692 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5693 {
5694 int err;
5695
5696 ASSERT_RTNL();
5697
5698 /* Don't allow namespace local devices to be moved. */
5699 err = -EINVAL;
5700 if (dev->features & NETIF_F_NETNS_LOCAL)
5701 goto out;
5702
5703 	/* Ensure the device has been registered */
5704 err = -EINVAL;
5705 if (dev->reg_state != NETREG_REGISTERED)
5706 goto out;
5707
5708 	/* Get out if there is nothing to do */
5709 err = 0;
5710 if (net_eq(dev_net(dev), net))
5711 goto out;
5712
5713 /* Pick the destination device name, and ensure
5714 * we can use it in the destination network namespace.
5715 */
5716 err = -EEXIST;
5717 if (__dev_get_by_name(net, dev->name)) {
5718 /* We get here if we can't use the current device name */
5719 if (!pat)
5720 goto out;
5721 if (dev_get_valid_name(dev, pat, 1))
5722 goto out;
5723 }
5724
5725 /*
5726 	 * And now a mini version of register_netdevice and unregister_netdevice.
5727 */
5728
5729 /* If device is running close it first. */
5730 dev_close(dev);
5731
5732 /* And unlink it from device chain */
5733 err = -ENODEV;
5734 unlist_netdevice(dev);
5735
5736 synchronize_net();
5737
5738 /* Shutdown queueing discipline. */
5739 dev_shutdown(dev);
5740
5741 	/* Notify protocols that we are about to destroy
5742 	   this device. They should clean up all of their state.
5743
5744 	   Note that dev->reg_state stays at NETREG_REGISTERED.
5745 	   This is desired because this way 8021q and macvlan know
5746 the device is just moving and can keep their slaves up.
5747 */
5748 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5749 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5750
5751 /*
5752 * Flush the unicast and multicast chains
5753 */
5754 dev_uc_flush(dev);
5755 dev_mc_flush(dev);
5756
5757 /* Actually switch the network namespace */
5758 dev_net_set(dev, net);
5759
5760 /* If there is an ifindex conflict assign a new one */
5761 if (__dev_get_by_index(net, dev->ifindex)) {
5762 int iflink = (dev->iflink == dev->ifindex);
5763 dev->ifindex = dev_new_index(net);
5764 if (iflink)
5765 dev->iflink = dev->ifindex;
5766 }
5767
5768 /* Fixup kobjects */
5769 err = device_rename(&dev->dev, dev->name);
5770 WARN_ON(err);
5771
5772 /* Add the device back in the hashes */
5773 list_netdevice(dev);
5774
5775 /* Notify protocols, that a new device appeared. */
5776 call_netdevice_notifiers(NETDEV_REGISTER, dev);
5777
5778 /*
5779 * Prevent userspace races by waiting until the network
5780 * device is fully setup before sending notifications.
5781 */
5782 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5783
5784 synchronize_net();
5785 err = 0;
5786 out:
5787 return err;
5788 }
5789 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5790
5791 static int dev_cpu_callback(struct notifier_block *nfb,
5792 unsigned long action,
5793 void *ocpu)
5794 {
5795 struct sk_buff **list_skb;
5796 struct sk_buff *skb;
5797 unsigned int cpu, oldcpu = (unsigned long)ocpu;
5798 struct softnet_data *sd, *oldsd;
5799
5800 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5801 return NOTIFY_OK;
5802
5803 local_irq_disable();
5804 cpu = smp_processor_id();
5805 sd = &per_cpu(softnet_data, cpu);
5806 oldsd = &per_cpu(softnet_data, oldcpu);
5807
5808 /* Find end of our completion_queue. */
5809 list_skb = &sd->completion_queue;
5810 while (*list_skb)
5811 list_skb = &(*list_skb)->next;
5812 /* Append completion queue from offline CPU. */
5813 *list_skb = oldsd->completion_queue;
5814 oldsd->completion_queue = NULL;
5815
5816 /* Append output queue from offline CPU. */
5817 if (oldsd->output_queue) {
5818 *sd->output_queue_tailp = oldsd->output_queue;
5819 sd->output_queue_tailp = oldsd->output_queue_tailp;
5820 oldsd->output_queue = NULL;
5821 oldsd->output_queue_tailp = &oldsd->output_queue;
5822 }
5823
5824 raise_softirq_irqoff(NET_TX_SOFTIRQ);
5825 local_irq_enable();
5826
5827 /* Process offline CPU's input_pkt_queue */
5828 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5829 netif_rx(skb);
5830 input_queue_head_incr(oldsd);
5831 }
5832 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
5833 netif_rx(skb);
5834 input_queue_head_incr(oldsd);
5835 }
5836
5837 return NOTIFY_OK;
5838 }
5839
5840
5841 /**
5842 * netdev_increment_features - increment feature set by one
5843 * @all: current feature set
5844 * @one: new feature set
5845 * @mask: mask feature set
5846 *
5847 * Computes a new feature set after adding a device with feature set
5848 * @one to the master device with current feature set @all. Will not
5849 * enable anything that is off in @mask. Returns the new feature set.
5850 */
5851 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5852 unsigned long mask)
5853 {
5854 /* If device needs checksumming, downgrade to it. */
5855 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5856 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5857 else if (mask & NETIF_F_ALL_CSUM) {
5858 /* If one device supports v4/v6 checksumming, set for all. */
5859 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5860 !(all & NETIF_F_GEN_CSUM)) {
5861 all &= ~NETIF_F_ALL_CSUM;
5862 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5863 }
5864
5865 /* If one device supports hw checksumming, set for all. */
5866 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5867 all &= ~NETIF_F_ALL_CSUM;
5868 all |= NETIF_F_HW_CSUM;
5869 }
5870 }
5871
5872 one |= NETIF_F_ALL_CSUM;
5873
5874 one |= all & NETIF_F_ONE_FOR_ALL;
5875 all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
5876 all |= one & mask & NETIF_F_ONE_FOR_ALL;
5877
5878 return all;
5879 }
5880 EXPORT_SYMBOL(netdev_increment_features);
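/*
 * Editorial example (not part of the original file): a master device such as
 * a bond recomputes its feature set by folding in each slave.  A sketch
 * loosely modelled on the bonding driver; "priv->slaves" and the per-slave
 * structure (with a "dev" pointer and a "list" member) are hypothetical.
 *
 *	unsigned long features = master->features & ~NETIF_F_ONE_FOR_ALL;
 *
 *	list_for_each_entry(slave, &priv->slaves, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     NETIF_F_ONE_FOR_ALL);
 *	master->features = netdev_fix_features(features, NULL);
 *
 * The seed and mask are driver policy; netdev_fix_features() then removes
 * any illegal combinations before the result is installed.
 */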
5881
5882 static struct hlist_head *netdev_create_hash(void)
5883 {
5884 int i;
5885 struct hlist_head *hash;
5886
5887 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5888 if (hash != NULL)
5889 for (i = 0; i < NETDEV_HASHENTRIES; i++)
5890 INIT_HLIST_HEAD(&hash[i]);
5891
5892 return hash;
5893 }
5894
5895 /* Initialize per network namespace state */
5896 static int __net_init netdev_init(struct net *net)
5897 {
5898 INIT_LIST_HEAD(&net->dev_base_head);
5899
5900 net->dev_name_head = netdev_create_hash();
5901 if (net->dev_name_head == NULL)
5902 goto err_name;
5903
5904 net->dev_index_head = netdev_create_hash();
5905 if (net->dev_index_head == NULL)
5906 goto err_idx;
5907
5908 return 0;
5909
5910 err_idx:
5911 kfree(net->dev_name_head);
5912 err_name:
5913 return -ENOMEM;
5914 }
5915
5916 /**
5917 * netdev_drivername - network driver for the device
5918 * @dev: network device
5919 * @buffer: buffer for resulting name
5920 * @len: size of buffer
5921 *
5922 * Determine network driver for device.
5923 */
5924 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5925 {
5926 const struct device_driver *driver;
5927 const struct device *parent;
5928
5929 if (len <= 0 || !buffer)
5930 return buffer;
5931 buffer[0] = 0;
5932
5933 parent = dev->dev.parent;
5934
5935 if (!parent)
5936 return buffer;
5937
5938 driver = parent->driver;
5939 if (driver && driver->name)
5940 strlcpy(buffer, driver->name, len);
5941 return buffer;
5942 }
5943
5944 static int __netdev_printk(const char *level, const struct net_device *dev,
5945 struct va_format *vaf)
5946 {
5947 int r;
5948
5949 if (dev && dev->dev.parent)
5950 r = dev_printk(level, dev->dev.parent, "%s: %pV",
5951 netdev_name(dev), vaf);
5952 else if (dev)
5953 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
5954 else
5955 r = printk("%s(NULL net_device): %pV", level, vaf);
5956
5957 return r;
5958 }
5959
5960 int netdev_printk(const char *level, const struct net_device *dev,
5961 const char *format, ...)
5962 {
5963 struct va_format vaf;
5964 va_list args;
5965 int r;
5966
5967 va_start(args, format);
5968
5969 vaf.fmt = format;
5970 vaf.va = &args;
5971
5972 r = __netdev_printk(level, dev, &vaf);
5973 va_end(args);
5974
5975 return r;
5976 }
5977 EXPORT_SYMBOL(netdev_printk);
5978
5979 #define define_netdev_printk_level(func, level) \
5980 int func(const struct net_device *dev, const char *fmt, ...) \
5981 { \
5982 int r; \
5983 struct va_format vaf; \
5984 va_list args; \
5985 \
5986 va_start(args, fmt); \
5987 \
5988 vaf.fmt = fmt; \
5989 vaf.va = &args; \
5990 \
5991 r = __netdev_printk(level, dev, &vaf); \
5992 va_end(args); \
5993 \
5994 return r; \
5995 } \
5996 EXPORT_SYMBOL(func);
5997
5998 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
5999 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6000 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6001 define_netdev_printk_level(netdev_err, KERN_ERR);
6002 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6003 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6004 define_netdev_printk_level(netdev_info, KERN_INFO);
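/*
 * Editorial example (not part of the original file): drivers use the helpers
 * generated above instead of raw printk() so messages are consistently
 * prefixed with the driver and interface name.  "ring_id" and "speed" are
 * hypothetical local variables used only for illustration.
 *
 *	netdev_err(dev, "DMA ring %d exhausted\n", ring_id);
 *	netdev_info(dev, "link up, %u Mb/s\n", speed);
 *
 * The output is routed through __netdev_printk(), and therefore through
 * dev_printk() whenever a parent struct device is present.
 */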
6005
6006 static void __net_exit netdev_exit(struct net *net)
6007 {
6008 kfree(net->dev_name_head);
6009 kfree(net->dev_index_head);
6010 }
6011
6012 static struct pernet_operations __net_initdata netdev_net_ops = {
6013 .init = netdev_init,
6014 .exit = netdev_exit,
6015 };
6016
6017 static void __net_exit default_device_exit(struct net *net)
6018 {
6019 struct net_device *dev, *aux;
6020 /*
6021 * Push all migratable network devices back to the
6022 * initial network namespace
6023 */
6024 rtnl_lock();
6025 for_each_netdev_safe(net, dev, aux) {
6026 int err;
6027 char fb_name[IFNAMSIZ];
6028
6029 		/* Ignore unmovable devices (e.g. loopback) */
6030 if (dev->features & NETIF_F_NETNS_LOCAL)
6031 continue;
6032
6033 /* Leave virtual devices for the generic cleanup */
6034 if (dev->rtnl_link_ops)
6035 continue;
6036
6037 		/* Push remaining network devices to init_net */
6038 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6039 err = dev_change_net_namespace(dev, &init_net, fb_name);
6040 if (err) {
6041 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6042 __func__, dev->name, err);
6043 BUG();
6044 }
6045 }
6046 rtnl_unlock();
6047 }
6048
6049 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6050 {
6051 	/* At exit all network devices must be removed from a network
6052 	 * namespace. Do this in the reverse order of registration.
6053 * Do this across as many network namespaces as possible to
6054 * improve batching efficiency.
6055 */
6056 struct net_device *dev;
6057 struct net *net;
6058 LIST_HEAD(dev_kill_list);
6059
6060 rtnl_lock();
6061 list_for_each_entry(net, net_list, exit_list) {
6062 for_each_netdev_reverse(net, dev) {
6063 if (dev->rtnl_link_ops)
6064 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6065 else
6066 unregister_netdevice_queue(dev, &dev_kill_list);
6067 }
6068 }
6069 unregister_netdevice_many(&dev_kill_list);
6070 rtnl_unlock();
6071 }
6072
6073 static struct pernet_operations __net_initdata default_device_ops = {
6074 .exit = default_device_exit,
6075 .exit_batch = default_device_exit_batch,
6076 };
6077
6078 /*
6079 * Initialize the DEV module. At boot time this walks the device list and
6080 * unhooks any devices that fail to initialise (normally hardware not
6081 * present) and leaves us with a valid list of present and active devices.
6082 *
6083 */
6084
6085 /*
6086 * This is called single threaded during boot, so no need
6087 * to take the rtnl semaphore.
6088 */
6089 static int __init net_dev_init(void)
6090 {
6091 int i, rc = -ENOMEM;
6092
6093 BUG_ON(!dev_boot_phase);
6094
6095 if (dev_proc_init())
6096 goto out;
6097
6098 if (netdev_kobject_init())
6099 goto out;
6100
6101 INIT_LIST_HEAD(&ptype_all);
6102 for (i = 0; i < PTYPE_HASH_SIZE; i++)
6103 INIT_LIST_HEAD(&ptype_base[i]);
6104
6105 if (register_pernet_subsys(&netdev_net_ops))
6106 goto out;
6107
6108 /*
6109 * Initialise the packet receive queues.
6110 */
6111
6112 for_each_possible_cpu(i) {
6113 struct softnet_data *sd = &per_cpu(softnet_data, i);
6114
6115 memset(sd, 0, sizeof(*sd));
6116 skb_queue_head_init(&sd->input_pkt_queue);
6117 skb_queue_head_init(&sd->process_queue);
6118 sd->completion_queue = NULL;
6119 INIT_LIST_HEAD(&sd->poll_list);
6120 sd->output_queue = NULL;
6121 sd->output_queue_tailp = &sd->output_queue;
6122 #ifdef CONFIG_RPS
6123 sd->csd.func = rps_trigger_softirq;
6124 sd->csd.info = sd;
6125 sd->csd.flags = 0;
6126 sd->cpu = i;
6127 #endif
6128
6129 sd->backlog.poll = process_backlog;
6130 sd->backlog.weight = weight_p;
6131 sd->backlog.gro_list = NULL;
6132 sd->backlog.gro_count = 0;
6133 }
6134
6135 dev_boot_phase = 0;
6136
6137 	/* The loopback device is special: if any other network device
6138 	 * is present in a network namespace, the loopback device must
6139 	 * be present too. Since we now dynamically allocate and free the
6140 	 * loopback device, ensure this invariant is maintained by
6141 	 * keeping the loopback device as the first device on the
6142 	 * list of network devices, so that the loopback device
6143 	 * is the first device that appears and the last network device
6144 	 * that disappears.
6145 */
6146 if (register_pernet_device(&loopback_net_ops))
6147 goto out;
6148
6149 if (register_pernet_device(&default_device_ops))
6150 goto out;
6151
6152 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6153 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6154
6155 hotcpu_notifier(dev_cpu_callback, 0);
6156 dst_init();
6157 dev_mcast_init();
6158 rc = 0;
6159 out:
6160 return rc;
6161 }
6162
6163 subsys_initcall(net_dev_init);
6164
6165 static int __init initialize_hashrnd(void)
6166 {
6167 get_random_bytes(&hashrnd, sizeof(hashrnd));
6168 return 0;
6169 }
6170
6171 late_initcall_sync(initialize_hashrnd);
6172