1 /*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/hash.h>
83 #include <linux/slab.h>
84 #include <linux/sched.h>
85 #include <linux/mutex.h>
86 #include <linux/string.h>
87 #include <linux/mm.h>
88 #include <linux/socket.h>
89 #include <linux/sockios.h>
90 #include <linux/errno.h>
91 #include <linux/interrupt.h>
92 #include <linux/if_ether.h>
93 #include <linux/netdevice.h>
94 #include <linux/etherdevice.h>
95 #include <linux/ethtool.h>
96 #include <linux/notifier.h>
97 #include <linux/skbuff.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/stat.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/kmod.h>
111 #include <linux/module.h>
112 #include <linux/netpoll.h>
113 #include <linux/rcupdate.h>
114 #include <linux/delay.h>
115 #include <net/wext.h>
116 #include <net/iw_handler.h>
117 #include <asm/current.h>
118 #include <linux/audit.h>
119 #include <linux/dmaengine.h>
120 #include <linux/err.h>
121 #include <linux/ctype.h>
122 #include <linux/if_arp.h>
123 #include <linux/if_vlan.h>
124 #include <linux/ip.h>
125 #include <net/ip.h>
126 #include <linux/ipv6.h>
127 #include <linux/in.h>
128 #include <linux/jhash.h>
129 #include <linux/random.h>
130 #include <trace/events/napi.h>
131 #include <trace/events/net.h>
132 #include <trace/events/skb.h>
133 #include <linux/pci.h>
134 #include <linux/inetdevice.h>
135 #include <linux/cpu_rmap.h>
136
137 #include "net-sysfs.h"
138
139 /* Instead of increasing this, you should create a hash table. */
140 #define MAX_GRO_SKBS 8
141
142 /* This should be increased if a protocol with a bigger head is added. */
143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
144
145 /*
146 * The list of packet types we will receive (as opposed to discard)
147 * and the routines to invoke.
148 *
149 * Why 16? Because with 16 the only overlap we get on a hash of the
150 * low nibble of the protocol value is RARP/SNAP/X.25.
151 *
152 * NOTE: That is no longer true with the addition of VLAN tags. Not
153 * sure which should go first, but I bet it won't make much
154 * difference if we are running VLANs. The good news is that
155 * this protocol won't be in the list unless compiled in, so
156 * the average user (w/out VLANs) will not be adversely affected.
157 * --BLG
158 *
159 * 0800 IP
160 * 8100 802.1Q VLAN
161 * 0001 802.3
162 * 0002 AX.25
163 * 0004 802.2
164 * 8035 RARP
165 * 0005 SNAP
166 * 0805 X.25
167 * 0806 ARP
168 * 8137 IPX
169 * 0009 Localtalk
170 * 86DD IPv6
171 */
172
173 #define PTYPE_HASH_SIZE (16)
174 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
175
176 static DEFINE_SPINLOCK(ptype_lock);
177 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
178 static struct list_head ptype_all __read_mostly; /* Taps */
179
180 /*
181 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
182 * semaphore.
183 *
184 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
185 *
186 * Writers must hold the rtnl semaphore while they loop through the
187 * dev_base_head list, and hold dev_base_lock for writing when they do the
188 * actual updates. This allows pure readers to access the list even
189 * while a writer is preparing to update it.
190 *
191 * To put it another way, dev_base_lock is held for writing only to
192 * protect against pure readers; the rtnl semaphore provides the
193 * protection against other writers.
194 *
195 * See, for example usages, register_netdevice() and
196 * unregister_netdevice(), which must be called with the rtnl
197 * semaphore held.
198 */
199 DEFINE_RWLOCK(dev_base_lock);
200 EXPORT_SYMBOL(dev_base_lock);
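/*
 * Illustrative sketch (not part of the kernel build): a pure reader can walk
 * the device list under rcu_read_lock() instead of taking dev_base_lock, per
 * the locking rules above.  The function name below is hypothetical.
 */
#if 0
static int example_count_running_devices(struct net *net)
{
	struct net_device *dev;
	int count = 0;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->flags & IFF_UP)
			count++;
	rcu_read_unlock();

	return count;
}
#endif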
201
202 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
203 {
204 unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
205 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
206 }
207
208 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
209 {
210 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
211 }
212
213 static inline void rps_lock(struct softnet_data *sd)
214 {
215 #ifdef CONFIG_RPS
216 spin_lock(&sd->input_pkt_queue.lock);
217 #endif
218 }
219
220 static inline void rps_unlock(struct softnet_data *sd)
221 {
222 #ifdef CONFIG_RPS
223 spin_unlock(&sd->input_pkt_queue.lock);
224 #endif
225 }
226
227 /* Device list insertion */
228 static int list_netdevice(struct net_device *dev)
229 {
230 struct net *net = dev_net(dev);
231
232 ASSERT_RTNL();
233
234 write_lock_bh(&dev_base_lock);
235 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
236 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
237 hlist_add_head_rcu(&dev->index_hlist,
238 dev_index_hash(net, dev->ifindex));
239 write_unlock_bh(&dev_base_lock);
240 return 0;
241 }
242
243 /* Device list removal
244 * caller must respect a RCU grace period before freeing/reusing dev
245 */
246 static void unlist_netdevice(struct net_device *dev)
247 {
248 ASSERT_RTNL();
249
250 /* Unlink dev from the device chain */
251 write_lock_bh(&dev_base_lock);
252 list_del_rcu(&dev->dev_list);
253 hlist_del_rcu(&dev->name_hlist);
254 hlist_del_rcu(&dev->index_hlist);
255 write_unlock_bh(&dev_base_lock);
256 }
257
258 /*
259 * Our notifier list
260 */
261
262 static RAW_NOTIFIER_HEAD(netdev_chain);
263
264 /*
265 * Device drivers call our routines to queue packets here. We empty the
266 * queue in the local softnet handler.
267 */
268
269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270 EXPORT_PER_CPU_SYMBOL(softnet_data);
271
272 #ifdef CONFIG_LOCKDEP
273 /*
274 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275 * according to dev->type
276 */
277 static const unsigned short netdev_lock_type[] =
278 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
291 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
292 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
293 ARPHRD_VOID, ARPHRD_NONE};
294
295 static const char *const netdev_lock_name[] =
296 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
297 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
298 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
299 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
300 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
301 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
302 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
303 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
304 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
305 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
306 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
307 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
308 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
309 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
310 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
311 "_xmit_VOID", "_xmit_NONE"};
312
313 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
314 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
315
316 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
317 {
318 int i;
319
320 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
321 if (netdev_lock_type[i] == dev_type)
322 return i;
323 /* the last key is used by default */
324 return ARRAY_SIZE(netdev_lock_type) - 1;
325 }
326
327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
328 unsigned short dev_type)
329 {
330 int i;
331
332 i = netdev_lock_pos(dev_type);
333 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
334 netdev_lock_name[i]);
335 }
336
337 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
338 {
339 int i;
340
341 i = netdev_lock_pos(dev->type);
342 lockdep_set_class_and_name(&dev->addr_list_lock,
343 &netdev_addr_lock_key[i],
344 netdev_lock_name[i]);
345 }
346 #else
347 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
348 unsigned short dev_type)
349 {
350 }
351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
352 {
353 }
354 #endif
355
356 /*******************************************************************************
357
358 Protocol management and registration routines
359
360 *******************************************************************************/
361
362 /*
363 * Add a protocol ID to the list. Now that the input handler is
364 * smarter we can dispense with all the messy stuff that used to be
365 * here.
366 *
367 * BEWARE!!! Protocol handlers, mangling input packets,
368 * MUST BE last in hash buckets and checking protocol handlers
369 * MUST start from promiscuous ptype_all chain in net_bh.
370 * It is true now, do not change it.
371 * Explanation follows: if protocol handler, mangling packet, will
372 * be the first on list, it is not able to sense, that packet
373 * is cloned and should be copied-on-write, so that it will
374 * change it and subsequent readers will get broken packet.
375 * --ANK (980803)
376 */
377
378 static inline struct list_head *ptype_head(const struct packet_type *pt)
379 {
380 if (pt->type == htons(ETH_P_ALL))
381 return &ptype_all;
382 else
383 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
384 }
385
386 /**
387 * dev_add_pack - add packet handler
388 * @pt: packet type declaration
389 *
390 * Add a protocol handler to the networking stack. The passed &packet_type
391 * is linked into kernel lists and may not be freed until it has been
392 * removed from the kernel lists.
393 *
394 * This call does not sleep, therefore it cannot
395 * guarantee that all CPUs that are in the middle of receiving
396 * packets will see the new packet type (until the next received packet).
397 */
398
399 void dev_add_pack(struct packet_type *pt)
400 {
401 struct list_head *head = ptype_head(pt);
402
403 spin_lock(&ptype_lock);
404 list_add_rcu(&pt->list, head);
405 spin_unlock(&ptype_lock);
406 }
407 EXPORT_SYMBOL(dev_add_pack);
408
409 /**
410 * __dev_remove_pack - remove packet handler
411 * @pt: packet type declaration
412 *
413 * Remove a protocol handler that was previously added to the kernel
414 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
415 * from the kernel lists and can be freed or reused once this function
416 * returns.
417 *
418 * The packet type might still be in use by receivers
419 * and must not be freed until after all the CPUs have gone
420 * through a quiescent state.
421 */
422 void __dev_remove_pack(struct packet_type *pt)
423 {
424 struct list_head *head = ptype_head(pt);
425 struct packet_type *pt1;
426
427 spin_lock(&ptype_lock);
428
429 list_for_each_entry(pt1, head, list) {
430 if (pt == pt1) {
431 list_del_rcu(&pt->list);
432 goto out;
433 }
434 }
435
436 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
437 out:
438 spin_unlock(&ptype_lock);
439 }
440 EXPORT_SYMBOL(__dev_remove_pack);
441
442 /**
443 * dev_remove_pack - remove packet handler
444 * @pt: packet type declaration
445 *
446 * Remove a protocol handler that was previously added to the kernel
447 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
448 * from the kernel lists and can be freed or reused once this function
449 * returns.
450 *
451 * This call sleeps to guarantee that no CPU is looking at the packet
452 * type after return.
453 */
454 void dev_remove_pack(struct packet_type *pt)
455 {
456 __dev_remove_pack(pt);
457
458 synchronize_net();
459 }
460 EXPORT_SYMBOL(dev_remove_pack);
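/*
 * Illustrative sketch (not part of the kernel build): how a module might use
 * dev_add_pack()/dev_remove_pack() to install a tap for all protocols.  The
 * handler and variable names are hypothetical.
 */
#if 0
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* the handler owns a reference to the skb and must release it */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_tap __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),		/* tap: goes on ptype_all */
	.func = example_tap_rcv,
};

static int __init example_tap_init(void)
{
	dev_add_pack(&example_tap);
	return 0;
}

static void __exit example_tap_exit(void)
{
	/* sleeps until no CPU can still be running the handler */
	dev_remove_pack(&example_tap);
}
#endif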
461
462 /******************************************************************************
463
464 Device Boot-time Settings Routines
465
466 *******************************************************************************/
467
468 /* Boot time configuration table */
469 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
470
471 /**
472 * netdev_boot_setup_add - add new setup entry
473 * @name: name of the device
474 * @map: configured settings for the device
475 *
476 * Adds a new setup entry to the dev_boot_setup list. The function
477 * returns 0 on error and 1 on success. This is a generic routine for
478 * all netdevices.
479 */
480 static int netdev_boot_setup_add(char *name, struct ifmap *map)
481 {
482 struct netdev_boot_setup *s;
483 int i;
484
485 s = dev_boot_setup;
486 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
487 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
488 memset(s[i].name, 0, sizeof(s[i].name));
489 strlcpy(s[i].name, name, IFNAMSIZ);
490 memcpy(&s[i].map, map, sizeof(s[i].map));
491 break;
492 }
493 }
494
495 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
496 }
497
498 /**
499 * netdev_boot_setup_check - check boot time settings
500 * @dev: the netdevice
501 *
502 * Check boot time settings for the device.
503 * Any settings found are applied to the device for use
504 * later during device probing.
505 * Returns 0 if no settings are found, 1 if they are.
506 */
507 int netdev_boot_setup_check(struct net_device *dev)
508 {
509 struct netdev_boot_setup *s = dev_boot_setup;
510 int i;
511
512 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
513 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
514 !strcmp(dev->name, s[i].name)) {
515 dev->irq = s[i].map.irq;
516 dev->base_addr = s[i].map.base_addr;
517 dev->mem_start = s[i].map.mem_start;
518 dev->mem_end = s[i].map.mem_end;
519 return 1;
520 }
521 }
522 return 0;
523 }
524 EXPORT_SYMBOL(netdev_boot_setup_check);
525
526
527 /**
528 * netdev_boot_base - get address from boot time settings
529 * @prefix: prefix for network device
530 * @unit: id for network device
531 *
532 * Check boot time settings for the base address of the device.
533 * Returns the configured base address, 1 if the device is
534 * already registered (to indicate it should not be probed),
535 * or 0 if no settings are found.
536 */
537 unsigned long netdev_boot_base(const char *prefix, int unit)
538 {
539 const struct netdev_boot_setup *s = dev_boot_setup;
540 char name[IFNAMSIZ];
541 int i;
542
543 sprintf(name, "%s%d", prefix, unit);
544
545 /*
546 * If device already registered then return base of 1
547 * to indicate not to probe for this interface
548 */
549 if (__dev_get_by_name(&init_net, name))
550 return 1;
551
552 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
553 if (!strcmp(name, s[i].name))
554 return s[i].map.base_addr;
555 return 0;
556 }
557
558 /*
559 * Saves boot-time configured settings for any netdevice.
560 */
561 int __init netdev_boot_setup(char *str)
562 {
563 int ints[5];
564 struct ifmap map;
565
566 str = get_options(str, ARRAY_SIZE(ints), ints);
567 if (!str || !*str)
568 return 0;
569
570 /* Save settings */
571 memset(&map, 0, sizeof(map));
572 if (ints[0] > 0)
573 map.irq = ints[1];
574 if (ints[0] > 1)
575 map.base_addr = ints[2];
576 if (ints[0] > 2)
577 map.mem_start = ints[3];
578 if (ints[0] > 3)
579 map.mem_end = ints[4];
580
581 /* Add new entry to the list */
582 return netdev_boot_setup_add(str, &map);
583 }
584
585 __setup("netdev=", netdev_boot_setup);
586
587 /*******************************************************************************
588
589 Device Interface Subroutines
590
591 *******************************************************************************/
592
593 /**
594 * __dev_get_by_name - find a device by its name
595 * @net: the applicable net namespace
596 * @name: name to find
597 *
598 * Find an interface by name. Must be called under RTNL semaphore
599 * or @dev_base_lock. If the name is found a pointer to the device
600 * is returned. If the name is not found then %NULL is returned. The
601 * reference counters are not incremented so the caller must be
602 * careful with locks.
603 */
604
605 struct net_device *__dev_get_by_name(struct net *net, const char *name)
606 {
607 struct hlist_node *p;
608 struct net_device *dev;
609 struct hlist_head *head = dev_name_hash(net, name);
610
611 hlist_for_each_entry(dev, p, head, name_hlist)
612 if (!strncmp(dev->name, name, IFNAMSIZ))
613 return dev;
614
615 return NULL;
616 }
617 EXPORT_SYMBOL(__dev_get_by_name);
618
619 /**
620 * dev_get_by_name_rcu - find a device by its name
621 * @net: the applicable net namespace
622 * @name: name to find
623 *
624 * Find an interface by name.
625 * If the name is found a pointer to the device is returned.
626 * If the name is not found then %NULL is returned.
627 * The reference counters are not incremented so the caller must be
628 * careful with locks. The caller must hold RCU lock.
629 */
630
631 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
632 {
633 struct hlist_node *p;
634 struct net_device *dev;
635 struct hlist_head *head = dev_name_hash(net, name);
636
637 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
638 if (!strncmp(dev->name, name, IFNAMSIZ))
639 return dev;
640
641 return NULL;
642 }
643 EXPORT_SYMBOL(dev_get_by_name_rcu);
644
645 /**
646 * dev_get_by_name - find a device by its name
647 * @net: the applicable net namespace
648 * @name: name to find
649 *
650 * Find an interface by name. This can be called from any
651 * context and does its own locking. The returned handle has
652 * the usage count incremented and the caller must use dev_put() to
653 * release it when it is no longer needed. %NULL is returned if no
654 * matching device is found.
655 */
656
657 struct net_device *dev_get_by_name(struct net *net, const char *name)
658 {
659 struct net_device *dev;
660
661 rcu_read_lock();
662 dev = dev_get_by_name_rcu(net, name);
663 if (dev)
664 dev_hold(dev);
665 rcu_read_unlock();
666 return dev;
667 }
668 EXPORT_SYMBOL(dev_get_by_name);
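/*
 * Illustrative sketch (not part of the kernel build): the reference rules
 * described above.  dev_get_by_name() takes a reference that the caller must
 * drop with dev_put(); the _rcu variant takes none and is only valid inside
 * the read-side critical section.  The function name is hypothetical.
 */
#if 0
static int example_lookup(struct net *net, const char *ifname)
{
	struct net_device *dev;

	dev = dev_get_by_name(net, ifname);
	if (!dev)
		return -ENODEV;

	pr_info("%s has ifindex %d\n", dev->name, dev->ifindex);
	dev_put(dev);		/* release the reference we were given */
	return 0;
}
#endif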
669
670 /**
671 * __dev_get_by_index - find a device by its ifindex
672 * @net: the applicable net namespace
673 * @ifindex: index of device
674 *
675 * Search for an interface by index. Returns %NULL if the device
676 * is not found or a pointer to the device. The device has not
677 * had its reference counter increased so the caller must be careful
678 * about locking. The caller must hold either the RTNL semaphore
679 * or @dev_base_lock.
680 */
681
682 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
683 {
684 struct hlist_node *p;
685 struct net_device *dev;
686 struct hlist_head *head = dev_index_hash(net, ifindex);
687
688 hlist_for_each_entry(dev, p, head, index_hlist)
689 if (dev->ifindex == ifindex)
690 return dev;
691
692 return NULL;
693 }
694 EXPORT_SYMBOL(__dev_get_by_index);
695
696 /**
697 * dev_get_by_index_rcu - find a device by its ifindex
698 * @net: the applicable net namespace
699 * @ifindex: index of device
700 *
701 * Search for an interface by index. Returns %NULL if the device
702 * is not found or a pointer to the device. The device has not
703 * had its reference counter increased so the caller must be careful
704 * about locking. The caller must hold RCU lock.
705 */
706
707 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
708 {
709 struct hlist_node *p;
710 struct net_device *dev;
711 struct hlist_head *head = dev_index_hash(net, ifindex);
712
713 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
714 if (dev->ifindex == ifindex)
715 return dev;
716
717 return NULL;
718 }
719 EXPORT_SYMBOL(dev_get_by_index_rcu);
720
721
722 /**
723 * dev_get_by_index - find a device by its ifindex
724 * @net: the applicable net namespace
725 * @ifindex: index of device
726 *
727 * Search for an interface by index. Returns NULL if the device
728 * is not found or a pointer to the device. The device returned has
729 * had a reference added and the pointer is safe until the user calls
730 * dev_put to indicate they have finished with it.
731 */
732
733 struct net_device *dev_get_by_index(struct net *net, int ifindex)
734 {
735 struct net_device *dev;
736
737 rcu_read_lock();
738 dev = dev_get_by_index_rcu(net, ifindex);
739 if (dev)
740 dev_hold(dev);
741 rcu_read_unlock();
742 return dev;
743 }
744 EXPORT_SYMBOL(dev_get_by_index);
745
746 /**
747 * dev_getbyhwaddr_rcu - find a device by its hardware address
748 * @net: the applicable net namespace
749 * @type: media type of device
750 * @ha: hardware address
751 *
752 * Search for an interface by MAC address. Returns NULL if the device
753 * is not found or a pointer to the device.
754 * The caller must hold RCU or RTNL.
755 * The returned device has not had its ref count increased
756 * and the caller must therefore be careful about locking
757 *
758 */
759
760 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
761 const char *ha)
762 {
763 struct net_device *dev;
764
765 for_each_netdev_rcu(net, dev)
766 if (dev->type == type &&
767 !memcmp(dev->dev_addr, ha, dev->addr_len))
768 return dev;
769
770 return NULL;
771 }
772 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
773
774 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
775 {
776 struct net_device *dev;
777
778 ASSERT_RTNL();
779 for_each_netdev(net, dev)
780 if (dev->type == type)
781 return dev;
782
783 return NULL;
784 }
785 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
786
787 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
788 {
789 struct net_device *dev, *ret = NULL;
790
791 rcu_read_lock();
792 for_each_netdev_rcu(net, dev)
793 if (dev->type == type) {
794 dev_hold(dev);
795 ret = dev;
796 break;
797 }
798 rcu_read_unlock();
799 return ret;
800 }
801 EXPORT_SYMBOL(dev_getfirstbyhwtype);
802
803 /**
804 * dev_get_by_flags_rcu - find any device with given flags
805 * @net: the applicable net namespace
806 * @if_flags: IFF_* values
807 * @mask: bitmask of bits in if_flags to check
808 *
809 * Search for any interface with the given flags. Returns NULL if a device
810 * is not found or a pointer to the device. Must be called inside
811 * rcu_read_lock(), and result refcount is unchanged.
812 */
813
814 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
815 unsigned short mask)
816 {
817 struct net_device *dev, *ret;
818
819 ret = NULL;
820 for_each_netdev_rcu(net, dev) {
821 if (((dev->flags ^ if_flags) & mask) == 0) {
822 ret = dev;
823 break;
824 }
825 }
826 return ret;
827 }
828 EXPORT_SYMBOL(dev_get_by_flags_rcu);
829
830 /**
831 * dev_valid_name - check if name is okay for network device
832 * @name: name string
833 *
834 * Network device names need to be valid file names
835 * to allow sysfs to work. We also disallow any kind of
836 * whitespace.
837 */
838 int dev_valid_name(const char *name)
839 {
840 if (*name == '\0')
841 return 0;
842 if (strlen(name) >= IFNAMSIZ)
843 return 0;
844 if (!strcmp(name, ".") || !strcmp(name, ".."))
845 return 0;
846
847 while (*name) {
848 if (*name == '/' || isspace(*name))
849 return 0;
850 name++;
851 }
852 return 1;
853 }
854 EXPORT_SYMBOL(dev_valid_name);
855
856 /**
857 * __dev_alloc_name - allocate a name for a device
858 * @net: network namespace to allocate the device name in
859 * @name: name format string
860 * @buf: scratch buffer and result name string
861 *
862 * Passed a format string - e.g. "lt%d" - it will try to find a suitable
863 * id. It scans the list of devices to build up a free map, then chooses
864 * the first empty slot. The caller must hold the dev_base or rtnl lock
865 * while allocating the name and adding the device in order to avoid
866 * duplicates.
867 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
868 * Returns the number of the unit assigned or a negative errno code.
869 */
870
871 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
872 {
873 int i = 0;
874 const char *p;
875 const int max_netdevices = 8*PAGE_SIZE;
876 unsigned long *inuse;
877 struct net_device *d;
878
879 p = strnchr(name, IFNAMSIZ-1, '%');
880 if (p) {
881 /*
882 * Verify the string as this thing may have come from
883 * the user. There must be exactly one "%d" and no other "%"
884 * characters.
885 */
886 if (p[1] != 'd' || strchr(p + 2, '%'))
887 return -EINVAL;
888
889 /* Use one page as a bit array of possible slots */
890 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
891 if (!inuse)
892 return -ENOMEM;
893
894 for_each_netdev(net, d) {
895 if (!sscanf(d->name, name, &i))
896 continue;
897 if (i < 0 || i >= max_netdevices)
898 continue;
899
900 /* avoid cases where sscanf is not exact inverse of printf */
901 snprintf(buf, IFNAMSIZ, name, i);
902 if (!strncmp(buf, d->name, IFNAMSIZ))
903 set_bit(i, inuse);
904 }
905
906 i = find_first_zero_bit(inuse, max_netdevices);
907 free_page((unsigned long) inuse);
908 }
909
910 if (buf != name)
911 snprintf(buf, IFNAMSIZ, name, i);
912 if (!__dev_get_by_name(net, buf))
913 return i;
914
915 /* It is possible to run out of possible slots
916 * when the name is long and there isn't enough space left
917 * for the digits, or if all bits are used.
918 */
919 return -ENFILE;
920 }
921
922 /**
923 * dev_alloc_name - allocate a name for a device
924 * @dev: device
925 * @name: name format string
926 *
927 * Passed a format string - e.g. "lt%d" - it will try to find a suitable
928 * id. It scans the list of devices to build up a free map, then chooses
929 * the first empty slot. The caller must hold the dev_base or rtnl lock
930 * while allocating the name and adding the device in order to avoid
931 * duplicates.
932 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
933 * Returns the number of the unit assigned or a negative errno code.
934 */
935
936 int dev_alloc_name(struct net_device *dev, const char *name)
937 {
938 char buf[IFNAMSIZ];
939 struct net *net;
940 int ret;
941
942 BUG_ON(!dev_net(dev));
943 net = dev_net(dev);
944 ret = __dev_alloc_name(net, name, buf);
945 if (ret >= 0)
946 strlcpy(dev->name, buf, IFNAMSIZ);
947 return ret;
948 }
949 EXPORT_SYMBOL(dev_alloc_name);
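/*
 * Illustrative sketch (not part of the kernel build): a driver typically
 * passes a format such as "eth%d" and lets dev_alloc_name() pick the first
 * free unit, under RTNL.  The function name is hypothetical and error
 * handling is trimmed for brevity.
 */
#if 0
static int example_name_and_register(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_alloc_name(dev, "eth%d");	/* picks e.g. "eth2" */
	if (err >= 0)
		err = register_netdevice(dev);
	rtnl_unlock();

	return err < 0 ? err : 0;
}
#endif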
950
951 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
952 {
953 struct net *net;
954
955 BUG_ON(!dev_net(dev));
956 net = dev_net(dev);
957
958 if (!dev_valid_name(name))
959 return -EINVAL;
960
961 if (fmt && strchr(name, '%'))
962 return dev_alloc_name(dev, name);
963 else if (__dev_get_by_name(net, name))
964 return -EEXIST;
965 else if (dev->name != name)
966 strlcpy(dev->name, name, IFNAMSIZ);
967
968 return 0;
969 }
970
971 /**
972 * dev_change_name - change name of a device
973 * @dev: device
974 * @newname: name (or format string) must be at least IFNAMSIZ
975 *
976 * Change the name of a device; format strings such as "eth%d"
977 * can be passed for wildcarding.
978 */
979 int dev_change_name(struct net_device *dev, const char *newname)
980 {
981 char oldname[IFNAMSIZ];
982 int err = 0;
983 int ret;
984 struct net *net;
985
986 ASSERT_RTNL();
987 BUG_ON(!dev_net(dev));
988
989 net = dev_net(dev);
990 if (dev->flags & IFF_UP)
991 return -EBUSY;
992
993 if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
994 return 0;
995
996 memcpy(oldname, dev->name, IFNAMSIZ);
997
998 err = dev_get_valid_name(dev, newname, 1);
999 if (err < 0)
1000 return err;
1001
1002 rollback:
1003 ret = device_rename(&dev->dev, dev->name);
1004 if (ret) {
1005 memcpy(dev->name, oldname, IFNAMSIZ);
1006 return ret;
1007 }
1008
1009 write_lock_bh(&dev_base_lock);
1010 hlist_del(&dev->name_hlist);
1011 write_unlock_bh(&dev_base_lock);
1012
1013 synchronize_rcu();
1014
1015 write_lock_bh(&dev_base_lock);
1016 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1017 write_unlock_bh(&dev_base_lock);
1018
1019 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1020 ret = notifier_to_errno(ret);
1021
1022 if (ret) {
1023 /* err >= 0 after dev_alloc_name() or stores the first errno */
1024 if (err >= 0) {
1025 err = ret;
1026 memcpy(dev->name, oldname, IFNAMSIZ);
1027 goto rollback;
1028 } else {
1029 printk(KERN_ERR
1030 "%s: name change rollback failed: %d.\n",
1031 dev->name, ret);
1032 }
1033 }
1034
1035 return err;
1036 }
1037
1038 /**
1039 * dev_set_alias - change ifalias of a device
1040 * @dev: device
1041 * @alias: name up to IFALIASZ
1042 * @len: limit of bytes to copy from @alias
1043 *
1044 * Set ifalias for a device.
1045 */
1046 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1047 {
1048 ASSERT_RTNL();
1049
1050 if (len >= IFALIASZ)
1051 return -EINVAL;
1052
1053 if (!len) {
1054 if (dev->ifalias) {
1055 kfree(dev->ifalias);
1056 dev->ifalias = NULL;
1057 }
1058 return 0;
1059 }
1060
1061 dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1062 if (!dev->ifalias)
1063 return -ENOMEM;
1064
1065 strlcpy(dev->ifalias, alias, len+1);
1066 return len;
1067 }
1068
1069
1070 /**
1071 * netdev_features_change - device changes features
1072 * @dev: device to cause notification
1073 *
1074 * Called to indicate a device has changed features.
1075 */
1076 void netdev_features_change(struct net_device *dev)
1077 {
1078 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1079 }
1080 EXPORT_SYMBOL(netdev_features_change);
1081
1082 /**
1083 * netdev_state_change - device changes state
1084 * @dev: device to cause notification
1085 *
1086 * Called to indicate a device has changed state. This function calls
1087 * the notifier chains for netdev_chain and sends a NEWLINK message
1088 * to the routing socket.
1089 */
1090 void netdev_state_change(struct net_device *dev)
1091 {
1092 if (dev->flags & IFF_UP) {
1093 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1094 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1095 }
1096 }
1097 EXPORT_SYMBOL(netdev_state_change);
1098
1099 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1100 {
1101 return call_netdevice_notifiers(event, dev);
1102 }
1103 EXPORT_SYMBOL(netdev_bonding_change);
1104
1105 /**
1106 * dev_load - load a network module
1107 * @net: the applicable net namespace
1108 * @name: name of interface
1109 *
1110 * If a network interface is not present and the process has suitable
1111 * privileges this function loads the module. If module loading is not
1112 * available in this kernel then it becomes a nop.
1113 */
1114
1115 void dev_load(struct net *net, const char *name)
1116 {
1117 struct net_device *dev;
1118
1119 rcu_read_lock();
1120 dev = dev_get_by_name_rcu(net, name);
1121 rcu_read_unlock();
1122
1123 if (!dev && capable(CAP_NET_ADMIN))
1124 request_module("%s", name);
1125 }
1126 EXPORT_SYMBOL(dev_load);
1127
1128 static int __dev_open(struct net_device *dev)
1129 {
1130 const struct net_device_ops *ops = dev->netdev_ops;
1131 int ret;
1132
1133 ASSERT_RTNL();
1134
1135 /*
1136 * Is it even present?
1137 */
1138 if (!netif_device_present(dev))
1139 return -ENODEV;
1140
1141 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1142 ret = notifier_to_errno(ret);
1143 if (ret)
1144 return ret;
1145
1146 /*
1147 * Call device private open method
1148 */
1149 set_bit(__LINK_STATE_START, &dev->state);
1150
1151 if (ops->ndo_validate_addr)
1152 ret = ops->ndo_validate_addr(dev);
1153
1154 if (!ret && ops->ndo_open)
1155 ret = ops->ndo_open(dev);
1156
1157 /*
1158 * If it went open OK then:
1159 */
1160
1161 if (ret)
1162 clear_bit(__LINK_STATE_START, &dev->state);
1163 else {
1164 /*
1165 * Set the flags.
1166 */
1167 dev->flags |= IFF_UP;
1168
1169 /*
1170 * Enable NET_DMA
1171 */
1172 net_dmaengine_get();
1173
1174 /*
1175 * Initialize multicasting status
1176 */
1177 dev_set_rx_mode(dev);
1178
1179 /*
1180 * Wakeup transmit queue engine
1181 */
1182 dev_activate(dev);
1183 }
1184
1185 return ret;
1186 }
1187
1188 /**
1189 * dev_open - prepare an interface for use.
1190 * @dev: device to open
1191 *
1192 * Takes a device from down to up state. The device's private open
1193 * function is invoked and then the multicast lists are loaded. Finally
1194 * the device is moved into the up state and a %NETDEV_UP message is
1195 * sent to the netdev notifier chain.
1196 *
1197 * Calling this function on an active interface is a nop. On a failure
1198 * a negative errno code is returned.
1199 */
1200 int dev_open(struct net_device *dev)
1201 {
1202 int ret;
1203
1204 /*
1205 * Is it already up?
1206 */
1207 if (dev->flags & IFF_UP)
1208 return 0;
1209
1210 /*
1211 * Open device
1212 */
1213 ret = __dev_open(dev);
1214 if (ret < 0)
1215 return ret;
1216
1217 /*
1218 * ... and announce new interface.
1219 */
1220 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1221 call_netdevice_notifiers(NETDEV_UP, dev);
1222
1223 return ret;
1224 }
1225 EXPORT_SYMBOL(dev_open);
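/*
 * Illustrative sketch (not part of the kernel build): bringing an interface
 * up from kernel code.  dev_open() must be called under RTNL; the function
 * name below is hypothetical.
 */
#if 0
static int example_bring_up(struct net *net, const char *ifname)
{
	struct net_device *dev;
	int err = -ENODEV;

	rtnl_lock();
	dev = __dev_get_by_name(net, ifname);
	if (dev)
		err = dev_open(dev);
	rtnl_unlock();

	return err;
}
#endif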
1226
1227 static int __dev_close_many(struct list_head *head)
1228 {
1229 struct net_device *dev;
1230
1231 ASSERT_RTNL();
1232 might_sleep();
1233
1234 list_for_each_entry(dev, head, unreg_list) {
1235 /*
1236 * Tell people we are going down, so that they can
1237 * prepare for death while the device is still operating.
1238 */
1239 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1240
1241 clear_bit(__LINK_STATE_START, &dev->state);
1242
1243 /* Synchronize to scheduled poll. We cannot touch poll list, it
1244 * can be even on different cpu. So just clear netif_running().
1245 *
1246 * dev->stop() will invoke napi_disable() on all of its
1247 * napi_struct instances on this device.
1248 */
1249 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1250 }
1251
1252 dev_deactivate_many(head);
1253
1254 list_for_each_entry(dev, head, unreg_list) {
1255 const struct net_device_ops *ops = dev->netdev_ops;
1256
1257 /*
1258 * Call the device specific close. This cannot fail.
1259 * Only if device is UP
1260 *
1261 * We allow it to be called even after a DETACH hot-plug
1262 * event.
1263 */
1264 if (ops->ndo_stop)
1265 ops->ndo_stop(dev);
1266
1267 /*
1268 * Device is now down.
1269 */
1270
1271 dev->flags &= ~IFF_UP;
1272
1273 /*
1274 * Shutdown NET_DMA
1275 */
1276 net_dmaengine_put();
1277 }
1278
1279 return 0;
1280 }
1281
1282 static int __dev_close(struct net_device *dev)
1283 {
1284 LIST_HEAD(single);
1285
1286 list_add(&dev->unreg_list, &single);
1287 return __dev_close_many(&single);
1288 }
1289
1290 static int dev_close_many(struct list_head *head)
1291 {
1292 struct net_device *dev, *tmp;
1293 LIST_HEAD(tmp_list);
1294
1295 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1296 if (!(dev->flags & IFF_UP))
1297 list_move(&dev->unreg_list, &tmp_list);
1298
1299 __dev_close_many(head);
1300
1301 /*
1302 * Tell people we are down
1303 */
1304 list_for_each_entry(dev, head, unreg_list) {
1305 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1306 call_netdevice_notifiers(NETDEV_DOWN, dev);
1307 }
1308
1309 /* rollback_registered_many needs the complete original list */
1310 list_splice(&tmp_list, head);
1311 return 0;
1312 }
1313
1314 /**
1315 * dev_close - shutdown an interface.
1316 * @dev: device to shutdown
1317 *
1318 * This function moves an active device into down state. A
1319 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1320 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1321 * chain.
1322 */
1323 int dev_close(struct net_device *dev)
1324 {
1325 LIST_HEAD(single);
1326
1327 list_add(&dev->unreg_list, &single);
1328 dev_close_many(&single);
1329
1330 return 0;
1331 }
1332 EXPORT_SYMBOL(dev_close);
1333
1334
1335 /**
1336 * dev_disable_lro - disable Large Receive Offload on a device
1337 * @dev: device
1338 *
1339 * Disable Large Receive Offload (LRO) on a net device. Must be
1340 * called under RTNL. This is needed if received packets may be
1341 * forwarded to another interface.
1342 */
1343 void dev_disable_lro(struct net_device *dev)
1344 {
1345 if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1346 dev->ethtool_ops->set_flags) {
1347 u32 flags = dev->ethtool_ops->get_flags(dev);
1348 if (flags & ETH_FLAG_LRO) {
1349 flags &= ~ETH_FLAG_LRO;
1350 dev->ethtool_ops->set_flags(dev, flags);
1351 }
1352 }
1353 WARN_ON(dev->features & NETIF_F_LRO);
1354 }
1355 EXPORT_SYMBOL(dev_disable_lro);
1356
1357
1358 static int dev_boot_phase = 1;
1359
1360 /*
1361 * Device change register/unregister. These are not inline or static
1362 * as we export them to the world.
1363 */
1364
1365 /**
1366 * register_netdevice_notifier - register a network notifier block
1367 * @nb: notifier
1368 *
1369 * Register a notifier to be called when network device events occur.
1370 * The notifier passed is linked into the kernel structures and must
1371 * not be reused until it has been unregistered. A negative errno code
1372 * is returned on a failure.
1373 *
1374 * When registered all registration and up events are replayed
1375 * to the new notifier to allow it to get a race-free
1376 * view of the network device list.
1377 */
1378
1379 int register_netdevice_notifier(struct notifier_block *nb)
1380 {
1381 struct net_device *dev;
1382 struct net_device *last;
1383 struct net *net;
1384 int err;
1385
1386 rtnl_lock();
1387 err = raw_notifier_chain_register(&netdev_chain, nb);
1388 if (err)
1389 goto unlock;
1390 if (dev_boot_phase)
1391 goto unlock;
1392 for_each_net(net) {
1393 for_each_netdev(net, dev) {
1394 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1395 err = notifier_to_errno(err);
1396 if (err)
1397 goto rollback;
1398
1399 if (!(dev->flags & IFF_UP))
1400 continue;
1401
1402 nb->notifier_call(nb, NETDEV_UP, dev);
1403 }
1404 }
1405
1406 unlock:
1407 rtnl_unlock();
1408 return err;
1409
1410 rollback:
1411 last = dev;
1412 for_each_net(net) {
1413 for_each_netdev(net, dev) {
1414 if (dev == last)
1415 break;
1416
1417 if (dev->flags & IFF_UP) {
1418 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1419 nb->notifier_call(nb, NETDEV_DOWN, dev);
1420 }
1421 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1422 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1423 }
1424 }
1425
1426 raw_notifier_chain_unregister(&netdev_chain, nb);
1427 goto unlock;
1428 }
1429 EXPORT_SYMBOL(register_netdevice_notifier);
1430
1431 /**
1432 * unregister_netdevice_notifier - unregister a network notifier block
1433 * @nb: notifier
1434 *
1435 * Unregister a notifier previously registered by
1436 * register_netdevice_notifier(). The notifier is unlinked from the
1437 * kernel structures and may then be reused. A negative errno code
1438 * is returned on a failure.
1439 */
1440
1441 int unregister_netdevice_notifier(struct notifier_block *nb)
1442 {
1443 int err;
1444
1445 rtnl_lock();
1446 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1447 rtnl_unlock();
1448 return err;
1449 }
1450 EXPORT_SYMBOL(unregister_netdevice_notifier);
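/*
 * Illustrative sketch (not part of the kernel build): a minimal notifier.
 * As described above, register_netdevice_notifier() replays NETDEV_REGISTER
 * and NETDEV_UP for devices that already exist.  Names are hypothetical.
 */
#if 0
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;	/* in this kernel, ptr is the device */

	switch (event) {
	case NETDEV_UP:
		pr_info("%s is up\n", dev->name);
		break;
	case NETDEV_GOING_DOWN:
		pr_info("%s is going down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
	.notifier_call = example_netdev_event,
};

/* register_netdevice_notifier(&example_netdev_notifier);   */
/* unregister_netdevice_notifier(&example_netdev_notifier); */
#endif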
1451
1452 /**
1453 * call_netdevice_notifiers - call all network notifier blocks
1454 * @val: value passed unmodified to notifier function
1455 * @dev: net_device pointer passed unmodified to notifier function
1456 *
1457 * Call all network notifier blocks. Parameters and return value
1458 * are as for raw_notifier_call_chain().
1459 */
1460
1461 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1462 {
1463 ASSERT_RTNL();
1464 return raw_notifier_call_chain(&netdev_chain, val, dev);
1465 }
1466
1467 /* When > 0 there are consumers of rx skb time stamps */
1468 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1469
1470 void net_enable_timestamp(void)
1471 {
1472 atomic_inc(&netstamp_needed);
1473 }
1474 EXPORT_SYMBOL(net_enable_timestamp);
1475
1476 void net_disable_timestamp(void)
1477 {
1478 atomic_dec(&netstamp_needed);
1479 }
1480 EXPORT_SYMBOL(net_disable_timestamp);
1481
1482 static inline void net_timestamp_set(struct sk_buff *skb)
1483 {
1484 if (atomic_read(&netstamp_needed))
1485 __net_timestamp(skb);
1486 else
1487 skb->tstamp.tv64 = 0;
1488 }
1489
1490 static inline void net_timestamp_check(struct sk_buff *skb)
1491 {
1492 if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1493 __net_timestamp(skb);
1494 }
1495
1496 /**
1497 * dev_forward_skb - loopback an skb to another netif
1498 *
1499 * @dev: destination network device
1500 * @skb: buffer to forward
1501 *
1502 * return values:
1503 * NET_RX_SUCCESS (no congestion)
1504 * NET_RX_DROP (packet was dropped, but freed)
1505 *
1506 * dev_forward_skb can be used for injecting an skb from the
1507 * start_xmit function of one device into the receive queue
1508 * of another device.
1509 *
1510 * The receiving device may be in another namespace, so
1511 * we have to clear all information in the skb that could
1512 * impact namespace isolation.
1513 */
1514 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1515 {
1516 skb_orphan(skb);
1517 nf_reset(skb);
1518
1519 if (unlikely(!(dev->flags & IFF_UP) ||
1520 (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
1521 atomic_long_inc(&dev->rx_dropped);
1522 kfree_skb(skb);
1523 return NET_RX_DROP;
1524 }
1525 skb_set_dev(skb, dev);
1526 skb->tstamp.tv64 = 0;
1527 skb->pkt_type = PACKET_HOST;
1528 skb->protocol = eth_type_trans(skb, dev);
1529 return netif_rx(skb);
1530 }
1531 EXPORT_SYMBOL_GPL(dev_forward_skb);
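/*
 * Illustrative sketch (not part of the kernel build): how a virtual device
 * pair (in the spirit of veth) might use dev_forward_skb() from its
 * start_xmit to hand a packet to its peer.  The function names and the peer
 * lookup are hypothetical.
 */
#if 0
static netdev_tx_t example_pair_xmit(struct sk_buff *skb,
				     struct net_device *dev)
{
	struct net_device *peer = example_get_peer(dev);	/* hypothetical */

	/* dev_forward_skb() scrubs the skb and queues it via netif_rx();
	 * it consumes the skb whether it succeeds or drops.
	 */
	if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS)
		dev->stats.tx_packets++;
	else
		dev->stats.tx_dropped++;

	return NETDEV_TX_OK;
}
#endif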
1532
1533 static inline int deliver_skb(struct sk_buff *skb,
1534 struct packet_type *pt_prev,
1535 struct net_device *orig_dev)
1536 {
1537 atomic_inc(&skb->users);
1538 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1539 }
1540
1541 /*
1542 * Support routine. Sends outgoing frames to any network
1543 * taps currently in use.
1544 */
1545
1546 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1547 {
1548 struct packet_type *ptype;
1549 struct sk_buff *skb2 = NULL;
1550 struct packet_type *pt_prev = NULL;
1551
1552 rcu_read_lock();
1553 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1554 /* Never send packets back to the socket
1555 * they originated from - MvS (miquels@drinkel.ow.org)
1556 */
1557 if ((ptype->dev == dev || !ptype->dev) &&
1558 (ptype->af_packet_priv == NULL ||
1559 (struct sock *)ptype->af_packet_priv != skb->sk)) {
1560 if (pt_prev) {
1561 deliver_skb(skb2, pt_prev, skb->dev);
1562 pt_prev = ptype;
1563 continue;
1564 }
1565
1566 skb2 = skb_clone(skb, GFP_ATOMIC);
1567 if (!skb2)
1568 break;
1569
1570 net_timestamp_set(skb2);
1571
1572 /* skb->nh should be correctly
1573 set by sender, so that the second statement is
1574 just protection against buggy protocols.
1575 */
1576 skb_reset_mac_header(skb2);
1577
1578 if (skb_network_header(skb2) < skb2->data ||
1579 skb2->network_header > skb2->tail) {
1580 if (net_ratelimit())
1581 printk(KERN_CRIT "protocol %04x is "
1582 "buggy, dev %s\n",
1583 ntohs(skb2->protocol),
1584 dev->name);
1585 skb_reset_network_header(skb2);
1586 }
1587
1588 skb2->transport_header = skb2->network_header;
1589 skb2->pkt_type = PACKET_OUTGOING;
1590 pt_prev = ptype;
1591 }
1592 }
1593 if (pt_prev)
1594 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1595 rcu_read_unlock();
1596 }
1597
1598 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1599 * @dev: Network device
1600 * @txq: number of queues available
1601 *
1602 * If real_num_tx_queues is changed the tc mappings may no longer be
1603 * valid. To resolve this verify that the tc mapping remains valid and,
1604 * if not, zero the mapping. With no priorities mapping to this
1605 * offset/count pair it will no longer be used. In the worst case, TC0
1606 * is invalid and nothing can be done, so disable priority mappings. It
1607 * is expected that drivers will fix this mapping if they can before
1608 * calling netif_set_real_num_tx_queues.
1609 */
1610 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1611 {
1612 int i;
1613 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1614
1615 /* If TC0 is invalidated disable TC mapping */
1616 if (tc->offset + tc->count > txq) {
1617 pr_warning("Number of in use tx queues changed "
1618 "invalidating tc mappings. Priority "
1619 "traffic classification disabled!\n");
1620 dev->num_tc = 0;
1621 return;
1622 }
1623
1624 /* Invalidated prio to tc mappings set to TC0 */
1625 for (i = 1; i < TC_BITMASK + 1; i++) {
1626 int q = netdev_get_prio_tc_map(dev, i);
1627
1628 tc = &dev->tc_to_txq[q];
1629 if (tc->offset + tc->count > txq) {
1630 pr_warning("Number of in use tx queues "
1631 "changed. Priority %i to tc "
1632 "mapping %i is no longer valid "
1633 "setting map to 0\n",
1634 i, q);
1635 netdev_set_prio_tc_map(dev, i, 0);
1636 }
1637 }
1638 }
1639
1640 /*
1641 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1642 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1643 */
1644 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1645 {
1646 int rc;
1647
1648 if (txq < 1 || txq > dev->num_tx_queues)
1649 return -EINVAL;
1650
1651 if (dev->reg_state == NETREG_REGISTERED) {
1652 ASSERT_RTNL();
1653
1654 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1655 txq);
1656 if (rc)
1657 return rc;
1658
1659 if (dev->num_tc)
1660 netif_setup_tc(dev, txq);
1661
1662 if (txq < dev->real_num_tx_queues)
1663 qdisc_reset_all_tx_gt(dev, txq);
1664 }
1665
1666 dev->real_num_tx_queues = txq;
1667 return 0;
1668 }
1669 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1670
1671 #ifdef CONFIG_RPS
1672 /**
1673 * netif_set_real_num_rx_queues - set actual number of RX queues used
1674 * @dev: Network device
1675 * @rxq: Actual number of RX queues
1676 *
1677 * This must be called either with the rtnl_lock held or before
1678 * registration of the net device. Returns 0 on success, or a
1679 * negative error code. If called before registration, it always
1680 * succeeds.
1681 */
1682 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1683 {
1684 int rc;
1685
1686 if (rxq < 1 || rxq > dev->num_rx_queues)
1687 return -EINVAL;
1688
1689 if (dev->reg_state == NETREG_REGISTERED) {
1690 ASSERT_RTNL();
1691
1692 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1693 rxq);
1694 if (rc)
1695 return rc;
1696 }
1697
1698 dev->real_num_rx_queues = rxq;
1699 return 0;
1700 }
1701 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1702 #endif
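/*
 * Illustrative sketch (not part of the kernel build): a multiqueue driver
 * shrinking or growing its active TX/RX queue counts after reconfiguration,
 * under RTNL.  The function and variable names are hypothetical.
 */
#if 0
static int example_resize_queues(struct net_device *netdev,
				 unsigned int new_tx, unsigned int new_rx)
{
	int err;

	rtnl_lock();
	err = netif_set_real_num_tx_queues(netdev, new_tx);
	if (!err)
		err = netif_set_real_num_rx_queues(netdev, new_rx);
	rtnl_unlock();

	return err;
}
#endif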
1703
1704 static inline void __netif_reschedule(struct Qdisc *q)
1705 {
1706 struct softnet_data *sd;
1707 unsigned long flags;
1708
1709 local_irq_save(flags);
1710 sd = &__get_cpu_var(softnet_data);
1711 q->next_sched = NULL;
1712 *sd->output_queue_tailp = q;
1713 sd->output_queue_tailp = &q->next_sched;
1714 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1715 local_irq_restore(flags);
1716 }
1717
1718 void __netif_schedule(struct Qdisc *q)
1719 {
1720 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1721 __netif_reschedule(q);
1722 }
1723 EXPORT_SYMBOL(__netif_schedule);
1724
1725 void dev_kfree_skb_irq(struct sk_buff *skb)
1726 {
1727 if (atomic_dec_and_test(&skb->users)) {
1728 struct softnet_data *sd;
1729 unsigned long flags;
1730
1731 local_irq_save(flags);
1732 sd = &__get_cpu_var(softnet_data);
1733 skb->next = sd->completion_queue;
1734 sd->completion_queue = skb;
1735 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1736 local_irq_restore(flags);
1737 }
1738 }
1739 EXPORT_SYMBOL(dev_kfree_skb_irq);
1740
1741 void dev_kfree_skb_any(struct sk_buff *skb)
1742 {
1743 if (in_irq() || irqs_disabled())
1744 dev_kfree_skb_irq(skb);
1745 else
1746 dev_kfree_skb(skb);
1747 }
1748 EXPORT_SYMBOL(dev_kfree_skb_any);
1749
1750
1751 /**
1752 * netif_device_detach - mark device as removed
1753 * @dev: network device
1754 *
1755 * Mark device as removed from system and therefore no longer available.
1756 */
1757 void netif_device_detach(struct net_device *dev)
1758 {
1759 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1760 netif_running(dev)) {
1761 netif_tx_stop_all_queues(dev);
1762 }
1763 }
1764 EXPORT_SYMBOL(netif_device_detach);
1765
1766 /**
1767 * netif_device_attach - mark device as attached
1768 * @dev: network device
1769 *
1770 * Mark the device as attached to the system and restart it if needed.
1771 */
1772 void netif_device_attach(struct net_device *dev)
1773 {
1774 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1775 netif_running(dev)) {
1776 netif_tx_wake_all_queues(dev);
1777 __netdev_watchdog_up(dev);
1778 }
1779 }
1780 EXPORT_SYMBOL(netif_device_attach);
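/*
 * Illustrative sketch (not part of the kernel build): the usual
 * suspend/resume pairing for netif_device_detach()/netif_device_attach()
 * in a PCI driver using the legacy PM callbacks.  Names are hypothetical
 * and driver-specific details are omitted.
 */
#if 0
static int example_suspend(struct pci_dev *pdev, pm_message_t state)
{
	struct net_device *netdev = pci_get_drvdata(pdev);

	netif_device_detach(netdev);	/* stop all TX queues if running */
	/* ... stop hardware, save state ... */
	return 0;
}

static int example_resume(struct pci_dev *pdev)
{
	struct net_device *netdev = pci_get_drvdata(pdev);

	/* ... restore hardware state ... */
	netif_device_attach(netdev);	/* wake queues, rearm watchdog */
	return 0;
}
#endif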
1781
1782 /**
1783 * skb_set_dev - assign a new device to a buffer
1784 * @skb: buffer for the new device
1785 * @dev: network device
1786 *
1787 * If an skb is owned by a device already, we have to reset
1788 * all data private to the namespace a device belongs to
1789 * before assigning it a new device.
1790 */
1791 #ifdef CONFIG_NET_NS
1792 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1793 {
1794 skb_dst_drop(skb);
1795 if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1796 secpath_reset(skb);
1797 nf_reset(skb);
1798 skb_init_secmark(skb);
1799 skb->mark = 0;
1800 skb->priority = 0;
1801 skb->nf_trace = 0;
1802 skb->ipvs_property = 0;
1803 #ifdef CONFIG_NET_SCHED
1804 skb->tc_index = 0;
1805 #endif
1806 }
1807 skb->dev = dev;
1808 }
1809 EXPORT_SYMBOL(skb_set_dev);
1810 #endif /* CONFIG_NET_NS */
1811
1812 /*
1813 * Invalidate hardware checksum when packet is to be mangled, and
1814 * complete checksum manually on outgoing path.
1815 */
1816 int skb_checksum_help(struct sk_buff *skb)
1817 {
1818 __wsum csum;
1819 int ret = 0, offset;
1820
1821 if (skb->ip_summed == CHECKSUM_COMPLETE)
1822 goto out_set_summed;
1823
1824 if (unlikely(skb_shinfo(skb)->gso_size)) {
1825 /* Let GSO fix up the checksum. */
1826 goto out_set_summed;
1827 }
1828
1829 offset = skb_checksum_start_offset(skb);
1830 BUG_ON(offset >= skb_headlen(skb));
1831 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1832
1833 offset += skb->csum_offset;
1834 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1835
1836 if (skb_cloned(skb) &&
1837 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1838 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1839 if (ret)
1840 goto out;
1841 }
1842
1843 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1844 out_set_summed:
1845 skb->ip_summed = CHECKSUM_NONE;
1846 out:
1847 return ret;
1848 }
1849 EXPORT_SYMBOL(skb_checksum_help);
1850
1851 /**
1852 * skb_gso_segment - Perform segmentation on skb.
1853 * @skb: buffer to segment
1854 * @features: features for the output path (see dev->features)
1855 *
1856 * This function segments the given skb and returns a list of segments.
1857 *
1858 * It may return NULL if the skb requires no segmentation. This is
1859 * only possible when GSO is used for verifying header integrity.
1860 */
1861 struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1862 {
1863 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1864 struct packet_type *ptype;
1865 __be16 type = skb->protocol;
1866 int vlan_depth = ETH_HLEN;
1867 int err;
1868
1869 while (type == htons(ETH_P_8021Q)) {
1870 struct vlan_hdr *vh;
1871
1872 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1873 return ERR_PTR(-EINVAL);
1874
1875 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1876 type = vh->h_vlan_encapsulated_proto;
1877 vlan_depth += VLAN_HLEN;
1878 }
1879
1880 skb_reset_mac_header(skb);
1881 skb->mac_len = skb->network_header - skb->mac_header;
1882 __skb_pull(skb, skb->mac_len);
1883
1884 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1885 struct net_device *dev = skb->dev;
1886 struct ethtool_drvinfo info = {};
1887
1888 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1889 dev->ethtool_ops->get_drvinfo(dev, &info);
1890
1891 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1892 info.driver, dev ? dev->features : 0L,
1893 skb->sk ? skb->sk->sk_route_caps : 0L,
1894 skb->len, skb->data_len, skb->ip_summed);
1895
1896 if (skb_header_cloned(skb) &&
1897 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1898 return ERR_PTR(err);
1899 }
1900
1901 rcu_read_lock();
1902 list_for_each_entry_rcu(ptype,
1903 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1904 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1905 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1906 err = ptype->gso_send_check(skb);
1907 segs = ERR_PTR(err);
1908 if (err || skb_gso_ok(skb, features))
1909 break;
1910 __skb_push(skb, (skb->data -
1911 skb_network_header(skb)));
1912 }
1913 segs = ptype->gso_segment(skb, features);
1914 break;
1915 }
1916 }
1917 rcu_read_unlock();
1918
1919 __skb_push(skb, skb->data - skb_mac_header(skb));
1920
1921 return segs;
1922 }
1923 EXPORT_SYMBOL(skb_gso_segment);
1924
1925 /* Take action when hardware reception checksum errors are detected. */
1926 #ifdef CONFIG_BUG
1927 void netdev_rx_csum_fault(struct net_device *dev)
1928 {
1929 if (net_ratelimit()) {
1930 printk(KERN_ERR "%s: hw csum failure.\n",
1931 dev ? dev->name : "<unknown>");
1932 dump_stack();
1933 }
1934 }
1935 EXPORT_SYMBOL(netdev_rx_csum_fault);
1936 #endif
1937
1938 /* Actually, we should eliminate this check as soon as we know that:
1939 * 1. IOMMU is present and allows mapping all of the memory.
1940 * 2. No high memory really exists on this machine.
1941 */
1942
1943 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1944 {
1945 #ifdef CONFIG_HIGHMEM
1946 int i;
1947 if (!(dev->features & NETIF_F_HIGHDMA)) {
1948 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1949 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1950 return 1;
1951 }
1952
1953 if (PCI_DMA_BUS_IS_PHYS) {
1954 struct device *pdev = dev->dev.parent;
1955
1956 if (!pdev)
1957 return 0;
1958 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1959 dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1960 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1961 return 1;
1962 }
1963 }
1964 #endif
1965 return 0;
1966 }
1967
1968 struct dev_gso_cb {
1969 void (*destructor)(struct sk_buff *skb);
1970 };
1971
1972 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1973
1974 static void dev_gso_skb_destructor(struct sk_buff *skb)
1975 {
1976 struct dev_gso_cb *cb;
1977
1978 do {
1979 struct sk_buff *nskb = skb->next;
1980
1981 skb->next = nskb->next;
1982 nskb->next = NULL;
1983 kfree_skb(nskb);
1984 } while (skb->next);
1985
1986 cb = DEV_GSO_CB(skb);
1987 if (cb->destructor)
1988 cb->destructor(skb);
1989 }
1990
1991 /**
1992 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1993 * @skb: buffer to segment
1994 * @features: device features as applicable to this skb
1995 *
1996 * This function segments the given skb and stores the list of segments
1997 * in skb->next.
1998 */
1999 static int dev_gso_segment(struct sk_buff *skb, int features)
2000 {
2001 struct sk_buff *segs;
2002
2003 segs = skb_gso_segment(skb, features);
2004
2005 /* Verifying header integrity only. */
2006 if (!segs)
2007 return 0;
2008
2009 if (IS_ERR(segs))
2010 return PTR_ERR(segs);
2011
2012 skb->next = segs;
2013 DEV_GSO_CB(skb)->destructor = skb->destructor;
2014 skb->destructor = dev_gso_skb_destructor;
2015
2016 return 0;
2017 }
2018
2019 /*
2020 * Try to orphan skb early, right before transmission by the device.
2021 * We cannot orphan skb if tx timestamp is requested or the sk-reference
2022 * is needed at the driver level for other reasons, e.g. see net/can/raw.c
2023 */
2024 static inline void skb_orphan_try(struct sk_buff *skb)
2025 {
2026 struct sock *sk = skb->sk;
2027
2028 if (sk && !skb_shinfo(skb)->tx_flags) {
2029 /* skb_tx_hash() won't be able to get the sk.
2030 * We copy sk_hash into skb->rxhash.
2031 */
2032 if (!skb->rxhash)
2033 skb->rxhash = sk->sk_hash;
2034 skb_orphan(skb);
2035 }
2036 }
2037
2038 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2039 {
2040 return ((features & NETIF_F_GEN_CSUM) ||
2041 ((features & NETIF_F_V4_CSUM) &&
2042 protocol == htons(ETH_P_IP)) ||
2043 ((features & NETIF_F_V6_CSUM) &&
2044 protocol == htons(ETH_P_IPV6)) ||
2045 ((features & NETIF_F_FCOE_CRC) &&
2046 protocol == htons(ETH_P_FCOE)));
2047 }
2048
2049 static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2050 {
2051 if (!can_checksum_protocol(features, protocol)) {
2052 features &= ~NETIF_F_ALL_CSUM;
2053 features &= ~NETIF_F_SG;
2054 } else if (illegal_highdma(skb->dev, skb)) {
2055 features &= ~NETIF_F_SG;
2056 }
2057
2058 return features;
2059 }
2060
2061 u32 netif_skb_features(struct sk_buff *skb)
2062 {
2063 __be16 protocol = skb->protocol;
2064 u32 features = skb->dev->features;
2065
2066 if (protocol == htons(ETH_P_8021Q)) {
2067 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2068 protocol = veh->h_vlan_encapsulated_proto;
2069 } else if (!vlan_tx_tag_present(skb)) {
2070 return harmonize_features(skb, protocol, features);
2071 }
2072
2073 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2074
2075 if (protocol != htons(ETH_P_8021Q)) {
2076 return harmonize_features(skb, protocol, features);
2077 } else {
2078 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2079 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2080 return harmonize_features(skb, protocol, features);
2081 }
2082 }
2083 EXPORT_SYMBOL(netif_skb_features);
2084
2085 /*
2086 * Returns true if either:
2087 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2088 * 2. skb is fragmented and the device does not support SG, or
2089 * at least one of the fragments is in highmem and the device
2090 * does not support DMA from it.
2091 */
2092 static inline int skb_needs_linearize(struct sk_buff *skb,
2093 int features)
2094 {
2095 return skb_is_nonlinear(skb) &&
2096 ((skb_has_frag_list(skb) &&
2097 !(features & NETIF_F_FRAGLIST)) ||
2098 (skb_shinfo(skb)->nr_frags &&
2099 !(features & NETIF_F_SG)));
2100 }
2101
2102 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2103 struct netdev_queue *txq)
2104 {
2105 const struct net_device_ops *ops = dev->netdev_ops;
2106 int rc = NETDEV_TX_OK;
2107
2108 if (likely(!skb->next)) {
2109 u32 features;
2110
2111 /*
2112 * If the device doesn't need skb->dst, release it right now while
2113 * it's hot in this CPU's cache.
2114 */
2115 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2116 skb_dst_drop(skb);
2117
2118 if (!list_empty(&ptype_all))
2119 dev_queue_xmit_nit(skb, dev);
2120
2121 skb_orphan_try(skb);
2122
2123 features = netif_skb_features(skb);
2124
2125 if (vlan_tx_tag_present(skb) &&
2126 !(features & NETIF_F_HW_VLAN_TX)) {
2127 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2128 if (unlikely(!skb))
2129 goto out;
2130
2131 skb->vlan_tci = 0;
2132 }
2133
2134 if (netif_needs_gso(skb, features)) {
2135 if (unlikely(dev_gso_segment(skb, features)))
2136 goto out_kfree_skb;
2137 if (skb->next)
2138 goto gso;
2139 } else {
2140 if (skb_needs_linearize(skb, features) &&
2141 __skb_linearize(skb))
2142 goto out_kfree_skb;
2143
2144 /* If the packet is not checksummed and the device does
2145 * not support checksumming for this protocol, complete
2146 * the checksum here.
2147 */
2148 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2149 skb_set_transport_header(skb,
2150 skb_checksum_start_offset(skb));
2151 if (!(features & NETIF_F_ALL_CSUM) &&
2152 skb_checksum_help(skb))
2153 goto out_kfree_skb;
2154 }
2155 }
2156
2157 rc = ops->ndo_start_xmit(skb, dev);
2158 trace_net_dev_xmit(skb, rc);
2159 if (rc == NETDEV_TX_OK)
2160 txq_trans_update(txq);
2161 return rc;
2162 }
2163
2164 gso:
2165 do {
2166 struct sk_buff *nskb = skb->next;
2167
2168 skb->next = nskb->next;
2169 nskb->next = NULL;
2170
2171 /*
2172 * If the device doesn't need nskb->dst, release it right now while
2173 * it's hot in this CPU's cache.
2174 */
2175 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2176 skb_dst_drop(nskb);
2177
2178 rc = ops->ndo_start_xmit(nskb, dev);
2179 trace_net_dev_xmit(nskb, rc);
2180 if (unlikely(rc != NETDEV_TX_OK)) {
2181 if (rc & ~NETDEV_TX_MASK)
2182 goto out_kfree_gso_skb;
2183 nskb->next = skb->next;
2184 skb->next = nskb;
2185 return rc;
2186 }
2187 txq_trans_update(txq);
2188 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2189 return NETDEV_TX_BUSY;
2190 } while (skb->next);
2191
2192 out_kfree_gso_skb:
2193 if (likely(skb->next == NULL))
2194 skb->destructor = DEV_GSO_CB(skb)->destructor;
2195 out_kfree_skb:
2196 kfree_skb(skb);
2197 out:
2198 return rc;
2199 }
2200
2201 static u32 hashrnd __read_mostly;
2202
2203 /*
2204 * Returns a Tx hash based on the given packet descriptor and the number
2205 * of Tx queues to be used as a distribution range.
2206 */
2207 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2208 unsigned int num_tx_queues)
2209 {
2210 u32 hash;
2211 u16 qoffset = 0;
2212 u16 qcount = num_tx_queues;
2213
2214 if (skb_rx_queue_recorded(skb)) {
2215 hash = skb_get_rx_queue(skb);
2216 while (unlikely(hash >= num_tx_queues))
2217 hash -= num_tx_queues;
2218 return hash;
2219 }
2220
2221 if (dev->num_tc) {
2222 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2223 qoffset = dev->tc_to_txq[tc].offset;
2224 qcount = dev->tc_to_txq[tc].count;
2225 }
2226
2227 if (skb->sk && skb->sk->sk_hash)
2228 hash = skb->sk->sk_hash;
2229 else
2230 hash = (__force u16) skb->protocol ^ skb->rxhash;
2231 hash = jhash_1word(hash, hashrnd);
2232
2233 return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2234 }
2235 EXPORT_SYMBOL(__skb_tx_hash);
2236
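/*
 * Note on the final mapping in __skb_tx_hash(): ((u64)hash * qcount) >> 32
 * scales a 32-bit hash uniformly onto [0, qcount) without a modulo.
 * For example, hash = 0x80000000 and qcount = 8 gives
 * (0x80000000ULL * 8) >> 32 = 4, the middle of the queue range.
 */
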
2237 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2238 {
2239 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2240 if (net_ratelimit()) {
2241 pr_warning("%s selects TX queue %d, but "
2242 "real number of TX queues is %d\n",
2243 dev->name, queue_index, dev->real_num_tx_queues);
2244 }
2245 return 0;
2246 }
2247 return queue_index;
2248 }
2249
2250 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2251 {
2252 #ifdef CONFIG_XPS
2253 struct xps_dev_maps *dev_maps;
2254 struct xps_map *map;
2255 int queue_index = -1;
2256
2257 rcu_read_lock();
2258 dev_maps = rcu_dereference(dev->xps_maps);
2259 if (dev_maps) {
2260 map = rcu_dereference(
2261 dev_maps->cpu_map[raw_smp_processor_id()]);
2262 if (map) {
2263 if (map->len == 1)
2264 queue_index = map->queues[0];
2265 else {
2266 u32 hash;
2267 if (skb->sk && skb->sk->sk_hash)
2268 hash = skb->sk->sk_hash;
2269 else
2270 hash = (__force u16) skb->protocol ^
2271 skb->rxhash;
2272 hash = jhash_1word(hash, hashrnd);
2273 queue_index = map->queues[
2274 ((u64)hash * map->len) >> 32];
2275 }
2276 if (unlikely(queue_index >= dev->real_num_tx_queues))
2277 queue_index = -1;
2278 }
2279 }
2280 rcu_read_unlock();
2281
2282 return queue_index;
2283 #else
2284 return -1;
2285 #endif
2286 }
2287
2288 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2289 struct sk_buff *skb)
2290 {
2291 int queue_index;
2292 const struct net_device_ops *ops = dev->netdev_ops;
2293
2294 if (dev->real_num_tx_queues == 1)
2295 queue_index = 0;
2296 else if (ops->ndo_select_queue) {
2297 queue_index = ops->ndo_select_queue(dev, skb);
2298 queue_index = dev_cap_txqueue(dev, queue_index);
2299 } else {
2300 struct sock *sk = skb->sk;
2301 queue_index = sk_tx_queue_get(sk);
2302
2303 if (queue_index < 0 || skb->ooo_okay ||
2304 queue_index >= dev->real_num_tx_queues) {
2305 int old_index = queue_index;
2306
2307 queue_index = get_xps_queue(dev, skb);
2308 if (queue_index < 0)
2309 queue_index = skb_tx_hash(dev, skb);
2310
2311 if (queue_index != old_index && sk) {
2312 struct dst_entry *dst =
2313 rcu_dereference_check(sk->sk_dst_cache, 1);
2314
2315 if (dst && skb_dst(skb) == dst)
2316 sk_tx_queue_set(sk, queue_index);
2317 }
2318 }
2319 }
2320
2321 skb_set_queue_mapping(skb, queue_index);
2322 return netdev_get_tx_queue(dev, queue_index);
2323 }
2324
2325 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2326 struct net_device *dev,
2327 struct netdev_queue *txq)
2328 {
2329 spinlock_t *root_lock = qdisc_lock(q);
2330 bool contended;
2331 int rc;
2332
2333 qdisc_skb_cb(skb)->pkt_len = skb->len;
2334 qdisc_calculate_pkt_len(skb, q);
2335 /*
2336 * Heuristic to force contended enqueues to serialize on a
2337 * separate lock before trying to get qdisc main lock.
2338 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2339 * and dequeue packets faster.
2340 */
2341 contended = qdisc_is_running(q);
2342 if (unlikely(contended))
2343 spin_lock(&q->busylock);
2344
2345 spin_lock(root_lock);
2346 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2347 kfree_skb(skb);
2348 rc = NET_XMIT_DROP;
2349 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2350 qdisc_run_begin(q)) {
2351 /*
2352 * This is a work-conserving queue; there are no old skbs
2353 * waiting to be sent out; and the qdisc is not running -
2354 * xmit the skb directly.
2355 */
2356 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2357 skb_dst_force(skb);
2358
2359 qdisc_bstats_update(q, skb);
2360
2361 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2362 if (unlikely(contended)) {
2363 spin_unlock(&q->busylock);
2364 contended = false;
2365 }
2366 __qdisc_run(q);
2367 } else
2368 qdisc_run_end(q);
2369
2370 rc = NET_XMIT_SUCCESS;
2371 } else {
2372 skb_dst_force(skb);
2373 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2374 if (qdisc_run_begin(q)) {
2375 if (unlikely(contended)) {
2376 spin_unlock(&q->busylock);
2377 contended = false;
2378 }
2379 __qdisc_run(q);
2380 }
2381 }
2382 spin_unlock(root_lock);
2383 if (unlikely(contended))
2384 spin_unlock(&q->busylock);
2385 return rc;
2386 }
2387
2388 static DEFINE_PER_CPU(int, xmit_recursion);
2389 #define RECURSION_LIMIT 10
2390
2391 /**
2392 * dev_queue_xmit - transmit a buffer
2393 * @skb: buffer to transmit
2394 *
2395 * Queue a buffer for transmission to a network device. The caller must
2396 * have set the device and priority and built the buffer before calling
2397 * this function. The function can be called from an interrupt.
2398 *
2399 * A negative errno code is returned on a failure. A success does not
2400 * guarantee the frame will be transmitted as it may be dropped due
2401 * to congestion or traffic shaping.
2402 *
2403 * -----------------------------------------------------------------------------------
2404 * I notice this method can also return errors from the queue disciplines,
2405 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2406 * be positive.
2407 *
2408 * Regardless of the return value, the skb is consumed, so it is currently
2409 * difficult to retry a send to this method. (You can bump the ref count
2410 * before sending to hold a reference for retry if you are careful.)
2411 *
2412 * When calling this method, interrupts MUST be enabled. This is because
2413 * the BH enable code must have IRQs enabled so that it will not deadlock.
2414 * --BLG
2415 */
2416 int dev_queue_xmit(struct sk_buff *skb)
2417 {
2418 struct net_device *dev = skb->dev;
2419 struct netdev_queue *txq;
2420 struct Qdisc *q;
2421 int rc = -ENOMEM;
2422
2423 /* Disable soft irqs for various locks below. Also
2424 * stops preemption for RCU.
2425 */
2426 rcu_read_lock_bh();
2427
2428 txq = dev_pick_tx(dev, skb);
2429 q = rcu_dereference_bh(txq->qdisc);
2430
2431 #ifdef CONFIG_NET_CLS_ACT
2432 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2433 #endif
2434 trace_net_dev_queue(skb);
2435 if (q->enqueue) {
2436 rc = __dev_xmit_skb(skb, q, dev, txq);
2437 goto out;
2438 }
2439
2440 /* The device has no queue. Common case for software devices:
2441 loopback, all sorts of tunnels...
2442
2443 Really, it is unlikely that netif_tx_lock protection is necessary
2444 here. (f.e. loopback and IP tunnels are clean, ignoring statistics
2445 counters.)
2446 However, it is possible that they rely on the protection
2447 we provide here.
2448
2449 Check this and take the lock. It is not prone to deadlocks.
2450 Or just shoot the noqueue qdisc, which is even simpler 8)
2451 */
2452 if (dev->flags & IFF_UP) {
2453 int cpu = smp_processor_id(); /* ok because BHs are off */
2454
2455 if (txq->xmit_lock_owner != cpu) {
2456
2457 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2458 goto recursion_alert;
2459
2460 HARD_TX_LOCK(dev, txq, cpu);
2461
2462 if (!netif_tx_queue_stopped(txq)) {
2463 __this_cpu_inc(xmit_recursion);
2464 rc = dev_hard_start_xmit(skb, dev, txq);
2465 __this_cpu_dec(xmit_recursion);
2466 if (dev_xmit_complete(rc)) {
2467 HARD_TX_UNLOCK(dev, txq);
2468 goto out;
2469 }
2470 }
2471 HARD_TX_UNLOCK(dev, txq);
2472 if (net_ratelimit())
2473 printk(KERN_CRIT "Virtual device %s asks to "
2474 "queue packet!\n", dev->name);
2475 } else {
2476 /* Recursion is detected! It is possible,
2477 * unfortunately
2478 */
2479 recursion_alert:
2480 if (net_ratelimit())
2481 printk(KERN_CRIT "Dead loop on virtual device "
2482 "%s, fix it urgently!\n", dev->name);
2483 }
2484 }
2485
2486 rc = -ENETDOWN;
2487 rcu_read_unlock_bh();
2488
2489 kfree_skb(skb);
2490 return rc;
2491 out:
2492 rcu_read_unlock_bh();
2493 return rc;
2494 }
2495 EXPORT_SYMBOL(dev_queue_xmit);
2496
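/*
 * Illustrative sketch (not from dev.c): a caller of dev_queue_xmit() sets
 * skb->dev (and usually skb->priority) and must treat the skb as consumed
 * whatever the return value, as noted above.  build_my_frame() and
 * my_tx_drops are hypothetical.
 *
 *	skb = build_my_frame();
 *	if (!skb)
 *		return -ENOMEM;
 *	skb->dev = dev;
 *	rc = dev_queue_xmit(skb);
 *	if (rc != NET_XMIT_SUCCESS)
 *		my_tx_drops++;		(accounting only - the skb is already gone)
 */
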
2497
2498 /*=======================================================================
2499 Receiver routines
2500 =======================================================================*/
2501
2502 int netdev_max_backlog __read_mostly = 1000;
2503 int netdev_tstamp_prequeue __read_mostly = 1;
2504 int netdev_budget __read_mostly = 300;
2505 int weight_p __read_mostly = 64; /* old backlog weight */
2506
2507 /* Called with irq disabled */
2508 static inline void ____napi_schedule(struct softnet_data *sd,
2509 struct napi_struct *napi)
2510 {
2511 list_add_tail(&napi->poll_list, &sd->poll_list);
2512 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2513 }
2514
2515 /*
2516 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2517 * and src/dst port numbers. Returns a non-zero hash number on success
2518 * and 0 on failure.
2519 */
2520 __u32 __skb_get_rxhash(struct sk_buff *skb)
2521 {
2522 int nhoff, hash = 0, poff;
2523 struct ipv6hdr *ip6;
2524 struct iphdr *ip;
2525 u8 ip_proto;
2526 u32 addr1, addr2, ihl;
2527 union {
2528 u32 v32;
2529 u16 v16[2];
2530 } ports;
2531
2532 nhoff = skb_network_offset(skb);
2533
2534 switch (skb->protocol) {
2535 case __constant_htons(ETH_P_IP):
2536 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2537 goto done;
2538
2539 ip = (struct iphdr *) (skb->data + nhoff);
2540 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2541 ip_proto = 0;
2542 else
2543 ip_proto = ip->protocol;
2544 addr1 = (__force u32) ip->saddr;
2545 addr2 = (__force u32) ip->daddr;
2546 ihl = ip->ihl;
2547 break;
2548 case __constant_htons(ETH_P_IPV6):
2549 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2550 goto done;
2551
2552 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2553 ip_proto = ip6->nexthdr;
2554 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2555 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2556 ihl = (40 >> 2);
2557 break;
2558 default:
2559 goto done;
2560 }
2561
2562 ports.v32 = 0;
2563 poff = proto_ports_offset(ip_proto);
2564 if (poff >= 0) {
2565 nhoff += ihl * 4 + poff;
2566 if (pskb_may_pull(skb, nhoff + 4)) {
2567 ports.v32 = * (__force u32 *) (skb->data + nhoff);
2568 if (ports.v16[1] < ports.v16[0])
2569 swap(ports.v16[0], ports.v16[1]);
2570 }
2571 }
2572
2573 /* get a consistent hash (same value on both flow directions) */
2574 if (addr2 < addr1)
2575 swap(addr1, addr2);
2576
2577 hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2578 if (!hash)
2579 hash = 1;
2580
2581 done:
2582 return hash;
2583 }
2584 EXPORT_SYMBOL(__skb_get_rxhash);
2585
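/*
 * Note on the symmetry above: addresses and ports are each swapped into a
 * canonical order before hashing, so 10.0.0.1:1000 -> 10.0.0.2:2000 and its
 * reply 10.0.0.2:2000 -> 10.0.0.1:1000 produce the same rxhash, which is
 * what the RPS/RFS code below relies on.
 */
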
2586 #ifdef CONFIG_RPS
2587
2588 /* One global table that all flow-based protocols share. */
2589 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2590 EXPORT_SYMBOL(rps_sock_flow_table);
2591
2592 static struct rps_dev_flow *
2593 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2594 struct rps_dev_flow *rflow, u16 next_cpu)
2595 {
2596 u16 tcpu;
2597
2598 tcpu = rflow->cpu = next_cpu;
2599 if (tcpu != RPS_NO_CPU) {
2600 #ifdef CONFIG_RFS_ACCEL
2601 struct netdev_rx_queue *rxqueue;
2602 struct rps_dev_flow_table *flow_table;
2603 struct rps_dev_flow *old_rflow;
2604 u32 flow_id;
2605 u16 rxq_index;
2606 int rc;
2607
2608 /* Should we steer this flow to a different hardware queue? */
2609 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap)
2610 goto out;
2611 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2612 if (rxq_index == skb_get_rx_queue(skb))
2613 goto out;
2614
2615 rxqueue = dev->_rx + rxq_index;
2616 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2617 if (!flow_table)
2618 goto out;
2619 flow_id = skb->rxhash & flow_table->mask;
2620 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2621 rxq_index, flow_id);
2622 if (rc < 0)
2623 goto out;
2624 old_rflow = rflow;
2625 rflow = &flow_table->flows[flow_id];
2626 rflow->cpu = next_cpu;
2627 rflow->filter = rc;
2628 if (old_rflow->filter == rflow->filter)
2629 old_rflow->filter = RPS_NO_FILTER;
2630 out:
2631 #endif
2632 rflow->last_qtail =
2633 per_cpu(softnet_data, tcpu).input_queue_head;
2634 }
2635
2636 return rflow;
2637 }
2638
2639 /*
2640 * get_rps_cpu is called from netif_receive_skb and returns the target
2641 * CPU from the RPS map of the receiving queue for a given skb.
2642 * rcu_read_lock must be held on entry.
2643 */
2644 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2645 struct rps_dev_flow **rflowp)
2646 {
2647 struct netdev_rx_queue *rxqueue;
2648 struct rps_map *map;
2649 struct rps_dev_flow_table *flow_table;
2650 struct rps_sock_flow_table *sock_flow_table;
2651 int cpu = -1;
2652 u16 tcpu;
2653
2654 if (skb_rx_queue_recorded(skb)) {
2655 u16 index = skb_get_rx_queue(skb);
2656 if (unlikely(index >= dev->real_num_rx_queues)) {
2657 WARN_ONCE(dev->real_num_rx_queues > 1,
2658 "%s received packet on queue %u, but number "
2659 "of RX queues is %u\n",
2660 dev->name, index, dev->real_num_rx_queues);
2661 goto done;
2662 }
2663 rxqueue = dev->_rx + index;
2664 } else
2665 rxqueue = dev->_rx;
2666
2667 map = rcu_dereference(rxqueue->rps_map);
2668 if (map) {
2669 if (map->len == 1 &&
2670 !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2671 tcpu = map->cpus[0];
2672 if (cpu_online(tcpu))
2673 cpu = tcpu;
2674 goto done;
2675 }
2676 } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2677 goto done;
2678 }
2679
2680 skb_reset_network_header(skb);
2681 if (!skb_get_rxhash(skb))
2682 goto done;
2683
2684 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2685 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2686 if (flow_table && sock_flow_table) {
2687 u16 next_cpu;
2688 struct rps_dev_flow *rflow;
2689
2690 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2691 tcpu = rflow->cpu;
2692
2693 next_cpu = sock_flow_table->ents[skb->rxhash &
2694 sock_flow_table->mask];
2695
2696 /*
2697 * If the desired CPU (where last recvmsg was done) is
2698 * different from current CPU (one in the rx-queue flow
2699 * table entry), switch if one of the following holds:
2700 * - Current CPU is unset (equal to RPS_NO_CPU).
2701 * - Current CPU is offline.
2702 * - The current CPU's queue tail has advanced beyond the
2703 * last packet that was enqueued using this table entry.
2704 * This guarantees that all previous packets for the flow
2705 * have been dequeued, thus preserving in-order delivery.
2706 */
2707 if (unlikely(tcpu != next_cpu) &&
2708 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2709 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2710 rflow->last_qtail)) >= 0))
2711 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2712
2713 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2714 *rflowp = rflow;
2715 cpu = tcpu;
2716 goto done;
2717 }
2718 }
2719
2720 if (map) {
2721 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2722
2723 if (cpu_online(tcpu)) {
2724 cpu = tcpu;
2725 goto done;
2726 }
2727 }
2728
2729 done:
2730 return cpu;
2731 }
2732
2733 #ifdef CONFIG_RFS_ACCEL
2734
2735 /**
2736 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2737 * @dev: Device on which the filter was set
2738 * @rxq_index: RX queue index
2739 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2740 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2741 *
2742 * Drivers that implement ndo_rx_flow_steer() should periodically call
2743 * this function for each installed filter and remove the filters for
2744 * which it returns %true.
2745 */
2746 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2747 u32 flow_id, u16 filter_id)
2748 {
2749 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2750 struct rps_dev_flow_table *flow_table;
2751 struct rps_dev_flow *rflow;
2752 bool expire = true;
2753 int cpu;
2754
2755 rcu_read_lock();
2756 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2757 if (flow_table && flow_id <= flow_table->mask) {
2758 rflow = &flow_table->flows[flow_id];
2759 cpu = ACCESS_ONCE(rflow->cpu);
2760 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2761 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2762 rflow->last_qtail) <
2763 (int)(10 * flow_table->mask)))
2764 expire = false;
2765 }
2766 rcu_read_unlock();
2767 return expire;
2768 }
2769 EXPORT_SYMBOL(rps_may_expire_flow);
2770
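/*
 * Illustrative sketch (not from dev.c): the periodic scan that the
 * kernel-doc above asks accelerated-RFS drivers to perform.  my_nfilters,
 * my_filter_flow_id() and my_remove_filter() are hypothetical driver
 * state and helpers.
 *
 *	for (i = 0; i < my_nfilters; i++)
 *		if (rps_may_expire_flow(dev, rxq_index,
 *					my_filter_flow_id(i), i))
 *			my_remove_filter(i);
 */
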
2771 #endif /* CONFIG_RFS_ACCEL */
2772
2773 /* Called from hardirq (IPI) context */
2774 static void rps_trigger_softirq(void *data)
2775 {
2776 struct softnet_data *sd = data;
2777
2778 ____napi_schedule(sd, &sd->backlog);
2779 sd->received_rps++;
2780 }
2781
2782 #endif /* CONFIG_RPS */
2783
2784 /*
2785 * Check if this softnet_data structure belongs to another CPU.
2786 * If yes, queue it to our IPI list and return 1,
2787 * otherwise return 0.
2788 */
2789 static int rps_ipi_queued(struct softnet_data *sd)
2790 {
2791 #ifdef CONFIG_RPS
2792 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2793
2794 if (sd != mysd) {
2795 sd->rps_ipi_next = mysd->rps_ipi_list;
2796 mysd->rps_ipi_list = sd;
2797
2798 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2799 return 1;
2800 }
2801 #endif /* CONFIG_RPS */
2802 return 0;
2803 }
2804
2805 /*
2806 * enqueue_to_backlog is called to queue an skb on a per-CPU backlog
2807 * queue (which may be a remote CPU's queue).
2808 */
2809 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2810 unsigned int *qtail)
2811 {
2812 struct softnet_data *sd;
2813 unsigned long flags;
2814
2815 sd = &per_cpu(softnet_data, cpu);
2816
2817 local_irq_save(flags);
2818
2819 rps_lock(sd);
2820 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2821 if (skb_queue_len(&sd->input_pkt_queue)) {
2822 enqueue:
2823 __skb_queue_tail(&sd->input_pkt_queue, skb);
2824 input_queue_tail_incr_save(sd, qtail);
2825 rps_unlock(sd);
2826 local_irq_restore(flags);
2827 return NET_RX_SUCCESS;
2828 }
2829
2830 /* Schedule NAPI for the backlog device.
2831 * We can use a non-atomic operation since we own the queue lock.
2832 */
2833 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2834 if (!rps_ipi_queued(sd))
2835 ____napi_schedule(sd, &sd->backlog);
2836 }
2837 goto enqueue;
2838 }
2839
2840 sd->dropped++;
2841 rps_unlock(sd);
2842
2843 local_irq_restore(flags);
2844
2845 atomic_long_inc(&skb->dev->rx_dropped);
2846 kfree_skb(skb);
2847 return NET_RX_DROP;
2848 }
2849
2850 /**
2851 * netif_rx - post buffer to the network code
2852 * @skb: buffer to post
2853 *
2854 * This function receives a packet from a device driver and queues it for
2855 * the upper (protocol) levels to process. It always succeeds. The buffer
2856 * may be dropped during processing for congestion control or by the
2857 * protocol layers.
2858 *
2859 * return values:
2860 * NET_RX_SUCCESS (no congestion)
2861 * NET_RX_DROP (packet was dropped)
2862 *
2863 */
2864
2865 int netif_rx(struct sk_buff *skb)
2866 {
2867 int ret;
2868
2869 /* if netpoll wants it, pretend we never saw it */
2870 if (netpoll_rx(skb))
2871 return NET_RX_DROP;
2872
2873 if (netdev_tstamp_prequeue)
2874 net_timestamp_check(skb);
2875
2876 trace_netif_rx(skb);
2877 #ifdef CONFIG_RPS
2878 {
2879 struct rps_dev_flow voidflow, *rflow = &voidflow;
2880 int cpu;
2881
2882 preempt_disable();
2883 rcu_read_lock();
2884
2885 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2886 if (cpu < 0)
2887 cpu = smp_processor_id();
2888
2889 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2890
2891 rcu_read_unlock();
2892 preempt_enable();
2893 }
2894 #else
2895 {
2896 unsigned int qtail;
2897 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2898 put_cpu();
2899 }
2900 #endif
2901 return ret;
2902 }
2903 EXPORT_SYMBOL(netif_rx);
2904
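/*
 * Illustrative sketch (not from dev.c): a non-NAPI driver handing a frame
 * to the stack from its interrupt handler.  my_rx_frame() is a hypothetical
 * helper returning a freshly built skb for this device.
 *
 *	skb = my_rx_frame(dev);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);		(NET_RX_DROP only signals congestion)
 */
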
2905 int netif_rx_ni(struct sk_buff *skb)
2906 {
2907 int err;
2908
2909 preempt_disable();
2910 err = netif_rx(skb);
2911 if (local_softirq_pending())
2912 do_softirq();
2913 preempt_enable();
2914
2915 return err;
2916 }
2917 EXPORT_SYMBOL(netif_rx_ni);
2918
2919 static void net_tx_action(struct softirq_action *h)
2920 {
2921 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2922
2923 if (sd->completion_queue) {
2924 struct sk_buff *clist;
2925
2926 local_irq_disable();
2927 clist = sd->completion_queue;
2928 sd->completion_queue = NULL;
2929 local_irq_enable();
2930
2931 while (clist) {
2932 struct sk_buff *skb = clist;
2933 clist = clist->next;
2934
2935 WARN_ON(atomic_read(&skb->users));
2936 trace_kfree_skb(skb, net_tx_action);
2937 __kfree_skb(skb);
2938 }
2939 }
2940
2941 if (sd->output_queue) {
2942 struct Qdisc *head;
2943
2944 local_irq_disable();
2945 head = sd->output_queue;
2946 sd->output_queue = NULL;
2947 sd->output_queue_tailp = &sd->output_queue;
2948 local_irq_enable();
2949
2950 while (head) {
2951 struct Qdisc *q = head;
2952 spinlock_t *root_lock;
2953
2954 head = head->next_sched;
2955
2956 root_lock = qdisc_lock(q);
2957 if (spin_trylock(root_lock)) {
2958 smp_mb__before_clear_bit();
2959 clear_bit(__QDISC_STATE_SCHED,
2960 &q->state);
2961 qdisc_run(q);
2962 spin_unlock(root_lock);
2963 } else {
2964 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2965 &q->state)) {
2966 __netif_reschedule(q);
2967 } else {
2968 smp_mb__before_clear_bit();
2969 clear_bit(__QDISC_STATE_SCHED,
2970 &q->state);
2971 }
2972 }
2973 }
2974 }
2975 }
2976
2977 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2978 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2979 /* This hook is defined here for ATM LANE */
2980 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2981 unsigned char *addr) __read_mostly;
2982 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2983 #endif
2984
2985 #ifdef CONFIG_NET_CLS_ACT
2986 /* TODO: Maybe we should just force sch_ingress to be compiled in
2987 * when CONFIG_NET_CLS_ACT is? Otherwise we execute some useless
2988 * instructions (a compare and two extra stores) when sch_ingress
2989 * is not built but CONFIG_NET_CLS_ACT is.
2990 * NOTE: This doesn't remove any functionality; if you don't have
2991 * the ingress scheduler, you just can't add policies on ingress.
2992 *
2993 */
2994 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2995 {
2996 struct net_device *dev = skb->dev;
2997 u32 ttl = G_TC_RTTL(skb->tc_verd);
2998 int result = TC_ACT_OK;
2999 struct Qdisc *q;
3000
3001 if (unlikely(MAX_RED_LOOP < ttl++)) {
3002 if (net_ratelimit())
3003 pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
3004 skb->skb_iif, dev->ifindex);
3005 return TC_ACT_SHOT;
3006 }
3007
3008 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3009 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3010
3011 q = rxq->qdisc;
3012 if (q != &noop_qdisc) {
3013 spin_lock(qdisc_lock(q));
3014 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3015 result = qdisc_enqueue_root(skb, q);
3016 spin_unlock(qdisc_lock(q));
3017 }
3018
3019 return result;
3020 }
3021
3022 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3023 struct packet_type **pt_prev,
3024 int *ret, struct net_device *orig_dev)
3025 {
3026 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3027
3028 if (!rxq || rxq->qdisc == &noop_qdisc)
3029 goto out;
3030
3031 if (*pt_prev) {
3032 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3033 *pt_prev = NULL;
3034 }
3035
3036 switch (ing_filter(skb, rxq)) {
3037 case TC_ACT_SHOT:
3038 case TC_ACT_STOLEN:
3039 kfree_skb(skb);
3040 return NULL;
3041 }
3042
3043 out:
3044 skb->tc_verd = 0;
3045 return skb;
3046 }
3047 #endif
3048
3049 /**
3050 * netdev_rx_handler_register - register receive handler
3051 * @dev: device to register a handler for
3052 * @rx_handler: receive handler to register
3053 * @rx_handler_data: data pointer that is used by rx handler
3054 *
3055 * Register a receive handler for a device. This handler will then be
3056 * called from __netif_receive_skb. A negative errno code is returned
3057 * on a failure.
3058 *
3059 * The caller must hold the rtnl_mutex.
3060 */
3061 int netdev_rx_handler_register(struct net_device *dev,
3062 rx_handler_func_t *rx_handler,
3063 void *rx_handler_data)
3064 {
3065 ASSERT_RTNL();
3066
3067 if (dev->rx_handler)
3068 return -EBUSY;
3069
3070 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3071 rcu_assign_pointer(dev->rx_handler, rx_handler);
3072
3073 return 0;
3074 }
3075 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3076
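/*
 * Illustrative sketch (not from dev.c): registering a receive handler the
 * way bridge/macvlan do.  In this version of the code the handler returns
 * the skb to let normal processing continue, or NULL once it has consumed
 * it (see the rx_handler call in __netif_receive_skb() below).  my_port and
 * my_steal_packet() are hypothetical.
 *
 *	static struct sk_buff *my_rx_handler(struct sk_buff *skb)
 *	{
 *		struct my_port *port = rcu_dereference(skb->dev->rx_handler_data);
 *
 *		if (my_steal_packet(port, skb))
 *			return NULL;	(consumed)
 *		return skb;		(let the stack continue)
 *	}
 *
 *	rtnl_lock();
 *	err = netdev_rx_handler_register(dev, my_rx_handler, port);
 *	rtnl_unlock();
 */
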
3077 /**
3078 * netdev_rx_handler_unregister - unregister receive handler
3079 * @dev: device to unregister a handler from
3080 *
3081 * Unregister a receive handler from a device.
3082 *
3083 * The caller must hold the rtnl_mutex.
3084 */
3085 void netdev_rx_handler_unregister(struct net_device *dev)
3086 {
3087
3088 ASSERT_RTNL();
3089 rcu_assign_pointer(dev->rx_handler, NULL);
3090 rcu_assign_pointer(dev->rx_handler_data, NULL);
3091 }
3092 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3093
3094 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
3095 struct net_device *master)
3096 {
3097 if (skb->pkt_type == PACKET_HOST) {
3098 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
3099
3100 memcpy(dest, master->dev_addr, ETH_ALEN);
3101 }
3102 }
3103
3104 /* On bonding slaves other than the currently active slave, suppress
3105 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
3106 * ARP on active-backup slaves with arp_validate enabled.
3107 */
3108 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
3109 {
3110 struct net_device *dev = skb->dev;
3111
3112 if (master->priv_flags & IFF_MASTER_ARPMON)
3113 dev->last_rx = jiffies;
3114
3115 if ((master->priv_flags & IFF_MASTER_ALB) &&
3116 (master->priv_flags & IFF_BRIDGE_PORT)) {
3117 /* Unmangle the address. The local destination address
3118 * will always be the one the master has. This provides the
3119 * right functionality in a bridge.
3120 */
3121 skb_bond_set_mac_by_master(skb, master);
3122 }
3123
3124 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
3125 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
3126 skb->protocol == __cpu_to_be16(ETH_P_ARP))
3127 return 0;
3128
3129 if (master->priv_flags & IFF_MASTER_ALB) {
3130 if (skb->pkt_type != PACKET_BROADCAST &&
3131 skb->pkt_type != PACKET_MULTICAST)
3132 return 0;
3133 }
3134 if (master->priv_flags & IFF_MASTER_8023AD &&
3135 skb->protocol == __cpu_to_be16(ETH_P_SLOW))
3136 return 0;
3137
3138 return 1;
3139 }
3140 return 0;
3141 }
3142 EXPORT_SYMBOL(__skb_bond_should_drop);
3143
3144 static int __netif_receive_skb(struct sk_buff *skb)
3145 {
3146 struct packet_type *ptype, *pt_prev;
3147 rx_handler_func_t *rx_handler;
3148 struct net_device *orig_dev;
3149 struct net_device *master;
3150 struct net_device *null_or_orig;
3151 struct net_device *orig_or_bond;
3152 int ret = NET_RX_DROP;
3153 __be16 type;
3154
3155 if (!netdev_tstamp_prequeue)
3156 net_timestamp_check(skb);
3157
3158 trace_netif_receive_skb(skb);
3159
3160 /* if we've gotten here through NAPI, check netpoll */
3161 if (netpoll_receive_skb(skb))
3162 return NET_RX_DROP;
3163
3164 if (!skb->skb_iif)
3165 skb->skb_iif = skb->dev->ifindex;
3166
3167 /*
3168 * bonding note: skbs received on inactive slaves should only
3169 * be delivered to pkt handlers that are exact matches. Also
3170 * the deliver_no_wcard flag will be set. If packet handlers
3171 * are sensitive to duplicate packets these skbs will need to
3172 * be dropped at the handler.
3173 */
3174 null_or_orig = NULL;
3175 orig_dev = skb->dev;
3176 master = ACCESS_ONCE(orig_dev->master);
3177 if (skb->deliver_no_wcard)
3178 null_or_orig = orig_dev;
3179 else if (master) {
3180 if (skb_bond_should_drop(skb, master)) {
3181 skb->deliver_no_wcard = 1;
3182 null_or_orig = orig_dev; /* deliver only exact match */
3183 } else
3184 skb->dev = master;
3185 }
3186
3187 __this_cpu_inc(softnet_data.processed);
3188 skb_reset_network_header(skb);
3189 skb_reset_transport_header(skb);
3190 skb->mac_len = skb->network_header - skb->mac_header;
3191
3192 pt_prev = NULL;
3193
3194 rcu_read_lock();
3195
3196 #ifdef CONFIG_NET_CLS_ACT
3197 if (skb->tc_verd & TC_NCLS) {
3198 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3199 goto ncls;
3200 }
3201 #endif
3202
3203 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3204 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
3205 ptype->dev == orig_dev) {
3206 if (pt_prev)
3207 ret = deliver_skb(skb, pt_prev, orig_dev);
3208 pt_prev = ptype;
3209 }
3210 }
3211
3212 #ifdef CONFIG_NET_CLS_ACT
3213 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3214 if (!skb)
3215 goto out;
3216 ncls:
3217 #endif
3218
3219 /* Handle special case of bridge or macvlan */
3220 rx_handler = rcu_dereference(skb->dev->rx_handler);
3221 if (rx_handler) {
3222 if (pt_prev) {
3223 ret = deliver_skb(skb, pt_prev, orig_dev);
3224 pt_prev = NULL;
3225 }
3226 skb = rx_handler(skb);
3227 if (!skb)
3228 goto out;
3229 }
3230
3231 if (vlan_tx_tag_present(skb)) {
3232 if (pt_prev) {
3233 ret = deliver_skb(skb, pt_prev, orig_dev);
3234 pt_prev = NULL;
3235 }
3236 if (vlan_hwaccel_do_receive(&skb)) {
3237 ret = __netif_receive_skb(skb);
3238 goto out;
3239 } else if (unlikely(!skb))
3240 goto out;
3241 }
3242
3243 /*
3244 * Make sure frames received on VLAN interfaces stacked on
3245 * bonding interfaces still make their way to any base bonding
3246 * device that may have registered for a specific ptype. The
3247 * handler may have to adjust skb->dev and orig_dev.
3248 */
3249 orig_or_bond = orig_dev;
3250 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
3251 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
3252 orig_or_bond = vlan_dev_real_dev(skb->dev);
3253 }
3254
3255 type = skb->protocol;
3256 list_for_each_entry_rcu(ptype,
3257 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3258 if (ptype->type == type && (ptype->dev == null_or_orig ||
3259 ptype->dev == skb->dev || ptype->dev == orig_dev ||
3260 ptype->dev == orig_or_bond)) {
3261 if (pt_prev)
3262 ret = deliver_skb(skb, pt_prev, orig_dev);
3263 pt_prev = ptype;
3264 }
3265 }
3266
3267 if (pt_prev) {
3268 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3269 } else {
3270 atomic_long_inc(&skb->dev->rx_dropped);
3271 kfree_skb(skb);
3272 /* Jamal, now you will not be able to escape explaining
3273 * to me how you were going to use this. :-)
3274 */
3275 ret = NET_RX_DROP;
3276 }
3277
3278 out:
3279 rcu_read_unlock();
3280 return ret;
3281 }
3282
3283 /**
3284 * netif_receive_skb - process receive buffer from network
3285 * @skb: buffer to process
3286 *
3287 * netif_receive_skb() is the main receive data processing function.
3288 * It always succeeds. The buffer may be dropped during processing
3289 * for congestion control or by the protocol layers.
3290 *
3291 * This function may only be called from softirq context and interrupts
3292 * should be enabled.
3293 *
3294 * Return values (usually ignored):
3295 * NET_RX_SUCCESS: no congestion
3296 * NET_RX_DROP: packet was dropped
3297 */
3298 int netif_receive_skb(struct sk_buff *skb)
3299 {
3300 if (netdev_tstamp_prequeue)
3301 net_timestamp_check(skb);
3302
3303 if (skb_defer_rx_timestamp(skb))
3304 return NET_RX_SUCCESS;
3305
3306 #ifdef CONFIG_RPS
3307 {
3308 struct rps_dev_flow voidflow, *rflow = &voidflow;
3309 int cpu, ret;
3310
3311 rcu_read_lock();
3312
3313 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3314
3315 if (cpu >= 0) {
3316 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3317 rcu_read_unlock();
3318 } else {
3319 rcu_read_unlock();
3320 ret = __netif_receive_skb(skb);
3321 }
3322
3323 return ret;
3324 }
3325 #else
3326 return __netif_receive_skb(skb);
3327 #endif
3328 }
3329 EXPORT_SYMBOL(netif_receive_skb);
3330
3331 /* The network device is going away; flush any packets still pending.
3332 * Called with irqs disabled.
3333 */
3334 static void flush_backlog(void *arg)
3335 {
3336 struct net_device *dev = arg;
3337 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3338 struct sk_buff *skb, *tmp;
3339
3340 rps_lock(sd);
3341 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3342 if (skb->dev == dev) {
3343 __skb_unlink(skb, &sd->input_pkt_queue);
3344 kfree_skb(skb);
3345 input_queue_head_incr(sd);
3346 }
3347 }
3348 rps_unlock(sd);
3349
3350 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3351 if (skb->dev == dev) {
3352 __skb_unlink(skb, &sd->process_queue);
3353 kfree_skb(skb);
3354 input_queue_head_incr(sd);
3355 }
3356 }
3357 }
3358
3359 static int napi_gro_complete(struct sk_buff *skb)
3360 {
3361 struct packet_type *ptype;
3362 __be16 type = skb->protocol;
3363 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3364 int err = -ENOENT;
3365
3366 if (NAPI_GRO_CB(skb)->count == 1) {
3367 skb_shinfo(skb)->gso_size = 0;
3368 goto out;
3369 }
3370
3371 rcu_read_lock();
3372 list_for_each_entry_rcu(ptype, head, list) {
3373 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3374 continue;
3375
3376 err = ptype->gro_complete(skb);
3377 break;
3378 }
3379 rcu_read_unlock();
3380
3381 if (err) {
3382 WARN_ON(&ptype->list == head);
3383 kfree_skb(skb);
3384 return NET_RX_SUCCESS;
3385 }
3386
3387 out:
3388 return netif_receive_skb(skb);
3389 }
3390
3391 inline void napi_gro_flush(struct napi_struct *napi)
3392 {
3393 struct sk_buff *skb, *next;
3394
3395 for (skb = napi->gro_list; skb; skb = next) {
3396 next = skb->next;
3397 skb->next = NULL;
3398 napi_gro_complete(skb);
3399 }
3400
3401 napi->gro_count = 0;
3402 napi->gro_list = NULL;
3403 }
3404 EXPORT_SYMBOL(napi_gro_flush);
3405
3406 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3407 {
3408 struct sk_buff **pp = NULL;
3409 struct packet_type *ptype;
3410 __be16 type = skb->protocol;
3411 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3412 int same_flow;
3413 int mac_len;
3414 enum gro_result ret;
3415
3416 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3417 goto normal;
3418
3419 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3420 goto normal;
3421
3422 rcu_read_lock();
3423 list_for_each_entry_rcu(ptype, head, list) {
3424 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3425 continue;
3426
3427 skb_set_network_header(skb, skb_gro_offset(skb));
3428 mac_len = skb->network_header - skb->mac_header;
3429 skb->mac_len = mac_len;
3430 NAPI_GRO_CB(skb)->same_flow = 0;
3431 NAPI_GRO_CB(skb)->flush = 0;
3432 NAPI_GRO_CB(skb)->free = 0;
3433
3434 pp = ptype->gro_receive(&napi->gro_list, skb);
3435 break;
3436 }
3437 rcu_read_unlock();
3438
3439 if (&ptype->list == head)
3440 goto normal;
3441
3442 same_flow = NAPI_GRO_CB(skb)->same_flow;
3443 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3444
3445 if (pp) {
3446 struct sk_buff *nskb = *pp;
3447
3448 *pp = nskb->next;
3449 nskb->next = NULL;
3450 napi_gro_complete(nskb);
3451 napi->gro_count--;
3452 }
3453
3454 if (same_flow)
3455 goto ok;
3456
3457 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3458 goto normal;
3459
3460 napi->gro_count++;
3461 NAPI_GRO_CB(skb)->count = 1;
3462 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3463 skb->next = napi->gro_list;
3464 napi->gro_list = skb;
3465 ret = GRO_HELD;
3466
3467 pull:
3468 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3469 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3470
3471 BUG_ON(skb->end - skb->tail < grow);
3472
3473 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3474
3475 skb->tail += grow;
3476 skb->data_len -= grow;
3477
3478 skb_shinfo(skb)->frags[0].page_offset += grow;
3479 skb_shinfo(skb)->frags[0].size -= grow;
3480
3481 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3482 put_page(skb_shinfo(skb)->frags[0].page);
3483 memmove(skb_shinfo(skb)->frags,
3484 skb_shinfo(skb)->frags + 1,
3485 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3486 }
3487 }
3488
3489 ok:
3490 return ret;
3491
3492 normal:
3493 ret = GRO_NORMAL;
3494 goto pull;
3495 }
3496 EXPORT_SYMBOL(dev_gro_receive);
3497
3498 static inline gro_result_t
3499 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3500 {
3501 struct sk_buff *p;
3502
3503 for (p = napi->gro_list; p; p = p->next) {
3504 unsigned long diffs;
3505
3506 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3507 diffs |= p->vlan_tci ^ skb->vlan_tci;
3508 diffs |= compare_ether_header(skb_mac_header(p),
3509 skb_gro_mac_header(skb));
3510 NAPI_GRO_CB(p)->same_flow = !diffs;
3511 NAPI_GRO_CB(p)->flush = 0;
3512 }
3513
3514 return dev_gro_receive(napi, skb);
3515 }
3516
3517 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3518 {
3519 switch (ret) {
3520 case GRO_NORMAL:
3521 if (netif_receive_skb(skb))
3522 ret = GRO_DROP;
3523 break;
3524
3525 case GRO_DROP:
3526 case GRO_MERGED_FREE:
3527 kfree_skb(skb);
3528 break;
3529
3530 case GRO_HELD:
3531 case GRO_MERGED:
3532 break;
3533 }
3534
3535 return ret;
3536 }
3537 EXPORT_SYMBOL(napi_skb_finish);
3538
3539 void skb_gro_reset_offset(struct sk_buff *skb)
3540 {
3541 NAPI_GRO_CB(skb)->data_offset = 0;
3542 NAPI_GRO_CB(skb)->frag0 = NULL;
3543 NAPI_GRO_CB(skb)->frag0_len = 0;
3544
3545 if (skb->mac_header == skb->tail &&
3546 !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3547 NAPI_GRO_CB(skb)->frag0 =
3548 page_address(skb_shinfo(skb)->frags[0].page) +
3549 skb_shinfo(skb)->frags[0].page_offset;
3550 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3551 }
3552 }
3553 EXPORT_SYMBOL(skb_gro_reset_offset);
3554
3555 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3556 {
3557 skb_gro_reset_offset(skb);
3558
3559 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3560 }
3561 EXPORT_SYMBOL(napi_gro_receive);
3562
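/*
 * Illustrative sketch (not from dev.c): a NAPI driver's poll routine
 * feeding received frames through GRO and completing when it runs out of
 * work, matching the weight accounting in net_rx_action() below.
 * my_rx_pending() and my_rx_skb() are hypothetical driver helpers.
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		int work = 0;
 *
 *		while (work < budget && my_rx_pending(napi->dev)) {
 *			struct sk_buff *skb = my_rx_skb(napi->dev);
 *
 *			skb->protocol = eth_type_trans(skb, napi->dev);
 *			napi_gro_receive(napi, skb);
 *			work++;
 *		}
 *		if (work < budget)
 *			napi_complete(napi);
 *		return work;
 *	}
 */
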
3563 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3564 {
3565 __skb_pull(skb, skb_headlen(skb));
3566 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3567 skb->vlan_tci = 0;
3568 skb->dev = napi->dev;
3569 skb->skb_iif = 0;
3570
3571 napi->skb = skb;
3572 }
3573
3574 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3575 {
3576 struct sk_buff *skb = napi->skb;
3577
3578 if (!skb) {
3579 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3580 if (skb)
3581 napi->skb = skb;
3582 }
3583 return skb;
3584 }
3585 EXPORT_SYMBOL(napi_get_frags);
3586
3587 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3588 gro_result_t ret)
3589 {
3590 switch (ret) {
3591 case GRO_NORMAL:
3592 case GRO_HELD:
3593 skb->protocol = eth_type_trans(skb, skb->dev);
3594
3595 if (ret == GRO_HELD)
3596 skb_gro_pull(skb, -ETH_HLEN);
3597 else if (netif_receive_skb(skb))
3598 ret = GRO_DROP;
3599 break;
3600
3601 case GRO_DROP:
3602 case GRO_MERGED_FREE:
3603 napi_reuse_skb(napi, skb);
3604 break;
3605
3606 case GRO_MERGED:
3607 break;
3608 }
3609
3610 return ret;
3611 }
3612 EXPORT_SYMBOL(napi_frags_finish);
3613
3614 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3615 {
3616 struct sk_buff *skb = napi->skb;
3617 struct ethhdr *eth;
3618 unsigned int hlen;
3619 unsigned int off;
3620
3621 napi->skb = NULL;
3622
3623 skb_reset_mac_header(skb);
3624 skb_gro_reset_offset(skb);
3625
3626 off = skb_gro_offset(skb);
3627 hlen = off + sizeof(*eth);
3628 eth = skb_gro_header_fast(skb, off);
3629 if (skb_gro_header_hard(skb, hlen)) {
3630 eth = skb_gro_header_slow(skb, hlen, off);
3631 if (unlikely(!eth)) {
3632 napi_reuse_skb(napi, skb);
3633 skb = NULL;
3634 goto out;
3635 }
3636 }
3637
3638 skb_gro_pull(skb, sizeof(*eth));
3639
3640 /*
3641 * This works because the only protocols we care about don't require
3642 * special handling. We'll fix it up properly at the end.
3643 */
3644 skb->protocol = eth->h_proto;
3645
3646 out:
3647 return skb;
3648 }
3649 EXPORT_SYMBOL(napi_frags_skb);
3650
3651 gro_result_t napi_gro_frags(struct napi_struct *napi)
3652 {
3653 struct sk_buff *skb = napi_frags_skb(napi);
3654
3655 if (!skb)
3656 return GRO_DROP;
3657
3658 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3659 }
3660 EXPORT_SYMBOL(napi_gro_frags);
3661
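/*
 * Illustrative sketch (not from dev.c): the page-fragment GRO entry points
 * above are meant to be used together - the driver borrows the per-NAPI skb,
 * attaches its receive page and lets GRO pull the headers from frag0.
 * my_rx_page() and my_rx_len() are hypothetical helpers, and the length
 * and truesize accounting is only an assumption about a typical driver.
 *
 *	skb = napi_get_frags(napi);
 *	if (!skb)
 *		return;			(allocation failure, drop)
 *	skb_fill_page_desc(skb, 0, my_rx_page(), 0, my_rx_len());
 *	skb->len += my_rx_len();
 *	skb->data_len += my_rx_len();
 *	skb->truesize += PAGE_SIZE;
 *	napi_gro_frags(napi);
 */
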
3662 /*
3663 * net_rps_action sends any pending IPI's for rps.
3664 * Note: called with local irq disabled, but exits with local irq enabled.
3665 */
3666 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3667 {
3668 #ifdef CONFIG_RPS
3669 struct softnet_data *remsd = sd->rps_ipi_list;
3670
3671 if (remsd) {
3672 sd->rps_ipi_list = NULL;
3673
3674 local_irq_enable();
3675
3676 /* Send pending IPI's to kick RPS processing on remote cpus. */
3677 while (remsd) {
3678 struct softnet_data *next = remsd->rps_ipi_next;
3679
3680 if (cpu_online(remsd->cpu))
3681 __smp_call_function_single(remsd->cpu,
3682 &remsd->csd, 0);
3683 remsd = next;
3684 }
3685 } else
3686 #endif
3687 local_irq_enable();
3688 }
3689
3690 static int process_backlog(struct napi_struct *napi, int quota)
3691 {
3692 int work = 0;
3693 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3694
3695 #ifdef CONFIG_RPS
3696 /* Check if we have pending IPIs; it's better to send them now
3697 * than to wait for net_rx_action() to end.
3698 */
3699 if (sd->rps_ipi_list) {
3700 local_irq_disable();
3701 net_rps_action_and_irq_enable(sd);
3702 }
3703 #endif
3704 napi->weight = weight_p;
3705 local_irq_disable();
3706 while (work < quota) {
3707 struct sk_buff *skb;
3708 unsigned int qlen;
3709
3710 while ((skb = __skb_dequeue(&sd->process_queue))) {
3711 local_irq_enable();
3712 __netif_receive_skb(skb);
3713 local_irq_disable();
3714 input_queue_head_incr(sd);
3715 if (++work >= quota) {
3716 local_irq_enable();
3717 return work;
3718 }
3719 }
3720
3721 rps_lock(sd);
3722 qlen = skb_queue_len(&sd->input_pkt_queue);
3723 if (qlen)
3724 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3725 &sd->process_queue);
3726
3727 if (qlen < quota - work) {
3728 /*
3729 * Inline a custom version of __napi_complete().
3730 * Only the current CPU owns and manipulates this napi,
3731 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3732 * We can use a plain write instead of clear_bit(),
3733 * and we don't need an smp_mb() memory barrier.
3734 */
3735 list_del(&napi->poll_list);
3736 napi->state = 0;
3737
3738 quota = work + qlen;
3739 }
3740 rps_unlock(sd);
3741 }
3742 local_irq_enable();
3743
3744 return work;
3745 }
3746
3747 /**
3748 * __napi_schedule - schedule for receive
3749 * @n: entry to schedule
3750 *
3751 * The entry's receive function will be scheduled to run
3752 */
3753 void __napi_schedule(struct napi_struct *n)
3754 {
3755 unsigned long flags;
3756
3757 local_irq_save(flags);
3758 ____napi_schedule(&__get_cpu_var(softnet_data), n);
3759 local_irq_restore(flags);
3760 }
3761 EXPORT_SYMBOL(__napi_schedule);
3762
3763 void __napi_complete(struct napi_struct *n)
3764 {
3765 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3766 BUG_ON(n->gro_list);
3767
3768 list_del(&n->poll_list);
3769 smp_mb__before_clear_bit();
3770 clear_bit(NAPI_STATE_SCHED, &n->state);
3771 }
3772 EXPORT_SYMBOL(__napi_complete);
3773
3774 void napi_complete(struct napi_struct *n)
3775 {
3776 unsigned long flags;
3777
3778 /*
3779 * Don't let napi dequeue from the CPU poll list,
3780 * just in case it's running on a different CPU.
3781 */
3782 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3783 return;
3784
3785 napi_gro_flush(n);
3786 local_irq_save(flags);
3787 __napi_complete(n);
3788 local_irq_restore(flags);
3789 }
3790 EXPORT_SYMBOL(napi_complete);
3791
3792 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3793 int (*poll)(struct napi_struct *, int), int weight)
3794 {
3795 INIT_LIST_HEAD(&napi->poll_list);
3796 napi->gro_count = 0;
3797 napi->gro_list = NULL;
3798 napi->skb = NULL;
3799 napi->poll = poll;
3800 napi->weight = weight;
3801 list_add(&napi->dev_list, &dev->napi_list);
3802 napi->dev = dev;
3803 #ifdef CONFIG_NETPOLL
3804 spin_lock_init(&napi->poll_lock);
3805 napi->poll_owner = -1;
3806 #endif
3807 set_bit(NAPI_STATE_SCHED, &napi->state);
3808 }
3809 EXPORT_SYMBOL(netif_napi_add);
3810
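/*
 * Illustrative sketch (not from dev.c): a driver wires up its NAPI instance
 * at probe time and schedules it from its interrupt handler.  Note that
 * netif_napi_add() leaves NAPI_STATE_SCHED set, so napi_enable() is still
 * needed (typically in ndo_open) before polling can start.  my_priv and
 * my_irq_disable() are hypothetical.
 *
 *	netif_napi_add(dev, &priv->napi, my_poll, 64);
 *	...
 *	static irqreturn_t my_interrupt(int irq, void *data)
 *	{
 *		struct my_priv *priv = data;
 *
 *		my_irq_disable(priv);
 *		napi_schedule(&priv->napi);
 *		return IRQ_HANDLED;
 *	}
 */
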
3811 void netif_napi_del(struct napi_struct *napi)
3812 {
3813 struct sk_buff *skb, *next;
3814
3815 list_del_init(&napi->dev_list);
3816 napi_free_frags(napi);
3817
3818 for (skb = napi->gro_list; skb; skb = next) {
3819 next = skb->next;
3820 skb->next = NULL;
3821 kfree_skb(skb);
3822 }
3823
3824 napi->gro_list = NULL;
3825 napi->gro_count = 0;
3826 }
3827 EXPORT_SYMBOL(netif_napi_del);
3828
3829 static void net_rx_action(struct softirq_action *h)
3830 {
3831 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3832 unsigned long time_limit = jiffies + 2;
3833 int budget = netdev_budget;
3834 void *have;
3835
3836 local_irq_disable();
3837
3838 while (!list_empty(&sd->poll_list)) {
3839 struct napi_struct *n;
3840 int work, weight;
3841
3842 /* If the softirq window is exhausted then punt.
3843 * Allow this to run for 2 jiffies, which allows
3844 * an average latency of 1.5/HZ.
3845 */
3846 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3847 goto softnet_break;
3848
3849 local_irq_enable();
3850
3851 /* Even though interrupts have been re-enabled, this
3852 * access is safe because interrupts can only add new
3853 * entries to the tail of this list, and only ->poll()
3854 * calls can remove this head entry from the list.
3855 */
3856 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3857
3858 have = netpoll_poll_lock(n);
3859
3860 weight = n->weight;
3861
3862 /* This NAPI_STATE_SCHED test is for avoiding a race
3863 * with netpoll's poll_napi(). Only the entity which
3864 * obtains the lock and sees NAPI_STATE_SCHED set will
3865 * actually make the ->poll() call. Therefore we avoid
3866 * accidentally calling ->poll() when NAPI is not scheduled.
3867 */
3868 work = 0;
3869 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3870 work = n->poll(n, weight);
3871 trace_napi_poll(n);
3872 }
3873
3874 WARN_ON_ONCE(work > weight);
3875
3876 budget -= work;
3877
3878 local_irq_disable();
3879
3880 /* Drivers must not modify the NAPI state if they
3881 * consume the entire weight. In such cases this code
3882 * still "owns" the NAPI instance and therefore can
3883 * move the instance around on the list at-will.
3884 */
3885 if (unlikely(work == weight)) {
3886 if (unlikely(napi_disable_pending(n))) {
3887 local_irq_enable();
3888 napi_complete(n);
3889 local_irq_disable();
3890 } else
3891 list_move_tail(&n->poll_list, &sd->poll_list);
3892 }
3893
3894 netpoll_poll_unlock(have);
3895 }
3896 out:
3897 net_rps_action_and_irq_enable(sd);
3898
3899 #ifdef CONFIG_NET_DMA
3900 /*
3901 * There may not be any more sk_buffs coming right now, so push
3902 * any pending DMA copies to hardware
3903 */
3904 dma_issue_pending_all();
3905 #endif
3906
3907 return;
3908
3909 softnet_break:
3910 sd->time_squeeze++;
3911 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3912 goto out;
3913 }
3914
3915 static gifconf_func_t *gifconf_list[NPROTO];
3916
3917 /**
3918 * register_gifconf - register a SIOCGIF handler
3919 * @family: Address family
3920 * @gifconf: Function handler
3921 *
3922 * Register protocol dependent address dumping routines. The handler
3923 * that is passed must not be freed or reused until it has been replaced
3924 * by another handler.
3925 */
3926 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3927 {
3928 if (family >= NPROTO)
3929 return -EINVAL;
3930 gifconf_list[family] = gifconf;
3931 return 0;
3932 }
3933 EXPORT_SYMBOL(register_gifconf);
3934
3935
3936 /*
3937 * Map an interface index to its name (SIOCGIFNAME)
3938 */
3939
3940 /*
3941 * We need this ioctl for efficient implementation of the
3942 * if_indextoname() function required by the IPv6 API. Without
3943 * it, we would have to search all the interfaces to find a
3944 * match. --pb
3945 */
3946
3947 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3948 {
3949 struct net_device *dev;
3950 struct ifreq ifr;
3951
3952 /*
3953 * Fetch the caller's info block.
3954 */
3955
3956 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3957 return -EFAULT;
3958
3959 rcu_read_lock();
3960 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3961 if (!dev) {
3962 rcu_read_unlock();
3963 return -ENODEV;
3964 }
3965
3966 strcpy(ifr.ifr_name, dev->name);
3967 rcu_read_unlock();
3968
3969 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3970 return -EFAULT;
3971 return 0;
3972 }
3973
3974 /*
3975 * Perform a SIOCGIFCONF call. This structure will change
3976 * size eventually, and there is nothing I can do about it.
3977 * Thus we will need a 'compatibility mode'.
3978 */
3979
3980 static int dev_ifconf(struct net *net, char __user *arg)
3981 {
3982 struct ifconf ifc;
3983 struct net_device *dev;
3984 char __user *pos;
3985 int len;
3986 int total;
3987 int i;
3988
3989 /*
3990 * Fetch the caller's info block.
3991 */
3992
3993 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3994 return -EFAULT;
3995
3996 pos = ifc.ifc_buf;
3997 len = ifc.ifc_len;
3998
3999 /*
4000 * Loop over the interfaces, and write an info block for each.
4001 */
4002
4003 total = 0;
4004 for_each_netdev(net, dev) {
4005 for (i = 0; i < NPROTO; i++) {
4006 if (gifconf_list[i]) {
4007 int done;
4008 if (!pos)
4009 done = gifconf_list[i](dev, NULL, 0);
4010 else
4011 done = gifconf_list[i](dev, pos + total,
4012 len - total);
4013 if (done < 0)
4014 return -EFAULT;
4015 total += done;
4016 }
4017 }
4018 }
4019
4020 /*
4021 * All done. Write the updated control block back to the caller.
4022 */
4023 ifc.ifc_len = total;
4024
4025 /*
4026 * Both BSD and Solaris return 0 here, so we do too.
4027 */
4028 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4029 }
4030
4031 #ifdef CONFIG_PROC_FS
4032 /*
4033 * This is invoked by the /proc filesystem handler to display a device
4034 * in detail.
4035 */
4036 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4037 __acquires(RCU)
4038 {
4039 struct net *net = seq_file_net(seq);
4040 loff_t off;
4041 struct net_device *dev;
4042
4043 rcu_read_lock();
4044 if (!*pos)
4045 return SEQ_START_TOKEN;
4046
4047 off = 1;
4048 for_each_netdev_rcu(net, dev)
4049 if (off++ == *pos)
4050 return dev;
4051
4052 return NULL;
4053 }
4054
4055 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4056 {
4057 struct net_device *dev = v;
4058
4059 if (v == SEQ_START_TOKEN)
4060 dev = first_net_device_rcu(seq_file_net(seq));
4061 else
4062 dev = next_net_device_rcu(dev);
4063
4064 ++*pos;
4065 return dev;
4066 }
4067
4068 void dev_seq_stop(struct seq_file *seq, void *v)
4069 __releases(RCU)
4070 {
4071 rcu_read_unlock();
4072 }
4073
4074 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4075 {
4076 struct rtnl_link_stats64 temp;
4077 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4078
4079 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4080 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4081 dev->name, stats->rx_bytes, stats->rx_packets,
4082 stats->rx_errors,
4083 stats->rx_dropped + stats->rx_missed_errors,
4084 stats->rx_fifo_errors,
4085 stats->rx_length_errors + stats->rx_over_errors +
4086 stats->rx_crc_errors + stats->rx_frame_errors,
4087 stats->rx_compressed, stats->multicast,
4088 stats->tx_bytes, stats->tx_packets,
4089 stats->tx_errors, stats->tx_dropped,
4090 stats->tx_fifo_errors, stats->collisions,
4091 stats->tx_carrier_errors +
4092 stats->tx_aborted_errors +
4093 stats->tx_window_errors +
4094 stats->tx_heartbeat_errors,
4095 stats->tx_compressed);
4096 }
4097
4098 /*
4099 * Called from the PROCfs module. This now uses the new arbitrary sized
4100 * /proc/net interface to create /proc/net/dev
4101 */
4102 static int dev_seq_show(struct seq_file *seq, void *v)
4103 {
4104 if (v == SEQ_START_TOKEN)
4105 seq_puts(seq, "Inter-| Receive "
4106 " | Transmit\n"
4107 " face |bytes packets errs drop fifo frame "
4108 "compressed multicast|bytes packets errs "
4109 "drop fifo colls carrier compressed\n");
4110 else
4111 dev_seq_printf_stats(seq, v);
4112 return 0;
4113 }
4114
4115 static struct softnet_data *softnet_get_online(loff_t *pos)
4116 {
4117 struct softnet_data *sd = NULL;
4118
4119 while (*pos < nr_cpu_ids)
4120 if (cpu_online(*pos)) {
4121 sd = &per_cpu(softnet_data, *pos);
4122 break;
4123 } else
4124 ++*pos;
4125 return sd;
4126 }
4127
4128 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4129 {
4130 return softnet_get_online(pos);
4131 }
4132
4133 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4134 {
4135 ++*pos;
4136 return softnet_get_online(pos);
4137 }
4138
4139 static void softnet_seq_stop(struct seq_file *seq, void *v)
4140 {
4141 }
4142
4143 static int softnet_seq_show(struct seq_file *seq, void *v)
4144 {
4145 struct softnet_data *sd = v;
4146
4147 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4148 sd->processed, sd->dropped, sd->time_squeeze, 0,
4149 0, 0, 0, 0, /* was fastroute */
4150 sd->cpu_collision, sd->received_rps);
4151 return 0;
4152 }
4153
4154 static const struct seq_operations dev_seq_ops = {
4155 .start = dev_seq_start,
4156 .next = dev_seq_next,
4157 .stop = dev_seq_stop,
4158 .show = dev_seq_show,
4159 };
4160
4161 static int dev_seq_open(struct inode *inode, struct file *file)
4162 {
4163 return seq_open_net(inode, file, &dev_seq_ops,
4164 sizeof(struct seq_net_private));
4165 }
4166
4167 static const struct file_operations dev_seq_fops = {
4168 .owner = THIS_MODULE,
4169 .open = dev_seq_open,
4170 .read = seq_read,
4171 .llseek = seq_lseek,
4172 .release = seq_release_net,
4173 };
4174
4175 static const struct seq_operations softnet_seq_ops = {
4176 .start = softnet_seq_start,
4177 .next = softnet_seq_next,
4178 .stop = softnet_seq_stop,
4179 .show = softnet_seq_show,
4180 };
4181
4182 static int softnet_seq_open(struct inode *inode, struct file *file)
4183 {
4184 return seq_open(file, &softnet_seq_ops);
4185 }
4186
4187 static const struct file_operations softnet_seq_fops = {
4188 .owner = THIS_MODULE,
4189 .open = softnet_seq_open,
4190 .read = seq_read,
4191 .llseek = seq_lseek,
4192 .release = seq_release,
4193 };
4194
4195 static void *ptype_get_idx(loff_t pos)
4196 {
4197 struct packet_type *pt = NULL;
4198 loff_t i = 0;
4199 int t;
4200
4201 list_for_each_entry_rcu(pt, &ptype_all, list) {
4202 if (i == pos)
4203 return pt;
4204 ++i;
4205 }
4206
4207 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4208 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4209 if (i == pos)
4210 return pt;
4211 ++i;
4212 }
4213 }
4214 return NULL;
4215 }
4216
4217 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4218 __acquires(RCU)
4219 {
4220 rcu_read_lock();
4221 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4222 }
4223
4224 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4225 {
4226 struct packet_type *pt;
4227 struct list_head *nxt;
4228 int hash;
4229
4230 ++*pos;
4231 if (v == SEQ_START_TOKEN)
4232 return ptype_get_idx(0);
4233
4234 pt = v;
4235 nxt = pt->list.next;
4236 if (pt->type == htons(ETH_P_ALL)) {
4237 if (nxt != &ptype_all)
4238 goto found;
4239 hash = 0;
4240 nxt = ptype_base[0].next;
4241 } else
4242 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4243
4244 while (nxt == &ptype_base[hash]) {
4245 if (++hash >= PTYPE_HASH_SIZE)
4246 return NULL;
4247 nxt = ptype_base[hash].next;
4248 }
4249 found:
4250 return list_entry(nxt, struct packet_type, list);
4251 }
4252
4253 static void ptype_seq_stop(struct seq_file *seq, void *v)
4254 __releases(RCU)
4255 {
4256 rcu_read_unlock();
4257 }
4258
4259 static int ptype_seq_show(struct seq_file *seq, void *v)
4260 {
4261 struct packet_type *pt = v;
4262
4263 if (v == SEQ_START_TOKEN)
4264 seq_puts(seq, "Type Device Function\n");
4265 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4266 if (pt->type == htons(ETH_P_ALL))
4267 seq_puts(seq, "ALL ");
4268 else
4269 seq_printf(seq, "%04x", ntohs(pt->type));
4270
4271 seq_printf(seq, " %-8s %pF\n",
4272 pt->dev ? pt->dev->name : "", pt->func);
4273 }
4274
4275 return 0;
4276 }
4277
4278 static const struct seq_operations ptype_seq_ops = {
4279 .start = ptype_seq_start,
4280 .next = ptype_seq_next,
4281 .stop = ptype_seq_stop,
4282 .show = ptype_seq_show,
4283 };
4284
4285 static int ptype_seq_open(struct inode *inode, struct file *file)
4286 {
4287 return seq_open_net(inode, file, &ptype_seq_ops,
4288 sizeof(struct seq_net_private));
4289 }
4290
4291 static const struct file_operations ptype_seq_fops = {
4292 .owner = THIS_MODULE,
4293 .open = ptype_seq_open,
4294 .read = seq_read,
4295 .llseek = seq_lseek,
4296 .release = seq_release_net,
4297 };
4298
4299
4300 static int __net_init dev_proc_net_init(struct net *net)
4301 {
4302 int rc = -ENOMEM;
4303
4304 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4305 goto out;
4306 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4307 goto out_dev;
4308 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4309 goto out_softnet;
4310
4311 if (wext_proc_init(net))
4312 goto out_ptype;
4313 rc = 0;
4314 out:
4315 return rc;
4316 out_ptype:
4317 proc_net_remove(net, "ptype");
4318 out_softnet:
4319 proc_net_remove(net, "softnet_stat");
4320 out_dev:
4321 proc_net_remove(net, "dev");
4322 goto out;
4323 }
4324
4325 static void __net_exit dev_proc_net_exit(struct net *net)
4326 {
4327 wext_proc_exit(net);
4328
4329 proc_net_remove(net, "ptype");
4330 proc_net_remove(net, "softnet_stat");
4331 proc_net_remove(net, "dev");
4332 }
4333
4334 static struct pernet_operations __net_initdata dev_proc_ops = {
4335 .init = dev_proc_net_init,
4336 .exit = dev_proc_net_exit,
4337 };
4338
4339 static int __init dev_proc_init(void)
4340 {
4341 return register_pernet_subsys(&dev_proc_ops);
4342 }
4343 #else
4344 #define dev_proc_init() 0
4345 #endif /* CONFIG_PROC_FS */
4346
4347
4348 /**
4349 * netdev_set_master - set up master/slave pair
4350 * @slave: slave device
4351 * @master: new master device
4352 *
4353 * Changes the master device of the slave. Pass %NULL to break the
4354 * bonding. The caller must hold the RTNL semaphore. On a failure
4355 * a negative errno code is returned. On success the reference counts
4356 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4357 * function returns zero.
4358 */
4359 int netdev_set_master(struct net_device *slave, struct net_device *master)
4360 {
4361 struct net_device *old = slave->master;
4362
4363 ASSERT_RTNL();
4364
4365 if (master) {
4366 if (old)
4367 return -EBUSY;
4368 dev_hold(master);
4369 }
4370
4371 slave->master = master;
4372
4373 if (old) {
4374 synchronize_net();
4375 dev_put(old);
4376 }
4377 if (master)
4378 slave->flags |= IFF_SLAVE;
4379 else
4380 slave->flags &= ~IFF_SLAVE;
4381
4382 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4383 return 0;
4384 }
4385 EXPORT_SYMBOL(netdev_set_master);
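
/*
 * Editor's sketch (not part of the original source): a bonding-style
 * driver pairing and later releasing a lower device. RTNL is already
 * held because we are inside an rtnl-protected callback; bond_dev and
 * slave_dev are hypothetical locals.
 *
 *	ASSERT_RTNL();
 *	err = netdev_set_master(slave_dev, bond_dev);
 *	if (err)
 *		return err;	// -EBUSY: the slave already has a master
 *	...
 *	netdev_set_master(slave_dev, NULL);	// break the pairing again
 */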
4386
4387 static void dev_change_rx_flags(struct net_device *dev, int flags)
4388 {
4389 const struct net_device_ops *ops = dev->netdev_ops;
4390
4391 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4392 ops->ndo_change_rx_flags(dev, flags);
4393 }
4394
4395 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4396 {
4397 unsigned short old_flags = dev->flags;
4398 uid_t uid;
4399 gid_t gid;
4400
4401 ASSERT_RTNL();
4402
4403 dev->flags |= IFF_PROMISC;
4404 dev->promiscuity += inc;
4405 if (dev->promiscuity == 0) {
4406 /*
4407 * Avoid overflow.
4408 * If inc causes an overflow, leave promisc untouched and return an error.
4409 */
4410 if (inc < 0)
4411 dev->flags &= ~IFF_PROMISC;
4412 else {
4413 dev->promiscuity -= inc;
4414 printk(KERN_WARNING "%s: promiscuity counter overflowed; "
4415 "setting promiscuity failed, the promiscuity "
4416 "feature of the device might be broken.\n", dev->name);
4417 return -EOVERFLOW;
4418 }
4419 }
4420 if (dev->flags != old_flags) {
4421 printk(KERN_INFO "device %s %s promiscuous mode\n",
4422 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4423 "left");
4424 if (audit_enabled) {
4425 current_uid_gid(&uid, &gid);
4426 audit_log(current->audit_context, GFP_ATOMIC,
4427 AUDIT_ANOM_PROMISCUOUS,
4428 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4429 dev->name, (dev->flags & IFF_PROMISC),
4430 (old_flags & IFF_PROMISC),
4431 audit_get_loginuid(current),
4432 uid, gid,
4433 audit_get_sessionid(current));
4434 }
4435
4436 dev_change_rx_flags(dev, IFF_PROMISC);
4437 }
4438 return 0;
4439 }
4440
4441 /**
4442 * dev_set_promiscuity - update promiscuity count on a device
4443 * @dev: device
4444 * @inc: modifier
4445 *
4446 * Add or remove promiscuity from a device. While the count in the device
4447 * remains above zero the interface remains promiscuous. Once it hits zero
4448 * the device reverts back to normal filtering operation. A negative inc
4449 * value is used to drop promiscuity on the device.
4450 * Return 0 if successful or a negative errno code on error.
4451 */
4452 int dev_set_promiscuity(struct net_device *dev, int inc)
4453 {
4454 unsigned short old_flags = dev->flags;
4455 int err;
4456
4457 err = __dev_set_promiscuity(dev, inc);
4458 if (err < 0)
4459 return err;
4460 if (dev->flags != old_flags)
4461 dev_set_rx_mode(dev);
4462 return err;
4463 }
4464 EXPORT_SYMBOL(dev_set_promiscuity);
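
/*
 * Editor's sketch (not part of the original source): a capture-style
 * module keeping a device promiscuous for the lifetime of a session.
 * The count nests, so every +1 must eventually be matched by a -1,
 * and RTNL must be held around each call.
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);
 *	rtnl_unlock();
 *	if (err)
 *		return err;
 *	...
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);
 *	rtnl_unlock();
 */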
4465
4466 /**
4467 * dev_set_allmulti - update allmulti count on a device
4468 * @dev: device
4469 * @inc: modifier
4470 *
4471 * Add or remove reception of all multicast frames to a device. While the
4472 * count in the device remains above zero the interface remains listening
4473 * to all multicast frames. Once it hits zero the device reverts back to normal
4474 * filtering operation. A negative @inc value is used to drop the counter
4475 * when releasing a resource needing all multicasts.
4476 * Return 0 if successful or a negative errno code on error.
4477 */
4478
4479 int dev_set_allmulti(struct net_device *dev, int inc)
4480 {
4481 unsigned short old_flags = dev->flags;
4482
4483 ASSERT_RTNL();
4484
4485 dev->flags |= IFF_ALLMULTI;
4486 dev->allmulti += inc;
4487 if (dev->allmulti == 0) {
4488 /*
4489 * Avoid overflow.
4490 * If inc causes an overflow, leave allmulti untouched and return an error.
4491 */
4492 if (inc < 0)
4493 dev->flags &= ~IFF_ALLMULTI;
4494 else {
4495 dev->allmulti -= inc;
4496 printk(KERN_WARNING "%s: allmulti counter overflowed; "
4497 "setting allmulti failed, the allmulti feature of "
4498 "the device might be broken.\n", dev->name);
4499 return -EOVERFLOW;
4500 }
4501 }
4502 if (dev->flags ^ old_flags) {
4503 dev_change_rx_flags(dev, IFF_ALLMULTI);
4504 dev_set_rx_mode(dev);
4505 }
4506 return 0;
4507 }
4508 EXPORT_SYMBOL(dev_set_allmulti);
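
/*
 * Editor's sketch (not part of the original source): usage mirrors
 * dev_set_promiscuity() above. A stacked driver that must see every
 * multicast frame on its lower device takes a reference while it is
 * active and drops it on teardown, with RTNL held; lower_dev is a
 * hypothetical local.
 *
 *	ASSERT_RTNL();
 *	err = dev_set_allmulti(lower_dev, 1);	// start listening
 *	...
 *	dev_set_allmulti(lower_dev, -1);	// done, drop the count
 */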
4509
4510 /*
4511 * Upload unicast and multicast address lists to device and
4512 * configure RX filtering. When the device doesn't support unicast
4513 * filtering it is put in promiscuous mode while unicast addresses
4514 * are present.
4515 */
4516 void __dev_set_rx_mode(struct net_device *dev)
4517 {
4518 const struct net_device_ops *ops = dev->netdev_ops;
4519
4520 /* dev_open will call this function so the list will stay sane. */
4521 if (!(dev->flags&IFF_UP))
4522 return;
4523
4524 if (!netif_device_present(dev))
4525 return;
4526
4527 if (ops->ndo_set_rx_mode)
4528 ops->ndo_set_rx_mode(dev);
4529 else {
4530 /* Unicast address changes may only happen under the rtnl,
4531 * therefore calling __dev_set_promiscuity here is safe.
4532 */
4533 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4534 __dev_set_promiscuity(dev, 1);
4535 dev->uc_promisc = 1;
4536 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4537 __dev_set_promiscuity(dev, -1);
4538 dev->uc_promisc = 0;
4539 }
4540
4541 if (ops->ndo_set_multicast_list)
4542 ops->ndo_set_multicast_list(dev);
4543 }
4544 }
4545
4546 void dev_set_rx_mode(struct net_device *dev)
4547 {
4548 netif_addr_lock_bh(dev);
4549 __dev_set_rx_mode(dev);
4550 netif_addr_unlock_bh(dev);
4551 }
4552
4553 /**
4554 * dev_get_flags - get flags reported to userspace
4555 * @dev: device
4556 *
4557 * Get the combination of flag bits exported through APIs to userspace.
4558 */
4559 unsigned dev_get_flags(const struct net_device *dev)
4560 {
4561 unsigned flags;
4562
4563 flags = (dev->flags & ~(IFF_PROMISC |
4564 IFF_ALLMULTI |
4565 IFF_RUNNING |
4566 IFF_LOWER_UP |
4567 IFF_DORMANT)) |
4568 (dev->gflags & (IFF_PROMISC |
4569 IFF_ALLMULTI));
4570
4571 if (netif_running(dev)) {
4572 if (netif_oper_up(dev))
4573 flags |= IFF_RUNNING;
4574 if (netif_carrier_ok(dev))
4575 flags |= IFF_LOWER_UP;
4576 if (netif_dormant(dev))
4577 flags |= IFF_DORMANT;
4578 }
4579
4580 return flags;
4581 }
4582 EXPORT_SYMBOL(dev_get_flags);
4583
4584 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4585 {
4586 int old_flags = dev->flags;
4587 int ret;
4588
4589 ASSERT_RTNL();
4590
4591 /*
4592 * Set the flags on our device.
4593 */
4594
4595 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4596 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4597 IFF_AUTOMEDIA)) |
4598 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4599 IFF_ALLMULTI));
4600
4601 /*
4602 * Load in the correct multicast list now the flags have changed.
4603 */
4604
4605 if ((old_flags ^ flags) & IFF_MULTICAST)
4606 dev_change_rx_flags(dev, IFF_MULTICAST);
4607
4608 dev_set_rx_mode(dev);
4609
4610 /*
4611 * Have we downed the interface? We handle IFF_UP ourselves
4612 * according to user attempts to set it, rather than blindly
4613 * setting it.
4614 */
4615
4616 ret = 0;
4617 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4618 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4619
4620 if (!ret)
4621 dev_set_rx_mode(dev);
4622 }
4623
4624 if ((flags ^ dev->gflags) & IFF_PROMISC) {
4625 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4626
4627 dev->gflags ^= IFF_PROMISC;
4628 dev_set_promiscuity(dev, inc);
4629 }
4630
4631 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4632 is important. Some (broken) drivers set IFF_PROMISC when
4633 IFF_ALLMULTI is requested, without asking us and without reporting it.
4634 */
4635 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4636 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4637
4638 dev->gflags ^= IFF_ALLMULTI;
4639 dev_set_allmulti(dev, inc);
4640 }
4641
4642 return ret;
4643 }
4644
4645 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4646 {
4647 unsigned int changes = dev->flags ^ old_flags;
4648
4649 if (changes & IFF_UP) {
4650 if (dev->flags & IFF_UP)
4651 call_netdevice_notifiers(NETDEV_UP, dev);
4652 else
4653 call_netdevice_notifiers(NETDEV_DOWN, dev);
4654 }
4655
4656 if (dev->flags & IFF_UP &&
4657 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4658 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4659 }
4660
4661 /**
4662 * dev_change_flags - change device settings
4663 * @dev: device
4664 * @flags: device state flags
4665 *
4666 * Change settings on a device based on state flags. The flags are
4667 * in the userspace exported format.
4668 */
4669 int dev_change_flags(struct net_device *dev, unsigned flags)
4670 {
4671 int ret, changes;
4672 int old_flags = dev->flags;
4673
4674 ret = __dev_change_flags(dev, flags);
4675 if (ret < 0)
4676 return ret;
4677
4678 changes = old_flags ^ dev->flags;
4679 if (changes)
4680 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4681
4682 __dev_notify_flags(dev, old_flags);
4683 return ret;
4684 }
4685 EXPORT_SYMBOL(dev_change_flags);
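
/*
 * Editor's sketch (not part of the original source): bringing an
 * interface up from kernel code, much as the SIOCSIFFLAGS path below
 * does on behalf of user space. @dev is assumed to be a registered
 * device and RTNL is taken here.
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	rtnl_unlock();
 *	if (err < 0)
 *		pr_err("%s: could not bring the device up: %d\n",
 *		       dev->name, err);
 */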
4686
4687 /**
4688 * dev_set_mtu - Change maximum transfer unit
4689 * @dev: device
4690 * @new_mtu: new transfer unit
4691 *
4692 * Change the maximum transfer size of the network device.
4693 */
4694 int dev_set_mtu(struct net_device *dev, int new_mtu)
4695 {
4696 const struct net_device_ops *ops = dev->netdev_ops;
4697 int err;
4698
4699 if (new_mtu == dev->mtu)
4700 return 0;
4701
4702 /* MTU must not be negative. */
4703 if (new_mtu < 0)
4704 return -EINVAL;
4705
4706 if (!netif_device_present(dev))
4707 return -ENODEV;
4708
4709 err = 0;
4710 if (ops->ndo_change_mtu)
4711 err = ops->ndo_change_mtu(dev, new_mtu);
4712 else
4713 dev->mtu = new_mtu;
4714
4715 if (!err && dev->flags & IFF_UP)
4716 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4717 return err;
4718 }
4719 EXPORT_SYMBOL(dev_set_mtu);
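
/*
 * Editor's sketch (not part of the original source): changing a
 * device's MTU from kernel code; this is what the SIOCSIFMTU handler
 * below ends up calling. The value 1400 is arbitrary.
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 1400);
 *	rtnl_unlock();
 *	if (err)
 *		pr_err("%s: failed to change MTU: %d\n", dev->name, err);
 */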
4720
4721 /**
4722 * dev_set_group - Change group this device belongs to
4723 * @dev: device
4724 * @new_group: group this device should belong to
4725 */
4726 void dev_set_group(struct net_device *dev, int new_group)
4727 {
4728 dev->group = new_group;
4729 }
4730 EXPORT_SYMBOL(dev_set_group);
4731
4732 /**
4733 * dev_set_mac_address - Change Media Access Control Address
4734 * @dev: device
4735 * @sa: new address
4736 *
4737 * Change the hardware (MAC) address of the device
4738 */
4739 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4740 {
4741 const struct net_device_ops *ops = dev->netdev_ops;
4742 int err;
4743
4744 if (!ops->ndo_set_mac_address)
4745 return -EOPNOTSUPP;
4746 if (sa->sa_family != dev->type)
4747 return -EINVAL;
4748 if (!netif_device_present(dev))
4749 return -ENODEV;
4750 err = ops->ndo_set_mac_address(dev, sa);
4751 if (!err)
4752 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4753 return err;
4754 }
4755 EXPORT_SYMBOL(dev_set_mac_address);
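
/*
 * Editor's sketch (not part of the original source): programming a new
 * hardware address from kernel code. The address travels in a struct
 * sockaddr whose family must match dev->type (ARPHRD_ETHER for an
 * Ethernet device); new_mac is a hypothetical ETH_ALEN byte buffer.
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, ETH_ALEN);
 *	rtnl_lock();
 *	err = dev_set_mac_address(dev, &sa);
 *	rtnl_unlock();
 */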
4756
4757 /*
4758 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4759 */
4760 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4761 {
4762 int err;
4763 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4764
4765 if (!dev)
4766 return -ENODEV;
4767
4768 switch (cmd) {
4769 case SIOCGIFFLAGS: /* Get interface flags */
4770 ifr->ifr_flags = (short) dev_get_flags(dev);
4771 return 0;
4772
4773 case SIOCGIFMETRIC: /* Get the metric on the interface
4774 (currently unused) */
4775 ifr->ifr_metric = 0;
4776 return 0;
4777
4778 case SIOCGIFMTU: /* Get the MTU of a device */
4779 ifr->ifr_mtu = dev->mtu;
4780 return 0;
4781
4782 case SIOCGIFHWADDR:
4783 if (!dev->addr_len)
4784 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4785 else
4786 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4787 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4788 ifr->ifr_hwaddr.sa_family = dev->type;
4789 return 0;
4790
4791 case SIOCGIFSLAVE:
4792 err = -EINVAL;
4793 break;
4794
4795 case SIOCGIFMAP:
4796 ifr->ifr_map.mem_start = dev->mem_start;
4797 ifr->ifr_map.mem_end = dev->mem_end;
4798 ifr->ifr_map.base_addr = dev->base_addr;
4799 ifr->ifr_map.irq = dev->irq;
4800 ifr->ifr_map.dma = dev->dma;
4801 ifr->ifr_map.port = dev->if_port;
4802 return 0;
4803
4804 case SIOCGIFINDEX:
4805 ifr->ifr_ifindex = dev->ifindex;
4806 return 0;
4807
4808 case SIOCGIFTXQLEN:
4809 ifr->ifr_qlen = dev->tx_queue_len;
4810 return 0;
4811
4812 default:
4813 /* dev_ioctl() should ensure this case
4814 * is never reached
4815 */
4816 WARN_ON(1);
4817 err = -EINVAL;
4818 break;
4819
4820 }
4821 return err;
4822 }
4823
4824 /*
4825 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4826 */
4827 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4828 {
4829 int err;
4830 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4831 const struct net_device_ops *ops;
4832
4833 if (!dev)
4834 return -ENODEV;
4835
4836 ops = dev->netdev_ops;
4837
4838 switch (cmd) {
4839 case SIOCSIFFLAGS: /* Set interface flags */
4840 return dev_change_flags(dev, ifr->ifr_flags);
4841
4842 case SIOCSIFMETRIC: /* Set the metric on the interface
4843 (currently unused) */
4844 return -EOPNOTSUPP;
4845
4846 case SIOCSIFMTU: /* Set the MTU of a device */
4847 return dev_set_mtu(dev, ifr->ifr_mtu);
4848
4849 case SIOCSIFHWADDR:
4850 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4851
4852 case SIOCSIFHWBROADCAST:
4853 if (ifr->ifr_hwaddr.sa_family != dev->type)
4854 return -EINVAL;
4855 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4856 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4857 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4858 return 0;
4859
4860 case SIOCSIFMAP:
4861 if (ops->ndo_set_config) {
4862 if (!netif_device_present(dev))
4863 return -ENODEV;
4864 return ops->ndo_set_config(dev, &ifr->ifr_map);
4865 }
4866 return -EOPNOTSUPP;
4867
4868 case SIOCADDMULTI:
4869 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4870 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4871 return -EINVAL;
4872 if (!netif_device_present(dev))
4873 return -ENODEV;
4874 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4875
4876 case SIOCDELMULTI:
4877 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4878 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4879 return -EINVAL;
4880 if (!netif_device_present(dev))
4881 return -ENODEV;
4882 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4883
4884 case SIOCSIFTXQLEN:
4885 if (ifr->ifr_qlen < 0)
4886 return -EINVAL;
4887 dev->tx_queue_len = ifr->ifr_qlen;
4888 return 0;
4889
4890 case SIOCSIFNAME:
4891 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4892 return dev_change_name(dev, ifr->ifr_newname);
4893
4894 /*
4895 * Unknown or private ioctl
4896 */
4897 default:
4898 if ((cmd >= SIOCDEVPRIVATE &&
4899 cmd <= SIOCDEVPRIVATE + 15) ||
4900 cmd == SIOCBONDENSLAVE ||
4901 cmd == SIOCBONDRELEASE ||
4902 cmd == SIOCBONDSETHWADDR ||
4903 cmd == SIOCBONDSLAVEINFOQUERY ||
4904 cmd == SIOCBONDINFOQUERY ||
4905 cmd == SIOCBONDCHANGEACTIVE ||
4906 cmd == SIOCGMIIPHY ||
4907 cmd == SIOCGMIIREG ||
4908 cmd == SIOCSMIIREG ||
4909 cmd == SIOCBRADDIF ||
4910 cmd == SIOCBRDELIF ||
4911 cmd == SIOCSHWTSTAMP ||
4912 cmd == SIOCWANDEV) {
4913 err = -EOPNOTSUPP;
4914 if (ops->ndo_do_ioctl) {
4915 if (netif_device_present(dev))
4916 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4917 else
4918 err = -ENODEV;
4919 }
4920 } else
4921 err = -EINVAL;
4922
4923 }
4924 return err;
4925 }
4926
4927 /*
4928 * This function handles all "interface"-type I/O control requests. The actual
4929 * 'doing' part of this is dev_ifsioc above.
4930 */
4931
4932 /**
4933 * dev_ioctl - network device ioctl
4934 * @net: the applicable net namespace
4935 * @cmd: command to issue
4936 * @arg: pointer to a struct ifreq in user space
4937 *
4938 * Issue ioctl functions to devices. This is normally called by the
4939 * user space syscall interfaces but can sometimes be useful for
4940 * other purposes. The return value is the return from the syscall if
4941 * positive or a negative errno code on error.
4942 */
4943
4944 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4945 {
4946 struct ifreq ifr;
4947 int ret;
4948 char *colon;
4949
4950 /* One special case: SIOCGIFCONF takes ifconf argument
4951 and requires shared lock, because it sleeps writing
4952 to user space.
4953 */
4954
4955 if (cmd == SIOCGIFCONF) {
4956 rtnl_lock();
4957 ret = dev_ifconf(net, (char __user *) arg);
4958 rtnl_unlock();
4959 return ret;
4960 }
4961 if (cmd == SIOCGIFNAME)
4962 return dev_ifname(net, (struct ifreq __user *)arg);
4963
4964 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4965 return -EFAULT;
4966
4967 ifr.ifr_name[IFNAMSIZ-1] = 0;
4968
4969 colon = strchr(ifr.ifr_name, ':');
4970 if (colon)
4971 *colon = 0;
4972
4973 /*
4974 * See which interface the caller is talking about.
4975 */
4976
4977 switch (cmd) {
4978 /*
4979 * These ioctl calls:
4980 * - can be done by all.
4981 * - atomic and do not require locking.
4982 * - return a value
4983 */
4984 case SIOCGIFFLAGS:
4985 case SIOCGIFMETRIC:
4986 case SIOCGIFMTU:
4987 case SIOCGIFHWADDR:
4988 case SIOCGIFSLAVE:
4989 case SIOCGIFMAP:
4990 case SIOCGIFINDEX:
4991 case SIOCGIFTXQLEN:
4992 dev_load(net, ifr.ifr_name);
4993 rcu_read_lock();
4994 ret = dev_ifsioc_locked(net, &ifr, cmd);
4995 rcu_read_unlock();
4996 if (!ret) {
4997 if (colon)
4998 *colon = ':';
4999 if (copy_to_user(arg, &ifr,
5000 sizeof(struct ifreq)))
5001 ret = -EFAULT;
5002 }
5003 return ret;
5004
5005 case SIOCETHTOOL:
5006 dev_load(net, ifr.ifr_name);
5007 rtnl_lock();
5008 ret = dev_ethtool(net, &ifr);
5009 rtnl_unlock();
5010 if (!ret) {
5011 if (colon)
5012 *colon = ':';
5013 if (copy_to_user(arg, &ifr,
5014 sizeof(struct ifreq)))
5015 ret = -EFAULT;
5016 }
5017 return ret;
5018
5019 /*
5020 * These ioctl calls:
5021 * - require superuser power.
5022 * - require strict serialization.
5023 * - return a value
5024 */
5025 case SIOCGMIIPHY:
5026 case SIOCGMIIREG:
5027 case SIOCSIFNAME:
5028 if (!capable(CAP_NET_ADMIN))
5029 return -EPERM;
5030 dev_load(net, ifr.ifr_name);
5031 rtnl_lock();
5032 ret = dev_ifsioc(net, &ifr, cmd);
5033 rtnl_unlock();
5034 if (!ret) {
5035 if (colon)
5036 *colon = ':';
5037 if (copy_to_user(arg, &ifr,
5038 sizeof(struct ifreq)))
5039 ret = -EFAULT;
5040 }
5041 return ret;
5042
5043 /*
5044 * These ioctl calls:
5045 * - require superuser power.
5046 * - require strict serialization.
5047 * - do not return a value
5048 */
5049 case SIOCSIFFLAGS:
5050 case SIOCSIFMETRIC:
5051 case SIOCSIFMTU:
5052 case SIOCSIFMAP:
5053 case SIOCSIFHWADDR:
5054 case SIOCSIFSLAVE:
5055 case SIOCADDMULTI:
5056 case SIOCDELMULTI:
5057 case SIOCSIFHWBROADCAST:
5058 case SIOCSIFTXQLEN:
5059 case SIOCSMIIREG:
5060 case SIOCBONDENSLAVE:
5061 case SIOCBONDRELEASE:
5062 case SIOCBONDSETHWADDR:
5063 case SIOCBONDCHANGEACTIVE:
5064 case SIOCBRADDIF:
5065 case SIOCBRDELIF:
5066 case SIOCSHWTSTAMP:
5067 if (!capable(CAP_NET_ADMIN))
5068 return -EPERM;
5069 /* fall through */
5070 case SIOCBONDSLAVEINFOQUERY:
5071 case SIOCBONDINFOQUERY:
5072 dev_load(net, ifr.ifr_name);
5073 rtnl_lock();
5074 ret = dev_ifsioc(net, &ifr, cmd);
5075 rtnl_unlock();
5076 return ret;
5077
5078 case SIOCGIFMEM:
5079 /* Get the per device memory space. We can add this but
5080 * currently do not support it */
5081 case SIOCSIFMEM:
5082 /* Set the per device memory buffer space.
5083 * Not applicable in our case */
5084 case SIOCSIFLINK:
5085 return -EINVAL;
5086
5087 /*
5088 * Unknown or private ioctl.
5089 */
5090 default:
5091 if (cmd == SIOCWANDEV ||
5092 (cmd >= SIOCDEVPRIVATE &&
5093 cmd <= SIOCDEVPRIVATE + 15)) {
5094 dev_load(net, ifr.ifr_name);
5095 rtnl_lock();
5096 ret = dev_ifsioc(net, &ifr, cmd);
5097 rtnl_unlock();
5098 if (!ret && copy_to_user(arg, &ifr,
5099 sizeof(struct ifreq)))
5100 ret = -EFAULT;
5101 return ret;
5102 }
5103 /* Take care of Wireless Extensions */
5104 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5105 return wext_handle_ioctl(net, &ifr, cmd, arg);
5106 return -EINVAL;
5107 }
5108 }
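
/*
 * Editor's sketch (not part of the original source): what this entry
 * point looks like from user space. Querying the MTU of eth0 with
 * SIOCGIFMTU lands in dev_ifsioc_locked() above:
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
 *	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *		printf("mtu %d\n", ifr.ifr_mtu);
 */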
5109
5110
5111 /**
5112 * dev_new_index - allocate an ifindex
5113 * @net: the applicable net namespace
5114 *
5115 * Returns a suitable unique value for a new device interface
5116 * number. The caller must hold the rtnl semaphore or the
5117 * dev_base_lock to be sure it remains unique.
5118 */
5119 static int dev_new_index(struct net *net)
5120 {
5121 static int ifindex;
5122 for (;;) {
5123 if (++ifindex <= 0)
5124 ifindex = 1;
5125 if (!__dev_get_by_index(net, ifindex))
5126 return ifindex;
5127 }
5128 }
5129
5130 /* Delayed registration/unregistration */
5131 static LIST_HEAD(net_todo_list);
5132
5133 static void net_set_todo(struct net_device *dev)
5134 {
5135 list_add_tail(&dev->todo_list, &net_todo_list);
5136 }
5137
5138 static void rollback_registered_many(struct list_head *head)
5139 {
5140 struct net_device *dev, *tmp;
5141
5142 BUG_ON(dev_boot_phase);
5143 ASSERT_RTNL();
5144
5145 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5146 /* Some devices call us without ever having been
5147 * registered, in order to unwind a failed initialization.
5148 * Remove those devices and proceed with the remaining.
5149 */
5150 if (dev->reg_state == NETREG_UNINITIALIZED) {
5151 pr_debug("unregister_netdevice: device %s/%p never "
5152 "was registered\n", dev->name, dev);
5153
5154 WARN_ON(1);
5155 list_del(&dev->unreg_list);
5156 continue;
5157 }
5158
5159 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5160 }
5161
5162 /* If device is running, close it first. */
5163 dev_close_many(head);
5164
5165 list_for_each_entry(dev, head, unreg_list) {
5166 /* And unlink it from device chain. */
5167 unlist_netdevice(dev);
5168
5169 dev->reg_state = NETREG_UNREGISTERING;
5170 }
5171
5172 synchronize_net();
5173
5174 list_for_each_entry(dev, head, unreg_list) {
5175 /* Shutdown queueing discipline. */
5176 dev_shutdown(dev);
5177
5178
5179 /* Notify protocols that we are about to destroy
5180 this device. They should clean up all of their state.
5181 */
5182 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5183
5184 if (!dev->rtnl_link_ops ||
5185 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5186 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5187
5188 /*
5189 * Flush the unicast and multicast chains
5190 */
5191 dev_uc_flush(dev);
5192 dev_mc_flush(dev);
5193
5194 if (dev->netdev_ops->ndo_uninit)
5195 dev->netdev_ops->ndo_uninit(dev);
5196
5197 /* Notifier chain MUST detach us from master device. */
5198 WARN_ON(dev->master);
5199
5200 /* Remove entries from kobject tree */
5201 netdev_unregister_kobject(dev);
5202 }
5203
5204 /* Process any work delayed until the end of the batch */
5205 dev = list_first_entry(head, struct net_device, unreg_list);
5206 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5207
5208 rcu_barrier();
5209
5210 list_for_each_entry(dev, head, unreg_list)
5211 dev_put(dev);
5212 }
5213
5214 static void rollback_registered(struct net_device *dev)
5215 {
5216 LIST_HEAD(single);
5217
5218 list_add(&dev->unreg_list, &single);
5219 rollback_registered_many(&single);
5220 }
5221
5222 u32 netdev_fix_features(struct net_device *dev, u32 features)
5223 {
5224 /* Fix illegal checksum combinations */
5225 if ((features & NETIF_F_HW_CSUM) &&
5226 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5227 netdev_info(dev, "mixed HW and IP checksum settings.\n");
5228 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5229 }
5230
5231 if ((features & NETIF_F_NO_CSUM) &&
5232 (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5233 netdev_info(dev, "mixed no checksumming and other settings.\n");
5234 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5235 }
5236
5237 /* Fix illegal SG+CSUM combinations. */
5238 if ((features & NETIF_F_SG) &&
5239 !(features & NETIF_F_ALL_CSUM)) {
5240 netdev_info(dev,
5241 "Dropping NETIF_F_SG since no checksum feature.\n");
5242 features &= ~NETIF_F_SG;
5243 }
5244
5245 /* TSO requires that SG is present as well. */
5246 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5247 netdev_info(dev, "Dropping NETIF_F_TSO since no SG feature.\n");
5248 features &= ~NETIF_F_TSO;
5249 }
5250
5251 /* UFO needs SG and checksumming */
5252 if (features & NETIF_F_UFO) {
5253 /* maybe split UFO into V4 and V6? */
5254 if (!((features & NETIF_F_GEN_CSUM) ||
5255 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5256 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5257 netdev_info(dev,
5258 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5259 features &= ~NETIF_F_UFO;
5260 }
5261
5262 if (!(features & NETIF_F_SG)) {
5263 netdev_info(dev,
5264 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5265 features &= ~NETIF_F_UFO;
5266 }
5267 }
5268
5269 return features;
5270 }
5271 EXPORT_SYMBOL(netdev_fix_features);
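
/*
 * Editor's sketch (not part of the original source): the core runs
 * dev->features through this filter in register_netdevice() below. A
 * driver toggling feature bits at run time can use the same helper so
 * that impossible combinations (e.g. TSO without SG) never reach the
 * hardware:
 *
 *	u32 wanted = dev->features | NETIF_F_TSO;
 *
 *	dev->features = netdev_fix_features(dev, wanted);
 */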
5272
5273 /**
5274 * netif_stacked_transfer_operstate - transfer operstate
5275 * @rootdev: the root or lower level device to transfer state from
5276 * @dev: the device to transfer operstate to
5277 *
5278 * Transfer operational state from root to device. This is normally
5279 * called when a stacking relationship exists between the root
5280 * device and the device(a leaf device).
5281 */
5282 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5283 struct net_device *dev)
5284 {
5285 if (rootdev->operstate == IF_OPER_DORMANT)
5286 netif_dormant_on(dev);
5287 else
5288 netif_dormant_off(dev);
5289
5290 if (netif_carrier_ok(rootdev)) {
5291 if (!netif_carrier_ok(dev))
5292 netif_carrier_on(dev);
5293 } else {
5294 if (netif_carrier_ok(dev))
5295 netif_carrier_off(dev);
5296 }
5297 }
5298 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5299
5300 #ifdef CONFIG_RPS
5301 static int netif_alloc_rx_queues(struct net_device *dev)
5302 {
5303 unsigned int i, count = dev->num_rx_queues;
5304 struct netdev_rx_queue *rx;
5305
5306 BUG_ON(count < 1);
5307
5308 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5309 if (!rx) {
5310 pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5311 return -ENOMEM;
5312 }
5313 dev->_rx = rx;
5314
5315 for (i = 0; i < count; i++)
5316 rx[i].dev = dev;
5317 return 0;
5318 }
5319 #endif
5320
5321 static void netdev_init_one_queue(struct net_device *dev,
5322 struct netdev_queue *queue, void *_unused)
5323 {
5324 /* Initialize queue lock */
5325 spin_lock_init(&queue->_xmit_lock);
5326 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5327 queue->xmit_lock_owner = -1;
5328 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5329 queue->dev = dev;
5330 }
5331
5332 static int netif_alloc_netdev_queues(struct net_device *dev)
5333 {
5334 unsigned int count = dev->num_tx_queues;
5335 struct netdev_queue *tx;
5336
5337 BUG_ON(count < 1);
5338
5339 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5340 if (!tx) {
5341 pr_err("netdev: Unable to allocate %u tx queues.\n",
5342 count);
5343 return -ENOMEM;
5344 }
5345 dev->_tx = tx;
5346
5347 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5348 spin_lock_init(&dev->tx_global_lock);
5349
5350 return 0;
5351 }
5352
5353 /**
5354 * register_netdevice - register a network device
5355 * @dev: device to register
5356 *
5357 * Take a completed network device structure and add it to the kernel
5358 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5359 * chain. 0 is returned on success. A negative errno code is returned
5360 * on a failure to set up the device, or if the name is a duplicate.
5361 *
5362 * Callers must hold the rtnl semaphore. You may want
5363 * register_netdev() instead of this.
5364 *
5365 * BUGS:
5366 * The locking appears insufficient to guarantee two parallel registers
5367 * will not get the same name.
5368 */
5369
5370 int register_netdevice(struct net_device *dev)
5371 {
5372 int ret;
5373 struct net *net = dev_net(dev);
5374
5375 BUG_ON(dev_boot_phase);
5376 ASSERT_RTNL();
5377
5378 might_sleep();
5379
5380 /* When net_device's are persistent, this will be fatal. */
5381 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5382 BUG_ON(!net);
5383
5384 spin_lock_init(&dev->addr_list_lock);
5385 netdev_set_addr_lockdep_class(dev);
5386
5387 dev->iflink = -1;
5388
5389 /* Init, if this function is available */
5390 if (dev->netdev_ops->ndo_init) {
5391 ret = dev->netdev_ops->ndo_init(dev);
5392 if (ret) {
5393 if (ret > 0)
5394 ret = -EIO;
5395 goto out;
5396 }
5397 }
5398
5399 ret = dev_get_valid_name(dev, dev->name, 0);
5400 if (ret)
5401 goto err_uninit;
5402
5403 dev->ifindex = dev_new_index(net);
5404 if (dev->iflink == -1)
5405 dev->iflink = dev->ifindex;
5406
5407 dev->features = netdev_fix_features(dev, dev->features);
5408
5409 /* Enable software GSO if SG is supported. */
5410 if (dev->features & NETIF_F_SG)
5411 dev->features |= NETIF_F_GSO;
5412
5413 /* Enable GRO and NETIF_F_HIGHDMA for vlans by default;
5414 * vlan_dev_init() will do the dev->features check, so these features
5415 * are enabled only if supported by the underlying device.
5416 */
5417 dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5418
5419 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5420 ret = notifier_to_errno(ret);
5421 if (ret)
5422 goto err_uninit;
5423
5424 ret = netdev_register_kobject(dev);
5425 if (ret)
5426 goto err_uninit;
5427 dev->reg_state = NETREG_REGISTERED;
5428
5429 /*
5430 * Default initial state at registry is that the
5431 * device is present.
5432 */
5433
5434 set_bit(__LINK_STATE_PRESENT, &dev->state);
5435
5436 dev_init_scheduler(dev);
5437 dev_hold(dev);
5438 list_netdevice(dev);
5439
5440 /* Notify protocols that a new device appeared. */
5441 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5442 ret = notifier_to_errno(ret);
5443 if (ret) {
5444 rollback_registered(dev);
5445 dev->reg_state = NETREG_UNREGISTERED;
5446 }
5447 /*
5448 * Prevent userspace races by waiting until the network
5449 * device is fully setup before sending notifications.
5450 */
5451 if (!dev->rtnl_link_ops ||
5452 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5453 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5454
5455 out:
5456 return ret;
5457
5458 err_uninit:
5459 if (dev->netdev_ops->ndo_uninit)
5460 dev->netdev_ops->ndo_uninit(dev);
5461 goto out;
5462 }
5463 EXPORT_SYMBOL(register_netdevice);
5464
5465 /**
5466 * init_dummy_netdev - init a dummy network device for NAPI
5467 * @dev: device to init
5468 *
5469 * This takes a network device structure and initializes the minimum
5470 * number of fields so it can be used to schedule NAPI polls without
5471 * registering a full blown interface. This is to be used by drivers
5472 * that need to tie several hardware interfaces to a single NAPI
5473 * poll scheduler due to HW limitations.
5474 */
5475 int init_dummy_netdev(struct net_device *dev)
5476 {
5477 /* Clear everything. Note we don't initialize spinlocks
5478 * as they aren't supposed to be taken by any of the
5479 * NAPI code and this dummy netdev is supposed to be
5480 * only ever used for NAPI polls
5481 */
5482 memset(dev, 0, sizeof(struct net_device));
5483
5484 /* make sure we BUG if trying to hit standard
5485 * register/unregister code path
5486 */
5487 dev->reg_state = NETREG_DUMMY;
5488
5489 /* NAPI wants this */
5490 INIT_LIST_HEAD(&dev->napi_list);
5491
5492 /* a dummy interface is started by default */
5493 set_bit(__LINK_STATE_PRESENT, &dev->state);
5494 set_bit(__LINK_STATE_START, &dev->state);
5495
5496 /* Note: we don't allocate pcpu_refcnt for dummy devices,
5497 * because users of this 'device' don't need to change
5498 * its refcount.
5499 */
5500
5501 return 0;
5502 }
5503 EXPORT_SYMBOL_GPL(init_dummy_netdev);
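
/*
 * Editor's sketch (not part of the original source): a driver whose
 * hardware exposes several channels behind one interrupt can hang its
 * NAPI contexts off a dummy netdev. struct example_priv and
 * example_poll() are hypothetical.
 *
 *	struct example_priv {
 *		struct net_device napi_dev;	// never registered
 *		struct napi_struct napi;
 *	};
 *
 *	init_dummy_netdev(&priv->napi_dev);
 *	netif_napi_add(&priv->napi_dev, &priv->napi, example_poll, 64);
 *	napi_enable(&priv->napi);
 */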
5504
5505
5506 /**
5507 * register_netdev - register a network device
5508 * @dev: device to register
5509 *
5510 * Take a completed network device structure and add it to the kernel
5511 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5512 * chain. 0 is returned on success. A negative errno code is returned
5513 * on a failure to set up the device, or if the name is a duplicate.
5514 *
5515 * This is a wrapper around register_netdevice that takes the rtnl semaphore
5516 * and expands the device name if you passed a format string to
5517 * alloc_netdev.
5518 */
5519 int register_netdev(struct net_device *dev)
5520 {
5521 int err;
5522
5523 rtnl_lock();
5524
5525 /*
5526 * If the name is a format string the caller wants us to do a
5527 * name allocation.
5528 */
5529 if (strchr(dev->name, '%')) {
5530 err = dev_alloc_name(dev, dev->name);
5531 if (err < 0)
5532 goto out;
5533 }
5534
5535 err = register_netdevice(dev);
5536 out:
5537 rtnl_unlock();
5538 return err;
5539 }
5540 EXPORT_SYMBOL(register_netdev);
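
/*
 * Editor's sketch (not part of the original source): the usual driver
 * life cycle around register_netdev()/unregister_netdev().
 * struct example_priv and example_netdev_ops are hypothetical.
 *
 *	dev = alloc_etherdev(sizeof(struct example_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &example_netdev_ops;
 *
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 *	...
 *	unregister_netdev(dev);
 *	free_netdev(dev);
 */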
5541
5542 int netdev_refcnt_read(const struct net_device *dev)
5543 {
5544 int i, refcnt = 0;
5545
5546 for_each_possible_cpu(i)
5547 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5548 return refcnt;
5549 }
5550 EXPORT_SYMBOL(netdev_refcnt_read);
5551
5552 /*
5553 * netdev_wait_allrefs - wait until all references are gone.
5554 *
5555 * This is called when unregistering network devices.
5556 *
5557 * Any protocol or device that holds a reference should register
5558 * for netdevice notification, and cleanup and put back the
5559 * reference if they receive an UNREGISTER event.
5560 * We can get stuck here if buggy protocols don't correctly
5561 * call dev_put.
5562 */
5563 static void netdev_wait_allrefs(struct net_device *dev)
5564 {
5565 unsigned long rebroadcast_time, warning_time;
5566 int refcnt;
5567
5568 linkwatch_forget_dev(dev);
5569
5570 rebroadcast_time = warning_time = jiffies;
5571 refcnt = netdev_refcnt_read(dev);
5572
5573 while (refcnt != 0) {
5574 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5575 rtnl_lock();
5576
5577 /* Rebroadcast unregister notification */
5578 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5579 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5580 * should have already handled it the first time */
5581
5582 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5583 &dev->state)) {
5584 /* We must not have linkwatch events
5585 * pending on unregister. If this
5586 * happens, we simply run the queue
5587 * unscheduled, resulting in a noop
5588 * for this device.
5589 */
5590 linkwatch_run_queue();
5591 }
5592
5593 __rtnl_unlock();
5594
5595 rebroadcast_time = jiffies;
5596 }
5597
5598 msleep(250);
5599
5600 refcnt = netdev_refcnt_read(dev);
5601
5602 if (time_after(jiffies, warning_time + 10 * HZ)) {
5603 printk(KERN_EMERG "unregister_netdevice: "
5604 "waiting for %s to become free. Usage "
5605 "count = %d\n",
5606 dev->name, refcnt);
5607 warning_time = jiffies;
5608 }
5609 }
5610 }
5611
5612 /* The sequence is:
5613 *
5614 * rtnl_lock();
5615 * ...
5616 * register_netdevice(x1);
5617 * register_netdevice(x2);
5618 * ...
5619 * unregister_netdevice(y1);
5620 * unregister_netdevice(y2);
5621 * ...
5622 * rtnl_unlock();
5623 * free_netdev(y1);
5624 * free_netdev(y2);
5625 *
5626 * We are invoked by rtnl_unlock().
5627 * This allows us to deal with problems:
5628 * 1) We can delete sysfs objects which invoke hotplug
5629 * without deadlocking with linkwatch via keventd.
5630 * 2) Since we run with the RTNL semaphore not held, we can sleep
5631 * safely in order to wait for the netdev refcnt to drop to zero.
5632 *
5633 * We must not return until all unregister events added during
5634 * the interval the lock was held have been completed.
5635 */
5636 void netdev_run_todo(void)
5637 {
5638 struct list_head list;
5639
5640 /* Snapshot list, allow later requests */
5641 list_replace_init(&net_todo_list, &list);
5642
5643 __rtnl_unlock();
5644
5645 while (!list_empty(&list)) {
5646 struct net_device *dev
5647 = list_first_entry(&list, struct net_device, todo_list);
5648 list_del(&dev->todo_list);
5649
5650 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5651 printk(KERN_ERR "network todo '%s' but state %d\n",
5652 dev->name, dev->reg_state);
5653 dump_stack();
5654 continue;
5655 }
5656
5657 dev->reg_state = NETREG_UNREGISTERED;
5658
5659 on_each_cpu(flush_backlog, dev, 1);
5660
5661 netdev_wait_allrefs(dev);
5662
5663 /* paranoia */
5664 BUG_ON(netdev_refcnt_read(dev));
5665 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5666 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5667 WARN_ON(dev->dn_ptr);
5668
5669 if (dev->destructor)
5670 dev->destructor(dev);
5671
5672 /* Free network device */
5673 kobject_put(&dev->dev.kobj);
5674 }
5675 }
5676
5677 /* Convert net_device_stats to rtnl_link_stats64. They have the same
5678 * fields in the same order, with only the type differing.
5679 */
5680 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5681 const struct net_device_stats *netdev_stats)
5682 {
5683 #if BITS_PER_LONG == 64
5684 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5685 memcpy(stats64, netdev_stats, sizeof(*stats64));
5686 #else
5687 size_t i, n = sizeof(*stats64) / sizeof(u64);
5688 const unsigned long *src = (const unsigned long *)netdev_stats;
5689 u64 *dst = (u64 *)stats64;
5690
5691 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5692 sizeof(*stats64) / sizeof(u64));
5693 for (i = 0; i < n; i++)
5694 dst[i] = src[i];
5695 #endif
5696 }
5697
5698 /**
5699 * dev_get_stats - get network device statistics
5700 * @dev: device to get statistics from
5701 * @storage: place to store stats
5702 *
5703 * Get network statistics from device. Return @storage.
5704 * The device driver may provide its own method by setting
5705 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5706 * otherwise the internal statistics structure is used.
5707 */
5708 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5709 struct rtnl_link_stats64 *storage)
5710 {
5711 const struct net_device_ops *ops = dev->netdev_ops;
5712
5713 if (ops->ndo_get_stats64) {
5714 memset(storage, 0, sizeof(*storage));
5715 ops->ndo_get_stats64(dev, storage);
5716 } else if (ops->ndo_get_stats) {
5717 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5718 } else {
5719 netdev_stats_to_stats64(storage, &dev->stats);
5720 }
5721 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5722 return storage;
5723 }
5724 EXPORT_SYMBOL(dev_get_stats);
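
/*
 * Editor's sketch (not part of the original source): reading a
 * device's counters from kernel code. @storage lives on the caller's
 * stack; the caller only has to keep @dev alive, e.g. under RCU as
 * dev_seq_printf_stats() above does.
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	pr_info("%s: %llu rx packets, %llu tx packets\n",
 *		dev->name, stats.rx_packets, stats.tx_packets);
 */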
5725
5726 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5727 {
5728 struct netdev_queue *queue = dev_ingress_queue(dev);
5729
5730 #ifdef CONFIG_NET_CLS_ACT
5731 if (queue)
5732 return queue;
5733 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5734 if (!queue)
5735 return NULL;
5736 netdev_init_one_queue(dev, queue, NULL);
5737 queue->qdisc = &noop_qdisc;
5738 queue->qdisc_sleeping = &noop_qdisc;
5739 rcu_assign_pointer(dev->ingress_queue, queue);
5740 #endif
5741 return queue;
5742 }
5743
5744 /**
5745 * alloc_netdev_mqs - allocate network device
5746 * @sizeof_priv: size of private data to allocate space for
5747 * @name: device name format string
5748 * @setup: callback to initialize device
5749 * @txqs: the number of TX subqueues to allocate
5750 * @rxqs: the number of RX subqueues to allocate
5751 *
5752 * Allocates a struct net_device with private data area for driver use
5753 * and performs basic initialization. Also allocates subqueue structs
5754 * for each queue on the device.
5755 */
5756 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5757 void (*setup)(struct net_device *),
5758 unsigned int txqs, unsigned int rxqs)
5759 {
5760 struct net_device *dev;
5761 size_t alloc_size;
5762 struct net_device *p;
5763
5764 BUG_ON(strlen(name) >= sizeof(dev->name));
5765
5766 if (txqs < 1) {
5767 pr_err("alloc_netdev: Unable to allocate device "
5768 "with zero queues.\n");
5769 return NULL;
5770 }
5771
5772 #ifdef CONFIG_RPS
5773 if (rxqs < 1) {
5774 pr_err("alloc_netdev: Unable to allocate device "
5775 "with zero RX queues.\n");
5776 return NULL;
5777 }
5778 #endif
5779
5780 alloc_size = sizeof(struct net_device);
5781 if (sizeof_priv) {
5782 /* ensure 32-byte alignment of private area */
5783 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5784 alloc_size += sizeof_priv;
5785 }
5786 /* ensure 32-byte alignment of whole construct */
5787 alloc_size += NETDEV_ALIGN - 1;
5788
5789 p = kzalloc(alloc_size, GFP_KERNEL);
5790 if (!p) {
5791 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5792 return NULL;
5793 }
5794
5795 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5796 dev->padded = (char *)dev - (char *)p;
5797
5798 dev->pcpu_refcnt = alloc_percpu(int);
5799 if (!dev->pcpu_refcnt)
5800 goto free_p;
5801
5802 if (dev_addr_init(dev))
5803 goto free_pcpu;
5804
5805 dev_mc_init(dev);
5806 dev_uc_init(dev);
5807
5808 dev_net_set(dev, &init_net);
5809
5810 dev->gso_max_size = GSO_MAX_SIZE;
5811
5812 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5813 dev->ethtool_ntuple_list.count = 0;
5814 INIT_LIST_HEAD(&dev->napi_list);
5815 INIT_LIST_HEAD(&dev->unreg_list);
5816 INIT_LIST_HEAD(&dev->link_watch_list);
5817 dev->priv_flags = IFF_XMIT_DST_RELEASE;
5818 setup(dev);
5819
5820 dev->num_tx_queues = txqs;
5821 dev->real_num_tx_queues = txqs;
5822 if (netif_alloc_netdev_queues(dev))
5823 goto free_all;
5824
5825 #ifdef CONFIG_RPS
5826 dev->num_rx_queues = rxqs;
5827 dev->real_num_rx_queues = rxqs;
5828 if (netif_alloc_rx_queues(dev))
5829 goto free_all;
5830 #endif
5831
5832 strcpy(dev->name, name);
5833 dev->group = INIT_NETDEV_GROUP;
5834 return dev;
5835
5836 free_all:
5837 free_netdev(dev);
5838 return NULL;
5839
5840 free_pcpu:
5841 free_percpu(dev->pcpu_refcnt);
5842 kfree(dev->_tx);
5843 #ifdef CONFIG_RPS
5844 kfree(dev->_rx);
5845 #endif
5846
5847 free_p:
5848 kfree(p);
5849 return NULL;
5850 }
5851 EXPORT_SYMBOL(alloc_netdev_mqs);
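
/*
 * Editor's sketch (not part of the original source): allocating a
 * 4x4 multiqueue Ethernet-style device. ether_setup() is used as the
 * setup callback; struct example_priv is hypothetical and is reached
 * through netdev_priv() afterwards.
 *
 *	dev = alloc_netdev_mqs(sizeof(struct example_priv), "example%d",
 *			       ether_setup, 4, 4);
 *	if (!dev)
 *		return -ENOMEM;
 *	priv = netdev_priv(dev);
 *	...
 *	free_netdev(dev);	// only if the device is never registered
 */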
5852
5853 /**
5854 * free_netdev - free network device
5855 * @dev: device
5856 *
5857 * This function does the last stage of destroying an allocated device
5858 * interface. The reference to the device object is released.
5859 * If this is the last reference then it will be freed.
5860 */
5861 void free_netdev(struct net_device *dev)
5862 {
5863 struct napi_struct *p, *n;
5864
5865 release_net(dev_net(dev));
5866
5867 kfree(dev->_tx);
5868 #ifdef CONFIG_RPS
5869 kfree(dev->_rx);
5870 #endif
5871
5872 kfree(rcu_dereference_raw(dev->ingress_queue));
5873
5874 /* Flush device addresses */
5875 dev_addr_flush(dev);
5876
5877 /* Clear ethtool n-tuple list */
5878 ethtool_ntuple_flush(dev);
5879
5880 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5881 netif_napi_del(p);
5882
5883 free_percpu(dev->pcpu_refcnt);
5884 dev->pcpu_refcnt = NULL;
5885
5886 /* Compatibility with error handling in drivers */
5887 if (dev->reg_state == NETREG_UNINITIALIZED) {
5888 kfree((char *)dev - dev->padded);
5889 return;
5890 }
5891
5892 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5893 dev->reg_state = NETREG_RELEASED;
5894
5895 /* will free via device release */
5896 put_device(&dev->dev);
5897 }
5898 EXPORT_SYMBOL(free_netdev);
5899
5900 /**
5901 * synchronize_net - Synchronize with packet receive processing
5902 *
5903 * Wait for packets currently being received to be done.
5904 * Does not block later packets from starting.
5905 */
5906 void synchronize_net(void)
5907 {
5908 might_sleep();
5909 synchronize_rcu();
5910 }
5911 EXPORT_SYMBOL(synchronize_net);
5912
5913 /**
5914 * unregister_netdevice_queue - remove device from the kernel
5915 * @dev: device
5916 * @head: list
5917 *
5918 * This function shuts down a device interface and removes it
5919 * from the kernel tables.
5920 * If head is not NULL, the device is queued to be unregistered later.
5921 *
5922 * Callers must hold the rtnl semaphore. You may want
5923 * unregister_netdev() instead of this.
5924 */
5925
5926 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5927 {
5928 ASSERT_RTNL();
5929
5930 if (head) {
5931 list_move_tail(&dev->unreg_list, head);
5932 } else {
5933 rollback_registered(dev);
5934 /* Finish processing unregister after unlock */
5935 net_set_todo(dev);
5936 }
5937 }
5938 EXPORT_SYMBOL(unregister_netdevice_queue);
5939
5940 /**
5941 * unregister_netdevice_many - unregister many devices
5942 * @head: list of devices
5943 */
5944 void unregister_netdevice_many(struct list_head *head)
5945 {
5946 struct net_device *dev;
5947
5948 if (!list_empty(head)) {
5949 rollback_registered_many(head);
5950 list_for_each_entry(dev, head, unreg_list)
5951 net_set_todo(dev);
5952 }
5953 }
5954 EXPORT_SYMBOL(unregister_netdevice_many);
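
/*
 * Editor's sketch (not part of the original source): batching several
 * unregisters so the synchronize/rcu work in rollback_registered_many()
 * is paid only once, as rtnl_link drivers do when a whole group of
 * devices disappears. example_link_ops is hypothetical.
 *
 *	LIST_HEAD(list);
 *
 *	ASSERT_RTNL();
 *	for_each_netdev_safe(net, dev, tmp)
 *		if (dev->rtnl_link_ops == &example_link_ops)
 *			unregister_netdevice_queue(dev, &list);
 *	unregister_netdevice_many(&list);
 */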
5955
5956 /**
5957 * unregister_netdev - remove device from the kernel
5958 * @dev: device
5959 *
5960 * This function shuts down a device interface and removes it
5961 * from the kernel tables.
5962 *
5963 * This is just a wrapper for unregister_netdevice that takes
5964 * the rtnl semaphore. In general you want to use this and not
5965 * unregister_netdevice.
5966 */
5967 void unregister_netdev(struct net_device *dev)
5968 {
5969 rtnl_lock();
5970 unregister_netdevice(dev);
5971 rtnl_unlock();
5972 }
5973 EXPORT_SYMBOL(unregister_netdev);
5974
5975 /**
5976 * dev_change_net_namespace - move device to a different network namespace
5977 * @dev: device
5978 * @net: network namespace
5979 * @pat: If not NULL name pattern to try if the current device name
5980 * is already taken in the destination network namespace.
5981 *
5982 * This function shuts down a device interface and moves it
5983 * to a new network namespace. On success 0 is returned, on
5984 * a failure a negative errno code is returned.
5985 *
5986 * Callers must hold the rtnl semaphore.
5987 */
5988
5989 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5990 {
5991 int err;
5992
5993 ASSERT_RTNL();
5994
5995 /* Don't allow namespace local devices to be moved. */
5996 err = -EINVAL;
5997 if (dev->features & NETIF_F_NETNS_LOCAL)
5998 goto out;
5999
6000 /* Ensure the device has been registered */
6001 err = -EINVAL;
6002 if (dev->reg_state != NETREG_REGISTERED)
6003 goto out;
6004
6005 /* Get out if there is nothing to do */
6006 err = 0;
6007 if (net_eq(dev_net(dev), net))
6008 goto out;
6009
6010 /* Pick the destination device name, and ensure
6011 * we can use it in the destination network namespace.
6012 */
6013 err = -EEXIST;
6014 if (__dev_get_by_name(net, dev->name)) {
6015 /* We get here if we can't use the current device name */
6016 if (!pat)
6017 goto out;
6018 if (dev_get_valid_name(dev, pat, 1))
6019 goto out;
6020 }
6021
6022 /*
6023 * And now a mini version of register_netdevice and unregister_netdevice.
6024 */
6025
6026 /* If device is running close it first. */
6027 dev_close(dev);
6028
6029 /* And unlink it from device chain */
6030 err = -ENODEV;
6031 unlist_netdevice(dev);
6032
6033 synchronize_net();
6034
6035 /* Shutdown queueing discipline. */
6036 dev_shutdown(dev);
6037
6038 /* Notify protocols that we are about to destroy
6039 this device. They should clean up all of their state.
6040
6041 Note that dev->reg_state stays at NETREG_REGISTERED.
6042 This is wanted because this way 8021q and macvlan know
6043 the device is just moving and can keep their slaves up.
6044 */
6045 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6046 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6047
6048 /*
6049 * Flush the unicast and multicast chains
6050 */
6051 dev_uc_flush(dev);
6052 dev_mc_flush(dev);
6053
6054 /* Actually switch the network namespace */
6055 dev_net_set(dev, net);
6056
6057 /* If there is an ifindex conflict assign a new one */
6058 if (__dev_get_by_index(net, dev->ifindex)) {
6059 int iflink = (dev->iflink == dev->ifindex);
6060 dev->ifindex = dev_new_index(net);
6061 if (iflink)
6062 dev->iflink = dev->ifindex;
6063 }
6064
6065 /* Fixup kobjects */
6066 err = device_rename(&dev->dev, dev->name);
6067 WARN_ON(err);
6068
6069 /* Add the device back in the hashes */
6070 list_netdevice(dev);
6071
6072 /* Notify protocols that a new device appeared. */
6073 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6074
6075 /*
6076 * Prevent userspace races by waiting until the network
6077 * device is fully set up before sending notifications.
6078 */
6079 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6080
6081 synchronize_net();
6082 err = 0;
6083 out:
6084 return err;
6085 }
6086 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
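/*
 * Illustrative sketch, not part of the original file: callers move a
 * device into another namespace while holding the rtnl lock, optionally
 * passing a name pattern for the destination namespace.  The names
 * example_move_dev, target and other_net are hypothetical.
 */
#if 0
static int example_move_dev(struct net_device *target, struct net *other_net)
{
	int err;

	rtnl_lock();
	/* "eth%d" lets the core pick a free name if the current one clashes */
	err = dev_change_net_namespace(target, other_net, "eth%d");
	rtnl_unlock();

	return err;
}
#endif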
6087
6088 static int dev_cpu_callback(struct notifier_block *nfb,
6089 unsigned long action,
6090 void *ocpu)
6091 {
6092 struct sk_buff **list_skb;
6093 struct sk_buff *skb;
6094 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6095 struct softnet_data *sd, *oldsd;
6096
6097 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6098 return NOTIFY_OK;
6099
6100 local_irq_disable();
6101 cpu = smp_processor_id();
6102 sd = &per_cpu(softnet_data, cpu);
6103 oldsd = &per_cpu(softnet_data, oldcpu);
6104
6105 /* Find end of our completion_queue. */
6106 list_skb = &sd->completion_queue;
6107 while (*list_skb)
6108 list_skb = &(*list_skb)->next;
6109 /* Append completion queue from offline CPU. */
6110 *list_skb = oldsd->completion_queue;
6111 oldsd->completion_queue = NULL;
6112
6113 /* Append output queue from offline CPU. */
6114 if (oldsd->output_queue) {
6115 *sd->output_queue_tailp = oldsd->output_queue;
6116 sd->output_queue_tailp = oldsd->output_queue_tailp;
6117 oldsd->output_queue = NULL;
6118 oldsd->output_queue_tailp = &oldsd->output_queue;
6119 }
6120
6121 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6122 local_irq_enable();
6123
6124 /* Process offline CPU's input_pkt_queue */
6125 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6126 netif_rx(skb);
6127 input_queue_head_incr(oldsd);
6128 }
6129 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6130 netif_rx(skb);
6131 input_queue_head_incr(oldsd);
6132 }
6133
6134 return NOTIFY_OK;
6135 }
6136
6137
6138 /**
6139 * netdev_increment_features - increment feature set by one
6140 * @all: current feature set
6141 * @one: new feature set
6142 * @mask: mask feature set
6143 *
6144 * Computes a new feature set after adding a device with feature set
6145 * @one to the master device with current feature set @all. Will not
6146 * enable anything that is off in @mask. Returns the new feature set.
6147 */
6148 u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6149 {
6150 /* If device needs checksumming, downgrade to it. */
6151 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6152 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6153 else if (mask & NETIF_F_ALL_CSUM) {
6154 /* If one device supports v4/v6 checksumming, set for all. */
6155 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6156 !(all & NETIF_F_GEN_CSUM)) {
6157 all &= ~NETIF_F_ALL_CSUM;
6158 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6159 }
6160
6161 /* If one device supports hw checksumming, set for all. */
6162 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6163 all &= ~NETIF_F_ALL_CSUM;
6164 all |= NETIF_F_HW_CSUM;
6165 }
6166 }
6167
6168 one |= NETIF_F_ALL_CSUM;
6169
6170 one |= all & NETIF_F_ONE_FOR_ALL;
6171 all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6172 all |= one & mask & NETIF_F_ONE_FOR_ALL;
6173
6174 return all;
6175 }
6176 EXPORT_SYMBOL(netdev_increment_features);
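/*
 * Illustrative sketch, not part of the original file: a master device
 * (bonding-style) can fold each slave's feature set into its own with
 * this helper.  example_master_features and the slaves array are
 * hypothetical; only netdev_increment_features() is the real API.
 */
#if 0
static u32 example_master_features(struct net_device *master,
				   struct net_device **slaves, int nslaves)
{
	/* start from the master's features with the one-for-all bits cleared */
	u32 features = master->features & ~NETIF_F_ONE_FOR_ALL;
	int i;

	for (i = 0; i < nslaves; i++)
		features = netdev_increment_features(features,
						     slaves[i]->features,
						     NETIF_F_ONE_FOR_ALL);
	return features;
}
#endif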
6177
6178 static struct hlist_head *netdev_create_hash(void)
6179 {
6180 int i;
6181 struct hlist_head *hash;
6182
6183 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6184 if (hash != NULL)
6185 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6186 INIT_HLIST_HEAD(&hash[i]);
6187
6188 return hash;
6189 }
6190
6191 /* Initialize per network namespace state */
6192 static int __net_init netdev_init(struct net *net)
6193 {
6194 INIT_LIST_HEAD(&net->dev_base_head);
6195
6196 net->dev_name_head = netdev_create_hash();
6197 if (net->dev_name_head == NULL)
6198 goto err_name;
6199
6200 net->dev_index_head = netdev_create_hash();
6201 if (net->dev_index_head == NULL)
6202 goto err_idx;
6203
6204 return 0;
6205
6206 err_idx:
6207 kfree(net->dev_name_head);
6208 err_name:
6209 return -ENOMEM;
6210 }
6211
6212 /**
6213 * netdev_drivername - network driver for the device
6214 * @dev: network device
6215 * @buffer: buffer for resulting name
6216 * @len: size of buffer
6217 *
6218 * Determine network driver for device.
6219 */
6220 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6221 {
6222 const struct device_driver *driver;
6223 const struct device *parent;
6224
6225 if (len <= 0 || !buffer)
6226 return buffer;
6227 buffer[0] = 0;
6228
6229 parent = dev->dev.parent;
6230
6231 if (!parent)
6232 return buffer;
6233
6234 driver = parent->driver;
6235 if (driver && driver->name)
6236 strlcpy(buffer, driver->name, len);
6237 return buffer;
6238 }
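/*
 * Illustrative sketch, not part of the original file: callers hand in a
 * stack buffer and log the returned string, e.g. to report which driver
 * backs a misbehaving device.  example_log_driver is hypothetical.
 */
#if 0
static void example_log_driver(const struct net_device *dev)
{
	char drivername[64];

	/* returns the buffer; it is left empty if no parent driver is bound */
	pr_info("%s is handled by %s\n", dev->name,
		netdev_drivername(dev, drivername, sizeof(drivername)));
}
#endif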
6239
6240 static int __netdev_printk(const char *level, const struct net_device *dev,
6241 struct va_format *vaf)
6242 {
6243 int r;
6244
6245 if (dev && dev->dev.parent)
6246 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6247 netdev_name(dev), vaf);
6248 else if (dev)
6249 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6250 else
6251 r = printk("%s(NULL net_device): %pV", level, vaf);
6252
6253 return r;
6254 }
6255
6256 int netdev_printk(const char *level, const struct net_device *dev,
6257 const char *format, ...)
6258 {
6259 struct va_format vaf;
6260 va_list args;
6261 int r;
6262
6263 va_start(args, format);
6264
6265 vaf.fmt = format;
6266 vaf.va = &args;
6267
6268 r = __netdev_printk(level, dev, &vaf);
6269 va_end(args);
6270
6271 return r;
6272 }
6273 EXPORT_SYMBOL(netdev_printk);
6274
6275 #define define_netdev_printk_level(func, level) \
6276 int func(const struct net_device *dev, const char *fmt, ...) \
6277 { \
6278 int r; \
6279 struct va_format vaf; \
6280 va_list args; \
6281 \
6282 va_start(args, fmt); \
6283 \
6284 vaf.fmt = fmt; \
6285 vaf.va = &args; \
6286 \
6287 r = __netdev_printk(level, dev, &vaf); \
6288 va_end(args); \
6289 \
6290 return r; \
6291 } \
6292 EXPORT_SYMBOL(func);
6293
6294 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6295 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6296 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6297 define_netdev_printk_level(netdev_err, KERN_ERR);
6298 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6299 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6300 define_netdev_printk_level(netdev_info, KERN_INFO);
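/*
 * Illustrative sketch, not part of the original file: drivers call these
 * helpers like printk(), and the device name (or "(NULL net_device)") is
 * prefixed automatically.  example_report_link is hypothetical.
 */
#if 0
static void example_report_link(struct net_device *dev, bool link_up)
{
	if (link_up)
		netdev_info(dev, "link is up\n");
	else
		netdev_warn(dev, "link is down\n");
}
#endif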
6301
6302 static void __net_exit netdev_exit(struct net *net)
6303 {
6304 kfree(net->dev_name_head);
6305 kfree(net->dev_index_head);
6306 }
6307
6308 static struct pernet_operations __net_initdata netdev_net_ops = {
6309 .init = netdev_init,
6310 .exit = netdev_exit,
6311 };
6312
6313 static void __net_exit default_device_exit(struct net *net)
6314 {
6315 struct net_device *dev, *aux;
6316 /*
6317 * Push all migratable network devices back to the
6318 * initial network namespace
6319 */
6320 rtnl_lock();
6321 for_each_netdev_safe(net, dev, aux) {
6322 int err;
6323 char fb_name[IFNAMSIZ];
6324
6325 /* Ignore unmovable devices (e.g. the loopback device) */
6326 if (dev->features & NETIF_F_NETNS_LOCAL)
6327 continue;
6328
6329 /* Leave virtual devices for the generic cleanup */
6330 if (dev->rtnl_link_ops)
6331 continue;
6332
6333 /* Push remaining network devices to init_net */
6334 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6335 err = dev_change_net_namespace(dev, &init_net, fb_name);
6336 if (err) {
6337 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6338 __func__, dev->name, err);
6339 BUG();
6340 }
6341 }
6342 rtnl_unlock();
6343 }
6344
6345 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6346 {
6347 /* At exit all network devices must be removed from a network
6348 * namespace. Do this in the reverse order of registration.
6349 * Do this across as many network namespaces as possible to
6350 * improve batching efficiency.
6351 */
6352 struct net_device *dev;
6353 struct net *net;
6354 LIST_HEAD(dev_kill_list);
6355
6356 rtnl_lock();
6357 list_for_each_entry(net, net_list, exit_list) {
6358 for_each_netdev_reverse(net, dev) {
6359 if (dev->rtnl_link_ops)
6360 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6361 else
6362 unregister_netdevice_queue(dev, &dev_kill_list);
6363 }
6364 }
6365 unregister_netdevice_many(&dev_kill_list);
6366 rtnl_unlock();
6367 }
6368
6369 static struct pernet_operations __net_initdata default_device_ops = {
6370 .exit = default_device_exit,
6371 .exit_batch = default_device_exit_batch,
6372 };
6373
6374 /*
6375 * Initialize the DEV module. At boot time this walks the device list and
6376 * unhooks any devices that fail to initialise (normally hardware not
6377 * present) and leaves us with a valid list of present and active devices.
6378 *
6379 */
6380
6381 /*
6382 * This is called single threaded during boot, so no need
6383 * to take the rtnl semaphore.
6384 */
6385 static int __init net_dev_init(void)
6386 {
6387 int i, rc = -ENOMEM;
6388
6389 BUG_ON(!dev_boot_phase);
6390
6391 if (dev_proc_init())
6392 goto out;
6393
6394 if (netdev_kobject_init())
6395 goto out;
6396
6397 INIT_LIST_HEAD(&ptype_all);
6398 for (i = 0; i < PTYPE_HASH_SIZE; i++)
6399 INIT_LIST_HEAD(&ptype_base[i]);
6400
6401 if (register_pernet_subsys(&netdev_net_ops))
6402 goto out;
6403
6404 /*
6405 * Initialise the packet receive queues.
6406 */
6407
6408 for_each_possible_cpu(i) {
6409 struct softnet_data *sd = &per_cpu(softnet_data, i);
6410
6411 memset(sd, 0, sizeof(*sd));
6412 skb_queue_head_init(&sd->input_pkt_queue);
6413 skb_queue_head_init(&sd->process_queue);
6414 sd->completion_queue = NULL;
6415 INIT_LIST_HEAD(&sd->poll_list);
6416 sd->output_queue = NULL;
6417 sd->output_queue_tailp = &sd->output_queue;
6418 #ifdef CONFIG_RPS
6419 sd->csd.func = rps_trigger_softirq;
6420 sd->csd.info = sd;
6421 sd->csd.flags = 0;
6422 sd->cpu = i;
6423 #endif
6424
6425 sd->backlog.poll = process_backlog;
6426 sd->backlog.weight = weight_p;
6427 sd->backlog.gro_list = NULL;
6428 sd->backlog.gro_count = 0;
6429 }
6430
6431 dev_boot_phase = 0;
6432
6433 /* The loopback device is special: if any other network device
6434 * is present in a network namespace, the loopback device must
6435 * be present too. Since we now dynamically allocate and free
6436 * the loopback device, ensure this invariant is maintained by
6437 * keeping the loopback device as the first device on the
6438 * list of network devices; it is the first device that
6439 * appears and the last network device that
6440 * disappears.
6441 */
6442 if (register_pernet_device(&loopback_net_ops))
6443 goto out;
6444
6445 if (register_pernet_device(&default_device_ops))
6446 goto out;
6447
6448 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6449 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6450
6451 hotcpu_notifier(dev_cpu_callback, 0);
6452 dst_init();
6453 dev_mcast_init();
6454 rc = 0;
6455 out:
6456 return rc;
6457 }
6458
6459 subsys_initcall(net_dev_init);
6460
6461 static int __init initialize_hashrnd(void)
6462 {
6463 get_random_bytes(&hashrnd, sizeof(hashrnd));
6464 return 0;
6465 }
6466
6467 late_initcall_sync(initialize_hashrnd);
6468