net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131 #include <trace/events/net.h>
 132 #include <trace/events/skb.h>
 133 #include <linux/pci.h>
 134 #include <linux/inetdevice.h>
 135
 136 #include "net-sysfs.h"
 137
 138 /* Instead of increasing this, you should create a hash table. */
 139 #define MAX_GRO_SKBS 8
 140
 141 /* This should be increased if a protocol with a bigger head is added. */
 142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 143
 144 /*
 145  *      The list of packet types we will receive (as opposed to discard)
 146  *      and the routines to invoke.
 147  *
 148  *      Why 16. Because with 16 the only overlap we get on a hash of the
 149  *      low nibble of the protocol value is RARP/SNAP/X.25.
 150  *
 151  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 152  *             sure which should go first, but I bet it won't make much
 153  *             difference if we are running VLANs.  The good news is that
 154  *             this protocol won't be in the list unless compiled in, so
 155  *             the average user (w/out VLANs) will not be adversely affected.
 156  *             --BLG
 157  *
 158  *              0800    IP
 159  *              8100    802.1Q VLAN
 160  *              0001    802.3
 161  *              0002    AX.25
 162  *              0004    802.2
 163  *              8035    RARP
 164  *              0005    SNAP
 165  *              0805    X.25
 166  *              0806    ARP
 167  *              8137    IPX
 168  *              0009    Localtalk
 169  *              86DD    IPv6
 170  */
 171
 172 #define PTYPE_HASH_SIZE (16)
 173 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 174
 175 static DEFINE_SPINLOCK(ptype_lock);
 176 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 177 static struct list_head ptype_all __read_mostly;        /* Taps */
 178
 179 /*
 180  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 181  * semaphore.
 182  *
 183  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 184  *
 185  * Writers must hold the rtnl semaphore while they loop through the
 186  * dev_base_head list, and hold dev_base_lock for writing when they do the
 187  * actual updates.  This allows pure readers to access the list even
 188  * while a writer is preparing to update it.
 189  *
 190  * To put it another way, dev_base_lock is held for writing only to
 191  * protect against pure readers; the rtnl semaphore provides the
 192  * protection against other writers.
 193  *
 194  * See, for example usages, register_netdevice() and
 195  * unregister_netdevice(), which must be called with the rtnl
 196  * semaphore held.
 197  */
 198 DEFINE_RWLOCK(dev_base_lock);
 199 EXPORT_SYMBOL(dev_base_lock);
 200
 201 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 202 {
 203         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 204         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 205 }
 206
 207 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 208 {
 209         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 210 }
 211
 212 static inline void rps_lock(struct softnet_data *sd)
 213 {
 214 #ifdef CONFIG_RPS
 215         spin_lock(&sd->input_pkt_queue.lock);
 216 #endif
 217 }
 218
 219 static inline void rps_unlock(struct softnet_data *sd)
 220 {
 221 #ifdef CONFIG_RPS
 222         spin_unlock(&sd->input_pkt_queue.lock);
 223 #endif
 224 }
 225
 226 /* Device list insertion */
 227 static int list_netdevice(struct net_device *dev)
 228 {
 229         struct net *net = dev_net(dev);
 230
 231         ASSERT_RTNL();
 232
 233         write_lock_bh(&dev_base_lock);
 234         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 235         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 236         hlist_add_head_rcu(&dev->index_hlist,
 237                            dev_index_hash(net, dev->ifindex));
 238         write_unlock_bh(&dev_base_lock);
 239         return 0;
 240 }
 241
 242 /* Device list removal
 243  * caller must respect a RCU grace period before freeing/reusing dev
 244  */
 245 static void unlist_netdevice(struct net_device *dev)
 246 {
 247         ASSERT_RTNL();
 248
 249         /* Unlink dev from the device chain */
 250         write_lock_bh(&dev_base_lock);
 251         list_del_rcu(&dev->dev_list);
 252         hlist_del_rcu(&dev->name_hlist);
 253         hlist_del_rcu(&dev->index_hlist);
 254         write_unlock_bh(&dev_base_lock);
 255 }
 256
 257 /*
 258  *      Our notifier list
 259  */
 260
 261 static RAW_NOTIFIER_HEAD(netdev_chain);
 262
 263 /*
 264  *      Device drivers call our routines to queue packets here. We empty the
 265  *      queue in the local softnet handler.
 266  */
 267
 268 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 269 EXPORT_PER_CPU_SYMBOL(softnet_data);
 270
 271 #ifdef CONFIG_LOCKDEP
 272 /*
 273  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 274  * according to dev->type
 275  */
 276 static const unsigned short netdev_lock_type[] =
 277         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 278          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 279          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 280          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 281          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 282          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 283          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 284          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 285          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 286          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 287          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 288          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 289          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 290          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 291          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 292          ARPHRD_VOID, ARPHRD_NONE};
 293
 294 static const char *const netdev_lock_name[] =
 295         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 296          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 297          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 298          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 299          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 300          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 301          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 302          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 303          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 304          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 305          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 306          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 307          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 308          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 309          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 310          "_xmit_VOID", "_xmit_NONE"};
 311
 312 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 313 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 314
 315 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 316 {
 317         int i;
 318
 319         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 320                 if (netdev_lock_type[i] == dev_type)
 321                         return i;
 322         /* the last key is used by default */
 323         return ARRAY_SIZE(netdev_lock_type) - 1;
 324 }
 325
 326 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 327                                                  unsigned short dev_type)
 328 {
 329         int i;
 330
 331         i = netdev_lock_pos(dev_type);
 332         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 333                                    netdev_lock_name[i]);
 334 }
 335
 336 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 337 {
 338         int i;
 339
 340         i = netdev_lock_pos(dev->type);
 341         lockdep_set_class_and_name(&dev->addr_list_lock,
 342                                    &netdev_addr_lock_key[i],
 343                                    netdev_lock_name[i]);
 344 }
 345 #else
 346 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 347                                                  unsigned short dev_type)
 348 {
 349 }
 350 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 351 {
 352 }
 353 #endif
 354
 355 /*******************************************************************************
 356
 357                 Protocol management and registration routines
 358
 359 *******************************************************************************/
 360
 361 /*
 362  *      Add a protocol ID to the list. Now that the input handler is
 363  *      smarter we can dispense with all the messy stuff that used to be
 364  *      here.
 365  *
 366  *      BEWARE!!! Protocol handlers, mangling input packets,
 367  *      MUST BE last in hash buckets and checking protocol handlers
 368  *      MUST start from promiscuous ptype_all chain in net_bh.
 369  *      It is true now, do not change it.
 370  *      Explanation follows: if protocol handler, mangling packet, will
 371  *      be the first on list, it is not able to sense, that packet
 372  *      is cloned and should be copied-on-write, so that it will
 373  *      change it and subsequent readers will get broken packet.
 374  *                                                      --ANK (980803)
 375  */
 376
 377 static inline struct list_head *ptype_head(const struct packet_type *pt)
 378 {
 379         if (pt->type == htons(ETH_P_ALL))
 380                 return &ptype_all;
 381         else
 382                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 383 }
 384
 385 /**
 386  *      dev_add_pack - add packet handler
 387  *      @pt: packet type declaration
 388  *
 389  *      Add a protocol handler to the networking stack. The passed &packet_type
 390  *      is linked into kernel lists and may not be freed until it has been
 391  *      removed from the kernel lists.
 392  *
 393  *      This call does not sleep therefore it can not
 394  *      guarantee all CPU's that are in middle of receiving packets
 395  *      will see the new packet type (until the next received packet).
 396  */
 397
 398 void dev_add_pack(struct packet_type *pt)
 399 {
 400         struct list_head *head = ptype_head(pt);
 401
 402         spin_lock(&ptype_lock);
 403         list_add_rcu(&pt->list, head);
 404         spin_unlock(&ptype_lock);
 405 }
 406 EXPORT_SYMBOL(dev_add_pack);
 407
 408 /**
 409  *      __dev_remove_pack        - remove packet handler
 410  *      @pt: packet type declaration
 411  *
 412  *      Remove a protocol handler that was previously added to the kernel
 413  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 414  *      from the kernel lists and can be freed or reused once this function
 415  *      returns.
 416  *
 417  *      The packet type might still be in use by receivers
 418  *      and must not be freed until after all the CPU's have gone
 419  *      through a quiescent state.
 420  */
 421 void __dev_remove_pack(struct packet_type *pt)
 422 {
 423         struct list_head *head = ptype_head(pt);
 424         struct packet_type *pt1;
 425
 426         spin_lock(&ptype_lock);
 427
 428         list_for_each_entry(pt1, head, list) {
 429                 if (pt == pt1) {
 430                         list_del_rcu(&pt->list);
 431                         goto out;
 432                 }
 433         }
 434
 435         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 436 out:
 437         spin_unlock(&ptype_lock);
 438 }
 439 EXPORT_SYMBOL(__dev_remove_pack);
 440
 441 /**
 442  *      dev_remove_pack  - remove packet handler
 443  *      @pt: packet type declaration
 444  *
 445  *      Remove a protocol handler that was previously added to the kernel
 446  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 447  *      from the kernel lists and can be freed or reused once this function
 448  *      returns.
 449  *
 450  *      This call sleeps to guarantee that no CPU is looking at the packet
 451  *      type after return.
 452  */
 453 void dev_remove_pack(struct packet_type *pt)
 454 {
 455         __dev_remove_pack(pt);
 456
 457         synchronize_net();
 458 }
 459 EXPORT_SYMBOL(dev_remove_pack);
 460
 461 /******************************************************************************
 462
 463                       Device Boot-time Settings Routines
 464
 465 *******************************************************************************/
 466
 467 /* Boot time configuration table */
 468 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 469
 470 /**
 471  *      netdev_boot_setup_add   - add new setup entry
 472  *      @name: name of the device
 473  *      @map: configured settings for the device
 474  *
 475  *      Adds new setup entry to the dev_boot_setup list.  The function
 476  *      returns 0 on error and 1 on success.  This is a generic routine to
 477  *      all netdevices.
 478  */
 479 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 480 {
 481         struct netdev_boot_setup *s;
 482         int i;
 483
 484         s = dev_boot_setup;
 485         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 486                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 487                         memset(s[i].name, 0, sizeof(s[i].name));
 488                         strlcpy(s[i].name, name, IFNAMSIZ);
 489                         memcpy(&s[i].map, map, sizeof(s[i].map));
 490                         break;
 491                 }
 492         }
 493
 494         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 495 }
 496
 497 /**
 498  *      netdev_boot_setup_check - check boot time settings
 499  *      @dev: the netdevice
 500  *
 501  *      Check boot time settings for the device.
 502  *      The found settings are set for the device to be used
 503  *      later in the device probing.
 504  *      Returns 0 if no settings found, 1 if they are.
 505  */
 506 int netdev_boot_setup_check(struct net_device *dev)
 507 {
 508         struct netdev_boot_setup *s = dev_boot_setup;
 509         int i;
 510
 511         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 512                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 513                     !strcmp(dev->name, s[i].name)) {
 514                         dev->irq        = s[i].map.irq;
 515                         dev->base_addr  = s[i].map.base_addr;
 516                         dev->mem_start  = s[i].map.mem_start;
 517                         dev->mem_end    = s[i].map.mem_end;
 518                         return 1;
 519                 }
 520         }
 521         return 0;
 522 }
 523 EXPORT_SYMBOL(netdev_boot_setup_check);
 524
 525
 526 /**
 527  *      netdev_boot_base        - get address from boot time settings
 528  *      @prefix: prefix for network device
 529  *      @unit: id for network device
 530  *
 531  *      Check boot time settings for the base address of device.
 532  *      The found settings are set for the device to be used
 533  *      later in the device probing.
 534  *      Returns 0 if no settings found.
 535  */
 536 unsigned long netdev_boot_base(const char *prefix, int unit)
 537 {
 538         const struct netdev_boot_setup *s = dev_boot_setup;
 539         char name[IFNAMSIZ];
 540         int i;
 541
 542         sprintf(name, "%s%d", prefix, unit);
 543
 544         /*
 545          * If device already registered then return base of 1
 546          * to indicate not to probe for this interface
 547          */
 548         if (__dev_get_by_name(&init_net, name))
 549                 return 1;
 550
 551         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 552                 if (!strcmp(name, s[i].name))
 553                         return s[i].map.base_addr;
 554         return 0;
 555 }
 556
 557 /*
 558  * Saves at boot time configured settings for any netdevice.
 559  */
 560 int __init netdev_boot_setup(char *str)
 561 {
 562         int ints[5];
 563         struct ifmap map;
 564
 565         str = get_options(str, ARRAY_SIZE(ints), ints);
 566         if (!str || !*str)
 567                 return 0;
 568
 569         /* Save settings */
 570         memset(&map, 0, sizeof(map));
 571         if (ints[0] > 0)
 572                 map.irq = ints[1];
 573         if (ints[0] > 1)
 574                 map.base_addr = ints[2];
 575         if (ints[0] > 2)
 576                 map.mem_start = ints[3];
 577         if (ints[0] > 3)
 578                 map.mem_end = ints[4];
 579
 580         /* Add new entry to the list */
 581         return netdev_boot_setup_add(str, &map);
 582 }
 583
 584 __setup("netdev=", netdev_boot_setup);
 585
 586 /*******************************************************************************
 587
 588                             Device Interface Subroutines
 589
 590 *******************************************************************************/
 591
 592 /**
 593  *      __dev_get_by_name       - find a device by its name
 594  *      @net: the applicable net namespace
 595  *      @name: name to find
 596  *
 597  *      Find an interface by name. Must be called under RTNL semaphore
 598  *      or @dev_base_lock. If the name is found a pointer to the device
 599  *      is returned. If the name is not found then %NULL is returned. The
 600  *      reference counters are not incremented so the caller must be
 601  *      careful with locks.
 602  */
 603
 604 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 605 {
 606         struct hlist_node *p;
 607         struct net_device *dev;
 608         struct hlist_head *head = dev_name_hash(net, name);
 609
 610         hlist_for_each_entry(dev, p, head, name_hlist)
 611                 if (!strncmp(dev->name, name, IFNAMSIZ))
 612                         return dev;
 613
 614         return NULL;
 615 }
 616 EXPORT_SYMBOL(__dev_get_by_name);
 617
 618 /**
 619  *      dev_get_by_name_rcu     - find a device by its name
 620  *      @net: the applicable net namespace
 621  *      @name: name to find
 622  *
 623  *      Find an interface by name.
 624  *      If the name is found a pointer to the device is returned.
 625  *      If the name is not found then %NULL is returned.
 626  *      The reference counters are not incremented so the caller must be
 627  *      careful with locks. The caller must hold RCU lock.
 628  */
 629
 630 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 631 {
 632         struct hlist_node *p;
 633         struct net_device *dev;
 634         struct hlist_head *head = dev_name_hash(net, name);
 635
 636         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 637                 if (!strncmp(dev->name, name, IFNAMSIZ))
 638                         return dev;
 639
 640         return NULL;
 641 }
 642 EXPORT_SYMBOL(dev_get_by_name_rcu);
 643
 644 /**
 645  *      dev_get_by_name         - find a device by its name
 646  *      @net: the applicable net namespace
 647  *      @name: name to find
 648  *
 649  *      Find an interface by name. This can be called from any
 650  *      context and does its own locking. The returned handle has
 651  *      the usage count incremented and the caller must use dev_put() to
 652  *      release it when it is no longer needed. %NULL is returned if no
 653  *      matching device is found.
 654  */
 655
 656 struct net_device *dev_get_by_name(struct net *net, const char *name)
 657 {
 658         struct net_device *dev;
 659
 660         rcu_read_lock();
 661         dev = dev_get_by_name_rcu(net, name);
 662         if (dev)
 663                 dev_hold(dev);
 664         rcu_read_unlock();
 665         return dev;
 666 }
 667 EXPORT_SYMBOL(dev_get_by_name);
 668
 669 /**
 670  *      __dev_get_by_index - find a device by its ifindex
 671  *      @net: the applicable net namespace
 672  *      @ifindex: index of device
 673  *
 674  *      Search for an interface by index. Returns %NULL if the device
 675  *      is not found or a pointer to the device. The device has not
 676  *      had its reference counter increased so the caller must be careful
 677  *      about locking. The caller must hold either the RTNL semaphore
 678  *      or @dev_base_lock.
 679  */
 680
 681 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 682 {
 683         struct hlist_node *p;
 684         struct net_device *dev;
 685         struct hlist_head *head = dev_index_hash(net, ifindex);
 686
 687         hlist_for_each_entry(dev, p, head, index_hlist)
 688                 if (dev->ifindex == ifindex)
 689                         return dev;
 690
 691         return NULL;
 692 }
 693 EXPORT_SYMBOL(__dev_get_by_index);
 694
 695 /**
 696  *      dev_get_by_index_rcu - find a device by its ifindex
 697  *      @net: the applicable net namespace
 698  *      @ifindex: index of device
 699  *
 700  *      Search for an interface by index. Returns %NULL if the device
 701  *      is not found or a pointer to the device. The device has not
 702  *      had its reference counter increased so the caller must be careful
 703  *      about locking. The caller must hold RCU lock.
 704  */
 705
 706 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 707 {
 708         struct hlist_node *p;
 709         struct net_device *dev;
 710         struct hlist_head *head = dev_index_hash(net, ifindex);
 711
 712         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 713                 if (dev->ifindex == ifindex)
 714                         return dev;
 715
 716         return NULL;
 717 }
 718 EXPORT_SYMBOL(dev_get_by_index_rcu);
 719
 720
 721 /**
 722  *      dev_get_by_index - find a device by its ifindex
 723  *      @net: the applicable net namespace
 724  *      @ifindex: index of device
 725  *
 726  *      Search for an interface by index. Returns NULL if the device
 727  *      is not found or a pointer to the device. The device returned has
 728  *      had a reference added and the pointer is safe until the user calls
 729  *      dev_put to indicate they have finished with it.
 730  */
 731
 732 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 733 {
 734         struct net_device *dev;
 735
 736         rcu_read_lock();
 737         dev = dev_get_by_index_rcu(net, ifindex);
 738         if (dev)
 739                 dev_hold(dev);
 740         rcu_read_unlock();
 741         return dev;
 742 }
 743 EXPORT_SYMBOL(dev_get_by_index);
 744
 745 /**
 746  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 747  *      @net: the applicable net namespace
 748  *      @type: media type of device
 749  *      @ha: hardware address
 750  *
 751  *      Search for an interface by MAC address. Returns NULL if the device
 752  *      is not found or a pointer to the device. The caller must hold RCU
 753  *      The returned device has not had its ref count increased
 754  *      and the caller must therefore be careful about locking
 755  *
 756  */
 757
 758 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 759                                        const char *ha)
 760 {
 761         struct net_device *dev;
 762
 763         for_each_netdev_rcu(net, dev)
 764                 if (dev->type == type &&
 765                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 766                         return dev;
 767
 768         return NULL;
 769 }
 770 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 771
 772 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 773 {
 774         struct net_device *dev;
 775
 776         ASSERT_RTNL();
 777         for_each_netdev(net, dev)
 778                 if (dev->type == type)
 779                         return dev;
 780
 781         return NULL;
 782 }
 783 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 784
 785 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 786 {
 787         struct net_device *dev, *ret = NULL;
 788
 789         rcu_read_lock();
 790         for_each_netdev_rcu(net, dev)
 791                 if (dev->type == type) {
 792                         dev_hold(dev);
 793                         ret = dev;
 794                         break;
 795                 }
 796         rcu_read_unlock();
 797         return ret;
 798 }
 799 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 800
 801 /**
 802  *      dev_get_by_flags_rcu - find any device with given flags
 803  *      @net: the applicable net namespace
 804  *      @if_flags: IFF_* values
 805  *      @mask: bitmask of bits in if_flags to check
 806  *
 807  *      Search for any interface with the given flags. Returns NULL if a device
 808  *      is not found or a pointer to the device. Must be called inside
 809  *      rcu_read_lock(), and result refcount is unchanged.
 810  */
 811
 812 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 813                                     unsigned short mask)
 814 {
 815         struct net_device *dev, *ret;
 816
 817         ret = NULL;
 818         for_each_netdev_rcu(net, dev) {
 819                 if (((dev->flags ^ if_flags) & mask) == 0) {
 820                         ret = dev;
 821                         break;
 822                 }
 823         }
 824         return ret;
 825 }
 826 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 827
 828 /**
 829  *      dev_valid_name - check if name is okay for network device
 830  *      @name: name string
 831  *
 832  *      Network device names need to be valid file names to
 833  *      to allow sysfs to work.  We also disallow any kind of
 834  *      whitespace.
 835  */
 836 int dev_valid_name(const char *name)
 837 {
 838         if (*name == '\0')
 839                 return 0;
 840         if (strlen(name) >= IFNAMSIZ)
 841                 return 0;
 842         if (!strcmp(name, ".") || !strcmp(name, ".."))
 843                 return 0;
 844
 845         while (*name) {
 846                 if (*name == '/' || isspace(*name))
 847                         return 0;
 848                 name++;
 849         }
 850         return 1;
 851 }
 852 EXPORT_SYMBOL(dev_valid_name);
 853
 854 /**
 855  *      __dev_alloc_name - allocate a name for a device
 856  *      @net: network namespace to allocate the device name in
 857  *      @name: name format string
 858  *      @buf:  scratch buffer and result name string
 859  *
 860  *      Passed a format string - eg "lt%d" it will try and find a suitable
 861  *      id. It scans list of devices to build up a free map, then chooses
 862  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 863  *      while allocating the name and adding the device in order to avoid
 864  *      duplicates.
 865  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 866  *      Returns the number of the unit assigned or a negative errno code.
 867  */
 868
 869 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 870 {
 871         int i = 0;
 872         const char *p;
 873         const int max_netdevices = 8*PAGE_SIZE;
 874         unsigned long *inuse;
 875         struct net_device *d;
 876
 877         p = strnchr(name, IFNAMSIZ-1, '%');
 878         if (p) {
 879                 /*
 880                  * Verify the string as this thing may have come from
 881                  * the user.  There must be either one "%d" and no other "%"
 882                  * characters.
 883                  */
 884                 if (p[1] != 'd' || strchr(p + 2, '%'))
 885                         return -EINVAL;
 886
 887                 /* Use one page as a bit array of possible slots */
 888                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 889                 if (!inuse)
 890                         return -ENOMEM;
 891
 892                 for_each_netdev(net, d) {
 893                         if (!sscanf(d->name, name, &i))
 894                                 continue;
 895                         if (i < 0 || i >= max_netdevices)
 896                                 continue;
 897
 898                         /*  avoid cases where sscanf is not exact inverse of printf */
 899                         snprintf(buf, IFNAMSIZ, name, i);
 900                         if (!strncmp(buf, d->name, IFNAMSIZ))
 901                                 set_bit(i, inuse);
 902                 }
 903
 904                 i = find_first_zero_bit(inuse, max_netdevices);
 905                 free_page((unsigned long) inuse);
 906         }
 907
 908         if (buf != name)
 909                 snprintf(buf, IFNAMSIZ, name, i);
 910         if (!__dev_get_by_name(net, buf))
 911                 return i;
 912
 913         /* It is possible to run out of possible slots
 914          * when the name is long and there isn't enough space left
 915          * for the digits, or if all bits are used.
 916          */
 917         return -ENFILE;
 918 }
 919
 920 /**
 921  *      dev_alloc_name - allocate a name for a device
 922  *      @dev: device
 923  *      @name: name format string
 924  *
 925  *      Passed a format string - eg "lt%d" it will try and find a suitable
 926  *      id. It scans list of devices to build up a free map, then chooses
 927  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 928  *      while allocating the name and adding the device in order to avoid
 929  *      duplicates.
 930  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 931  *      Returns the number of the unit assigned or a negative errno code.
 932  */
 933
 934 int dev_alloc_name(struct net_device *dev, const char *name)
 935 {
 936         char buf[IFNAMSIZ];
 937         struct net *net;
 938         int ret;
 939
 940         BUG_ON(!dev_net(dev));
 941         net = dev_net(dev);
 942         ret = __dev_alloc_name(net, name, buf);
 943         if (ret >= 0)
 944                 strlcpy(dev->name, buf, IFNAMSIZ);
 945         return ret;
 946 }
 947 EXPORT_SYMBOL(dev_alloc_name);
 948
 949 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
 950 {
 951         struct net *net;
 952
 953         BUG_ON(!dev_net(dev));
 954         net = dev_net(dev);
 955
 956         if (!dev_valid_name(name))
 957                 return -EINVAL;
 958
 959         if (fmt && strchr(name, '%'))
 960                 return dev_alloc_name(dev, name);
 961         else if (__dev_get_by_name(net, name))
 962                 return -EEXIST;
 963         else if (dev->name != name)
 964                 strlcpy(dev->name, name, IFNAMSIZ);
 965
 966         return 0;
 967 }
 968
 969 /**
 970  *      dev_change_name - change name of a device
 971  *      @dev: device
 972  *      @newname: name (or format string) must be at least IFNAMSIZ
 973  *
 974  *      Change name of a device, can pass format strings "eth%d".
 975  *      for wildcarding.
 976  */
 977 int dev_change_name(struct net_device *dev, const char *newname)
 978 {
 979         char oldname[IFNAMSIZ];
 980         int err = 0;
 981         int ret;
 982         struct net *net;
 983
 984         ASSERT_RTNL();
 985         BUG_ON(!dev_net(dev));
 986
 987         net = dev_net(dev);
 988         if (dev->flags & IFF_UP)
 989                 return -EBUSY;
 990
 991         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 992                 return 0;
 993
 994         memcpy(oldname, dev->name, IFNAMSIZ);
 995
 996         err = dev_get_valid_name(dev, newname, 1);
 997         if (err < 0)
 998                 return err;
 999
1000 rollback:
1001         ret = device_rename(&dev->dev, dev->name);
1002         if (ret) {
1003                 memcpy(dev->name, oldname, IFNAMSIZ);
1004                 return ret;
1005         }
1006
1007         write_lock_bh(&dev_base_lock);
1008         hlist_del(&dev->name_hlist);
1009         write_unlock_bh(&dev_base_lock);
1010
1011         synchronize_rcu();
1012
1013         write_lock_bh(&dev_base_lock);
1014         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1015         write_unlock_bh(&dev_base_lock);
1016
1017         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1018         ret = notifier_to_errno(ret);
1019
1020         if (ret) {
1021                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1022                 if (err >= 0) {
1023                         err = ret;
1024                         memcpy(dev->name, oldname, IFNAMSIZ);
1025                         goto rollback;
1026                 } else {
1027                         printk(KERN_ERR
1028                                "%s: name change rollback failed: %d.\n",
1029                                dev->name, ret);
1030                 }
1031         }
1032
1033         return err;
1034 }
1035
1036 /**
1037  *      dev_set_alias - change ifalias of a device
1038  *      @dev: device
1039  *      @alias: name up to IFALIASZ
1040  *      @len: limit of bytes to copy from info
1041  *
1042  *      Set ifalias for a device,
1043  */
1044 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1045 {
1046         ASSERT_RTNL();
1047
1048         if (len >= IFALIASZ)
1049                 return -EINVAL;
1050
1051         if (!len) {
1052                 if (dev->ifalias) {
1053                         kfree(dev->ifalias);
1054                         dev->ifalias = NULL;
1055                 }
1056                 return 0;
1057         }
1058
1059         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1060         if (!dev->ifalias)
1061                 return -ENOMEM;
1062
1063         strlcpy(dev->ifalias, alias, len+1);
1064         return len;
1065 }
1066
1067
1068 /**
1069  *      netdev_features_change - device changes features
1070  *      @dev: device to cause notification
1071  *
1072  *      Called to indicate a device has changed features.
1073  */
1074 void netdev_features_change(struct net_device *dev)
1075 {
1076         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1077 }
1078 EXPORT_SYMBOL(netdev_features_change);
1079
1080 /**
1081  *      netdev_state_change - device changes state
1082  *      @dev: device to cause notification
1083  *
1084  *      Called to indicate a device has changed state. This function calls
1085  *      the notifier chains for netdev_chain and sends a NEWLINK message
1086  *      to the routing socket.
1087  */
1088 void netdev_state_change(struct net_device *dev)
1089 {
1090         if (dev->flags & IFF_UP) {
1091                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1092                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1093         }
1094 }
1095 EXPORT_SYMBOL(netdev_state_change);
1096
1097 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1098 {
1099         return call_netdevice_notifiers(event, dev);
1100 }
1101 EXPORT_SYMBOL(netdev_bonding_change);
1102
1103 /**
1104  *      dev_load        - load a network module
1105  *      @net: the applicable net namespace
1106  *      @name: name of interface
1107  *
1108  *      If a network interface is not present and the process has suitable
1109  *      privileges this function loads the module. If module loading is not
1110  *      available in this kernel then it becomes a nop.
1111  */
1112
1113 void dev_load(struct net *net, const char *name)
1114 {
1115         struct net_device *dev;
1116
1117         rcu_read_lock();
1118         dev = dev_get_by_name_rcu(net, name);
1119         rcu_read_unlock();
1120
1121         if (!dev && capable(CAP_NET_ADMIN))
1122                 request_module("%s", name);
1123 }
1124 EXPORT_SYMBOL(dev_load);
1125
1126 static int __dev_open(struct net_device *dev)
1127 {
1128         const struct net_device_ops *ops = dev->netdev_ops;
1129         int ret;
1130
1131         ASSERT_RTNL();
1132
1133         /*
1134          *      Is it even present?
1135          */
1136         if (!netif_device_present(dev))
1137                 return -ENODEV;
1138
1139         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1140         ret = notifier_to_errno(ret);
1141         if (ret)
1142                 return ret;
1143
1144         /*
1145          *      Call device private open method
1146          */
1147         set_bit(__LINK_STATE_START, &dev->state);
1148
1149         if (ops->ndo_validate_addr)
1150                 ret = ops->ndo_validate_addr(dev);
1151
1152         if (!ret && ops->ndo_open)
1153                 ret = ops->ndo_open(dev);
1154
1155         /*
1156          *      If it went open OK then:
1157          */
1158
1159         if (ret)
1160                 clear_bit(__LINK_STATE_START, &dev->state);
1161         else {
1162                 /*
1163                  *      Set the flags.
1164                  */
1165                 dev->flags |= IFF_UP;
1166
1167                 /*
1168                  *      Enable NET_DMA
1169                  */
1170                 net_dmaengine_get();
1171
1172                 /*
1173                  *      Initialize multicasting status
1174                  */
1175                 dev_set_rx_mode(dev);
1176
1177                 /*
1178                  *      Wakeup transmit queue engine
1179                  */
1180                 dev_activate(dev);
1181         }
1182
1183         return ret;
1184 }
1185
1186 /**
1187  *      dev_open        - prepare an interface for use.
1188  *      @dev:   device to open
1189  *
1190  *      Takes a device from down to up state. The device's private open
1191  *      function is invoked and then the multicast lists are loaded. Finally
1192  *      the device is moved into the up state and a %NETDEV_UP message is
1193  *      sent to the netdev notifier chain.
1194  *
1195  *      Calling this function on an active interface is a nop. On a failure
1196  *      a negative errno code is returned.
1197  */
1198 int dev_open(struct net_device *dev)
1199 {
1200         int ret;
1201
1202         /*
1203          *      Is it already up?
1204          */
1205         if (dev->flags & IFF_UP)
1206                 return 0;
1207
1208         /*
1209          *      Open device
1210          */
1211         ret = __dev_open(dev);
1212         if (ret < 0)
1213                 return ret;
1214
1215         /*
1216          *      ... and announce new interface.
1217          */
1218         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1219         call_netdevice_notifiers(NETDEV_UP, dev);
1220
1221         return ret;
1222 }
1223 EXPORT_SYMBOL(dev_open);
1224
1225 static int __dev_close_many(struct list_head *head)
1226 {
1227         struct net_device *dev;
1228
1229         ASSERT_RTNL();
1230         might_sleep();
1231
1232         list_for_each_entry(dev, head, unreg_list) {
1233                 /*
1234                  *      Tell people we are going down, so that they can
1235                  *      prepare to death, when device is still operating.
1236                  */
1237                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1238
1239                 clear_bit(__LINK_STATE_START, &dev->state);
1240
1241                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1242                  * can be even on different cpu. So just clear netif_running().
1243                  *
1244                  * dev->stop() will invoke napi_disable() on all of it's
1245                  * napi_struct instances on this device.
1246                  */
1247                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1248         }
1249
1250         dev_deactivate_many(head);
1251
1252         list_for_each_entry(dev, head, unreg_list) {
1253                 const struct net_device_ops *ops = dev->netdev_ops;
1254
1255                 /*
1256                  *      Call the device specific close. This cannot fail.
1257                  *      Only if device is UP
1258                  *
1259                  *      We allow it to be called even after a DETACH hot-plug
1260                  *      event.
1261                  */
1262                 if (ops->ndo_stop)
1263                         ops->ndo_stop(dev);
1264
1265                 /*
1266                  *      Device is now down.
1267                  */
1268
1269                 dev->flags &= ~IFF_UP;
1270
1271                 /*
1272                  *      Shutdown NET_DMA
1273                  */
1274                 net_dmaengine_put();
1275         }
1276
1277         return 0;
1278 }
1279
1280 static int __dev_close(struct net_device *dev)
1281 {
1282         LIST_HEAD(single);
1283
1284         list_add(&dev->unreg_list, &single);
1285         return __dev_close_many(&single);
1286 }
1287
1288 int dev_close_many(struct list_head *head)
1289 {
1290         struct net_device *dev, *tmp;
1291         LIST_HEAD(tmp_list);
1292
1293         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1294                 if (!(dev->flags & IFF_UP))
1295                         list_move(&dev->unreg_list, &tmp_list);
1296
1297         __dev_close_many(head);
1298
1299         /*
1300          * Tell people we are down
1301          */
1302         list_for_each_entry(dev, head, unreg_list) {
1303                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1304                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1305         }
1306
1307         /* rollback_registered_many needs the complete original list */
1308         list_splice(&tmp_list, head);
1309         return 0;
1310 }
1311
1312 /**
1313  *      dev_close - shutdown an interface.
1314  *      @dev: device to shutdown
1315  *
1316  *      This function moves an active device into down state. A
1317  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1318  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1319  *      chain.
1320  */
1321 int dev_close(struct net_device *dev)
1322 {
1323         LIST_HEAD(single);
1324
1325         list_add(&dev->unreg_list, &single);
1326         dev_close_many(&single);
1327
1328         return 0;
1329 }
1330 EXPORT_SYMBOL(dev_close);
1331
1332
1333 /**
1334  *      dev_disable_lro - disable Large Receive Offload on a device
1335  *      @dev: device
1336  *
1337  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1338  *      called under RTNL.  This is needed if received packets may be
1339  *      forwarded to another interface.
1340  */
1341 void dev_disable_lro(struct net_device *dev)
1342 {
1343         if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1344             dev->ethtool_ops->set_flags) {
1345                 u32 flags = dev->ethtool_ops->get_flags(dev);
1346                 if (flags & ETH_FLAG_LRO) {
1347                         flags &= ~ETH_FLAG_LRO;
1348                         dev->ethtool_ops->set_flags(dev, flags);
1349                 }
1350         }
1351         WARN_ON(dev->features & NETIF_F_LRO);
1352 }
1353 EXPORT_SYMBOL(dev_disable_lro);
1354
1355
1356 static int dev_boot_phase = 1;
1357
1358 /*
1359  *      Device change register/unregister. These are not inline or static
1360  *      as we export them to the world.
1361  */
1362
1363 /**
1364  *      register_netdevice_notifier - register a network notifier block
1365  *      @nb: notifier
1366  *
1367  *      Register a notifier to be called when network device events occur.
1368  *      The notifier passed is linked into the kernel structures and must
1369  *      not be reused until it has been unregistered. A negative errno code
1370  *      is returned on a failure.
1371  *
1372  *      When registered all registration and up events are replayed
1373  *      to the new notifier to allow device to have a race free
1374  *      view of the network device list.
1375  */
1376
1377 int register_netdevice_notifier(struct notifier_block *nb)
1378 {
1379         struct net_device *dev;
1380         struct net_device *last;
1381         struct net *net;
1382         int err;
1383
1384         rtnl_lock();
1385         err = raw_notifier_chain_register(&netdev_chain, nb);
1386         if (err)
1387                 goto unlock;
1388         if (dev_boot_phase)
1389                 goto unlock;
1390         for_each_net(net) {
1391                 for_each_netdev(net, dev) {
1392                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1393                         err = notifier_to_errno(err);
1394                         if (err)
1395                                 goto rollback;
1396
1397                         if (!(dev->flags & IFF_UP))
1398                                 continue;
1399
1400                         nb->notifier_call(nb, NETDEV_UP, dev);
1401                 }
1402         }
1403
1404 unlock:
1405         rtnl_unlock();
1406         return err;
1407
1408 rollback:
1409         last = dev;
1410         for_each_net(net) {
1411                 for_each_netdev(net, dev) {
1412                         if (dev == last)
1413                                 break;
1414
1415                         if (dev->flags & IFF_UP) {
1416                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1417                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1418                         }
1419                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1420                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1421                 }
1422         }
1423
1424         raw_notifier_chain_unregister(&netdev_chain, nb);
1425         goto unlock;
1426 }
1427 EXPORT_SYMBOL(register_netdevice_notifier);
1428
1429 /**
1430  *      unregister_netdevice_notifier - unregister a network notifier block
1431  *      @nb: notifier
1432  *
1433  *      Unregister a notifier previously registered by
1434  *      register_netdevice_notifier(). The notifier is unlinked into the
1435  *      kernel structures and may then be reused. A negative errno code
1436  *      is returned on a failure.
1437  */
1438
1439 int unregister_netdevice_notifier(struct notifier_block *nb)
1440 {
1441         int err;
1442
1443         rtnl_lock();
1444         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1445         rtnl_unlock();
1446         return err;
1447 }
1448 EXPORT_SYMBOL(unregister_netdevice_notifier);
1449
1450 /**
1451  *      call_netdevice_notifiers - call all network notifier blocks
1452  *      @val: value passed unmodified to notifier function
1453  *      @dev: net_device pointer passed unmodified to notifier function
1454  *
1455  *      Call all network notifier blocks.  Parameters and return value
1456  *      are as for raw_notifier_call_chain().
1457  */
1458
1459 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1460 {
1461         ASSERT_RTNL();
1462         return raw_notifier_call_chain(&netdev_chain, val, dev);
1463 }
1464
1465 /* When > 0 there are consumers of rx skb time stamps */
1466 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1467
1468 void net_enable_timestamp(void)
1469 {
1470         atomic_inc(&netstamp_needed);
1471 }
1472 EXPORT_SYMBOL(net_enable_timestamp);
1473
1474 void net_disable_timestamp(void)
1475 {
1476         atomic_dec(&netstamp_needed);
1477 }
1478 EXPORT_SYMBOL(net_disable_timestamp);
1479
1480 static inline void net_timestamp_set(struct sk_buff *skb)
1481 {
1482         if (atomic_read(&netstamp_needed))
1483                 __net_timestamp(skb);
1484         else
1485                 skb->tstamp.tv64 = 0;
1486 }
1487
1488 static inline void net_timestamp_check(struct sk_buff *skb)
1489 {
1490         if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1491                 __net_timestamp(skb);
1492 }
1493
1494 /**
1495  * dev_forward_skb - loopback an skb to another netif
1496  *
1497  * @dev: destination network device
1498  * @skb: buffer to forward
1499  *
1500  * return values:
1501  *      NET_RX_SUCCESS  (no congestion)
1502  *      NET_RX_DROP     (packet was dropped, but freed)
1503  *
1504  * dev_forward_skb can be used for injecting an skb from the
1505  * start_xmit function of one device into the receive queue
1506  * of another device.
1507  *
1508  * The receiving device may be in another namespace, so
1509  * we have to clear all information in the skb that could
1510  * impact namespace isolation.
1511  */
1512 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1513 {
1514         skb_orphan(skb);
1515         nf_reset(skb);
1516
1517         if (unlikely(!(dev->flags & IFF_UP) ||
1518                      (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
1519                 atomic_long_inc(&dev->rx_dropped);
1520                 kfree_skb(skb);
1521                 return NET_RX_DROP;
1522         }
1523         skb_set_dev(skb, dev);
1524         skb->tstamp.tv64 = 0;
1525         skb->pkt_type = PACKET_HOST;
1526         skb->protocol = eth_type_trans(skb, dev);
1527         return netif_rx(skb);
1528 }
1529 EXPORT_SYMBOL_GPL(dev_forward_skb);
1530
1531 static inline int deliver_skb(struct sk_buff *skb,
1532                               struct packet_type *pt_prev,
1533                               struct net_device *orig_dev)
1534 {
1535         atomic_inc(&skb->users);
1536         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1537 }
1538
1539 /*
1540  *      Support routine. Sends outgoing frames to any network
1541  *      taps currently in use.
1542  */
1543
1544 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1545 {
1546         struct packet_type *ptype;
1547         struct sk_buff *skb2 = NULL;
1548         struct packet_type *pt_prev = NULL;
1549
1550         rcu_read_lock();
1551         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1552                 /* Never send packets back to the socket
1553                  * they originated from - MvS (miquels@drinkel.ow.org)
1554                  */
1555                 if ((ptype->dev == dev || !ptype->dev) &&
1556                     (ptype->af_packet_priv == NULL ||
1557                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1558                         if (pt_prev) {
1559                                 deliver_skb(skb2, pt_prev, skb->dev);
1560                                 pt_prev = ptype;
1561                                 continue;
1562                         }
1563
1564                         skb2 = skb_clone(skb, GFP_ATOMIC);
1565                         if (!skb2)
1566                                 break;
1567
1568                         net_timestamp_set(skb2);
1569
1570                         /* skb->nh should be correctly
1571                            set by sender, so that the second statement is
1572                            just protection against buggy protocols.
1573                          */
1574                         skb_reset_mac_header(skb2);
1575
1576                         if (skb_network_header(skb2) < skb2->data ||
1577                             skb2->network_header > skb2->tail) {
1578                                 if (net_ratelimit())
1579                                         printk(KERN_CRIT "protocol %04x is "
1580                                                "buggy, dev %s\n",
1581                                                ntohs(skb2->protocol),
1582                                                dev->name);
1583                                 skb_reset_network_header(skb2);
1584                         }
1585
1586                         skb2->transport_header = skb2->network_header;
1587                         skb2->pkt_type = PACKET_OUTGOING;
1588                         pt_prev = ptype;
1589                 }
1590         }
1591         if (pt_prev)
1592                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1593         rcu_read_unlock();
1594 }
1595
1596 /*
1597  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1598  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1599  */
1600 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1601 {
1602         int rc;
1603
1604         if (txq < 1 || txq > dev->num_tx_queues)
1605                 return -EINVAL;
1606
1607         if (dev->reg_state == NETREG_REGISTERED) {
1608                 ASSERT_RTNL();
1609
1610                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1611                                                   txq);
1612                 if (rc)
1613                         return rc;
1614
1615                 if (txq < dev->real_num_tx_queues)
1616                         qdisc_reset_all_tx_gt(dev, txq);
1617         }
1618
1619         dev->real_num_tx_queues = txq;
1620         return 0;
1621 }
1622 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1623
1624 #ifdef CONFIG_RPS
1625 /**
1626  *      netif_set_real_num_rx_queues - set actual number of RX queues used
1627  *      @dev: Network device
1628  *      @rxq: Actual number of RX queues
1629  *
1630  *      This must be called either with the rtnl_lock held or before
1631  *      registration of the net device.  Returns 0 on success, or a
1632  *      negative error code.  If called before registration, it always
1633  *      succeeds.
1634  */
1635 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1636 {
1637         int rc;
1638
1639         if (rxq < 1 || rxq > dev->num_rx_queues)
1640                 return -EINVAL;
1641
1642         if (dev->reg_state == NETREG_REGISTERED) {
1643                 ASSERT_RTNL();
1644
1645                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1646                                                   rxq);
1647                 if (rc)
1648                         return rc;
1649         }
1650
1651         dev->real_num_rx_queues = rxq;
1652         return 0;
1653 }
1654 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1655 #endif
1656
1657 static inline void __netif_reschedule(struct Qdisc *q)
1658 {
1659         struct softnet_data *sd;
1660         unsigned long flags;
1661
1662         local_irq_save(flags);
1663         sd = &__get_cpu_var(softnet_data);
1664         q->next_sched = NULL;
1665         *sd->output_queue_tailp = q;
1666         sd->output_queue_tailp = &q->next_sched;
1667         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1668         local_irq_restore(flags);
1669 }
1670
1671 void __netif_schedule(struct Qdisc *q)
1672 {
1673         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1674                 __netif_reschedule(q);
1675 }
1676 EXPORT_SYMBOL(__netif_schedule);
1677
1678 void dev_kfree_skb_irq(struct sk_buff *skb)
1679 {
1680         if (atomic_dec_and_test(&skb->users)) {
1681                 struct softnet_data *sd;
1682                 unsigned long flags;
1683
1684                 local_irq_save(flags);
1685                 sd = &__get_cpu_var(softnet_data);
1686                 skb->next = sd->completion_queue;
1687                 sd->completion_queue = skb;
1688                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1689                 local_irq_restore(flags);
1690         }
1691 }
1692 EXPORT_SYMBOL(dev_kfree_skb_irq);
1693
1694 void dev_kfree_skb_any(struct sk_buff *skb)
1695 {
1696         if (in_irq() || irqs_disabled())
1697                 dev_kfree_skb_irq(skb);
1698         else
1699                 dev_kfree_skb(skb);
1700 }
1701 EXPORT_SYMBOL(dev_kfree_skb_any);
1702
1703
1704 /**
1705  * netif_device_detach - mark device as removed
1706  * @dev: network device
1707  *
1708  * Mark device as removed from system and therefore no longer available.
1709  */
1710 void netif_device_detach(struct net_device *dev)
1711 {
1712         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1713             netif_running(dev)) {
1714                 netif_tx_stop_all_queues(dev);
1715         }
1716 }
1717 EXPORT_SYMBOL(netif_device_detach);
1718
1719 /**
1720  * netif_device_attach - mark device as attached
1721  * @dev: network device
1722  *
1723  * Mark device as attached from system and restart if needed.
1724  */
1725 void netif_device_attach(struct net_device *dev)
1726 {
1727         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1728             netif_running(dev)) {
1729                 netif_tx_wake_all_queues(dev);
1730                 __netdev_watchdog_up(dev);
1731         }
1732 }
1733 EXPORT_SYMBOL(netif_device_attach);
1734
1735 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1736 {
1737         return ((features & NETIF_F_GEN_CSUM) ||
1738                 ((features & NETIF_F_V4_CSUM) &&
1739                  protocol == htons(ETH_P_IP)) ||
1740                 ((features & NETIF_F_V6_CSUM) &&
1741                  protocol == htons(ETH_P_IPV6)) ||
1742                 ((features & NETIF_F_FCOE_CRC) &&
1743                  protocol == htons(ETH_P_FCOE)));
1744 }
1745
1746 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1747 {
1748         __be16 protocol = skb->protocol;
1749         int features = dev->features;
1750
1751         if (vlan_tx_tag_present(skb)) {
1752                 features &= dev->vlan_features;
1753         } else if (protocol == htons(ETH_P_8021Q)) {
1754                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1755                 protocol = veh->h_vlan_encapsulated_proto;
1756                 features &= dev->vlan_features;
1757         }
1758
1759         return can_checksum_protocol(features, protocol);
1760 }
1761
1762 /**
1763  * skb_dev_set -- assign a new device to a buffer
1764  * @skb: buffer for the new device
1765  * @dev: network device
1766  *
1767  * If an skb is owned by a device already, we have to reset
1768  * all data private to the namespace a device belongs to
1769  * before assigning it a new device.
1770  */
1771 #ifdef CONFIG_NET_NS
1772 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1773 {
1774         skb_dst_drop(skb);
1775         if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1776                 secpath_reset(skb);
1777                 nf_reset(skb);
1778                 skb_init_secmark(skb);
1779                 skb->mark = 0;
1780                 skb->priority = 0;
1781                 skb->nf_trace = 0;
1782                 skb->ipvs_property = 0;
1783 #ifdef CONFIG_NET_SCHED
1784                 skb->tc_index = 0;
1785 #endif
1786         }
1787         skb->dev = dev;
1788 }
1789 EXPORT_SYMBOL(skb_set_dev);
1790 #endif /* CONFIG_NET_NS */
1791
1792 /*
1793  * Invalidate hardware checksum when packet is to be mangled, and
1794  * complete checksum manually on outgoing path.
1795  */
1796 int skb_checksum_help(struct sk_buff *skb)
1797 {
1798         __wsum csum;
1799         int ret = 0, offset;
1800
1801         if (skb->ip_summed == CHECKSUM_COMPLETE)
1802                 goto out_set_summed;
1803
1804         if (unlikely(skb_shinfo(skb)->gso_size)) {
1805                 /* Let GSO fix up the checksum. */
1806                 goto out_set_summed;
1807         }
1808
1809         offset = skb_checksum_start_offset(skb);
1810         BUG_ON(offset >= skb_headlen(skb));
1811         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1812
1813         offset += skb->csum_offset;
1814         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1815
1816         if (skb_cloned(skb) &&
1817             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1818                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1819                 if (ret)
1820                         goto out;
1821         }
1822
1823         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1824 out_set_summed:
1825         skb->ip_summed = CHECKSUM_NONE;
1826 out:
1827         return ret;
1828 }
1829 EXPORT_SYMBOL(skb_checksum_help);
1830
1831 /**
1832  *      skb_gso_segment - Perform segmentation on skb.
1833  *      @skb: buffer to segment
1834  *      @features: features for the output path (see dev->features)
1835  *
1836  *      This function segments the given skb and returns a list of segments.
1837  *
1838  *      It may return NULL if the skb requires no segmentation.  This is
1839  *      only possible when GSO is used for verifying header integrity.
1840  */
1841 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1842 {
1843         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1844         struct packet_type *ptype;
1845         __be16 type = skb->protocol;
1846         int vlan_depth = ETH_HLEN;
1847         int err;
1848
1849         while (type == htons(ETH_P_8021Q)) {
1850                 struct vlan_hdr *vh;
1851
1852                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1853                         return ERR_PTR(-EINVAL);
1854
1855                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1856                 type = vh->h_vlan_encapsulated_proto;
1857                 vlan_depth += VLAN_HLEN;
1858         }
1859
1860         skb_reset_mac_header(skb);
1861         skb->mac_len = skb->network_header - skb->mac_header;
1862         __skb_pull(skb, skb->mac_len);
1863
1864         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1865                 struct net_device *dev = skb->dev;
1866                 struct ethtool_drvinfo info = {};
1867
1868                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1869                         dev->ethtool_ops->get_drvinfo(dev, &info);
1870
1871                 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1872                      info.driver, dev ? dev->features : 0L,
1873                      skb->sk ? skb->sk->sk_route_caps : 0L,
1874                      skb->len, skb->data_len, skb->ip_summed);
1875
1876                 if (skb_header_cloned(skb) &&
1877                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1878                         return ERR_PTR(err);
1879         }
1880
1881         rcu_read_lock();
1882         list_for_each_entry_rcu(ptype,
1883                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1884                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1885                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1886                                 err = ptype->gso_send_check(skb);
1887                                 segs = ERR_PTR(err);
1888                                 if (err || skb_gso_ok(skb, features))
1889                                         break;
1890                                 __skb_push(skb, (skb->data -
1891                                                  skb_network_header(skb)));
1892                         }
1893                         segs = ptype->gso_segment(skb, features);
1894                         break;
1895                 }
1896         }
1897         rcu_read_unlock();
1898
1899         __skb_push(skb, skb->data - skb_mac_header(skb));
1900
1901         return segs;
1902 }
1903 EXPORT_SYMBOL(skb_gso_segment);
1904
1905 /* Take action when hardware reception checksum errors are detected. */
1906 #ifdef CONFIG_BUG
1907 void netdev_rx_csum_fault(struct net_device *dev)
1908 {
1909         if (net_ratelimit()) {
1910                 printk(KERN_ERR "%s: hw csum failure.\n",
1911                         dev ? dev->name : "<unknown>");
1912                 dump_stack();
1913         }
1914 }
1915 EXPORT_SYMBOL(netdev_rx_csum_fault);
1916 #endif
1917
1918 /* Actually, we should eliminate this check as soon as we know, that:
1919  * 1. IOMMU is present and allows to map all the memory.
1920  * 2. No high memory really exists on this machine.
1921  */
1922
1923 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1924 {
1925 #ifdef CONFIG_HIGHMEM
1926         int i;
1927         if (!(dev->features & NETIF_F_HIGHDMA)) {
1928                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1929                         if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1930                                 return 1;
1931         }
1932
1933         if (PCI_DMA_BUS_IS_PHYS) {
1934                 struct device *pdev = dev->dev.parent;
1935
1936                 if (!pdev)
1937                         return 0;
1938                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1939                         dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1940                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1941                                 return 1;
1942                 }
1943         }
1944 #endif
1945         return 0;
1946 }
1947
1948 struct dev_gso_cb {
1949         void (*destructor)(struct sk_buff *skb);
1950 };
1951
1952 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1953
1954 static void dev_gso_skb_destructor(struct sk_buff *skb)
1955 {
1956         struct dev_gso_cb *cb;
1957
1958         do {
1959                 struct sk_buff *nskb = skb->next;
1960
1961                 skb->next = nskb->next;
1962                 nskb->next = NULL;
1963                 kfree_skb(nskb);
1964         } while (skb->next);
1965
1966         cb = DEV_GSO_CB(skb);
1967         if (cb->destructor)
1968                 cb->destructor(skb);
1969 }
1970
1971 /**
1972  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1973  *      @skb: buffer to segment
1974  *      @features: device features as applicable to this skb
1975  *
1976  *      This function segments the given skb and stores the list of segments
1977  *      in skb->next.
1978  */
1979 static int dev_gso_segment(struct sk_buff *skb, int features)
1980 {
1981         struct sk_buff *segs;
1982
1983         segs = skb_gso_segment(skb, features);
1984
1985         /* Verifying header integrity only. */
1986         if (!segs)
1987                 return 0;
1988
1989         if (IS_ERR(segs))
1990                 return PTR_ERR(segs);
1991
1992         skb->next = segs;
1993         DEV_GSO_CB(skb)->destructor = skb->destructor;
1994         skb->destructor = dev_gso_skb_destructor;
1995
1996         return 0;
1997 }
1998
1999 /*
2000  * Try to orphan skb early, right before transmission by the device.
2001  * We cannot orphan skb if tx timestamp is requested or the sk-reference
2002  * is needed on driver level for other reasons, e.g. see net/can/raw.c
2003  */
2004 static inline void skb_orphan_try(struct sk_buff *skb)
2005 {
2006         struct sock *sk = skb->sk;
2007
2008         if (sk && !skb_shinfo(skb)->tx_flags) {
2009                 /* skb_tx_hash() wont be able to get sk.
2010                  * We copy sk_hash into skb->rxhash
2011                  */
2012                 if (!skb->rxhash)
2013                         skb->rxhash = sk->sk_hash;
2014                 skb_orphan(skb);
2015         }
2016 }
2017
2018 static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features)
2019 {
2020         if (!can_checksum_protocol(protocol, features)) {
2021                 features &= ~NETIF_F_ALL_CSUM;
2022                 features &= ~NETIF_F_SG;
2023         } else if (illegal_highdma(skb->dev, skb)) {
2024                 features &= ~NETIF_F_SG;
2025         }
2026
2027         return features;
2028 }
2029
2030 int netif_skb_features(struct sk_buff *skb)
2031 {
2032         __be16 protocol = skb->protocol;
2033         int features = skb->dev->features;
2034
2035         if (protocol == htons(ETH_P_8021Q)) {
2036                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2037                 protocol = veh->h_vlan_encapsulated_proto;
2038         } else if (!vlan_tx_tag_present(skb)) {
2039                 return harmonize_features(skb, protocol, features);
2040         }
2041
2042         features &= skb->dev->vlan_features;
2043
2044         if (protocol != htons(ETH_P_8021Q)) {
2045                 return harmonize_features(skb, protocol, features);
2046         } else {
2047                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2048                                 NETIF_F_GEN_CSUM;
2049                 return harmonize_features(skb, protocol, features);
2050         }
2051 }
2052 EXPORT_SYMBOL(netif_skb_features);
2053
2054 /*
2055  * Returns true if either:
2056  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2057  *      2. skb is fragmented and the device does not support SG, or if
2058  *         at least one of fragments is in highmem and device does not
2059  *         support DMA from it.
2060  */
2061 static inline int skb_needs_linearize(struct sk_buff *skb,
2062                                       struct net_device *dev)
2063 {
2064         if (skb_is_nonlinear(skb)) {
2065                 int features = dev->features;
2066
2067                 if (vlan_tx_tag_present(skb))
2068                         features &= dev->vlan_features;
2069
2070                 return (skb_has_frag_list(skb) &&
2071                         !(features & NETIF_F_FRAGLIST)) ||
2072                         (skb_shinfo(skb)->nr_frags &&
2073                         (!(features & NETIF_F_SG) ||
2074                         illegal_highdma(dev, skb)));
2075         }
2076
2077         return 0;
2078 }
2079
2080 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2081                         struct netdev_queue *txq)
2082 {
2083         const struct net_device_ops *ops = dev->netdev_ops;
2084         int rc = NETDEV_TX_OK;
2085
2086         if (likely(!skb->next)) {
2087                 int features;
2088
2089                 /*
2090                  * If device doesnt need skb->dst, release it right now while
2091                  * its hot in this cpu cache
2092                  */
2093                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2094                         skb_dst_drop(skb);
2095
2096                 if (!list_empty(&ptype_all))
2097                         dev_queue_xmit_nit(skb, dev);
2098
2099                 skb_orphan_try(skb);
2100
2101                 features = netif_skb_features(skb);
2102
2103                 if (vlan_tx_tag_present(skb) &&
2104                     !(features & NETIF_F_HW_VLAN_TX)) {
2105                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2106                         if (unlikely(!skb))
2107                                 goto out;
2108
2109                         skb->vlan_tci = 0;
2110                 }
2111
2112                 if (netif_needs_gso(skb, features)) {
2113                         if (unlikely(dev_gso_segment(skb, features)))
2114                                 goto out_kfree_skb;
2115                         if (skb->next)
2116                                 goto gso;
2117                 } else {
2118                         if (skb_needs_linearize(skb, dev) &&
2119                             __skb_linearize(skb))
2120                                 goto out_kfree_skb;
2121
2122                         /* If packet is not checksummed and device does not
2123                          * support checksumming for this protocol, complete
2124                          * checksumming here.
2125                          */
2126                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2127                                 skb_set_transport_header(skb,
2128                                         skb_checksum_start_offset(skb));
2129                                 if (!dev_can_checksum(dev, skb) &&
2130                                      skb_checksum_help(skb))
2131                                         goto out_kfree_skb;
2132                         }
2133                 }
2134
2135                 rc = ops->ndo_start_xmit(skb, dev);
2136                 trace_net_dev_xmit(skb, rc);
2137                 if (rc == NETDEV_TX_OK)
2138                         txq_trans_update(txq);
2139                 return rc;
2140         }
2141
2142 gso:
2143         do {
2144                 struct sk_buff *nskb = skb->next;
2145
2146                 skb->next = nskb->next;
2147                 nskb->next = NULL;
2148
2149                 /*
2150                  * If device doesnt need nskb->dst, release it right now while
2151                  * its hot in this cpu cache
2152                  */
2153                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2154                         skb_dst_drop(nskb);
2155
2156                 rc = ops->ndo_start_xmit(nskb, dev);
2157                 trace_net_dev_xmit(nskb, rc);
2158                 if (unlikely(rc != NETDEV_TX_OK)) {
2159                         if (rc & ~NETDEV_TX_MASK)
2160                                 goto out_kfree_gso_skb;
2161                         nskb->next = skb->next;
2162                         skb->next = nskb;
2163                         return rc;
2164                 }
2165                 txq_trans_update(txq);
2166                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2167                         return NETDEV_TX_BUSY;
2168         } while (skb->next);
2169
2170 out_kfree_gso_skb:
2171         if (likely(skb->next == NULL))
2172                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2173 out_kfree_skb:
2174         kfree_skb(skb);
2175 out:
2176         return rc;
2177 }
2178
2179 static u32 hashrnd __read_mostly;
2180
2181 /*
2182  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2183  * to be used as a distribution range.
2184  */
2185 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2186                   unsigned int num_tx_queues)
2187 {
2188         u32 hash;
2189
2190         if (skb_rx_queue_recorded(skb)) {
2191                 hash = skb_get_rx_queue(skb);
2192                 while (unlikely(hash >= num_tx_queues))
2193                         hash -= num_tx_queues;
2194                 return hash;
2195         }
2196
2197         if (skb->sk && skb->sk->sk_hash)
2198                 hash = skb->sk->sk_hash;
2199         else
2200                 hash = (__force u16) skb->protocol ^ skb->rxhash;
2201         hash = jhash_1word(hash, hashrnd);
2202
2203         return (u16) (((u64) hash * num_tx_queues) >> 32);
2204 }
2205 EXPORT_SYMBOL(__skb_tx_hash);
2206
2207 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2208 {
2209         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2210                 if (net_ratelimit()) {
2211                         pr_warning("%s selects TX queue %d, but "
2212                                 "real number of TX queues is %d\n",
2213                                 dev->name, queue_index, dev->real_num_tx_queues);
2214                 }
2215                 return 0;
2216         }
2217         return queue_index;
2218 }
2219
2220 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2221 {
2222 #ifdef CONFIG_XPS
2223         struct xps_dev_maps *dev_maps;
2224         struct xps_map *map;
2225         int queue_index = -1;
2226
2227         rcu_read_lock();
2228         dev_maps = rcu_dereference(dev->xps_maps);
2229         if (dev_maps) {
2230                 map = rcu_dereference(
2231                     dev_maps->cpu_map[raw_smp_processor_id()]);
2232                 if (map) {
2233                         if (map->len == 1)
2234                                 queue_index = map->queues[0];
2235                         else {
2236                                 u32 hash;
2237                                 if (skb->sk && skb->sk->sk_hash)
2238                                         hash = skb->sk->sk_hash;
2239                                 else
2240                                         hash = (__force u16) skb->protocol ^
2241                                             skb->rxhash;
2242                                 hash = jhash_1word(hash, hashrnd);
2243                                 queue_index = map->queues[
2244                                     ((u64)hash * map->len) >> 32];
2245                         }
2246                         if (unlikely(queue_index >= dev->real_num_tx_queues))
2247                                 queue_index = -1;
2248                 }
2249         }
2250         rcu_read_unlock();
2251
2252         return queue_index;
2253 #else
2254         return -1;
2255 #endif
2256 }
2257
2258 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2259                                         struct sk_buff *skb)
2260 {
2261         int queue_index;
2262         const struct net_device_ops *ops = dev->netdev_ops;
2263
2264         if (dev->real_num_tx_queues == 1)
2265                 queue_index = 0;
2266         else if (ops->ndo_select_queue) {
2267                 queue_index = ops->ndo_select_queue(dev, skb);
2268                 queue_index = dev_cap_txqueue(dev, queue_index);
2269         } else {
2270                 struct sock *sk = skb->sk;
2271                 queue_index = sk_tx_queue_get(sk);
2272
2273                 if (queue_index < 0 || skb->ooo_okay ||
2274                     queue_index >= dev->real_num_tx_queues) {
2275                         int old_index = queue_index;
2276
2277                         queue_index = get_xps_queue(dev, skb);
2278                         if (queue_index < 0)
2279                                 queue_index = skb_tx_hash(dev, skb);
2280
2281                         if (queue_index != old_index && sk) {
2282                                 struct dst_entry *dst =
2283                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2284
2285                                 if (dst && skb_dst(skb) == dst)
2286                                         sk_tx_queue_set(sk, queue_index);
2287                         }
2288                 }
2289         }
2290
2291         skb_set_queue_mapping(skb, queue_index);
2292         return netdev_get_tx_queue(dev, queue_index);
2293 }
2294
2295 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2296                                  struct net_device *dev,
2297                                  struct netdev_queue *txq)
2298 {
2299         spinlock_t *root_lock = qdisc_lock(q);
2300         bool contended = qdisc_is_running(q);
2301         int rc;
2302
2303         /*
2304          * Heuristic to force contended enqueues to serialize on a
2305          * separate lock before trying to get qdisc main lock.
2306          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2307          * and dequeue packets faster.
2308          */
2309         if (unlikely(contended))
2310                 spin_lock(&q->busylock);
2311
2312         spin_lock(root_lock);
2313         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2314                 kfree_skb(skb);
2315                 rc = NET_XMIT_DROP;
2316         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2317                    qdisc_run_begin(q)) {
2318                 /*
2319                  * This is a work-conserving queue; there are no old skbs
2320                  * waiting to be sent out; and the qdisc is not running -
2321                  * xmit the skb directly.
2322                  */
2323                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2324                         skb_dst_force(skb);
2325                 __qdisc_update_bstats(q, skb->len);
2326                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2327                         if (unlikely(contended)) {
2328                                 spin_unlock(&q->busylock);
2329                                 contended = false;
2330                         }
2331                         __qdisc_run(q);
2332                 } else
2333                         qdisc_run_end(q);
2334
2335                 rc = NET_XMIT_SUCCESS;
2336         } else {
2337                 skb_dst_force(skb);
2338                 rc = qdisc_enqueue_root(skb, q);
2339                 if (qdisc_run_begin(q)) {
2340                         if (unlikely(contended)) {
2341                                 spin_unlock(&q->busylock);
2342                                 contended = false;
2343                         }
2344                         __qdisc_run(q);
2345                 }
2346         }
2347         spin_unlock(root_lock);
2348         if (unlikely(contended))
2349                 spin_unlock(&q->busylock);
2350         return rc;
2351 }
2352
2353 static DEFINE_PER_CPU(int, xmit_recursion);
2354 #define RECURSION_LIMIT 10
2355
2356 /**
2357  *      dev_queue_xmit - transmit a buffer
2358  *      @skb: buffer to transmit
2359  *
2360  *      Queue a buffer for transmission to a network device. The caller must
2361  *      have set the device and priority and built the buffer before calling
2362  *      this function. The function can be called from an interrupt.
2363  *
2364  *      A negative errno code is returned on a failure. A success does not
2365  *      guarantee the frame will be transmitted as it may be dropped due
2366  *      to congestion or traffic shaping.
2367  *
2368  * -----------------------------------------------------------------------------------
2369  *      I notice this method can also return errors from the queue disciplines,
2370  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2371  *      be positive.
2372  *
2373  *      Regardless of the return value, the skb is consumed, so it is currently
2374  *      difficult to retry a send to this method.  (You can bump the ref count
2375  *      before sending to hold a reference for retry if you are careful.)
2376  *
2377  *      When calling this method, interrupts MUST be enabled.  This is because
2378  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2379  *          --BLG
2380  */
2381 int dev_queue_xmit(struct sk_buff *skb)
2382 {
2383         struct net_device *dev = skb->dev;
2384         struct netdev_queue *txq;
2385         struct Qdisc *q;
2386         int rc = -ENOMEM;
2387
2388         /* Disable soft irqs for various locks below. Also
2389          * stops preemption for RCU.
2390          */
2391         rcu_read_lock_bh();
2392
2393         txq = dev_pick_tx(dev, skb);
2394         q = rcu_dereference_bh(txq->qdisc);
2395
2396 #ifdef CONFIG_NET_CLS_ACT
2397         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2398 #endif
2399         trace_net_dev_queue(skb);
2400         if (q->enqueue) {
2401                 rc = __dev_xmit_skb(skb, q, dev, txq);
2402                 goto out;
2403         }
2404
2405         /* The device has no queue. Common case for software devices:
2406            loopback, all the sorts of tunnels...
2407
2408            Really, it is unlikely that netif_tx_lock protection is necessary
2409            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2410            counters.)
2411            However, it is possible, that they rely on protection
2412            made by us here.
2413
2414            Check this and shot the lock. It is not prone from deadlocks.
2415            Either shot noqueue qdisc, it is even simpler 8)
2416          */
2417         if (dev->flags & IFF_UP) {
2418                 int cpu = smp_processor_id(); /* ok because BHs are off */
2419
2420                 if (txq->xmit_lock_owner != cpu) {
2421
2422                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2423                                 goto recursion_alert;
2424
2425                         HARD_TX_LOCK(dev, txq, cpu);
2426
2427                         if (!netif_tx_queue_stopped(txq)) {
2428                                 __this_cpu_inc(xmit_recursion);
2429                                 rc = dev_hard_start_xmit(skb, dev, txq);
2430                                 __this_cpu_dec(xmit_recursion);
2431                                 if (dev_xmit_complete(rc)) {
2432                                         HARD_TX_UNLOCK(dev, txq);
2433                                         goto out;
2434                                 }
2435                         }
2436                         HARD_TX_UNLOCK(dev, txq);
2437                         if (net_ratelimit())
2438                                 printk(KERN_CRIT "Virtual device %s asks to "
2439                                        "queue packet!\n", dev->name);
2440                 } else {
2441                         /* Recursion is detected! It is possible,
2442                          * unfortunately
2443                          */
2444 recursion_alert:
2445                         if (net_ratelimit())
2446                                 printk(KERN_CRIT "Dead loop on virtual device "
2447                                        "%s, fix it urgently!\n", dev->name);
2448                 }
2449         }
2450
2451         rc = -ENETDOWN;
2452         rcu_read_unlock_bh();
2453
2454         kfree_skb(skb);
2455         return rc;
2456 out:
2457         rcu_read_unlock_bh();
2458         return rc;
2459 }
2460 EXPORT_SYMBOL(dev_queue_xmit);
2461
2462
2463 /*=======================================================================
2464                         Receiver routines
2465   =======================================================================*/
2466
2467 int netdev_max_backlog __read_mostly = 1000;
2468 int netdev_tstamp_prequeue __read_mostly = 1;
2469 int netdev_budget __read_mostly = 300;
2470 int weight_p __read_mostly = 64;            /* old backlog weight */
2471
2472 /* Called with irq disabled */
2473 static inline void ____napi_schedule(struct softnet_data *sd,
2474                                      struct napi_struct *napi)
2475 {
2476         list_add_tail(&napi->poll_list, &sd->poll_list);
2477         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2478 }
2479
2480 /*
2481  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2482  * and src/dst port numbers. Returns a non-zero hash number on success
2483  * and 0 on failure.
2484  */
2485 __u32 __skb_get_rxhash(struct sk_buff *skb)
2486 {
2487         int nhoff, hash = 0, poff;
2488         struct ipv6hdr *ip6;
2489         struct iphdr *ip;
2490         u8 ip_proto;
2491         u32 addr1, addr2, ihl;
2492         union {
2493                 u32 v32;
2494                 u16 v16[2];
2495         } ports;
2496
2497         nhoff = skb_network_offset(skb);
2498
2499         switch (skb->protocol) {
2500         case __constant_htons(ETH_P_IP):
2501                 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2502                         goto done;
2503
2504                 ip = (struct iphdr *) (skb->data + nhoff);
2505                 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2506                         ip_proto = 0;
2507                 else
2508                         ip_proto = ip->protocol;
2509                 addr1 = (__force u32) ip->saddr;
2510                 addr2 = (__force u32) ip->daddr;
2511                 ihl = ip->ihl;
2512                 break;
2513         case __constant_htons(ETH_P_IPV6):
2514                 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2515                         goto done;
2516
2517                 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2518                 ip_proto = ip6->nexthdr;
2519                 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2520                 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2521                 ihl = (40 >> 2);
2522                 break;
2523         default:
2524                 goto done;
2525         }
2526
2527         ports.v32 = 0;
2528         poff = proto_ports_offset(ip_proto);
2529         if (poff >= 0) {
2530                 nhoff += ihl * 4 + poff;
2531                 if (pskb_may_pull(skb, nhoff + 4)) {
2532                         ports.v32 = * (__force u32 *) (skb->data + nhoff);
2533                         if (ports.v16[1] < ports.v16[0])
2534                                 swap(ports.v16[0], ports.v16[1]);
2535                 }
2536         }
2537
2538         /* get a consistent hash (same value on both flow directions) */
2539         if (addr2 < addr1)
2540                 swap(addr1, addr2);
2541
2542         hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2543         if (!hash)
2544                 hash = 1;
2545
2546 done:
2547         return hash;
2548 }
2549 EXPORT_SYMBOL(__skb_get_rxhash);
2550
2551 #ifdef CONFIG_RPS
2552
2553 /* One global table that all flow-based protocols share. */
2554 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2555 EXPORT_SYMBOL(rps_sock_flow_table);
2556
2557 /*
2558  * get_rps_cpu is called from netif_receive_skb and returns the target
2559  * CPU from the RPS map of the receiving queue for a given skb.
2560  * rcu_read_lock must be held on entry.
2561  */
2562 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2563                        struct rps_dev_flow **rflowp)
2564 {
2565         struct netdev_rx_queue *rxqueue;
2566         struct rps_map *map;
2567         struct rps_dev_flow_table *flow_table;
2568         struct rps_sock_flow_table *sock_flow_table;
2569         int cpu = -1;
2570         u16 tcpu;
2571
2572         if (skb_rx_queue_recorded(skb)) {
2573                 u16 index = skb_get_rx_queue(skb);
2574                 if (unlikely(index >= dev->real_num_rx_queues)) {
2575                         WARN_ONCE(dev->real_num_rx_queues > 1,
2576                                   "%s received packet on queue %u, but number "
2577                                   "of RX queues is %u\n",
2578                                   dev->name, index, dev->real_num_rx_queues);
2579                         goto done;
2580                 }
2581                 rxqueue = dev->_rx + index;
2582         } else
2583                 rxqueue = dev->_rx;
2584
2585         map = rcu_dereference(rxqueue->rps_map);
2586         if (map) {
2587                 if (map->len == 1) {
2588                         tcpu = map->cpus[0];
2589                         if (cpu_online(tcpu))
2590                                 cpu = tcpu;
2591                         goto done;
2592                 }
2593         } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2594                 goto done;
2595         }
2596
2597         skb_reset_network_header(skb);
2598         if (!skb_get_rxhash(skb))
2599                 goto done;
2600
2601         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2602         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2603         if (flow_table && sock_flow_table) {
2604                 u16 next_cpu;
2605                 struct rps_dev_flow *rflow;
2606
2607                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2608                 tcpu = rflow->cpu;
2609
2610                 next_cpu = sock_flow_table->ents[skb->rxhash &
2611                     sock_flow_table->mask];
2612
2613                 /*
2614                  * If the desired CPU (where last recvmsg was done) is
2615                  * different from current CPU (one in the rx-queue flow
2616                  * table entry), switch if one of the following holds:
2617                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2618                  *   - Current CPU is offline.
2619                  *   - The current CPU's queue tail has advanced beyond the
2620                  *     last packet that was enqueued using this table entry.
2621                  *     This guarantees that all previous packets for the flow
2622                  *     have been dequeued, thus preserving in order delivery.
2623                  */
2624                 if (unlikely(tcpu != next_cpu) &&
2625                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2626                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2627                       rflow->last_qtail)) >= 0)) {
2628                         tcpu = rflow->cpu = next_cpu;
2629                         if (tcpu != RPS_NO_CPU)
2630                                 rflow->last_qtail = per_cpu(softnet_data,
2631                                     tcpu).input_queue_head;
2632                 }
2633                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2634                         *rflowp = rflow;
2635                         cpu = tcpu;
2636                         goto done;
2637                 }
2638         }
2639
2640         if (map) {
2641                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2642
2643                 if (cpu_online(tcpu)) {
2644                         cpu = tcpu;
2645                         goto done;
2646                 }
2647         }
2648
2649 done:
2650         return cpu;
2651 }
2652
2653 /* Called from hardirq (IPI) context */
2654 static void rps_trigger_softirq(void *data)
2655 {
2656         struct softnet_data *sd = data;
2657
2658         ____napi_schedule(sd, &sd->backlog);
2659         sd->received_rps++;
2660 }
2661
2662 #endif /* CONFIG_RPS */
2663
2664 /*
2665  * Check if this softnet_data structure is another cpu one
2666  * If yes, queue it to our IPI list and return 1
2667  * If no, return 0
2668  */
2669 static int rps_ipi_queued(struct softnet_data *sd)
2670 {
2671 #ifdef CONFIG_RPS
2672         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2673
2674         if (sd != mysd) {
2675                 sd->rps_ipi_next = mysd->rps_ipi_list;
2676                 mysd->rps_ipi_list = sd;
2677
2678                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2679                 return 1;
2680         }
2681 #endif /* CONFIG_RPS */
2682         return 0;
2683 }
2684
2685 /*
2686  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2687  * queue (may be a remote CPU queue).
2688  */
2689 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2690                               unsigned int *qtail)
2691 {
2692         struct softnet_data *sd;
2693         unsigned long flags;
2694
2695         sd = &per_cpu(softnet_data, cpu);
2696
2697         local_irq_save(flags);
2698
2699         rps_lock(sd);
2700         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2701                 if (skb_queue_len(&sd->input_pkt_queue)) {
2702 enqueue:
2703                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2704                         input_queue_tail_incr_save(sd, qtail);
2705                         rps_unlock(sd);
2706                         local_irq_restore(flags);
2707                         return NET_RX_SUCCESS;
2708                 }
2709
2710                 /* Schedule NAPI for backlog device
2711                  * We can use non atomic operation since we own the queue lock
2712                  */
2713                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2714                         if (!rps_ipi_queued(sd))
2715                                 ____napi_schedule(sd, &sd->backlog);
2716                 }
2717                 goto enqueue;
2718         }
2719
2720         sd->dropped++;
2721         rps_unlock(sd);
2722
2723         local_irq_restore(flags);
2724
2725         atomic_long_inc(&skb->dev->rx_dropped);
2726         kfree_skb(skb);
2727         return NET_RX_DROP;
2728 }
2729
2730 /**
2731  *      netif_rx        -       post buffer to the network code
2732  *      @skb: buffer to post
2733  *
2734  *      This function receives a packet from a device driver and queues it for
2735  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2736  *      may be dropped during processing for congestion control or by the
2737  *      protocol layers.
2738  *
2739  *      return values:
2740  *      NET_RX_SUCCESS  (no congestion)
2741  *      NET_RX_DROP     (packet was dropped)
2742  *
2743  */
2744
2745 int netif_rx(struct sk_buff *skb)
2746 {
2747         int ret;
2748
2749         /* if netpoll wants it, pretend we never saw it */
2750         if (netpoll_rx(skb))
2751                 return NET_RX_DROP;
2752
2753         if (netdev_tstamp_prequeue)
2754                 net_timestamp_check(skb);
2755
2756         trace_netif_rx(skb);
2757 #ifdef CONFIG_RPS
2758         {
2759                 struct rps_dev_flow voidflow, *rflow = &voidflow;
2760                 int cpu;
2761
2762                 preempt_disable();
2763                 rcu_read_lock();
2764
2765                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2766                 if (cpu < 0)
2767                         cpu = smp_processor_id();
2768
2769                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2770
2771                 rcu_read_unlock();
2772                 preempt_enable();
2773         }
2774 #else
2775         {
2776                 unsigned int qtail;
2777                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2778                 put_cpu();
2779         }
2780 #endif
2781         return ret;
2782 }
2783 EXPORT_SYMBOL(netif_rx);
2784
2785 int netif_rx_ni(struct sk_buff *skb)
2786 {
2787         int err;
2788
2789         preempt_disable();
2790         err = netif_rx(skb);
2791         if (local_softirq_pending())
2792                 do_softirq();
2793         preempt_enable();
2794
2795         return err;
2796 }
2797 EXPORT_SYMBOL(netif_rx_ni);
2798
2799 static void net_tx_action(struct softirq_action *h)
2800 {
2801         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2802
2803         if (sd->completion_queue) {
2804                 struct sk_buff *clist;
2805
2806                 local_irq_disable();
2807                 clist = sd->completion_queue;
2808                 sd->completion_queue = NULL;
2809                 local_irq_enable();
2810
2811                 while (clist) {
2812                         struct sk_buff *skb = clist;
2813                         clist = clist->next;
2814
2815                         WARN_ON(atomic_read(&skb->users));
2816                         trace_kfree_skb(skb, net_tx_action);
2817                         __kfree_skb(skb);
2818                 }
2819         }
2820
2821         if (sd->output_queue) {
2822                 struct Qdisc *head;
2823
2824                 local_irq_disable();
2825                 head = sd->output_queue;
2826                 sd->output_queue = NULL;
2827                 sd->output_queue_tailp = &sd->output_queue;
2828                 local_irq_enable();
2829
2830                 while (head) {
2831                         struct Qdisc *q = head;
2832                         spinlock_t *root_lock;
2833
2834                         head = head->next_sched;
2835
2836                         root_lock = qdisc_lock(q);
2837                         if (spin_trylock(root_lock)) {
2838                                 smp_mb__before_clear_bit();
2839                                 clear_bit(__QDISC_STATE_SCHED,
2840                                           &q->state);
2841                                 qdisc_run(q);
2842                                 spin_unlock(root_lock);
2843                         } else {
2844                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2845                                               &q->state)) {
2846                                         __netif_reschedule(q);
2847                                 } else {
2848                                         smp_mb__before_clear_bit();
2849                                         clear_bit(__QDISC_STATE_SCHED,
2850                                                   &q->state);
2851                                 }
2852                         }
2853                 }
2854         }
2855 }
2856
2857 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2858     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2859 /* This hook is defined here for ATM LANE */
2860 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2861                              unsigned char *addr) __read_mostly;
2862 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2863 #endif
2864
2865 #ifdef CONFIG_NET_CLS_ACT
2866 /* TODO: Maybe we should just force sch_ingress to be compiled in
2867  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2868  * a compare and 2 stores extra right now if we dont have it on
2869  * but have CONFIG_NET_CLS_ACT
2870  * NOTE: This doesnt stop any functionality; if you dont have
2871  * the ingress scheduler, you just cant add policies on ingress.
2872  *
2873  */
2874 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2875 {
2876         struct net_device *dev = skb->dev;
2877         u32 ttl = G_TC_RTTL(skb->tc_verd);
2878         int result = TC_ACT_OK;
2879         struct Qdisc *q;
2880
2881         if (unlikely(MAX_RED_LOOP < ttl++)) {
2882                 if (net_ratelimit())
2883                         pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
2884                                skb->skb_iif, dev->ifindex);
2885                 return TC_ACT_SHOT;
2886         }
2887
2888         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2889         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2890
2891         q = rxq->qdisc;
2892         if (q != &noop_qdisc) {
2893                 spin_lock(qdisc_lock(q));
2894                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2895                         result = qdisc_enqueue_root(skb, q);
2896                 spin_unlock(qdisc_lock(q));
2897         }
2898
2899         return result;
2900 }
2901
2902 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2903                                          struct packet_type **pt_prev,
2904                                          int *ret, struct net_device *orig_dev)
2905 {
2906         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
2907
2908         if (!rxq || rxq->qdisc == &noop_qdisc)
2909                 goto out;
2910
2911         if (*pt_prev) {
2912                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2913                 *pt_prev = NULL;
2914         }
2915
2916         switch (ing_filter(skb, rxq)) {
2917         case TC_ACT_SHOT:
2918         case TC_ACT_STOLEN:
2919                 kfree_skb(skb);
2920                 return NULL;
2921         }
2922
2923 out:
2924         skb->tc_verd = 0;
2925         return skb;
2926 }
2927 #endif
2928
2929 /**
2930  *      netdev_rx_handler_register - register receive handler
2931  *      @dev: device to register a handler for
2932  *      @rx_handler: receive handler to register
2933  *      @rx_handler_data: data pointer that is used by rx handler
2934  *
2935  *      Register a receive hander for a device. This handler will then be
2936  *      called from __netif_receive_skb. A negative errno code is returned
2937  *      on a failure.
2938  *
2939  *      The caller must hold the rtnl_mutex.
2940  */
2941 int netdev_rx_handler_register(struct net_device *dev,
2942                                rx_handler_func_t *rx_handler,
2943                                void *rx_handler_data)
2944 {
2945         ASSERT_RTNL();
2946
2947         if (dev->rx_handler)
2948                 return -EBUSY;
2949
2950         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
2951         rcu_assign_pointer(dev->rx_handler, rx_handler);
2952
2953         return 0;
2954 }
2955 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
2956
2957 /**
2958  *      netdev_rx_handler_unregister - unregister receive handler
2959  *      @dev: device to unregister a handler from
2960  *
2961  *      Unregister a receive hander from a device.
2962  *
2963  *      The caller must hold the rtnl_mutex.
2964  */
2965 void netdev_rx_handler_unregister(struct net_device *dev)
2966 {
2967
2968         ASSERT_RTNL();
2969         rcu_assign_pointer(dev->rx_handler, NULL);
2970         rcu_assign_pointer(dev->rx_handler_data, NULL);
2971 }
2972 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2973
2974 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2975                                               struct net_device *master)
2976 {
2977         if (skb->pkt_type == PACKET_HOST) {
2978                 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2979
2980                 memcpy(dest, master->dev_addr, ETH_ALEN);
2981         }
2982 }
2983
2984 /* On bonding slaves other than the currently active slave, suppress
2985  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2986  * ARP on active-backup slaves with arp_validate enabled.
2987  */
2988 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2989 {
2990         struct net_device *dev = skb->dev;
2991
2992         if (master->priv_flags & IFF_MASTER_ARPMON)
2993                 dev->last_rx = jiffies;
2994
2995         if ((master->priv_flags & IFF_MASTER_ALB) &&
2996             (master->priv_flags & IFF_BRIDGE_PORT)) {
2997                 /* Do address unmangle. The local destination address
2998                  * will be always the one master has. Provides the right
2999                  * functionality in a bridge.
3000                  */
3001                 skb_bond_set_mac_by_master(skb, master);
3002         }
3003
3004         if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
3005                 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
3006                     skb->protocol == __cpu_to_be16(ETH_P_ARP))
3007                         return 0;
3008
3009                 if (master->priv_flags & IFF_MASTER_ALB) {
3010                         if (skb->pkt_type != PACKET_BROADCAST &&
3011                             skb->pkt_type != PACKET_MULTICAST)
3012                                 return 0;
3013                 }
3014                 if (master->priv_flags & IFF_MASTER_8023AD &&
3015                     skb->protocol == __cpu_to_be16(ETH_P_SLOW))
3016                         return 0;
3017
3018                 return 1;
3019         }
3020         return 0;
3021 }
3022 EXPORT_SYMBOL(__skb_bond_should_drop);
3023
3024 static int __netif_receive_skb(struct sk_buff *skb)
3025 {
3026         struct packet_type *ptype, *pt_prev;
3027         rx_handler_func_t *rx_handler;
3028         struct net_device *orig_dev;
3029         struct net_device *master;
3030         struct net_device *null_or_orig;
3031         struct net_device *orig_or_bond;
3032         int ret = NET_RX_DROP;
3033         __be16 type;
3034
3035         if (!netdev_tstamp_prequeue)
3036                 net_timestamp_check(skb);
3037
3038         trace_netif_receive_skb(skb);
3039
3040         /* if we've gotten here through NAPI, check netpoll */
3041         if (netpoll_receive_skb(skb))
3042                 return NET_RX_DROP;
3043
3044         if (!skb->skb_iif)
3045                 skb->skb_iif = skb->dev->ifindex;
3046
3047         /*
3048          * bonding note: skbs received on inactive slaves should only
3049          * be delivered to pkt handlers that are exact matches.  Also
3050          * the deliver_no_wcard flag will be set.  If packet handlers
3051          * are sensitive to duplicate packets these skbs will need to
3052          * be dropped at the handler.
3053          */
3054         null_or_orig = NULL;
3055         orig_dev = skb->dev;
3056         master = ACCESS_ONCE(orig_dev->master);
3057         if (skb->deliver_no_wcard)
3058                 null_or_orig = orig_dev;
3059         else if (master) {
3060                 if (skb_bond_should_drop(skb, master)) {
3061                         skb->deliver_no_wcard = 1;
3062                         null_or_orig = orig_dev; /* deliver only exact match */
3063                 } else
3064                         skb->dev = master;
3065         }
3066
3067         __this_cpu_inc(softnet_data.processed);
3068         skb_reset_network_header(skb);
3069         skb_reset_transport_header(skb);
3070         skb->mac_len = skb->network_header - skb->mac_header;
3071
3072         pt_prev = NULL;
3073
3074         rcu_read_lock();
3075
3076 #ifdef CONFIG_NET_CLS_ACT
3077         if (skb->tc_verd & TC_NCLS) {
3078                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3079                 goto ncls;
3080         }
3081 #endif
3082
3083         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3084                 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
3085                     ptype->dev == orig_dev) {
3086                         if (pt_prev)
3087                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3088                         pt_prev = ptype;
3089                 }
3090         }
3091
3092 #ifdef CONFIG_NET_CLS_ACT
3093         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3094         if (!skb)
3095                 goto out;
3096 ncls:
3097 #endif
3098
3099         /* Handle special case of bridge or macvlan */
3100         rx_handler = rcu_dereference(skb->dev->rx_handler);
3101         if (rx_handler) {
3102                 if (pt_prev) {
3103                         ret = deliver_skb(skb, pt_prev, orig_dev);
3104                         pt_prev = NULL;
3105                 }
3106                 skb = rx_handler(skb);
3107                 if (!skb)
3108                         goto out;
3109         }
3110
3111         if (vlan_tx_tag_present(skb)) {
3112                 if (pt_prev) {
3113                         ret = deliver_skb(skb, pt_prev, orig_dev);
3114                         pt_prev = NULL;
3115                 }
3116                 if (vlan_hwaccel_do_receive(&skb)) {
3117                         ret = __netif_receive_skb(skb);
3118                         goto out;
3119                 } else if (unlikely(!skb))
3120                         goto out;
3121         }
3122
3123         /*
3124          * Make sure frames received on VLAN interfaces stacked on
3125          * bonding interfaces still make their way to any base bonding
3126          * device that may have registered for a specific ptype.  The
3127          * handler may have to adjust skb->dev and orig_dev.
3128          */
3129         orig_or_bond = orig_dev;
3130         if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
3131             (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
3132                 orig_or_bond = vlan_dev_real_dev(skb->dev);
3133         }
3134
3135         type = skb->protocol;
3136         list_for_each_entry_rcu(ptype,
3137                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3138                 if (ptype->type == type && (ptype->dev == null_or_orig ||
3139                      ptype->dev == skb->dev || ptype->dev == orig_dev ||
3140                      ptype->dev == orig_or_bond)) {
3141                         if (pt_prev)
3142                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3143                         pt_prev = ptype;
3144                 }
3145         }
3146
3147         if (pt_prev) {
3148                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3149         } else {
3150                 atomic_long_inc(&skb->dev->rx_dropped);
3151                 kfree_skb(skb);
3152                 /* Jamal, now you will not able to escape explaining
3153                  * me how you were going to use this. :-)
3154                  */
3155                 ret = NET_RX_DROP;
3156         }
3157
3158 out:
3159         rcu_read_unlock();
3160         return ret;
3161 }
3162
3163 /**
3164  *      netif_receive_skb - process receive buffer from network
3165  *      @skb: buffer to process
3166  *
3167  *      netif_receive_skb() is the main receive data processing function.
3168  *      It always succeeds. The buffer may be dropped during processing
3169  *      for congestion control or by the protocol layers.
3170  *
3171  *      This function may only be called from softirq context and interrupts
3172  *      should be enabled.
3173  *
3174  *      Return values (usually ignored):
3175  *      NET_RX_SUCCESS: no congestion
3176  *      NET_RX_DROP: packet was dropped
3177  */
3178 int netif_receive_skb(struct sk_buff *skb)
3179 {
3180         if (netdev_tstamp_prequeue)
3181                 net_timestamp_check(skb);
3182
3183         if (skb_defer_rx_timestamp(skb))
3184                 return NET_RX_SUCCESS;
3185
3186 #ifdef CONFIG_RPS
3187         {
3188                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3189                 int cpu, ret;
3190
3191                 rcu_read_lock();
3192
3193                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3194
3195                 if (cpu >= 0) {
3196                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3197                         rcu_read_unlock();
3198                 } else {
3199                         rcu_read_unlock();
3200                         ret = __netif_receive_skb(skb);
3201                 }
3202
3203                 return ret;
3204         }
3205 #else
3206         return __netif_receive_skb(skb);
3207 #endif
3208 }
3209 EXPORT_SYMBOL(netif_receive_skb);
3210
3211 /* Network device is going away, flush any packets still pending
3212  * Called with irqs disabled.
3213  */
3214 static void flush_backlog(void *arg)
3215 {
3216         struct net_device *dev = arg;
3217         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3218         struct sk_buff *skb, *tmp;
3219
3220         rps_lock(sd);
3221         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3222                 if (skb->dev == dev) {
3223                         __skb_unlink(skb, &sd->input_pkt_queue);
3224                         kfree_skb(skb);
3225                         input_queue_head_incr(sd);
3226                 }
3227         }
3228         rps_unlock(sd);
3229
3230         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3231                 if (skb->dev == dev) {
3232                         __skb_unlink(skb, &sd->process_queue);
3233                         kfree_skb(skb);
3234                         input_queue_head_incr(sd);
3235                 }
3236         }
3237 }
3238
3239 static int napi_gro_complete(struct sk_buff *skb)
3240 {
3241         struct packet_type *ptype;
3242         __be16 type = skb->protocol;
3243         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3244         int err = -ENOENT;
3245
3246         if (NAPI_GRO_CB(skb)->count == 1) {
3247                 skb_shinfo(skb)->gso_size = 0;
3248                 goto out;
3249         }
3250
3251         rcu_read_lock();
3252         list_for_each_entry_rcu(ptype, head, list) {
3253                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3254                         continue;
3255
3256                 err = ptype->gro_complete(skb);
3257                 break;
3258         }
3259         rcu_read_unlock();
3260
3261         if (err) {
3262                 WARN_ON(&ptype->list == head);
3263                 kfree_skb(skb);
3264                 return NET_RX_SUCCESS;
3265         }
3266
3267 out:
3268         return netif_receive_skb(skb);
3269 }
3270
3271 inline void napi_gro_flush(struct napi_struct *napi)
3272 {
3273         struct sk_buff *skb, *next;
3274
3275         for (skb = napi->gro_list; skb; skb = next) {
3276                 next = skb->next;
3277                 skb->next = NULL;
3278                 napi_gro_complete(skb);
3279         }
3280
3281         napi->gro_count = 0;
3282         napi->gro_list = NULL;
3283 }
3284 EXPORT_SYMBOL(napi_gro_flush);
3285
3286 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3287 {
3288         struct sk_buff **pp = NULL;
3289         struct packet_type *ptype;
3290         __be16 type = skb->protocol;
3291         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3292         int same_flow;
3293         int mac_len;
3294         enum gro_result ret;
3295
3296         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3297                 goto normal;
3298
3299         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3300                 goto normal;
3301
3302         rcu_read_lock();
3303         list_for_each_entry_rcu(ptype, head, list) {
3304                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3305                         continue;
3306
3307                 skb_set_network_header(skb, skb_gro_offset(skb));
3308                 mac_len = skb->network_header - skb->mac_header;
3309                 skb->mac_len = mac_len;
3310                 NAPI_GRO_CB(skb)->same_flow = 0;
3311                 NAPI_GRO_CB(skb)->flush = 0;
3312                 NAPI_GRO_CB(skb)->free = 0;
3313
3314                 pp = ptype->gro_receive(&napi->gro_list, skb);
3315                 break;
3316         }
3317         rcu_read_unlock();
3318
3319         if (&ptype->list == head)
3320                 goto normal;
3321
3322         same_flow = NAPI_GRO_CB(skb)->same_flow;
3323         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3324
3325         if (pp) {
3326                 struct sk_buff *nskb = *pp;
3327
3328                 *pp = nskb->next;
3329                 nskb->next = NULL;
3330                 napi_gro_complete(nskb);
3331                 napi->gro_count--;
3332         }
3333
3334         if (same_flow)
3335                 goto ok;
3336
3337         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3338                 goto normal;
3339
3340         napi->gro_count++;
3341         NAPI_GRO_CB(skb)->count = 1;
3342         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3343         skb->next = napi->gro_list;
3344         napi->gro_list = skb;
3345         ret = GRO_HELD;
3346
3347 pull:
3348         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3349                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3350
3351                 BUG_ON(skb->end - skb->tail < grow);
3352
3353                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3354
3355                 skb->tail += grow;
3356                 skb->data_len -= grow;
3357
3358                 skb_shinfo(skb)->frags[0].page_offset += grow;
3359                 skb_shinfo(skb)->frags[0].size -= grow;
3360
3361                 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3362                         put_page(skb_shinfo(skb)->frags[0].page);
3363                         memmove(skb_shinfo(skb)->frags,
3364                                 skb_shinfo(skb)->frags + 1,
3365                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3366                 }
3367         }
3368
3369 ok:
3370         return ret;
3371
3372 normal:
3373         ret = GRO_NORMAL;
3374         goto pull;
3375 }
3376 EXPORT_SYMBOL(dev_gro_receive);
3377
3378 static inline gro_result_t
3379 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3380 {
3381         struct sk_buff *p;
3382
3383         for (p = napi->gro_list; p; p = p->next) {
3384                 unsigned long diffs;
3385
3386                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3387                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3388                 diffs |= compare_ether_header(skb_mac_header(p),
3389                                               skb_gro_mac_header(skb));
3390                 NAPI_GRO_CB(p)->same_flow = !diffs;
3391                 NAPI_GRO_CB(p)->flush = 0;
3392         }
3393
3394         return dev_gro_receive(napi, skb);
3395 }
3396
3397 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3398 {
3399         switch (ret) {
3400         case GRO_NORMAL:
3401                 if (netif_receive_skb(skb))
3402                         ret = GRO_DROP;
3403                 break;
3404
3405         case GRO_DROP:
3406         case GRO_MERGED_FREE:
3407                 kfree_skb(skb);
3408                 break;
3409
3410         case GRO_HELD:
3411         case GRO_MERGED:
3412                 break;
3413         }
3414
3415         return ret;
3416 }
3417 EXPORT_SYMBOL(napi_skb_finish);
3418
3419 void skb_gro_reset_offset(struct sk_buff *skb)
3420 {
3421         NAPI_GRO_CB(skb)->data_offset = 0;
3422         NAPI_GRO_CB(skb)->frag0 = NULL;
3423         NAPI_GRO_CB(skb)->frag0_len = 0;
3424
3425         if (skb->mac_header == skb->tail &&
3426             !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3427                 NAPI_GRO_CB(skb)->frag0 =
3428                         page_address(skb_shinfo(skb)->frags[0].page) +
3429                         skb_shinfo(skb)->frags[0].page_offset;
3430                 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3431         }
3432 }
3433 EXPORT_SYMBOL(skb_gro_reset_offset);
3434
3435 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3436 {
3437         skb_gro_reset_offset(skb);
3438
3439         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3440 }
3441 EXPORT_SYMBOL(napi_gro_receive);
3442
3443 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3444 {
3445         __skb_pull(skb, skb_headlen(skb));
3446         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3447         skb->vlan_tci = 0;
3448
3449         napi->skb = skb;
3450 }
3451
3452 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3453 {
3454         struct sk_buff *skb = napi->skb;
3455
3456         if (!skb) {
3457                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3458                 if (skb)
3459                         napi->skb = skb;
3460         }
3461         return skb;
3462 }
3463 EXPORT_SYMBOL(napi_get_frags);
3464
3465 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3466                                gro_result_t ret)
3467 {
3468         switch (ret) {
3469         case GRO_NORMAL:
3470         case GRO_HELD:
3471                 skb->protocol = eth_type_trans(skb, skb->dev);
3472
3473                 if (ret == GRO_HELD)
3474                         skb_gro_pull(skb, -ETH_HLEN);
3475                 else if (netif_receive_skb(skb))
3476                         ret = GRO_DROP;
3477                 break;
3478
3479         case GRO_DROP:
3480         case GRO_MERGED_FREE:
3481                 napi_reuse_skb(napi, skb);
3482                 break;
3483
3484         case GRO_MERGED:
3485                 break;
3486         }
3487
3488         return ret;
3489 }
3490 EXPORT_SYMBOL(napi_frags_finish);
3491
3492 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3493 {
3494         struct sk_buff *skb = napi->skb;
3495         struct ethhdr *eth;
3496         unsigned int hlen;
3497         unsigned int off;
3498
3499         napi->skb = NULL;
3500
3501         skb_reset_mac_header(skb);
3502         skb_gro_reset_offset(skb);
3503
3504         off = skb_gro_offset(skb);
3505         hlen = off + sizeof(*eth);
3506         eth = skb_gro_header_fast(skb, off);
3507         if (skb_gro_header_hard(skb, hlen)) {
3508                 eth = skb_gro_header_slow(skb, hlen, off);
3509                 if (unlikely(!eth)) {
3510                         napi_reuse_skb(napi, skb);
3511                         skb = NULL;
3512                         goto out;
3513                 }
3514         }
3515
3516         skb_gro_pull(skb, sizeof(*eth));
3517
3518         /*
3519          * This works because the only protocols we care about don't require
3520          * special handling.  We'll fix it up properly at the end.
3521          */
3522         skb->protocol = eth->h_proto;
3523
3524 out:
3525         return skb;
3526 }
3527 EXPORT_SYMBOL(napi_frags_skb);
3528
3529 gro_result_t napi_gro_frags(struct napi_struct *napi)
3530 {
3531         struct sk_buff *skb = napi_frags_skb(napi);
3532
3533         if (!skb)
3534                 return GRO_DROP;
3535
3536         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3537 }
3538 EXPORT_SYMBOL(napi_gro_frags);
3539
3540 /*
3541  * net_rps_action sends any pending IPI's for rps.
3542  * Note: called with local irq disabled, but exits with local irq enabled.
3543  */
3544 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3545 {
3546 #ifdef CONFIG_RPS
3547         struct softnet_data *remsd = sd->rps_ipi_list;
3548
3549         if (remsd) {
3550                 sd->rps_ipi_list = NULL;
3551
3552                 local_irq_enable();
3553
3554                 /* Send pending IPI's to kick RPS processing on remote cpus. */
3555                 while (remsd) {
3556                         struct softnet_data *next = remsd->rps_ipi_next;
3557
3558                         if (cpu_online(remsd->cpu))
3559                                 __smp_call_function_single(remsd->cpu,
3560                                                            &remsd->csd, 0);
3561                         remsd = next;
3562                 }
3563         } else
3564 #endif
3565                 local_irq_enable();
3566 }
3567
3568 static int process_backlog(struct napi_struct *napi, int quota)
3569 {
3570         int work = 0;
3571         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3572
3573 #ifdef CONFIG_RPS
3574         /* Check if we have pending ipi, its better to send them now,
3575          * not waiting net_rx_action() end.
3576          */
3577         if (sd->rps_ipi_list) {
3578                 local_irq_disable();
3579                 net_rps_action_and_irq_enable(sd);
3580         }
3581 #endif
3582         napi->weight = weight_p;
3583         local_irq_disable();
3584         while (work < quota) {
3585                 struct sk_buff *skb;
3586                 unsigned int qlen;
3587
3588                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3589                         local_irq_enable();
3590                         __netif_receive_skb(skb);
3591                         local_irq_disable();
3592                         input_queue_head_incr(sd);
3593                         if (++work >= quota) {
3594                                 local_irq_enable();
3595                                 return work;
3596                         }
3597                 }
3598
3599                 rps_lock(sd);
3600                 qlen = skb_queue_len(&sd->input_pkt_queue);
3601                 if (qlen)
3602                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
3603                                                    &sd->process_queue);
3604
3605                 if (qlen < quota - work) {
3606                         /*
3607                          * Inline a custom version of __napi_complete().
3608                          * only current cpu owns and manipulates this napi,
3609                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3610                          * we can use a plain write instead of clear_bit(),
3611                          * and we dont need an smp_mb() memory barrier.
3612                          */
3613                         list_del(&napi->poll_list);
3614                         napi->state = 0;
3615
3616                         quota = work + qlen;
3617                 }
3618                 rps_unlock(sd);
3619         }
3620         local_irq_enable();
3621
3622         return work;
3623 }
3624
3625 /**
3626  * __napi_schedule - schedule for receive
3627  * @n: entry to schedule
3628  *
3629  * The entry's receive function will be scheduled to run
3630  */
3631 void __napi_schedule(struct napi_struct *n)
3632 {
3633         unsigned long flags;
3634
3635         local_irq_save(flags);
3636         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3637         local_irq_restore(flags);
3638 }
3639 EXPORT_SYMBOL(__napi_schedule);
3640
3641 void __napi_complete(struct napi_struct *n)
3642 {
3643         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3644         BUG_ON(n->gro_list);
3645
3646         list_del(&n->poll_list);
3647         smp_mb__before_clear_bit();
3648         clear_bit(NAPI_STATE_SCHED, &n->state);
3649 }
3650 EXPORT_SYMBOL(__napi_complete);
3651
3652 void napi_complete(struct napi_struct *n)
3653 {
3654         unsigned long flags;
3655
3656         /*
3657          * don't let napi dequeue from the cpu poll list
3658          * just in case its running on a different cpu
3659          */
3660         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3661                 return;
3662
3663         napi_gro_flush(n);
3664         local_irq_save(flags);
3665         __napi_complete(n);
3666         local_irq_restore(flags);
3667 }
3668 EXPORT_SYMBOL(napi_complete);
3669
3670 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3671                     int (*poll)(struct napi_struct *, int), int weight)
3672 {
3673         INIT_LIST_HEAD(&napi->poll_list);
3674         napi->gro_count = 0;
3675         napi->gro_list = NULL;
3676         napi->skb = NULL;
3677         napi->poll = poll;
3678         napi->weight = weight;
3679         list_add(&napi->dev_list, &dev->napi_list);
3680         napi->dev = dev;
3681 #ifdef CONFIG_NETPOLL
3682         spin_lock_init(&napi->poll_lock);
3683         napi->poll_owner = -1;
3684 #endif
3685         set_bit(NAPI_STATE_SCHED, &napi->state);
3686 }
3687 EXPORT_SYMBOL(netif_napi_add);
3688
3689 void netif_napi_del(struct napi_struct *napi)
3690 {
3691         struct sk_buff *skb, *next;
3692
3693         list_del_init(&napi->dev_list);
3694         napi_free_frags(napi);
3695
3696         for (skb = napi->gro_list; skb; skb = next) {
3697                 next = skb->next;
3698                 skb->next = NULL;
3699                 kfree_skb(skb);
3700         }
3701
3702         napi->gro_list = NULL;
3703         napi->gro_count = 0;
3704 }
3705 EXPORT_SYMBOL(netif_napi_del);
3706
3707 static void net_rx_action(struct softirq_action *h)
3708 {
3709         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3710         unsigned long time_limit = jiffies + 2;
3711         int budget = netdev_budget;
3712         void *have;
3713
3714         local_irq_disable();
3715
3716         while (!list_empty(&sd->poll_list)) {
3717                 struct napi_struct *n;
3718                 int work, weight;
3719
3720                 /* If softirq window is exhuasted then punt.
3721                  * Allow this to run for 2 jiffies since which will allow
3722                  * an average latency of 1.5/HZ.
3723                  */
3724                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3725                         goto softnet_break;
3726
3727                 local_irq_enable();
3728
3729                 /* Even though interrupts have been re-enabled, this
3730                  * access is safe because interrupts can only add new
3731                  * entries to the tail of this list, and only ->poll()
3732                  * calls can remove this head entry from the list.
3733                  */
3734                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3735
3736                 have = netpoll_poll_lock(n);
3737
3738                 weight = n->weight;
3739
3740                 /* This NAPI_STATE_SCHED test is for avoiding a race
3741                  * with netpoll's poll_napi().  Only the entity which
3742                  * obtains the lock and sees NAPI_STATE_SCHED set will
3743                  * actually make the ->poll() call.  Therefore we avoid
3744                  * accidently calling ->poll() when NAPI is not scheduled.
3745                  */
3746                 work = 0;
3747                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3748                         work = n->poll(n, weight);
3749                         trace_napi_poll(n);
3750                 }
3751
3752                 WARN_ON_ONCE(work > weight);
3753
3754                 budget -= work;
3755
3756                 local_irq_disable();
3757
3758                 /* Drivers must not modify the NAPI state if they
3759                  * consume the entire weight.  In such cases this code
3760                  * still "owns" the NAPI instance and therefore can
3761                  * move the instance around on the list at-will.
3762                  */
3763                 if (unlikely(work == weight)) {
3764                         if (unlikely(napi_disable_pending(n))) {
3765                                 local_irq_enable();
3766                                 napi_complete(n);
3767                                 local_irq_disable();
3768                         } else
3769                                 list_move_tail(&n->poll_list, &sd->poll_list);
3770                 }
3771
3772                 netpoll_poll_unlock(have);
3773         }
3774 out:
3775         net_rps_action_and_irq_enable(sd);
3776
3777 #ifdef CONFIG_NET_DMA
3778         /*
3779          * There may not be any more sk_buffs coming right now, so push
3780          * any pending DMA copies to hardware
3781          */
3782         dma_issue_pending_all();
3783 #endif
3784
3785         return;
3786
3787 softnet_break:
3788         sd->time_squeeze++;
3789         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3790         goto out;
3791 }
3792
3793 static gifconf_func_t *gifconf_list[NPROTO];
3794
3795 /**
3796  *      register_gifconf        -       register a SIOCGIF handler
3797  *      @family: Address family
3798  *      @gifconf: Function handler
3799  *
3800  *      Register protocol dependent address dumping routines. The handler
3801  *      that is passed must not be freed or reused until it has been replaced
3802  *      by another handler.
3803  */
3804 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3805 {
3806         if (family >= NPROTO)
3807                 return -EINVAL;
3808         gifconf_list[family] = gifconf;
3809         return 0;
3810 }
3811 EXPORT_SYMBOL(register_gifconf);
3812
3813
3814 /*
3815  *      Map an interface index to its name (SIOCGIFNAME)
3816  */
3817
3818 /*
3819  *      We need this ioctl for efficient implementation of the
3820  *      if_indextoname() function required by the IPv6 API.  Without
3821  *      it, we would have to search all the interfaces to find a
3822  *      match.  --pb
3823  */
3824
3825 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3826 {
3827         struct net_device *dev;
3828         struct ifreq ifr;
3829
3830         /*
3831          *      Fetch the caller's info block.
3832          */
3833
3834         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3835                 return -EFAULT;
3836
3837         rcu_read_lock();
3838         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3839         if (!dev) {
3840                 rcu_read_unlock();
3841                 return -ENODEV;
3842         }
3843
3844         strcpy(ifr.ifr_name, dev->name);
3845         rcu_read_unlock();
3846
3847         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3848                 return -EFAULT;
3849         return 0;
3850 }
3851
3852 /*
3853  *      Perform a SIOCGIFCONF call. This structure will change
3854  *      size eventually, and there is nothing I can do about it.
3855  *      Thus we will need a 'compatibility mode'.
3856  */
3857
3858 static int dev_ifconf(struct net *net, char __user *arg)
3859 {
3860         struct ifconf ifc;
3861         struct net_device *dev;
3862         char __user *pos;
3863         int len;
3864         int total;
3865         int i;
3866
3867         /*
3868          *      Fetch the caller's info block.
3869          */
3870
3871         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3872                 return -EFAULT;
3873
3874         pos = ifc.ifc_buf;
3875         len = ifc.ifc_len;
3876
3877         /*
3878          *      Loop over the interfaces, and write an info block for each.
3879          */
3880
3881         total = 0;
3882         for_each_netdev(net, dev) {
3883                 for (i = 0; i < NPROTO; i++) {
3884                         if (gifconf_list[i]) {
3885                                 int done;
3886                                 if (!pos)
3887                                         done = gifconf_list[i](dev, NULL, 0);
3888                                 else
3889                                         done = gifconf_list[i](dev, pos + total,
3890                                                                len - total);
3891                                 if (done < 0)
3892                                         return -EFAULT;
3893                                 total += done;
3894                         }
3895                 }
3896         }
3897
3898         /*
3899          *      All done.  Write the updated control block back to the caller.
3900          */
3901         ifc.ifc_len = total;
3902
3903         /*
3904          *      Both BSD and Solaris return 0 here, so we do too.
3905          */
3906         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3907 }
3908
3909 #ifdef CONFIG_PROC_FS
3910 /*
3911  *      This is invoked by the /proc filesystem handler to display a device
3912  *      in detail.
3913  */
3914 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3915         __acquires(RCU)
3916 {
3917         struct net *net = seq_file_net(seq);
3918         loff_t off;
3919         struct net_device *dev;
3920
3921         rcu_read_lock();
3922         if (!*pos)
3923                 return SEQ_START_TOKEN;
3924
3925         off = 1;
3926         for_each_netdev_rcu(net, dev)
3927                 if (off++ == *pos)
3928                         return dev;
3929
3930         return NULL;
3931 }
3932
3933 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3934 {
3935         struct net_device *dev = (v == SEQ_START_TOKEN) ?
3936                                   first_net_device(seq_file_net(seq)) :
3937                                   next_net_device((struct net_device *)v);
3938
3939         ++*pos;
3940         return rcu_dereference(dev);
3941 }
3942
3943 void dev_seq_stop(struct seq_file *seq, void *v)
3944         __releases(RCU)
3945 {
3946         rcu_read_unlock();
3947 }
3948
3949 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3950 {
3951         struct rtnl_link_stats64 temp;
3952         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
3953
3954         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3955                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3956                    dev->name, stats->rx_bytes, stats->rx_packets,
3957                    stats->rx_errors,
3958                    stats->rx_dropped + stats->rx_missed_errors,
3959                    stats->rx_fifo_errors,
3960                    stats->rx_length_errors + stats->rx_over_errors +
3961                     stats->rx_crc_errors + stats->rx_frame_errors,
3962                    stats->rx_compressed, stats->multicast,
3963                    stats->tx_bytes, stats->tx_packets,
3964                    stats->tx_errors, stats->tx_dropped,
3965                    stats->tx_fifo_errors, stats->collisions,
3966                    stats->tx_carrier_errors +
3967                     stats->tx_aborted_errors +
3968                     stats->tx_window_errors +
3969                     stats->tx_heartbeat_errors,
3970                    stats->tx_compressed);
3971 }
3972
3973 /*
3974  *      Called from the PROCfs module. This now uses the new arbitrary sized
3975  *      /proc/net interface to create /proc/net/dev
3976  */
3977 static int dev_seq_show(struct seq_file *seq, void *v)
3978 {
3979         if (v == SEQ_START_TOKEN)
3980                 seq_puts(seq, "Inter-|   Receive                            "
3981                               "                    |  Transmit\n"
3982                               " face |bytes    packets errs drop fifo frame "
3983                               "compressed multicast|bytes    packets errs "
3984                               "drop fifo colls carrier compressed\n");
3985         else
3986                 dev_seq_printf_stats(seq, v);
3987         return 0;
3988 }
3989
3990 static struct softnet_data *softnet_get_online(loff_t *pos)
3991 {
3992         struct softnet_data *sd = NULL;
3993
3994         while (*pos < nr_cpu_ids)
3995                 if (cpu_online(*pos)) {
3996                         sd = &per_cpu(softnet_data, *pos);
3997                         break;
3998                 } else
3999                         ++*pos;
4000         return sd;
4001 }
4002
4003 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4004 {
4005         return softnet_get_online(pos);
4006 }
4007
4008 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4009 {
4010         ++*pos;
4011         return softnet_get_online(pos);
4012 }
4013
4014 static void softnet_seq_stop(struct seq_file *seq, void *v)
4015 {
4016 }
4017
4018 static int softnet_seq_show(struct seq_file *seq, void *v)
4019 {
4020         struct softnet_data *sd = v;
4021
4022         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4023                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4024                    0, 0, 0, 0, /* was fastroute */
4025                    sd->cpu_collision, sd->received_rps);
4026         return 0;
4027 }
4028
4029 static const struct seq_operations dev_seq_ops = {
4030         .start = dev_seq_start,
4031         .next  = dev_seq_next,
4032         .stop  = dev_seq_stop,
4033         .show  = dev_seq_show,
4034 };
4035
4036 static int dev_seq_open(struct inode *inode, struct file *file)
4037 {
4038         return seq_open_net(inode, file, &dev_seq_ops,
4039                             sizeof(struct seq_net_private));
4040 }
4041
4042 static const struct file_operations dev_seq_fops = {
4043         .owner   = THIS_MODULE,
4044         .open    = dev_seq_open,
4045         .read    = seq_read,
4046         .llseek  = seq_lseek,
4047         .release = seq_release_net,
4048 };
4049
4050 static const struct seq_operations softnet_seq_ops = {
4051         .start = softnet_seq_start,
4052         .next  = softnet_seq_next,
4053         .stop  = softnet_seq_stop,
4054         .show  = softnet_seq_show,
4055 };
4056
4057 static int softnet_seq_open(struct inode *inode, struct file *file)
4058 {
4059         return seq_open(file, &softnet_seq_ops);
4060 }
4061
4062 static const struct file_operations softnet_seq_fops = {
4063         .owner   = THIS_MODULE,
4064         .open    = softnet_seq_open,
4065         .read    = seq_read,
4066         .llseek  = seq_lseek,
4067         .release = seq_release,
4068 };
4069
4070 static void *ptype_get_idx(loff_t pos)
4071 {
4072         struct packet_type *pt = NULL;
4073         loff_t i = 0;
4074         int t;
4075
4076         list_for_each_entry_rcu(pt, &ptype_all, list) {
4077                 if (i == pos)
4078                         return pt;
4079                 ++i;
4080         }
4081
4082         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4083                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4084                         if (i == pos)
4085                                 return pt;
4086                         ++i;
4087                 }
4088         }
4089         return NULL;
4090 }
4091
4092 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4093         __acquires(RCU)
4094 {
4095         rcu_read_lock();
4096         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4097 }
4098
4099 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4100 {
4101         struct packet_type *pt;
4102         struct list_head *nxt;
4103         int hash;
4104
4105         ++*pos;
4106         if (v == SEQ_START_TOKEN)
4107                 return ptype_get_idx(0);
4108
4109         pt = v;
4110         nxt = pt->list.next;
4111         if (pt->type == htons(ETH_P_ALL)) {
4112                 if (nxt != &ptype_all)
4113                         goto found;
4114                 hash = 0;
4115                 nxt = ptype_base[0].next;
4116         } else
4117                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4118
4119         while (nxt == &ptype_base[hash]) {
4120                 if (++hash >= PTYPE_HASH_SIZE)
4121                         return NULL;
4122                 nxt = ptype_base[hash].next;
4123         }
4124 found:
4125         return list_entry(nxt, struct packet_type, list);
4126 }
4127
4128 static void ptype_seq_stop(struct seq_file *seq, void *v)
4129         __releases(RCU)
4130 {
4131         rcu_read_unlock();
4132 }
4133
4134 static int ptype_seq_show(struct seq_file *seq, void *v)
4135 {
4136         struct packet_type *pt = v;
4137
4138         if (v == SEQ_START_TOKEN)
4139                 seq_puts(seq, "Type Device      Function\n");
4140         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4141                 if (pt->type == htons(ETH_P_ALL))
4142                         seq_puts(seq, "ALL ");
4143                 else
4144                         seq_printf(seq, "%04x", ntohs(pt->type));
4145
4146                 seq_printf(seq, " %-8s %pF\n",
4147                            pt->dev ? pt->dev->name : "", pt->func);
4148         }
4149
4150         return 0;
4151 }
4152
4153 static const struct seq_operations ptype_seq_ops = {
4154         .start = ptype_seq_start,
4155         .next  = ptype_seq_next,
4156         .stop  = ptype_seq_stop,
4157         .show  = ptype_seq_show,
4158 };
4159
4160 static int ptype_seq_open(struct inode *inode, struct file *file)
4161 {
4162         return seq_open_net(inode, file, &ptype_seq_ops,
4163                         sizeof(struct seq_net_private));
4164 }
4165
4166 static const struct file_operations ptype_seq_fops = {
4167         .owner   = THIS_MODULE,
4168         .open    = ptype_seq_open,
4169         .read    = seq_read,
4170         .llseek  = seq_lseek,
4171         .release = seq_release_net,
4172 };
4173
4174
4175 static int __net_init dev_proc_net_init(struct net *net)
4176 {
4177         int rc = -ENOMEM;
4178
4179         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4180                 goto out;
4181         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4182                 goto out_dev;
4183         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4184                 goto out_softnet;
4185
4186         if (wext_proc_init(net))
4187                 goto out_ptype;
4188         rc = 0;
4189 out:
4190         return rc;
4191 out_ptype:
4192         proc_net_remove(net, "ptype");
4193 out_softnet:
4194         proc_net_remove(net, "softnet_stat");
4195 out_dev:
4196         proc_net_remove(net, "dev");
4197         goto out;
4198 }
4199
4200 static void __net_exit dev_proc_net_exit(struct net *net)
4201 {
4202         wext_proc_exit(net);
4203
4204         proc_net_remove(net, "ptype");
4205         proc_net_remove(net, "softnet_stat");
4206         proc_net_remove(net, "dev");
4207 }
4208
4209 static struct pernet_operations __net_initdata dev_proc_ops = {
4210         .init = dev_proc_net_init,
4211         .exit = dev_proc_net_exit,
4212 };
4213
4214 static int __init dev_proc_init(void)
4215 {
4216         return register_pernet_subsys(&dev_proc_ops);
4217 }
4218 #else
4219 #define dev_proc_init() 0
4220 #endif  /* CONFIG_PROC_FS */
4221
4222
4223 /**
4224  *      netdev_set_master       -       set up master/slave pair
4225  *      @slave: slave device
4226  *      @master: new master device
4227  *
4228  *      Changes the master device of the slave. Pass %NULL to break the
4229  *      bonding. The caller must hold the RTNL semaphore. On a failure
4230  *      a negative errno code is returned. On success the reference counts
4231  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4232  *      function returns zero.
4233  */
4234 int netdev_set_master(struct net_device *slave, struct net_device *master)
4235 {
4236         struct net_device *old = slave->master;
4237
4238         ASSERT_RTNL();
4239
4240         if (master) {
4241                 if (old)
4242                         return -EBUSY;
4243                 dev_hold(master);
4244         }
4245
4246         slave->master = master;
4247
4248         if (old) {
4249                 synchronize_net();
4250                 dev_put(old);
4251         }
4252         if (master)
4253                 slave->flags |= IFF_SLAVE;
4254         else
4255                 slave->flags &= ~IFF_SLAVE;
4256
4257         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4258         return 0;
4259 }
4260 EXPORT_SYMBOL(netdev_set_master);
4261
4262 static void dev_change_rx_flags(struct net_device *dev, int flags)
4263 {
4264         const struct net_device_ops *ops = dev->netdev_ops;
4265
4266         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4267                 ops->ndo_change_rx_flags(dev, flags);
4268 }
4269
4270 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4271 {
4272         unsigned short old_flags = dev->flags;
4273         uid_t uid;
4274         gid_t gid;
4275
4276         ASSERT_RTNL();
4277
4278         dev->flags |= IFF_PROMISC;
4279         dev->promiscuity += inc;
4280         if (dev->promiscuity == 0) {
4281                 /*
4282                  * Avoid overflow.
4283                  * If inc causes overflow, untouch promisc and return error.
4284                  */
4285                 if (inc < 0)
4286                         dev->flags &= ~IFF_PROMISC;
4287                 else {
4288                         dev->promiscuity -= inc;
4289                         printk(KERN_WARNING "%s: promiscuity touches roof, "
4290                                 "set promiscuity failed, promiscuity feature "
4291                                 "of device might be broken.\n", dev->name);
4292                         return -EOVERFLOW;
4293                 }
4294         }
4295         if (dev->flags != old_flags) {
4296                 printk(KERN_INFO "device %s %s promiscuous mode\n",
4297                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4298                                                                "left");
4299                 if (audit_enabled) {
4300                         current_uid_gid(&uid, &gid);
4301                         audit_log(current->audit_context, GFP_ATOMIC,
4302                                 AUDIT_ANOM_PROMISCUOUS,
4303                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4304                                 dev->name, (dev->flags & IFF_PROMISC),
4305                                 (old_flags & IFF_PROMISC),
4306                                 audit_get_loginuid(current),
4307                                 uid, gid,
4308                                 audit_get_sessionid(current));
4309                 }
4310
4311                 dev_change_rx_flags(dev, IFF_PROMISC);
4312         }
4313         return 0;
4314 }
4315
4316 /**
4317  *      dev_set_promiscuity     - update promiscuity count on a device
4318  *      @dev: device
4319  *      @inc: modifier
4320  *
4321  *      Add or remove promiscuity from a device. While the count in the device
4322  *      remains above zero the interface remains promiscuous. Once it hits zero
4323  *      the device reverts back to normal filtering operation. A negative inc
4324  *      value is used to drop promiscuity on the device.
4325  *      Return 0 if successful or a negative errno code on error.
4326  */
4327 int dev_set_promiscuity(struct net_device *dev, int inc)
4328 {
4329         unsigned short old_flags = dev->flags;
4330         int err;
4331
4332         err = __dev_set_promiscuity(dev, inc);
4333         if (err < 0)
4334                 return err;
4335         if (dev->flags != old_flags)
4336                 dev_set_rx_mode(dev);
4337         return err;
4338 }
4339 EXPORT_SYMBOL(dev_set_promiscuity);
4340
4341 /**
4342  *      dev_set_allmulti        - update allmulti count on a device
4343  *      @dev: device
4344  *      @inc: modifier
4345  *
4346  *      Add or remove reception of all multicast frames to a device. While the
4347  *      count in the device remains above zero the interface remains listening
4348  *      to all interfaces. Once it hits zero the device reverts back to normal
4349  *      filtering operation. A negative @inc value is used to drop the counter
4350  *      when releasing a resource needing all multicasts.
4351  *      Return 0 if successful or a negative errno code on error.
4352  */
4353
4354 int dev_set_allmulti(struct net_device *dev, int inc)
4355 {
4356         unsigned short old_flags = dev->flags;
4357
4358         ASSERT_RTNL();
4359
4360         dev->flags |= IFF_ALLMULTI;
4361         dev->allmulti += inc;
4362         if (dev->allmulti == 0) {
4363                 /*
4364                  * Avoid overflow.
4365                  * If inc causes overflow, untouch allmulti and return error.
4366                  */
4367                 if (inc < 0)
4368                         dev->flags &= ~IFF_ALLMULTI;
4369                 else {
4370                         dev->allmulti -= inc;
4371                         printk(KERN_WARNING "%s: allmulti touches roof, "
4372                                 "set allmulti failed, allmulti feature of "
4373                                 "device might be broken.\n", dev->name);
4374                         return -EOVERFLOW;
4375                 }
4376         }
4377         if (dev->flags ^ old_flags) {
4378                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4379                 dev_set_rx_mode(dev);
4380         }
4381         return 0;
4382 }
4383 EXPORT_SYMBOL(dev_set_allmulti);
4384
4385 /*
4386  *      Upload unicast and multicast address lists to device and
4387  *      configure RX filtering. When the device doesn't support unicast
4388  *      filtering it is put in promiscuous mode while unicast addresses
4389  *      are present.
4390  */
4391 void __dev_set_rx_mode(struct net_device *dev)
4392 {
4393         const struct net_device_ops *ops = dev->netdev_ops;
4394
4395         /* dev_open will call this function so the list will stay sane. */
4396         if (!(dev->flags&IFF_UP))
4397                 return;
4398
4399         if (!netif_device_present(dev))
4400                 return;
4401
4402         if (ops->ndo_set_rx_mode)
4403                 ops->ndo_set_rx_mode(dev);
4404         else {
4405                 /* Unicast addresses changes may only happen under the rtnl,
4406                  * therefore calling __dev_set_promiscuity here is safe.
4407                  */
4408                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4409                         __dev_set_promiscuity(dev, 1);
4410                         dev->uc_promisc = 1;
4411                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4412                         __dev_set_promiscuity(dev, -1);
4413                         dev->uc_promisc = 0;
4414                 }
4415
4416                 if (ops->ndo_set_multicast_list)
4417                         ops->ndo_set_multicast_list(dev);
4418         }
4419 }
4420
4421 void dev_set_rx_mode(struct net_device *dev)
4422 {
4423         netif_addr_lock_bh(dev);
4424         __dev_set_rx_mode(dev);
4425         netif_addr_unlock_bh(dev);
4426 }
4427
4428 /**
4429  *      dev_get_flags - get flags reported to userspace
4430  *      @dev: device
4431  *
4432  *      Get the combination of flag bits exported through APIs to userspace.
4433  */
4434 unsigned dev_get_flags(const struct net_device *dev)
4435 {
4436         unsigned flags;
4437
4438         flags = (dev->flags & ~(IFF_PROMISC |
4439                                 IFF_ALLMULTI |
4440                                 IFF_RUNNING |
4441                                 IFF_LOWER_UP |
4442                                 IFF_DORMANT)) |
4443                 (dev->gflags & (IFF_PROMISC |
4444                                 IFF_ALLMULTI));
4445
4446         if (netif_running(dev)) {
4447                 if (netif_oper_up(dev))
4448                         flags |= IFF_RUNNING;
4449                 if (netif_carrier_ok(dev))
4450                         flags |= IFF_LOWER_UP;
4451                 if (netif_dormant(dev))
4452                         flags |= IFF_DORMANT;
4453         }
4454
4455         return flags;
4456 }
4457 EXPORT_SYMBOL(dev_get_flags);
4458
4459 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4460 {
4461         int old_flags = dev->flags;
4462         int ret;
4463
4464         ASSERT_RTNL();
4465
4466         /*
4467          *      Set the flags on our device.
4468          */
4469
4470         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4471                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4472                                IFF_AUTOMEDIA)) |
4473                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4474                                     IFF_ALLMULTI));
4475
4476         /*
4477          *      Load in the correct multicast list now the flags have changed.
4478          */
4479
4480         if ((old_flags ^ flags) & IFF_MULTICAST)
4481                 dev_change_rx_flags(dev, IFF_MULTICAST);
4482
4483         dev_set_rx_mode(dev);
4484
4485         /*
4486          *      Have we downed the interface. We handle IFF_UP ourselves
4487          *      according to user attempts to set it, rather than blindly
4488          *      setting it.
4489          */
4490
4491         ret = 0;
4492         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4493                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4494
4495                 if (!ret)
4496                         dev_set_rx_mode(dev);
4497         }
4498
4499         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4500                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4501
4502                 dev->gflags ^= IFF_PROMISC;
4503                 dev_set_promiscuity(dev, inc);
4504         }
4505
4506         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4507            is important. Some (broken) drivers set IFF_PROMISC, when
4508            IFF_ALLMULTI is requested not asking us and not reporting.
4509          */
4510         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4511                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4512
4513                 dev->gflags ^= IFF_ALLMULTI;
4514                 dev_set_allmulti(dev, inc);
4515         }
4516
4517         return ret;
4518 }
4519
4520 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4521 {
4522         unsigned int changes = dev->flags ^ old_flags;
4523
4524         if (changes & IFF_UP) {
4525                 if (dev->flags & IFF_UP)
4526                         call_netdevice_notifiers(NETDEV_UP, dev);
4527                 else
4528                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4529         }
4530
4531         if (dev->flags & IFF_UP &&
4532             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4533                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4534 }
4535
4536 /**
4537  *      dev_change_flags - change device settings
4538  *      @dev: device
4539  *      @flags: device state flags
4540  *
4541  *      Change settings on device based state flags. The flags are
4542  *      in the userspace exported format.
4543  */
4544 int dev_change_flags(struct net_device *dev, unsigned flags)
4545 {
4546         int ret, changes;
4547         int old_flags = dev->flags;
4548
4549         ret = __dev_change_flags(dev, flags);
4550         if (ret < 0)
4551                 return ret;
4552
4553         changes = old_flags ^ dev->flags;
4554         if (changes)
4555                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4556
4557         __dev_notify_flags(dev, old_flags);
4558         return ret;
4559 }
4560 EXPORT_SYMBOL(dev_change_flags);
4561
4562 /**
4563  *      dev_set_mtu - Change maximum transfer unit
4564  *      @dev: device
4565  *      @new_mtu: new transfer unit
4566  *
4567  *      Change the maximum transfer size of the network device.
4568  */
4569 int dev_set_mtu(struct net_device *dev, int new_mtu)
4570 {
4571         const struct net_device_ops *ops = dev->netdev_ops;
4572         int err;
4573
4574         if (new_mtu == dev->mtu)
4575                 return 0;
4576
4577         /*      MTU must be positive.    */
4578         if (new_mtu < 0)
4579                 return -EINVAL;
4580
4581         if (!netif_device_present(dev))
4582                 return -ENODEV;
4583
4584         err = 0;
4585         if (ops->ndo_change_mtu)
4586                 err = ops->ndo_change_mtu(dev, new_mtu);
4587         else
4588                 dev->mtu = new_mtu;
4589
4590         if (!err && dev->flags & IFF_UP)
4591                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4592         return err;
4593 }
4594 EXPORT_SYMBOL(dev_set_mtu);
4595
4596 /**
4597  *      dev_set_mac_address - Change Media Access Control Address
4598  *      @dev: device
4599  *      @sa: new address
4600  *
4601  *      Change the hardware (MAC) address of the device
4602  */
4603 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4604 {
4605         const struct net_device_ops *ops = dev->netdev_ops;
4606         int err;
4607
4608         if (!ops->ndo_set_mac_address)
4609                 return -EOPNOTSUPP;
4610         if (sa->sa_family != dev->type)
4611                 return -EINVAL;
4612         if (!netif_device_present(dev))
4613                 return -ENODEV;
4614         err = ops->ndo_set_mac_address(dev, sa);
4615         if (!err)
4616                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4617         return err;
4618 }
4619 EXPORT_SYMBOL(dev_set_mac_address);
4620
4621 /*
4622  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4623  */
4624 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4625 {
4626         int err;
4627         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4628
4629         if (!dev)
4630                 return -ENODEV;
4631
4632         switch (cmd) {
4633         case SIOCGIFFLAGS:      /* Get interface flags */
4634                 ifr->ifr_flags = (short) dev_get_flags(dev);
4635                 return 0;
4636
4637         case SIOCGIFMETRIC:     /* Get the metric on the interface
4638                                    (currently unused) */
4639                 ifr->ifr_metric = 0;
4640                 return 0;
4641
4642         case SIOCGIFMTU:        /* Get the MTU of a device */
4643                 ifr->ifr_mtu = dev->mtu;
4644                 return 0;
4645
4646         case SIOCGIFHWADDR:
4647                 if (!dev->addr_len)
4648                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4649                 else
4650                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4651                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4652                 ifr->ifr_hwaddr.sa_family = dev->type;
4653                 return 0;
4654
4655         case SIOCGIFSLAVE:
4656                 err = -EINVAL;
4657                 break;
4658
4659         case SIOCGIFMAP:
4660                 ifr->ifr_map.mem_start = dev->mem_start;
4661                 ifr->ifr_map.mem_end   = dev->mem_end;
4662                 ifr->ifr_map.base_addr = dev->base_addr;
4663                 ifr->ifr_map.irq       = dev->irq;
4664                 ifr->ifr_map.dma       = dev->dma;
4665                 ifr->ifr_map.port      = dev->if_port;
4666                 return 0;
4667
4668         case SIOCGIFINDEX:
4669                 ifr->ifr_ifindex = dev->ifindex;
4670                 return 0;
4671
4672         case SIOCGIFTXQLEN:
4673                 ifr->ifr_qlen = dev->tx_queue_len;
4674                 return 0;
4675
4676         default:
4677                 /* dev_ioctl() should ensure this case
4678                  * is never reached
4679                  */
4680                 WARN_ON(1);
4681                 err = -EINVAL;
4682                 break;
4683
4684         }
4685         return err;
4686 }
4687
4688 /*
4689  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4690  */
4691 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4692 {
4693         int err;
4694         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4695         const struct net_device_ops *ops;
4696
4697         if (!dev)
4698                 return -ENODEV;
4699
4700         ops = dev->netdev_ops;
4701
4702         switch (cmd) {
4703         case SIOCSIFFLAGS:      /* Set interface flags */
4704                 return dev_change_flags(dev, ifr->ifr_flags);
4705
4706         case SIOCSIFMETRIC:     /* Set the metric on the interface
4707                                    (currently unused) */
4708                 return -EOPNOTSUPP;
4709
4710         case SIOCSIFMTU:        /* Set the MTU of a device */
4711                 return dev_set_mtu(dev, ifr->ifr_mtu);
4712
4713         case SIOCSIFHWADDR:
4714                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4715
4716         case SIOCSIFHWBROADCAST:
4717                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4718                         return -EINVAL;
4719                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4720                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4721                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4722                 return 0;
4723
4724         case SIOCSIFMAP:
4725                 if (ops->ndo_set_config) {
4726                         if (!netif_device_present(dev))
4727                                 return -ENODEV;
4728                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4729                 }
4730                 return -EOPNOTSUPP;
4731
4732         case SIOCADDMULTI:
4733                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4734                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4735                         return -EINVAL;
4736                 if (!netif_device_present(dev))
4737                         return -ENODEV;
4738                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4739
4740         case SIOCDELMULTI:
4741                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4742                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4743                         return -EINVAL;
4744                 if (!netif_device_present(dev))
4745                         return -ENODEV;
4746                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4747
4748         case SIOCSIFTXQLEN:
4749                 if (ifr->ifr_qlen < 0)
4750                         return -EINVAL;
4751                 dev->tx_queue_len = ifr->ifr_qlen;
4752                 return 0;
4753
4754         case SIOCSIFNAME:
4755                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4756                 return dev_change_name(dev, ifr->ifr_newname);
4757
4758         /*
4759          *      Unknown or private ioctl
4760          */
4761         default:
4762                 if ((cmd >= SIOCDEVPRIVATE &&
4763                     cmd <= SIOCDEVPRIVATE + 15) ||
4764                     cmd == SIOCBONDENSLAVE ||
4765                     cmd == SIOCBONDRELEASE ||
4766                     cmd == SIOCBONDSETHWADDR ||
4767                     cmd == SIOCBONDSLAVEINFOQUERY ||
4768                     cmd == SIOCBONDINFOQUERY ||
4769                     cmd == SIOCBONDCHANGEACTIVE ||
4770                     cmd == SIOCGMIIPHY ||
4771                     cmd == SIOCGMIIREG ||
4772                     cmd == SIOCSMIIREG ||
4773                     cmd == SIOCBRADDIF ||
4774                     cmd == SIOCBRDELIF ||
4775                     cmd == SIOCSHWTSTAMP ||
4776                     cmd == SIOCWANDEV) {
4777                         err = -EOPNOTSUPP;
4778                         if (ops->ndo_do_ioctl) {
4779                                 if (netif_device_present(dev))
4780                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4781                                 else
4782                                         err = -ENODEV;
4783                         }
4784                 } else
4785                         err = -EINVAL;
4786
4787         }
4788         return err;
4789 }
4790
4791 /*
4792  *      This function handles all "interface"-type I/O control requests. The actual
4793  *      'doing' part of this is dev_ifsioc above.
4794  */
4795
4796 /**
4797  *      dev_ioctl       -       network device ioctl
4798  *      @net: the applicable net namespace
4799  *      @cmd: command to issue
4800  *      @arg: pointer to a struct ifreq in user space
4801  *
4802  *      Issue ioctl functions to devices. This is normally called by the
4803  *      user space syscall interfaces but can sometimes be useful for
4804  *      other purposes. The return value is the return from the syscall if
4805  *      positive or a negative errno code on error.
4806  */
4807
4808 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4809 {
4810         struct ifreq ifr;
4811         int ret;
4812         char *colon;
4813
4814         /* One special case: SIOCGIFCONF takes ifconf argument
4815            and requires shared lock, because it sleeps writing
4816            to user space.
4817          */
4818
4819         if (cmd == SIOCGIFCONF) {
4820                 rtnl_lock();
4821                 ret = dev_ifconf(net, (char __user *) arg);
4822                 rtnl_unlock();
4823                 return ret;
4824         }
4825         if (cmd == SIOCGIFNAME)
4826                 return dev_ifname(net, (struct ifreq __user *)arg);
4827
4828         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4829                 return -EFAULT;
4830
4831         ifr.ifr_name[IFNAMSIZ-1] = 0;
4832
4833         colon = strchr(ifr.ifr_name, ':');
4834         if (colon)
4835                 *colon = 0;
4836
4837         /*
4838          *      See which interface the caller is talking about.
4839          */
4840
4841         switch (cmd) {
4842         /*
4843          *      These ioctl calls:
4844          *      - can be done by all.
4845          *      - atomic and do not require locking.
4846          *      - return a value
4847          */
4848         case SIOCGIFFLAGS:
4849         case SIOCGIFMETRIC:
4850         case SIOCGIFMTU:
4851         case SIOCGIFHWADDR:
4852         case SIOCGIFSLAVE:
4853         case SIOCGIFMAP:
4854         case SIOCGIFINDEX:
4855         case SIOCGIFTXQLEN:
4856                 dev_load(net, ifr.ifr_name);
4857                 rcu_read_lock();
4858                 ret = dev_ifsioc_locked(net, &ifr, cmd);
4859                 rcu_read_unlock();
4860                 if (!ret) {
4861                         if (colon)
4862                                 *colon = ':';
4863                         if (copy_to_user(arg, &ifr,
4864                                          sizeof(struct ifreq)))
4865                                 ret = -EFAULT;
4866                 }
4867                 return ret;
4868
4869         case SIOCETHTOOL:
4870                 dev_load(net, ifr.ifr_name);
4871                 rtnl_lock();
4872                 ret = dev_ethtool(net, &ifr);
4873                 rtnl_unlock();
4874                 if (!ret) {
4875                         if (colon)
4876                                 *colon = ':';
4877                         if (copy_to_user(arg, &ifr,
4878                                          sizeof(struct ifreq)))
4879                                 ret = -EFAULT;
4880                 }
4881                 return ret;
4882
4883         /*
4884          *      These ioctl calls:
4885          *      - require superuser power.
4886          *      - require strict serialization.
4887          *      - return a value
4888          */
4889         case SIOCGMIIPHY:
4890         case SIOCGMIIREG:
4891         case SIOCSIFNAME:
4892                 if (!capable(CAP_NET_ADMIN))
4893                         return -EPERM;
4894                 dev_load(net, ifr.ifr_name);
4895                 rtnl_lock();
4896                 ret = dev_ifsioc(net, &ifr, cmd);
4897                 rtnl_unlock();
4898                 if (!ret) {
4899                         if (colon)
4900                                 *colon = ':';
4901                         if (copy_to_user(arg, &ifr,
4902                                          sizeof(struct ifreq)))
4903                                 ret = -EFAULT;
4904                 }
4905                 return ret;
4906
4907         /*
4908          *      These ioctl calls:
4909          *      - require superuser power.
4910          *      - require strict serialization.
4911          *      - do not return a value
4912          */
4913         case SIOCSIFFLAGS:
4914         case SIOCSIFMETRIC:
4915         case SIOCSIFMTU:
4916         case SIOCSIFMAP:
4917         case SIOCSIFHWADDR:
4918         case SIOCSIFSLAVE:
4919         case SIOCADDMULTI:
4920         case SIOCDELMULTI:
4921         case SIOCSIFHWBROADCAST:
4922         case SIOCSIFTXQLEN:
4923         case SIOCSMIIREG:
4924         case SIOCBONDENSLAVE:
4925         case SIOCBONDRELEASE:
4926         case SIOCBONDSETHWADDR:
4927         case SIOCBONDCHANGEACTIVE:
4928         case SIOCBRADDIF:
4929         case SIOCBRDELIF:
4930         case SIOCSHWTSTAMP:
4931                 if (!capable(CAP_NET_ADMIN))
4932                         return -EPERM;
4933                 /* fall through */
4934         case SIOCBONDSLAVEINFOQUERY:
4935         case SIOCBONDINFOQUERY:
4936                 dev_load(net, ifr.ifr_name);
4937                 rtnl_lock();
4938                 ret = dev_ifsioc(net, &ifr, cmd);
4939                 rtnl_unlock();
4940                 return ret;
4941
4942         case SIOCGIFMEM:
4943                 /* Get the per device memory space. We can add this but
4944                  * currently do not support it */
4945         case SIOCSIFMEM:
4946                 /* Set the per device memory buffer space.
4947                  * Not applicable in our case */
4948         case SIOCSIFLINK:
4949                 return -EINVAL;
4950
4951         /*
4952          *      Unknown or private ioctl.
4953          */
4954         default:
4955                 if (cmd == SIOCWANDEV ||
4956                     (cmd >= SIOCDEVPRIVATE &&
4957                      cmd <= SIOCDEVPRIVATE + 15)) {
4958                         dev_load(net, ifr.ifr_name);
4959                         rtnl_lock();
4960                         ret = dev_ifsioc(net, &ifr, cmd);
4961                         rtnl_unlock();
4962                         if (!ret && copy_to_user(arg, &ifr,
4963                                                  sizeof(struct ifreq)))
4964                                 ret = -EFAULT;
4965                         return ret;
4966                 }
4967                 /* Take care of Wireless Extensions */
4968                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4969                         return wext_handle_ioctl(net, &ifr, cmd, arg);
4970                 return -EINVAL;
4971         }
4972 }
4973
4974
4975 /**
4976  *      dev_new_index   -       allocate an ifindex
4977  *      @net: the applicable net namespace
4978  *
4979  *      Returns a suitable unique value for a new device interface
4980  *      number.  The caller must hold the rtnl semaphore or the
4981  *      dev_base_lock to be sure it remains unique.
4982  */
4983 static int dev_new_index(struct net *net)
4984 {
4985         static int ifindex;
4986         for (;;) {
4987                 if (++ifindex <= 0)
4988                         ifindex = 1;
4989                 if (!__dev_get_by_index(net, ifindex))
4990                         return ifindex;
4991         }
4992 }
4993
4994 /* Delayed registration/unregisteration */
4995 static LIST_HEAD(net_todo_list);
4996
4997 static void net_set_todo(struct net_device *dev)
4998 {
4999         list_add_tail(&dev->todo_list, &net_todo_list);
5000 }
5001
5002 static void rollback_registered_many(struct list_head *head)
5003 {
5004         struct net_device *dev, *tmp;
5005
5006         BUG_ON(dev_boot_phase);
5007         ASSERT_RTNL();
5008
5009         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5010                 /* Some devices call without registering
5011                  * for initialization unwind. Remove those
5012                  * devices and proceed with the remaining.
5013                  */
5014                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5015                         pr_debug("unregister_netdevice: device %s/%p never "
5016                                  "was registered\n", dev->name, dev);
5017
5018                         WARN_ON(1);
5019                         list_del(&dev->unreg_list);
5020                         continue;
5021                 }
5022
5023                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5024         }
5025
5026         /* If device is running, close it first. */
5027         dev_close_many(head);
5028
5029         list_for_each_entry(dev, head, unreg_list) {
5030                 /* And unlink it from device chain. */
5031                 unlist_netdevice(dev);
5032
5033                 dev->reg_state = NETREG_UNREGISTERING;
5034         }
5035
5036         synchronize_net();
5037
5038         list_for_each_entry(dev, head, unreg_list) {
5039                 /* Shutdown queueing discipline. */
5040                 dev_shutdown(dev);
5041
5042
5043                 /* Notify protocols, that we are about to destroy
5044                    this device. They should clean all the things.
5045                 */
5046                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5047
5048                 if (!dev->rtnl_link_ops ||
5049                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5050                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5051
5052                 /*
5053                  *      Flush the unicast and multicast chains
5054                  */
5055                 dev_uc_flush(dev);
5056                 dev_mc_flush(dev);
5057
5058                 if (dev->netdev_ops->ndo_uninit)
5059                         dev->netdev_ops->ndo_uninit(dev);
5060
5061                 /* Notifier chain MUST detach us from master device. */
5062                 WARN_ON(dev->master);
5063
5064                 /* Remove entries from kobject tree */
5065                 netdev_unregister_kobject(dev);
5066         }
5067
5068         /* Process any work delayed until the end of the batch */
5069         dev = list_first_entry(head, struct net_device, unreg_list);
5070         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5071
5072         rcu_barrier();
5073
5074         list_for_each_entry(dev, head, unreg_list)
5075                 dev_put(dev);
5076 }
5077
5078 static void rollback_registered(struct net_device *dev)
5079 {
5080         LIST_HEAD(single);
5081
5082         list_add(&dev->unreg_list, &single);
5083         rollback_registered_many(&single);
5084 }
5085
5086 unsigned long netdev_fix_features(unsigned long features, const char *name)
5087 {
5088         /* Fix illegal SG+CSUM combinations. */
5089         if ((features & NETIF_F_SG) &&
5090             !(features & NETIF_F_ALL_CSUM)) {
5091                 if (name)
5092                         printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
5093                                "checksum feature.\n", name);
5094                 features &= ~NETIF_F_SG;
5095         }
5096
5097         /* TSO requires that SG is present as well. */
5098         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5099                 if (name)
5100                         printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
5101                                "SG feature.\n", name);
5102                 features &= ~NETIF_F_TSO;
5103         }
5104
5105         if (features & NETIF_F_UFO) {
5106                 /* maybe split UFO into V4 and V6? */
5107                 if (!((features & NETIF_F_GEN_CSUM) ||
5108                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5109                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5110                         if (name)
5111                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5112                                        "since no checksum offload features.\n",
5113                                        name);
5114                         features &= ~NETIF_F_UFO;
5115                 }
5116
5117                 if (!(features & NETIF_F_SG)) {
5118                         if (name)
5119                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5120                                        "since no NETIF_F_SG feature.\n", name);
5121                         features &= ~NETIF_F_UFO;
5122                 }
5123         }
5124
5125         return features;
5126 }
5127 EXPORT_SYMBOL(netdev_fix_features);
5128
5129 /**
5130  *      netif_stacked_transfer_operstate -      transfer operstate
5131  *      @rootdev: the root or lower level device to transfer state from
5132  *      @dev: the device to transfer operstate to
5133  *
5134  *      Transfer operational state from root to device. This is normally
5135  *      called when a stacking relationship exists between the root
5136  *      device and the device(a leaf device).
5137  */
5138 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5139                                         struct net_device *dev)
5140 {
5141         if (rootdev->operstate == IF_OPER_DORMANT)
5142                 netif_dormant_on(dev);
5143         else
5144                 netif_dormant_off(dev);
5145
5146         if (netif_carrier_ok(rootdev)) {
5147                 if (!netif_carrier_ok(dev))
5148                         netif_carrier_on(dev);
5149         } else {
5150                 if (netif_carrier_ok(dev))
5151                         netif_carrier_off(dev);
5152         }
5153 }
5154 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5155
5156 #ifdef CONFIG_RPS
5157 static int netif_alloc_rx_queues(struct net_device *dev)
5158 {
5159         unsigned int i, count = dev->num_rx_queues;
5160         struct netdev_rx_queue *rx;
5161
5162         BUG_ON(count < 1);
5163
5164         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5165         if (!rx) {
5166                 pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5167                 return -ENOMEM;
5168         }
5169         dev->_rx = rx;
5170
5171         for (i = 0; i < count; i++)
5172                 rx[i].dev = dev;
5173         return 0;
5174 }
5175 #endif
5176
5177 static void netdev_init_one_queue(struct net_device *dev,
5178                                   struct netdev_queue *queue, void *_unused)
5179 {
5180         /* Initialize queue lock */
5181         spin_lock_init(&queue->_xmit_lock);
5182         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5183         queue->xmit_lock_owner = -1;
5184         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5185         queue->dev = dev;
5186 }
5187
5188 static int netif_alloc_netdev_queues(struct net_device *dev)
5189 {
5190         unsigned int count = dev->num_tx_queues;
5191         struct netdev_queue *tx;
5192
5193         BUG_ON(count < 1);
5194
5195         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5196         if (!tx) {
5197                 pr_err("netdev: Unable to allocate %u tx queues.\n",
5198                        count);
5199                 return -ENOMEM;
5200         }
5201         dev->_tx = tx;
5202
5203         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5204         spin_lock_init(&dev->tx_global_lock);
5205
5206         return 0;
5207 }
5208
5209 /**
5210  *      register_netdevice      - register a network device
5211  *      @dev: device to register
5212  *
5213  *      Take a completed network device structure and add it to the kernel
5214  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5215  *      chain. 0 is returned on success. A negative errno code is returned
5216  *      on a failure to set up the device, or if the name is a duplicate.
5217  *
5218  *      Callers must hold the rtnl semaphore. You may want
5219  *      register_netdev() instead of this.
5220  *
5221  *      BUGS:
5222  *      The locking appears insufficient to guarantee two parallel registers
5223  *      will not get the same name.
5224  */
5225
5226 int register_netdevice(struct net_device *dev)
5227 {
5228         int ret;
5229         struct net *net = dev_net(dev);
5230
5231         BUG_ON(dev_boot_phase);
5232         ASSERT_RTNL();
5233
5234         might_sleep();
5235
5236         /* When net_device's are persistent, this will be fatal. */
5237         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5238         BUG_ON(!net);
5239
5240         spin_lock_init(&dev->addr_list_lock);
5241         netdev_set_addr_lockdep_class(dev);
5242
5243         dev->iflink = -1;
5244
5245         /* Init, if this function is available */
5246         if (dev->netdev_ops->ndo_init) {
5247                 ret = dev->netdev_ops->ndo_init(dev);
5248                 if (ret) {
5249                         if (ret > 0)
5250                                 ret = -EIO;
5251                         goto out;
5252                 }
5253         }
5254
5255         ret = dev_get_valid_name(dev, dev->name, 0);
5256         if (ret)
5257                 goto err_uninit;
5258
5259         dev->ifindex = dev_new_index(net);
5260         if (dev->iflink == -1)
5261                 dev->iflink = dev->ifindex;
5262
5263         /* Fix illegal checksum combinations */
5264         if ((dev->features & NETIF_F_HW_CSUM) &&
5265             (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5266                 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5267                        dev->name);
5268                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5269         }
5270
5271         if ((dev->features & NETIF_F_NO_CSUM) &&
5272             (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5273                 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5274                        dev->name);
5275                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5276         }
5277
5278         dev->features = netdev_fix_features(dev->features, dev->name);
5279
5280         /* Enable software GSO if SG is supported. */
5281         if (dev->features & NETIF_F_SG)
5282                 dev->features |= NETIF_F_GSO;
5283
5284         /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5285          * vlan_dev_init() will do the dev->features check, so these features
5286          * are enabled only if supported by underlying device.
5287          */
5288         dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5289
5290         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5291         ret = notifier_to_errno(ret);
5292         if (ret)
5293                 goto err_uninit;
5294
5295         ret = netdev_register_kobject(dev);
5296         if (ret)
5297                 goto err_uninit;
5298         dev->reg_state = NETREG_REGISTERED;
5299
5300         /*
5301          *      Default initial state at registry is that the
5302          *      device is present.
5303          */
5304
5305         set_bit(__LINK_STATE_PRESENT, &dev->state);
5306
5307         dev_init_scheduler(dev);
5308         dev_hold(dev);
5309         list_netdevice(dev);
5310
5311         /* Notify protocols, that a new device appeared. */
5312         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5313         ret = notifier_to_errno(ret);
5314         if (ret) {
5315                 rollback_registered(dev);
5316                 dev->reg_state = NETREG_UNREGISTERED;
5317         }
5318         /*
5319          *      Prevent userspace races by waiting until the network
5320          *      device is fully setup before sending notifications.
5321          */
5322         if (!dev->rtnl_link_ops ||
5323             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5324                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5325
5326 out:
5327         return ret;
5328
5329 err_uninit:
5330         if (dev->netdev_ops->ndo_uninit)
5331                 dev->netdev_ops->ndo_uninit(dev);
5332         goto out;
5333 }
5334 EXPORT_SYMBOL(register_netdevice);
5335
5336 /**
5337  *      init_dummy_netdev       - init a dummy network device for NAPI
5338  *      @dev: device to init
5339  *
5340  *      This takes a network device structure and initialize the minimum
5341  *      amount of fields so it can be used to schedule NAPI polls without
5342  *      registering a full blown interface. This is to be used by drivers
5343  *      that need to tie several hardware interfaces to a single NAPI
5344  *      poll scheduler due to HW limitations.
5345  */
5346 int init_dummy_netdev(struct net_device *dev)
5347 {
5348         /* Clear everything. Note we don't initialize spinlocks
5349          * are they aren't supposed to be taken by any of the
5350          * NAPI code and this dummy netdev is supposed to be
5351          * only ever used for NAPI polls
5352          */
5353         memset(dev, 0, sizeof(struct net_device));
5354
5355         /* make sure we BUG if trying to hit standard
5356          * register/unregister code path
5357          */
5358         dev->reg_state = NETREG_DUMMY;
5359
5360         /* NAPI wants this */
5361         INIT_LIST_HEAD(&dev->napi_list);
5362
5363         /* a dummy interface is started by default */
5364         set_bit(__LINK_STATE_PRESENT, &dev->state);
5365         set_bit(__LINK_STATE_START, &dev->state);
5366
5367         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5368          * because users of this 'device' dont need to change
5369          * its refcount.
5370          */
5371
5372         return 0;
5373 }
5374 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5375
5376
5377 /**
5378  *      register_netdev - register a network device
5379  *      @dev: device to register
5380  *
5381  *      Take a completed network device structure and add it to the kernel
5382  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5383  *      chain. 0 is returned on success. A negative errno code is returned
5384  *      on a failure to set up the device, or if the name is a duplicate.
5385  *
5386  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5387  *      and expands the device name if you passed a format string to
5388  *      alloc_netdev.
5389  */
5390 int register_netdev(struct net_device *dev)
5391 {
5392         int err;
5393
5394         rtnl_lock();
5395
5396         /*
5397          * If the name is a format string the caller wants us to do a
5398          * name allocation.
5399          */
5400         if (strchr(dev->name, '%')) {
5401                 err = dev_alloc_name(dev, dev->name);
5402                 if (err < 0)
5403                         goto out;
5404         }
5405
5406         err = register_netdevice(dev);
5407 out:
5408         rtnl_unlock();
5409         return err;
5410 }
5411 EXPORT_SYMBOL(register_netdev);
5412
5413 int netdev_refcnt_read(const struct net_device *dev)
5414 {
5415         int i, refcnt = 0;
5416
5417         for_each_possible_cpu(i)
5418                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5419         return refcnt;
5420 }
5421 EXPORT_SYMBOL(netdev_refcnt_read);
5422
5423 /*
5424  * netdev_wait_allrefs - wait until all references are gone.
5425  *
5426  * This is called when unregistering network devices.
5427  *
5428  * Any protocol or device that holds a reference should register
5429  * for netdevice notification, and cleanup and put back the
5430  * reference if they receive an UNREGISTER event.
5431  * We can get stuck here if buggy protocols don't correctly
5432  * call dev_put.
5433  */
5434 static void netdev_wait_allrefs(struct net_device *dev)
5435 {
5436         unsigned long rebroadcast_time, warning_time;
5437         int refcnt;
5438
5439         linkwatch_forget_dev(dev);
5440
5441         rebroadcast_time = warning_time = jiffies;
5442         refcnt = netdev_refcnt_read(dev);
5443
5444         while (refcnt != 0) {
5445                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5446                         rtnl_lock();
5447
5448                         /* Rebroadcast unregister notification */
5449                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5450                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5451                          * should have already handle it the first time */
5452
5453                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5454                                      &dev->state)) {
5455                                 /* We must not have linkwatch events
5456                                  * pending on unregister. If this
5457                                  * happens, we simply run the queue
5458                                  * unscheduled, resulting in a noop
5459                                  * for this device.
5460                                  */
5461                                 linkwatch_run_queue();
5462                         }
5463
5464                         __rtnl_unlock();
5465
5466                         rebroadcast_time = jiffies;
5467                 }
5468
5469                 msleep(250);
5470
5471                 refcnt = netdev_refcnt_read(dev);
5472
5473                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5474                         printk(KERN_EMERG "unregister_netdevice: "
5475                                "waiting for %s to become free. Usage "
5476                                "count = %d\n",
5477                                dev->name, refcnt);
5478                         warning_time = jiffies;
5479                 }
5480         }
5481 }
5482
5483 /* The sequence is:
5484  *
5485  *      rtnl_lock();
5486  *      ...
5487  *      register_netdevice(x1);
5488  *      register_netdevice(x2);
5489  *      ...
5490  *      unregister_netdevice(y1);
5491  *      unregister_netdevice(y2);
5492  *      ...
5493  *      rtnl_unlock();
5494  *      free_netdev(y1);
5495  *      free_netdev(y2);
5496  *
5497  * We are invoked by rtnl_unlock().
5498  * This allows us to deal with problems:
5499  * 1) We can delete sysfs objects which invoke hotplug
5500  *    without deadlocking with linkwatch via keventd.
5501  * 2) Since we run with the RTNL semaphore not held, we can sleep
5502  *    safely in order to wait for the netdev refcnt to drop to zero.
5503  *
5504  * We must not return until all unregister events added during
5505  * the interval the lock was held have been completed.
5506  */
5507 void netdev_run_todo(void)
5508 {
5509         struct list_head list;
5510
5511         /* Snapshot list, allow later requests */
5512         list_replace_init(&net_todo_list, &list);
5513
5514         __rtnl_unlock();
5515
5516         while (!list_empty(&list)) {
5517                 struct net_device *dev
5518                         = list_first_entry(&list, struct net_device, todo_list);
5519                 list_del(&dev->todo_list);
5520
5521                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5522                         printk(KERN_ERR "network todo '%s' but state %d\n",
5523                                dev->name, dev->reg_state);
5524                         dump_stack();
5525                         continue;
5526                 }
5527
5528                 dev->reg_state = NETREG_UNREGISTERED;
5529
5530                 on_each_cpu(flush_backlog, dev, 1);
5531
5532                 netdev_wait_allrefs(dev);
5533
5534                 /* paranoia */
5535                 BUG_ON(netdev_refcnt_read(dev));
5536                 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5537                 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5538                 WARN_ON(dev->dn_ptr);
5539
5540                 if (dev->destructor)
5541                         dev->destructor(dev);
5542
5543                 /* Free network device */
5544                 kobject_put(&dev->dev.kobj);
5545         }
5546 }
5547
5548 /**
5549  *      dev_txq_stats_fold - fold tx_queues stats
5550  *      @dev: device to get statistics from
5551  *      @stats: struct rtnl_link_stats64 to hold results
5552  */
5553 void dev_txq_stats_fold(const struct net_device *dev,
5554                         struct rtnl_link_stats64 *stats)
5555 {
5556         u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5557         unsigned int i;
5558         struct netdev_queue *txq;
5559
5560         for (i = 0; i < dev->num_tx_queues; i++) {
5561                 txq = netdev_get_tx_queue(dev, i);
5562                 spin_lock_bh(&txq->_xmit_lock);
5563                 tx_bytes   += txq->tx_bytes;
5564                 tx_packets += txq->tx_packets;
5565                 tx_dropped += txq->tx_dropped;
5566                 spin_unlock_bh(&txq->_xmit_lock);
5567         }
5568         if (tx_bytes || tx_packets || tx_dropped) {
5569                 stats->tx_bytes   = tx_bytes;
5570                 stats->tx_packets = tx_packets;
5571                 stats->tx_dropped = tx_dropped;
5572         }
5573 }
5574 EXPORT_SYMBOL(dev_txq_stats_fold);
5575
5576 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5577  * fields in the same order, with only the type differing.
5578  */
5579 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5580                                     const struct net_device_stats *netdev_stats)
5581 {
5582 #if BITS_PER_LONG == 64
5583         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5584         memcpy(stats64, netdev_stats, sizeof(*stats64));
5585 #else
5586         size_t i, n = sizeof(*stats64) / sizeof(u64);
5587         const unsigned long *src = (const unsigned long *)netdev_stats;
5588         u64 *dst = (u64 *)stats64;
5589
5590         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5591                      sizeof(*stats64) / sizeof(u64));
5592         for (i = 0; i < n; i++)
5593                 dst[i] = src[i];
5594 #endif
5595 }
5596
5597 /**
5598  *      dev_get_stats   - get network device statistics
5599  *      @dev: device to get statistics from
5600  *      @storage: place to store stats
5601  *
5602  *      Get network statistics from device. Return @storage.
5603  *      The device driver may provide its own method by setting
5604  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5605  *      otherwise the internal statistics structure is used.
5606  */
5607 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5608                                         struct rtnl_link_stats64 *storage)
5609 {
5610         const struct net_device_ops *ops = dev->netdev_ops;
5611
5612         if (ops->ndo_get_stats64) {
5613                 memset(storage, 0, sizeof(*storage));
5614                 ops->ndo_get_stats64(dev, storage);
5615         } else if (ops->ndo_get_stats) {
5616                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5617         } else {
5618                 netdev_stats_to_stats64(storage, &dev->stats);
5619                 dev_txq_stats_fold(dev, storage);
5620         }
5621         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5622         return storage;
5623 }
5624 EXPORT_SYMBOL(dev_get_stats);
5625
5626 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5627 {
5628         struct netdev_queue *queue = dev_ingress_queue(dev);
5629
5630 #ifdef CONFIG_NET_CLS_ACT
5631         if (queue)
5632                 return queue;
5633         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5634         if (!queue)
5635                 return NULL;
5636         netdev_init_one_queue(dev, queue, NULL);
5637         queue->qdisc = &noop_qdisc;
5638         queue->qdisc_sleeping = &noop_qdisc;
5639         rcu_assign_pointer(dev->ingress_queue, queue);
5640 #endif
5641         return queue;
5642 }
5643
5644 /**
5645  *      alloc_netdev_mq - allocate network device
5646  *      @sizeof_priv:   size of private data to allocate space for
5647  *      @name:          device name format string
5648  *      @setup:         callback to initialize device
5649  *      @queue_count:   the number of subqueues to allocate
5650  *
5651  *      Allocates a struct net_device with private data area for driver use
5652  *      and performs basic initialization.  Also allocates subquue structs
5653  *      for each queue on the device at the end of the netdevice.
5654  */
5655 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5656                 void (*setup)(struct net_device *), unsigned int queue_count)
5657 {
5658         struct net_device *dev;
5659         size_t alloc_size;
5660         struct net_device *p;
5661
5662         BUG_ON(strlen(name) >= sizeof(dev->name));
5663
5664         if (queue_count < 1) {
5665                 pr_err("alloc_netdev: Unable to allocate device "
5666                        "with zero queues.\n");
5667                 return NULL;
5668         }
5669
5670         alloc_size = sizeof(struct net_device);
5671         if (sizeof_priv) {
5672                 /* ensure 32-byte alignment of private area */
5673                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5674                 alloc_size += sizeof_priv;
5675         }
5676         /* ensure 32-byte alignment of whole construct */
5677         alloc_size += NETDEV_ALIGN - 1;
5678
5679         p = kzalloc(alloc_size, GFP_KERNEL);
5680         if (!p) {
5681                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5682                 return NULL;
5683         }
5684
5685         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5686         dev->padded = (char *)dev - (char *)p;
5687
5688         dev->pcpu_refcnt = alloc_percpu(int);
5689         if (!dev->pcpu_refcnt)
5690                 goto free_p;
5691
5692         if (dev_addr_init(dev))
5693                 goto free_pcpu;
5694
5695         dev_mc_init(dev);
5696         dev_uc_init(dev);
5697
5698         dev_net_set(dev, &init_net);
5699
5700         dev->num_tx_queues = queue_count;
5701         dev->real_num_tx_queues = queue_count;
5702         if (netif_alloc_netdev_queues(dev))
5703                 goto free_pcpu;
5704
5705 #ifdef CONFIG_RPS
5706         dev->num_rx_queues = queue_count;
5707         dev->real_num_rx_queues = queue_count;
5708         if (netif_alloc_rx_queues(dev))
5709                 goto free_pcpu;
5710 #endif
5711
5712         dev->gso_max_size = GSO_MAX_SIZE;
5713
5714         INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5715         dev->ethtool_ntuple_list.count = 0;
5716         INIT_LIST_HEAD(&dev->napi_list);
5717         INIT_LIST_HEAD(&dev->unreg_list);
5718         INIT_LIST_HEAD(&dev->link_watch_list);
5719         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5720         setup(dev);
5721         strcpy(dev->name, name);
5722         return dev;
5723
5724 free_pcpu:
5725         free_percpu(dev->pcpu_refcnt);
5726         kfree(dev->_tx);
5727 #ifdef CONFIG_RPS
5728         kfree(dev->_rx);
5729 #endif
5730
5731 free_p:
5732         kfree(p);
5733         return NULL;
5734 }
5735 EXPORT_SYMBOL(alloc_netdev_mq);
5736
5737 /**
5738  *      free_netdev - free network device
5739  *      @dev: device
5740  *
5741  *      This function does the last stage of destroying an allocated device
5742  *      interface. The reference to the device object is released.
5743  *      If this is the last reference then it will be freed.
5744  */
5745 void free_netdev(struct net_device *dev)
5746 {
5747         struct napi_struct *p, *n;
5748
5749         release_net(dev_net(dev));
5750
5751         kfree(dev->_tx);
5752 #ifdef CONFIG_RPS
5753         kfree(dev->_rx);
5754 #endif
5755
5756         kfree(rcu_dereference_raw(dev->ingress_queue));
5757
5758         /* Flush device addresses */
5759         dev_addr_flush(dev);
5760
5761         /* Clear ethtool n-tuple list */
5762         ethtool_ntuple_flush(dev);
5763
5764         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5765                 netif_napi_del(p);
5766
5767         free_percpu(dev->pcpu_refcnt);
5768         dev->pcpu_refcnt = NULL;
5769
5770         /*  Compatibility with error handling in drivers */
5771         if (dev->reg_state == NETREG_UNINITIALIZED) {
5772                 kfree((char *)dev - dev->padded);
5773                 return;
5774         }
5775
5776         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5777         dev->reg_state = NETREG_RELEASED;
5778
5779         /* will free via device release */
5780         put_device(&dev->dev);
5781 }
5782 EXPORT_SYMBOL(free_netdev);
5783
5784 /**
5785  *      synchronize_net -  Synchronize with packet receive processing
5786  *
5787  *      Wait for packets currently being received to be done.
5788  *      Does not block later packets from starting.
5789  */
5790 void synchronize_net(void)
5791 {
5792         might_sleep();
5793         synchronize_rcu();
5794 }
5795 EXPORT_SYMBOL(synchronize_net);
5796
5797 /**
5798  *      unregister_netdevice_queue - remove device from the kernel
5799  *      @dev: device
5800  *      @head: list
5801  *
5802  *      This function shuts down a device interface and removes it
5803  *      from the kernel tables.
5804  *      If head not NULL, device is queued to be unregistered later.
5805  *
5806  *      Callers must hold the rtnl semaphore.  You may want
5807  *      unregister_netdev() instead of this.
5808  */
5809
5810 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5811 {
5812         ASSERT_RTNL();
5813
5814         if (head) {
5815                 list_move_tail(&dev->unreg_list, head);
5816         } else {
5817                 rollback_registered(dev);
5818                 /* Finish processing unregister after unlock */
5819                 net_set_todo(dev);
5820         }
5821 }
5822 EXPORT_SYMBOL(unregister_netdevice_queue);
5823
5824 /**
5825  *      unregister_netdevice_many - unregister many devices
5826  *      @head: list of devices
5827  */
5828 void unregister_netdevice_many(struct list_head *head)
5829 {
5830         struct net_device *dev;
5831
5832         if (!list_empty(head)) {
5833                 rollback_registered_many(head);
5834                 list_for_each_entry(dev, head, unreg_list)
5835                         net_set_todo(dev);
5836         }
5837 }
5838 EXPORT_SYMBOL(unregister_netdevice_many);
5839
5840 /**
5841  *      unregister_netdev - remove device from the kernel
5842  *      @dev: device
5843  *
5844  *      This function shuts down a device interface and removes it
5845  *      from the kernel tables.
5846  *
5847  *      This is just a wrapper for unregister_netdevice that takes
5848  *      the rtnl semaphore.  In general you want to use this and not
5849  *      unregister_netdevice.
5850  */
5851 void unregister_netdev(struct net_device *dev)
5852 {
5853         rtnl_lock();
5854         unregister_netdevice(dev);
5855         rtnl_unlock();
5856 }
5857 EXPORT_SYMBOL(unregister_netdev);
5858
5859 /**
5860  *      dev_change_net_namespace - move device to different nethost namespace
5861  *      @dev: device
5862  *      @net: network namespace
5863  *      @pat: If not NULL name pattern to try if the current device name
5864  *            is already taken in the destination network namespace.
5865  *
5866  *      This function shuts down a device interface and moves it
5867  *      to a new network namespace. On success 0 is returned, on
5868  *      a failure a netagive errno code is returned.
5869  *
5870  *      Callers must hold the rtnl semaphore.
5871  */
5872
5873 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5874 {
5875         int err;
5876
5877         ASSERT_RTNL();
5878
5879         /* Don't allow namespace local devices to be moved. */
5880         err = -EINVAL;
5881         if (dev->features & NETIF_F_NETNS_LOCAL)
5882                 goto out;
5883
5884         /* Ensure the device has been registrered */
5885         err = -EINVAL;
5886         if (dev->reg_state != NETREG_REGISTERED)
5887                 goto out;
5888
5889         /* Get out if there is nothing todo */
5890         err = 0;
5891         if (net_eq(dev_net(dev), net))
5892                 goto out;
5893
5894         /* Pick the destination device name, and ensure
5895          * we can use it in the destination network namespace.
5896          */
5897         err = -EEXIST;
5898         if (__dev_get_by_name(net, dev->name)) {
5899                 /* We get here if we can't use the current device name */
5900                 if (!pat)
5901                         goto out;
5902                 if (dev_get_valid_name(dev, pat, 1))
5903                         goto out;
5904         }
5905
5906         /*
5907          * And now a mini version of register_netdevice unregister_netdevice.
5908          */
5909
5910         /* If device is running close it first. */
5911         dev_close(dev);
5912
5913         /* And unlink it from device chain */
5914         err = -ENODEV;
5915         unlist_netdevice(dev);
5916
5917         synchronize_net();
5918
5919         /* Shutdown queueing discipline. */
5920         dev_shutdown(dev);
5921
5922         /* Notify protocols, that we are about to destroy
5923            this device. They should clean all the things.
5924
5925            Note that dev->reg_state stays at NETREG_REGISTERED.
5926            This is wanted because this way 8021q and macvlan know
5927            the device is just moving and can keep their slaves up.
5928         */
5929         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5930         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5931
5932         /*
5933          *      Flush the unicast and multicast chains
5934          */
5935         dev_uc_flush(dev);
5936         dev_mc_flush(dev);
5937
5938         /* Actually switch the network namespace */
5939         dev_net_set(dev, net);
5940
5941         /* If there is an ifindex conflict assign a new one */
5942         if (__dev_get_by_index(net, dev->ifindex)) {
5943                 int iflink = (dev->iflink == dev->ifindex);
5944                 dev->ifindex = dev_new_index(net);
5945                 if (iflink)
5946                         dev->iflink = dev->ifindex;
5947         }
5948
5949         /* Fixup kobjects */
5950         err = device_rename(&dev->dev, dev->name);
5951         WARN_ON(err);
5952
5953         /* Add the device back in the hashes */
5954         list_netdevice(dev);
5955
5956         /* Notify protocols, that a new device appeared. */
5957         call_netdevice_notifiers(NETDEV_REGISTER, dev);
5958
5959         /*
5960          *      Prevent userspace races by waiting until the network
5961          *      device is fully setup before sending notifications.
5962          */
5963         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5964
5965         synchronize_net();
5966         err = 0;
5967 out:
5968         return err;
5969 }
5970 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5971
5972 static int dev_cpu_callback(struct notifier_block *nfb,
5973                             unsigned long action,
5974                             void *ocpu)
5975 {
5976         struct sk_buff **list_skb;
5977         struct sk_buff *skb;
5978         unsigned int cpu, oldcpu = (unsigned long)ocpu;
5979         struct softnet_data *sd, *oldsd;
5980
5981         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5982                 return NOTIFY_OK;
5983
5984         local_irq_disable();
5985         cpu = smp_processor_id();
5986         sd = &per_cpu(softnet_data, cpu);
5987         oldsd = &per_cpu(softnet_data, oldcpu);
5988
5989         /* Find end of our completion_queue. */
5990         list_skb = &sd->completion_queue;
5991         while (*list_skb)
5992                 list_skb = &(*list_skb)->next;
5993         /* Append completion queue from offline CPU. */
5994         *list_skb = oldsd->completion_queue;
5995         oldsd->completion_queue = NULL;
5996
5997         /* Append output queue from offline CPU. */
5998         if (oldsd->output_queue) {
5999                 *sd->output_queue_tailp = oldsd->output_queue;
6000                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6001                 oldsd->output_queue = NULL;
6002                 oldsd->output_queue_tailp = &oldsd->output_queue;
6003         }
6004
6005         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6006         local_irq_enable();
6007
6008         /* Process offline CPU's input_pkt_queue */
6009         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6010                 netif_rx(skb);
6011                 input_queue_head_incr(oldsd);
6012         }
6013         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6014                 netif_rx(skb);
6015                 input_queue_head_incr(oldsd);
6016         }
6017
6018         return NOTIFY_OK;
6019 }
6020
6021
6022 /**
6023  *      netdev_increment_features - increment feature set by one
6024  *      @all: current feature set
6025  *      @one: new feature set
6026  *      @mask: mask feature set
6027  *
6028  *      Computes a new feature set after adding a device with feature set
6029  *      @one to the master device with current feature set @all.  Will not
6030  *      enable anything that is off in @mask. Returns the new feature set.
6031  */
6032 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
6033                                         unsigned long mask)
6034 {
6035         /* If device needs checksumming, downgrade to it. */
6036         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6037                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6038         else if (mask & NETIF_F_ALL_CSUM) {
6039                 /* If one device supports v4/v6 checksumming, set for all. */
6040                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6041                     !(all & NETIF_F_GEN_CSUM)) {
6042                         all &= ~NETIF_F_ALL_CSUM;
6043                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6044                 }
6045
6046                 /* If one device supports hw checksumming, set for all. */
6047                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6048                         all &= ~NETIF_F_ALL_CSUM;
6049                         all |= NETIF_F_HW_CSUM;
6050                 }
6051         }
6052
6053         one |= NETIF_F_ALL_CSUM;
6054
6055         one |= all & NETIF_F_ONE_FOR_ALL;
6056         all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6057         all |= one & mask & NETIF_F_ONE_FOR_ALL;
6058
6059         return all;
6060 }
6061 EXPORT_SYMBOL(netdev_increment_features);
6062
6063 static struct hlist_head *netdev_create_hash(void)
6064 {
6065         int i;
6066         struct hlist_head *hash;
6067
6068         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6069         if (hash != NULL)
6070                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6071                         INIT_HLIST_HEAD(&hash[i]);
6072
6073         return hash;
6074 }
6075
6076 /* Initialize per network namespace state */
6077 static int __net_init netdev_init(struct net *net)
6078 {
6079         INIT_LIST_HEAD(&net->dev_base_head);
6080
6081         net->dev_name_head = netdev_create_hash();
6082         if (net->dev_name_head == NULL)
6083                 goto err_name;
6084
6085         net->dev_index_head = netdev_create_hash();
6086         if (net->dev_index_head == NULL)
6087                 goto err_idx;
6088
6089         return 0;
6090
6091 err_idx:
6092         kfree(net->dev_name_head);
6093 err_name:
6094         return -ENOMEM;
6095 }
6096
6097 /**
6098  *      netdev_drivername - network driver for the device
6099  *      @dev: network device
6100  *      @buffer: buffer for resulting name
6101  *      @len: size of buffer
6102  *
6103  *      Determine network driver for device.
6104  */
6105 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6106 {
6107         const struct device_driver *driver;
6108         const struct device *parent;
6109
6110         if (len <= 0 || !buffer)
6111                 return buffer;
6112         buffer[0] = 0;
6113
6114         parent = dev->dev.parent;
6115
6116         if (!parent)
6117                 return buffer;
6118
6119         driver = parent->driver;
6120         if (driver && driver->name)
6121                 strlcpy(buffer, driver->name, len);
6122         return buffer;
6123 }
6124
6125 static int __netdev_printk(const char *level, const struct net_device *dev,
6126                            struct va_format *vaf)
6127 {
6128         int r;
6129
6130         if (dev && dev->dev.parent)
6131                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6132                                netdev_name(dev), vaf);
6133         else if (dev)
6134                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6135         else
6136                 r = printk("%s(NULL net_device): %pV", level, vaf);
6137
6138         return r;
6139 }
6140
6141 int netdev_printk(const char *level, const struct net_device *dev,
6142                   const char *format, ...)
6143 {
6144         struct va_format vaf;
6145         va_list args;
6146         int r;
6147
6148         va_start(args, format);
6149
6150         vaf.fmt = format;
6151         vaf.va = &args;
6152
6153         r = __netdev_printk(level, dev, &vaf);
6154         va_end(args);
6155
6156         return r;
6157 }
6158 EXPORT_SYMBOL(netdev_printk);
6159
6160 #define define_netdev_printk_level(func, level)                 \
6161 int func(const struct net_device *dev, const char *fmt, ...)    \
6162 {                                                               \
6163         int r;                                                  \
6164         struct va_format vaf;                                   \
6165         va_list args;                                           \
6166                                                                 \
6167         va_start(args, fmt);                                    \
6168                                                                 \
6169         vaf.fmt = fmt;                                          \
6170         vaf.va = &args;                                         \
6171                                                                 \
6172         r = __netdev_printk(level, dev, &vaf);                  \
6173         va_end(args);                                           \
6174                                                                 \
6175         return r;                                               \
6176 }                                                               \
6177 EXPORT_SYMBOL(func);
6178
6179 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6180 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6181 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6182 define_netdev_printk_level(netdev_err, KERN_ERR);
6183 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6184 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6185 define_netdev_printk_level(netdev_info, KERN_INFO);
6186
6187 static void __net_exit netdev_exit(struct net *net)
6188 {
6189         kfree(net->dev_name_head);
6190         kfree(net->dev_index_head);
6191 }
6192
6193 static struct pernet_operations __net_initdata netdev_net_ops = {
6194         .init = netdev_init,
6195         .exit = netdev_exit,
6196 };
6197
6198 static void __net_exit default_device_exit(struct net *net)
6199 {
6200         struct net_device *dev, *aux;
6201         /*
6202          * Push all migratable network devices back to the
6203          * initial network namespace
6204          */
6205         rtnl_lock();
6206         for_each_netdev_safe(net, dev, aux) {
6207                 int err;
6208                 char fb_name[IFNAMSIZ];
6209
6210                 /* Ignore unmoveable devices (i.e. loopback) */
6211                 if (dev->features & NETIF_F_NETNS_LOCAL)
6212                         continue;
6213
6214                 /* Leave virtual devices for the generic cleanup */
6215                 if (dev->rtnl_link_ops)
6216                         continue;
6217
6218                 /* Push remaing network devices to init_net */
6219                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6220                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6221                 if (err) {
6222                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6223                                 __func__, dev->name, err);
6224                         BUG();
6225                 }
6226         }
6227         rtnl_unlock();
6228 }
6229
6230 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6231 {
6232         /* At exit all network devices most be removed from a network
6233          * namespace.  Do this in the reverse order of registeration.
6234          * Do this across as many network namespaces as possible to
6235          * improve batching efficiency.
6236          */
6237         struct net_device *dev;
6238         struct net *net;
6239         LIST_HEAD(dev_kill_list);
6240
6241         rtnl_lock();
6242         list_for_each_entry(net, net_list, exit_list) {
6243                 for_each_netdev_reverse(net, dev) {
6244                         if (dev->rtnl_link_ops)
6245                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6246                         else
6247                                 unregister_netdevice_queue(dev, &dev_kill_list);
6248                 }
6249         }
6250         unregister_netdevice_many(&dev_kill_list);
6251         rtnl_unlock();
6252 }
6253
6254 static struct pernet_operations __net_initdata default_device_ops = {
6255         .exit = default_device_exit,
6256         .exit_batch = default_device_exit_batch,
6257 };
6258
6259 /*
6260  *      Initialize the DEV module. At boot time this walks the device list and
6261  *      unhooks any devices that fail to initialise (normally hardware not
6262  *      present) and leaves us with a valid list of present and active devices.
6263  *
6264  */
6265
6266 /*
6267  *       This is called single threaded during boot, so no need
6268  *       to take the rtnl semaphore.
6269  */
6270 static int __init net_dev_init(void)
6271 {
6272         int i, rc = -ENOMEM;
6273
6274         BUG_ON(!dev_boot_phase);
6275
6276         if (dev_proc_init())
6277                 goto out;
6278
6279         if (netdev_kobject_init())
6280                 goto out;
6281
6282         INIT_LIST_HEAD(&ptype_all);
6283         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6284                 INIT_LIST_HEAD(&ptype_base[i]);
6285
6286         if (register_pernet_subsys(&netdev_net_ops))
6287                 goto out;
6288
6289         /*
6290          *      Initialise the packet receive queues.
6291          */
6292
6293         for_each_possible_cpu(i) {
6294                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6295
6296                 memset(sd, 0, sizeof(*sd));
6297                 skb_queue_head_init(&sd->input_pkt_queue);
6298                 skb_queue_head_init(&sd->process_queue);
6299                 sd->completion_queue = NULL;
6300                 INIT_LIST_HEAD(&sd->poll_list);
6301                 sd->output_queue = NULL;
6302                 sd->output_queue_tailp = &sd->output_queue;
6303 #ifdef CONFIG_RPS
6304                 sd->csd.func = rps_trigger_softirq;
6305                 sd->csd.info = sd;
6306                 sd->csd.flags = 0;
6307                 sd->cpu = i;
6308 #endif
6309
6310                 sd->backlog.poll = process_backlog;
6311                 sd->backlog.weight = weight_p;
6312                 sd->backlog.gro_list = NULL;
6313                 sd->backlog.gro_count = 0;
6314         }
6315
6316         dev_boot_phase = 0;
6317
6318         /* The loopback device is special if any other network devices
6319          * is present in a network namespace the loopback device must
6320          * be present. Since we now dynamically allocate and free the
6321          * loopback device ensure this invariant is maintained by
6322          * keeping the loopback device as the first device on the
6323          * list of network devices.  Ensuring the loopback devices
6324          * is the first device that appears and the last network device
6325          * that disappears.
6326          */
6327         if (register_pernet_device(&loopback_net_ops))
6328                 goto out;
6329
6330         if (register_pernet_device(&default_device_ops))
6331                 goto out;
6332
6333         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6334         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6335
6336         hotcpu_notifier(dev_cpu_callback, 0);
6337         dst_init();
6338         dev_mcast_init();
6339         rc = 0;
6340 out:
6341         return rc;
6342 }
6343
6344 subsys_initcall(net_dev_init);
6345
6346 static int __init initialize_hashrnd(void)
6347 {
6348         get_random_bytes(&hashrnd, sizeof(hashrnd));
6349         return 0;
6350 }
6351
6352 late_initcall_sync(initialize_hashrnd);
6353