net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131 #include <trace/events/net.h>
 132 #include <trace/events/skb.h>
 133 #include <linux/pci.h>
 134 #include <linux/inetdevice.h>
 135 #include <linux/cpu_rmap.h>
 136 #include <linux/if_tunnel.h>
 137 #include <linux/if_pppox.h>
 138 #include <linux/ppp_defs.h>
 139 #include <linux/net_tstamp.h>
 140 #include <linux/jump_label.h>
 141
 142 #include "net-sysfs.h"
 143
 144 /* Instead of increasing this, you should create a hash table. */
 145 #define MAX_GRO_SKBS 8
 146
 147 /* This should be increased if a protocol with a bigger head is added. */
 148 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 149
 150 /*
 151  *      The list of packet types we will receive (as opposed to discard)
 152  *      and the routines to invoke.
 153  *
 154  *      Why 16. Because with 16 the only overlap we get on a hash of the
 155  *      low nibble of the protocol value is RARP/SNAP/X.25.
 156  *
 157  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 158  *             sure which should go first, but I bet it won't make much
 159  *             difference if we are running VLANs.  The good news is that
 160  *             this protocol won't be in the list unless compiled in, so
 161  *             the average user (w/out VLANs) will not be adversely affected.
 162  *             --BLG
 163  *
 164  *              0800    IP
 165  *              8100    802.1Q VLAN
 166  *              0001    802.3
 167  *              0002    AX.25
 168  *              0004    802.2
 169  *              8035    RARP
 170  *              0005    SNAP
 171  *              0805    X.25
 172  *              0806    ARP
 173  *              8137    IPX
 174  *              0009    Localtalk
 175  *              86DD    IPv6
 176  */
 177
 178 #define PTYPE_HASH_SIZE (16)
 179 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 180
 181 static DEFINE_SPINLOCK(ptype_lock);
 182 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 183 static struct list_head ptype_all __read_mostly;        /* Taps */
 184
 185 /*
 186  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 187  * semaphore.
 188  *
 189  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 190  *
 191  * Writers must hold the rtnl semaphore while they loop through the
 192  * dev_base_head list, and hold dev_base_lock for writing when they do the
 193  * actual updates.  This allows pure readers to access the list even
 194  * while a writer is preparing to update it.
 195  *
 196  * To put it another way, dev_base_lock is held for writing only to
 197  * protect against pure readers; the rtnl semaphore provides the
 198  * protection against other writers.
 199  *
 200  * See, for example usages, register_netdevice() and
 201  * unregister_netdevice(), which must be called with the rtnl
 202  * semaphore held.
 203  */
 204 DEFINE_RWLOCK(dev_base_lock);
 205 EXPORT_SYMBOL(dev_base_lock);
 206
 207 static inline void dev_base_seq_inc(struct net *net)
 208 {
 209         while (++net->dev_base_seq == 0);
 210 }
 211
 212 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 213 {
 214         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 215         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 216 }
 217
 218 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 219 {
 220         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 221 }
 222
 223 static inline void rps_lock(struct softnet_data *sd)
 224 {
 225 #ifdef CONFIG_RPS
 226         spin_lock(&sd->input_pkt_queue.lock);
 227 #endif
 228 }
 229
 230 static inline void rps_unlock(struct softnet_data *sd)
 231 {
 232 #ifdef CONFIG_RPS
 233         spin_unlock(&sd->input_pkt_queue.lock);
 234 #endif
 235 }
 236
 237 /* Device list insertion */
 238 static int list_netdevice(struct net_device *dev)
 239 {
 240         struct net *net = dev_net(dev);
 241
 242         ASSERT_RTNL();
 243
 244         write_lock_bh(&dev_base_lock);
 245         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 246         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 247         hlist_add_head_rcu(&dev->index_hlist,
 248                            dev_index_hash(net, dev->ifindex));
 249         write_unlock_bh(&dev_base_lock);
 250
 251         dev_base_seq_inc(net);
 252
 253         return 0;
 254 }
 255
 256 /* Device list removal
 257  * caller must respect a RCU grace period before freeing/reusing dev
 258  */
 259 static void unlist_netdevice(struct net_device *dev)
 260 {
 261         ASSERT_RTNL();
 262
 263         /* Unlink dev from the device chain */
 264         write_lock_bh(&dev_base_lock);
 265         list_del_rcu(&dev->dev_list);
 266         hlist_del_rcu(&dev->name_hlist);
 267         hlist_del_rcu(&dev->index_hlist);
 268         write_unlock_bh(&dev_base_lock);
 269
 270         dev_base_seq_inc(dev_net(dev));
 271 }
 272
 273 /*
 274  *      Our notifier list
 275  */
 276
 277 static RAW_NOTIFIER_HEAD(netdev_chain);
 278
 279 /*
 280  *      Device drivers call our routines to queue packets here. We empty the
 281  *      queue in the local softnet handler.
 282  */
 283
 284 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 285 EXPORT_PER_CPU_SYMBOL(softnet_data);
 286
 287 #ifdef CONFIG_LOCKDEP
 288 /*
 289  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 290  * according to dev->type
 291  */
 292 static const unsigned short netdev_lock_type[] =
 293         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 294          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 295          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 296          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 297          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 298          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 299          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 300          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 301          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 302          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 303          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 304          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 305          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 306          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 307          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 308          ARPHRD_VOID, ARPHRD_NONE};
 309
 310 static const char *const netdev_lock_name[] =
 311         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 312          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 313          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 314          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 315          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 316          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 317          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 318          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 319          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 320          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 321          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 322          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 323          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 324          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 325          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 326          "_xmit_VOID", "_xmit_NONE"};
 327
 328 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 329 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 330
 331 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 332 {
 333         int i;
 334
 335         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 336                 if (netdev_lock_type[i] == dev_type)
 337                         return i;
 338         /* the last key is used by default */
 339         return ARRAY_SIZE(netdev_lock_type) - 1;
 340 }
 341
 342 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 343                                                  unsigned short dev_type)
 344 {
 345         int i;
 346
 347         i = netdev_lock_pos(dev_type);
 348         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 349                                    netdev_lock_name[i]);
 350 }
 351
 352 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 353 {
 354         int i;
 355
 356         i = netdev_lock_pos(dev->type);
 357         lockdep_set_class_and_name(&dev->addr_list_lock,
 358                                    &netdev_addr_lock_key[i],
 359                                    netdev_lock_name[i]);
 360 }
 361 #else
 362 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 363                                                  unsigned short dev_type)
 364 {
 365 }
 366 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 367 {
 368 }
 369 #endif
 370
 371 /*******************************************************************************
 372
 373                 Protocol management and registration routines
 374
 375 *******************************************************************************/
 376
/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in their hash buckets, and the check of protocol
 *	handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *	This is true now; do not change it.
 *	Explanation: if a protocol handler that mangles packets is
 *	first on the list, it is not able to sense that the packet
 *	is cloned and should be copied-on-write, so it will
 *	change it and subsequent readers will get a broken packet.
 *							--ANK (980803)
 */
 392
 393 static inline struct list_head *ptype_head(const struct packet_type *pt)
 394 {
 395         if (pt->type == htons(ETH_P_ALL))
 396                 return &ptype_all;
 397         else
 398                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 399 }
 400
 401 /**
 402  *      dev_add_pack - add packet handler
 403  *      @pt: packet type declaration
 404  *
 405  *      Add a protocol handler to the networking stack. The passed &packet_type
 406  *      is linked into kernel lists and may not be freed until it has been
 407  *      removed from the kernel lists.
 408  *
 409  *      This call does not sleep therefore it can not
 410  *      guarantee all CPU's that are in middle of receiving packets
 411  *      will see the new packet type (until the next received packet).
 412  */
 413
 414 void dev_add_pack(struct packet_type *pt)
 415 {
 416         struct list_head *head = ptype_head(pt);
 417
 418         spin_lock(&ptype_lock);
 419         list_add_rcu(&pt->list, head);
 420         spin_unlock(&ptype_lock);
 421 }
 422 EXPORT_SYMBOL(dev_add_pack);
 423
 424 /**
 425  *      __dev_remove_pack        - remove packet handler
 426  *      @pt: packet type declaration
 427  *
 428  *      Remove a protocol handler that was previously added to the kernel
 429  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 430  *      from the kernel lists and can be freed or reused once this function
 431  *      returns.
 432  *
 433  *      The packet type might still be in use by receivers
 434  *      and must not be freed until after all the CPU's have gone
 435  *      through a quiescent state.
 436  */
 437 void __dev_remove_pack(struct packet_type *pt)
 438 {
 439         struct list_head *head = ptype_head(pt);
 440         struct packet_type *pt1;
 441
 442         spin_lock(&ptype_lock);
 443
 444         list_for_each_entry(pt1, head, list) {
 445                 if (pt == pt1) {
 446                         list_del_rcu(&pt->list);
 447                         goto out;
 448                 }
 449         }
 450
 451         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 452 out:
 453         spin_unlock(&ptype_lock);
 454 }
 455 EXPORT_SYMBOL(__dev_remove_pack);
 456
 457 /**
 458  *      dev_remove_pack  - remove packet handler
 459  *      @pt: packet type declaration
 460  *
 461  *      Remove a protocol handler that was previously added to the kernel
 462  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 463  *      from the kernel lists and can be freed or reused once this function
 464  *      returns.
 465  *
 466  *      This call sleeps to guarantee that no CPU is looking at the packet
 467  *      type after return.
 468  */
 469 void dev_remove_pack(struct packet_type *pt)
 470 {
 471         __dev_remove_pack(pt);
 472
 473         synchronize_net();
 474 }
 475 EXPORT_SYMBOL(dev_remove_pack);
 476
 477 /******************************************************************************
 478
 479                       Device Boot-time Settings Routines
 480
 481 *******************************************************************************/
 482
 483 /* Boot time configuration table */
 484 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 485
 486 /**
 487  *      netdev_boot_setup_add   - add new setup entry
 488  *      @name: name of the device
 489  *      @map: configured settings for the device
 490  *
 491  *      Adds new setup entry to the dev_boot_setup list.  The function
 492  *      returns 0 on error and 1 on success.  This is a generic routine to
 493  *      all netdevices.
 494  */
 495 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 496 {
 497         struct netdev_boot_setup *s;
 498         int i;
 499
 500         s = dev_boot_setup;
 501         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 502                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 503                         memset(s[i].name, 0, sizeof(s[i].name));
 504                         strlcpy(s[i].name, name, IFNAMSIZ);
 505                         memcpy(&s[i].map, map, sizeof(s[i].map));
 506                         break;
 507                 }
 508         }
 509
 510         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 511 }
 512
 513 /**
 514  *      netdev_boot_setup_check - check boot time settings
 515  *      @dev: the netdevice
 516  *
 517  *      Check boot time settings for the device.
 518  *      The found settings are set for the device to be used
 519  *      later in the device probing.
 520  *      Returns 0 if no settings found, 1 if they are.
 521  */
 522 int netdev_boot_setup_check(struct net_device *dev)
 523 {
 524         struct netdev_boot_setup *s = dev_boot_setup;
 525         int i;
 526
 527         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 528                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 529                     !strcmp(dev->name, s[i].name)) {
 530                         dev->irq        = s[i].map.irq;
 531                         dev->base_addr  = s[i].map.base_addr;
 532                         dev->mem_start  = s[i].map.mem_start;
 533                         dev->mem_end    = s[i].map.mem_end;
 534                         return 1;
 535                 }
 536         }
 537         return 0;
 538 }
 539 EXPORT_SYMBOL(netdev_boot_setup_check);
 540
 541
 542 /**
 543  *      netdev_boot_base        - get address from boot time settings
 544  *      @prefix: prefix for network device
 545  *      @unit: id for network device
 546  *
 547  *      Check boot time settings for the base address of device.
 548  *      The found settings are set for the device to be used
 549  *      later in the device probing.
 550  *      Returns 0 if no settings found.
 551  */
 552 unsigned long netdev_boot_base(const char *prefix, int unit)
 553 {
 554         const struct netdev_boot_setup *s = dev_boot_setup;
 555         char name[IFNAMSIZ];
 556         int i;
 557
 558         sprintf(name, "%s%d", prefix, unit);
 559
 560         /*
 561          * If device already registered then return base of 1
 562          * to indicate not to probe for this interface
 563          */
 564         if (__dev_get_by_name(&init_net, name))
 565                 return 1;
 566
 567         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 568                 if (!strcmp(name, s[i].name))
 569                         return s[i].map.base_addr;
 570         return 0;
 571 }
 572
 573 /*
 574  * Saves at boot time configured settings for any netdevice.
 575  */
 576 int __init netdev_boot_setup(char *str)
 577 {
 578         int ints[5];
 579         struct ifmap map;
 580
 581         str = get_options(str, ARRAY_SIZE(ints), ints);
 582         if (!str || !*str)
 583                 return 0;
 584
 585         /* Save settings */
 586         memset(&map, 0, sizeof(map));
 587         if (ints[0] > 0)
 588                 map.irq = ints[1];
 589         if (ints[0] > 1)
 590                 map.base_addr = ints[2];
 591         if (ints[0] > 2)
 592                 map.mem_start = ints[3];
 593         if (ints[0] > 3)
 594                 map.mem_end = ints[4];
 595
 596         /* Add new entry to the list */
 597         return netdev_boot_setup_add(str, &map);
 598 }
 599
 600 __setup("netdev=", netdev_boot_setup);
 601
 602 /*******************************************************************************
 603
 604                             Device Interface Subroutines
 605
 606 *******************************************************************************/
 607
 608 /**
 609  *      __dev_get_by_name       - find a device by its name
 610  *      @net: the applicable net namespace
 611  *      @name: name to find
 612  *
 613  *      Find an interface by name. Must be called under RTNL semaphore
 614  *      or @dev_base_lock. If the name is found a pointer to the device
 615  *      is returned. If the name is not found then %NULL is returned. The
 616  *      reference counters are not incremented so the caller must be
 617  *      careful with locks.
 618  */
 619
 620 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 621 {
 622         struct hlist_node *p;
 623         struct net_device *dev;
 624         struct hlist_head *head = dev_name_hash(net, name);
 625
 626         hlist_for_each_entry(dev, p, head, name_hlist)
 627                 if (!strncmp(dev->name, name, IFNAMSIZ))
 628                         return dev;
 629
 630         return NULL;
 631 }
 632 EXPORT_SYMBOL(__dev_get_by_name);
 633
 634 /**
 635  *      dev_get_by_name_rcu     - find a device by its name
 636  *      @net: the applicable net namespace
 637  *      @name: name to find
 638  *
 639  *      Find an interface by name.
 640  *      If the name is found a pointer to the device is returned.
 641  *      If the name is not found then %NULL is returned.
 642  *      The reference counters are not incremented so the caller must be
 643  *      careful with locks. The caller must hold RCU lock.
 644  */
 645
 646 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 647 {
 648         struct hlist_node *p;
 649         struct net_device *dev;
 650         struct hlist_head *head = dev_name_hash(net, name);
 651
 652         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 653                 if (!strncmp(dev->name, name, IFNAMSIZ))
 654                         return dev;
 655
 656         return NULL;
 657 }
 658 EXPORT_SYMBOL(dev_get_by_name_rcu);
 659
 660 /**
 661  *      dev_get_by_name         - find a device by its name
 662  *      @net: the applicable net namespace
 663  *      @name: name to find
 664  *
 665  *      Find an interface by name. This can be called from any
 666  *      context and does its own locking. The returned handle has
 667  *      the usage count incremented and the caller must use dev_put() to
 668  *      release it when it is no longer needed. %NULL is returned if no
 669  *      matching device is found.
 670  */
 671
 672 struct net_device *dev_get_by_name(struct net *net, const char *name)
 673 {
 674         struct net_device *dev;
 675
 676         rcu_read_lock();
 677         dev = dev_get_by_name_rcu(net, name);
 678         if (dev)
 679                 dev_hold(dev);
 680         rcu_read_unlock();
 681         return dev;
 682 }
 683 EXPORT_SYMBOL(dev_get_by_name);
 684
 685 /**
 686  *      __dev_get_by_index - find a device by its ifindex
 687  *      @net: the applicable net namespace
 688  *      @ifindex: index of device
 689  *
 690  *      Search for an interface by index. Returns %NULL if the device
 691  *      is not found or a pointer to the device. The device has not
 692  *      had its reference counter increased so the caller must be careful
 693  *      about locking. The caller must hold either the RTNL semaphore
 694  *      or @dev_base_lock.
 695  */
 696
 697 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 698 {
 699         struct hlist_node *p;
 700         struct net_device *dev;
 701         struct hlist_head *head = dev_index_hash(net, ifindex);
 702
 703         hlist_for_each_entry(dev, p, head, index_hlist)
 704                 if (dev->ifindex == ifindex)
 705                         return dev;
 706
 707         return NULL;
 708 }
 709 EXPORT_SYMBOL(__dev_get_by_index);
 710
 711 /**
 712  *      dev_get_by_index_rcu - find a device by its ifindex
 713  *      @net: the applicable net namespace
 714  *      @ifindex: index of device
 715  *
 716  *      Search for an interface by index. Returns %NULL if the device
 717  *      is not found or a pointer to the device. The device has not
 718  *      had its reference counter increased so the caller must be careful
 719  *      about locking. The caller must hold RCU lock.
 720  */
 721
 722 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 723 {
 724         struct hlist_node *p;
 725         struct net_device *dev;
 726         struct hlist_head *head = dev_index_hash(net, ifindex);
 727
 728         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 729                 if (dev->ifindex == ifindex)
 730                         return dev;
 731
 732         return NULL;
 733 }
 734 EXPORT_SYMBOL(dev_get_by_index_rcu);
 735
 736
 737 /**
 738  *      dev_get_by_index - find a device by its ifindex
 739  *      @net: the applicable net namespace
 740  *      @ifindex: index of device
 741  *
 742  *      Search for an interface by index. Returns NULL if the device
 743  *      is not found or a pointer to the device. The device returned has
 744  *      had a reference added and the pointer is safe until the user calls
 745  *      dev_put to indicate they have finished with it.
 746  */
 747
 748 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 749 {
 750         struct net_device *dev;
 751
 752         rcu_read_lock();
 753         dev = dev_get_by_index_rcu(net, ifindex);
 754         if (dev)
 755                 dev_hold(dev);
 756         rcu_read_unlock();
 757         return dev;
 758 }
 759 EXPORT_SYMBOL(dev_get_by_index);
 760
 761 /**
 762  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 763  *      @net: the applicable net namespace
 764  *      @type: media type of device
 765  *      @ha: hardware address
 766  *
 767  *      Search for an interface by MAC address. Returns NULL if the device
 768  *      is not found or a pointer to the device.
 769  *      The caller must hold RCU or RTNL.
 770  *      The returned device has not had its ref count increased
 771  *      and the caller must therefore be careful about locking
 772  *
 773  */
 774
 775 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 776                                        const char *ha)
 777 {
 778         struct net_device *dev;
 779
 780         for_each_netdev_rcu(net, dev)
 781                 if (dev->type == type &&
 782                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 783                         return dev;
 784
 785         return NULL;
 786 }
 787 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 788
 789 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 790 {
 791         struct net_device *dev;
 792
 793         ASSERT_RTNL();
 794         for_each_netdev(net, dev)
 795                 if (dev->type == type)
 796                         return dev;
 797
 798         return NULL;
 799 }
 800 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 801
 802 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 803 {
 804         struct net_device *dev, *ret = NULL;
 805
 806         rcu_read_lock();
 807         for_each_netdev_rcu(net, dev)
 808                 if (dev->type == type) {
 809                         dev_hold(dev);
 810                         ret = dev;
 811                         break;
 812                 }
 813         rcu_read_unlock();
 814         return ret;
 815 }
 816 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 817
 818 /**
 819  *      dev_get_by_flags_rcu - find any device with given flags
 820  *      @net: the applicable net namespace
 821  *      @if_flags: IFF_* values
 822  *      @mask: bitmask of bits in if_flags to check
 823  *
 824  *      Search for any interface with the given flags. Returns NULL if a device
 825  *      is not found or a pointer to the device. Must be called inside
 826  *      rcu_read_lock(), and result refcount is unchanged.
 827  */
 828
 829 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 830                                     unsigned short mask)
 831 {
 832         struct net_device *dev, *ret;
 833
 834         ret = NULL;
 835         for_each_netdev_rcu(net, dev) {
 836                 if (((dev->flags ^ if_flags) & mask) == 0) {
 837                         ret = dev;
 838                         break;
 839                 }
 840         }
 841         return ret;
 842 }
 843 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 844
 845 /**
 846  *      dev_valid_name - check if name is okay for network device
 847  *      @name: name string
 848  *
 849  *      Network device names need to be valid file names to
 850  *      to allow sysfs to work.  We also disallow any kind of
 851  *      whitespace.
 852  */
 853 int dev_valid_name(const char *name)
 854 {
 855         if (*name == '\0')
 856                 return 0;
 857         if (strlen(name) >= IFNAMSIZ)
 858                 return 0;
 859         if (!strcmp(name, ".") || !strcmp(name, ".."))
 860                 return 0;
 861
 862         while (*name) {
 863                 if (*name == '/' || isspace(*name))
 864                         return 0;
 865                 name++;
 866         }
 867         return 1;
 868 }
 869 EXPORT_SYMBOL(dev_valid_name);
 870
/**
 *      __dev_alloc_name - allocate a name for a device
 *      @net: network namespace to allocate the device name in
 *      @name: name format string
 *      @buf:  scratch buffer and result name string
 *
 *      Passed a format string - eg "lt%d" it will try and find a suitable
 *      id. It scans list of devices to build up a free map, then chooses
 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 *      while allocating the name and adding the device in order to avoid
 *      duplicates.
 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *
 *      When @name contains no '%' it is taken literally and unit 0 is
 *      reported if that name is free.
 *
 *      Returns the number of the unit assigned or a negative errno code:
 *      -EINVAL for a malformed format, -ENOMEM if the scratch page cannot
 *      be allocated, -ENFILE if the resulting name is already in use.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
        int i = 0;
        const char *p;
        const int max_netdevices = 8*PAGE_SIZE;
        unsigned long *inuse;
        struct net_device *d;

        p = strnchr(name, IFNAMSIZ-1, '%');
        if (p) {
                /*
                 * Verify the string as this thing may have come from
                 * the user.  There must be exactly one "%d" and no other
                 * "%" characters.
                 */
                if (p[1] != 'd' || strchr(p + 2, '%'))
                        return -EINVAL;

                /* Use one page as a bit array of possible slots */
                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
                if (!inuse)
                        return -ENOMEM;

                /* Mark the unit number of every device matching the format */
                for_each_netdev(net, d) {
                        if (!sscanf(d->name, name, &i))
                                continue;
                        if (i < 0 || i >= max_netdevices)
                                continue;

                        /*  avoid cases where sscanf is not exact inverse of printf */
                        snprintf(buf, IFNAMSIZ, name, i);
                        if (!strncmp(buf, d->name, IFNAMSIZ))
                                set_bit(i, inuse);
                }

                i = find_first_zero_bit(inuse, max_netdevices);
                free_page((unsigned long) inuse);
        }

        /* Build the candidate name unless the caller formats in place */
        if (buf != name)
                snprintf(buf, IFNAMSIZ, name, i);
        if (!__dev_get_by_name(net, buf))
                return i;

        /* It is possible to run out of possible slots
         * when the name is long and there isn't enough space left
         * for the digits, or if all bits are used.
         */
        return -ENFILE;
}
 936
 937 /**
 938  *      dev_alloc_name - allocate a name for a device
 939  *      @dev: device
 940  *      @name: name format string
 941  *
 942  *      Passed a format string - eg "lt%d" it will try and find a suitable
 943  *      id. It scans list of devices to build up a free map, then chooses
 944  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 945  *      while allocating the name and adding the device in order to avoid
 946  *      duplicates.
 947  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 948  *      Returns the number of the unit assigned or a negative errno code.
 949  */
 950
 951 int dev_alloc_name(struct net_device *dev, const char *name)
 952 {
 953         char buf[IFNAMSIZ];
 954         struct net *net;
 955         int ret;
 956
 957         BUG_ON(!dev_net(dev));
 958         net = dev_net(dev);
 959         ret = __dev_alloc_name(net, name, buf);
 960         if (ret >= 0)
 961                 strlcpy(dev->name, buf, IFNAMSIZ);
 962         return ret;
 963 }
 964 EXPORT_SYMBOL(dev_alloc_name);
 965
 966 static int dev_get_valid_name(struct net_device *dev, const char *name)
 967 {
 968         struct net *net;
 969
 970         BUG_ON(!dev_net(dev));
 971         net = dev_net(dev);
 972
 973         if (!dev_valid_name(name))
 974                 return -EINVAL;
 975
 976         if (strchr(name, '%'))
 977                 return dev_alloc_name(dev, name);
 978         else if (__dev_get_by_name(net, name))
 979                 return -EEXIST;
 980         else if (dev->name != name)
 981                 strlcpy(dev->name, name, IFNAMSIZ);
 982
 983         return 0;
 984 }
 985
/**
 *      dev_change_name - change name of a device
 *      @dev: device
 *      @newname: name (or format string) must be at least IFNAMSIZ
 *
 *      Change name of a device, can pass format strings "eth%d"
 *      for wildcarding.
 *
 *      Returns -EBUSY if the device is up and 0 if @newname already equals
 *      the current name.  If a NETDEV_CHANGENAME notifier rejects the
 *      change, the old name is restored, the rename steps are run again
 *      and the notifier's errno is returned.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;
        struct net *net;

        ASSERT_RTNL();
        BUG_ON(!dev_net(dev));

        net = dev_net(dev);
        if (dev->flags & IFF_UP)
                return -EBUSY;

        if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
                return 0;

        /* Keep the current name so it can be restored on failure */
        memcpy(oldname, dev->name, IFNAMSIZ);

        err = dev_get_valid_name(dev, newname);
        if (err < 0)
                return err;

        /*
         * Reached on the forward pass with the new name in dev->name, and
         * again via goto with the old name restored when a notifier vetoes.
         */
rollback:
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                memcpy(dev->name, oldname, IFNAMSIZ);
                return ret;
        }

        /* Unhash the device from the name table ... */
        write_lock_bh(&dev_base_lock);
        hlist_del_rcu(&dev->name_hlist);
        write_unlock_bh(&dev_base_lock);

        synchronize_rcu();

        /* ... and rehash it under whatever dev->name now holds */
        write_lock_bh(&dev_base_lock);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        write_unlock_bh(&dev_base_lock);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                /* err >= 0 after dev_alloc_name() or stores the first errno */
                if (err >= 0) {
                        err = ret;
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        goto rollback;
                } else {
                        /* The notifier failed during the rollback pass too */
                        printk(KERN_ERR
                               "%s: name change rollback failed: %d.\n",
                               dev->name, ret);
                }
        }

        return err;
}
1052
1053 /**
1054  *      dev_set_alias - change ifalias of a device
1055  *      @dev: device
1056  *      @alias: name up to IFALIASZ
1057  *      @len: limit of bytes to copy from info
1058  *
1059  *      Set ifalias for a device,
1060  */
1061 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1062 {
1063         ASSERT_RTNL();
1064
1065         if (len >= IFALIASZ)
1066                 return -EINVAL;
1067
1068         if (!len) {
1069                 if (dev->ifalias) {
1070                         kfree(dev->ifalias);
1071                         dev->ifalias = NULL;
1072                 }
1073                 return 0;
1074         }
1075
1076         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1077         if (!dev->ifalias)
1078                 return -ENOMEM;
1079
1080         strlcpy(dev->ifalias, alias, len+1);
1081         return len;
1082 }
1083
1084
1085 /**
1086  *      netdev_features_change - device changes features
1087  *      @dev: device to cause notification
1088  *
1089  *      Called to indicate a device has changed features.
1090  */
1091 void netdev_features_change(struct net_device *dev)
1092 {
1093         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1094 }
1095 EXPORT_SYMBOL(netdev_features_change);
1096
1097 /**
1098  *      netdev_state_change - device changes state
1099  *      @dev: device to cause notification
1100  *
1101  *      Called to indicate a device has changed state. This function calls
1102  *      the notifier chains for netdev_chain and sends a NEWLINK message
1103  *      to the routing socket.
1104  */
1105 void netdev_state_change(struct net_device *dev)
1106 {
1107         if (dev->flags & IFF_UP) {
1108                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1109                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1110         }
1111 }
1112 EXPORT_SYMBOL(netdev_state_change);
1113
/*
 * Send @event for @dev down the netdevice notifier chain and hand back
 * whatever call_netdevice_notifiers() returned.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
        int verdict;

        verdict = call_netdevice_notifiers(event, dev);
        return verdict;
}
EXPORT_SYMBOL(netdev_bonding_change);
1119
1120 /**
1121  *      dev_load        - load a network module
1122  *      @net: the applicable net namespace
1123  *      @name: name of interface
1124  *
1125  *      If a network interface is not present and the process has suitable
1126  *      privileges this function loads the module. If module loading is not
1127  *      available in this kernel then it becomes a nop.
1128  */
1129
1130 void dev_load(struct net *net, const char *name)
1131 {
1132         struct net_device *dev;
1133         int no_module;
1134
1135         rcu_read_lock();
1136         dev = dev_get_by_name_rcu(net, name);
1137         rcu_read_unlock();
1138
1139         no_module = !dev;
1140         if (no_module && capable(CAP_NET_ADMIN))
1141                 no_module = request_module("netdev-%s", name);
1142         if (no_module && capable(CAP_SYS_MODULE)) {
1143                 if (!request_module("%s", name))
1144                         pr_err("Loading kernel module for a network device "
1145 "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1146 "instead\n", name);
1147         }
1148 }
1149 EXPORT_SYMBOL(dev_load);
1150
1151 static int __dev_open(struct net_device *dev)
1152 {
1153         const struct net_device_ops *ops = dev->netdev_ops;
1154         int ret;
1155
1156         ASSERT_RTNL();
1157
1158         if (!netif_device_present(dev))
1159                 return -ENODEV;
1160
1161         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1162         ret = notifier_to_errno(ret);
1163         if (ret)
1164                 return ret;
1165
1166         set_bit(__LINK_STATE_START, &dev->state);
1167
1168         if (ops->ndo_validate_addr)
1169                 ret = ops->ndo_validate_addr(dev);
1170
1171         if (!ret && ops->ndo_open)
1172                 ret = ops->ndo_open(dev);
1173
1174         if (ret)
1175                 clear_bit(__LINK_STATE_START, &dev->state);
1176         else {
1177                 dev->flags |= IFF_UP;
1178                 net_dmaengine_get();
1179                 dev_set_rx_mode(dev);
1180                 dev_activate(dev);
1181         }
1182
1183         return ret;
1184 }
1185
1186 /**
1187  *      dev_open        - prepare an interface for use.
1188  *      @dev:   device to open
1189  *
1190  *      Takes a device from down to up state. The device's private open
1191  *      function is invoked and then the multicast lists are loaded. Finally
1192  *      the device is moved into the up state and a %NETDEV_UP message is
1193  *      sent to the netdev notifier chain.
1194  *
1195  *      Calling this function on an active interface is a nop. On a failure
1196  *      a negative errno code is returned.
1197  */
1198 int dev_open(struct net_device *dev)
1199 {
1200         int ret;
1201
1202         if (dev->flags & IFF_UP)
1203                 return 0;
1204
1205         ret = __dev_open(dev);
1206         if (ret < 0)
1207                 return ret;
1208
1209         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1210         call_netdevice_notifiers(NETDEV_UP, dev);
1211
1212         return ret;
1213 }
1214 EXPORT_SYMBOL(dev_open);
1215
/*
 * Take every device on @head (linked through unreg_list) down without
 * sending the final NETDEV_DOWN notification.  Requires RTNL and may
 * sleep.  Always returns 0.
 */
static int __dev_close_many(struct list_head *head)
{
        struct net_device *dev;

        ASSERT_RTNL();
        might_sleep();

        /* First pass: announce the shutdown and clear the START bit */
        list_for_each_entry(dev, head, unreg_list) {
                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

                clear_bit(__LINK_STATE_START, &dev->state);

                /* Synchronize to scheduled poll. We cannot touch poll list, it
                 * can be even on different cpu. So just clear netif_running().
                 *
                 * dev->stop() will invoke napi_disable() on all of its
                 * napi_struct instances on this device.
                 */
                smp_mb__after_clear_bit(); /* Commit netif_running(). */
        }

        /* Deactivate the whole batch in a single call */
        dev_deactivate_many(head);

        /* Second pass: driver stop hook, then drop IFF_UP */
        list_for_each_entry(dev, head, unreg_list) {
                const struct net_device_ops *ops = dev->netdev_ops;

                /*
                 *      Call the device specific close. This cannot fail.
                 *      Only if device is UP
                 *
                 *      We allow it to be called even after a DETACH hot-plug
                 *      event.
                 */
                if (ops->ndo_stop)
                        ops->ndo_stop(dev);

                dev->flags &= ~IFF_UP;
                net_dmaengine_put();
        }

        return 0;
}
1258
1259 static int __dev_close(struct net_device *dev)
1260 {
1261         int retval;
1262         LIST_HEAD(single);
1263
1264         list_add(&dev->unreg_list, &single);
1265         retval = __dev_close_many(&single);
1266         list_del(&single);
1267         return retval;
1268 }
1269
1270 static int dev_close_many(struct list_head *head)
1271 {
1272         struct net_device *dev, *tmp;
1273         LIST_HEAD(tmp_list);
1274
1275         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1276                 if (!(dev->flags & IFF_UP))
1277                         list_move(&dev->unreg_list, &tmp_list);
1278
1279         __dev_close_many(head);
1280
1281         list_for_each_entry(dev, head, unreg_list) {
1282                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1283                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1284         }
1285
1286         /* rollback_registered_many needs the complete original list */
1287         list_splice(&tmp_list, head);
1288         return 0;
1289 }
1290
1291 /**
1292  *      dev_close - shutdown an interface.
1293  *      @dev: device to shutdown
1294  *
1295  *      This function moves an active device into down state. A
1296  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1297  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1298  *      chain.
1299  */
1300 int dev_close(struct net_device *dev)
1301 {
1302         if (dev->flags & IFF_UP) {
1303                 LIST_HEAD(single);
1304
1305                 list_add(&dev->unreg_list, &single);
1306                 dev_close_many(&single);
1307                 list_del(&single);
1308         }
1309         return 0;
1310 }
1311 EXPORT_SYMBOL(dev_close);
1312
1313
1314 /**
1315  *      dev_disable_lro - disable Large Receive Offload on a device
1316  *      @dev: device
1317  *
1318  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1319  *      called under RTNL.  This is needed if received packets may be
1320  *      forwarded to another interface.
1321  */
1322 void dev_disable_lro(struct net_device *dev)
1323 {
1324         /*
1325          * If we're trying to disable lro on a vlan device
1326          * use the underlying physical device instead
1327          */
1328         if (is_vlan_dev(dev))
1329                 dev = vlan_dev_real_dev(dev);
1330
1331         dev->wanted_features &= ~NETIF_F_LRO;
1332         netdev_update_features(dev);
1333
1334         if (unlikely(dev->features & NETIF_F_LRO))
1335                 netdev_WARN(dev, "failed to disable LRO!\n");
1336 }
1337 EXPORT_SYMBOL(dev_disable_lro);
1338
1339
/*
 * Starts out as 1.  While it is nonzero, register_netdevice_notifier()
 * does not replay existing devices to a newly registered notifier.
 * NOTE(review): where it is cleared is not visible in this part of the
 * file - presumably once boot-time initialisation is done; confirm.
 */
static int dev_boot_phase = 1;
1341
1342 /**
1343  *      register_netdevice_notifier - register a network notifier block
1344  *      @nb: notifier
1345  *
1346  *      Register a notifier to be called when network device events occur.
1347  *      The notifier passed is linked into the kernel structures and must
1348  *      not be reused until it has been unregistered. A negative errno code
1349  *      is returned on a failure.
1350  *
1351  *      When registered all registration and up events are replayed
1352  *      to the new notifier to allow device to have a race free
1353  *      view of the network device list.
1354  */
1355
1356 int register_netdevice_notifier(struct notifier_block *nb)
1357 {
1358         struct net_device *dev;
1359         struct net_device *last;
1360         struct net *net;
1361         int err;
1362
1363         rtnl_lock();
1364         err = raw_notifier_chain_register(&netdev_chain, nb);
1365         if (err)
1366                 goto unlock;
1367         if (dev_boot_phase)
1368                 goto unlock;
1369         for_each_net(net) {
1370                 for_each_netdev(net, dev) {
1371                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1372                         err = notifier_to_errno(err);
1373                         if (err)
1374                                 goto rollback;
1375
1376                         if (!(dev->flags & IFF_UP))
1377                                 continue;
1378
1379                         nb->notifier_call(nb, NETDEV_UP, dev);
1380                 }
1381         }
1382
1383 unlock:
1384         rtnl_unlock();
1385         return err;
1386
1387 rollback:
1388         last = dev;
1389         for_each_net(net) {
1390                 for_each_netdev(net, dev) {
1391                         if (dev == last)
1392                                 break;
1393
1394                         if (dev->flags & IFF_UP) {
1395                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1396                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1397                         }
1398                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1399                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1400                 }
1401         }
1402
1403         raw_notifier_chain_unregister(&netdev_chain, nb);
1404         goto unlock;
1405 }
1406 EXPORT_SYMBOL(register_netdevice_notifier);
1407
1408 /**
1409  *      unregister_netdevice_notifier - unregister a network notifier block
1410  *      @nb: notifier
1411  *
1412  *      Unregister a notifier previously registered by
1413  *      register_netdevice_notifier(). The notifier is unlinked into the
1414  *      kernel structures and may then be reused. A negative errno code
1415  *      is returned on a failure.
1416  */
1417
1418 int unregister_netdevice_notifier(struct notifier_block *nb)
1419 {
1420         int err;
1421
1422         rtnl_lock();
1423         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1424         rtnl_unlock();
1425         return err;
1426 }
1427 EXPORT_SYMBOL(unregister_netdevice_notifier);
1428
1429 /**
1430  *      call_netdevice_notifiers - call all network notifier blocks
1431  *      @val: value passed unmodified to notifier function
1432  *      @dev: net_device pointer passed unmodified to notifier function
1433  *
1434  *      Call all network notifier blocks.  Parameters and return value
1435  *      are as for raw_notifier_call_chain().
1436  */
1437
1438 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1439 {
1440         ASSERT_RTNL();
1441         return raw_notifier_call_chain(&netdev_chain, val, dev);
1442 }
1443 EXPORT_SYMBOL(call_netdevice_notifiers);
1444
/*
 * Jump-label key raised by net_enable_timestamp() and lowered by
 * net_disable_timestamp(); tested with static_branch() to decide
 * whether packets get a software timestamp.
 */
static struct jump_label_key netstamp_needed __read_mostly;
1446
1447 void net_enable_timestamp(void)
1448 {
1449         jump_label_inc(&netstamp_needed);
1450 }
1451 EXPORT_SYMBOL(net_enable_timestamp);
1452
1453 void net_disable_timestamp(void)
1454 {
1455         jump_label_dec(&netstamp_needed);
1456 }
1457 EXPORT_SYMBOL(net_disable_timestamp);
1458
1459 static inline void net_timestamp_set(struct sk_buff *skb)
1460 {
1461         skb->tstamp.tv64 = 0;
1462         if (static_branch(&netstamp_needed))
1463                 __net_timestamp(skb);
1464 }
1465
/*
 * Timestamp SKB when timestamping is enabled, COND holds and SKB does
 * not carry a timestamp yet.  Wrapped in do { } while (0) so that the
 * macro expands to exactly one statement and cannot capture an "else"
 * that follows the call site; invoke it with a trailing semicolon.
 */
#define net_timestamp_check(COND, SKB)                          \
        do {                                                    \
                if (static_branch(&netstamp_needed)) {          \
                        if ((COND) && !(SKB)->tstamp.tv64)      \
                                __net_timestamp(SKB);           \
                }                                               \
        } while (0)
1471
1472 static int net_hwtstamp_validate(struct ifreq *ifr)
1473 {
1474         struct hwtstamp_config cfg;
1475         enum hwtstamp_tx_types tx_type;
1476         enum hwtstamp_rx_filters rx_filter;
1477         int tx_type_valid = 0;
1478         int rx_filter_valid = 0;
1479
1480         if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1481                 return -EFAULT;
1482
1483         if (cfg.flags) /* reserved for future extensions */
1484                 return -EINVAL;
1485
1486         tx_type = cfg.tx_type;
1487         rx_filter = cfg.rx_filter;
1488
1489         switch (tx_type) {
1490         case HWTSTAMP_TX_OFF:
1491         case HWTSTAMP_TX_ON:
1492         case HWTSTAMP_TX_ONESTEP_SYNC:
1493                 tx_type_valid = 1;
1494                 break;
1495         }
1496
1497         switch (rx_filter) {
1498         case HWTSTAMP_FILTER_NONE:
1499         case HWTSTAMP_FILTER_ALL:
1500         case HWTSTAMP_FILTER_SOME:
1501         case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1502         case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1503         case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1504         case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1505         case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1506         case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1507         case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1508         case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1509         case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1510         case HWTSTAMP_FILTER_PTP_V2_EVENT:
1511         case HWTSTAMP_FILTER_PTP_V2_SYNC:
1512         case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1513                 rx_filter_valid = 1;
1514                 break;
1515         }
1516
1517         if (!tx_type_valid || !rx_filter_valid)
1518                 return -ERANGE;
1519
1520         return 0;
1521 }
1522
1523 static inline bool is_skb_forwardable(struct net_device *dev,
1524                                       struct sk_buff *skb)
1525 {
1526         unsigned int len;
1527
1528         if (!(dev->flags & IFF_UP))
1529                 return false;
1530
1531         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1532         if (skb->len <= len)
1533                 return true;
1534
1535         /* if TSO is enabled, we don't care about the length as the packet
1536          * could be forwarded without being segmented before
1537          */
1538         if (skb_is_gso(skb))
1539                 return true;
1540
1541         return false;
1542 }
1543
1544 /**
1545  * dev_forward_skb - loopback an skb to another netif
1546  *
1547  * @dev: destination network device
1548  * @skb: buffer to forward
1549  *
1550  * return values:
1551  *      NET_RX_SUCCESS  (no congestion)
1552  *      NET_RX_DROP     (packet was dropped, but freed)
1553  *
1554  * dev_forward_skb can be used for injecting an skb from the
1555  * start_xmit function of one device into the receive queue
1556  * of another device.
1557  *
1558  * The receiving device may be in another namespace, so
1559  * we have to clear all information in the skb that could
1560  * impact namespace isolation.
1561  */
1562 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1563 {
1564         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1565                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1566                         atomic_long_inc(&dev->rx_dropped);
1567                         kfree_skb(skb);
1568                         return NET_RX_DROP;
1569                 }
1570         }
1571
1572         skb_orphan(skb);
1573         nf_reset(skb);
1574
1575         if (unlikely(!is_skb_forwardable(dev, skb))) {
1576                 atomic_long_inc(&dev->rx_dropped);
1577                 kfree_skb(skb);
1578                 return NET_RX_DROP;
1579         }
1580         skb_set_dev(skb, dev);
1581         skb->tstamp.tv64 = 0;
1582         skb->pkt_type = PACKET_HOST;
1583         skb->protocol = eth_type_trans(skb, dev);
1584         return netif_rx(skb);
1585 }
1586 EXPORT_SYMBOL_GPL(dev_forward_skb);
1587
1588 static inline int deliver_skb(struct sk_buff *skb,
1589                               struct packet_type *pt_prev,
1590                               struct net_device *orig_dev)
1591 {
1592         atomic_inc(&skb->users);
1593         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1594 }
1595
/*
 *      Support routine. Sends outgoing frames to any network
 *      taps currently in use.
 *
 *      The frame is cloned once, on the first matching tap, and that
 *      clone is shared by all taps.  Delivery to each tap is deferred
 *      until the next match is found, so every tap but the last goes
 *      through deliver_skb() (which takes an extra reference) while
 *      the last one is called directly.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
        struct packet_type *ptype;
        struct sk_buff *skb2 = NULL;
        struct packet_type *pt_prev = NULL;

        rcu_read_lock();
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                /* Never send packets back to the socket
                 * they originated from - MvS (miquels@drinkel.ow.org)
                 */
                if ((ptype->dev == dev || !ptype->dev) &&
                    (ptype->af_packet_priv == NULL ||
                     (struct sock *)ptype->af_packet_priv != skb->sk)) {
                        /* Clone already prepared: deliver to the previous tap */
                        if (pt_prev) {
                                deliver_skb(skb2, pt_prev, skb->dev);
                                pt_prev = ptype;
                                continue;
                        }

                        /* First match: make the clone the taps will share */
                        skb2 = skb_clone(skb, GFP_ATOMIC);
                        if (!skb2)
                                break;

                        net_timestamp_set(skb2);

                        /* skb->nh should be correctly
                           set by sender, so that the second statement is
                           just protection against buggy protocols.
                         */
                        skb_reset_mac_header(skb2);

                        if (skb_network_header(skb2) < skb2->data ||
                            skb2->network_header > skb2->tail) {
                                if (net_ratelimit())
                                        printk(KERN_CRIT "protocol %04x is "
                                               "buggy, dev %s\n",
                                               ntohs(skb2->protocol),
                                               dev->name);
                                skb_reset_network_header(skb2);
                        }

                        skb2->transport_header = skb2->network_header;
                        skb2->pkt_type = PACKET_OUTGOING;
                        pt_prev = ptype;
                }
        }
        /* The last tap is called without taking an extra reference */
        if (pt_prev)
                pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
        rcu_read_unlock();
}
1652
1653 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1654  * @dev: Network device
1655  * @txq: number of queues available
1656  *
1657  * If real_num_tx_queues is changed the tc mappings may no longer be
1658  * valid. To resolve this verify the tc mapping remains valid and if
1659  * not NULL the mapping. With no priorities mapping to this
1660  * offset/count pair it will no longer be used. In the worst case TC0
1661  * is invalid nothing can be done so disable priority mappings. If is
1662  * expected that drivers will fix this mapping if they can before
1663  * calling netif_set_real_num_tx_queues.
1664  */
1665 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1666 {
1667         int i;
1668         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1669
1670         /* If TC0 is invalidated disable TC mapping */
1671         if (tc->offset + tc->count > txq) {
1672                 pr_warning("Number of in use tx queues changed "
1673                            "invalidating tc mappings. Priority "
1674                            "traffic classification disabled!\n");
1675                 dev->num_tc = 0;
1676                 return;
1677         }
1678
1679         /* Invalidated prio to tc mappings set to TC0 */
1680         for (i = 1; i < TC_BITMASK + 1; i++) {
1681                 int q = netdev_get_prio_tc_map(dev, i);
1682
1683                 tc = &dev->tc_to_txq[q];
1684                 if (tc->offset + tc->count > txq) {
1685                         pr_warning("Number of in use tx queues "
1686                                    "changed. Priority %i to tc "
1687                                    "mapping %i is no longer valid "
1688                                    "setting map to 0\n",
1689                                    i, q);
1690                         netdev_set_prio_tc_map(dev, i, 0);
1691                 }
1692         }
1693 }
1694
1695 /*
1696  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1697  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1698  */
1699 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1700 {
1701         int rc;
1702
1703         if (txq < 1 || txq > dev->num_tx_queues)
1704                 return -EINVAL;
1705
1706         if (dev->reg_state == NETREG_REGISTERED ||
1707             dev->reg_state == NETREG_UNREGISTERING) {
1708                 ASSERT_RTNL();
1709
1710                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1711                                                   txq);
1712                 if (rc)
1713                         return rc;
1714
1715                 if (dev->num_tc)
1716                         netif_setup_tc(dev, txq);
1717
1718                 if (txq < dev->real_num_tx_queues)
1719                         qdisc_reset_all_tx_gt(dev, txq);
1720         }
1721
1722         dev->real_num_tx_queues = txq;
1723         return 0;
1724 }
1725 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1726
#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	Call this either before the net device is registered or with the
 *	rtnl_lock held.  @rxq must lie in 1..@dev->num_rx_queues, otherwise
 *	-EINVAL is returned.  For a registered device the RX queue kobjects
 *	are updated first and a failure there is passed back to the caller;
 *	before registration a valid @rxq always succeeds.  Returns 0 on
 *	success.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		int err;

		ASSERT_RTNL();

		err = net_rx_queue_update_kobjects(dev,
						   dev->real_num_rx_queues,
						   rxq);
		if (err)
			return err;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1759
1760 static inline void __netif_reschedule(struct Qdisc *q)
1761 {
1762         struct softnet_data *sd;
1763         unsigned long flags;
1764
1765         local_irq_save(flags);
1766         sd = &__get_cpu_var(softnet_data);
1767         q->next_sched = NULL;
1768         *sd->output_queue_tailp = q;
1769         sd->output_queue_tailp = &q->next_sched;
1770         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1771         local_irq_restore(flags);
1772 }
1773
1774 void __netif_schedule(struct Qdisc *q)
1775 {
1776         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1777                 __netif_reschedule(q);
1778 }
1779 EXPORT_SYMBOL(__netif_schedule);
1780
1781 void dev_kfree_skb_irq(struct sk_buff *skb)
1782 {
1783         if (atomic_dec_and_test(&skb->users)) {
1784                 struct softnet_data *sd;
1785                 unsigned long flags;
1786
1787                 local_irq_save(flags);
1788                 sd = &__get_cpu_var(softnet_data);
1789                 skb->next = sd->completion_queue;
1790                 sd->completion_queue = skb;
1791                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1792                 local_irq_restore(flags);
1793         }
1794 }
1795 EXPORT_SYMBOL(dev_kfree_skb_irq);
1796
/*
 * Free @skb from any context: use the deferred dev_kfree_skb_irq() path when
 * running in hard-irq context or with interrupts disabled, and
 * dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled()) {
		dev_kfree_skb_irq(skb);
		return;
	}

	dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1805
1806
1807 /**
1808  * netif_device_detach - mark device as removed
1809  * @dev: network device
1810  *
1811  * Mark device as removed from system and therefore no longer available.
1812  */
1813 void netif_device_detach(struct net_device *dev)
1814 {
1815         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1816             netif_running(dev)) {
1817                 netif_tx_stop_all_queues(dev);
1818         }
1819 }
1820 EXPORT_SYMBOL(netif_device_detach);
1821
1822 /**
1823  * netif_device_attach - mark device as attached
1824  * @dev: network device
1825  *
1826  * Mark device as attached from system and restart if needed.
1827  */
1828 void netif_device_attach(struct net_device *dev)
1829 {
1830         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1831             netif_running(dev)) {
1832                 netif_tx_wake_all_queues(dev);
1833                 __netdev_watchdog_up(dev);
1834         }
1835 }
1836 EXPORT_SYMBOL(netif_device_attach);
1837
/**
 * skb_set_dev - assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * The dst reference is always dropped.  When the skb already belongs to a
 * device living in a different network namespace than @dev, all state that
 * is private to the old namespace (secpath, netfilter state, secmark, mark,
 * priority, nf_trace, ipvs_property and, with CONFIG_NET_SCHED, tc_index)
 * is reset before the new device is assigned.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	struct net_device *prev;

	skb_dst_drop(skb);

	prev = skb->dev;
	if (prev && !net_eq(dev_net(prev), dev_net(dev))) {
		/* Crossing a namespace boundary: scrub per-namespace state */
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}

	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1867
1868 /*
1869  * Invalidate hardware checksum when packet is to be mangled, and
1870  * complete checksum manually on outgoing path.
1871  */
1872 int skb_checksum_help(struct sk_buff *skb)
1873 {
1874         __wsum csum;
1875         int ret = 0, offset;
1876
1877         if (skb->ip_summed == CHECKSUM_COMPLETE)
1878                 goto out_set_summed;
1879
1880         if (unlikely(skb_shinfo(skb)->gso_size)) {
1881                 /* Let GSO fix up the checksum. */
1882                 goto out_set_summed;
1883         }
1884
1885         offset = skb_checksum_start_offset(skb);
1886         BUG_ON(offset >= skb_headlen(skb));
1887         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1888
1889         offset += skb->csum_offset;
1890         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1891
1892         if (skb_cloned(skb) &&
1893             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1894                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1895                 if (ret)
1896                         goto out;
1897         }
1898
1899         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1900 out_set_summed:
1901         skb->ip_summed = CHECKSUM_NONE;
1902 out:
1903         return ret;
1904 }
1905 EXPORT_SYMBOL(skb_checksum_help);
1906
1907 /**
1908  *      skb_gso_segment - Perform segmentation on skb.
1909  *      @skb: buffer to segment
1910  *      @features: features for the output path (see dev->features)
1911  *
1912  *      This function segments the given skb and returns a list of segments.
1913  *
1914  *      It may return NULL if the skb requires no segmentation.  This is
1915  *      only possible when GSO is used for verifying header integrity.
1916  */
1917 struct sk_buff *skb_gso_segment(struct sk_buff *skb,
1918         netdev_features_t features)
1919 {
1920         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1921         struct packet_type *ptype;
1922         __be16 type = skb->protocol;
1923         int vlan_depth = ETH_HLEN;
1924         int err;
1925
1926         while (type == htons(ETH_P_8021Q)) {
1927                 struct vlan_hdr *vh;
1928
1929                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1930                         return ERR_PTR(-EINVAL);
1931
1932                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1933                 type = vh->h_vlan_encapsulated_proto;
1934                 vlan_depth += VLAN_HLEN;
1935         }
1936
1937         skb_reset_mac_header(skb);
1938         skb->mac_len = skb->network_header - skb->mac_header;
1939         __skb_pull(skb, skb->mac_len);
1940
1941         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1942                 struct net_device *dev = skb->dev;
1943                 struct ethtool_drvinfo info = {};
1944
1945                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1946                         dev->ethtool_ops->get_drvinfo(dev, &info);
1947
1948                 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d ip_summed=%d\n",
1949                      info.driver, dev ? &dev->features : NULL,
1950                      skb->sk ? &skb->sk->sk_route_caps : NULL,
1951                      skb->len, skb->data_len, skb->ip_summed);
1952
1953                 if (skb_header_cloned(skb) &&
1954                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1955                         return ERR_PTR(err);
1956         }
1957
1958         rcu_read_lock();
1959         list_for_each_entry_rcu(ptype,
1960                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1961                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1962                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1963                                 err = ptype->gso_send_check(skb);
1964                                 segs = ERR_PTR(err);
1965                                 if (err || skb_gso_ok(skb, features))
1966                                         break;
1967                                 __skb_push(skb, (skb->data -
1968                                                  skb_network_header(skb)));
1969                         }
1970                         segs = ptype->gso_segment(skb, features);
1971                         break;
1972                 }
1973         }
1974         rcu_read_unlock();
1975
1976         __skb_push(skb, skb->data - skb_mac_header(skb));
1977
1978         return segs;
1979 }
1980 EXPORT_SYMBOL(skb_gso_segment);
1981
/*
 * Report a hardware receive checksum failure: log the device name (or
 * "<unknown>" when @dev is NULL) and dump the stack, rate limited.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	printk(KERN_ERR "%s: hw csum failure.\n",
		dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1994
/*
 * Return 1 when @skb has a page fragment that @dev cannot DMA from, else 0.
 * Without CONFIG_HIGHMEM this is always 0.
 *
 * Actually, this check should be eliminated as soon as we know that:
 * 1. an IOMMU is present and allows mapping all of memory, and
 * 2. no high memory really exists on this machine.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *parent;
	int n;

	/* Highmem fragments are a problem only without NETIF_F_HIGHDMA */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (n = 0; n < skb_shinfo(skb)->nr_frags; n++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[n];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	parent = dev->dev.parent;
	if (!parent)
		return 0;

	/* Every fragment page must end at or below the parent's DMA mask */
	for (n = 0; n < skb_shinfo(skb)->nr_frags; n++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[n];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		if (!parent->dma_mask ||
		    addr + PAGE_SIZE - 1 > *parent->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2027
2028 struct dev_gso_cb {
2029         void (*destructor)(struct sk_buff *skb);
2030 };
2031
2032 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2033
2034 static void dev_gso_skb_destructor(struct sk_buff *skb)
2035 {
2036         struct dev_gso_cb *cb;
2037
2038         do {
2039                 struct sk_buff *nskb = skb->next;
2040
2041                 skb->next = nskb->next;
2042                 nskb->next = NULL;
2043                 kfree_skb(nskb);
2044         } while (skb->next);
2045
2046         cb = DEV_GSO_CB(skb);
2047         if (cb->destructor)
2048                 cb->destructor(skb);
2049 }
2050
2051 /**
2052  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2053  *      @skb: buffer to segment
2054  *      @features: device features as applicable to this skb
2055  *
2056  *      This function segments the given skb and stores the list of segments
2057  *      in skb->next.
2058  */
2059 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2060 {
2061         struct sk_buff *segs;
2062
2063         segs = skb_gso_segment(skb, features);
2064
2065         /* Verifying header integrity only. */
2066         if (!segs)
2067                 return 0;
2068
2069         if (IS_ERR(segs))
2070                 return PTR_ERR(segs);
2071
2072         skb->next = segs;
2073         DEV_GSO_CB(skb)->destructor = skb->destructor;
2074         skb->destructor = dev_gso_skb_destructor;
2075
2076         return 0;
2077 }
2078
2079 /*
2080  * Try to orphan skb early, right before transmission by the device.
2081  * We cannot orphan skb if tx timestamp is requested or the sk-reference
2082  * is needed on driver level for other reasons, e.g. see net/can/raw.c
2083  */
2084 static inline void skb_orphan_try(struct sk_buff *skb)
2085 {
2086         struct sock *sk = skb->sk;
2087
2088         if (sk && !skb_shinfo(skb)->tx_flags) {
2089                 /* skb_tx_hash() wont be able to get sk.
2090                  * We copy sk_hash into skb->rxhash
2091                  */
2092                 if (!skb->rxhash)
2093                         skb->rxhash = sk->sk_hash;
2094                 skb_orphan(skb);
2095         }
2096 }
2097
2098 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2099 {
2100         return ((features & NETIF_F_GEN_CSUM) ||
2101                 ((features & NETIF_F_V4_CSUM) &&
2102                  protocol == htons(ETH_P_IP)) ||
2103                 ((features & NETIF_F_V6_CSUM) &&
2104                  protocol == htons(ETH_P_IPV6)) ||
2105                 ((features & NETIF_F_FCOE_CRC) &&
2106                  protocol == htons(ETH_P_FCOE)));
2107 }
2108
2109 static netdev_features_t harmonize_features(struct sk_buff *skb,
2110         __be16 protocol, netdev_features_t features)
2111 {
2112         if (!can_checksum_protocol(features, protocol)) {
2113                 features &= ~NETIF_F_ALL_CSUM;
2114                 features &= ~NETIF_F_SG;
2115         } else if (illegal_highdma(skb->dev, skb)) {
2116                 features &= ~NETIF_F_SG;
2117         }
2118
2119         return features;
2120 }
2121
2122 netdev_features_t netif_skb_features(struct sk_buff *skb)
2123 {
2124         __be16 protocol = skb->protocol;
2125         netdev_features_t features = skb->dev->features;
2126
2127         if (protocol == htons(ETH_P_8021Q)) {
2128                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2129                 protocol = veh->h_vlan_encapsulated_proto;
2130         } else if (!vlan_tx_tag_present(skb)) {
2131                 return harmonize_features(skb, protocol, features);
2132         }
2133
2134         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2135
2136         if (protocol != htons(ETH_P_8021Q)) {
2137                 return harmonize_features(skb, protocol, features);
2138         } else {
2139                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2140                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2141                 return harmonize_features(skb, protocol, features);
2142         }
2143 }
2144 EXPORT_SYMBOL(netif_skb_features);
2145
2146 /*
2147  * Returns true if either:
2148  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2149  *      2. skb is fragmented and the device does not support SG, or if
2150  *         at least one of fragments is in highmem and device does not
2151  *         support DMA from it.
2152  */
2153 static inline int skb_needs_linearize(struct sk_buff *skb,
2154                                       int features)
2155 {
2156         return skb_is_nonlinear(skb) &&
2157                         ((skb_has_frag_list(skb) &&
2158                                 !(features & NETIF_F_FRAGLIST)) ||
2159                         (skb_shinfo(skb)->nr_frags &&
2160                                 !(features & NETIF_F_SG)));
2161 }
2162
2163 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2164                         struct netdev_queue *txq)
2165 {
2166         const struct net_device_ops *ops = dev->netdev_ops;
2167         int rc = NETDEV_TX_OK;
2168         unsigned int skb_len;
2169
2170         if (likely(!skb->next)) {
2171                 netdev_features_t features;
2172
2173                 /*
2174                  * If device doesn't need skb->dst, release it right now while
2175                  * its hot in this cpu cache
2176                  */
2177                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2178                         skb_dst_drop(skb);
2179
2180                 if (!list_empty(&ptype_all))
2181                         dev_queue_xmit_nit(skb, dev);
2182
2183                 skb_orphan_try(skb);
2184
2185                 features = netif_skb_features(skb);
2186
2187                 if (vlan_tx_tag_present(skb) &&
2188                     !(features & NETIF_F_HW_VLAN_TX)) {
2189                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2190                         if (unlikely(!skb))
2191                                 goto out;
2192
2193                         skb->vlan_tci = 0;
2194                 }
2195
2196                 if (netif_needs_gso(skb, features)) {
2197                         if (unlikely(dev_gso_segment(skb, features)))
2198                                 goto out_kfree_skb;
2199                         if (skb->next)
2200                                 goto gso;
2201                 } else {
2202                         if (skb_needs_linearize(skb, features) &&
2203                             __skb_linearize(skb))
2204                                 goto out_kfree_skb;
2205
2206                         /* If packet is not checksummed and device does not
2207                          * support checksumming for this protocol, complete
2208                          * checksumming here.
2209                          */
2210                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2211                                 skb_set_transport_header(skb,
2212                                         skb_checksum_start_offset(skb));
2213                                 if (!(features & NETIF_F_ALL_CSUM) &&
2214                                      skb_checksum_help(skb))
2215                                         goto out_kfree_skb;
2216                         }
2217                 }
2218
2219                 skb_len = skb->len;
2220                 rc = ops->ndo_start_xmit(skb, dev);
2221                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2222                 if (rc == NETDEV_TX_OK)
2223                         txq_trans_update(txq);
2224                 return rc;
2225         }
2226
2227 gso:
2228         do {
2229                 struct sk_buff *nskb = skb->next;
2230
2231                 skb->next = nskb->next;
2232                 nskb->next = NULL;
2233
2234                 /*
2235                  * If device doesn't need nskb->dst, release it right now while
2236                  * its hot in this cpu cache
2237                  */
2238                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2239                         skb_dst_drop(nskb);
2240
2241                 skb_len = nskb->len;
2242                 rc = ops->ndo_start_xmit(nskb, dev);
2243                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2244                 if (unlikely(rc != NETDEV_TX_OK)) {
2245                         if (rc & ~NETDEV_TX_MASK)
2246                                 goto out_kfree_gso_skb;
2247                         nskb->next = skb->next;
2248                         skb->next = nskb;
2249                         return rc;
2250                 }
2251                 txq_trans_update(txq);
2252                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2253                         return NETDEV_TX_BUSY;
2254         } while (skb->next);
2255
2256 out_kfree_gso_skb:
2257         if (likely(skb->next == NULL))
2258                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2259 out_kfree_skb:
2260         kfree_skb(skb);
2261 out:
2262         return rc;
2263 }
2264
2265 static u32 hashrnd __read_mostly;
2266
2267 /*
2268  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2269  * to be used as a distribution range.
2270  */
2271 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2272                   unsigned int num_tx_queues)
2273 {
2274         u32 hash;
2275         u16 qoffset = 0;
2276         u16 qcount = num_tx_queues;
2277
2278         if (skb_rx_queue_recorded(skb)) {
2279                 hash = skb_get_rx_queue(skb);
2280                 while (unlikely(hash >= num_tx_queues))
2281                         hash -= num_tx_queues;
2282                 return hash;
2283         }
2284
2285         if (dev->num_tc) {
2286                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2287                 qoffset = dev->tc_to_txq[tc].offset;
2288                 qcount = dev->tc_to_txq[tc].count;
2289         }
2290
2291         if (skb->sk && skb->sk->sk_hash)
2292                 hash = skb->sk->sk_hash;
2293         else
2294                 hash = (__force u16) skb->protocol ^ skb->rxhash;
2295         hash = jhash_1word(hash, hashrnd);
2296
2297         return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2298 }
2299 EXPORT_SYMBOL(__skb_tx_hash);
2300
2301 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2302 {
2303         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2304                 if (net_ratelimit()) {
2305                         pr_warning("%s selects TX queue %d, but "
2306                                 "real number of TX queues is %d\n",
2307                                 dev->name, queue_index, dev->real_num_tx_queues);
2308                 }
2309                 return 0;
2310         }
2311         return queue_index;
2312 }
2313
/*
 * Look up a TX queue for @skb in the device's XPS map of the current CPU.
 * Returns the queue index, or -1 when XPS is compiled out, no map exists
 * for this CPU, or the mapped queue is not among the queues in use.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map;
	int txq = -1;
	u32 hash;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
	if (!cpu_map)
		goto unlock;

	if (cpu_map->len == 1) {
		txq = cpu_map->queues[0];
	} else {
		/* Several candidates: spread flows over them by hash */
		if (skb->sk && skb->sk->sk_hash)
			hash = skb->sk->sk_hash;
		else
			hash = (__force u16) skb->protocol ^
			    skb->rxhash;
		hash = jhash_1word(hash, hashrnd);
		txq = cpu_map->queues[((u64)hash * cpu_map->len) >> 32];
	}

	if (unlikely(txq >= dev->real_num_tx_queues))
		txq = -1;

unlock:
	rcu_read_unlock();

	return txq;
#else
	return -1;
#endif
}
2351
2352 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2353                                         struct sk_buff *skb)
2354 {
2355         int queue_index;
2356         const struct net_device_ops *ops = dev->netdev_ops;
2357
2358         if (dev->real_num_tx_queues == 1)
2359                 queue_index = 0;
2360         else if (ops->ndo_select_queue) {
2361                 queue_index = ops->ndo_select_queue(dev, skb);
2362                 queue_index = dev_cap_txqueue(dev, queue_index);
2363         } else {
2364                 struct sock *sk = skb->sk;
2365                 queue_index = sk_tx_queue_get(sk);
2366
2367                 if (queue_index < 0 || skb->ooo_okay ||
2368                     queue_index >= dev->real_num_tx_queues) {
2369                         int old_index = queue_index;
2370
2371                         queue_index = get_xps_queue(dev, skb);
2372                         if (queue_index < 0)
2373                                 queue_index = skb_tx_hash(dev, skb);
2374
2375                         if (queue_index != old_index && sk) {
2376                                 struct dst_entry *dst =
2377                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2378
2379                                 if (dst && skb_dst(skb) == dst)
2380                                         sk_tx_queue_set(sk, queue_index);
2381                         }
2382                 }
2383         }
2384
2385         skb_set_queue_mapping(skb, queue_index);
2386         return netdev_get_tx_queue(dev, queue_index);
2387 }
2388
2389 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2390                                  struct net_device *dev,
2391                                  struct netdev_queue *txq)
2392 {
2393         spinlock_t *root_lock = qdisc_lock(q);
2394         bool contended;
2395         int rc;
2396
2397         qdisc_skb_cb(skb)->pkt_len = skb->len;
2398         qdisc_calculate_pkt_len(skb, q);
2399         /*
2400          * Heuristic to force contended enqueues to serialize on a
2401          * separate lock before trying to get qdisc main lock.
2402          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2403          * and dequeue packets faster.
2404          */
2405         contended = qdisc_is_running(q);
2406         if (unlikely(contended))
2407                 spin_lock(&q->busylock);
2408
2409         spin_lock(root_lock);
2410         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2411                 kfree_skb(skb);
2412                 rc = NET_XMIT_DROP;
2413         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2414                    qdisc_run_begin(q)) {
2415                 /*
2416                  * This is a work-conserving queue; there are no old skbs
2417                  * waiting to be sent out; and the qdisc is not running -
2418                  * xmit the skb directly.
2419                  */
2420                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2421                         skb_dst_force(skb);
2422
2423                 qdisc_bstats_update(q, skb);
2424
2425                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2426                         if (unlikely(contended)) {
2427                                 spin_unlock(&q->busylock);
2428                                 contended = false;
2429                         }
2430                         __qdisc_run(q);
2431                 } else
2432                         qdisc_run_end(q);
2433
2434                 rc = NET_XMIT_SUCCESS;
2435         } else {
2436                 skb_dst_force(skb);
2437                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2438                 if (qdisc_run_begin(q)) {
2439                         if (unlikely(contended)) {
2440                                 spin_unlock(&q->busylock);
2441                                 contended = false;
2442                         }
2443                         __qdisc_run(q);
2444                 }
2445         }
2446         spin_unlock(root_lock);
2447         if (unlikely(contended))
2448                 spin_unlock(&q->busylock);
2449         return rc;
2450 }
2451
2452 static DEFINE_PER_CPU(int, xmit_recursion);
2453 #define RECURSION_LIMIT 10
2454
2455 /**
2456  *      dev_queue_xmit - transmit a buffer
2457  *      @skb: buffer to transmit
2458  *
2459  *      Queue a buffer for transmission to a network device. The caller must
2460  *      have set the device and priority and built the buffer before calling
2461  *      this function. The function can be called from an interrupt.
2462  *
2463  *      A negative errno code is returned on a failure. A success does not
2464  *      guarantee the frame will be transmitted as it may be dropped due
2465  *      to congestion or traffic shaping.
2466  *
2467  * -----------------------------------------------------------------------------------
2468  *      I notice this method can also return errors from the queue disciplines,
2469  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2470  *      be positive.
2471  *
2472  *      Regardless of the return value, the skb is consumed, so it is currently
2473  *      difficult to retry a send to this method.  (You can bump the ref count
2474  *      before sending to hold a reference for retry if you are careful.)
2475  *
2476  *      When calling this method, interrupts MUST be enabled.  This is because
2477  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2478  *          --BLG
2479  */
2480 int dev_queue_xmit(struct sk_buff *skb)
2481 {
2482         struct net_device *dev = skb->dev;
2483         struct netdev_queue *txq;
2484         struct Qdisc *q;
2485         int rc = -ENOMEM;
2486
2487         /* Disable soft irqs for various locks below. Also
2488          * stops preemption for RCU.
2489          */
2490         rcu_read_lock_bh();
2491
2492         txq = dev_pick_tx(dev, skb);
2493         q = rcu_dereference_bh(txq->qdisc);
2494
2495 #ifdef CONFIG_NET_CLS_ACT
2496         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2497 #endif
2498         trace_net_dev_queue(skb);
2499         if (q->enqueue) {
2500                 rc = __dev_xmit_skb(skb, q, dev, txq);
2501                 goto out;
2502         }
2503
2504         /* The device has no queue. Common case for software devices:
2505            loopback, all the sorts of tunnels...
2506
2507            Really, it is unlikely that netif_tx_lock protection is necessary
2508            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2509            counters.)
2510            However, it is possible, that they rely on protection
2511            made by us here.
2512
2513            Check this and shot the lock. It is not prone from deadlocks.
2514            Either shot noqueue qdisc, it is even simpler 8)
2515          */
2516         if (dev->flags & IFF_UP) {
2517                 int cpu = smp_processor_id(); /* ok because BHs are off */
2518
2519                 if (txq->xmit_lock_owner != cpu) {
2520
2521                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2522                                 goto recursion_alert;
2523
2524                         HARD_TX_LOCK(dev, txq, cpu);
2525
2526                         if (!netif_tx_queue_stopped(txq)) {
2527                                 __this_cpu_inc(xmit_recursion);
2528                                 rc = dev_hard_start_xmit(skb, dev, txq);
2529                                 __this_cpu_dec(xmit_recursion);
2530                                 if (dev_xmit_complete(rc)) {
2531                                         HARD_TX_UNLOCK(dev, txq);
2532                                         goto out;
2533                                 }
2534                         }
2535                         HARD_TX_UNLOCK(dev, txq);
2536                         if (net_ratelimit())
2537                                 printk(KERN_CRIT "Virtual device %s asks to "
2538                                        "queue packet!\n", dev->name);
2539                 } else {
2540                         /* Recursion is detected! It is possible,
2541                          * unfortunately
2542                          */
2543 recursion_alert:
2544                         if (net_ratelimit())
2545                                 printk(KERN_CRIT "Dead loop on virtual device "
2546                                        "%s, fix it urgently!\n", dev->name);
2547                 }
2548         }
2549
2550         rc = -ENETDOWN;
2551         rcu_read_unlock_bh();
2552
2553         kfree_skb(skb);
2554         return rc;
2555 out:
2556         rcu_read_unlock_bh();
2557         return rc;
2558 }
2559 EXPORT_SYMBOL(dev_queue_xmit);
2560
2561
2562 /*=======================================================================
2563                         Receiver routines
2564   =======================================================================*/
2565
2566 int netdev_max_backlog __read_mostly = 1000;
2567 int netdev_tstamp_prequeue __read_mostly = 1;
2568 int netdev_budget __read_mostly = 300;
2569 int weight_p __read_mostly = 64;            /* old backlog weight */
2570
2571 /* Called with irq disabled */
2572 static inline void ____napi_schedule(struct softnet_data *sd,
2573                                      struct napi_struct *napi)
2574 {
2575         list_add_tail(&napi->poll_list, &sd->poll_list);
2576         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2577 }
2578
2579 /*
2580  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2581  * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2582  * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2583  * if hash is a canonical 4-tuple hash over transport ports.
2584  */
2585 void __skb_get_rxhash(struct sk_buff *skb)
2586 {
2587         int nhoff, hash = 0, poff;
2588         const struct ipv6hdr *ip6;
2589         const struct iphdr *ip;
2590         const struct vlan_hdr *vlan;
2591         u8 ip_proto;
2592         u32 addr1, addr2;
2593         u16 proto;
2594         union {
2595                 u32 v32;
2596                 u16 v16[2];
2597         } ports;
2598
2599         nhoff = skb_network_offset(skb);
2600         proto = skb->protocol;
2601
2602 again:
2603         switch (proto) {
2604         case __constant_htons(ETH_P_IP):
2605 ip:
2606                 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2607                         goto done;
2608
2609                 ip = (const struct iphdr *) (skb->data + nhoff);
2610                 if (ip_is_fragment(ip))
2611                         ip_proto = 0;
2612                 else
2613                         ip_proto = ip->protocol;
2614                 addr1 = (__force u32) ip->saddr;
2615                 addr2 = (__force u32) ip->daddr;
2616                 nhoff += ip->ihl * 4;
2617                 break;
2618         case __constant_htons(ETH_P_IPV6):
2619 ipv6:
2620                 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2621                         goto done;
2622
2623                 ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2624                 ip_proto = ip6->nexthdr;
2625                 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2626                 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2627                 nhoff += 40;
2628                 break;
2629         case __constant_htons(ETH_P_8021Q):
2630                 if (!pskb_may_pull(skb, sizeof(*vlan) + nhoff))
2631                         goto done;
2632                 vlan = (const struct vlan_hdr *) (skb->data + nhoff);
2633                 proto = vlan->h_vlan_encapsulated_proto;
2634                 nhoff += sizeof(*vlan);
2635                 goto again;
2636         case __constant_htons(ETH_P_PPP_SES):
2637                 if (!pskb_may_pull(skb, PPPOE_SES_HLEN + nhoff))
2638                         goto done;
2639                 proto = *((__be16 *) (skb->data + nhoff +
2640                                       sizeof(struct pppoe_hdr)));
2641                 nhoff += PPPOE_SES_HLEN;
2642                 switch (proto) {
2643                 case __constant_htons(PPP_IP):
2644                         goto ip;
2645                 case __constant_htons(PPP_IPV6):
2646                         goto ipv6;
2647                 default:
2648                         goto done;
2649                 }
2650         default:
2651                 goto done;
2652         }
2653
2654         switch (ip_proto) {
2655         case IPPROTO_GRE:
2656                 if (pskb_may_pull(skb, nhoff + 16)) {
2657                         u8 *h = skb->data + nhoff;
2658                         __be16 flags = *(__be16 *)h;
2659
2660                         /*
2661                          * Only look inside GRE if version zero and no
2662                          * routing
2663                          */
2664                         if (!(flags & (GRE_VERSION|GRE_ROUTING))) {
2665                                 proto = *(__be16 *)(h + 2);
2666                                 nhoff += 4;
2667                                 if (flags & GRE_CSUM)
2668                                         nhoff += 4;
2669                                 if (flags & GRE_KEY)
2670                                         nhoff += 4;
2671                                 if (flags & GRE_SEQ)
2672                                         nhoff += 4;
2673                                 goto again;
2674                         }
2675                 }
2676                 break;
2677         case IPPROTO_IPIP:
2678                 goto again;
2679         default:
2680                 break;
2681         }
2682
2683         ports.v32 = 0;
2684         poff = proto_ports_offset(ip_proto);
2685         if (poff >= 0) {
2686                 nhoff += poff;
2687                 if (pskb_may_pull(skb, nhoff + 4)) {
2688                         ports.v32 = * (__force u32 *) (skb->data + nhoff);
2689                         if (ports.v16[1] < ports.v16[0])
2690                                 swap(ports.v16[0], ports.v16[1]);
2691                         skb->l4_rxhash = 1;
2692                 }
2693         }
2694
2695         /* get a consistent hash (same value on both flow directions) */
2696         if (addr2 < addr1)
2697                 swap(addr1, addr2);
2698
2699         hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2700         if (!hash)
2701                 hash = 1;
2702
2703 done:
2704         skb->rxhash = hash;
2705 }
2706 EXPORT_SYMBOL(__skb_get_rxhash);
2707
2708 #ifdef CONFIG_RPS
2709
2710 /* One global table that all flow-based protocols share. */
2711 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2712 EXPORT_SYMBOL(rps_sock_flow_table);
2713
2714 static struct rps_dev_flow *
2715 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2716             struct rps_dev_flow *rflow, u16 next_cpu)
2717 {
2718         if (next_cpu != RPS_NO_CPU) {
2719 #ifdef CONFIG_RFS_ACCEL
2720                 struct netdev_rx_queue *rxqueue;
2721                 struct rps_dev_flow_table *flow_table;
2722                 struct rps_dev_flow *old_rflow;
2723                 u32 flow_id;
2724                 u16 rxq_index;
2725                 int rc;
2726
2727                 /* Should we steer this flow to a different hardware queue? */
2728                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2729                     !(dev->features & NETIF_F_NTUPLE))
2730                         goto out;
2731                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2732                 if (rxq_index == skb_get_rx_queue(skb))
2733                         goto out;
2734
2735                 rxqueue = dev->_rx + rxq_index;
2736                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2737                 if (!flow_table)
2738                         goto out;
2739                 flow_id = skb->rxhash & flow_table->mask;
2740                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2741                                                         rxq_index, flow_id);
2742                 if (rc < 0)
2743                         goto out;
2744                 old_rflow = rflow;
2745                 rflow = &flow_table->flows[flow_id];
2746                 rflow->filter = rc;
2747                 if (old_rflow->filter == rflow->filter)
2748                         old_rflow->filter = RPS_NO_FILTER;
2749         out:
2750 #endif
2751                 rflow->last_qtail =
2752                         per_cpu(softnet_data, next_cpu).input_queue_head;
2753         }
2754
2755         rflow->cpu = next_cpu;
2756         return rflow;
2757 }
2758
2759 /*
2760  * get_rps_cpu is called from netif_receive_skb and returns the target
2761  * CPU from the RPS map of the receiving queue for a given skb.
2762  * rcu_read_lock must be held on entry.
2763  */
2764 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2765                        struct rps_dev_flow **rflowp)
2766 {
2767         struct netdev_rx_queue *rxqueue;
2768         struct rps_map *map;
2769         struct rps_dev_flow_table *flow_table;
2770         struct rps_sock_flow_table *sock_flow_table;
2771         int cpu = -1;
2772         u16 tcpu;
2773
2774         if (skb_rx_queue_recorded(skb)) {
2775                 u16 index = skb_get_rx_queue(skb);
2776                 if (unlikely(index >= dev->real_num_rx_queues)) {
2777                         WARN_ONCE(dev->real_num_rx_queues > 1,
2778                                   "%s received packet on queue %u, but number "
2779                                   "of RX queues is %u\n",
2780                                   dev->name, index, dev->real_num_rx_queues);
2781                         goto done;
2782                 }
2783                 rxqueue = dev->_rx + index;
2784         } else
2785                 rxqueue = dev->_rx;
2786
2787         map = rcu_dereference(rxqueue->rps_map);
2788         if (map) {
2789                 if (map->len == 1 &&
2790                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
2791                         tcpu = map->cpus[0];
2792                         if (cpu_online(tcpu))
2793                                 cpu = tcpu;
2794                         goto done;
2795                 }
2796         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2797                 goto done;
2798         }
2799
2800         skb_reset_network_header(skb);
2801         if (!skb_get_rxhash(skb))
2802                 goto done;
2803
2804         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2805         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2806         if (flow_table && sock_flow_table) {
2807                 u16 next_cpu;
2808                 struct rps_dev_flow *rflow;
2809
2810                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2811                 tcpu = rflow->cpu;
2812
2813                 next_cpu = sock_flow_table->ents[skb->rxhash &
2814                     sock_flow_table->mask];
2815
2816                 /*
2817                  * If the desired CPU (where last recvmsg was done) is
2818                  * different from current CPU (one in the rx-queue flow
2819                  * table entry), switch if one of the following holds:
2820                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2821                  *   - Current CPU is offline.
2822                  *   - The current CPU's queue tail has advanced beyond the
2823                  *     last packet that was enqueued using this table entry.
2824                  *     This guarantees that all previous packets for the flow
2825                  *     have been dequeued, thus preserving in order delivery.
2826                  */
2827                 if (unlikely(tcpu != next_cpu) &&
2828                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2829                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2830                       rflow->last_qtail)) >= 0))
2831                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2832
2833                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2834                         *rflowp = rflow;
2835                         cpu = tcpu;
2836                         goto done;
2837                 }
2838         }
2839
2840         if (map) {
2841                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2842
2843                 if (cpu_online(tcpu)) {
2844                         cpu = tcpu;
2845                         goto done;
2846                 }
2847         }
2848
2849 done:
2850         return cpu;
2851 }
2852
2853 #ifdef CONFIG_RFS_ACCEL
2854
2855 /**
2856  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2857  * @dev: Device on which the filter was set
2858  * @rxq_index: RX queue index
2859  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2860  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2861  *
2862  * Drivers that implement ndo_rx_flow_steer() should periodically call
2863  * this function for each installed filter and remove the filters for
2864  * which it returns %true.
2865  */
2866 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2867                          u32 flow_id, u16 filter_id)
2868 {
2869         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2870         struct rps_dev_flow_table *flow_table;
2871         struct rps_dev_flow *rflow;
2872         bool expire = true;
2873         int cpu;
2874
2875         rcu_read_lock();
2876         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2877         if (flow_table && flow_id <= flow_table->mask) {
2878                 rflow = &flow_table->flows[flow_id];
2879                 cpu = ACCESS_ONCE(rflow->cpu);
2880                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2881                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2882                            rflow->last_qtail) <
2883                      (int)(10 * flow_table->mask)))
2884                         expire = false;
2885         }
2886         rcu_read_unlock();
2887         return expire;
2888 }
2889 EXPORT_SYMBOL(rps_may_expire_flow);
2890
2891 #endif /* CONFIG_RFS_ACCEL */
2892
2893 /* Called from hardirq (IPI) context */
2894 static void rps_trigger_softirq(void *data)
2895 {
2896         struct softnet_data *sd = data;
2897
2898         ____napi_schedule(sd, &sd->backlog);
2899         sd->received_rps++;
2900 }
2901
2902 #endif /* CONFIG_RPS */
2903
/*
 * Check whether @sd belongs to another CPU.
 * If so, chain it on this CPU's IPI list, raise NET_RX_SOFTIRQ and
 * return 1.  Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = &__get_cpu_var(softnet_data);

	if (sd == local_sd)
		return 0;

	/* push onto the head of the local singly linked IPI list */
	sd->rps_ipi_next = local_sd->rps_ipi_list;
	local_sd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2924
2925 /*
2926  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2927  * queue (may be a remote CPU queue).
2928  */
2929 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2930                               unsigned int *qtail)
2931 {
2932         struct softnet_data *sd;
2933         unsigned long flags;
2934
2935         sd = &per_cpu(softnet_data, cpu);
2936
2937         local_irq_save(flags);
2938
2939         rps_lock(sd);
2940         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2941                 if (skb_queue_len(&sd->input_pkt_queue)) {
2942 enqueue:
2943                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2944                         input_queue_tail_incr_save(sd, qtail);
2945                         rps_unlock(sd);
2946                         local_irq_restore(flags);
2947                         return NET_RX_SUCCESS;
2948                 }
2949
2950                 /* Schedule NAPI for backlog device
2951                  * We can use non atomic operation since we own the queue lock
2952                  */
2953                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2954                         if (!rps_ipi_queued(sd))
2955                                 ____napi_schedule(sd, &sd->backlog);
2956                 }
2957                 goto enqueue;
2958         }
2959
2960         sd->dropped++;
2961         rps_unlock(sd);
2962
2963         local_irq_restore(flags);
2964
2965         atomic_long_inc(&skb->dev->rx_dropped);
2966         kfree_skb(skb);
2967         return NET_RX_DROP;
2968 }
2969
2970 /**
2971  *      netif_rx        -       post buffer to the network code
2972  *      @skb: buffer to post
2973  *
2974  *      This function receives a packet from a device driver and queues it for
2975  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2976  *      may be dropped during processing for congestion control or by the
2977  *      protocol layers.
2978  *
2979  *      return values:
2980  *      NET_RX_SUCCESS  (no congestion)
2981  *      NET_RX_DROP     (packet was dropped)
2982  *
2983  */
2984
2985 int netif_rx(struct sk_buff *skb)
2986 {
2987         int ret;
2988
2989         /* if netpoll wants it, pretend we never saw it */
2990         if (netpoll_rx(skb))
2991                 return NET_RX_DROP;
2992
2993         net_timestamp_check(netdev_tstamp_prequeue, skb);
2994
2995         trace_netif_rx(skb);
2996 #ifdef CONFIG_RPS
2997         {
2998                 struct rps_dev_flow voidflow, *rflow = &voidflow;
2999                 int cpu;
3000
3001                 preempt_disable();
3002                 rcu_read_lock();
3003
3004                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3005                 if (cpu < 0)
3006                         cpu = smp_processor_id();
3007
3008                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3009
3010                 rcu_read_unlock();
3011                 preempt_enable();
3012         }
3013 #else
3014         {
3015                 unsigned int qtail;
3016                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3017                 put_cpu();
3018         }
3019 #endif
3020         return ret;
3021 }
3022 EXPORT_SYMBOL(netif_rx);
3023
/*
 * Variant of netif_rx() that, with preemption disabled, also runs any
 * softirq left pending after the packet has been queued.
 * Returns the value returned by netif_rx().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3037
3038 static void net_tx_action(struct softirq_action *h)
3039 {
3040         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3041
3042         if (sd->completion_queue) {
3043                 struct sk_buff *clist;
3044
3045                 local_irq_disable();
3046                 clist = sd->completion_queue;
3047                 sd->completion_queue = NULL;
3048                 local_irq_enable();
3049
3050                 while (clist) {
3051                         struct sk_buff *skb = clist;
3052                         clist = clist->next;
3053
3054                         WARN_ON(atomic_read(&skb->users));
3055                         trace_kfree_skb(skb, net_tx_action);
3056                         __kfree_skb(skb);
3057                 }
3058         }
3059
3060         if (sd->output_queue) {
3061                 struct Qdisc *head;
3062
3063                 local_irq_disable();
3064                 head = sd->output_queue;
3065                 sd->output_queue = NULL;
3066                 sd->output_queue_tailp = &sd->output_queue;
3067                 local_irq_enable();
3068
3069                 while (head) {
3070                         struct Qdisc *q = head;
3071                         spinlock_t *root_lock;
3072
3073                         head = head->next_sched;
3074
3075                         root_lock = qdisc_lock(q);
3076                         if (spin_trylock(root_lock)) {
3077                                 smp_mb__before_clear_bit();
3078                                 clear_bit(__QDISC_STATE_SCHED,
3079                                           &q->state);
3080                                 qdisc_run(q);
3081                                 spin_unlock(root_lock);
3082                         } else {
3083                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3084                                               &q->state)) {
3085                                         __netif_reschedule(q);
3086                                 } else {
3087                                         smp_mb__before_clear_bit();
3088                                         clear_bit(__QDISC_STATE_SCHED,
3089                                                   &q->state);
3090                                 }
3091                         }
3092                 }
3093         }
3094 }
3095
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/*
 * Hook defined here for ATM LANE; only built when both the bridge and
 * ATM LANE are configured (built-in or modular).
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3103
3104 #ifdef CONFIG_NET_CLS_ACT
3105 /* TODO: Maybe we should just force sch_ingress to be compiled in
3106  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3107  * a compare and 2 stores extra right now if we dont have it on
3108  * but have CONFIG_NET_CLS_ACT
3109  * NOTE: This doesn't stop any functionality; if you dont have
3110  * the ingress scheduler, you just can't add policies on ingress.
3111  *
3112  */
3113 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3114 {
3115         struct net_device *dev = skb->dev;
3116         u32 ttl = G_TC_RTTL(skb->tc_verd);
3117         int result = TC_ACT_OK;
3118         struct Qdisc *q;
3119
3120         if (unlikely(MAX_RED_LOOP < ttl++)) {
3121                 if (net_ratelimit())
3122                         pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
3123                                skb->skb_iif, dev->ifindex);
3124                 return TC_ACT_SHOT;
3125         }
3126
3127         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3128         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3129
3130         q = rxq->qdisc;
3131         if (q != &noop_qdisc) {
3132                 spin_lock(qdisc_lock(q));
3133                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3134                         result = qdisc_enqueue_root(skb, q);
3135                 spin_unlock(qdisc_lock(q));
3136         }
3137
3138         return result;
3139 }
3140
3141 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3142                                          struct packet_type **pt_prev,
3143                                          int *ret, struct net_device *orig_dev)
3144 {
3145         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3146
3147         if (!rxq || rxq->qdisc == &noop_qdisc)
3148                 goto out;
3149
3150         if (*pt_prev) {
3151                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3152                 *pt_prev = NULL;
3153         }
3154
3155         switch (ing_filter(skb, rxq)) {
3156         case TC_ACT_SHOT:
3157         case TC_ACT_STOLEN:
3158                 kfree_skb(skb);
3159                 return NULL;
3160         }
3161
3162 out:
3163         skb->tc_verd = 0;
3164         return skb;
3165 }
3166 #endif
3167
3168 /**
3169  *      netdev_rx_handler_register - register receive handler
3170  *      @dev: device to register a handler for
3171  *      @rx_handler: receive handler to register
3172  *      @rx_handler_data: data pointer that is used by rx handler
3173  *
3174  *      Register a receive hander for a device. This handler will then be
3175  *      called from __netif_receive_skb. A negative errno code is returned
3176  *      on a failure.
3177  *
3178  *      The caller must hold the rtnl_mutex.
3179  *
3180  *      For a general description of rx_handler, see enum rx_handler_result.
3181  */
3182 int netdev_rx_handler_register(struct net_device *dev,
3183                                rx_handler_func_t *rx_handler,
3184                                void *rx_handler_data)
3185 {
3186         ASSERT_RTNL();
3187
3188         if (dev->rx_handler)
3189                 return -EBUSY;
3190
3191         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3192         rcu_assign_pointer(dev->rx_handler, rx_handler);
3193
3194         return 0;
3195 }
3196 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3197
3198 /**
3199  *      netdev_rx_handler_unregister - unregister receive handler
3200  *      @dev: device to unregister a handler from
3201  *
3202  *      Unregister a receive hander from a device.
3203  *
3204  *      The caller must hold the rtnl_mutex.
3205  */
3206 void netdev_rx_handler_unregister(struct net_device *dev)
3207 {
3208
3209         ASSERT_RTNL();
3210         RCU_INIT_POINTER(dev->rx_handler, NULL);
3211         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3212 }
3213 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3214
3215 static int __netif_receive_skb(struct sk_buff *skb)
3216 {
3217         struct packet_type *ptype, *pt_prev;
3218         rx_handler_func_t *rx_handler;
3219         struct net_device *orig_dev;
3220         struct net_device *null_or_dev;
3221         bool deliver_exact = false;
3222         int ret = NET_RX_DROP;
3223         __be16 type;
3224
3225         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3226
3227         trace_netif_receive_skb(skb);
3228
3229         /* if we've gotten here through NAPI, check netpoll */
3230         if (netpoll_receive_skb(skb))
3231                 return NET_RX_DROP;
3232
3233         if (!skb->skb_iif)
3234                 skb->skb_iif = skb->dev->ifindex;
3235         orig_dev = skb->dev;
3236
3237         skb_reset_network_header(skb);
3238         skb_reset_transport_header(skb);
3239         skb_reset_mac_len(skb);
3240
3241         pt_prev = NULL;
3242
3243         rcu_read_lock();
3244
3245 another_round:
3246
3247         __this_cpu_inc(softnet_data.processed);
3248
3249         if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3250                 skb = vlan_untag(skb);
3251                 if (unlikely(!skb))
3252                         goto out;
3253         }
3254
3255 #ifdef CONFIG_NET_CLS_ACT
3256         if (skb->tc_verd & TC_NCLS) {
3257                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3258                 goto ncls;
3259         }
3260 #endif
3261
3262         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3263                 if (!ptype->dev || ptype->dev == skb->dev) {
3264                         if (pt_prev)
3265                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3266                         pt_prev = ptype;
3267                 }
3268         }
3269
3270 #ifdef CONFIG_NET_CLS_ACT
3271         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3272         if (!skb)
3273                 goto out;
3274 ncls:
3275 #endif
3276
3277         rx_handler = rcu_dereference(skb->dev->rx_handler);
3278         if (vlan_tx_tag_present(skb)) {
3279                 if (pt_prev) {
3280                         ret = deliver_skb(skb, pt_prev, orig_dev);
3281                         pt_prev = NULL;
3282                 }
3283                 if (vlan_do_receive(&skb, !rx_handler))
3284                         goto another_round;
3285                 else if (unlikely(!skb))
3286                         goto out;
3287         }
3288
3289         if (rx_handler) {
3290                 if (pt_prev) {
3291                         ret = deliver_skb(skb, pt_prev, orig_dev);
3292                         pt_prev = NULL;
3293                 }
3294                 switch (rx_handler(&skb)) {
3295                 case RX_HANDLER_CONSUMED:
3296                         goto out;
3297                 case RX_HANDLER_ANOTHER:
3298                         goto another_round;
3299                 case RX_HANDLER_EXACT:
3300                         deliver_exact = true;
3301                 case RX_HANDLER_PASS:
3302                         break;
3303                 default:
3304                         BUG();
3305                 }
3306         }
3307
3308         /* deliver only exact match when indicated */
3309         null_or_dev = deliver_exact ? skb->dev : NULL;
3310
3311         type = skb->protocol;
3312         list_for_each_entry_rcu(ptype,
3313                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3314                 if (ptype->type == type &&
3315                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3316                      ptype->dev == orig_dev)) {
3317                         if (pt_prev)
3318                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3319                         pt_prev = ptype;
3320                 }
3321         }
3322
3323         if (pt_prev) {
3324                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3325         } else {
3326                 atomic_long_inc(&skb->dev->rx_dropped);
3327                 kfree_skb(skb);
3328                 /* Jamal, now you will not able to escape explaining
3329                  * me how you were going to use this. :-)
3330                  */
3331                 ret = NET_RX_DROP;
3332         }
3333
3334 out:
3335         rcu_read_unlock();
3336         return ret;
3337 }
3338
3339 /**
3340  *      netif_receive_skb - process receive buffer from network
3341  *      @skb: buffer to process
3342  *
3343  *      netif_receive_skb() is the main receive data processing function.
3344  *      It always succeeds. The buffer may be dropped during processing
3345  *      for congestion control or by the protocol layers.
3346  *
3347  *      This function may only be called from softirq context and interrupts
3348  *      should be enabled.
3349  *
3350  *      Return values (usually ignored):
3351  *      NET_RX_SUCCESS: no congestion
3352  *      NET_RX_DROP: packet was dropped
3353  */
3354 int netif_receive_skb(struct sk_buff *skb)
3355 {
3356         net_timestamp_check(netdev_tstamp_prequeue, skb);
3357
3358         if (skb_defer_rx_timestamp(skb))
3359                 return NET_RX_SUCCESS;
3360
3361 #ifdef CONFIG_RPS
3362         {
3363                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3364                 int cpu, ret;
3365
3366                 rcu_read_lock();
3367
3368                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3369
3370                 if (cpu >= 0) {
3371                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3372                         rcu_read_unlock();
3373                 } else {
3374                         rcu_read_unlock();
3375                         ret = __netif_receive_skb(skb);
3376                 }
3377
3378                 return ret;
3379         }
3380 #else
3381         return __netif_receive_skb(skb);
3382 #endif
3383 }
3384 EXPORT_SYMBOL(netif_receive_skb);
3385
3386 /* Network device is going away, flush any packets still pending
3387  * Called with irqs disabled.
3388  */
3389 static void flush_backlog(void *arg)
3390 {
3391         struct net_device *dev = arg;
3392         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3393         struct sk_buff *skb, *tmp;
3394
3395         rps_lock(sd);
3396         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3397                 if (skb->dev == dev) {
3398                         __skb_unlink(skb, &sd->input_pkt_queue);
3399                         kfree_skb(skb);
3400                         input_queue_head_incr(sd);
3401                 }
3402         }
3403         rps_unlock(sd);
3404
3405         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3406                 if (skb->dev == dev) {
3407                         __skb_unlink(skb, &sd->process_queue);
3408                         kfree_skb(skb);
3409                         input_queue_head_incr(sd);
3410                 }
3411         }
3412 }
3413
3414 static int napi_gro_complete(struct sk_buff *skb)
3415 {
3416         struct packet_type *ptype;
3417         __be16 type = skb->protocol;
3418         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3419         int err = -ENOENT;
3420
3421         if (NAPI_GRO_CB(skb)->count == 1) {
3422                 skb_shinfo(skb)->gso_size = 0;
3423                 goto out;
3424         }
3425
3426         rcu_read_lock();
3427         list_for_each_entry_rcu(ptype, head, list) {
3428                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3429                         continue;
3430
3431                 err = ptype->gro_complete(skb);
3432                 break;
3433         }
3434         rcu_read_unlock();
3435
3436         if (err) {
3437                 WARN_ON(&ptype->list == head);
3438                 kfree_skb(skb);
3439                 return NET_RX_SUCCESS;
3440         }
3441
3442 out:
3443         return netif_receive_skb(skb);
3444 }
3445
3446 inline void napi_gro_flush(struct napi_struct *napi)
3447 {
3448         struct sk_buff *skb, *next;
3449
3450         for (skb = napi->gro_list; skb; skb = next) {
3451                 next = skb->next;
3452                 skb->next = NULL;
3453                 napi_gro_complete(skb);
3454         }
3455
3456         napi->gro_count = 0;
3457         napi->gro_list = NULL;
3458 }
3459 EXPORT_SYMBOL(napi_gro_flush);
3460
/*
 * dev_gro_receive - try to merge or hold a received packet for GRO
 * @napi: NAPI context owning the list of held packets (napi->gro_list)
 * @skb: newly received packet
 *
 * Returns GRO_NORMAL when the packet must take the regular receive path,
 * GRO_HELD when it was put on napi->gro_list, and GRO_MERGED or
 * GRO_MERGED_FREE (chosen from NAPI_GRO_CB(skb)->free) when the protocol
 * handler reported that it belongs to an already held flow.
 */
3461 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3462 {
3463         struct sk_buff **pp = NULL;
3464         struct packet_type *ptype;
3465         __be16 type = skb->protocol;
3466         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3467         int same_flow;
3468         int mac_len;
3469         enum gro_result ret;
3470
             /* GRO disabled on the device, or netpoll active: bypass GRO. */
3471         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3472                 goto normal;
3473
             /* Packets that are already GSO or carry a frag list are not merged. */
3474         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3475                 goto normal;
3476
             /*
              * Find the first handler for this protocol that is not bound to a
              * device and implements ->gro_receive(), reset the per-packet GRO
              * state and let it inspect the packet.
              */
3477         rcu_read_lock();
3478         list_for_each_entry_rcu(ptype, head, list) {
3479                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3480                         continue;
3481
3482                 skb_set_network_header(skb, skb_gro_offset(skb));
3483                 mac_len = skb->network_header - skb->mac_header;
3484                 skb->mac_len = mac_len;
3485                 NAPI_GRO_CB(skb)->same_flow = 0;
3486                 NAPI_GRO_CB(skb)->flush = 0;
3487                 NAPI_GRO_CB(skb)->free = 0;
3488
3489                 pp = ptype->gro_receive(&napi->gro_list, skb);
3490                 break;
3491         }
3492         rcu_read_unlock();
3493
             /* The walk reached the list head again: no handler was found. */
3494         if (&ptype->list == head)
3495                 goto normal;
3496
3497         same_flow = NAPI_GRO_CB(skb)->same_flow;
3498         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3499
             /*
              * A non-NULL pp points at the list link of a held packet that has
              * to be completed now: unlink it and pass it up the stack.
              */
3500         if (pp) {
3501                 struct sk_buff *nskb = *pp;
3502
3503                 *pp = nskb->next;
3504                 nskb->next = NULL;
3505                 napi_gro_complete(nskb);
3506                 napi->gro_count--;
3507         }
3508
             /* Packet matched a held flow: report the merge result as is. */
3509         if (same_flow)
3510                 goto ok;
3511
             /* Handler asked for a flush, or too many packets are held already. */
3512         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3513                 goto normal;
3514
             /* Start a new held flow with this packet at the head of the list. */
3515         napi->gro_count++;
3516         NAPI_GRO_CB(skb)->count = 1;
3517         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3518         skb->next = napi->gro_list;
3519         napi->gro_list = skb;
3520         ret = GRO_HELD;
3521
3522 pull:
             /*
              * If the GRO offset lies beyond the linear area, copy the missing
              * bytes from frag0 to the tail of the linear area and shrink the
              * first page fragment by the same amount.
              */
3523         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3524                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3525
3526                 BUG_ON(skb->end - skb->tail < grow);
3527
3528                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3529
3530                 skb->tail += grow;
3531                 skb->data_len -= grow;
3532
3533                 skb_shinfo(skb)->frags[0].page_offset += grow;
3534                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3535
                     /* First fragment fully consumed: release it and close the gap. */
3536                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3537                         skb_frag_unref(skb, 0);
3538                         memmove(skb_shinfo(skb)->frags,
3539                                 skb_shinfo(skb)->frags + 1,
3540                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3541                 }
3542         }
3543
3544 ok:
3545         return ret;
3546
     /* Regular receive path; still goes through the header pull above. */
3547 normal:
3548         ret = GRO_NORMAL;
3549         goto pull;
3550 }
3551 EXPORT_SYMBOL(dev_gro_receive);
3552
3553 static inline gro_result_t
3554 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3555 {
3556         struct sk_buff *p;
3557
3558         for (p = napi->gro_list; p; p = p->next) {
3559                 unsigned long diffs;
3560
3561                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3562                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3563                 diffs |= compare_ether_header(skb_mac_header(p),
3564                                               skb_gro_mac_header(skb));
3565                 NAPI_GRO_CB(p)->same_flow = !diffs;
3566                 NAPI_GRO_CB(p)->flush = 0;
3567         }
3568
3569         return dev_gro_receive(napi, skb);
3570 }
3571
3572 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3573 {
3574         switch (ret) {
3575         case GRO_NORMAL:
3576                 if (netif_receive_skb(skb))
3577                         ret = GRO_DROP;
3578                 break;
3579
3580         case GRO_DROP:
3581         case GRO_MERGED_FREE:
3582                 kfree_skb(skb);
3583                 break;
3584
3585         case GRO_HELD:
3586         case GRO_MERGED:
3587                 break;
3588         }
3589
3590         return ret;
3591 }
3592 EXPORT_SYMBOL(napi_skb_finish);
3593
3594 void skb_gro_reset_offset(struct sk_buff *skb)
3595 {
3596         NAPI_GRO_CB(skb)->data_offset = 0;
3597         NAPI_GRO_CB(skb)->frag0 = NULL;
3598         NAPI_GRO_CB(skb)->frag0_len = 0;
3599
3600         if (skb->mac_header == skb->tail &&
3601             !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3602                 NAPI_GRO_CB(skb)->frag0 =
3603                         skb_frag_address(&skb_shinfo(skb)->frags[0]);
3604                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3605         }
3606 }
3607 EXPORT_SYMBOL(skb_gro_reset_offset);
3608
3609 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3610 {
3611         skb_gro_reset_offset(skb);
3612
3613         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3614 }
3615 EXPORT_SYMBOL(napi_gro_receive);
3616
3617 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3618 {
3619         __skb_pull(skb, skb_headlen(skb));
3620         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3621         skb->vlan_tci = 0;
3622         skb->dev = napi->dev;
3623         skb->skb_iif = 0;
3624
3625         napi->skb = skb;
3626 }
3627
3628 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3629 {
3630         struct sk_buff *skb = napi->skb;
3631
3632         if (!skb) {
3633                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3634                 if (skb)
3635                         napi->skb = skb;
3636         }
3637         return skb;
3638 }
3639 EXPORT_SYMBOL(napi_get_frags);
3640
3641 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3642                                gro_result_t ret)
3643 {
3644         switch (ret) {
3645         case GRO_NORMAL:
3646         case GRO_HELD:
3647                 skb->protocol = eth_type_trans(skb, skb->dev);
3648
3649                 if (ret == GRO_HELD)
3650                         skb_gro_pull(skb, -ETH_HLEN);
3651                 else if (netif_receive_skb(skb))
3652                         ret = GRO_DROP;
3653                 break;
3654
3655         case GRO_DROP:
3656         case GRO_MERGED_FREE:
3657                 napi_reuse_skb(napi, skb);
3658                 break;
3659
3660         case GRO_MERGED:
3661                 break;
3662         }
3663
3664         return ret;
3665 }
3666 EXPORT_SYMBOL(napi_frags_finish);
3667
3668 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3669 {
3670         struct sk_buff *skb = napi->skb;
3671         struct ethhdr *eth;
3672         unsigned int hlen;
3673         unsigned int off;
3674
3675         napi->skb = NULL;
3676
3677         skb_reset_mac_header(skb);
3678         skb_gro_reset_offset(skb);
3679
3680         off = skb_gro_offset(skb);
3681         hlen = off + sizeof(*eth);
3682         eth = skb_gro_header_fast(skb, off);
3683         if (skb_gro_header_hard(skb, hlen)) {
3684                 eth = skb_gro_header_slow(skb, hlen, off);
3685                 if (unlikely(!eth)) {
3686                         napi_reuse_skb(napi, skb);
3687                         skb = NULL;
3688                         goto out;
3689                 }
3690         }
3691
3692         skb_gro_pull(skb, sizeof(*eth));
3693
3694         /*
3695          * This works because the only protocols we care about don't require
3696          * special handling.  We'll fix it up properly at the end.
3697          */
3698         skb->protocol = eth->h_proto;
3699
3700 out:
3701         return skb;
3702 }
3703 EXPORT_SYMBOL(napi_frags_skb);
3704
3705 gro_result_t napi_gro_frags(struct napi_struct *napi)
3706 {
3707         struct sk_buff *skb = napi_frags_skb(napi);
3708
3709         if (!skb)
3710                 return GRO_DROP;
3711
3712         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3713 }
3714 EXPORT_SYMBOL(napi_gro_frags);
3715
/*
 * net_rps_action_and_irq_enable - send any pending RPS IPIs
 * @sd: softnet data of the current CPU
 *
 * Note: called with local irq disabled, but exits with local irq enabled.
 * With CONFIG_RPS the pending list is detached while irqs are still off;
 * the IPIs themselves are sent after irqs have been re-enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (pending)
		sd->rps_ipi_list = NULL;
#endif

	local_irq_enable();

#ifdef CONFIG_RPS
	/* Send pending IPI's to kick RPS processing on remote cpus. */
	while (pending) {
		struct softnet_data *following = pending->rps_ipi_next;

		if (cpu_online(pending->cpu))
			__smp_call_function_single(pending->cpu,
						   &pending->csd, 0);
		pending = following;
	}
#endif
}
3743
/*
 * process_backlog - NAPI poll routine of the per-CPU backlog
 * @napi: the backlog napi_struct embedded in struct softnet_data
 * @quota: maximum number of packets to handle in this call
 *
 * Packets are taken from sd->process_queue and passed to
 * __netif_receive_skb() with irqs enabled.  When that queue is empty,
 * sd->input_pkt_queue is spliced onto it under rps_lock.  Returns the
 * number of packets handled.
 */
3744 static int process_backlog(struct napi_struct *napi, int quota)
3745 {
3746         int work = 0;
3747         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3748
3749 #ifdef CONFIG_RPS
3750         /* Check if we have pending ipi, its better to send them now,
3751          * not waiting net_rx_action() end.
3752          */
3753         if (sd->rps_ipi_list) {
3754                 local_irq_disable();
3755                 net_rps_action_and_irq_enable(sd);
3756         }
3757 #endif
             /* Refresh the weight from weight_p on every poll. */
3758         napi->weight = weight_p;
3759         local_irq_disable();
3760         while (work < quota) {
3761                 struct sk_buff *skb;
3762                 unsigned int qlen;
3763
                     /*
                      * Drain the private process queue.  Irqs are enabled only
                      * around the call into the stack and are off again for the
                      * dequeue and the head counter update.
                      */
3764                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3765                         local_irq_enable();
3766                         __netif_receive_skb(skb);
3767                         local_irq_disable();
3768                         input_queue_head_incr(sd);
                             /* Quota used up: leave with irqs enabled. */
3769                         if (++work >= quota) {
3770                                 local_irq_enable();
3771                                 return work;
3772                         }
3773                 }
3774
                     /* Move everything queued in the meantime onto the process queue. */
3775                 rps_lock(sd);
3776                 qlen = skb_queue_len(&sd->input_pkt_queue);
3777                 if (qlen)
3778                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
3779                                                    &sd->process_queue);
3780
                     /*
                      * Fewer packets pending than quota left: complete the NAPI
                      * instance now and shrink quota so the outer loop stops once
                      * the spliced packets have been handled.
                      */
3781                 if (qlen < quota - work) {
3782                         /*
3783                          * Inline a custom version of __napi_complete().
3784                          * only current cpu owns and manipulates this napi,
3785                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3786                          * we can use a plain write instead of clear_bit(),
3787                          * and we dont need an smp_mb() memory barrier.
3788                          */
3789                         list_del(&napi->poll_list);
3790                         napi->state = 0;
3791
3792                         quota = work + qlen;
3793                 }
3794                 rps_unlock(sd);
3795         }
3796         local_irq_enable();
3797
3798         return work;
3799 }
3800
3801 /**
3802  * __napi_schedule - schedule for receive
3803  * @n: entry to schedule
3804  *
3805  * The entry's receive function will be scheduled to run
3806  */
3807 void __napi_schedule(struct napi_struct *n)
3808 {
3809         unsigned long flags;
3810
3811         local_irq_save(flags);
3812         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3813         local_irq_restore(flags);
3814 }
3815 EXPORT_SYMBOL(__napi_schedule);
3816
3817 void __napi_complete(struct napi_struct *n)
3818 {
3819         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3820         BUG_ON(n->gro_list);
3821
3822         list_del(&n->poll_list);
3823         smp_mb__before_clear_bit();
3824         clear_bit(NAPI_STATE_SCHED, &n->state);
3825 }
3826 EXPORT_SYMBOL(__napi_complete);
3827
3828 void napi_complete(struct napi_struct *n)
3829 {
3830         unsigned long flags;
3831
3832         /*
3833          * don't let napi dequeue from the cpu poll list
3834          * just in case its running on a different cpu
3835          */
3836         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3837                 return;
3838
3839         napi_gro_flush(n);
3840         local_irq_save(flags);
3841         __napi_complete(n);
3842         local_irq_restore(flags);
3843 }
3844 EXPORT_SYMBOL(napi_complete);
3845
3846 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3847                     int (*poll)(struct napi_struct *, int), int weight)
3848 {
3849         INIT_LIST_HEAD(&napi->poll_list);
3850         napi->gro_count = 0;
3851         napi->gro_list = NULL;
3852         napi->skb = NULL;
3853         napi->poll = poll;
3854         napi->weight = weight;
3855         list_add(&napi->dev_list, &dev->napi_list);
3856         napi->dev = dev;
3857 #ifdef CONFIG_NETPOLL
3858         spin_lock_init(&napi->poll_lock);
3859         napi->poll_owner = -1;
3860 #endif
3861         set_bit(NAPI_STATE_SCHED, &napi->state);
3862 }
3863 EXPORT_SYMBOL(netif_napi_add);
3864
3865 void netif_napi_del(struct napi_struct *napi)
3866 {
3867         struct sk_buff *skb, *next;
3868
3869         list_del_init(&napi->dev_list);
3870         napi_free_frags(napi);
3871
3872         for (skb = napi->gro_list; skb; skb = next) {
3873                 next = skb->next;
3874                 skb->next = NULL;
3875                 kfree_skb(skb);
3876         }
3877
3878         napi->gro_list = NULL;
3879         napi->gro_count = 0;
3880 }
3881 EXPORT_SYMBOL(netif_napi_del);
3882
3883 static void net_rx_action(struct softirq_action *h)
3884 {
3885         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3886         unsigned long time_limit = jiffies + 2;
3887         int budget = netdev_budget;
3888         void *have;
3889
3890         local_irq_disable();
3891
3892         while (!list_empty(&sd->poll_list)) {
3893                 struct napi_struct *n;
3894                 int work, weight;
3895
3896                 /* If softirq window is exhuasted then punt.
3897                  * Allow this to run for 2 jiffies since which will allow
3898                  * an average latency of 1.5/HZ.
3899                  */
3900                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3901                         goto softnet_break;
3902
3903                 local_irq_enable();
3904
3905                 /* Even though interrupts have been re-enabled, this
3906                  * access is safe because interrupts can only add new
3907                  * entries to the tail of this list, and only ->poll()
3908                  * calls can remove this head entry from the list.
3909                  */
3910                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3911
3912                 have = netpoll_poll_lock(n);
3913
3914                 weight = n->weight;
3915
3916                 /* This NAPI_STATE_SCHED test is for avoiding a race
3917                  * with netpoll's poll_napi().  Only the entity which
3918                  * obtains the lock and sees NAPI_STATE_SCHED set will
3919                  * actually make the ->poll() call.  Therefore we avoid
3920                  * accidentally calling ->poll() when NAPI is not scheduled.
3921                  */
3922                 work = 0;
3923                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3924                         work = n->poll(n, weight);
3925                         trace_napi_poll(n);
3926                 }
3927
3928                 WARN_ON_ONCE(work > weight);
3929
3930                 budget -= work;
3931
3932                 local_irq_disable();
3933
3934                 /* Drivers must not modify the NAPI state if they
3935                  * consume the entire weight.  In such cases this code
3936                  * still "owns" the NAPI instance and therefore can
3937                  * move the instance around on the list at-will.
3938                  */
3939                 if (unlikely(work == weight)) {
3940                         if (unlikely(napi_disable_pending(n))) {
3941                                 local_irq_enable();
3942                                 napi_complete(n);
3943                                 local_irq_disable();
3944                         } else
3945                                 list_move_tail(&n->poll_list, &sd->poll_list);
3946                 }
3947
3948                 netpoll_poll_unlock(have);
3949         }
3950 out:
3951         net_rps_action_and_irq_enable(sd);
3952
3953 #ifdef CONFIG_NET_DMA
3954         /*
3955          * There may not be any more sk_buffs coming right now, so push
3956          * any pending DMA copies to hardware
3957          */
3958         dma_issue_pending_all();
3959 #endif
3960
3961         return;
3962
3963 softnet_break:
3964         sd->time_squeeze++;
3965         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3966         goto out;
3967 }
3968
3969 static gifconf_func_t *gifconf_list[NPROTO];
3970
3971 /**
3972  *      register_gifconf        -       register a SIOCGIF handler
3973  *      @family: Address family
3974  *      @gifconf: Function handler
3975  *
3976  *      Register protocol dependent address dumping routines. The handler
3977  *      that is passed must not be freed or reused until it has been replaced
3978  *      by another handler.
3979  */
3980 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3981 {
3982         if (family >= NPROTO)
3983                 return -EINVAL;
3984         gifconf_list[family] = gifconf;
3985         return 0;
3986 }
3987 EXPORT_SYMBOL(register_gifconf);
3988
3989
3990 /*
3991  *      Map an interface index to its name (SIOCGIFNAME)
3992  */
3993
3994 /*
3995  *      We need this ioctl for efficient implementation of the
3996  *      if_indextoname() function required by the IPv6 API.  Without
3997  *      it, we would have to search all the interfaces to find a
3998  *      match.  --pb
3999  */
4000
4001 static int dev_ifname(struct net *net, struct ifreq __user *arg)
4002 {
4003         struct net_device *dev;
4004         struct ifreq ifr;
4005
4006         /*
4007          *      Fetch the caller's info block.
4008          */
4009
4010         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4011                 return -EFAULT;
4012
4013         rcu_read_lock();
4014         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4015         if (!dev) {
4016                 rcu_read_unlock();
4017                 return -ENODEV;
4018         }
4019
4020         strcpy(ifr.ifr_name, dev->name);
4021         rcu_read_unlock();
4022
4023         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4024                 return -EFAULT;
4025         return 0;
4026 }
4027
4028 /*
4029  *      Perform a SIOCGIFCONF call. This structure will change
4030  *      size eventually, and there is nothing I can do about it.
4031  *      Thus we will need a 'compatibility mode'.
4032  */
4033
4034 static int dev_ifconf(struct net *net, char __user *arg)
4035 {
4036         struct ifconf ifc;
4037         struct net_device *dev;
4038         char __user *pos;
4039         int len;
4040         int total;
4041         int i;
4042
4043         /*
4044          *      Fetch the caller's info block.
4045          */
4046
4047         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4048                 return -EFAULT;
4049
4050         pos = ifc.ifc_buf;
4051         len = ifc.ifc_len;
4052
4053         /*
4054          *      Loop over the interfaces, and write an info block for each.
4055          */
4056
4057         total = 0;
4058         for_each_netdev(net, dev) {
4059                 for (i = 0; i < NPROTO; i++) {
4060                         if (gifconf_list[i]) {
4061                                 int done;
4062                                 if (!pos)
4063                                         done = gifconf_list[i](dev, NULL, 0);
4064                                 else
4065                                         done = gifconf_list[i](dev, pos + total,
4066                                                                len - total);
4067                                 if (done < 0)
4068                                         return -EFAULT;
4069                                 total += done;
4070                         }
4071                 }
4072         }
4073
4074         /*
4075          *      All done.  Write the updated control block back to the caller.
4076          */
4077         ifc.ifc_len = total;
4078
4079         /*
4080          *      Both BSD and Solaris return 0 here, so we do too.
4081          */
4082         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4083 }
4084
4085 #ifdef CONFIG_PROC_FS
4086
/* Shift that separates the bucket number from the in-bucket offset in a position. */
4087 #define BUCKET_SPACE (32 - NETDEV_HASHBITS)
4088
/* Iterator state kept in seq->private while walking the device name hash. */
4089 struct dev_iter_state {
4090         struct seq_net_private p;
4091         unsigned int pos; /* (bucket << BUCKET_SPACE) | offset */
4092 };
4093
/* Pack and unpack the (bucket, offset) pair stored in dev_iter_state.pos. */
4094 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4095 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4096 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4097
4098 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq)
4099 {
4100         struct dev_iter_state *state = seq->private;
4101         struct net *net = seq_file_net(seq);
4102         struct net_device *dev;
4103         struct hlist_node *p;
4104         struct hlist_head *h;
4105         unsigned int count, bucket, offset;
4106
4107         bucket = get_bucket(state->pos);
4108         offset = get_offset(state->pos);
4109         h = &net->dev_name_head[bucket];
4110         count = 0;
4111         hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4112                 if (count++ == offset) {
4113                         state->pos = set_bucket_offset(bucket, count);
4114                         return dev;
4115                 }
4116         }
4117
4118         return NULL;
4119 }
4120
4121 static inline struct net_device *dev_from_new_bucket(struct seq_file *seq)
4122 {
4123         struct dev_iter_state *state = seq->private;
4124         struct net_device *dev;
4125         unsigned int bucket;
4126
4127         bucket = get_bucket(state->pos);
4128         do {
4129                 dev = dev_from_same_bucket(seq);
4130                 if (dev)
4131                         return dev;
4132
4133                 bucket++;
4134                 state->pos = set_bucket_offset(bucket, 0);
4135         } while (bucket < NETDEV_HASHENTRIES);
4136
4137         return NULL;
4138 }
4139
4140 /*
4141  *      This is invoked by the /proc filesystem handler to display a device
4142  *      in detail.
4143  */
4144 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4145         __acquires(RCU)
4146 {
4147         struct dev_iter_state *state = seq->private;
4148
4149         rcu_read_lock();
4150         if (!*pos)
4151                 return SEQ_START_TOKEN;
4152
4153         /* check for end of the hash */
4154         if (state->pos == 0 && *pos > 1)
4155                 return NULL;
4156
4157         return dev_from_new_bucket(seq);
4158 }
4159
4160 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4161 {
4162         struct net_device *dev;
4163
4164         ++*pos;
4165
4166         if (v == SEQ_START_TOKEN)
4167                 return dev_from_new_bucket(seq);
4168
4169         dev = dev_from_same_bucket(seq);
4170         if (dev)
4171                 return dev;
4172
4173         return dev_from_new_bucket(seq);
4174 }
4175
4176 void dev_seq_stop(struct seq_file *seq, void *v)
4177         __releases(RCU)
4178 {
4179         rcu_read_unlock();
4180 }
4181
4182 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4183 {
4184         struct rtnl_link_stats64 temp;
4185         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4186
4187         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4188                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4189                    dev->name, stats->rx_bytes, stats->rx_packets,
4190                    stats->rx_errors,
4191                    stats->rx_dropped + stats->rx_missed_errors,
4192                    stats->rx_fifo_errors,
4193                    stats->rx_length_errors + stats->rx_over_errors +
4194                     stats->rx_crc_errors + stats->rx_frame_errors,
4195                    stats->rx_compressed, stats->multicast,
4196                    stats->tx_bytes, stats->tx_packets,
4197                    stats->tx_errors, stats->tx_dropped,
4198                    stats->tx_fifo_errors, stats->collisions,
4199                    stats->tx_carrier_errors +
4200                     stats->tx_aborted_errors +
4201                     stats->tx_window_errors +
4202                     stats->tx_heartbeat_errors,
4203                    stats->tx_compressed);
4204 }
4205
4206 /*
4207  *      Called from the PROCfs module. This now uses the new arbitrary sized
4208  *      /proc/net interface to create /proc/net/dev
4209  */
4210 static int dev_seq_show(struct seq_file *seq, void *v)
4211 {
4212         if (v == SEQ_START_TOKEN)
4213                 seq_puts(seq, "Inter-|   Receive                            "
4214                               "                    |  Transmit\n"
4215                               " face |bytes    packets errs drop fifo frame "
4216                               "compressed multicast|bytes    packets errs "
4217                               "drop fifo colls carrier compressed\n");
4218         else
4219                 dev_seq_printf_stats(seq, v);
4220         return 0;
4221 }
4222
4223 static struct softnet_data *softnet_get_online(loff_t *pos)
4224 {
4225         struct softnet_data *sd = NULL;
4226
4227         while (*pos < nr_cpu_ids)
4228                 if (cpu_online(*pos)) {
4229                         sd = &per_cpu(softnet_data, *pos);
4230                         break;
4231                 } else
4232                         ++*pos;
4233         return sd;
4234 }
4235
4236 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4237 {
4238         return softnet_get_online(pos);
4239 }
4240
4241 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4242 {
4243         ++*pos;
4244         return softnet_get_online(pos);
4245 }
4246
/* seq_file stop: the softnet iterator has nothing to release. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4250
4251 static int softnet_seq_show(struct seq_file *seq, void *v)
4252 {
4253         struct softnet_data *sd = v;
4254
4255         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4256                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4257                    0, 0, 0, 0, /* was fastroute */
4258                    sd->cpu_collision, sd->received_rps);
4259         return 0;
4260 }
4261
4262 static const struct seq_operations dev_seq_ops = {
4263         .start = dev_seq_start,
4264         .next  = dev_seq_next,
4265         .stop  = dev_seq_stop,
4266         .show  = dev_seq_show,
4267 };
4268
4269 static int dev_seq_open(struct inode *inode, struct file *file)
4270 {
4271         return seq_open_net(inode, file, &dev_seq_ops,
4272                             sizeof(struct dev_iter_state));
4273 }
4274
4275 static const struct file_operations dev_seq_fops = {
4276         .owner   = THIS_MODULE,
4277         .open    = dev_seq_open,
4278         .read    = seq_read,
4279         .llseek  = seq_lseek,
4280         .release = seq_release_net,
4281 };
4282
4283 static const struct seq_operations softnet_seq_ops = {
4284         .start = softnet_seq_start,
4285         .next  = softnet_seq_next,
4286         .stop  = softnet_seq_stop,
4287         .show  = softnet_seq_show,
4288 };
4289
4290 static int softnet_seq_open(struct inode *inode, struct file *file)
4291 {
4292         return seq_open(file, &softnet_seq_ops);
4293 }
4294
4295 static const struct file_operations softnet_seq_fops = {
4296         .owner   = THIS_MODULE,
4297         .open    = softnet_seq_open,
4298         .read    = seq_read,
4299         .llseek  = seq_lseek,
4300         .release = seq_release,
4301 };
4302
4303 static void *ptype_get_idx(loff_t pos)
4304 {
4305         struct packet_type *pt = NULL;
4306         loff_t i = 0;
4307         int t;
4308
4309         list_for_each_entry_rcu(pt, &ptype_all, list) {
4310                 if (i == pos)
4311                         return pt;
4312                 ++i;
4313         }
4314
4315         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4316                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4317                         if (i == pos)
4318                                 return pt;
4319                         ++i;
4320                 }
4321         }
4322         return NULL;
4323 }
4324
4325 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4326         __acquires(RCU)
4327 {
4328         rcu_read_lock();
4329         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4330 }
4331
4332 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4333 {
4334         struct packet_type *pt;
4335         struct list_head *nxt;
4336         int hash;
4337
4338         ++*pos;
4339         if (v == SEQ_START_TOKEN)
4340                 return ptype_get_idx(0);
4341
4342         pt = v;
4343         nxt = pt->list.next;
4344         if (pt->type == htons(ETH_P_ALL)) {
4345                 if (nxt != &ptype_all)
4346                         goto found;
4347                 hash = 0;
4348                 nxt = ptype_base[0].next;
4349         } else
4350                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4351
4352         while (nxt == &ptype_base[hash]) {
4353                 if (++hash >= PTYPE_HASH_SIZE)
4354                         return NULL;
4355                 nxt = ptype_base[hash].next;
4356         }
4357 found:
4358         return list_entry(nxt, struct packet_type, list);
4359 }
4360
4361 static void ptype_seq_stop(struct seq_file *seq, void *v)
4362         __releases(RCU)
4363 {
4364         rcu_read_unlock();
4365 }
4366
4367 static int ptype_seq_show(struct seq_file *seq, void *v)
4368 {
4369         struct packet_type *pt = v;
4370
4371         if (v == SEQ_START_TOKEN)
4372                 seq_puts(seq, "Type Device      Function\n");
4373         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4374                 if (pt->type == htons(ETH_P_ALL))
4375                         seq_puts(seq, "ALL ");
4376                 else
4377                         seq_printf(seq, "%04x", ntohs(pt->type));
4378
4379                 seq_printf(seq, " %-8s %pF\n",
4380                            pt->dev ? pt->dev->name : "", pt->func);
4381         }
4382
4383         return 0;
4384 }
4385
4386 static const struct seq_operations ptype_seq_ops = {
4387         .start = ptype_seq_start,
4388         .next  = ptype_seq_next,
4389         .stop  = ptype_seq_stop,
4390         .show  = ptype_seq_show,
4391 };
4392
4393 static int ptype_seq_open(struct inode *inode, struct file *file)
4394 {
4395         return seq_open_net(inode, file, &ptype_seq_ops,
4396                         sizeof(struct seq_net_private));
4397 }
4398
4399 static const struct file_operations ptype_seq_fops = {
4400         .owner   = THIS_MODULE,
4401         .open    = ptype_seq_open,
4402         .read    = seq_read,
4403         .llseek  = seq_lseek,
4404         .release = seq_release_net,
4405 };
4406
4407
4408 static int __net_init dev_proc_net_init(struct net *net)
4409 {
4410         int rc = -ENOMEM;
4411
4412         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4413                 goto out;
4414         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4415                 goto out_dev;
4416         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4417                 goto out_softnet;
4418
4419         if (wext_proc_init(net))
4420                 goto out_ptype;
4421         rc = 0;
4422 out:
4423         return rc;
4424 out_ptype:
4425         proc_net_remove(net, "ptype");
4426 out_softnet:
4427         proc_net_remove(net, "softnet_stat");
4428 out_dev:
4429         proc_net_remove(net, "dev");
4430         goto out;
4431 }
4432
4433 static void __net_exit dev_proc_net_exit(struct net *net)
4434 {
4435         wext_proc_exit(net);
4436
4437         proc_net_remove(net, "ptype");
4438         proc_net_remove(net, "softnet_stat");
4439         proc_net_remove(net, "dev");
4440 }
4441
4442 static struct pernet_operations __net_initdata dev_proc_ops = {
4443         .init = dev_proc_net_init,
4444         .exit = dev_proc_net_exit,
4445 };
4446
4447 static int __init dev_proc_init(void)
4448 {
4449         return register_pernet_subsys(&dev_proc_ops);
4450 }
4451 #else
/* Fallback when CONFIG_PROC_FS is off: nothing to register, report success. */
#define dev_proc_init() (0)
4453 #endif  /* CONFIG_PROC_FS */
4454
4455
4456 /**
4457  *      netdev_set_master       -       set up master pointer
4458  *      @slave: slave device
4459  *      @master: new master device
4460  *
4461  *      Changes the master device of the slave. Pass %NULL to break the
4462  *      bonding. The caller must hold the RTNL semaphore. On a failure
4463  *      a negative errno code is returned. On success the reference counts
4464  *      are adjusted and the function returns zero.
4465  */
4466 int netdev_set_master(struct net_device *slave, struct net_device *master)
4467 {
4468         struct net_device *old = slave->master;
4469
4470         ASSERT_RTNL();
4471
4472         if (master) {
4473                 if (old)
4474                         return -EBUSY;
4475                 dev_hold(master);
4476         }
4477
4478         slave->master = master;
4479
4480         if (old)
4481                 dev_put(old);
4482         return 0;
4483 }
4484 EXPORT_SYMBOL(netdev_set_master);
4485
4486 /**
4487  *      netdev_set_bond_master  -       set up bonding master/slave pair
4488  *      @slave: slave device
4489  *      @master: new master device
4490  *
4491  *      Changes the master device of the slave. Pass %NULL to break the
4492  *      bonding. The caller must hold the RTNL semaphore. On a failure
4493  *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4494  *      to the routing socket and the function returns zero.
4495  */
4496 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4497 {
4498         int err;
4499
4500         ASSERT_RTNL();
4501
4502         err = netdev_set_master(slave, master);
4503         if (err)
4504                 return err;
4505         if (master)
4506                 slave->flags |= IFF_SLAVE;
4507         else
4508                 slave->flags &= ~IFF_SLAVE;
4509
4510         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4511         return 0;
4512 }
4513 EXPORT_SYMBOL(netdev_set_bond_master);
4514
4515 static void dev_change_rx_flags(struct net_device *dev, int flags)
4516 {
4517         const struct net_device_ops *ops = dev->netdev_ops;
4518
4519         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4520                 ops->ndo_change_rx_flags(dev, flags);
4521 }
4522
4523 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4524 {
4525         unsigned short old_flags = dev->flags;
4526         uid_t uid;
4527         gid_t gid;
4528
4529         ASSERT_RTNL();
4530
4531         dev->flags |= IFF_PROMISC;
4532         dev->promiscuity += inc;
4533         if (dev->promiscuity == 0) {
4534                 /*
4535                  * Avoid overflow.
4536                  * If inc causes overflow, untouch promisc and return error.
4537                  */
4538                 if (inc < 0)
4539                         dev->flags &= ~IFF_PROMISC;
4540                 else {
4541                         dev->promiscuity -= inc;
4542                         printk(KERN_WARNING "%s: promiscuity touches roof, "
4543                                 "set promiscuity failed, promiscuity feature "
4544                                 "of device might be broken.\n", dev->name);
4545                         return -EOVERFLOW;
4546                 }
4547         }
4548         if (dev->flags != old_flags) {
4549                 printk(KERN_INFO "device %s %s promiscuous mode\n",
4550                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4551                                                                "left");
4552                 if (audit_enabled) {
4553                         current_uid_gid(&uid, &gid);
4554                         audit_log(current->audit_context, GFP_ATOMIC,
4555                                 AUDIT_ANOM_PROMISCUOUS,
4556                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4557                                 dev->name, (dev->flags & IFF_PROMISC),
4558                                 (old_flags & IFF_PROMISC),
4559                                 audit_get_loginuid(current),
4560                                 uid, gid,
4561                                 audit_get_sessionid(current));
4562                 }
4563
4564                 dev_change_rx_flags(dev, IFF_PROMISC);
4565         }
4566         return 0;
4567 }
4568
4569 /**
4570  *      dev_set_promiscuity     - update promiscuity count on a device
4571  *      @dev: device
4572  *      @inc: modifier
4573  *
4574  *      Add or remove promiscuity from a device. While the count in the device
4575  *      remains above zero the interface remains promiscuous. Once it hits zero
4576  *      the device reverts back to normal filtering operation. A negative inc
4577  *      value is used to drop promiscuity on the device.
4578  *      Return 0 if successful or a negative errno code on error.
4579  */
4580 int dev_set_promiscuity(struct net_device *dev, int inc)
4581 {
4582         unsigned short old_flags = dev->flags;
4583         int err;
4584
4585         err = __dev_set_promiscuity(dev, inc);
4586         if (err < 0)
4587                 return err;
4588         if (dev->flags != old_flags)
4589                 dev_set_rx_mode(dev);
4590         return err;
4591 }
4592 EXPORT_SYMBOL(dev_set_promiscuity);
4593
4594 /**
4595  *      dev_set_allmulti        - update allmulti count on a device
4596  *      @dev: device
4597  *      @inc: modifier
4598  *
4599  *      Add or remove reception of all multicast frames to a device. While the
4600  *      count in the device remains above zero the interface remains listening
4601  *      to all interfaces. Once it hits zero the device reverts back to normal
4602  *      filtering operation. A negative @inc value is used to drop the counter
4603  *      when releasing a resource needing all multicasts.
4604  *      Return 0 if successful or a negative errno code on error.
4605  */
4606
4607 int dev_set_allmulti(struct net_device *dev, int inc)
4608 {
4609         unsigned short old_flags = dev->flags;
4610
4611         ASSERT_RTNL();
4612
4613         dev->flags |= IFF_ALLMULTI;
4614         dev->allmulti += inc;
4615         if (dev->allmulti == 0) {
4616                 /*
4617                  * Avoid overflow.
4618                  * If inc causes overflow, untouch allmulti and return error.
4619                  */
4620                 if (inc < 0)
4621                         dev->flags &= ~IFF_ALLMULTI;
4622                 else {
4623                         dev->allmulti -= inc;
4624                         printk(KERN_WARNING "%s: allmulti touches roof, "
4625                                 "set allmulti failed, allmulti feature of "
4626                                 "device might be broken.\n", dev->name);
4627                         return -EOVERFLOW;
4628                 }
4629         }
4630         if (dev->flags ^ old_flags) {
4631                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4632                 dev_set_rx_mode(dev);
4633         }
4634         return 0;
4635 }
4636 EXPORT_SYMBOL(dev_set_allmulti);
4637
4638 /*
4639  *      Upload unicast and multicast address lists to device and
4640  *      configure RX filtering. When the device doesn't support unicast
4641  *      filtering it is put in promiscuous mode while unicast addresses
4642  *      are present.
4643  */
4644 void __dev_set_rx_mode(struct net_device *dev)
4645 {
4646         const struct net_device_ops *ops = dev->netdev_ops;
4647
4648         /* dev_open will call this function so the list will stay sane. */
4649         if (!(dev->flags&IFF_UP))
4650                 return;
4651
4652         if (!netif_device_present(dev))
4653                 return;
4654
4655         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4656                 /* Unicast addresses changes may only happen under the rtnl,
4657                  * therefore calling __dev_set_promiscuity here is safe.
4658                  */
4659                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4660                         __dev_set_promiscuity(dev, 1);
4661                         dev->uc_promisc = true;
4662                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4663                         __dev_set_promiscuity(dev, -1);
4664                         dev->uc_promisc = false;
4665                 }
4666         }
4667
4668         if (ops->ndo_set_rx_mode)
4669                 ops->ndo_set_rx_mode(dev);
4670 }
4671
/*
 * Locked wrapper around __dev_set_rx_mode(): the call is made between
 * netif_addr_lock_bh() and netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
4678
4679 /**
4680  *      dev_get_flags - get flags reported to userspace
4681  *      @dev: device
4682  *
4683  *      Get the combination of flag bits exported through APIs to userspace.
4684  */
4685 unsigned dev_get_flags(const struct net_device *dev)
4686 {
4687         unsigned flags;
4688
4689         flags = (dev->flags & ~(IFF_PROMISC |
4690                                 IFF_ALLMULTI |
4691                                 IFF_RUNNING |
4692                                 IFF_LOWER_UP |
4693                                 IFF_DORMANT)) |
4694                 (dev->gflags & (IFF_PROMISC |
4695                                 IFF_ALLMULTI));
4696
4697         if (netif_running(dev)) {
4698                 if (netif_oper_up(dev))
4699                         flags |= IFF_RUNNING;
4700                 if (netif_carrier_ok(dev))
4701                         flags |= IFF_LOWER_UP;
4702                 if (netif_dormant(dev))
4703                         flags |= IFF_DORMANT;
4704         }
4705
4706         return flags;
4707 }
4708 EXPORT_SYMBOL(dev_get_flags);
4709
4710 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4711 {
4712         int old_flags = dev->flags;
4713         int ret;
4714
4715         ASSERT_RTNL();
4716
4717         /*
4718          *      Set the flags on our device.
4719          */
4720
4721         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4722                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4723                                IFF_AUTOMEDIA)) |
4724                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4725                                     IFF_ALLMULTI));
4726
4727         /*
4728          *      Load in the correct multicast list now the flags have changed.
4729          */
4730
4731         if ((old_flags ^ flags) & IFF_MULTICAST)
4732                 dev_change_rx_flags(dev, IFF_MULTICAST);
4733
4734         dev_set_rx_mode(dev);
4735
4736         /*
4737          *      Have we downed the interface. We handle IFF_UP ourselves
4738          *      according to user attempts to set it, rather than blindly
4739          *      setting it.
4740          */
4741
4742         ret = 0;
4743         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4744                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4745
4746                 if (!ret)
4747                         dev_set_rx_mode(dev);
4748         }
4749
4750         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4751                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4752
4753                 dev->gflags ^= IFF_PROMISC;
4754                 dev_set_promiscuity(dev, inc);
4755         }
4756
4757         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4758            is important. Some (broken) drivers set IFF_PROMISC, when
4759            IFF_ALLMULTI is requested not asking us and not reporting.
4760          */
4761         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4762                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4763
4764                 dev->gflags ^= IFF_ALLMULTI;
4765                 dev_set_allmulti(dev, inc);
4766         }
4767
4768         return ret;
4769 }
4770
4771 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4772 {
4773         unsigned int changes = dev->flags ^ old_flags;
4774
4775         if (changes & IFF_UP) {
4776                 if (dev->flags & IFF_UP)
4777                         call_netdevice_notifiers(NETDEV_UP, dev);
4778                 else
4779                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4780         }
4781
4782         if (dev->flags & IFF_UP &&
4783             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4784                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4785 }
4786
4787 /**
4788  *      dev_change_flags - change device settings
4789  *      @dev: device
4790  *      @flags: device state flags
4791  *
4792  *      Change settings on device based state flags. The flags are
4793  *      in the userspace exported format.
4794  */
4795 int dev_change_flags(struct net_device *dev, unsigned flags)
4796 {
4797         int ret, changes;
4798         int old_flags = dev->flags;
4799
4800         ret = __dev_change_flags(dev, flags);
4801         if (ret < 0)
4802                 return ret;
4803
4804         changes = old_flags ^ dev->flags;
4805         if (changes)
4806                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4807
4808         __dev_notify_flags(dev, old_flags);
4809         return ret;
4810 }
4811 EXPORT_SYMBOL(dev_change_flags);
4812
4813 /**
4814  *      dev_set_mtu - Change maximum transfer unit
4815  *      @dev: device
4816  *      @new_mtu: new transfer unit
4817  *
4818  *      Change the maximum transfer size of the network device.
4819  */
4820 int dev_set_mtu(struct net_device *dev, int new_mtu)
4821 {
4822         const struct net_device_ops *ops = dev->netdev_ops;
4823         int err;
4824
4825         if (new_mtu == dev->mtu)
4826                 return 0;
4827
4828         /*      MTU must be positive.    */
4829         if (new_mtu < 0)
4830                 return -EINVAL;
4831
4832         if (!netif_device_present(dev))
4833                 return -ENODEV;
4834
4835         err = 0;
4836         if (ops->ndo_change_mtu)
4837                 err = ops->ndo_change_mtu(dev, new_mtu);
4838         else
4839                 dev->mtu = new_mtu;
4840
4841         if (!err && dev->flags & IFF_UP)
4842                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4843         return err;
4844 }
4845 EXPORT_SYMBOL(dev_set_mtu);
4846
4847 /**
4848  *      dev_set_group - Change group this device belongs to
4849  *      @dev: device
4850  *      @new_group: group this device should belong to
4851  */
4852 void dev_set_group(struct net_device *dev, int new_group)
4853 {
4854         dev->group = new_group;
4855 }
4856 EXPORT_SYMBOL(dev_set_group);
4857
4858 /**
4859  *      dev_set_mac_address - Change Media Access Control Address
4860  *      @dev: device
4861  *      @sa: new address
4862  *
4863  *      Change the hardware (MAC) address of the device
4864  */
4865 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4866 {
4867         const struct net_device_ops *ops = dev->netdev_ops;
4868         int err;
4869
4870         if (!ops->ndo_set_mac_address)
4871                 return -EOPNOTSUPP;
4872         if (sa->sa_family != dev->type)
4873                 return -EINVAL;
4874         if (!netif_device_present(dev))
4875                 return -ENODEV;
4876         err = ops->ndo_set_mac_address(dev, sa);
4877         if (!err)
4878                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4879         return err;
4880 }
4881 EXPORT_SYMBOL(dev_set_mac_address);
4882
4883 /*
4884  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4885  */
4886 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4887 {
4888         int err;
4889         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4890
4891         if (!dev)
4892                 return -ENODEV;
4893
4894         switch (cmd) {
4895         case SIOCGIFFLAGS:      /* Get interface flags */
4896                 ifr->ifr_flags = (short) dev_get_flags(dev);
4897                 return 0;
4898
4899         case SIOCGIFMETRIC:     /* Get the metric on the interface
4900                                    (currently unused) */
4901                 ifr->ifr_metric = 0;
4902                 return 0;
4903
4904         case SIOCGIFMTU:        /* Get the MTU of a device */
4905                 ifr->ifr_mtu = dev->mtu;
4906                 return 0;
4907
4908         case SIOCGIFHWADDR:
4909                 if (!dev->addr_len)
4910                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4911                 else
4912                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4913                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4914                 ifr->ifr_hwaddr.sa_family = dev->type;
4915                 return 0;
4916
4917         case SIOCGIFSLAVE:
4918                 err = -EINVAL;
4919                 break;
4920
4921         case SIOCGIFMAP:
4922                 ifr->ifr_map.mem_start = dev->mem_start;
4923                 ifr->ifr_map.mem_end   = dev->mem_end;
4924                 ifr->ifr_map.base_addr = dev->base_addr;
4925                 ifr->ifr_map.irq       = dev->irq;
4926                 ifr->ifr_map.dma       = dev->dma;
4927                 ifr->ifr_map.port      = dev->if_port;
4928                 return 0;
4929
4930         case SIOCGIFINDEX:
4931                 ifr->ifr_ifindex = dev->ifindex;
4932                 return 0;
4933
4934         case SIOCGIFTXQLEN:
4935                 ifr->ifr_qlen = dev->tx_queue_len;
4936                 return 0;
4937
4938         default:
4939                 /* dev_ioctl() should ensure this case
4940                  * is never reached
4941                  */
4942                 WARN_ON(1);
4943                 err = -ENOTTY;
4944                 break;
4945
4946         }
4947         return err;
4948 }
4949
4950 /*
4951  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4952  */
4953 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4954 {
4955         int err;
4956         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4957         const struct net_device_ops *ops;
4958
4959         if (!dev)
4960                 return -ENODEV;
4961
4962         ops = dev->netdev_ops;
4963
4964         switch (cmd) {
4965         case SIOCSIFFLAGS:      /* Set interface flags */
4966                 return dev_change_flags(dev, ifr->ifr_flags);
4967
4968         case SIOCSIFMETRIC:     /* Set the metric on the interface
4969                                    (currently unused) */
4970                 return -EOPNOTSUPP;
4971
4972         case SIOCSIFMTU:        /* Set the MTU of a device */
4973                 return dev_set_mtu(dev, ifr->ifr_mtu);
4974
4975         case SIOCSIFHWADDR:
4976                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4977
4978         case SIOCSIFHWBROADCAST:
4979                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4980                         return -EINVAL;
4981                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4982                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4983                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4984                 return 0;
4985
4986         case SIOCSIFMAP:
4987                 if (ops->ndo_set_config) {
4988                         if (!netif_device_present(dev))
4989                                 return -ENODEV;
4990                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4991                 }
4992                 return -EOPNOTSUPP;
4993
4994         case SIOCADDMULTI:
4995                 if (!ops->ndo_set_rx_mode ||
4996                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4997                         return -EINVAL;
4998                 if (!netif_device_present(dev))
4999                         return -ENODEV;
5000                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5001
5002         case SIOCDELMULTI:
5003                 if (!ops->ndo_set_rx_mode ||
5004                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5005                         return -EINVAL;
5006                 if (!netif_device_present(dev))
5007                         return -ENODEV;
5008                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5009
5010         case SIOCSIFTXQLEN:
5011                 if (ifr->ifr_qlen < 0)
5012                         return -EINVAL;
5013                 dev->tx_queue_len = ifr->ifr_qlen;
5014                 return 0;
5015
5016         case SIOCSIFNAME:
5017                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5018                 return dev_change_name(dev, ifr->ifr_newname);
5019
5020         case SIOCSHWTSTAMP:
5021                 err = net_hwtstamp_validate(ifr);
5022                 if (err)
5023                         return err;
5024                 /* fall through */
5025
5026         /*
5027          *      Unknown or private ioctl
5028          */
5029         default:
5030                 if ((cmd >= SIOCDEVPRIVATE &&
5031                     cmd <= SIOCDEVPRIVATE + 15) ||
5032                     cmd == SIOCBONDENSLAVE ||
5033                     cmd == SIOCBONDRELEASE ||
5034                     cmd == SIOCBONDSETHWADDR ||
5035                     cmd == SIOCBONDSLAVEINFOQUERY ||
5036                     cmd == SIOCBONDINFOQUERY ||
5037                     cmd == SIOCBONDCHANGEACTIVE ||
5038                     cmd == SIOCGMIIPHY ||
5039                     cmd == SIOCGMIIREG ||
5040                     cmd == SIOCSMIIREG ||
5041                     cmd == SIOCBRADDIF ||
5042                     cmd == SIOCBRDELIF ||
5043                     cmd == SIOCSHWTSTAMP ||
5044                     cmd == SIOCWANDEV) {
5045                         err = -EOPNOTSUPP;
5046                         if (ops->ndo_do_ioctl) {
5047                                 if (netif_device_present(dev))
5048                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
5049                                 else
5050                                         err = -ENODEV;
5051                         }
5052                 } else
5053                         err = -EINVAL;
5054
5055         }
5056         return err;
5057 }
5058
5059 /*
5060  *      This function handles all "interface"-type I/O control requests. The actual
5061  *      'doing' part of this is dev_ifsioc above.
5062  */
5063
5064 /**
5065  *      dev_ioctl       -       network device ioctl
5066  *      @net: the applicable net namespace
5067  *      @cmd: command to issue
5068  *      @arg: pointer to a struct ifreq in user space
5069  *
5070  *      Issue ioctl functions to devices. This is normally called by the
5071  *      user space syscall interfaces but can sometimes be useful for
5072  *      other purposes. The return value is the return from the syscall if
5073  *      positive or a negative errno code on error.
5074  */
5075
5076 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5077 {
5078         struct ifreq ifr;
5079         int ret;
5080         char *colon;
5081
5082         /* One special case: SIOCGIFCONF takes ifconf argument
5083            and requires shared lock, because it sleeps writing
5084            to user space.
5085          */
5086
5087         if (cmd == SIOCGIFCONF) {
5088                 rtnl_lock();
5089                 ret = dev_ifconf(net, (char __user *) arg);
5090                 rtnl_unlock();
5091                 return ret;
5092         }
5093         if (cmd == SIOCGIFNAME)
5094                 return dev_ifname(net, (struct ifreq __user *)arg);
5095
5096         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5097                 return -EFAULT;
5098
5099         ifr.ifr_name[IFNAMSIZ-1] = 0;
5100
5101         colon = strchr(ifr.ifr_name, ':');
5102         if (colon)
5103                 *colon = 0;
5104
5105         /*
5106          *      See which interface the caller is talking about.
5107          */
5108
5109         switch (cmd) {
5110         /*
5111          *      These ioctl calls:
5112          *      - can be done by all.
5113          *      - atomic and do not require locking.
5114          *      - return a value
5115          */
5116         case SIOCGIFFLAGS:
5117         case SIOCGIFMETRIC:
5118         case SIOCGIFMTU:
5119         case SIOCGIFHWADDR:
5120         case SIOCGIFSLAVE:
5121         case SIOCGIFMAP:
5122         case SIOCGIFINDEX:
5123         case SIOCGIFTXQLEN:
5124                 dev_load(net, ifr.ifr_name);
5125                 rcu_read_lock();
5126                 ret = dev_ifsioc_locked(net, &ifr, cmd);
5127                 rcu_read_unlock();
5128                 if (!ret) {
5129                         if (colon)
5130                                 *colon = ':';
5131                         if (copy_to_user(arg, &ifr,
5132                                          sizeof(struct ifreq)))
5133                                 ret = -EFAULT;
5134                 }
5135                 return ret;
5136
5137         case SIOCETHTOOL:
5138                 dev_load(net, ifr.ifr_name);
5139                 rtnl_lock();
5140                 ret = dev_ethtool(net, &ifr);
5141                 rtnl_unlock();
5142                 if (!ret) {
5143                         if (colon)
5144                                 *colon = ':';
5145                         if (copy_to_user(arg, &ifr,
5146                                          sizeof(struct ifreq)))
5147                                 ret = -EFAULT;
5148                 }
5149                 return ret;
5150
5151         /*
5152          *      These ioctl calls:
5153          *      - require superuser power.
5154          *      - require strict serialization.
5155          *      - return a value
5156          */
5157         case SIOCGMIIPHY:
5158         case SIOCGMIIREG:
5159         case SIOCSIFNAME:
5160                 if (!capable(CAP_NET_ADMIN))
5161                         return -EPERM;
5162                 dev_load(net, ifr.ifr_name);
5163                 rtnl_lock();
5164                 ret = dev_ifsioc(net, &ifr, cmd);
5165                 rtnl_unlock();
5166                 if (!ret) {
5167                         if (colon)
5168                                 *colon = ':';
5169                         if (copy_to_user(arg, &ifr,
5170                                          sizeof(struct ifreq)))
5171                                 ret = -EFAULT;
5172                 }
5173                 return ret;
5174
5175         /*
5176          *      These ioctl calls:
5177          *      - require superuser power.
5178          *      - require strict serialization.
5179          *      - do not return a value
5180          */
5181         case SIOCSIFFLAGS:
5182         case SIOCSIFMETRIC:
5183         case SIOCSIFMTU:
5184         case SIOCSIFMAP:
5185         case SIOCSIFHWADDR:
5186         case SIOCSIFSLAVE:
5187         case SIOCADDMULTI:
5188         case SIOCDELMULTI:
5189         case SIOCSIFHWBROADCAST:
5190         case SIOCSIFTXQLEN:
5191         case SIOCSMIIREG:
5192         case SIOCBONDENSLAVE:
5193         case SIOCBONDRELEASE:
5194         case SIOCBONDSETHWADDR:
5195         case SIOCBONDCHANGEACTIVE:
5196         case SIOCBRADDIF:
5197         case SIOCBRDELIF:
5198         case SIOCSHWTSTAMP:
5199                 if (!capable(CAP_NET_ADMIN))
5200                         return -EPERM;
5201                 /* fall through */
5202         case SIOCBONDSLAVEINFOQUERY:
5203         case SIOCBONDINFOQUERY:
5204                 dev_load(net, ifr.ifr_name);
5205                 rtnl_lock();
5206                 ret = dev_ifsioc(net, &ifr, cmd);
5207                 rtnl_unlock();
5208                 return ret;
5209
5210         case SIOCGIFMEM:
5211                 /* Get the per device memory space. We can add this but
5212                  * currently do not support it */
5213         case SIOCSIFMEM:
5214                 /* Set the per device memory buffer space.
5215                  * Not applicable in our case */
5216         case SIOCSIFLINK:
5217                 return -ENOTTY;
5218
5219         /*
5220          *      Unknown or private ioctl.
5221          */
5222         default:
5223                 if (cmd == SIOCWANDEV ||
5224                     (cmd >= SIOCDEVPRIVATE &&
5225                      cmd <= SIOCDEVPRIVATE + 15)) {
5226                         dev_load(net, ifr.ifr_name);
5227                         rtnl_lock();
5228                         ret = dev_ifsioc(net, &ifr, cmd);
5229                         rtnl_unlock();
5230                         if (!ret && copy_to_user(arg, &ifr,
5231                                                  sizeof(struct ifreq)))
5232                                 ret = -EFAULT;
5233                         return ret;
5234                 }
5235                 /* Take care of Wireless Extensions */
5236                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5237                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5238                 return -ENOTTY;
5239         }
5240 }
5241
5242
/**
 *      dev_new_index   -       allocate an ifindex
 *      @net: the applicable net namespace
 *
 *      Advances a function-static counter until it lands on a positive
 *      value for which __dev_get_by_index() finds no device in @net, and
 *      returns that value.  The counter restarts at 1 once it is no
 *      longer positive.  The caller must hold the rtnl semaphore or the
 *      dev_base_lock to be sure the returned index remains unique.
 */
static int dev_new_index(struct net *net)
{
        static int ifindex;

        do {
                ++ifindex;
                /* Never hand out zero or a negative index. */
                if (ifindex <= 0)
                        ifindex = 1;
        } while (__dev_get_by_index(net, ifindex));

        return ifindex;
}
5261
5262 /* Delayed registration/unregisteration */
5263 static LIST_HEAD(net_todo_list);
5264
5265 static void net_set_todo(struct net_device *dev)
5266 {
5267         list_add_tail(&dev->todo_list, &net_todo_list);
5268 }
5269
5270 static void rollback_registered_many(struct list_head *head)
5271 {
5272         struct net_device *dev, *tmp;
5273
5274         BUG_ON(dev_boot_phase);
5275         ASSERT_RTNL();
5276
5277         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5278                 /* Some devices call without registering
5279                  * for initialization unwind. Remove those
5280                  * devices and proceed with the remaining.
5281                  */
5282                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5283                         pr_debug("unregister_netdevice: device %s/%p never "
5284                                  "was registered\n", dev->name, dev);
5285
5286                         WARN_ON(1);
5287                         list_del(&dev->unreg_list);
5288                         continue;
5289                 }
5290                 dev->dismantle = true;
5291                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5292         }
5293
5294         /* If device is running, close it first. */
5295         dev_close_many(head);
5296
5297         list_for_each_entry(dev, head, unreg_list) {
5298                 /* And unlink it from device chain. */
5299                 unlist_netdevice(dev);
5300
5301                 dev->reg_state = NETREG_UNREGISTERING;
5302         }
5303
5304         synchronize_net();
5305
5306         list_for_each_entry(dev, head, unreg_list) {
5307                 /* Shutdown queueing discipline. */
5308                 dev_shutdown(dev);
5309
5310
5311                 /* Notify protocols, that we are about to destroy
5312                    this device. They should clean all the things.
5313                 */
5314                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5315
5316                 if (!dev->rtnl_link_ops ||
5317                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5318                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5319
5320                 /*
5321                  *      Flush the unicast and multicast chains
5322                  */
5323                 dev_uc_flush(dev);
5324                 dev_mc_flush(dev);
5325
5326                 if (dev->netdev_ops->ndo_uninit)
5327                         dev->netdev_ops->ndo_uninit(dev);
5328
5329                 /* Notifier chain MUST detach us from master device. */
5330                 WARN_ON(dev->master);
5331
5332                 /* Remove entries from kobject tree */
5333                 netdev_unregister_kobject(dev);
5334         }
5335
5336         /* Process any work delayed until the end of the batch */
5337         dev = list_first_entry(head, struct net_device, unreg_list);
5338         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5339
5340         synchronize_net();
5341
5342         list_for_each_entry(dev, head, unreg_list)
5343                 dev_put(dev);
5344 }
5345
5346 static void rollback_registered(struct net_device *dev)
5347 {
5348         LIST_HEAD(single);
5349
5350         list_add(&dev->unreg_list, &single);
5351         rollback_registered_many(&single);
5352         list_del(&single);
5353 }
5354
5355 static netdev_features_t netdev_fix_features(struct net_device *dev,
5356         netdev_features_t features)
5357 {
5358         /* Fix illegal checksum combinations */
5359         if ((features & NETIF_F_HW_CSUM) &&
5360             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5361                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5362                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5363         }
5364
5365         if ((features & NETIF_F_NO_CSUM) &&
5366             (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5367                 netdev_warn(dev, "mixed no checksumming and other settings.\n");
5368                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5369         }
5370
5371         /* Fix illegal SG+CSUM combinations. */
5372         if ((features & NETIF_F_SG) &&
5373             !(features & NETIF_F_ALL_CSUM)) {
5374                 netdev_dbg(dev,
5375                         "Dropping NETIF_F_SG since no checksum feature.\n");
5376                 features &= ~NETIF_F_SG;
5377         }
5378
5379         /* TSO requires that SG is present as well. */
5380         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5381                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5382                 features &= ~NETIF_F_ALL_TSO;
5383         }
5384
5385         /* TSO ECN requires that TSO is present as well. */
5386         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5387                 features &= ~NETIF_F_TSO_ECN;
5388
5389         /* Software GSO depends on SG. */
5390         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5391                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5392                 features &= ~NETIF_F_GSO;
5393         }
5394
5395         /* UFO needs SG and checksumming */
5396         if (features & NETIF_F_UFO) {
5397                 /* maybe split UFO into V4 and V6? */
5398                 if (!((features & NETIF_F_GEN_CSUM) ||
5399                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5400                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5401                         netdev_dbg(dev,
5402                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5403                         features &= ~NETIF_F_UFO;
5404                 }
5405
5406                 if (!(features & NETIF_F_SG)) {
5407                         netdev_dbg(dev,
5408                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5409                         features &= ~NETIF_F_UFO;
5410                 }
5411         }
5412
5413         return features;
5414 }
5415
5416 int __netdev_update_features(struct net_device *dev)
5417 {
5418         netdev_features_t features;
5419         int err = 0;
5420
5421         ASSERT_RTNL();
5422
5423         features = netdev_get_wanted_features(dev);
5424
5425         if (dev->netdev_ops->ndo_fix_features)
5426                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5427
5428         /* driver might be less strict about feature dependencies */
5429         features = netdev_fix_features(dev, features);
5430
5431         if (dev->features == features)
5432                 return 0;
5433
5434         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5435                 &dev->features, &features);
5436
5437         if (dev->netdev_ops->ndo_set_features)
5438                 err = dev->netdev_ops->ndo_set_features(dev, features);
5439
5440         if (unlikely(err < 0)) {
5441                 netdev_err(dev,
5442                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5443                         err, &features, &dev->features);
5444                 return -1;
5445         }
5446
5447         if (!err)
5448                 dev->features = features;
5449
5450         return 1;
5451 }
5452
/**
 *      netdev_update_features - recalculate device features
 *      @dev: the device to check
 *
 *      Runs __netdev_update_features() and, whenever that returns a
 *      non-zero value, calls netdev_features_change().  Should be
 *      called after driver or hardware dependent conditions that
 *      influence the features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
        int changed = __netdev_update_features(dev);

        if (changed != 0)
                netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5467
/**
 *      netdev_change_features - recalculate device features
 *      @dev: the device to check
 *
 *      Recalculates dev->features and calls netdev_features_change()
 *      unconditionally, whatever the recalculation returned.  Use this
 *      instead of netdev_update_features() when dev->vlan_features
 *      might have changed as well, so that the change can be
 *      propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
        /* The result is deliberately ignored: always notify. */
        (void)__netdev_update_features(dev);

        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5484
5485 /**
5486  *      netif_stacked_transfer_operstate -      transfer operstate
5487  *      @rootdev: the root or lower level device to transfer state from
5488  *      @dev: the device to transfer operstate to
5489  *
5490  *      Transfer operational state from root to device. This is normally
5491  *      called when a stacking relationship exists between the root
5492  *      device and the device(a leaf device).
5493  */
5494 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5495                                         struct net_device *dev)
5496 {
5497         if (rootdev->operstate == IF_OPER_DORMANT)
5498                 netif_dormant_on(dev);
5499         else
5500                 netif_dormant_off(dev);
5501
5502         if (netif_carrier_ok(rootdev)) {
5503                 if (!netif_carrier_ok(dev))
5504                         netif_carrier_on(dev);
5505         } else {
5506                 if (netif_carrier_ok(dev))
5507                         netif_carrier_off(dev);
5508         }
5509 }
5510 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5511
#ifdef CONFIG_RPS
/*
 * Allocate dev->num_rx_queues zeroed netdev_rx_queue entries, point
 * each one back at @dev and publish the array as dev->_rx.
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
        unsigned int nqueues = dev->num_rx_queues;
        struct netdev_rx_queue *queues;
        unsigned int idx;

        BUG_ON(nqueues < 1);

        queues = kcalloc(nqueues, sizeof(*queues), GFP_KERNEL);
        if (queues == NULL) {
                pr_err("netdev: Unable to allocate %u rx queues.\n", nqueues);
                return -ENOMEM;
        }

        for (idx = 0; idx < nqueues; idx++)
                queues[idx].dev = dev;

        dev->_rx = queues;
        return 0;
}
#endif
5532
5533 static void netdev_init_one_queue(struct net_device *dev,
5534                                   struct netdev_queue *queue, void *_unused)
5535 {
5536         /* Initialize queue lock */
5537         spin_lock_init(&queue->_xmit_lock);
5538         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5539         queue->xmit_lock_owner = -1;
5540         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5541         queue->dev = dev;
5542 }
5543
5544 static int netif_alloc_netdev_queues(struct net_device *dev)
5545 {
5546         unsigned int count = dev->num_tx_queues;
5547         struct netdev_queue *tx;
5548
5549         BUG_ON(count < 1);
5550
5551         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5552         if (!tx) {
5553                 pr_err("netdev: Unable to allocate %u tx queues.\n",
5554                        count);
5555                 return -ENOMEM;
5556         }
5557         dev->_tx = tx;
5558
5559         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5560         spin_lock_init(&dev->tx_global_lock);
5561
5562         return 0;
5563 }
5564
5565 /**
5566  *      register_netdevice      - register a network device
5567  *      @dev: device to register
5568  *
5569  *      Take a completed network device structure and add it to the kernel
5570  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5571  *      chain. 0 is returned on success. A negative errno code is returned
5572  *      on a failure to set up the device, or if the name is a duplicate.
5573  *
5574  *      Callers must hold the rtnl semaphore. You may want
5575  *      register_netdev() instead of this.
5576  *
5577  *      BUGS:
5578  *      The locking appears insufficient to guarantee two parallel registers
5579  *      will not get the same name.
5580  */
5581
5582 int register_netdevice(struct net_device *dev)
5583 {
5584         int ret;
5585         struct net *net = dev_net(dev);
5586
5587         BUG_ON(dev_boot_phase);
5588         ASSERT_RTNL();
5589
5590         might_sleep();
5591
5592         /* When net_device's are persistent, this will be fatal. */
5593         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5594         BUG_ON(!net);
5595
5596         spin_lock_init(&dev->addr_list_lock);
5597         netdev_set_addr_lockdep_class(dev);
5598
5599         dev->iflink = -1;
5600
5601         ret = dev_get_valid_name(dev, dev->name);
5602         if (ret < 0)
5603                 goto out;
5604
5605         /* Init, if this function is available */
5606         if (dev->netdev_ops->ndo_init) {
5607                 ret = dev->netdev_ops->ndo_init(dev);
5608                 if (ret) {
5609                         if (ret > 0)
5610                                 ret = -EIO;
5611                         goto out;
5612                 }
5613         }
5614
5615         dev->ifindex = dev_new_index(net);
5616         if (dev->iflink == -1)
5617                 dev->iflink = dev->ifindex;
5618
5619         /* Transfer changeable features to wanted_features and enable
5620          * software offloads (GSO and GRO).
5621          */
5622         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5623         dev->features |= NETIF_F_SOFT_FEATURES;
5624         dev->wanted_features = dev->features & dev->hw_features;
5625
5626         /* Turn on no cache copy if HW is doing checksum */
5627         dev->hw_features |= NETIF_F_NOCACHE_COPY;
5628         if ((dev->features & NETIF_F_ALL_CSUM) &&
5629             !(dev->features & NETIF_F_NO_CSUM)) {
5630                 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5631                 dev->features |= NETIF_F_NOCACHE_COPY;
5632         }
5633
5634         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5635          */
5636         dev->vlan_features |= NETIF_F_HIGHDMA;
5637
5638         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5639         ret = notifier_to_errno(ret);
5640         if (ret)
5641                 goto err_uninit;
5642
5643         ret = netdev_register_kobject(dev);
5644         if (ret)
5645                 goto err_uninit;
5646         dev->reg_state = NETREG_REGISTERED;
5647
5648         __netdev_update_features(dev);
5649
5650         /*
5651          *      Default initial state at registry is that the
5652          *      device is present.
5653          */
5654
5655         set_bit(__LINK_STATE_PRESENT, &dev->state);
5656
5657         dev_init_scheduler(dev);
5658         dev_hold(dev);
5659         list_netdevice(dev);
5660
5661         /* Notify protocols, that a new device appeared. */
5662         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5663         ret = notifier_to_errno(ret);
5664         if (ret) {
5665                 rollback_registered(dev);
5666                 dev->reg_state = NETREG_UNREGISTERED;
5667         }
5668         /*
5669          *      Prevent userspace races by waiting until the network
5670          *      device is fully setup before sending notifications.
5671          */
5672         if (!dev->rtnl_link_ops ||
5673             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5674                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5675
5676 out:
5677         return ret;
5678
5679 err_uninit:
5680         if (dev->netdev_ops->ndo_uninit)
5681                 dev->netdev_ops->ndo_uninit(dev);
5682         goto out;
5683 }
5684 EXPORT_SYMBOL(register_netdevice);
5685
5686 /**
5687  *      init_dummy_netdev       - init a dummy network device for NAPI
5688  *      @dev: device to init
5689  *
5690  *      This takes a network device structure and initialize the minimum
5691  *      amount of fields so it can be used to schedule NAPI polls without
5692  *      registering a full blown interface. This is to be used by drivers
5693  *      that need to tie several hardware interfaces to a single NAPI
5694  *      poll scheduler due to HW limitations.
5695  */
5696 int init_dummy_netdev(struct net_device *dev)
5697 {
5698         /* Clear everything. Note we don't initialize spinlocks
5699          * are they aren't supposed to be taken by any of the
5700          * NAPI code and this dummy netdev is supposed to be
5701          * only ever used for NAPI polls
5702          */
5703         memset(dev, 0, sizeof(struct net_device));
5704
5705         /* make sure we BUG if trying to hit standard
5706          * register/unregister code path
5707          */
5708         dev->reg_state = NETREG_DUMMY;
5709
5710         /* NAPI wants this */
5711         INIT_LIST_HEAD(&dev->napi_list);
5712
5713         /* a dummy interface is started by default */
5714         set_bit(__LINK_STATE_PRESENT, &dev->state);
5715         set_bit(__LINK_STATE_START, &dev->state);
5716
5717         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5718          * because users of this 'device' dont need to change
5719          * its refcount.
5720          */
5721
5722         return 0;
5723 }
5724 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5725
5726
/**
 *      register_netdev - register a network device
 *      @dev: device to register
 *
 *      Take a completed network device structure and add it to the kernel
 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *      chain. 0 is returned on success. A negative errno code is returned
 *      on a failure to set up the device, or if the name is a duplicate.
 *
 *      This is a wrapper around register_netdevice() that takes the rtnl
 *      semaphore for the duration of the call.  Expansion of a name
 *      format string passed to alloc_netdev is left to
 *      register_netdevice().
 */
int register_netdev(struct net_device *dev)
{
        int ret;

        rtnl_lock();
        ret = register_netdevice(dev);
        rtnl_unlock();

        return ret;
}
EXPORT_SYMBOL(register_netdev);
5750
5751 int netdev_refcnt_read(const struct net_device *dev)
5752 {
5753         int i, refcnt = 0;
5754
5755         for_each_possible_cpu(i)
5756                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5757         return refcnt;
5758 }
5759 EXPORT_SYMBOL(netdev_refcnt_read);
5760
/*
 * netdev_wait_allrefs - wait until all references are gone.
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 *
 * The loop polls netdev_refcnt_read() with a 250 ms sleep between
 * polls.  Once more than a second has passed since the previous
 * rebroadcast it takes the RTNL and re-sends NETDEV_UNREGISTER, and
 * once more than ten seconds have passed since the previous warning it
 * logs the current usage count.  It returns only when the count reads
 * zero.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
        unsigned long rebroadcast_time, warning_time;
        int refcnt;

        linkwatch_forget_dev(dev);

        /* Both timers start now. */
        rebroadcast_time = warning_time = jiffies;
        refcnt = netdev_refcnt_read(dev);

        while (refcnt != 0) {
                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
                        rtnl_lock();

                        /* Rebroadcast unregister notification */
                        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
                        /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
                         * should have already handle it the first time */

                        if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
                                     &dev->state)) {
                                /* We must not have linkwatch events
                                 * pending on unregister. If this
                                 * happens, we simply run the queue
                                 * unscheduled, resulting in a noop
                                 * for this device.
                                 */
                                linkwatch_run_queue();
                        }

                        /* Uses __rtnl_unlock() rather than rtnl_unlock();
                         * NOTE(review): presumably to avoid re-entering
                         * netdev_run_todo() from here - confirm.
                         */
                        __rtnl_unlock();

                        rebroadcast_time = jiffies;
                }

                msleep(250);

                /* Re-sample after sleeping; this value drives both the
                 * loop condition and the warning below.
                 */
                refcnt = netdev_refcnt_read(dev);

                if (time_after(jiffies, warning_time + 10 * HZ)) {
                        printk(KERN_EMERG "unregister_netdevice: "
                               "waiting for %s to become free. Usage "
                               "count = %d\n",
                               dev->name, refcnt);
                        warning_time = jiffies;
                }
        }
}
5820
/* The sequence is:
 *
 *      rtnl_lock();
 *      ...
 *      register_netdevice(x1);
 *      register_netdevice(x2);
 *      ...
 *      unregister_netdevice(y1);
 *      unregister_netdevice(y2);
 *      ...
 *      rtnl_unlock();
 *      free_netdev(y1);
 *      free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 */
void netdev_run_todo(void)
{
        struct list_head list;

        /* Snapshot list, allow later requests */
        list_replace_init(&net_todo_list, &list);

        /* The snapshot is private from here on; drop the RTNL so the
         * waits below happen without it held.
         */
        __rtnl_unlock();

        /* Wait for rcu callbacks to finish before attempting to drain
         * the device list.  This usually avoids a 250ms wait.
         */
        if (!list_empty(&list))
                rcu_barrier();

        while (!list_empty(&list)) {
                struct net_device *dev
                        = list_first_entry(&list, struct net_device, todo_list);
                list_del(&dev->todo_list);

                /* Only devices in state NETREG_UNREGISTERING are
                 * processed; anything else is logged and skipped.
                 */
                if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
                        printk(KERN_ERR "network todo '%s' but state %d\n",
                               dev->name, dev->reg_state);
                        dump_stack();
                        continue;
                }

                dev->reg_state = NETREG_UNREGISTERED;

                /* Run flush_backlog for this device on every CPU and
                 * wait for completion (last argument is 1).
                 */
                on_each_cpu(flush_backlog, dev, 1);

                /* Sleeps until the device refcount reads zero. */
                netdev_wait_allrefs(dev);

                /* paranoia */
                BUG_ON(netdev_refcnt_read(dev));
                WARN_ON(rcu_access_pointer(dev->ip_ptr));
                WARN_ON(rcu_access_pointer(dev->ip6_ptr));
                WARN_ON(dev->dn_ptr);

                /* Optional per-device destructor hook. */
                if (dev->destructor)
                        dev->destructor(dev);

                /* Free network device */
                kobject_put(&dev->dev.kobj);
        }
}
5891
5892 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5893  * fields in the same order, with only the type differing.
5894  */
5895 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5896                                     const struct net_device_stats *netdev_stats)
5897 {
5898 #if BITS_PER_LONG == 64
5899         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5900         memcpy(stats64, netdev_stats, sizeof(*stats64));
5901 #else
5902         size_t i, n = sizeof(*stats64) / sizeof(u64);
5903         const unsigned long *src = (const unsigned long *)netdev_stats;
5904         u64 *dst = (u64 *)stats64;
5905
5906         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5907                      sizeof(*stats64) / sizeof(u64));
5908         for (i = 0; i < n; i++)
5909                 dst[i] = src[i];
5910 #endif
5911 }
5912
5913 /**
5914  *      dev_get_stats   - get network device statistics
5915  *      @dev: device to get statistics from
5916  *      @storage: place to store stats
5917  *
5918  *      Get network statistics from device. Return @storage.
5919  *      The device driver may provide its own method by setting
5920  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5921  *      otherwise the internal statistics structure is used.
5922  */
5923 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5924                                         struct rtnl_link_stats64 *storage)
5925 {
5926         const struct net_device_ops *ops = dev->netdev_ops;
5927
5928         if (ops->ndo_get_stats64) {
5929                 memset(storage, 0, sizeof(*storage));
5930                 ops->ndo_get_stats64(dev, storage);
5931         } else if (ops->ndo_get_stats) {
5932                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5933         } else {
5934                 netdev_stats_to_stats64(storage, &dev->stats);
5935         }
5936         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5937         return storage;
5938 }
5939 EXPORT_SYMBOL(dev_get_stats);
5940
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT enabled a
 * missing queue is allocated, initialised with noop_qdisc and published
 * through dev->ingress_queue; NULL is returned if that allocation
 * fails.  Without CONFIG_NET_CLS_ACT the current value (possibly NULL)
 * is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
        struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
        if (!queue) {
                queue = kzalloc(sizeof(*queue), GFP_KERNEL);
                if (queue) {
                        netdev_init_one_queue(dev, queue, NULL);
                        queue->qdisc = &noop_qdisc;
                        queue->qdisc_sleeping = &noop_qdisc;
                        /* Publish only once fully initialised. */
                        rcu_assign_pointer(dev->ingress_queue, queue);
                }
        }
#endif
        return queue;
}
5958
5959 /**
5960  *      alloc_netdev_mqs - allocate network device
5961  *      @sizeof_priv:   size of private data to allocate space for
5962  *      @name:          device name format string
5963  *      @setup:         callback to initialize device
5964  *      @txqs:          the number of TX subqueues to allocate
5965  *      @rxqs:          the number of RX subqueues to allocate
5966  *
5967  *      Allocates a struct net_device with private data area for driver use
5968  *      and performs basic initialization.  Also allocates subquue structs
5969  *      for each queue on the device.
5970  */
5971 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5972                 void (*setup)(struct net_device *),
5973                 unsigned int txqs, unsigned int rxqs)
5974 {
5975         struct net_device *dev;
5976         size_t alloc_size;
5977         struct net_device *p;
5978
5979         BUG_ON(strlen(name) >= sizeof(dev->name));
5980
5981         if (txqs < 1) {
5982                 pr_err("alloc_netdev: Unable to allocate device "
5983                        "with zero queues.\n");
5984                 return NULL;
5985         }
5986
5987 #ifdef CONFIG_RPS
5988         if (rxqs < 1) {
5989                 pr_err("alloc_netdev: Unable to allocate device "
5990                        "with zero RX queues.\n");
5991                 return NULL;
5992         }
5993 #endif
5994
5995         alloc_size = sizeof(struct net_device);
5996         if (sizeof_priv) {
5997                 /* ensure 32-byte alignment of private area */
5998                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5999                 alloc_size += sizeof_priv;
6000         }
6001         /* ensure 32-byte alignment of whole construct */
6002         alloc_size += NETDEV_ALIGN - 1;
6003
6004         p = kzalloc(alloc_size, GFP_KERNEL);
6005         if (!p) {
6006                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
6007                 return NULL;
6008         }
6009
6010         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6011         dev->padded = (char *)dev - (char *)p;
6012
6013         dev->pcpu_refcnt = alloc_percpu(int);
6014         if (!dev->pcpu_refcnt)
6015                 goto free_p;
6016
6017         if (dev_addr_init(dev))
6018                 goto free_pcpu;
6019
6020         dev_mc_init(dev);
6021         dev_uc_init(dev);
6022
6023         dev_net_set(dev, &init_net);
6024
6025         dev->gso_max_size = GSO_MAX_SIZE;
6026
6027         INIT_LIST_HEAD(&dev->napi_list);
6028         INIT_LIST_HEAD(&dev->unreg_list);
6029         INIT_LIST_HEAD(&dev->link_watch_list);
6030         dev->priv_flags = IFF_XMIT_DST_RELEASE;
6031         setup(dev);
6032
6033         dev->num_tx_queues = txqs;
6034         dev->real_num_tx_queues = txqs;
6035         if (netif_alloc_netdev_queues(dev))
6036                 goto free_all;
6037
6038 #ifdef CONFIG_RPS
6039         dev->num_rx_queues = rxqs;
6040         dev->real_num_rx_queues = rxqs;
6041         if (netif_alloc_rx_queues(dev))
6042                 goto free_all;
6043 #endif
6044
6045         strcpy(dev->name, name);
6046         dev->group = INIT_NETDEV_GROUP;
6047         return dev;
6048
6049 free_all:
6050         free_netdev(dev);
6051         return NULL;
6052
6053 free_pcpu:
6054         free_percpu(dev->pcpu_refcnt);
6055         kfree(dev->_tx);
6056 #ifdef CONFIG_RPS
6057         kfree(dev->_rx);
6058 #endif
6059
6060 free_p:
6061         kfree(p);
6062         return NULL;
6063 }
6064 EXPORT_SYMBOL(alloc_netdev_mqs);
6065
6066 /**
6067  *      free_netdev - free network device
6068  *      @dev: device
6069  *
6070  *      This function does the last stage of destroying an allocated device
6071  *      interface. The reference to the device object is released.
6072  *      If this is the last reference then it will be freed.
6073  */
6074 void free_netdev(struct net_device *dev)
6075 {
6076         struct napi_struct *p, *n;
6077
6078         release_net(dev_net(dev));
6079
6080         kfree(dev->_tx);
6081 #ifdef CONFIG_RPS
6082         kfree(dev->_rx);
6083 #endif
6084
6085         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6086
6087         /* Flush device addresses */
6088         dev_addr_flush(dev);
6089
6090         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6091                 netif_napi_del(p);
6092
6093         free_percpu(dev->pcpu_refcnt);
6094         dev->pcpu_refcnt = NULL;
6095
6096         /*  Compatibility with error handling in drivers */
6097         if (dev->reg_state == NETREG_UNINITIALIZED) {
6098                 kfree((char *)dev - dev->padded);
6099                 return;
6100         }
6101
6102         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6103         dev->reg_state = NETREG_RELEASED;
6104
6105         /* will free via device release */
6106         put_device(&dev->dev);
6107 }
6108 EXPORT_SYMBOL(free_netdev);
6109
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Waits for an RCU grace period so that packets currently being
 *	received are done.  Packets that start being received later are
 *	not blocked.  The expedited grace period is used when
 *	rtnl_is_locked() reports the rtnl lock as taken.  May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6125
6126 /**
6127  *      unregister_netdevice_queue - remove device from the kernel
6128  *      @dev: device
6129  *      @head: list
6130  *
6131  *      This function shuts down a device interface and removes it
6132  *      from the kernel tables.
6133  *      If head not NULL, device is queued to be unregistered later.
6134  *
6135  *      Callers must hold the rtnl semaphore.  You may want
6136  *      unregister_netdev() instead of this.
6137  */
6138
6139 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6140 {
6141         ASSERT_RTNL();
6142
6143         if (head) {
6144                 list_move_tail(&dev->unreg_list, head);
6145         } else {
6146                 rollback_registered(dev);
6147                 /* Finish processing unregister after unlock */
6148                 net_set_todo(dev);
6149         }
6150 }
6151 EXPORT_SYMBOL(unregister_netdevice_queue);
6152
6153 /**
6154  *      unregister_netdevice_many - unregister many devices
6155  *      @head: list of devices
6156  */
6157 void unregister_netdevice_many(struct list_head *head)
6158 {
6159         struct net_device *dev;
6160
6161         if (!list_empty(head)) {
6162                 rollback_registered_many(head);
6163                 list_for_each_entry(dev, head, unreg_list)
6164                         net_set_todo(dev);
6165         }
6166 }
6167 EXPORT_SYMBOL(unregister_netdevice_many);
6168
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	This is unregister_netdevice() bracketed by rtnl_lock() and
 *	rtnl_unlock().  In general this is the variant to use, rather
 *	than calling unregister_netdevice() directly.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6187
6188 /**
6189  *      dev_change_net_namespace - move device to different nethost namespace
6190  *      @dev: device
6191  *      @net: network namespace
6192  *      @pat: If not NULL name pattern to try if the current device name
6193  *            is already taken in the destination network namespace.
6194  *
6195  *      This function shuts down a device interface and moves it
6196  *      to a new network namespace. On success 0 is returned, on
6197  *      a failure a netagive errno code is returned.
6198  *
6199  *      Callers must hold the rtnl semaphore.
6200  */
6201
6202 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6203 {
6204         int err;
6205
6206         ASSERT_RTNL();
6207
6208         /* Don't allow namespace local devices to be moved. */
6209         err = -EINVAL;
6210         if (dev->features & NETIF_F_NETNS_LOCAL)
6211                 goto out;
6212
6213         /* Ensure the device has been registrered */
6214         err = -EINVAL;
6215         if (dev->reg_state != NETREG_REGISTERED)
6216                 goto out;
6217
6218         /* Get out if there is nothing todo */
6219         err = 0;
6220         if (net_eq(dev_net(dev), net))
6221                 goto out;
6222
6223         /* Pick the destination device name, and ensure
6224          * we can use it in the destination network namespace.
6225          */
6226         err = -EEXIST;
6227         if (__dev_get_by_name(net, dev->name)) {
6228                 /* We get here if we can't use the current device name */
6229                 if (!pat)
6230                         goto out;
6231                 if (dev_get_valid_name(dev, pat) < 0)
6232                         goto out;
6233         }
6234
6235         /*
6236          * And now a mini version of register_netdevice unregister_netdevice.
6237          */
6238
6239         /* If device is running close it first. */
6240         dev_close(dev);
6241
6242         /* And unlink it from device chain */
6243         err = -ENODEV;
6244         unlist_netdevice(dev);
6245
6246         synchronize_net();
6247
6248         /* Shutdown queueing discipline. */
6249         dev_shutdown(dev);
6250
6251         /* Notify protocols, that we are about to destroy
6252            this device. They should clean all the things.
6253
6254            Note that dev->reg_state stays at NETREG_REGISTERED.
6255            This is wanted because this way 8021q and macvlan know
6256            the device is just moving and can keep their slaves up.
6257         */
6258         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6259         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6260         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6261
6262         /*
6263          *      Flush the unicast and multicast chains
6264          */
6265         dev_uc_flush(dev);
6266         dev_mc_flush(dev);
6267
6268         /* Actually switch the network namespace */
6269         dev_net_set(dev, net);
6270
6271         /* If there is an ifindex conflict assign a new one */
6272         if (__dev_get_by_index(net, dev->ifindex)) {
6273                 int iflink = (dev->iflink == dev->ifindex);
6274                 dev->ifindex = dev_new_index(net);
6275                 if (iflink)
6276                         dev->iflink = dev->ifindex;
6277         }
6278
6279         /* Fixup kobjects */
6280         err = device_rename(&dev->dev, dev->name);
6281         WARN_ON(err);
6282
6283         /* Add the device back in the hashes */
6284         list_netdevice(dev);
6285
6286         /* Notify protocols, that a new device appeared. */
6287         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6288
6289         /*
6290          *      Prevent userspace races by waiting until the network
6291          *      device is fully setup before sending notifications.
6292          */
6293         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6294
6295         synchronize_net();
6296         err = 0;
6297 out:
6298         return err;
6299 }
6300 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6301
6302 static int dev_cpu_callback(struct notifier_block *nfb,
6303                             unsigned long action,
6304                             void *ocpu)
6305 {
6306         struct sk_buff **list_skb;
6307         struct sk_buff *skb;
6308         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6309         struct softnet_data *sd, *oldsd;
6310
6311         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6312                 return NOTIFY_OK;
6313
6314         local_irq_disable();
6315         cpu = smp_processor_id();
6316         sd = &per_cpu(softnet_data, cpu);
6317         oldsd = &per_cpu(softnet_data, oldcpu);
6318
6319         /* Find end of our completion_queue. */
6320         list_skb = &sd->completion_queue;
6321         while (*list_skb)
6322                 list_skb = &(*list_skb)->next;
6323         /* Append completion queue from offline CPU. */
6324         *list_skb = oldsd->completion_queue;
6325         oldsd->completion_queue = NULL;
6326
6327         /* Append output queue from offline CPU. */
6328         if (oldsd->output_queue) {
6329                 *sd->output_queue_tailp = oldsd->output_queue;
6330                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6331                 oldsd->output_queue = NULL;
6332                 oldsd->output_queue_tailp = &oldsd->output_queue;
6333         }
6334         /* Append NAPI poll list from offline CPU. */
6335         if (!list_empty(&oldsd->poll_list)) {
6336                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6337                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6338         }
6339
6340         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6341         local_irq_enable();
6342
6343         /* Process offline CPU's input_pkt_queue */
6344         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6345                 netif_rx(skb);
6346                 input_queue_head_incr(oldsd);
6347         }
6348         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6349                 netif_rx(skb);
6350                 input_queue_head_incr(oldsd);
6351         }
6352
6353         return NOTIFY_OK;
6354 }
6355
6356
6357 /**
6358  *      netdev_increment_features - increment feature set by one
6359  *      @all: current feature set
6360  *      @one: new feature set
6361  *      @mask: mask feature set
6362  *
6363  *      Computes a new feature set after adding a device with feature set
6364  *      @one to the master device with current feature set @all.  Will not
6365  *      enable anything that is off in @mask. Returns the new feature set.
6366  */
6367 netdev_features_t netdev_increment_features(netdev_features_t all,
6368         netdev_features_t one, netdev_features_t mask)
6369 {
6370         if (mask & NETIF_F_GEN_CSUM)
6371                 mask |= NETIF_F_ALL_CSUM;
6372         mask |= NETIF_F_VLAN_CHALLENGED;
6373
6374         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6375         all &= one | ~NETIF_F_ALL_FOR_ALL;
6376
6377         /* If device needs checksumming, downgrade to it. */
6378         if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6379                 all &= ~NETIF_F_NO_CSUM;
6380
6381         /* If one device supports hw checksumming, set for all. */
6382         if (all & NETIF_F_GEN_CSUM)
6383                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6384
6385         return all;
6386 }
6387 EXPORT_SYMBOL(netdev_increment_features);
6388
6389 static struct hlist_head *netdev_create_hash(void)
6390 {
6391         int i;
6392         struct hlist_head *hash;
6393
6394         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6395         if (hash != NULL)
6396                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6397                         INIT_HLIST_HEAD(&hash[i]);
6398
6399         return hash;
6400 }
6401
6402 /* Initialize per network namespace state */
6403 static int __net_init netdev_init(struct net *net)
6404 {
6405         INIT_LIST_HEAD(&net->dev_base_head);
6406
6407         net->dev_name_head = netdev_create_hash();
6408         if (net->dev_name_head == NULL)
6409                 goto err_name;
6410
6411         net->dev_index_head = netdev_create_hash();
6412         if (net->dev_index_head == NULL)
6413                 goto err_idx;
6414
6415         return 0;
6416
6417 err_idx:
6418         kfree(net->dev_name_head);
6419 err_name:
6420         return -ENOMEM;
6421 }
6422
6423 /**
6424  *      netdev_drivername - network driver for the device
6425  *      @dev: network device
6426  *
6427  *      Determine network driver for device.
6428  */
6429 const char *netdev_drivername(const struct net_device *dev)
6430 {
6431         const struct device_driver *driver;
6432         const struct device *parent;
6433         const char *empty = "";
6434
6435         parent = dev->dev.parent;
6436         if (!parent)
6437                 return empty;
6438
6439         driver = parent->driver;
6440         if (driver && driver->name)
6441                 return driver->name;
6442         return empty;
6443 }
6444
6445 int __netdev_printk(const char *level, const struct net_device *dev,
6446                            struct va_format *vaf)
6447 {
6448         int r;
6449
6450         if (dev && dev->dev.parent)
6451                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6452                                netdev_name(dev), vaf);
6453         else if (dev)
6454                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6455         else
6456                 r = printk("%s(NULL net_device): %pV", level, vaf);
6457
6458         return r;
6459 }
6460 EXPORT_SYMBOL(__netdev_printk);
6461
6462 int netdev_printk(const char *level, const struct net_device *dev,
6463                   const char *format, ...)
6464 {
6465         struct va_format vaf;
6466         va_list args;
6467         int r;
6468
6469         va_start(args, format);
6470
6471         vaf.fmt = format;
6472         vaf.va = &args;
6473
6474         r = __netdev_printk(level, dev, &vaf);
6475         va_end(args);
6476
6477         return r;
6478 }
6479 EXPORT_SYMBOL(netdev_printk);
6480
6481 #define define_netdev_printk_level(func, level)                 \
6482 int func(const struct net_device *dev, const char *fmt, ...)    \
6483 {                                                               \
6484         int r;                                                  \
6485         struct va_format vaf;                                   \
6486         va_list args;                                           \
6487                                                                 \
6488         va_start(args, fmt);                                    \
6489                                                                 \
6490         vaf.fmt = fmt;                                          \
6491         vaf.va = &args;                                         \
6492                                                                 \
6493         r = __netdev_printk(level, dev, &vaf);                  \
6494         va_end(args);                                           \
6495                                                                 \
6496         return r;                                               \
6497 }                                                               \
6498 EXPORT_SYMBOL(func);
6499
6500 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6501 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6502 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6503 define_netdev_printk_level(netdev_err, KERN_ERR);
6504 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6505 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6506 define_netdev_printk_level(netdev_info, KERN_INFO);
6507
6508 static void __net_exit netdev_exit(struct net *net)
6509 {
6510         kfree(net->dev_name_head);
6511         kfree(net->dev_index_head);
6512 }
6513
6514 static struct pernet_operations __net_initdata netdev_net_ops = {
6515         .init = netdev_init,
6516         .exit = netdev_exit,
6517 };
6518
6519 static void __net_exit default_device_exit(struct net *net)
6520 {
6521         struct net_device *dev, *aux;
6522         /*
6523          * Push all migratable network devices back to the
6524          * initial network namespace
6525          */
6526         rtnl_lock();
6527         for_each_netdev_safe(net, dev, aux) {
6528                 int err;
6529                 char fb_name[IFNAMSIZ];
6530
6531                 /* Ignore unmoveable devices (i.e. loopback) */
6532                 if (dev->features & NETIF_F_NETNS_LOCAL)
6533                         continue;
6534
6535                 /* Leave virtual devices for the generic cleanup */
6536                 if (dev->rtnl_link_ops)
6537                         continue;
6538
6539                 /* Push remaining network devices to init_net */
6540                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6541                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6542                 if (err) {
6543                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6544                                 __func__, dev->name, err);
6545                         BUG();
6546                 }
6547         }
6548         rtnl_unlock();
6549 }
6550
6551 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6552 {
6553         /* At exit all network devices most be removed from a network
6554          * namespace.  Do this in the reverse order of registration.
6555          * Do this across as many network namespaces as possible to
6556          * improve batching efficiency.
6557          */
6558         struct net_device *dev;
6559         struct net *net;
6560         LIST_HEAD(dev_kill_list);
6561
6562         rtnl_lock();
6563         list_for_each_entry(net, net_list, exit_list) {
6564                 for_each_netdev_reverse(net, dev) {
6565                         if (dev->rtnl_link_ops)
6566                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6567                         else
6568                                 unregister_netdevice_queue(dev, &dev_kill_list);
6569                 }
6570         }
6571         unregister_netdevice_many(&dev_kill_list);
6572         list_del(&dev_kill_list);
6573         rtnl_unlock();
6574 }
6575
6576 static struct pernet_operations __net_initdata default_device_ops = {
6577         .exit = default_device_exit,
6578         .exit_batch = default_device_exit_batch,
6579 };
6580
6581 /*
6582  *      Initialize the DEV module. At boot time this walks the device list and
6583  *      unhooks any devices that fail to initialise (normally hardware not
6584  *      present) and leaves us with a valid list of present and active devices.
6585  *
6586  */
6587
6588 /*
6589  *       This is called single threaded during boot, so no need
6590  *       to take the rtnl semaphore.
6591  */
6592 static int __init net_dev_init(void)
6593 {
6594         int i, rc = -ENOMEM;
6595
6596         BUG_ON(!dev_boot_phase);
6597
6598         if (dev_proc_init())
6599                 goto out;
6600
6601         if (netdev_kobject_init())
6602                 goto out;
6603
6604         INIT_LIST_HEAD(&ptype_all);
6605         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6606                 INIT_LIST_HEAD(&ptype_base[i]);
6607
6608         if (register_pernet_subsys(&netdev_net_ops))
6609                 goto out;
6610
6611         /*
6612          *      Initialise the packet receive queues.
6613          */
6614
6615         for_each_possible_cpu(i) {
6616                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6617
6618                 memset(sd, 0, sizeof(*sd));
6619                 skb_queue_head_init(&sd->input_pkt_queue);
6620                 skb_queue_head_init(&sd->process_queue);
6621                 sd->completion_queue = NULL;
6622                 INIT_LIST_HEAD(&sd->poll_list);
6623                 sd->output_queue = NULL;
6624                 sd->output_queue_tailp = &sd->output_queue;
6625 #ifdef CONFIG_RPS
6626                 sd->csd.func = rps_trigger_softirq;
6627                 sd->csd.info = sd;
6628                 sd->csd.flags = 0;
6629                 sd->cpu = i;
6630 #endif
6631
6632                 sd->backlog.poll = process_backlog;
6633                 sd->backlog.weight = weight_p;
6634                 sd->backlog.gro_list = NULL;
6635                 sd->backlog.gro_count = 0;
6636         }
6637
6638         dev_boot_phase = 0;
6639
6640         /* The loopback device is special if any other network devices
6641          * is present in a network namespace the loopback device must
6642          * be present. Since we now dynamically allocate and free the
6643          * loopback device ensure this invariant is maintained by
6644          * keeping the loopback device as the first device on the
6645          * list of network devices.  Ensuring the loopback devices
6646          * is the first device that appears and the last network device
6647          * that disappears.
6648          */
6649         if (register_pernet_device(&loopback_net_ops))
6650                 goto out;
6651
6652         if (register_pernet_device(&default_device_ops))
6653                 goto out;
6654
6655         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6656         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6657
6658         hotcpu_notifier(dev_cpu_callback, 0);
6659         dst_init();
6660         dev_mcast_init();
6661         rc = 0;
6662 out:
6663         return rc;
6664 }
6665
6666 subsys_initcall(net_dev_init);
6667
6668 static int __init initialize_hashrnd(void)
6669 {
6670         get_random_bytes(&hashrnd, sizeof(hashrnd));
6671         return 0;
6672 }
6673
6674 late_initcall_sync(initialize_hashrnd);
6675