net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <linux/ipv6.h>
 122 #include <linux/in.h>
 123 #include <linux/jhash.h>
 124 #include <linux/random.h>
 125 #include <trace/events/napi.h>
 126 #include <trace/events/net.h>
 127 #include <trace/events/skb.h>
 128 #include <linux/pci.h>
 129 #include <linux/inetdevice.h>
 130 #include <linux/cpu_rmap.h>
 131 #include <linux/static_key.h>
 132
 133 #include "net-sysfs.h"
 134
  /* Instead of increasing this, you should create a hash table. */
  #define MAX_GRO_SKBS 8

  /* This should be increased if a protocol with a bigger head is added. */
  #define GRO_MAX_HEAD (MAX_HEADER + 128)

  /* Serialises the list updates in dev_add_pack()/__dev_remove_pack(). */
  static DEFINE_SPINLOCK(ptype_lock);
  /* Serialises the list updates in dev_add_offload()/__dev_remove_offload(). */
  static DEFINE_SPINLOCK(offload_lock);
  /*
   * Packet handlers for specific protocols, bucketed by
   * ntohs(type) & PTYPE_HASH_MASK (see ptype_head()).
   */
  struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
  /* Packet handlers registered for ETH_P_ALL (see ptype_head()). */
  struct list_head ptype_all __read_mostly;       /* Taps */
  /* All registered struct packet_offload entries, on one unhashed list. */
  static struct list_head offload_base __read_mostly;
 146
  /*
   * The @dev_base_head list is protected by @dev_base_lock and the rtnl
   * semaphore.
   *
   * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
   *
   * Writers must hold the rtnl semaphore while they loop through the
   * dev_base_head list, and hold dev_base_lock for writing when they do the
   * actual updates.  This allows pure readers to access the list even
   * while a writer is preparing to update it.
   *
   * To put it another way, dev_base_lock is held for writing only to
   * protect against pure readers; the rtnl semaphore provides the
   * protection against other writers.
   *
   * See, for example usages, register_netdevice() and
   * unregister_netdevice(), which must be called with the rtnl
   * semaphore held.
   */
  DEFINE_RWLOCK(dev_base_lock);
  EXPORT_SYMBOL(dev_base_lock);

  /*
   * NOTE(review): no user of this sequence counter is visible in this part
   * of the file.  Going by its name it presumably guards device renames;
   * confirm against the rename path before relying on that.
   */
  seqcount_t devnet_rename_seq;
 170
 171 static inline void dev_base_seq_inc(struct net *net)
 172 {
 173         while (++net->dev_base_seq == 0);
 174 }
 175
 176 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 177 {
 178         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 179
 180         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 181 }
 182
 183 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 184 {
 185         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 186 }
 187
 188 static inline void rps_lock(struct softnet_data *sd)
 189 {
 190 #ifdef CONFIG_RPS
 191         spin_lock(&sd->input_pkt_queue.lock);
 192 #endif
 193 }
 194
 195 static inline void rps_unlock(struct softnet_data *sd)
 196 {
 197 #ifdef CONFIG_RPS
 198         spin_unlock(&sd->input_pkt_queue.lock);
 199 #endif
 200 }
 201
 202 /* Device list insertion */
 203 static void list_netdevice(struct net_device *dev)
 204 {
 205         struct net *net = dev_net(dev);
 206
 207         ASSERT_RTNL();
 208
 209         write_lock_bh(&dev_base_lock);
 210         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 211         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 212         hlist_add_head_rcu(&dev->index_hlist,
 213                            dev_index_hash(net, dev->ifindex));
 214         write_unlock_bh(&dev_base_lock);
 215
 216         dev_base_seq_inc(net);
 217 }
 218
 219 /* Device list removal
 220  * caller must respect a RCU grace period before freeing/reusing dev
 221  */
 222 static void unlist_netdevice(struct net_device *dev)
 223 {
 224         ASSERT_RTNL();
 225
 226         /* Unlink dev from the device chain */
 227         write_lock_bh(&dev_base_lock);
 228         list_del_rcu(&dev->dev_list);
 229         hlist_del_rcu(&dev->name_hlist);
 230         hlist_del_rcu(&dev->index_hlist);
 231         write_unlock_bh(&dev_base_lock);
 232
 233         dev_base_seq_inc(dev_net(dev));
 234 }
 235
  /*
   *      Our notifier list
   */

  /*
   * Declared with RAW_NOTIFIER_HEAD.  NOTE(review): the registration and
   * call sites are not in the visible part of the file; check there for the
   * locking that callers are expected to provide.
   */
  static RAW_NOTIFIER_HEAD(netdev_chain);
 241
  /*
   *      Device drivers call our routines to queue packets here. We empty the
   *      queue in the local softnet handler.
   */

  /* One aligned struct softnet_data per CPU; the per-CPU symbol is exported. */
  DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
  EXPORT_PER_CPU_SYMBOL(softnet_data);
 249
 250 #ifdef CONFIG_LOCKDEP
  /*
   * register_netdevice() inits txq->_xmit_lock and sets lockdep class
   * according to dev->type
   *
   * netdev_lock_type[] lists the device types that get a lockdep class of
   * their own.  netdev_lock_pos() searches it and maps every type that is
   * not listed to the final entry.  The index found here is also used to
   * index netdev_lock_name[] and the two lock_class_key arrays, so the
   * tables must be kept in the same order and of the same length.
   */
  static const unsigned short netdev_lock_type[] =
          {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
           ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
           ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
           ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
           ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
           ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
           ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
           ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
           ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
           ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
           ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
           ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
           ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
           ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
           ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 271
  /*
   * Lock names handed to lockdep_set_class_and_name().  Entry i names the
   * class used for the device type in netdev_lock_type[i], so the order and
   * the number of entries must match that table.
   */
  static const char *const netdev_lock_name[] =
          {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
           "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
           "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
           "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
           "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
           "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
           "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
           "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
           "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
           "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
           "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
           "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
           "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
           "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
           "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 288
  /*
   * One lockdep class key per netdev_lock_type[] entry: the first array is
   * used for xmit locks, the second for dev->addr_list_lock.
   */
  static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
  static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 291
 292 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 293 {
 294         int i;
 295
 296         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 297                 if (netdev_lock_type[i] == dev_type)
 298                         return i;
 299         /* the last key is used by default */
 300         return ARRAY_SIZE(netdev_lock_type) - 1;
 301 }
 302
 303 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 304                                                  unsigned short dev_type)
 305 {
 306         int i;
 307
 308         i = netdev_lock_pos(dev_type);
 309         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 310                                    netdev_lock_name[i]);
 311 }
 312
 313 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 314 {
 315         int i;
 316
 317         i = netdev_lock_pos(dev->type);
 318         lockdep_set_class_and_name(&dev->addr_list_lock,
 319                                    &netdev_addr_lock_key[i],
 320                                    netdev_lock_name[i]);
 321 }
 322 #else
  /*
   * CONFIG_LOCKDEP is not set: both class helpers are empty stubs, so their
   * callers do not need an #ifdef of their own.
   */
  static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                   unsigned short dev_type)
  {
  }
  static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  {
  }
 330 #endif
 331
 332 /*******************************************************************************
 333
 334                 Protocol management and registration routines
 335
 336 *******************************************************************************/
 337
  /*
   *      Add a protocol ID to the list. Now that the input handler is
   *      smarter we can dispense with all the messy stuff that used to be
   *      here.
   *
   *      BEWARE!!! Protocol handlers that mangle input packets
   *      MUST BE last in their hash buckets, and the walk over protocol
   *      handlers MUST start from the promiscuous ptype_all chain in net_bh.
   *      This is true now; do not change it.
   *      Explanation: if a protocol handler that mangles the packet were
   *      first on the list, it would be unable to sense that the packet
   *      is cloned and should be copied-on-write, so it would
   *      change it and subsequent readers would get a broken packet.
   *                                                      --ANK (980803)
   */
 353
 354 static inline struct list_head *ptype_head(const struct packet_type *pt)
 355 {
 356         if (pt->type == htons(ETH_P_ALL))
 357                 return &ptype_all;
 358         else
 359                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 360 }
 361
 362 /**
 363  *      dev_add_pack - add packet handler
 364  *      @pt: packet type declaration
 365  *
 366  *      Add a protocol handler to the networking stack. The passed &packet_type
 367  *      is linked into kernel lists and may not be freed until it has been
 368  *      removed from the kernel lists.
 369  *
 370  *      This call does not sleep therefore it can not
 371  *      guarantee all CPU's that are in middle of receiving packets
 372  *      will see the new packet type (until the next received packet).
 373  */
 374
 375 void dev_add_pack(struct packet_type *pt)
 376 {
 377         struct list_head *head = ptype_head(pt);
 378
 379         spin_lock(&ptype_lock);
 380         list_add_rcu(&pt->list, head);
 381         spin_unlock(&ptype_lock);
 382 }
 383 EXPORT_SYMBOL(dev_add_pack);
 384
 385 /**
 386  *      __dev_remove_pack        - remove packet handler
 387  *      @pt: packet type declaration
 388  *
 389  *      Remove a protocol handler that was previously added to the kernel
 390  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 391  *      from the kernel lists and can be freed or reused once this function
 392  *      returns.
 393  *
 394  *      The packet type might still be in use by receivers
 395  *      and must not be freed until after all the CPU's have gone
 396  *      through a quiescent state.
 397  */
 398 void __dev_remove_pack(struct packet_type *pt)
 399 {
 400         struct list_head *head = ptype_head(pt);
 401         struct packet_type *pt1;
 402
 403         spin_lock(&ptype_lock);
 404
 405         list_for_each_entry(pt1, head, list) {
 406                 if (pt == pt1) {
 407                         list_del_rcu(&pt->list);
 408                         goto out;
 409                 }
 410         }
 411
 412         pr_warn("dev_remove_pack: %p not found\n", pt);
 413 out:
 414         spin_unlock(&ptype_lock);
 415 }
 416 EXPORT_SYMBOL(__dev_remove_pack);
 417
 418 /**
 419  *      dev_remove_pack  - remove packet handler
 420  *      @pt: packet type declaration
 421  *
 422  *      Remove a protocol handler that was previously added to the kernel
 423  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 424  *      from the kernel lists and can be freed or reused once this function
 425  *      returns.
 426  *
 427  *      This call sleeps to guarantee that no CPU is looking at the packet
 428  *      type after return.
 429  */
 430 void dev_remove_pack(struct packet_type *pt)
 431 {
 432         __dev_remove_pack(pt);
 433
 434         synchronize_net();
 435 }
 436 EXPORT_SYMBOL(dev_remove_pack);
 437
 438
 439 /**
 440  *      dev_add_offload - register offload handlers
 441  *      @po: protocol offload declaration
 442  *
 443  *      Add protocol offload handlers to the networking stack. The passed
 444  *      &proto_offload is linked into kernel lists and may not be freed until
 445  *      it has been removed from the kernel lists.
 446  *
 447  *      This call does not sleep therefore it can not
 448  *      guarantee all CPU's that are in middle of receiving packets
 449  *      will see the new offload handlers (until the next received packet).
 450  */
 451 void dev_add_offload(struct packet_offload *po)
 452 {
 453         struct list_head *head = &offload_base;
 454
 455         spin_lock(&offload_lock);
 456         list_add_rcu(&po->list, head);
 457         spin_unlock(&offload_lock);
 458 }
 459 EXPORT_SYMBOL(dev_add_offload);
 460
 461 /**
 462  *      __dev_remove_offload     - remove offload handler
 463  *      @po: packet offload declaration
 464  *
 465  *      Remove a protocol offload handler that was previously added to the
 466  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 467  *      is removed from the kernel lists and can be freed or reused once this
 468  *      function returns.
 469  *
 470  *      The packet type might still be in use by receivers
 471  *      and must not be freed until after all the CPU's have gone
 472  *      through a quiescent state.
 473  */
 474 void __dev_remove_offload(struct packet_offload *po)
 475 {
 476         struct list_head *head = &offload_base;
 477         struct packet_offload *po1;
 478
 479         spin_lock(&offload_lock);
 480
 481         list_for_each_entry(po1, head, list) {
 482                 if (po == po1) {
 483                         list_del_rcu(&po->list);
 484                         goto out;
 485                 }
 486         }
 487
 488         pr_warn("dev_remove_offload: %p not found\n", po);
 489 out:
 490         spin_unlock(&offload_lock);
 491 }
 492 EXPORT_SYMBOL(__dev_remove_offload);
 493
 494 /**
 495  *      dev_remove_offload       - remove packet offload handler
 496  *      @po: packet offload declaration
 497  *
 498  *      Remove a packet offload handler that was previously added to the kernel
 499  *      offload handlers by dev_add_offload(). The passed &offload_type is
 500  *      removed from the kernel lists and can be freed or reused once this
 501  *      function returns.
 502  *
 503  *      This call sleeps to guarantee that no CPU is looking at the packet
 504  *      type after return.
 505  */
 506 void dev_remove_offload(struct packet_offload *po)
 507 {
 508         __dev_remove_offload(po);
 509
 510         synchronize_net();
 511 }
 512 EXPORT_SYMBOL(dev_remove_offload);
 513
 514 /******************************************************************************
 515
 516                       Device Boot-time Settings Routines
 517
 518 *******************************************************************************/
 519
  /*
   * Boot time configuration table.
   * netdev_boot_setup_add() and netdev_boot_setup_check() treat a slot whose
   * name starts with '\0' or ' ' as unused.
   */
  static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 522
 523 /**
 524  *      netdev_boot_setup_add   - add new setup entry
 525  *      @name: name of the device
 526  *      @map: configured settings for the device
 527  *
 528  *      Adds new setup entry to the dev_boot_setup list.  The function
 529  *      returns 0 on error and 1 on success.  This is a generic routine to
 530  *      all netdevices.
 531  */
 532 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 533 {
 534         struct netdev_boot_setup *s;
 535         int i;
 536
 537         s = dev_boot_setup;
 538         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 539                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 540                         memset(s[i].name, 0, sizeof(s[i].name));
 541                         strlcpy(s[i].name, name, IFNAMSIZ);
 542                         memcpy(&s[i].map, map, sizeof(s[i].map));
 543                         break;
 544                 }
 545         }
 546
 547         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 548 }
 549
 550 /**
 551  *      netdev_boot_setup_check - check boot time settings
 552  *      @dev: the netdevice
 553  *
 554  *      Check boot time settings for the device.
 555  *      The found settings are set for the device to be used
 556  *      later in the device probing.
 557  *      Returns 0 if no settings found, 1 if they are.
 558  */
 559 int netdev_boot_setup_check(struct net_device *dev)
 560 {
 561         struct netdev_boot_setup *s = dev_boot_setup;
 562         int i;
 563
 564         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 565                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 566                     !strcmp(dev->name, s[i].name)) {
 567                         dev->irq        = s[i].map.irq;
 568                         dev->base_addr  = s[i].map.base_addr;
 569                         dev->mem_start  = s[i].map.mem_start;
 570                         dev->mem_end    = s[i].map.mem_end;
 571                         return 1;
 572                 }
 573         }
 574         return 0;
 575 }
 576 EXPORT_SYMBOL(netdev_boot_setup_check);
 577
 578
 579 /**
 580  *      netdev_boot_base        - get address from boot time settings
 581  *      @prefix: prefix for network device
 582  *      @unit: id for network device
 583  *
 584  *      Check boot time settings for the base address of device.
 585  *      The found settings are set for the device to be used
 586  *      later in the device probing.
 587  *      Returns 0 if no settings found.
 588  */
 589 unsigned long netdev_boot_base(const char *prefix, int unit)
 590 {
 591         const struct netdev_boot_setup *s = dev_boot_setup;
 592         char name[IFNAMSIZ];
 593         int i;
 594
 595         sprintf(name, "%s%d", prefix, unit);
 596
 597         /*
 598          * If device already registered then return base of 1
 599          * to indicate not to probe for this interface
 600          */
 601         if (__dev_get_by_name(&init_net, name))
 602                 return 1;
 603
 604         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 605                 if (!strcmp(name, s[i].name))
 606                         return s[i].map.base_addr;
 607         return 0;
 608 }
 609
 610 /*
 611  * Saves at boot time configured settings for any netdevice.
 612  */
 613 int __init netdev_boot_setup(char *str)
 614 {
 615         int ints[5];
 616         struct ifmap map;
 617
 618         str = get_options(str, ARRAY_SIZE(ints), ints);
 619         if (!str || !*str)
 620                 return 0;
 621
 622         /* Save settings */
 623         memset(&map, 0, sizeof(map));
 624         if (ints[0] > 0)
 625                 map.irq = ints[1];
 626         if (ints[0] > 1)
 627                 map.base_addr = ints[2];
 628         if (ints[0] > 2)
 629                 map.mem_start = ints[3];
 630         if (ints[0] > 3)
 631                 map.mem_end = ints[4];
 632
 633         /* Add new entry to the list */
 634         return netdev_boot_setup_add(str, &map);
 635 }
 636
 637 __setup("netdev=", netdev_boot_setup);
 638
 639 /*******************************************************************************
 640
 641                             Device Interface Subroutines
 642
 643 *******************************************************************************/
 644
 645 /**
 646  *      __dev_get_by_name       - find a device by its name
 647  *      @net: the applicable net namespace
 648  *      @name: name to find
 649  *
 650  *      Find an interface by name. Must be called under RTNL semaphore
 651  *      or @dev_base_lock. If the name is found a pointer to the device
 652  *      is returned. If the name is not found then %NULL is returned. The
 653  *      reference counters are not incremented so the caller must be
 654  *      careful with locks.
 655  */
 656
 657 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 658 {
 659         struct net_device *dev;
 660         struct hlist_head *head = dev_name_hash(net, name);
 661
 662         hlist_for_each_entry(dev, head, name_hlist)
 663                 if (!strncmp(dev->name, name, IFNAMSIZ))
 664                         return dev;
 665
 666         return NULL;
 667 }
 668 EXPORT_SYMBOL(__dev_get_by_name);
 669
 670 /**
 671  *      dev_get_by_name_rcu     - find a device by its name
 672  *      @net: the applicable net namespace
 673  *      @name: name to find
 674  *
 675  *      Find an interface by name.
 676  *      If the name is found a pointer to the device is returned.
 677  *      If the name is not found then %NULL is returned.
 678  *      The reference counters are not incremented so the caller must be
 679  *      careful with locks. The caller must hold RCU lock.
 680  */
 681
 682 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 683 {
 684         struct net_device *dev;
 685         struct hlist_head *head = dev_name_hash(net, name);
 686
 687         hlist_for_each_entry_rcu(dev, head, name_hlist)
 688                 if (!strncmp(dev->name, name, IFNAMSIZ))
 689                         return dev;
 690
 691         return NULL;
 692 }
 693 EXPORT_SYMBOL(dev_get_by_name_rcu);
 694
 695 /**
 696  *      dev_get_by_name         - find a device by its name
 697  *      @net: the applicable net namespace
 698  *      @name: name to find
 699  *
 700  *      Find an interface by name. This can be called from any
 701  *      context and does its own locking. The returned handle has
 702  *      the usage count incremented and the caller must use dev_put() to
 703  *      release it when it is no longer needed. %NULL is returned if no
 704  *      matching device is found.
 705  */
 706
 707 struct net_device *dev_get_by_name(struct net *net, const char *name)
 708 {
 709         struct net_device *dev;
 710
 711         rcu_read_lock();
 712         dev = dev_get_by_name_rcu(net, name);
 713         if (dev)
 714                 dev_hold(dev);
 715         rcu_read_unlock();
 716         return dev;
 717 }
 718 EXPORT_SYMBOL(dev_get_by_name);
 719
 720 /**
 721  *      __dev_get_by_index - find a device by its ifindex
 722  *      @net: the applicable net namespace
 723  *      @ifindex: index of device
 724  *
 725  *      Search for an interface by index. Returns %NULL if the device
 726  *      is not found or a pointer to the device. The device has not
 727  *      had its reference counter increased so the caller must be careful
 728  *      about locking. The caller must hold either the RTNL semaphore
 729  *      or @dev_base_lock.
 730  */
 731
 732 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 733 {
 734         struct net_device *dev;
 735         struct hlist_head *head = dev_index_hash(net, ifindex);
 736
 737         hlist_for_each_entry(dev, head, index_hlist)
 738                 if (dev->ifindex == ifindex)
 739                         return dev;
 740
 741         return NULL;
 742 }
 743 EXPORT_SYMBOL(__dev_get_by_index);
 744
 745 /**
 746  *      dev_get_by_index_rcu - find a device by its ifindex
 747  *      @net: the applicable net namespace
 748  *      @ifindex: index of device
 749  *
 750  *      Search for an interface by index. Returns %NULL if the device
 751  *      is not found or a pointer to the device. The device has not
 752  *      had its reference counter increased so the caller must be careful
 753  *      about locking. The caller must hold RCU lock.
 754  */
 755
 756 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 757 {
 758         struct net_device *dev;
 759         struct hlist_head *head = dev_index_hash(net, ifindex);
 760
 761         hlist_for_each_entry_rcu(dev, head, index_hlist)
 762                 if (dev->ifindex == ifindex)
 763                         return dev;
 764
 765         return NULL;
 766 }
 767 EXPORT_SYMBOL(dev_get_by_index_rcu);
 768
 769
 770 /**
 771  *      dev_get_by_index - find a device by its ifindex
 772  *      @net: the applicable net namespace
 773  *      @ifindex: index of device
 774  *
 775  *      Search for an interface by index. Returns NULL if the device
 776  *      is not found or a pointer to the device. The device returned has
 777  *      had a reference added and the pointer is safe until the user calls
 778  *      dev_put to indicate they have finished with it.
 779  */
 780
 781 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 782 {
 783         struct net_device *dev;
 784
 785         rcu_read_lock();
 786         dev = dev_get_by_index_rcu(net, ifindex);
 787         if (dev)
 788                 dev_hold(dev);
 789         rcu_read_unlock();
 790         return dev;
 791 }
 792 EXPORT_SYMBOL(dev_get_by_index);
 793
 794 /**
 795  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 796  *      @net: the applicable net namespace
 797  *      @type: media type of device
 798  *      @ha: hardware address
 799  *
 800  *      Search for an interface by MAC address. Returns NULL if the device
 801  *      is not found or a pointer to the device.
 802  *      The caller must hold RCU or RTNL.
 803  *      The returned device has not had its ref count increased
 804  *      and the caller must therefore be careful about locking
 805  *
 806  */
 807
 808 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 809                                        const char *ha)
 810 {
 811         struct net_device *dev;
 812
 813         for_each_netdev_rcu(net, dev)
 814                 if (dev->type == type &&
 815                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 816                         return dev;
 817
 818         return NULL;
 819 }
 820 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 821
 822 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 823 {
 824         struct net_device *dev;
 825
 826         ASSERT_RTNL();
 827         for_each_netdev(net, dev)
 828                 if (dev->type == type)
 829                         return dev;
 830
 831         return NULL;
 832 }
 833 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 834
 835 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 836 {
 837         struct net_device *dev, *ret = NULL;
 838
 839         rcu_read_lock();
 840         for_each_netdev_rcu(net, dev)
 841                 if (dev->type == type) {
 842                         dev_hold(dev);
 843                         ret = dev;
 844                         break;
 845                 }
 846         rcu_read_unlock();
 847         return ret;
 848 }
 849 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 850
 851 /**
 852  *      dev_get_by_flags_rcu - find any device with given flags
 853  *      @net: the applicable net namespace
 854  *      @if_flags: IFF_* values
 855  *      @mask: bitmask of bits in if_flags to check
 856  *
 857  *      Search for any interface with the given flags. Returns NULL if a device
 858  *      is not found or a pointer to the device. Must be called inside
 859  *      rcu_read_lock(), and result refcount is unchanged.
 860  */
 861
 862 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 863                                     unsigned short mask)
 864 {
 865         struct net_device *dev, *ret;
 866
 867         ret = NULL;
 868         for_each_netdev_rcu(net, dev) {
 869                 if (((dev->flags ^ if_flags) & mask) == 0) {
 870                         ret = dev;
 871                         break;
 872                 }
 873         }
 874         return ret;
 875 }
 876 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 877
 878 /**
 879  *      dev_valid_name - check if name is okay for network device
 880  *      @name: name string
 881  *
 882  *      Network device names need to be valid file names to
 883  *      to allow sysfs to work.  We also disallow any kind of
 884  *      whitespace.
 885  */
 886 bool dev_valid_name(const char *name)
 887 {
 888         if (*name == '\0')
 889                 return false;
 890         if (strlen(name) >= IFNAMSIZ)
 891                 return false;
 892         if (!strcmp(name, ".") || !strcmp(name, ".."))
 893                 return false;
 894
 895         while (*name) {
 896                 if (*name == '/' || isspace(*name))
 897                         return false;
 898                 name++;
 899         }
 900         return true;
 901 }
 902 EXPORT_SYMBOL(dev_valid_name);
 903
 904 /**
 905  *      __dev_alloc_name - allocate a name for a device
 906  *      @net: network namespace to allocate the device name in
 907  *      @name: name format string
 908  *      @buf:  scratch buffer and result name string
 909  *
 910  *      Passed a format string - eg "lt%d" it will try and find a suitable
 911  *      id. It scans list of devices to build up a free map, then chooses
 912  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 913  *      while allocating the name and adding the device in order to avoid
 914  *      duplicates.
 915  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 916  *      Returns the number of the unit assigned or a negative errno code.
 917  */
 918
 919 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 920 {
 921         int i = 0;
 922         const char *p;
 923         const int max_netdevices = 8*PAGE_SIZE;
 924         unsigned long *inuse;
 925         struct net_device *d;
 926
 927         p = strnchr(name, IFNAMSIZ-1, '%');
 928         if (p) {
 929                 /*
 930                  * Verify the string as this thing may have come from
 931                  * the user.  There must be either one "%d" and no other "%"
 932                  * characters.
 933                  */
 934                 if (p[1] != 'd' || strchr(p + 2, '%'))
 935                         return -EINVAL;
 936
 937                 /* Use one page as a bit array of possible slots */
 938                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 939                 if (!inuse)
 940                         return -ENOMEM;
 941
 942                 for_each_netdev(net, d) {
 943                         if (!sscanf(d->name, name, &i))
 944                                 continue;
 945                         if (i < 0 || i >= max_netdevices)
 946                                 continue;
 947
 948                         /*  avoid cases where sscanf is not exact inverse of printf */
 949                         snprintf(buf, IFNAMSIZ, name, i);
 950                         if (!strncmp(buf, d->name, IFNAMSIZ))
 951                                 set_bit(i, inuse);
 952                 }
 953
 954                 i = find_first_zero_bit(inuse, max_netdevices);
 955                 free_page((unsigned long) inuse);
 956         }
 957
 958         if (buf != name)
 959                 snprintf(buf, IFNAMSIZ, name, i);
 960         if (!__dev_get_by_name(net, buf))
 961                 return i;
 962
 963         /* It is possible to run out of possible slots
 964          * when the name is long and there isn't enough space left
 965          * for the digits, or if all bits are used.
 966          */
 967         return -ENFILE;
 968 }
 969
 970 /**
 971  *      dev_alloc_name - allocate a name for a device
 972  *      @dev: device
 973  *      @name: name format string
 974  *
 975  *      Passed a format string - eg "lt%d" it will try and find a suitable
 976  *      id. It scans list of devices to build up a free map, then chooses
 977  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 978  *      while allocating the name and adding the device in order to avoid
 979  *      duplicates.
 980  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 981  *      Returns the number of the unit assigned or a negative errno code.
 982  */
 983
 984 int dev_alloc_name(struct net_device *dev, const char *name)
 985 {
 986         char buf[IFNAMSIZ];
 987         struct net *net;
 988         int ret;
 989
 990         BUG_ON(!dev_net(dev));
 991         net = dev_net(dev);
 992         ret = __dev_alloc_name(net, name, buf);
 993         if (ret >= 0)
 994                 strlcpy(dev->name, buf, IFNAMSIZ);
 995         return ret;
 996 }
 997 EXPORT_SYMBOL(dev_alloc_name);
 998
 999 static int dev_alloc_name_ns(struct net *net,
1000                              struct net_device *dev,
1001                              const char *name)
1002 {
1003         char buf[IFNAMSIZ];
1004         int ret;
1005
1006         ret = __dev_alloc_name(net, name, buf);
1007         if (ret >= 0)
1008                 strlcpy(dev->name, buf, IFNAMSIZ);
1009         return ret;
1010 }
1011
1012 static int dev_get_valid_name(struct net *net,
1013                               struct net_device *dev,
1014                               const char *name)
1015 {
1016         BUG_ON(!net);
1017
1018         if (!dev_valid_name(name))
1019                 return -EINVAL;
1020
1021         if (strchr(name, '%'))
1022                 return dev_alloc_name_ns(net, dev, name);
1023         else if (__dev_get_by_name(net, name))
1024                 return -EEXIST;
1025         else if (dev->name != name)
1026                 strlcpy(dev->name, name, IFNAMSIZ);
1027
1028         return 0;
1029 }
1030
1031 /**
1032  *      dev_change_name - change name of a device
1033  *      @dev: device
1034  *      @newname: name (or format string) must be at least IFNAMSIZ
1035  *
1036  *      Change name of a device, can pass format strings "eth%d".
1037  *      for wildcarding.
1038  */
1039 int dev_change_name(struct net_device *dev, const char *newname)
1040 {
1041         char oldname[IFNAMSIZ];
1042         int err = 0;
1043         int ret;
1044         struct net *net;
1045
1046         ASSERT_RTNL();
1047         BUG_ON(!dev_net(dev));
1048
1049         net = dev_net(dev);
1050         if (dev->flags & IFF_UP)
1051                 return -EBUSY;
1052
1053         write_seqcount_begin(&devnet_rename_seq);
1054
1055         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1056                 write_seqcount_end(&devnet_rename_seq);
1057                 return 0;
1058         }
1059
1060         memcpy(oldname, dev->name, IFNAMSIZ);
1061
1062         err = dev_get_valid_name(net, dev, newname);
1063         if (err < 0) {
1064                 write_seqcount_end(&devnet_rename_seq);
1065                 return err;
1066         }
1067
1068 rollback:
1069         ret = device_rename(&dev->dev, dev->name);
1070         if (ret) {
1071                 memcpy(dev->name, oldname, IFNAMSIZ);
1072                 write_seqcount_end(&devnet_rename_seq);
1073                 return ret;
1074         }
1075
1076         write_seqcount_end(&devnet_rename_seq);
1077
1078         write_lock_bh(&dev_base_lock);
1079         hlist_del_rcu(&dev->name_hlist);
1080         write_unlock_bh(&dev_base_lock);
1081
1082         synchronize_rcu();
1083
1084         write_lock_bh(&dev_base_lock);
1085         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1086         write_unlock_bh(&dev_base_lock);
1087
1088         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1089         ret = notifier_to_errno(ret);
1090
1091         if (ret) {
1092                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1093                 if (err >= 0) {
1094                         err = ret;
1095                         write_seqcount_begin(&devnet_rename_seq);
1096                         memcpy(dev->name, oldname, IFNAMSIZ);
1097                         goto rollback;
1098                 } else {
1099                         pr_err("%s: name change rollback failed: %d\n",
1100                                dev->name, ret);
1101                 }
1102         }
1103
1104         return err;
1105 }
1106
1107 /**
1108  *      dev_set_alias - change ifalias of a device
1109  *      @dev: device
1110  *      @alias: name up to IFALIASZ
1111  *      @len: limit of bytes to copy from info
1112  *
1113  *      Set ifalias for a device,
1114  */
1115 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1116 {
1117         char *new_ifalias;
1118
1119         ASSERT_RTNL();
1120
1121         if (len >= IFALIASZ)
1122                 return -EINVAL;
1123
1124         if (!len) {
1125                 kfree(dev->ifalias);
1126                 dev->ifalias = NULL;
1127                 return 0;
1128         }
1129
1130         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1131         if (!new_ifalias)
1132                 return -ENOMEM;
1133         dev->ifalias = new_ifalias;
1134
1135         strlcpy(dev->ifalias, alias, len+1);
1136         return len;
1137 }
1138
1139
1140 /**
1141  *      netdev_features_change - device changes features
1142  *      @dev: device to cause notification
1143  *
1144  *      Called to indicate a device has changed features.
1145  */
1146 void netdev_features_change(struct net_device *dev)
1147 {
1148         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1149 }
1150 EXPORT_SYMBOL(netdev_features_change);
1151
1152 /**
1153  *      netdev_state_change - device changes state
1154  *      @dev: device to cause notification
1155  *
1156  *      Called to indicate a device has changed state. This function calls
1157  *      the notifier chains for netdev_chain and sends a NEWLINK message
1158  *      to the routing socket.
1159  */
1160 void netdev_state_change(struct net_device *dev)
1161 {
1162         if (dev->flags & IFF_UP) {
1163                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1164                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1165         }
1166 }
1167 EXPORT_SYMBOL(netdev_state_change);
1168
1169 /**
1170  *      netdev_notify_peers - notify network peers about existence of @dev
1171  *      @dev: network device
1172  *
1173  * Generate traffic such that interested network peers are aware of
1174  * @dev, such as by generating a gratuitous ARP. This may be used when
1175  * a device wants to inform the rest of the network about some sort of
1176  * reconfiguration such as a failover event or virtual machine
1177  * migration.
1178  */
1179 void netdev_notify_peers(struct net_device *dev)
1180 {
1181         rtnl_lock();
1182         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1183         rtnl_unlock();
1184 }
1185 EXPORT_SYMBOL(netdev_notify_peers);
1186
1187 static int __dev_open(struct net_device *dev)
1188 {
1189         const struct net_device_ops *ops = dev->netdev_ops;
1190         int ret;
1191
1192         ASSERT_RTNL();
1193
1194         if (!netif_device_present(dev))
1195                 return -ENODEV;
1196
1197         /* Block netpoll from trying to do any rx path servicing.
1198          * If we don't do this there is a chance ndo_poll_controller
1199          * or ndo_poll may be running while we open the device
1200          */
1201         netpoll_rx_disable(dev);
1202
1203         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1204         ret = notifier_to_errno(ret);
1205         if (ret)
1206                 return ret;
1207
1208         set_bit(__LINK_STATE_START, &dev->state);
1209
1210         if (ops->ndo_validate_addr)
1211                 ret = ops->ndo_validate_addr(dev);
1212
1213         if (!ret && ops->ndo_open)
1214                 ret = ops->ndo_open(dev);
1215
1216         netpoll_rx_enable(dev);
1217
1218         if (ret)
1219                 clear_bit(__LINK_STATE_START, &dev->state);
1220         else {
1221                 dev->flags |= IFF_UP;
1222                 net_dmaengine_get();
1223                 dev_set_rx_mode(dev);
1224                 dev_activate(dev);
1225                 add_device_randomness(dev->dev_addr, dev->addr_len);
1226         }
1227
1228         return ret;
1229 }
1230
1231 /**
1232  *      dev_open        - prepare an interface for use.
1233  *      @dev:   device to open
1234  *
1235  *      Takes a device from down to up state. The device's private open
1236  *      function is invoked and then the multicast lists are loaded. Finally
1237  *      the device is moved into the up state and a %NETDEV_UP message is
1238  *      sent to the netdev notifier chain.
1239  *
1240  *      Calling this function on an active interface is a nop. On a failure
1241  *      a negative errno code is returned.
1242  */
1243 int dev_open(struct net_device *dev)
1244 {
1245         int ret;
1246
1247         if (dev->flags & IFF_UP)
1248                 return 0;
1249
1250         ret = __dev_open(dev);
1251         if (ret < 0)
1252                 return ret;
1253
1254         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1255         call_netdevice_notifiers(NETDEV_UP, dev);
1256
1257         return ret;
1258 }
1259 EXPORT_SYMBOL(dev_open);
1260
1261 static int __dev_close_many(struct list_head *head)
1262 {
1263         struct net_device *dev;
1264
1265         ASSERT_RTNL();
1266         might_sleep();
1267
1268         list_for_each_entry(dev, head, unreg_list) {
1269                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1270
1271                 clear_bit(__LINK_STATE_START, &dev->state);
1272
1273                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1274                  * can be even on different cpu. So just clear netif_running().
1275                  *
1276                  * dev->stop() will invoke napi_disable() on all of it's
1277                  * napi_struct instances on this device.
1278                  */
1279                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1280         }
1281
1282         dev_deactivate_many(head);
1283
1284         list_for_each_entry(dev, head, unreg_list) {
1285                 const struct net_device_ops *ops = dev->netdev_ops;
1286
1287                 /*
1288                  *      Call the device specific close. This cannot fail.
1289                  *      Only if device is UP
1290                  *
1291                  *      We allow it to be called even after a DETACH hot-plug
1292                  *      event.
1293                  */
1294                 if (ops->ndo_stop)
1295                         ops->ndo_stop(dev);
1296
1297                 dev->flags &= ~IFF_UP;
1298                 net_dmaengine_put();
1299         }
1300
1301         return 0;
1302 }
1303
1304 static int __dev_close(struct net_device *dev)
1305 {
1306         int retval;
1307         LIST_HEAD(single);
1308
1309         /* Temporarily disable netpoll until the interface is down */
1310         netpoll_rx_disable(dev);
1311
1312         list_add(&dev->unreg_list, &single);
1313         retval = __dev_close_many(&single);
1314         list_del(&single);
1315
1316         netpoll_rx_enable(dev);
1317         return retval;
1318 }
1319
1320 static int dev_close_many(struct list_head *head)
1321 {
1322         struct net_device *dev, *tmp;
1323         LIST_HEAD(tmp_list);
1324
1325         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1326                 if (!(dev->flags & IFF_UP))
1327                         list_move(&dev->unreg_list, &tmp_list);
1328
1329         __dev_close_many(head);
1330
1331         list_for_each_entry(dev, head, unreg_list) {
1332                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1333                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1334         }
1335
1336         /* rollback_registered_many needs the complete original list */
1337         list_splice(&tmp_list, head);
1338         return 0;
1339 }
1340
1341 /**
1342  *      dev_close - shutdown an interface.
1343  *      @dev: device to shutdown
1344  *
1345  *      This function moves an active device into down state. A
1346  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1347  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1348  *      chain.
1349  */
1350 int dev_close(struct net_device *dev)
1351 {
1352         if (dev->flags & IFF_UP) {
1353                 LIST_HEAD(single);
1354
1355                 /* Block netpoll rx while the interface is going down */
1356                 netpoll_rx_disable(dev);
1357
1358                 list_add(&dev->unreg_list, &single);
1359                 dev_close_many(&single);
1360                 list_del(&single);
1361
1362                 netpoll_rx_enable(dev);
1363         }
1364         return 0;
1365 }
1366 EXPORT_SYMBOL(dev_close);
1367
1368
1369 /**
1370  *      dev_disable_lro - disable Large Receive Offload on a device
1371  *      @dev: device
1372  *
1373  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1374  *      called under RTNL.  This is needed if received packets may be
1375  *      forwarded to another interface.
1376  */
1377 void dev_disable_lro(struct net_device *dev)
1378 {
1379         /*
1380          * If we're trying to disable lro on a vlan device
1381          * use the underlying physical device instead
1382          */
1383         if (is_vlan_dev(dev))
1384                 dev = vlan_dev_real_dev(dev);
1385
1386         dev->wanted_features &= ~NETIF_F_LRO;
1387         netdev_update_features(dev);
1388
1389         if (unlikely(dev->features & NETIF_F_LRO))
1390                 netdev_WARN(dev, "failed to disable LRO!\n");
1391 }
1392 EXPORT_SYMBOL(dev_disable_lro);
1393
1394 static void netdev_notifier_info_init(struct netdev_notifier_info *info,
1395                                       struct net_device *dev)
1396 {
1397         info->dev = dev;
1398 }
1399
1400 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1401                                    struct net_device *dev)
1402 {
1403         struct netdev_notifier_info info;
1404
1405         netdev_notifier_info_init(&info, dev);
1406         return nb->notifier_call(nb, val, &info);
1407 }
1408
/*
 * Starts out as 1. While it is nonzero, register_netdevice_notifier()
 * links the notifier into the chain but does not replay any events to it.
 */
static int dev_boot_phase = 1;
1410
1411 /**
1412  *      register_netdevice_notifier - register a network notifier block
1413  *      @nb: notifier
1414  *
1415  *      Register a notifier to be called when network device events occur.
1416  *      The notifier passed is linked into the kernel structures and must
1417  *      not be reused until it has been unregistered. A negative errno code
1418  *      is returned on a failure.
1419  *
1420  *      When registered all registration and up events are replayed
1421  *      to the new notifier to allow device to have a race free
1422  *      view of the network device list.
1423  */
1424
1425 int register_netdevice_notifier(struct notifier_block *nb)
1426 {
1427         struct net_device *dev;
1428         struct net_device *last;
1429         struct net *net;
1430         int err;
1431
1432         rtnl_lock();
1433         err = raw_notifier_chain_register(&netdev_chain, nb);
1434         if (err)
1435                 goto unlock;
1436         if (dev_boot_phase)
1437                 goto unlock;
1438         for_each_net(net) {
1439                 for_each_netdev(net, dev) {
1440                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1441                         err = notifier_to_errno(err);
1442                         if (err)
1443                                 goto rollback;
1444
1445                         if (!(dev->flags & IFF_UP))
1446                                 continue;
1447
1448                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1449                 }
1450         }
1451
1452 unlock:
1453         rtnl_unlock();
1454         return err;
1455
1456 rollback:
1457         last = dev;
1458         for_each_net(net) {
1459                 for_each_netdev(net, dev) {
1460                         if (dev == last)
1461                                 goto outroll;
1462
1463                         if (dev->flags & IFF_UP) {
1464                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1465                                                         dev);
1466                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1467                         }
1468                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1469                 }
1470         }
1471
1472 outroll:
1473         raw_notifier_chain_unregister(&netdev_chain, nb);
1474         goto unlock;
1475 }
1476 EXPORT_SYMBOL(register_netdevice_notifier);
1477
1478 /**
1479  *      unregister_netdevice_notifier - unregister a network notifier block
1480  *      @nb: notifier
1481  *
1482  *      Unregister a notifier previously registered by
1483  *      register_netdevice_notifier(). The notifier is unlinked into the
1484  *      kernel structures and may then be reused. A negative errno code
1485  *      is returned on a failure.
1486  *
1487  *      After unregistering unregister and down device events are synthesized
1488  *      for all devices on the device list to the removed notifier to remove
1489  *      the need for special case cleanup code.
1490  */
1491
1492 int unregister_netdevice_notifier(struct notifier_block *nb)
1493 {
1494         struct net_device *dev;
1495         struct net *net;
1496         int err;
1497
1498         rtnl_lock();
1499         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1500         if (err)
1501                 goto unlock;
1502
1503         for_each_net(net) {
1504                 for_each_netdev(net, dev) {
1505                         if (dev->flags & IFF_UP) {
1506                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1507                                                         dev);
1508                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1509                         }
1510                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1511                 }
1512         }
1513 unlock:
1514         rtnl_unlock();
1515         return err;
1516 }
1517 EXPORT_SYMBOL(unregister_netdevice_notifier);
1518
1519 /**
1520  *      call_netdevice_notifiers_info - call all network notifier blocks
1521  *      @val: value passed unmodified to notifier function
1522  *      @dev: net_device pointer passed unmodified to notifier function
1523  *      @info: notifier information data
1524  *
1525  *      Call all network notifier blocks.  Parameters and return value
1526  *      are as for raw_notifier_call_chain().
1527  */
1528
1529 int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev,
1530                                   struct netdev_notifier_info *info)
1531 {
1532         ASSERT_RTNL();
1533         netdev_notifier_info_init(info, dev);
1534         return raw_notifier_call_chain(&netdev_chain, val, info);
1535 }
1536 EXPORT_SYMBOL(call_netdevice_notifiers_info);
1537
1538 /**
1539  *      call_netdevice_notifiers - call all network notifier blocks
1540  *      @val: value passed unmodified to notifier function
1541  *      @dev: net_device pointer passed unmodified to notifier function
1542  *
1543  *      Call all network notifier blocks.  Parameters and return value
1544  *      are as for raw_notifier_call_chain().
1545  */
1546
1547 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1548 {
1549         struct netdev_notifier_info info;
1550
1551         return call_netdevice_notifiers_info(val, dev, &info);
1552 }
1553 EXPORT_SYMBOL(call_netdevice_notifiers);
1554
/*
 * Static key tested by net_timestamp_set() and net_timestamp_check();
 * raised by net_enable_timestamp() and dropped by net_disable_timestamp().
 */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context.
 * If net_disable_timestamp() is called from irq context, it only counts
 * the request here; net_enable_timestamp() later consumes the count and
 * performs the deferred static_key_slow_dec() calls.
 */
static atomic_t netstamp_needed_deferred;
#endif
1563
1564 void net_enable_timestamp(void)
1565 {
1566 #ifdef HAVE_JUMP_LABEL
1567         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1568
1569         if (deferred) {
1570                 while (--deferred)
1571                         static_key_slow_dec(&netstamp_needed);
1572                 return;
1573         }
1574 #endif
1575         static_key_slow_inc(&netstamp_needed);
1576 }
1577 EXPORT_SYMBOL(net_enable_timestamp);
1578
1579 void net_disable_timestamp(void)
1580 {
1581 #ifdef HAVE_JUMP_LABEL
1582         if (in_interrupt()) {
1583                 atomic_inc(&netstamp_needed_deferred);
1584                 return;
1585         }
1586 #endif
1587         static_key_slow_dec(&netstamp_needed);
1588 }
1589 EXPORT_SYMBOL(net_disable_timestamp);
1590
1591 static inline void net_timestamp_set(struct sk_buff *skb)
1592 {
1593         skb->tstamp.tv64 = 0;
1594         if (static_key_false(&netstamp_needed))
1595                 __net_timestamp(skb);
1596 }
1597
/*
 * Stamp SKB via __net_timestamp() when the netstamp_needed static key is
 * enabled, COND is true and SKB carries no timestamp yet.
 *
 * Wrapped in do { } while (0) so that the expansion is a single statement
 * and cannot capture an "else" that follows the call site. Call it with a
 * trailing semicolon.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1603
1604 static inline bool is_skb_forwardable(struct net_device *dev,
1605                                       struct sk_buff *skb)
1606 {
1607         unsigned int len;
1608
1609         if (!(dev->flags & IFF_UP))
1610                 return false;
1611
1612         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1613         if (skb->len <= len)
1614                 return true;
1615
1616         /* if TSO is enabled, we don't care about the length as the packet
1617          * could be forwarded without being segmented before
1618          */
1619         if (skb_is_gso(skb))
1620                 return true;
1621
1622         return false;
1623 }
1624
1625 /**
1626  * dev_forward_skb - loopback an skb to another netif
1627  *
1628  * @dev: destination network device
1629  * @skb: buffer to forward
1630  *
1631  * return values:
1632  *      NET_RX_SUCCESS  (no congestion)
1633  *      NET_RX_DROP     (packet was dropped, but freed)
1634  *
1635  * dev_forward_skb can be used for injecting an skb from the
1636  * start_xmit function of one device into the receive queue
1637  * of another device.
1638  *
1639  * The receiving device may be in another namespace, so
1640  * we have to clear all information in the skb that could
1641  * impact namespace isolation.
1642  */
1643 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1644 {
1645         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1646                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1647                         atomic_long_inc(&dev->rx_dropped);
1648                         kfree_skb(skb);
1649                         return NET_RX_DROP;
1650                 }
1651         }
1652
1653         skb_orphan(skb);
1654
1655         if (unlikely(!is_skb_forwardable(dev, skb))) {
1656                 atomic_long_inc(&dev->rx_dropped);
1657                 kfree_skb(skb);
1658                 return NET_RX_DROP;
1659         }
1660         skb->skb_iif = 0;
1661         skb_dst_drop(skb);
1662         skb->tstamp.tv64 = 0;
1663         skb->pkt_type = PACKET_HOST;
1664         skb->protocol = eth_type_trans(skb, dev);
1665         skb->mark = 0;
1666         secpath_reset(skb);
1667         nf_reset(skb);
1668         nf_reset_trace(skb);
1669         return netif_rx(skb);
1670 }
1671 EXPORT_SYMBOL_GPL(dev_forward_skb);
1672
1673 static inline int deliver_skb(struct sk_buff *skb,
1674                               struct packet_type *pt_prev,
1675                               struct net_device *orig_dev)
1676 {
1677         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1678                 return -ENOMEM;
1679         atomic_inc(&skb->users);
1680         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1681 }
1682
1683 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1684 {
1685         if (!ptype->af_packet_priv || !skb->sk)
1686                 return false;
1687
1688         if (ptype->id_match)
1689                 return ptype->id_match(ptype, skb->sk);
1690         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1691                 return true;
1692
1693         return false;
1694 }
1695
1696 /*
1697  *      Support routine. Sends outgoing frames to any network
1698  *      taps currently in use.
1699  */
1700
1701 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1702 {
1703         struct packet_type *ptype;
1704         struct sk_buff *skb2 = NULL;
1705         struct packet_type *pt_prev = NULL;
1706
1707         rcu_read_lock();
1708         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1709                 /* Never send packets back to the socket
1710                  * they originated from - MvS (miquels@drinkel.ow.org)
1711                  */
1712                 if ((ptype->dev == dev || !ptype->dev) &&
1713                     (!skb_loop_sk(ptype, skb))) {
1714                         if (pt_prev) {
1715                                 deliver_skb(skb2, pt_prev, skb->dev);
1716                                 pt_prev = ptype;
1717                                 continue;
1718                         }
1719
1720                         skb2 = skb_clone(skb, GFP_ATOMIC);
1721                         if (!skb2)
1722                                 break;
1723
1724                         net_timestamp_set(skb2);
1725
1726                         /* skb->nh should be correctly
1727                            set by sender, so that the second statement is
1728                            just protection against buggy protocols.
1729                          */
1730                         skb_reset_mac_header(skb2);
1731
1732                         if (skb_network_header(skb2) < skb2->data ||
1733                             skb2->network_header > skb2->tail) {
1734                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1735                                                      ntohs(skb2->protocol),
1736                                                      dev->name);
1737                                 skb_reset_network_header(skb2);
1738                         }
1739
1740                         skb2->transport_header = skb2->network_header;
1741                         skb2->pkt_type = PACKET_OUTGOING;
1742                         pt_prev = ptype;
1743                 }
1744         }
1745         if (pt_prev)
1746                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1747         rcu_read_unlock();
1748 }
1749
1750 /**
1751  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1752  * @dev: Network device
1753  * @txq: number of queues available
1754  *
1755  * If real_num_tx_queues is changed the tc mappings may no longer be
1756  * valid. To resolve this verify the tc mapping remains valid and if
1757  * not NULL the mapping. With no priorities mapping to this
1758  * offset/count pair it will no longer be used. In the worst case TC0
1759  * is invalid nothing can be done so disable priority mappings. If is
1760  * expected that drivers will fix this mapping if they can before
1761  * calling netif_set_real_num_tx_queues.
1762  */
1763 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1764 {
1765         int i;
1766         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1767
1768         /* If TC0 is invalidated disable TC mapping */
1769         if (tc->offset + tc->count > txq) {
1770                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1771                 dev->num_tc = 0;
1772                 return;
1773         }
1774
1775         /* Invalidated prio to tc mappings set to TC0 */
1776         for (i = 1; i < TC_BITMASK + 1; i++) {
1777                 int q = netdev_get_prio_tc_map(dev, i);
1778
1779                 tc = &dev->tc_to_txq[q];
1780                 if (tc->offset + tc->count > txq) {
1781                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1782                                 i, q);
1783                         netdev_set_prio_tc_map(dev, i, 0);
1784                 }
1785         }
1786 }
1787
1788 #ifdef CONFIG_XPS
/* Mutex taken by the XPS map update paths in this file. */
static DEFINE_MUTEX(xps_map_mutex);
/*
 * Dereference the RCU-managed pointer P through rcu_dereference_protected(),
 * telling lockdep that xps_map_mutex is expected to be held.
 */
#define xmap_dereference(P)             \
        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1792
1793 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1794                                         int cpu, u16 index)
1795 {
1796         struct xps_map *map = NULL;
1797         int pos;
1798
1799         if (dev_maps)
1800                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1801
1802         for (pos = 0; map && pos < map->len; pos++) {
1803                 if (map->queues[pos] == index) {
1804                         if (map->len > 1) {
1805                                 map->queues[pos] = map->queues[--map->len];
1806                         } else {
1807                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1808                                 kfree_rcu(map, rcu);
1809                                 map = NULL;
1810                         }
1811                         break;
1812                 }
1813         }
1814
1815         return map;
1816 }
1817
1818 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1819 {
1820         struct xps_dev_maps *dev_maps;
1821         int cpu, i;
1822         bool active = false;
1823
1824         mutex_lock(&xps_map_mutex);
1825         dev_maps = xmap_dereference(dev->xps_maps);
1826
1827         if (!dev_maps)
1828                 goto out_no_maps;
1829
1830         for_each_possible_cpu(cpu) {
1831                 for (i = index; i < dev->num_tx_queues; i++) {
1832                         if (!remove_xps_queue(dev_maps, cpu, i))
1833                                 break;
1834                 }
1835                 if (i == dev->num_tx_queues)
1836                         active = true;
1837         }
1838
1839         if (!active) {
1840                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1841                 kfree_rcu(dev_maps, rcu);
1842         }
1843
1844         for (i = index; i < dev->num_tx_queues; i++)
1845                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1846                                              NUMA_NO_NODE);
1847
1848 out_no_maps:
1849         mutex_unlock(&xps_map_mutex);
1850 }
1851
1852 static struct xps_map *expand_xps_map(struct xps_map *map,
1853                                       int cpu, u16 index)
1854 {
1855         struct xps_map *new_map;
1856         int alloc_len = XPS_MIN_MAP_ALLOC;
1857         int i, pos;
1858
1859         for (pos = 0; map && pos < map->len; pos++) {
1860                 if (map->queues[pos] != index)
1861                         continue;
1862                 return map;
1863         }
1864
1865         /* Need to add queue to this CPU's existing map */
1866         if (map) {
1867                 if (pos < map->alloc_len)
1868                         return map;
1869
1870                 alloc_len = map->alloc_len * 2;
1871         }
1872
1873         /* Need to allocate new map to store queue on this CPU's map */
1874         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1875                                cpu_to_node(cpu));
1876         if (!new_map)
1877                 return NULL;
1878
1879         for (i = 0; i < pos; i++)
1880                 new_map->queues[i] = map->queues[i];
1881         new_map->alloc_len = alloc_len;
1882         new_map->len = pos;
1883
1884         return new_map;
1885 }
1886
1887 int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1888 {
1889         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1890         struct xps_map *map, *new_map;
1891         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1892         int cpu, numa_node_id = -2;
1893         bool active = false;
1894
1895         mutex_lock(&xps_map_mutex);
1896
1897         dev_maps = xmap_dereference(dev->xps_maps);
1898
1899         /* allocate memory for queue storage */
1900         for_each_online_cpu(cpu) {
1901                 if (!cpumask_test_cpu(cpu, mask))
1902                         continue;
1903
1904                 if (!new_dev_maps)
1905                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1906                 if (!new_dev_maps) {
1907                         mutex_unlock(&xps_map_mutex);
1908                         return -ENOMEM;
1909                 }
1910
1911                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1912                                  NULL;
1913
1914                 map = expand_xps_map(map, cpu, index);
1915                 if (!map)
1916                         goto error;
1917
1918                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1919         }
1920
1921         if (!new_dev_maps)
1922                 goto out_no_new_maps;
1923
1924         for_each_possible_cpu(cpu) {
1925                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1926                         /* add queue to CPU maps */
1927                         int pos = 0;
1928
1929                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1930                         while ((pos < map->len) && (map->queues[pos] != index))
1931                                 pos++;
1932
1933                         if (pos == map->len)
1934                                 map->queues[map->len++] = index;
1935 #ifdef CONFIG_NUMA
1936                         if (numa_node_id == -2)
1937                                 numa_node_id = cpu_to_node(cpu);
1938                         else if (numa_node_id != cpu_to_node(cpu))
1939                                 numa_node_id = -1;
1940 #endif
1941                 } else if (dev_maps) {
1942                         /* fill in the new device map from the old device map */
1943                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1944                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1945                 }
1946
1947         }
1948
1949         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1950
1951         /* Cleanup old maps */
1952         if (dev_maps) {
1953                 for_each_possible_cpu(cpu) {
1954                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1955                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1956                         if (map && map != new_map)
1957                                 kfree_rcu(map, rcu);
1958                 }
1959
1960                 kfree_rcu(dev_maps, rcu);
1961         }
1962
1963         dev_maps = new_dev_maps;
1964         active = true;
1965
1966 out_no_new_maps:
1967         /* update Tx queue numa node */
1968         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1969                                      (numa_node_id >= 0) ? numa_node_id :
1970                                      NUMA_NO_NODE);
1971
1972         if (!dev_maps)
1973                 goto out_no_maps;
1974
1975         /* removes queue from unused CPUs */
1976         for_each_possible_cpu(cpu) {
1977                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
1978                         continue;
1979
1980                 if (remove_xps_queue(dev_maps, cpu, index))
1981                         active = true;
1982         }
1983
1984         /* free map if not active */
1985         if (!active) {
1986                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1987                 kfree_rcu(dev_maps, rcu);
1988         }
1989
1990 out_no_maps:
1991         mutex_unlock(&xps_map_mutex);
1992
1993         return 0;
1994 error:
1995         /* remove any maps that we added */
1996         for_each_possible_cpu(cpu) {
1997                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1998                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1999                                  NULL;
2000                 if (new_map && new_map != map)
2001                         kfree(new_map);
2002         }
2003
2004         mutex_unlock(&xps_map_mutex);
2005
2006         kfree(new_dev_maps);
2007         return -ENOMEM;
2008 }
2009 EXPORT_SYMBOL(netif_set_xps_queue);
2010
2011 #endif
2012 /*
2013  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2014  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2015  */
2016 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2017 {
2018         int rc;
2019
2020         if (txq < 1 || txq > dev->num_tx_queues)
2021                 return -EINVAL;
2022
2023         if (dev->reg_state == NETREG_REGISTERED ||
2024             dev->reg_state == NETREG_UNREGISTERING) {
2025                 ASSERT_RTNL();
2026
2027                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2028                                                   txq);
2029                 if (rc)
2030                         return rc;
2031
2032                 if (dev->num_tc)
2033                         netif_setup_tc(dev, txq);
2034
2035                 if (txq < dev->real_num_tx_queues) {
2036                         qdisc_reset_all_tx_gt(dev, txq);
2037 #ifdef CONFIG_XPS
2038                         netif_reset_xps_queues_gt(dev, txq);
2039 #endif
2040                 }
2041         }
2042
2043         dev->real_num_tx_queues = txq;
2044         return 0;
2045 }
2046 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2047
2048 #ifdef CONFIG_RPS
2049 /**
2050  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2051  *      @dev: Network device
2052  *      @rxq: Actual number of RX queues
2053  *
2054  *      This must be called either with the rtnl_lock held or before
2055  *      registration of the net device.  Returns 0 on success, or a
2056  *      negative error code.  If called before registration, it always
2057  *      succeeds.
2058  */
2059 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2060 {
2061         int rc;
2062
2063         if (rxq < 1 || rxq > dev->num_rx_queues)
2064                 return -EINVAL;
2065
2066         if (dev->reg_state == NETREG_REGISTERED) {
2067                 ASSERT_RTNL();
2068
2069                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2070                                                   rxq);
2071                 if (rc)
2072                         return rc;
2073         }
2074
2075         dev->real_num_rx_queues = rxq;
2076         return 0;
2077 }
2078 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2079 #endif
2080
2081 /**
2082  * netif_get_num_default_rss_queues - default number of RSS queues
2083  *
2084  * This routine should set an upper limit on the number of RSS queues
2085  * used by default by multiqueue devices.
2086  */
2087 int netif_get_num_default_rss_queues(void)
2088 {
2089         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2090 }
2091 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2092
2093 static inline void __netif_reschedule(struct Qdisc *q)
2094 {
2095         struct softnet_data *sd;
2096         unsigned long flags;
2097
2098         local_irq_save(flags);
2099         sd = &__get_cpu_var(softnet_data);
2100         q->next_sched = NULL;
2101         *sd->output_queue_tailp = q;
2102         sd->output_queue_tailp = &q->next_sched;
2103         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2104         local_irq_restore(flags);
2105 }
2106
2107 void __netif_schedule(struct Qdisc *q)
2108 {
2109         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2110                 __netif_reschedule(q);
2111 }
2112 EXPORT_SYMBOL(__netif_schedule);
2113
2114 void dev_kfree_skb_irq(struct sk_buff *skb)
2115 {
2116         if (atomic_dec_and_test(&skb->users)) {
2117                 struct softnet_data *sd;
2118                 unsigned long flags;
2119
2120                 local_irq_save(flags);
2121                 sd = &__get_cpu_var(softnet_data);
2122                 skb->next = sd->completion_queue;
2123                 sd->completion_queue = skb;
2124                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2125                 local_irq_restore(flags);
2126         }
2127 }
2128 EXPORT_SYMBOL(dev_kfree_skb_irq);
2129
2130 void dev_kfree_skb_any(struct sk_buff *skb)
2131 {
2132         if (in_irq() || irqs_disabled())
2133                 dev_kfree_skb_irq(skb);
2134         else
2135                 dev_kfree_skb(skb);
2136 }
2137 EXPORT_SYMBOL(dev_kfree_skb_any);
2138
2139
2140 /**
2141  * netif_device_detach - mark device as removed
2142  * @dev: network device
2143  *
2144  * Mark device as removed from system and therefore no longer available.
2145  */
2146 void netif_device_detach(struct net_device *dev)
2147 {
2148         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2149             netif_running(dev)) {
2150                 netif_tx_stop_all_queues(dev);
2151         }
2152 }
2153 EXPORT_SYMBOL(netif_device_detach);
2154
2155 /**
2156  * netif_device_attach - mark device as attached
2157  * @dev: network device
2158  *
2159  * Mark device as attached from system and restart if needed.
2160  */
2161 void netif_device_attach(struct net_device *dev)
2162 {
2163         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2164             netif_running(dev)) {
2165                 netif_tx_wake_all_queues(dev);
2166                 __netdev_watchdog_up(dev);
2167         }
2168 }
2169 EXPORT_SYMBOL(netif_device_attach);
2170
2171 static void skb_warn_bad_offload(const struct sk_buff *skb)
2172 {
2173         static const netdev_features_t null_features = 0;
2174         struct net_device *dev = skb->dev;
2175         const char *driver = "";
2176
2177         if (!net_ratelimit())
2178                 return;
2179
2180         if (dev && dev->dev.parent)
2181                 driver = dev_driver_string(dev->dev.parent);
2182
2183         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2184              "gso_type=%d ip_summed=%d\n",
2185              driver, dev ? &dev->features : &null_features,
2186              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2187              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2188              skb_shinfo(skb)->gso_type, skb->ip_summed);
2189 }
2190
2191 /*
2192  * Invalidate hardware checksum when packet is to be mangled, and
2193  * complete checksum manually on outgoing path.
2194  */
2195 int skb_checksum_help(struct sk_buff *skb)
2196 {
2197         __wsum csum;
2198         int ret = 0, offset;
2199
2200         if (skb->ip_summed == CHECKSUM_COMPLETE)
2201                 goto out_set_summed;
2202
2203         if (unlikely(skb_shinfo(skb)->gso_size)) {
2204                 skb_warn_bad_offload(skb);
2205                 return -EINVAL;
2206         }
2207
2208         /* Before computing a checksum, we should make sure no frag could
2209          * be modified by an external entity : checksum could be wrong.
2210          */
2211         if (skb_has_shared_frag(skb)) {
2212                 ret = __skb_linearize(skb);
2213                 if (ret)
2214                         goto out;
2215         }
2216
2217         offset = skb_checksum_start_offset(skb);
2218         BUG_ON(offset >= skb_headlen(skb));
2219         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2220
2221         offset += skb->csum_offset;
2222         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2223
2224         if (skb_cloned(skb) &&
2225             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2226                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2227                 if (ret)
2228                         goto out;
2229         }
2230
2231         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2232 out_set_summed:
2233         skb->ip_summed = CHECKSUM_NONE;
2234 out:
2235         return ret;
2236 }
2237 EXPORT_SYMBOL(skb_checksum_help);
2238
2239 __be16 skb_network_protocol(struct sk_buff *skb)
2240 {
2241         __be16 type = skb->protocol;
2242         int vlan_depth = ETH_HLEN;
2243
2244         /* Tunnel gso handlers can set protocol to ethernet. */
2245         if (type == htons(ETH_P_TEB)) {
2246                 struct ethhdr *eth;
2247
2248                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2249                         return 0;
2250
2251                 eth = (struct ethhdr *)skb_mac_header(skb);
2252                 type = eth->h_proto;
2253         }
2254
2255         while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2256                 struct vlan_hdr *vh;
2257
2258                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2259                         return 0;
2260
2261                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2262                 type = vh->h_vlan_encapsulated_proto;
2263                 vlan_depth += VLAN_HLEN;
2264         }
2265
2266         return type;
2267 }
2268
2269 /**
2270  *      skb_mac_gso_segment - mac layer segmentation handler.
2271  *      @skb: buffer to segment
2272  *      @features: features for the output path (see dev->features)
2273  */
2274 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2275                                     netdev_features_t features)
2276 {
2277         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2278         struct packet_offload *ptype;
2279         __be16 type = skb_network_protocol(skb);
2280
2281         if (unlikely(!type))
2282                 return ERR_PTR(-EINVAL);
2283
2284         __skb_pull(skb, skb->mac_len);
2285
2286         rcu_read_lock();
2287         list_for_each_entry_rcu(ptype, &offload_base, list) {
2288                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2289                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2290                                 int err;
2291
2292                                 err = ptype->callbacks.gso_send_check(skb);
2293                                 segs = ERR_PTR(err);
2294                                 if (err || skb_gso_ok(skb, features))
2295                                         break;
2296                                 __skb_push(skb, (skb->data -
2297                                                  skb_network_header(skb)));
2298                         }
2299                         segs = ptype->callbacks.gso_segment(skb, features);
2300                         break;
2301                 }
2302         }
2303         rcu_read_unlock();
2304
2305         __skb_push(skb, skb->data - skb_mac_header(skb));
2306
2307         return segs;
2308 }
2309 EXPORT_SYMBOL(skb_mac_gso_segment);
2310
2311
2312 /* openvswitch calls this on rx path, so we need a different check.
2313  */
2314 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2315 {
2316         if (tx_path)
2317                 return skb->ip_summed != CHECKSUM_PARTIAL;
2318         else
2319                 return skb->ip_summed == CHECKSUM_NONE;
2320 }
2321
2322 /**
2323  *      __skb_gso_segment - Perform segmentation on skb.
2324  *      @skb: buffer to segment
2325  *      @features: features for the output path (see dev->features)
2326  *      @tx_path: whether it is called in TX path
2327  *
2328  *      This function segments the given skb and returns a list of segments.
2329  *
2330  *      It may return NULL if the skb requires no segmentation.  This is
2331  *      only possible when GSO is used for verifying header integrity.
2332  */
2333 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2334                                   netdev_features_t features, bool tx_path)
2335 {
2336         if (unlikely(skb_needs_check(skb, tx_path))) {
2337                 int err;
2338
2339                 skb_warn_bad_offload(skb);
2340
2341                 if (skb_header_cloned(skb) &&
2342                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2343                         return ERR_PTR(err);
2344         }
2345
2346         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2347         skb_reset_mac_header(skb);
2348         skb_reset_mac_len(skb);
2349
2350         return skb_mac_gso_segment(skb, features);
2351 }
2352 EXPORT_SYMBOL(__skb_gso_segment);
2353
2354 /* Take action when hardware reception checksum errors are detected. */
2355 #ifdef CONFIG_BUG
2356 void netdev_rx_csum_fault(struct net_device *dev)
2357 {
2358         if (net_ratelimit()) {
2359                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2360                 dump_stack();
2361         }
2362 }
2363 EXPORT_SYMBOL(netdev_rx_csum_fault);
2364 #endif
2365
/* This check could be eliminated as soon as we know that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 if @skb has a page fragment @dev should not be given: a
 * highmem page while the device lacks NETIF_F_HIGHDMA, or, when
 * PCI_DMA_BUS_IS_PHYS, a page for which the parent device has no DMA mask
 * or whose last byte lies above that mask.  Returns 0 otherwise, and
 * always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *pdev;
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			if (PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[i])))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	pdev = dev->dev.parent;
	if (!pdev)
		return 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		if (!pdev->dma_mask)
			return 1;
		if (addr + PAGE_SIZE - 1 > *pdev->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2398
/*
 * Layout placed over skb->cb by DEV_GSO_CB(): holds the skb destructor
 * that dev_gso_segment() saves before installing dev_gso_skb_destructor.
 */
struct dev_gso_cb {
        void (*destructor)(struct sk_buff *skb);
};

/* Interpret an skb's cb area as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2404
2405 static void dev_gso_skb_destructor(struct sk_buff *skb)
2406 {
2407         struct dev_gso_cb *cb;
2408
2409         do {
2410                 struct sk_buff *nskb = skb->next;
2411
2412                 skb->next = nskb->next;
2413                 nskb->next = NULL;
2414                 kfree_skb(nskb);
2415         } while (skb->next);
2416
2417         cb = DEV_GSO_CB(skb);
2418         if (cb->destructor)
2419                 cb->destructor(skb);
2420 }
2421
2422 /**
2423  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2424  *      @skb: buffer to segment
2425  *      @features: device features as applicable to this skb
2426  *
2427  *      This function segments the given skb and stores the list of segments
2428  *      in skb->next.
2429  */
2430 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2431 {
2432         struct sk_buff *segs;
2433
2434         segs = skb_gso_segment(skb, features);
2435
2436         /* Verifying header integrity only. */
2437         if (!segs)
2438                 return 0;
2439
2440         if (IS_ERR(segs))
2441                 return PTR_ERR(segs);
2442
2443         skb->next = segs;
2444         DEV_GSO_CB(skb)->destructor = skb->destructor;
2445         skb->destructor = dev_gso_skb_destructor;
2446
2447         return 0;
2448 }
2449
2450 static netdev_features_t harmonize_features(struct sk_buff *skb,
2451         __be16 protocol, netdev_features_t features)
2452 {
2453         if (skb->ip_summed != CHECKSUM_NONE &&
2454             !can_checksum_protocol(features, protocol)) {
2455                 features &= ~NETIF_F_ALL_CSUM;
2456         } else if (illegal_highdma(skb->dev, skb)) {
2457                 features &= ~NETIF_F_SG;
2458         }
2459
2460         return features;
2461 }
2462
2463 netdev_features_t netif_skb_features(struct sk_buff *skb)
2464 {
2465         __be16 protocol = skb->protocol;
2466         netdev_features_t features = skb->dev->features;
2467
2468         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2469                 features &= ~NETIF_F_GSO_MASK;
2470
2471         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2472                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2473                 protocol = veh->h_vlan_encapsulated_proto;
2474         } else if (!vlan_tx_tag_present(skb)) {
2475                 return harmonize_features(skb, protocol, features);
2476         }
2477
2478         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2479                                                NETIF_F_HW_VLAN_STAG_TX);
2480
2481         if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) {
2482                 return harmonize_features(skb, protocol, features);
2483         } else {
2484                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2485                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2486                                 NETIF_F_HW_VLAN_STAG_TX;
2487                 return harmonize_features(skb, protocol, features);
2488         }
2489 }
2490 EXPORT_SYMBOL(netif_skb_features);
2491
2492 /*
2493  * Returns true if either:
2494  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2495  *      2. skb is fragmented and the device does not support SG.
2496  */
2497 static inline int skb_needs_linearize(struct sk_buff *skb,
2498                                       netdev_features_t features)
2499 {
2500         return skb_is_nonlinear(skb) &&
2501                         ((skb_has_frag_list(skb) &&
2502                                 !(features & NETIF_F_FRAGLIST)) ||
2503                         (skb_shinfo(skb)->nr_frags &&
2504                                 !(features & NETIF_F_SG)));
2505 }
2506
2507 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2508                         struct netdev_queue *txq)
2509 {
2510         const struct net_device_ops *ops = dev->netdev_ops;
2511         int rc = NETDEV_TX_OK;
2512         unsigned int skb_len;
2513
2514         if (likely(!skb->next)) {
2515                 netdev_features_t features;
2516
2517                 /*
2518                  * If device doesn't need skb->dst, release it right now while
2519                  * its hot in this cpu cache
2520                  */
2521                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2522                         skb_dst_drop(skb);
2523
2524                 features = netif_skb_features(skb);
2525
2526                 if (vlan_tx_tag_present(skb) &&
2527                     !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2528                         skb = __vlan_put_tag(skb, skb->vlan_proto,
2529                                              vlan_tx_tag_get(skb));
2530                         if (unlikely(!skb))
2531                                 goto out;
2532
2533                         skb->vlan_tci = 0;
2534                 }
2535
2536                 /* If encapsulation offload request, verify we are testing
2537                  * hardware encapsulation features instead of standard
2538                  * features for the netdev
2539                  */
2540                 if (skb->encapsulation)
2541                         features &= dev->hw_enc_features;
2542
2543                 if (netif_needs_gso(skb, features)) {
2544                         if (unlikely(dev_gso_segment(skb, features)))
2545                                 goto out_kfree_skb;
2546                         if (skb->next)
2547                                 goto gso;
2548                 } else {
2549                         if (skb_needs_linearize(skb, features) &&
2550                             __skb_linearize(skb))
2551                                 goto out_kfree_skb;
2552
2553                         /* If packet is not checksummed and device does not
2554                          * support checksumming for this protocol, complete
2555                          * checksumming here.
2556                          */
2557                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2558                                 if (skb->encapsulation)
2559                                         skb_set_inner_transport_header(skb,
2560                                                 skb_checksum_start_offset(skb));
2561                                 else
2562                                         skb_set_transport_header(skb,
2563                                                 skb_checksum_start_offset(skb));
2564                                 if (!(features & NETIF_F_ALL_CSUM) &&
2565                                      skb_checksum_help(skb))
2566                                         goto out_kfree_skb;
2567                         }
2568                 }
2569
2570                 if (!list_empty(&ptype_all))
2571                         dev_queue_xmit_nit(skb, dev);
2572
2573                 skb_len = skb->len;
2574                 rc = ops->ndo_start_xmit(skb, dev);
2575                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2576                 if (rc == NETDEV_TX_OK)
2577                         txq_trans_update(txq);
2578                 return rc;
2579         }
2580
2581 gso:
2582         do {
2583                 struct sk_buff *nskb = skb->next;
2584
2585                 skb->next = nskb->next;
2586                 nskb->next = NULL;
2587
2588                 if (!list_empty(&ptype_all))
2589                         dev_queue_xmit_nit(nskb, dev);
2590
2591                 skb_len = nskb->len;
2592                 rc = ops->ndo_start_xmit(nskb, dev);
2593                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2594                 if (unlikely(rc != NETDEV_TX_OK)) {
2595                         if (rc & ~NETDEV_TX_MASK)
2596                                 goto out_kfree_gso_skb;
2597                         nskb->next = skb->next;
2598                         skb->next = nskb;
2599                         return rc;
2600                 }
2601                 txq_trans_update(txq);
2602                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2603                         return NETDEV_TX_BUSY;
2604         } while (skb->next);
2605
2606 out_kfree_gso_skb:
2607         if (likely(skb->next == NULL)) {
2608                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2609                 consume_skb(skb);
2610                 return rc;
2611         }
2612 out_kfree_skb:
2613         kfree_skb(skb);
2614 out:
2615         return rc;
2616 }
2617
2618 static void qdisc_pkt_len_init(struct sk_buff *skb)
2619 {
2620         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2621
2622         qdisc_skb_cb(skb)->pkt_len = skb->len;
2623
2624         /* To get more precise estimation of bytes sent on wire,
2625          * we add to pkt_len the headers size of all segments
2626          */
2627         if (shinfo->gso_size)  {
2628                 unsigned int hdr_len;
2629                 u16 gso_segs = shinfo->gso_segs;
2630
2631                 /* mac layer + network layer */
2632                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2633
2634                 /* + transport layer */
2635                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2636                         hdr_len += tcp_hdrlen(skb);
2637                 else
2638                         hdr_len += sizeof(struct udphdr);
2639
2640                 if (shinfo->gso_type & SKB_GSO_DODGY)
2641                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2642                                                 shinfo->gso_size);
2643
2644                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2645         }
2646 }
2647
2648 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2649                                  struct net_device *dev,
2650                                  struct netdev_queue *txq)
2651 {
2652         spinlock_t *root_lock = qdisc_lock(q);
2653         bool contended;
2654         int rc;
2655
2656         qdisc_pkt_len_init(skb);
2657         qdisc_calculate_pkt_len(skb, q);
2658         /*
2659          * Heuristic to force contended enqueues to serialize on a
2660          * separate lock before trying to get qdisc main lock.
2661          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2662          * and dequeue packets faster.
2663          */
2664         contended = qdisc_is_running(q);
2665         if (unlikely(contended))
2666                 spin_lock(&q->busylock);
2667
2668         spin_lock(root_lock);
2669         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2670                 kfree_skb(skb);
2671                 rc = NET_XMIT_DROP;
2672         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2673                    qdisc_run_begin(q)) {
2674                 /*
2675                  * This is a work-conserving queue; there are no old skbs
2676                  * waiting to be sent out; and the qdisc is not running -
2677                  * xmit the skb directly.
2678                  */
2679                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2680                         skb_dst_force(skb);
2681
2682                 qdisc_bstats_update(q, skb);
2683
2684                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2685                         if (unlikely(contended)) {
2686                                 spin_unlock(&q->busylock);
2687                                 contended = false;
2688                         }
2689                         __qdisc_run(q);
2690                 } else
2691                         qdisc_run_end(q);
2692
2693                 rc = NET_XMIT_SUCCESS;
2694         } else {
2695                 skb_dst_force(skb);
2696                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2697                 if (qdisc_run_begin(q)) {
2698                         if (unlikely(contended)) {
2699                                 spin_unlock(&q->busylock);
2700                                 contended = false;
2701                         }
2702                         __qdisc_run(q);
2703                 }
2704         }
2705         spin_unlock(root_lock);
2706         if (unlikely(contended))
2707                 spin_unlock(&q->busylock);
2708         return rc;
2709 }
2710
2711 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2712 static void skb_update_prio(struct sk_buff *skb)
2713 {
2714         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2715
2716         if (!skb->priority && skb->sk && map) {
2717                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2718
2719                 if (prioidx < map->priomap_len)
2720                         skb->priority = map->priomap[prioidx];
2721         }
2722 }
2723 #else
2724 #define skb_update_prio(skb)
2725 #endif
2726
2727 static DEFINE_PER_CPU(int, xmit_recursion);
2728 #define RECURSION_LIMIT 10
2729
2730 /**
2731  *      dev_loopback_xmit - loop back @skb
2732  *      @skb: buffer to transmit
2733  */
2734 int dev_loopback_xmit(struct sk_buff *skb)
2735 {
2736         skb_reset_mac_header(skb);
2737         __skb_pull(skb, skb_network_offset(skb));
2738         skb->pkt_type = PACKET_LOOPBACK;
2739         skb->ip_summed = CHECKSUM_UNNECESSARY;
2740         WARN_ON(!skb_dst(skb));
2741         skb_dst_force(skb);
2742         netif_rx_ni(skb);
2743         return 0;
2744 }
2745 EXPORT_SYMBOL(dev_loopback_xmit);
2746
2747 /**
2748  *      dev_queue_xmit - transmit a buffer
2749  *      @skb: buffer to transmit
2750  *
2751  *      Queue a buffer for transmission to a network device. The caller must
2752  *      have set the device and priority and built the buffer before calling
2753  *      this function. The function can be called from an interrupt.
2754  *
2755  *      A negative errno code is returned on a failure. A success does not
2756  *      guarantee the frame will be transmitted as it may be dropped due
2757  *      to congestion or traffic shaping.
2758  *
2759  * -----------------------------------------------------------------------------------
2760  *      I notice this method can also return errors from the queue disciplines,
2761  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2762  *      be positive.
2763  *
2764  *      Regardless of the return value, the skb is consumed, so it is currently
2765  *      difficult to retry a send to this method.  (You can bump the ref count
2766  *      before sending to hold a reference for retry if you are careful.)
2767  *
2768  *      When calling this method, interrupts MUST be enabled.  This is because
2769  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2770  *          --BLG
2771  */
2772 int dev_queue_xmit(struct sk_buff *skb)
2773 {
2774         struct net_device *dev = skb->dev;
2775         struct netdev_queue *txq;
2776         struct Qdisc *q;
2777         int rc = -ENOMEM;
2778
2779         skb_reset_mac_header(skb);
2780
2781         /* Disable soft irqs for various locks below. Also
2782          * stops preemption for RCU.
2783          */
2784         rcu_read_lock_bh();
2785
2786         skb_update_prio(skb);
2787
2788         txq = netdev_pick_tx(dev, skb);
2789         q = rcu_dereference_bh(txq->qdisc);
2790
2791 #ifdef CONFIG_NET_CLS_ACT
2792         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2793 #endif
2794         trace_net_dev_queue(skb);
2795         if (q->enqueue) {
2796                 rc = __dev_xmit_skb(skb, q, dev, txq);
2797                 goto out;
2798         }
2799
2800         /* The device has no queue. Common case for software devices:
2801            loopback, all the sorts of tunnels...
2802
2803            Really, it is unlikely that netif_tx_lock protection is necessary
2804            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2805            counters.)
2806            However, it is possible, that they rely on protection
2807            made by us here.
2808
2809            Check this and shot the lock. It is not prone from deadlocks.
2810            Either shot noqueue qdisc, it is even simpler 8)
2811          */
2812         if (dev->flags & IFF_UP) {
2813                 int cpu = smp_processor_id(); /* ok because BHs are off */
2814
2815                 if (txq->xmit_lock_owner != cpu) {
2816
2817                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2818                                 goto recursion_alert;
2819
2820                         HARD_TX_LOCK(dev, txq, cpu);
2821
2822                         if (!netif_xmit_stopped(txq)) {
2823                                 __this_cpu_inc(xmit_recursion);
2824                                 rc = dev_hard_start_xmit(skb, dev, txq);
2825                                 __this_cpu_dec(xmit_recursion);
2826                                 if (dev_xmit_complete(rc)) {
2827                                         HARD_TX_UNLOCK(dev, txq);
2828                                         goto out;
2829                                 }
2830                         }
2831                         HARD_TX_UNLOCK(dev, txq);
2832                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2833                                              dev->name);
2834                 } else {
2835                         /* Recursion is detected! It is possible,
2836                          * unfortunately
2837                          */
2838 recursion_alert:
2839                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2840                                              dev->name);
2841                 }
2842         }
2843
2844         rc = -ENETDOWN;
2845         rcu_read_unlock_bh();
2846
2847         kfree_skb(skb);
2848         return rc;
2849 out:
2850         rcu_read_unlock_bh();
2851         return rc;
2852 }
2853 EXPORT_SYMBOL(dev_queue_xmit);
2854
2855
2856 /*=======================================================================
2857                         Receiver routines
2858   =======================================================================*/
2859
2860 int netdev_max_backlog __read_mostly = 1000;
2861 EXPORT_SYMBOL(netdev_max_backlog);
2862
2863 int netdev_tstamp_prequeue __read_mostly = 1;
2864 int netdev_budget __read_mostly = 300;
2865 int weight_p __read_mostly = 64;            /* old backlog weight */
2866
2867 /* Called with irq disabled */
2868 static inline void ____napi_schedule(struct softnet_data *sd,
2869                                      struct napi_struct *napi)
2870 {
2871         list_add_tail(&napi->poll_list, &sd->poll_list);
2872         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2873 }
2874
2875 #ifdef CONFIG_RPS
2876
2877 /* One global table that all flow-based protocols share. */
2878 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2879 EXPORT_SYMBOL(rps_sock_flow_table);
2880
2881 struct static_key rps_needed __read_mostly;
2882
2883 static struct rps_dev_flow *
2884 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2885             struct rps_dev_flow *rflow, u16 next_cpu)
2886 {
2887         if (next_cpu != RPS_NO_CPU) {
2888 #ifdef CONFIG_RFS_ACCEL
2889                 struct netdev_rx_queue *rxqueue;
2890                 struct rps_dev_flow_table *flow_table;
2891                 struct rps_dev_flow *old_rflow;
2892                 u32 flow_id;
2893                 u16 rxq_index;
2894                 int rc;
2895
2896                 /* Should we steer this flow to a different hardware queue? */
2897                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2898                     !(dev->features & NETIF_F_NTUPLE))
2899                         goto out;
2900                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2901                 if (rxq_index == skb_get_rx_queue(skb))
2902                         goto out;
2903
2904                 rxqueue = dev->_rx + rxq_index;
2905                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2906                 if (!flow_table)
2907                         goto out;
2908                 flow_id = skb->rxhash & flow_table->mask;
2909                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2910                                                         rxq_index, flow_id);
2911                 if (rc < 0)
2912                         goto out;
2913                 old_rflow = rflow;
2914                 rflow = &flow_table->flows[flow_id];
2915                 rflow->filter = rc;
2916                 if (old_rflow->filter == rflow->filter)
2917                         old_rflow->filter = RPS_NO_FILTER;
2918         out:
2919 #endif
2920                 rflow->last_qtail =
2921                         per_cpu(softnet_data, next_cpu).input_queue_head;
2922         }
2923
2924         rflow->cpu = next_cpu;
2925         return rflow;
2926 }
2927
2928 /*
2929  * get_rps_cpu is called from netif_receive_skb and returns the target
2930  * CPU from the RPS map of the receiving queue for a given skb.
2931  * rcu_read_lock must be held on entry.
2932  */
2933 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2934                        struct rps_dev_flow **rflowp)
2935 {
2936         struct netdev_rx_queue *rxqueue;
2937         struct rps_map *map;
2938         struct rps_dev_flow_table *flow_table;
2939         struct rps_sock_flow_table *sock_flow_table;
2940         int cpu = -1;
2941         u16 tcpu;
2942
2943         if (skb_rx_queue_recorded(skb)) {
2944                 u16 index = skb_get_rx_queue(skb);
2945                 if (unlikely(index >= dev->real_num_rx_queues)) {
2946                         WARN_ONCE(dev->real_num_rx_queues > 1,
2947                                   "%s received packet on queue %u, but number "
2948                                   "of RX queues is %u\n",
2949                                   dev->name, index, dev->real_num_rx_queues);
2950                         goto done;
2951                 }
2952                 rxqueue = dev->_rx + index;
2953         } else
2954                 rxqueue = dev->_rx;
2955
2956         map = rcu_dereference(rxqueue->rps_map);
2957         if (map) {
2958                 if (map->len == 1 &&
2959                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
2960                         tcpu = map->cpus[0];
2961                         if (cpu_online(tcpu))
2962                                 cpu = tcpu;
2963                         goto done;
2964                 }
2965         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2966                 goto done;
2967         }
2968
2969         skb_reset_network_header(skb);
2970         if (!skb_get_rxhash(skb))
2971                 goto done;
2972
2973         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2974         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2975         if (flow_table && sock_flow_table) {
2976                 u16 next_cpu;
2977                 struct rps_dev_flow *rflow;
2978
2979                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2980                 tcpu = rflow->cpu;
2981
2982                 next_cpu = sock_flow_table->ents[skb->rxhash &
2983                     sock_flow_table->mask];
2984
2985                 /*
2986                  * If the desired CPU (where last recvmsg was done) is
2987                  * different from current CPU (one in the rx-queue flow
2988                  * table entry), switch if one of the following holds:
2989                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2990                  *   - Current CPU is offline.
2991                  *   - The current CPU's queue tail has advanced beyond the
2992                  *     last packet that was enqueued using this table entry.
2993                  *     This guarantees that all previous packets for the flow
2994                  *     have been dequeued, thus preserving in order delivery.
2995                  */
2996                 if (unlikely(tcpu != next_cpu) &&
2997                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2998                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2999                       rflow->last_qtail)) >= 0)) {
3000                         tcpu = next_cpu;
3001                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3002                 }
3003
3004                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3005                         *rflowp = rflow;
3006                         cpu = tcpu;
3007                         goto done;
3008                 }
3009         }
3010
3011         if (map) {
3012                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3013
3014                 if (cpu_online(tcpu)) {
3015                         cpu = tcpu;
3016                         goto done;
3017                 }
3018         }
3019
3020 done:
3021         return cpu;
3022 }
3023
3024 #ifdef CONFIG_RFS_ACCEL
3025
3026 /**
3027  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3028  * @dev: Device on which the filter was set
3029  * @rxq_index: RX queue index
3030  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3031  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3032  *
3033  * Drivers that implement ndo_rx_flow_steer() should periodically call
3034  * this function for each installed filter and remove the filters for
3035  * which it returns %true.
3036  */
3037 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3038                          u32 flow_id, u16 filter_id)
3039 {
3040         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3041         struct rps_dev_flow_table *flow_table;
3042         struct rps_dev_flow *rflow;
3043         bool expire = true;
3044         int cpu;
3045
3046         rcu_read_lock();
3047         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3048         if (flow_table && flow_id <= flow_table->mask) {
3049                 rflow = &flow_table->flows[flow_id];
3050                 cpu = ACCESS_ONCE(rflow->cpu);
3051                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3052                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3053                            rflow->last_qtail) <
3054                      (int)(10 * flow_table->mask)))
3055                         expire = false;
3056         }
3057         rcu_read_unlock();
3058         return expire;
3059 }
3060 EXPORT_SYMBOL(rps_may_expire_flow);
3061
3062 #endif /* CONFIG_RFS_ACCEL */
3063
3064 /* Called from hardirq (IPI) context */
3065 static void rps_trigger_softirq(void *data)
3066 {
3067         struct softnet_data *sd = data;
3068
3069         ____napi_schedule(sd, &sd->backlog);
3070         sd->received_rps++;
3071 }
3072
3073 #endif /* CONFIG_RPS */
3074
/*
 * Check whether @sd belongs to another CPU.
 * If so, chain it onto this CPU's IPI list, raise NET_RX_SOFTIRQ and
 * return 1; otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = &__get_cpu_var(softnet_data);

	if (sd == local)
		return 0;

	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3095
3096 #ifdef CONFIG_NET_FLOW_LIMIT
3097 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3098 #endif
3099
3100 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3101 {
3102 #ifdef CONFIG_NET_FLOW_LIMIT
3103         struct sd_flow_limit *fl;
3104         struct softnet_data *sd;
3105         unsigned int old_flow, new_flow;
3106
3107         if (qlen < (netdev_max_backlog >> 1))
3108                 return false;
3109
3110         sd = &__get_cpu_var(softnet_data);
3111
3112         rcu_read_lock();
3113         fl = rcu_dereference(sd->flow_limit);
3114         if (fl) {
3115                 new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
3116                 old_flow = fl->history[fl->history_head];
3117                 fl->history[fl->history_head] = new_flow;
3118
3119                 fl->history_head++;
3120                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3121
3122                 if (likely(fl->buckets[old_flow]))
3123                         fl->buckets[old_flow]--;
3124
3125                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3126                         fl->count++;
3127                         rcu_read_unlock();
3128                         return true;
3129                 }
3130         }
3131         rcu_read_unlock();
3132 #endif
3133         return false;
3134 }
3135
3136 /*
3137  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3138  * queue (may be a remote CPU queue).
3139  */
3140 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3141                               unsigned int *qtail)
3142 {
3143         struct softnet_data *sd;
3144         unsigned long flags;
3145         unsigned int qlen;
3146
3147         sd = &per_cpu(softnet_data, cpu);
3148
3149         local_irq_save(flags);
3150
3151         rps_lock(sd);
3152         qlen = skb_queue_len(&sd->input_pkt_queue);
3153         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3154                 if (skb_queue_len(&sd->input_pkt_queue)) {
3155 enqueue:
3156                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3157                         input_queue_tail_incr_save(sd, qtail);
3158                         rps_unlock(sd);
3159                         local_irq_restore(flags);
3160                         return NET_RX_SUCCESS;
3161                 }
3162
3163                 /* Schedule NAPI for backlog device
3164                  * We can use non atomic operation since we own the queue lock
3165                  */
3166                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3167                         if (!rps_ipi_queued(sd))
3168                                 ____napi_schedule(sd, &sd->backlog);
3169                 }
3170                 goto enqueue;
3171         }
3172
3173         sd->dropped++;
3174         rps_unlock(sd);
3175
3176         local_irq_restore(flags);
3177
3178         atomic_long_inc(&skb->dev->rx_dropped);
3179         kfree_skb(skb);
3180         return NET_RX_DROP;
3181 }
3182
3183 /**
3184  *      netif_rx        -       post buffer to the network code
3185  *      @skb: buffer to post
3186  *
3187  *      This function receives a packet from a device driver and queues it for
3188  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3189  *      may be dropped during processing for congestion control or by the
3190  *      protocol layers.
3191  *
3192  *      return values:
3193  *      NET_RX_SUCCESS  (no congestion)
3194  *      NET_RX_DROP     (packet was dropped)
3195  *
3196  */
3197
3198 int netif_rx(struct sk_buff *skb)
3199 {
3200         int ret;
3201
3202         /* if netpoll wants it, pretend we never saw it */
3203         if (netpoll_rx(skb))
3204                 return NET_RX_DROP;
3205
3206         net_timestamp_check(netdev_tstamp_prequeue, skb);
3207
3208         trace_netif_rx(skb);
3209 #ifdef CONFIG_RPS
3210         if (static_key_false(&rps_needed)) {
3211                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3212                 int cpu;
3213
3214                 preempt_disable();
3215                 rcu_read_lock();
3216
3217                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3218                 if (cpu < 0)
3219                         cpu = smp_processor_id();
3220
3221                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3222
3223                 rcu_read_unlock();
3224                 preempt_enable();
3225         } else
3226 #endif
3227         {
3228                 unsigned int qtail;
3229                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3230                 put_cpu();
3231         }
3232         return ret;
3233 }
3234 EXPORT_SYMBOL(netif_rx);
3235
/*
 * netif_rx_ni - netif_rx() followed by running pending softirqs
 * @skb: buffer to post
 *
 * Calls netif_rx() with preemption disabled and runs do_softirq() if a
 * softirq is pending afterwards.  Returns what netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int status;

	preempt_disable();

	status = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return status;
}
EXPORT_SYMBOL(netif_rx_ni);
3249
3250 static void net_tx_action(struct softirq_action *h)
3251 {
3252         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3253
3254         if (sd->completion_queue) {
3255                 struct sk_buff *clist;
3256
3257                 local_irq_disable();
3258                 clist = sd->completion_queue;
3259                 sd->completion_queue = NULL;
3260                 local_irq_enable();
3261
3262                 while (clist) {
3263                         struct sk_buff *skb = clist;
3264                         clist = clist->next;
3265
3266                         WARN_ON(atomic_read(&skb->users));
3267                         trace_kfree_skb(skb, net_tx_action);
3268                         __kfree_skb(skb);
3269                 }
3270         }
3271
3272         if (sd->output_queue) {
3273                 struct Qdisc *head;
3274
3275                 local_irq_disable();
3276                 head = sd->output_queue;
3277                 sd->output_queue = NULL;
3278                 sd->output_queue_tailp = &sd->output_queue;
3279                 local_irq_enable();
3280
3281                 while (head) {
3282                         struct Qdisc *q = head;
3283                         spinlock_t *root_lock;
3284
3285                         head = head->next_sched;
3286
3287                         root_lock = qdisc_lock(q);
3288                         if (spin_trylock(root_lock)) {
3289                                 smp_mb__before_clear_bit();
3290                                 clear_bit(__QDISC_STATE_SCHED,
3291                                           &q->state);
3292                                 qdisc_run(q);
3293                                 spin_unlock(root_lock);
3294                         } else {
3295                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3296                                               &q->state)) {
3297                                         __netif_reschedule(q);
3298                                 } else {
3299                                         smp_mb__before_clear_bit();
3300                                         clear_bit(__QDISC_STATE_SCHED,
3301                                                   &q->state);
3302                                 }
3303                         }
3304                 }
3305         }
3306 }
3307
3308 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3309     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3310 /* This hook is defined here for ATM LANE */
3311 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3312                              unsigned char *addr) __read_mostly;
3313 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3314 #endif
3315
3316 #ifdef CONFIG_NET_CLS_ACT
3317 /* TODO: Maybe we should just force sch_ingress to be compiled in
3318  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3319  * a compare and 2 stores extra right now if we dont have it on
3320  * but have CONFIG_NET_CLS_ACT
3321  * NOTE: This doesn't stop any functionality; if you dont have
3322  * the ingress scheduler, you just can't add policies on ingress.
3323  *
3324  */
3325 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3326 {
3327         struct net_device *dev = skb->dev;
3328         u32 ttl = G_TC_RTTL(skb->tc_verd);
3329         int result = TC_ACT_OK;
3330         struct Qdisc *q;
3331
3332         if (unlikely(MAX_RED_LOOP < ttl++)) {
3333                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3334                                      skb->skb_iif, dev->ifindex);
3335                 return TC_ACT_SHOT;
3336         }
3337
3338         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3339         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3340
3341         q = rxq->qdisc;
3342         if (q != &noop_qdisc) {
3343                 spin_lock(qdisc_lock(q));
3344                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3345                         result = qdisc_enqueue_root(skb, q);
3346                 spin_unlock(qdisc_lock(q));
3347         }
3348
3349         return result;
3350 }
3351
3352 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3353                                          struct packet_type **pt_prev,
3354                                          int *ret, struct net_device *orig_dev)
3355 {
3356         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3357
3358         if (!rxq || rxq->qdisc == &noop_qdisc)
3359                 goto out;
3360
3361         if (*pt_prev) {
3362                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3363                 *pt_prev = NULL;
3364         }
3365
3366         switch (ing_filter(skb, rxq)) {
3367         case TC_ACT_SHOT:
3368         case TC_ACT_STOLEN:
3369                 kfree_skb(skb);
3370                 return NULL;
3371         }
3372
3373 out:
3374         skb->tc_verd = 0;
3375         return skb;
3376 }
3377 #endif
3378
3379 /**
3380  *      netdev_rx_handler_register - register receive handler
3381  *      @dev: device to register a handler for
3382  *      @rx_handler: receive handler to register
3383  *      @rx_handler_data: data pointer that is used by rx handler
3384  *
3385  *      Register a receive hander for a device. This handler will then be
3386  *      called from __netif_receive_skb. A negative errno code is returned
3387  *      on a failure.
3388  *
3389  *      The caller must hold the rtnl_mutex.
3390  *
3391  *      For a general description of rx_handler, see enum rx_handler_result.
3392  */
3393 int netdev_rx_handler_register(struct net_device *dev,
3394                                rx_handler_func_t *rx_handler,
3395                                void *rx_handler_data)
3396 {
3397         ASSERT_RTNL();
3398
3399         if (dev->rx_handler)
3400                 return -EBUSY;
3401
3402         /* Note: rx_handler_data must be set before rx_handler */
3403         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3404         rcu_assign_pointer(dev->rx_handler, rx_handler);
3405
3406         return 0;
3407 }
3408 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3409
3410 /**
3411  *      netdev_rx_handler_unregister - unregister receive handler
3412  *      @dev: device to unregister a handler from
3413  *
3414  *      Unregister a receive handler from a device.
3415  *
3416  *      The caller must hold the rtnl_mutex.
3417  */
3418 void netdev_rx_handler_unregister(struct net_device *dev)
3419 {
3420
3421         ASSERT_RTNL();
3422         RCU_INIT_POINTER(dev->rx_handler, NULL);
3423         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3424          * section has a guarantee to see a non NULL rx_handler_data
3425          * as well.
3426          */
3427         synchronize_net();
3428         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3429 }
3430 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3431
3432 /*
3433  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3434  * the special handling of PFMEMALLOC skbs.
3435  */
3436 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3437 {
3438         switch (skb->protocol) {
3439         case __constant_htons(ETH_P_ARP):
3440         case __constant_htons(ETH_P_IP):
3441         case __constant_htons(ETH_P_IPV6):
3442         case __constant_htons(ETH_P_8021Q):
3443         case __constant_htons(ETH_P_8021AD):
3444                 return true;
3445         default:
3446                 return false;
3447         }
3448 }
3449
     /*
      * Core receive routine: hand @skb to the registered packet handlers.
      *
      * In order, the skb is offered to the ptype_all taps (skipped when
      * @pfmemalloc is true), run through handle_ing() when CONFIG_NET_CLS_ACT
      * is set, passed to vlan_do_receive() if a VLAN tag is present, given to
      * the device's rx_handler if one is registered, and finally matched
      * against the ptype_base protocol handlers.
      *
      * Delivery is deferred by one match: pt_prev remembers the most recent
      * matching handler, earlier matches are delivered through deliver_skb(),
      * and the last one is called directly through ->func at the end.
      *
      * When @pfmemalloc is true, an skb whose protocol is rejected by
      * skb_pfmemalloc_protocol() is dropped.
      *
      * Returns the result of the most recent delivery, NET_RX_SUCCESS when
      * the rx_handler consumed the skb, or NET_RX_DROP (the initial value,
      * also set explicitly on the drop path).
      */
3450 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3451 {
3452         struct packet_type *ptype, *pt_prev;
3453         rx_handler_func_t *rx_handler;
3454         struct net_device *orig_dev;
3455         struct net_device *null_or_dev;
3456         bool deliver_exact = false;
3457         int ret = NET_RX_DROP;
3458         __be16 type;
3459
3460         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3461
3462         trace_netif_receive_skb(skb);
3463
3464         /* if we've gotten here through NAPI, check netpoll */
3465         if (netpoll_receive_skb(skb))
3466                 goto out;
3467
             /* remember the device the skb arrived on; skb->dev may change below */
3468         orig_dev = skb->dev;
3469
3470         skb_reset_network_header(skb);
3471         if (!skb_transport_header_was_set(skb))
3472                 skb_reset_transport_header(skb);
3473         skb_reset_mac_len(skb);
3474
3475         pt_prev = NULL;
3476
3477         rcu_read_lock();
3478
     /*
      * Re-entry point: jumped to when vlan_do_receive() returns true or the
      * rx_handler returns RX_HANDLER_ANOTHER.
      */
3479 another_round:
3480         skb->skb_iif = skb->dev->ifindex;
3481
3482         __this_cpu_inc(softnet_data.processed);
3483
3484         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3485             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3486                 skb = vlan_untag(skb);
3487                 if (unlikely(!skb))
3488                         goto unlock;
3489         }
3490
3491 #ifdef CONFIG_NET_CLS_ACT
             /* TC_NCLS set: clear it and bypass both the taps and handle_ing() */
3492         if (skb->tc_verd & TC_NCLS) {
3493                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3494                 goto ncls;
3495         }
3496 #endif
3497
3498         if (pfmemalloc)
3499                 goto skip_taps;
3500
             /* taps: handlers bound to no device or to the current skb->dev */
3501         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3502                 if (!ptype->dev || ptype->dev == skb->dev) {
3503                         if (pt_prev)
3504                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3505                         pt_prev = ptype;
3506                 }
3507         }
3508
3509 skip_taps:
3510 #ifdef CONFIG_NET_CLS_ACT
3511         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3512         if (!skb)
3513                 goto unlock;
3514 ncls:
3515 #endif
3516
3517         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3518                 goto drop;
3519
3520         if (vlan_tx_tag_present(skb)) {
3521                 if (pt_prev) {
3522                         ret = deliver_skb(skb, pt_prev, orig_dev);
3523                         pt_prev = NULL;
3524                 }
3525                 if (vlan_do_receive(&skb))
3526                         goto another_round;
3527                 else if (unlikely(!skb))
3528                         goto unlock;
3529         }
3530
3531         rx_handler = rcu_dereference(skb->dev->rx_handler);
3532         if (rx_handler) {
3533                 if (pt_prev) {
3534                         ret = deliver_skb(skb, pt_prev, orig_dev);
3535                         pt_prev = NULL;
3536                 }
3537                 switch (rx_handler(&skb)) {
3538                 case RX_HANDLER_CONSUMED:
3539                         ret = NET_RX_SUCCESS;
3540                         goto unlock;
3541                 case RX_HANDLER_ANOTHER:
3542                         goto another_round;
3543                 case RX_HANDLER_EXACT:
3544                         deliver_exact = true;
                             /* fall through */
3545                 case RX_HANDLER_PASS:
3546                         break;
3547                 default:
3548                         BUG();
3549                 }
3550         }
3551
3552         if (vlan_tx_nonzero_tag_present(skb))
3553                 skb->pkt_type = PACKET_OTHERHOST;
3554
3555         /* deliver only exact match when indicated */
3556         null_or_dev = deliver_exact ? skb->dev : NULL;
3557
3558         type = skb->protocol;
3559         list_for_each_entry_rcu(ptype,
3560                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3561                 if (ptype->type == type &&
3562                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3563                      ptype->dev == orig_dev)) {
3564                         if (pt_prev)
3565                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3566                         pt_prev = ptype;
3567                 }
3568         }
3569
             /* the last matching handler is called directly via ->func */
3570         if (pt_prev) {
3571                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3572                         goto drop;
3573                 else
3574                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3575         } else {
     /* also reached by goto from the pfmemalloc and skb_orphan_frags() checks */
3576 drop:
3577                 atomic_long_inc(&skb->dev->rx_dropped);
3578                 kfree_skb(skb);
3579                 /* Jamal, now you will not able to escape explaining
3580                  * me how you were going to use this. :-)
3581                  */
3582                 ret = NET_RX_DROP;
3583         }
3584
3585 unlock:
3586         rcu_read_unlock();
3587 out:
3588         return ret;
3589 }
3590
3591 static int __netif_receive_skb(struct sk_buff *skb)
3592 {
3593         int ret;
3594
3595         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3596                 unsigned long pflags = current->flags;
3597
3598                 /*
3599                  * PFMEMALLOC skbs are special, they should
3600                  * - be delivered to SOCK_MEMALLOC sockets only
3601                  * - stay away from userspace
3602                  * - have bounded memory usage
3603                  *
3604                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3605                  * context down to all allocation sites.
3606                  */
3607                 current->flags |= PF_MEMALLOC;
3608                 ret = __netif_receive_skb_core(skb, true);
3609                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3610         } else
3611                 ret = __netif_receive_skb_core(skb, false);
3612
3613         return ret;
3614 }
3615
3616 /**
3617  *      netif_receive_skb - process receive buffer from network
3618  *      @skb: buffer to process
3619  *
3620  *      netif_receive_skb() is the main receive data processing function.
3621  *      It always succeeds. The buffer may be dropped during processing
3622  *      for congestion control or by the protocol layers.
3623  *
3624  *      This function may only be called from softirq context and interrupts
3625  *      should be enabled.
3626  *
3627  *      Return values (usually ignored):
3628  *      NET_RX_SUCCESS: no congestion
3629  *      NET_RX_DROP: packet was dropped
3630  */
3631 int netif_receive_skb(struct sk_buff *skb)
3632 {
3633         net_timestamp_check(netdev_tstamp_prequeue, skb);
3634
3635         if (skb_defer_rx_timestamp(skb))
3636                 return NET_RX_SUCCESS;
3637
3638 #ifdef CONFIG_RPS
3639         if (static_key_false(&rps_needed)) {
3640                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3641                 int cpu, ret;
3642
3643                 rcu_read_lock();
3644
3645                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3646
3647                 if (cpu >= 0) {
3648                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3649                         rcu_read_unlock();
3650                         return ret;
3651                 }
3652                 rcu_read_unlock();
3653         }
3654 #endif
3655         return __netif_receive_skb(skb);
3656 }
3657 EXPORT_SYMBOL(netif_receive_skb);
3658
3659 /* Network device is going away, flush any packets still pending
3660  * Called with irqs disabled.
3661  */
3662 static void flush_backlog(void *arg)
3663 {
3664         struct net_device *dev = arg;
3665         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3666         struct sk_buff *skb, *tmp;
3667
3668         rps_lock(sd);
3669         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3670                 if (skb->dev == dev) {
3671                         __skb_unlink(skb, &sd->input_pkt_queue);
3672                         kfree_skb(skb);
3673                         input_queue_head_incr(sd);
3674                 }
3675         }
3676         rps_unlock(sd);
3677
3678         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3679                 if (skb->dev == dev) {
3680                         __skb_unlink(skb, &sd->process_queue);
3681                         kfree_skb(skb);
3682                         input_queue_head_incr(sd);
3683                 }
3684         }
3685 }
3686
3687 static int napi_gro_complete(struct sk_buff *skb)
3688 {
3689         struct packet_offload *ptype;
3690         __be16 type = skb->protocol;
3691         struct list_head *head = &offload_base;
3692         int err = -ENOENT;
3693
3694         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3695
3696         if (NAPI_GRO_CB(skb)->count == 1) {
3697                 skb_shinfo(skb)->gso_size = 0;
3698                 goto out;
3699         }
3700
3701         rcu_read_lock();
3702         list_for_each_entry_rcu(ptype, head, list) {
3703                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3704                         continue;
3705
3706                 err = ptype->callbacks.gro_complete(skb);
3707                 break;
3708         }
3709         rcu_read_unlock();
3710
3711         if (err) {
3712                 WARN_ON(&ptype->list == head);
3713                 kfree_skb(skb);
3714                 return NET_RX_SUCCESS;
3715         }
3716
3717 out:
3718         return netif_receive_skb(skb);
3719 }
3720
3721 /* napi->gro_list contains packets ordered by age.
3722  * youngest packets at the head of it.
3723  * Complete skbs in reverse order to reduce latencies.
3724  */
3725 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3726 {
3727         struct sk_buff *skb, *prev = NULL;
3728
3729         /* scan list and build reverse chain */
3730         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3731                 skb->prev = prev;
3732                 prev = skb;
3733         }
3734
3735         for (skb = prev; skb; skb = prev) {
3736                 skb->next = NULL;
3737
3738                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3739                         return;
3740
3741                 prev = skb->prev;
3742                 napi_gro_complete(skb);
3743                 napi->gro_count--;
3744         }
3745
3746         napi->gro_list = NULL;
3747 }
3748 EXPORT_SYMBOL(napi_gro_flush);
3749
3750 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3751 {
3752         struct sk_buff *p;
3753         unsigned int maclen = skb->dev->hard_header_len;
3754
3755         for (p = napi->gro_list; p; p = p->next) {
3756                 unsigned long diffs;
3757
3758                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3759                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3760                 if (maclen == ETH_HLEN)
3761                         diffs |= compare_ether_header(skb_mac_header(p),
3762                                                       skb_gro_mac_header(skb));
3763                 else if (!diffs)
3764                         diffs = memcmp(skb_mac_header(p),
3765                                        skb_gro_mac_header(skb),
3766                                        maclen);
3767                 NAPI_GRO_CB(p)->same_flow = !diffs;
3768                 NAPI_GRO_CB(p)->flush = 0;
3769         }
3770 }
3771
     /*
      * Try to merge @skb into one of the packets held on napi->gro_list.
      *
      * Returns GRO_NORMAL when GRO is not attempted or not possible: the
      * device lacks NETIF_F_GRO, netpoll_rx_on() is true, the skb is already
      * GSO or has a frag list, no offload with a gro_receive callback matches
      * skb->protocol, or (for an skb that is not part of an existing flow) the
      * flush flag is set or the list already holds MAX_GRO_SKBS packets.
      * Returns GRO_MERGED or GRO_MERGED_FREE when same_flow is set after the
      * callback ran, and GRO_HELD when the skb was put at the head of
      * napi->gro_list as a new entry.
      */
3772 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3773 {
3774         struct sk_buff **pp = NULL;
3775         struct packet_offload *ptype;
3776         __be16 type = skb->protocol;
3777         struct list_head *head = &offload_base;
3778         int same_flow;
3779         enum gro_result ret;
3780
3781         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3782                 goto normal;
3783
3784         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3785                 goto normal;
3786
3787         gro_list_prepare(napi, skb);
3788
3789         rcu_read_lock();
3790         list_for_each_entry_rcu(ptype, head, list) {
3791                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3792                         continue;
3793
3794                 skb_set_network_header(skb, skb_gro_offset(skb));
3795                 skb_reset_mac_len(skb);
3796                 NAPI_GRO_CB(skb)->same_flow = 0;
3797                 NAPI_GRO_CB(skb)->flush = 0;
3798                 NAPI_GRO_CB(skb)->free = 0;
3799
3800                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3801                 break;
3802         }
3803         rcu_read_unlock();
3804
             /* the walk reached the list head again: no offload handled this protocol */
3805         if (&ptype->list == head)
3806                 goto normal;
3807
3808         same_flow = NAPI_GRO_CB(skb)->same_flow;
3809         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3810
             /* the callback returned a held packet: unlink it from the list and complete it */
3811         if (pp) {
3812                 struct sk_buff *nskb = *pp;
3813
3814                 *pp = nskb->next;
3815                 nskb->next = NULL;
3816                 napi_gro_complete(nskb);
3817                 napi->gro_count--;
3818         }
3819
3820         if (same_flow)
3821                 goto ok;
3822
3823         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3824                 goto normal;
3825
             /* hold the skb: it becomes the new head of napi->gro_list */
3826         napi->gro_count++;
3827         NAPI_GRO_CB(skb)->count = 1;
3828         NAPI_GRO_CB(skb)->age = jiffies;
3829         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3830         skb->next = napi->gro_list;
3831         napi->gro_list = skb;
3832         ret = GRO_HELD;
3833
     /*
      * Reached for GRO_HELD and (via "normal") GRO_NORMAL: if the linear area
      * is shorter than the GRO offset, copy the missing bytes from frag0 to
      * the tail of the linear area and trim frags[0] by the same amount.
      */
3834 pull:
3835         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3836                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3837
3838                 BUG_ON(skb->end - skb->tail < grow);
3839
3840                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3841
3842                 skb->tail += grow;
3843                 skb->data_len -= grow;
3844
3845                 skb_shinfo(skb)->frags[0].page_offset += grow;
3846                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3847
                     /* frags[0] is now empty: release it and shift the remaining frags down */
3848                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3849                         skb_frag_unref(skb, 0);
3850                         memmove(skb_shinfo(skb)->frags,
3851                                 skb_shinfo(skb)->frags + 1,
3852                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3853                 }
3854         }
3855
3856 ok:
3857         return ret;
3858
3859 normal:
3860         ret = GRO_NORMAL;
3861         goto pull;
3862 }
3863
3864
3865 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3866 {
3867         switch (ret) {
3868         case GRO_NORMAL:
3869                 if (netif_receive_skb(skb))
3870                         ret = GRO_DROP;
3871                 break;
3872
3873         case GRO_DROP:
3874                 kfree_skb(skb);
3875                 break;
3876
3877         case GRO_MERGED_FREE:
3878                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3879                         kmem_cache_free(skbuff_head_cache, skb);
3880                 else
3881                         __kfree_skb(skb);
3882                 break;
3883
3884         case GRO_HELD:
3885         case GRO_MERGED:
3886                 break;
3887         }
3888
3889         return ret;
3890 }
3891
3892 static void skb_gro_reset_offset(struct sk_buff *skb)
3893 {
3894         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3895         const skb_frag_t *frag0 = &pinfo->frags[0];
3896
3897         NAPI_GRO_CB(skb)->data_offset = 0;
3898         NAPI_GRO_CB(skb)->frag0 = NULL;
3899         NAPI_GRO_CB(skb)->frag0_len = 0;
3900
3901         if (skb->mac_header == skb->tail &&
3902             pinfo->nr_frags &&
3903             !PageHighMem(skb_frag_page(frag0))) {
3904                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3905                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3906         }
3907 }
3908
3909 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3910 {
3911         skb_gro_reset_offset(skb);
3912
3913         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3914 }
3915 EXPORT_SYMBOL(napi_gro_receive);
3916
3917 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3918 {
3919         __skb_pull(skb, skb_headlen(skb));
3920         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3921         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3922         skb->vlan_tci = 0;
3923         skb->dev = napi->dev;
3924         skb->skb_iif = 0;
3925
3926         napi->skb = skb;
3927 }
3928
3929 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3930 {
3931         struct sk_buff *skb = napi->skb;
3932
3933         if (!skb) {
3934                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3935                 if (skb)
3936                         napi->skb = skb;
3937         }
3938         return skb;
3939 }
3940 EXPORT_SYMBOL(napi_get_frags);
3941
3942 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3943                                gro_result_t ret)
3944 {
3945         switch (ret) {
3946         case GRO_NORMAL:
3947         case GRO_HELD:
3948                 skb->protocol = eth_type_trans(skb, skb->dev);
3949
3950                 if (ret == GRO_HELD)
3951                         skb_gro_pull(skb, -ETH_HLEN);
3952                 else if (netif_receive_skb(skb))
3953                         ret = GRO_DROP;
3954                 break;
3955
3956         case GRO_DROP:
3957         case GRO_MERGED_FREE:
3958                 napi_reuse_skb(napi, skb);
3959                 break;
3960
3961         case GRO_MERGED:
3962                 break;
3963         }
3964
3965         return ret;
3966 }
3967
3968 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3969 {
3970         struct sk_buff *skb = napi->skb;
3971         struct ethhdr *eth;
3972         unsigned int hlen;
3973         unsigned int off;
3974
3975         napi->skb = NULL;
3976
3977         skb_reset_mac_header(skb);
3978         skb_gro_reset_offset(skb);
3979
3980         off = skb_gro_offset(skb);
3981         hlen = off + sizeof(*eth);
3982         eth = skb_gro_header_fast(skb, off);
3983         if (skb_gro_header_hard(skb, hlen)) {
3984                 eth = skb_gro_header_slow(skb, hlen, off);
3985                 if (unlikely(!eth)) {
3986                         napi_reuse_skb(napi, skb);
3987                         skb = NULL;
3988                         goto out;
3989                 }
3990         }
3991
3992         skb_gro_pull(skb, sizeof(*eth));
3993
3994         /*
3995          * This works because the only protocols we care about don't require
3996          * special handling.  We'll fix it up properly at the end.
3997          */
3998         skb->protocol = eth->h_proto;
3999
4000 out:
4001         return skb;
4002 }
4003
4004 gro_result_t napi_gro_frags(struct napi_struct *napi)
4005 {
4006         struct sk_buff *skb = napi_frags_skb(napi);
4007
4008         if (!skb)
4009                 return GRO_DROP;
4010
4011         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4012 }
4013 EXPORT_SYMBOL(napi_gro_frags);
4014
/*
 * net_rps_action sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 *
 * Without CONFIG_RPS this only re-enables local interrupts.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *pending = sd->rps_ipi_list;

        if (pending) {
                /* detach the whole list while irqs are still off */
                sd->rps_ipi_list = NULL;

                local_irq_enable();

                /* Send pending IPI's to kick RPS processing on remote cpus. */
                do {
                        struct softnet_data *next = pending->rps_ipi_next;

                        if (cpu_online(pending->cpu))
                                __smp_call_function_single(pending->cpu,
                                                           &pending->csd, 0);
                        pending = next;
                } while (pending);

                return;
        }
#endif
        local_irq_enable();
}
4042
     /*
      * Poll routine for the backlog napi embedded in softnet_data
      * (sd is recovered from @napi through its ->backlog member).
      *
      * Feeds skbs from sd->process_queue to __netif_receive_skb() until @quota
      * packets have been handled, refilling process_queue from
      * sd->input_pkt_queue (under rps_lock) each time it runs empty.
      * Returns the number of skbs processed.
      */
4043 static int process_backlog(struct napi_struct *napi, int quota)
4044 {
4045         int work = 0;
4046         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4047
4048 #ifdef CONFIG_RPS
4049         /* Check if we have pending ipi, its better to send them now,
4050          * not waiting net_rx_action() end.
4051          */
4052         if (sd->rps_ipi_list) {
4053                 local_irq_disable();
4054                 net_rps_action_and_irq_enable(sd);
4055         }
4056 #endif
4057         napi->weight = weight_p;
4058         local_irq_disable();
4059         while (work < quota) {
4060                 struct sk_buff *skb;
4061                 unsigned int qlen;
4062
                     /* dequeue with irqs off, process each skb with irqs on */
4063                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4064                         local_irq_enable();
4065                         __netif_receive_skb(skb);
4066                         local_irq_disable();
4067                         input_queue_head_incr(sd);
4068                         if (++work >= quota) {
4069                                 local_irq_enable();
4070                                 return work;
4071                         }
4072                 }
4073
                     /* process_queue is empty: move over everything queued on input_pkt_queue */
4074                 rps_lock(sd);
4075                 qlen = skb_queue_len(&sd->input_pkt_queue);
4076                 if (qlen)
4077                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
4078                                                    &sd->process_queue);
4079
4080                 if (qlen < quota - work) {
4081                         /*
4082                          * Inline a custom version of __napi_complete().
4083                          * only current cpu owns and manipulates this napi,
4084                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4085                          * we can use a plain write instead of clear_bit(),
4086                          * and we dont need an smp_mb() memory barrier.
4087                          */
4088                         list_del(&napi->poll_list);
4089                         napi->state = 0;
4090
                             /* shrink the quota so the loop ends once the spliced skbs are done */
4091                         quota = work + qlen;
4092                 }
4093                 rps_unlock(sd);
4094         }
4095         local_irq_enable();
4096
4097         return work;
4098 }
4099
4100 /**
4101  * __napi_schedule - schedule for receive
4102  * @n: entry to schedule
4103  *
4104  * The entry's receive function will be scheduled to run
4105  */
4106 void __napi_schedule(struct napi_struct *n)
4107 {
4108         unsigned long flags;
4109
4110         local_irq_save(flags);
4111         ____napi_schedule(&__get_cpu_var(softnet_data), n);
4112         local_irq_restore(flags);
4113 }
4114 EXPORT_SYMBOL(__napi_schedule);
4115
4116 void __napi_complete(struct napi_struct *n)
4117 {
4118         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4119         BUG_ON(n->gro_list);
4120
4121         list_del(&n->poll_list);
4122         smp_mb__before_clear_bit();
4123         clear_bit(NAPI_STATE_SCHED, &n->state);
4124 }
4125 EXPORT_SYMBOL(__napi_complete);
4126
4127 void napi_complete(struct napi_struct *n)
4128 {
4129         unsigned long flags;
4130
4131         /*
4132          * don't let napi dequeue from the cpu poll list
4133          * just in case its running on a different cpu
4134          */
4135         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4136                 return;
4137
4138         napi_gro_flush(n, false);
4139         local_irq_save(flags);
4140         __napi_complete(n);
4141         local_irq_restore(flags);
4142 }
4143 EXPORT_SYMBOL(napi_complete);
4144
4145 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4146                     int (*poll)(struct napi_struct *, int), int weight)
4147 {
4148         INIT_LIST_HEAD(&napi->poll_list);
4149         napi->gro_count = 0;
4150         napi->gro_list = NULL;
4151         napi->skb = NULL;
4152         napi->poll = poll;
4153         if (weight > NAPI_POLL_WEIGHT)
4154                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4155                             weight, dev->name);
4156         napi->weight = weight;
4157         list_add(&napi->dev_list, &dev->napi_list);
4158         napi->dev = dev;
4159 #ifdef CONFIG_NETPOLL
4160         spin_lock_init(&napi->poll_lock);
4161         napi->poll_owner = -1;
4162 #endif
4163         set_bit(NAPI_STATE_SCHED, &napi->state);
4164 }
4165 EXPORT_SYMBOL(netif_napi_add);
4166
4167 void netif_napi_del(struct napi_struct *napi)
4168 {
4169         struct sk_buff *skb, *next;
4170
4171         list_del_init(&napi->dev_list);
4172         napi_free_frags(napi);
4173
4174         for (skb = napi->gro_list; skb; skb = next) {
4175                 next = skb->next;
4176                 skb->next = NULL;
4177                 kfree_skb(skb);
4178         }
4179
4180         napi->gro_list = NULL;
4181         napi->gro_count = 0;
4182 }
4183 EXPORT_SYMBOL(netif_napi_del);
4184
     /*
      * Softirq action that polls the NAPI instances queued on this CPU's
      * softnet_data poll_list (presumably registered for NET_RX_SOFTIRQ, which
      * it re-raises when it has to stop early).
      *
      * Polling continues until the list is empty, the budget (initialised from
      * netdev_budget) is used up, or jiffies reaches time_limit (two jiffies
      * after entry).  In the latter two cases sd->time_squeeze is incremented
      * and the softirq is raised again.  On exit pending RPS IPIs are sent and
      * local interrupts are re-enabled.
      */
4185 static void net_rx_action(struct softirq_action *h)
4186 {
4187         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4188         unsigned long time_limit = jiffies + 2;
4189         int budget = netdev_budget;
4190         void *have;
4191
4192         local_irq_disable();
4193
4194         while (!list_empty(&sd->poll_list)) {
4195                 struct napi_struct *n;
4196                 int work, weight;
4197
4198                 /* If softirq window is exhausted then punt.
4199                  * Allow this to run for 2 jiffies since that will allow
4200                  * an average latency of 1.5/HZ.
4201                  */
4202                 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4203                         goto softnet_break;
4204
4205                 local_irq_enable();
4206
4207                 /* Even though interrupts have been re-enabled, this
4208                  * access is safe because interrupts can only add new
4209                  * entries to the tail of this list, and only ->poll()
4210                  * calls can remove this head entry from the list.
4211                  */
4212                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4213
4214                 have = netpoll_poll_lock(n);
4215
4216                 weight = n->weight;
4217
4218                 /* This NAPI_STATE_SCHED test is for avoiding a race
4219                  * with netpoll's poll_napi().  Only the entity which
4220                  * obtains the lock and sees NAPI_STATE_SCHED set will
4221                  * actually make the ->poll() call.  Therefore we avoid
4222                  * accidentally calling ->poll() when NAPI is not scheduled.
4223                  */
4224                 work = 0;
4225                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4226                         work = n->poll(n, weight);
4227                         trace_napi_poll(n);
4228                 }
4229
                     /* a poll routine must not report more work than its weight */
4230                 WARN_ON_ONCE(work > weight);
4231
4232                 budget -= work;
4233
4234                 local_irq_disable();
4235
4236                 /* Drivers must not modify the NAPI state if they
4237                  * consume the entire weight.  In such cases this code
4238                  * still "owns" the NAPI instance and therefore can
4239                  * move the instance around on the list at-will.
4240                  */
4241                 if (unlikely(work == weight)) {
4242                         if (unlikely(napi_disable_pending(n))) {
4243                                 local_irq_enable();
4244                                 napi_complete(n);
4245                                 local_irq_disable();
4246                         } else {
4247                                 if (n->gro_list) {
4248                                         /* flush too old packets
4249                                          * If HZ < 1000, flush all packets.
4250                                          */
4251                                         local_irq_enable();
4252                                         napi_gro_flush(n, HZ >= 1000);
4253                                         local_irq_disable();
4254                                 }
                                     /* requeue at the tail so the other instances get their turn */
4255                                 list_move_tail(&n->poll_list, &sd->poll_list);
4256                         }
4257                 }
4258
4259                 netpoll_poll_unlock(have);
4260         }
4261 out:
4262         net_rps_action_and_irq_enable(sd);
4263
4264 #ifdef CONFIG_NET_DMA
4265         /*
4266          * There may not be any more sk_buffs coming right now, so push
4267          * any pending DMA copies to hardware
4268          */
4269         dma_issue_pending_all();
4270 #endif
4271
4272         return;
4273
     /* budget or time window used up: account the squeeze and ask to be run again */
4274 softnet_break:
4275         sd->time_squeeze++;
4276         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4277         goto out;
4278 }
4279
4280 struct netdev_upper {
             /* One entry on a device's upper_dev_list, describing one upper device. */
4281         struct net_device *dev;         /* the upper device this entry refers to */
4282         bool master;                    /* NOTE(review): presumably marks a master upper device - not used in the code visible here, confirm */
4283         struct list_head list;          /* link in the lower device's upper_dev_list */
4284         struct rcu_head rcu;            /* NOTE(review): presumably for RCU-deferred freeing - confirm */
4285         struct list_head search_list;   /* link on the temporary list built by __netdev_search_upper_dev(); empty (self-linked) otherwise */
4286 };
4287
4288 static void __append_search_uppers(struct list_head *search_list,
4289                                    struct net_device *dev)
4290 {
4291         struct netdev_upper *upper;
4292
4293         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4294                 /* check if this upper is not already in search list */
4295                 if (list_empty(&upper->search_list))
4296                         list_add_tail(&upper->search_list, search_list);
4297         }
4298 }
4299
4300 static bool __netdev_search_upper_dev(struct net_device *dev,
4301                                       struct net_device *upper_dev)
4302 {
4303         LIST_HEAD(search_list);
4304         struct netdev_upper *upper;
4305         struct netdev_upper *tmp;
4306         bool ret = false;
4307
4308         __append_search_uppers(&search_list, dev);
4309         list_for_each_entry(upper, &search_list, search_list) {
4310                 if (upper->dev == upper_dev) {
4311                         ret = true;
4312                         break;
4313                 }
4314                 __append_search_uppers(&search_list, upper->dev);
4315         }
4316         list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4317                 INIT_LIST_HEAD(&upper->search_list);
4318         return ret;
4319 }
4320
4321 static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4322                                                 struct net_device *upper_dev)
4323 {
4324         struct netdev_upper *upper;
4325
4326         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4327                 if (upper->dev == upper_dev)
4328                         return upper;
4329         }
4330         return NULL;
4331 }
4332
4333 /**
4334  * netdev_has_upper_dev - Check if device is linked to an upper device
4335  * @dev: device
4336  * @upper_dev: upper device to check
4337  *
4338  * Find out if a device is linked to specified upper device and return true
4339  * in case it is. Note that this checks only immediate upper device,
4340  * not through a complete stack of devices. The caller must hold the RTNL lock.
4341  */
4342 bool netdev_has_upper_dev(struct net_device *dev,
4343                           struct net_device *upper_dev)
4344 {
4345         ASSERT_RTNL();
4346
4347         return __netdev_find_upper(dev, upper_dev);
4348 }
4349 EXPORT_SYMBOL(netdev_has_upper_dev);
4350
4351 /**
4352  * netdev_has_any_upper_dev - Check if device is linked to some device
4353  * @dev: device
4354  *
4355  * Find out if a device is linked to an upper device and return true in case
4356  * it is. The caller must hold the RTNL lock.
4357  */
4358 bool netdev_has_any_upper_dev(struct net_device *dev)
4359 {
4360         ASSERT_RTNL();
4361
4362         return !list_empty(&dev->upper_dev_list);
4363 }
4364 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4365
4366 /**
4367  * netdev_master_upper_dev_get - Get master upper device
4368  * @dev: device
4369  *
4370  * Find a master upper device and return pointer to it or NULL in case
4371  * it's not there. The caller must hold the RTNL lock.
4372  */
4373 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4374 {
4375         struct netdev_upper *upper;
4376
4377         ASSERT_RTNL();
4378
4379         if (list_empty(&dev->upper_dev_list))
4380                 return NULL;
4381
4382         upper = list_first_entry(&dev->upper_dev_list,
4383                                  struct netdev_upper, list);
4384         if (likely(upper->master))
4385                 return upper->dev;
4386         return NULL;
4387 }
4388 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4389
4390 /**
4391  * netdev_master_upper_dev_get_rcu - Get master upper device
4392  * @dev: device
4393  *
4394  * Find a master upper device and return pointer to it or NULL in case
4395  * it's not there. The caller must hold the RCU read lock.
4396  */
4397 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4398 {
4399         struct netdev_upper *upper;
4400
4401         upper = list_first_or_null_rcu(&dev->upper_dev_list,
4402                                        struct netdev_upper, list);
4403         if (upper && likely(upper->master))
4404                 return upper->dev;
4405         return NULL;
4406 }
4407 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4408
4409 static int __netdev_upper_dev_link(struct net_device *dev,
4410                                    struct net_device *upper_dev, bool master)
4411 {
4412         struct netdev_upper *upper;
4413
4414         ASSERT_RTNL();
4415
4416         if (dev == upper_dev)
4417                 return -EBUSY;
4418
4419         /* To prevent loops, check if dev is not upper device to upper_dev. */
4420         if (__netdev_search_upper_dev(upper_dev, dev))
4421                 return -EBUSY;
4422
4423         if (__netdev_find_upper(dev, upper_dev))
4424                 return -EEXIST;
4425
4426         if (master && netdev_master_upper_dev_get(dev))
4427                 return -EBUSY;
4428
4429         upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4430         if (!upper)
4431                 return -ENOMEM;
4432
4433         upper->dev = upper_dev;
4434         upper->master = master;
4435         INIT_LIST_HEAD(&upper->search_list);
4436
4437         /* Ensure that master upper link is always the first item in list. */
4438         if (master)
4439                 list_add_rcu(&upper->list, &dev->upper_dev_list);
4440         else
4441                 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4442         dev_hold(upper_dev);
4443         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4444         return 0;
4445 }
4446
4447 /**
4448  * netdev_upper_dev_link - Add a link to the upper device
4449  * @dev: device
4450  * @upper_dev: new upper device
4451  *
4452  * Adds a link to device which is upper to this one. The caller must hold
4453  * the RTNL lock. On a failure a negative errno code is returned.
4454  * On success the reference counts are adjusted and the function
4455  * returns zero.
4456  */
4457 int netdev_upper_dev_link(struct net_device *dev,
4458                           struct net_device *upper_dev)
4459 {
4460         return __netdev_upper_dev_link(dev, upper_dev, false);
4461 }
4462 EXPORT_SYMBOL(netdev_upper_dev_link);
4463
4464 /**
4465  * netdev_master_upper_dev_link - Add a master link to the upper device
4466  * @dev: device
4467  * @upper_dev: new upper device
4468  *
4469  * Adds a link to device which is upper to this one. In this case, only
4470  * one master upper device can be linked, although other non-master devices
4471  * might be linked as well. The caller must hold the RTNL lock.
4472  * On a failure a negative errno code is returned. On success the reference
4473  * counts are adjusted and the function returns zero.
4474  */
4475 int netdev_master_upper_dev_link(struct net_device *dev,
4476                                  struct net_device *upper_dev)
4477 {
4478         return __netdev_upper_dev_link(dev, upper_dev, true);
4479 }
4480 EXPORT_SYMBOL(netdev_master_upper_dev_link);
4481
4482 /**
4483  * netdev_upper_dev_unlink - Removes a link to upper device
4484  * @dev: device
4485  * @upper_dev: new upper device
4486  *
4487  * Removes a link to device which is upper to this one. The caller must hold
4488  * the RTNL lock.
4489  */
4490 void netdev_upper_dev_unlink(struct net_device *dev,
4491                              struct net_device *upper_dev)
4492 {
4493         struct netdev_upper *upper;
4494
4495         ASSERT_RTNL();
4496
4497         upper = __netdev_find_upper(dev, upper_dev);
4498         if (!upper)
4499                 return;
4500         list_del_rcu(&upper->list);
4501         dev_put(upper_dev);
4502         kfree_rcu(upper, rcu);
4503         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4504 }
4505 EXPORT_SYMBOL(netdev_upper_dev_unlink);
4506
4507 static void dev_change_rx_flags(struct net_device *dev, int flags)
4508 {
4509         const struct net_device_ops *ops = dev->netdev_ops;
4510
4511         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4512                 ops->ndo_change_rx_flags(dev, flags);
4513 }
4514
4515 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4516 {
4517         unsigned int old_flags = dev->flags;
4518         kuid_t uid;
4519         kgid_t gid;
4520
4521         ASSERT_RTNL();
4522
4523         dev->flags |= IFF_PROMISC;
4524         dev->promiscuity += inc;
4525         if (dev->promiscuity == 0) {
4526                 /*
4527                  * Avoid overflow.
4528                  * If inc causes overflow, untouch promisc and return error.
4529                  */
4530                 if (inc < 0)
4531                         dev->flags &= ~IFF_PROMISC;
4532                 else {
4533                         dev->promiscuity -= inc;
4534                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4535                                 dev->name);
4536                         return -EOVERFLOW;
4537                 }
4538         }
4539         if (dev->flags != old_flags) {
4540                 pr_info("device %s %s promiscuous mode\n",
4541                         dev->name,
4542                         dev->flags & IFF_PROMISC ? "entered" : "left");
4543                 if (audit_enabled) {
4544                         current_uid_gid(&uid, &gid);
4545                         audit_log(current->audit_context, GFP_ATOMIC,
4546                                 AUDIT_ANOM_PROMISCUOUS,
4547                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4548                                 dev->name, (dev->flags & IFF_PROMISC),
4549                                 (old_flags & IFF_PROMISC),
4550                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
4551                                 from_kuid(&init_user_ns, uid),
4552                                 from_kgid(&init_user_ns, gid),
4553                                 audit_get_sessionid(current));
4554                 }
4555
4556                 dev_change_rx_flags(dev, IFF_PROMISC);
4557         }
4558         return 0;
4559 }
4560
4561 /**
4562  *      dev_set_promiscuity     - update promiscuity count on a device
4563  *      @dev: device
4564  *      @inc: modifier
4565  *
4566  *      Add or remove promiscuity from a device. While the count in the device
4567  *      remains above zero the interface remains promiscuous. Once it hits zero
4568  *      the device reverts back to normal filtering operation. A negative inc
4569  *      value is used to drop promiscuity on the device.
4570  *      Return 0 if successful or a negative errno code on error.
4571  */
4572 int dev_set_promiscuity(struct net_device *dev, int inc)
4573 {
4574         unsigned int old_flags = dev->flags;
4575         int err;
4576
4577         err = __dev_set_promiscuity(dev, inc);
4578         if (err < 0)
4579                 return err;
4580         if (dev->flags != old_flags)
4581                 dev_set_rx_mode(dev);
4582         return err;
4583 }
4584 EXPORT_SYMBOL(dev_set_promiscuity);
4585
4586 /**
4587  *      dev_set_allmulti        - update allmulti count on a device
4588  *      @dev: device
4589  *      @inc: modifier
4590  *
4591  *      Add or remove reception of all multicast frames to a device. While the
4592  *      count in the device remains above zero the interface remains listening
4593  *      to all interfaces. Once it hits zero the device reverts back to normal
4594  *      filtering operation. A negative @inc value is used to drop the counter
4595  *      when releasing a resource needing all multicasts.
4596  *      Return 0 if successful or a negative errno code on error.
4597  */
4598
4599 int dev_set_allmulti(struct net_device *dev, int inc)
4600 {
4601         unsigned int old_flags = dev->flags;
4602
4603         ASSERT_RTNL();
4604
4605         dev->flags |= IFF_ALLMULTI;
4606         dev->allmulti += inc;
4607         if (dev->allmulti == 0) {
4608                 /*
4609                  * Avoid overflow.
4610                  * If inc causes overflow, untouch allmulti and return error.
4611                  */
4612                 if (inc < 0)
4613                         dev->flags &= ~IFF_ALLMULTI;
4614                 else {
4615                         dev->allmulti -= inc;
4616                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4617                                 dev->name);
4618                         return -EOVERFLOW;
4619                 }
4620         }
4621         if (dev->flags ^ old_flags) {
4622                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4623                 dev_set_rx_mode(dev);
4624         }
4625         return 0;
4626 }
4627 EXPORT_SYMBOL(dev_set_allmulti);
4628
4629 /*
4630  *      Upload unicast and multicast address lists to device and
4631  *      configure RX filtering. When the device doesn't support unicast
4632  *      filtering it is put in promiscuous mode while unicast addresses
4633  *      are present.
4634  */
4635 void __dev_set_rx_mode(struct net_device *dev)
4636 {
4637         const struct net_device_ops *ops = dev->netdev_ops;
4638
4639         /* dev_open will call this function so the list will stay sane. */
4640         if (!(dev->flags&IFF_UP))
4641                 return;
4642
4643         if (!netif_device_present(dev))
4644                 return;
4645
4646         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4647                 /* Unicast addresses changes may only happen under the rtnl,
4648                  * therefore calling __dev_set_promiscuity here is safe.
4649                  */
4650                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4651                         __dev_set_promiscuity(dev, 1);
4652                         dev->uc_promisc = true;
4653                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4654                         __dev_set_promiscuity(dev, -1);
4655                         dev->uc_promisc = false;
4656                 }
4657         }
4658
4659         if (ops->ndo_set_rx_mode)
4660                 ops->ndo_set_rx_mode(dev);
4661 }
4662
/*
 *	Locked wrapper: run __dev_set_rx_mode() between
 *	netif_addr_lock_bh() and netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
4669
4670 /**
4671  *      dev_get_flags - get flags reported to userspace
4672  *      @dev: device
4673  *
4674  *      Get the combination of flag bits exported through APIs to userspace.
4675  */
4676 unsigned int dev_get_flags(const struct net_device *dev)
4677 {
4678         unsigned int flags;
4679
4680         flags = (dev->flags & ~(IFF_PROMISC |
4681                                 IFF_ALLMULTI |
4682                                 IFF_RUNNING |
4683                                 IFF_LOWER_UP |
4684                                 IFF_DORMANT)) |
4685                 (dev->gflags & (IFF_PROMISC |
4686                                 IFF_ALLMULTI));
4687
4688         if (netif_running(dev)) {
4689                 if (netif_oper_up(dev))
4690                         flags |= IFF_RUNNING;
4691                 if (netif_carrier_ok(dev))
4692                         flags |= IFF_LOWER_UP;
4693                 if (netif_dormant(dev))
4694                         flags |= IFF_DORMANT;
4695         }
4696
4697         return flags;
4698 }
4699 EXPORT_SYMBOL(dev_get_flags);
4700
4701 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4702 {
4703         unsigned int old_flags = dev->flags;
4704         int ret;
4705
4706         ASSERT_RTNL();
4707
4708         /*
4709          *      Set the flags on our device.
4710          */
4711
4712         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4713                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4714                                IFF_AUTOMEDIA)) |
4715                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4716                                     IFF_ALLMULTI));
4717
4718         /*
4719          *      Load in the correct multicast list now the flags have changed.
4720          */
4721
4722         if ((old_flags ^ flags) & IFF_MULTICAST)
4723                 dev_change_rx_flags(dev, IFF_MULTICAST);
4724
4725         dev_set_rx_mode(dev);
4726
4727         /*
4728          *      Have we downed the interface. We handle IFF_UP ourselves
4729          *      according to user attempts to set it, rather than blindly
4730          *      setting it.
4731          */
4732
4733         ret = 0;
4734         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4735                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4736
4737                 if (!ret)
4738                         dev_set_rx_mode(dev);
4739         }
4740
4741         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4742                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4743
4744                 dev->gflags ^= IFF_PROMISC;
4745                 dev_set_promiscuity(dev, inc);
4746         }
4747
4748         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4749            is important. Some (broken) drivers set IFF_PROMISC, when
4750            IFF_ALLMULTI is requested not asking us and not reporting.
4751          */
4752         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4753                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4754
4755                 dev->gflags ^= IFF_ALLMULTI;
4756                 dev_set_allmulti(dev, inc);
4757         }
4758
4759         return ret;
4760 }
4761
4762 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4763 {
4764         unsigned int changes = dev->flags ^ old_flags;
4765
4766         if (changes & IFF_UP) {
4767                 if (dev->flags & IFF_UP)
4768                         call_netdevice_notifiers(NETDEV_UP, dev);
4769                 else
4770                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4771         }
4772
4773         if (dev->flags & IFF_UP &&
4774             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4775                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4776 }
4777
4778 /**
4779  *      dev_change_flags - change device settings
4780  *      @dev: device
4781  *      @flags: device state flags
4782  *
4783  *      Change settings on device based state flags. The flags are
4784  *      in the userspace exported format.
4785  */
4786 int dev_change_flags(struct net_device *dev, unsigned int flags)
4787 {
4788         int ret;
4789         unsigned int changes, old_flags = dev->flags;
4790
4791         ret = __dev_change_flags(dev, flags);
4792         if (ret < 0)
4793                 return ret;
4794
4795         changes = old_flags ^ dev->flags;
4796         if (changes)
4797                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4798
4799         __dev_notify_flags(dev, old_flags);
4800         return ret;
4801 }
4802 EXPORT_SYMBOL(dev_change_flags);
4803
4804 /**
4805  *      dev_set_mtu - Change maximum transfer unit
4806  *      @dev: device
4807  *      @new_mtu: new transfer unit
4808  *
4809  *      Change the maximum transfer size of the network device.
4810  */
4811 int dev_set_mtu(struct net_device *dev, int new_mtu)
4812 {
4813         const struct net_device_ops *ops = dev->netdev_ops;
4814         int err;
4815
4816         if (new_mtu == dev->mtu)
4817                 return 0;
4818
4819         /*      MTU must be positive.    */
4820         if (new_mtu < 0)
4821                 return -EINVAL;
4822
4823         if (!netif_device_present(dev))
4824                 return -ENODEV;
4825
4826         err = 0;
4827         if (ops->ndo_change_mtu)
4828                 err = ops->ndo_change_mtu(dev, new_mtu);
4829         else
4830                 dev->mtu = new_mtu;
4831
4832         if (!err)
4833                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4834         return err;
4835 }
4836 EXPORT_SYMBOL(dev_set_mtu);
4837
4838 /**
4839  *      dev_set_group - Change group this device belongs to
4840  *      @dev: device
4841  *      @new_group: group this device should belong to
4842  */
4843 void dev_set_group(struct net_device *dev, int new_group)
4844 {
4845         dev->group = new_group;
4846 }
4847 EXPORT_SYMBOL(dev_set_group);
4848
4849 /**
4850  *      dev_set_mac_address - Change Media Access Control Address
4851  *      @dev: device
4852  *      @sa: new address
4853  *
4854  *      Change the hardware (MAC) address of the device
4855  */
4856 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4857 {
4858         const struct net_device_ops *ops = dev->netdev_ops;
4859         int err;
4860
4861         if (!ops->ndo_set_mac_address)
4862                 return -EOPNOTSUPP;
4863         if (sa->sa_family != dev->type)
4864                 return -EINVAL;
4865         if (!netif_device_present(dev))
4866                 return -ENODEV;
4867         err = ops->ndo_set_mac_address(dev, sa);
4868         if (err)
4869                 return err;
4870         dev->addr_assign_type = NET_ADDR_SET;
4871         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4872         add_device_randomness(dev->dev_addr, dev->addr_len);
4873         return 0;
4874 }
4875 EXPORT_SYMBOL(dev_set_mac_address);
4876
4877 /**
4878  *      dev_change_carrier - Change device carrier
4879  *      @dev: device
4880  *      @new_carrier: new value
4881  *
4882  *      Change device carrier
4883  */
4884 int dev_change_carrier(struct net_device *dev, bool new_carrier)
4885 {
4886         const struct net_device_ops *ops = dev->netdev_ops;
4887
4888         if (!ops->ndo_change_carrier)
4889                 return -EOPNOTSUPP;
4890         if (!netif_device_present(dev))
4891                 return -ENODEV;
4892         return ops->ndo_change_carrier(dev, new_carrier);
4893 }
4894 EXPORT_SYMBOL(dev_change_carrier);
4895
4896 /**
4897  *      dev_new_index   -       allocate an ifindex
4898  *      @net: the applicable net namespace
4899  *
4900  *      Returns a suitable unique value for a new device interface
4901  *      number.  The caller must hold the rtnl semaphore or the
4902  *      dev_base_lock to be sure it remains unique.
4903  */
4904 static int dev_new_index(struct net *net)
4905 {
4906         int ifindex = net->ifindex;
4907         for (;;) {
4908                 if (++ifindex <= 0)
4909                         ifindex = 1;
4910                 if (!__dev_get_by_index(net, ifindex))
4911                         return net->ifindex = ifindex;
4912         }
4913 }
4914
4915 /* Delayed registration/unregisteration */
4916 static LIST_HEAD(net_todo_list);
4917
4918 static void net_set_todo(struct net_device *dev)
4919 {
4920         list_add_tail(&dev->todo_list, &net_todo_list);
4921 }
4922
4923 static void rollback_registered_many(struct list_head *head)
4924 {
4925         struct net_device *dev, *tmp;
4926
4927         BUG_ON(dev_boot_phase);
4928         ASSERT_RTNL();
4929
4930         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4931                 /* Some devices call without registering
4932                  * for initialization unwind. Remove those
4933                  * devices and proceed with the remaining.
4934                  */
4935                 if (dev->reg_state == NETREG_UNINITIALIZED) {
4936                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
4937                                  dev->name, dev);
4938
4939                         WARN_ON(1);
4940                         list_del(&dev->unreg_list);
4941                         continue;
4942                 }
4943                 dev->dismantle = true;
4944                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4945         }
4946
4947         /* If device is running, close it first. */
4948         dev_close_many(head);
4949
4950         list_for_each_entry(dev, head, unreg_list) {
4951                 /* And unlink it from device chain. */
4952                 unlist_netdevice(dev);
4953
4954                 dev->reg_state = NETREG_UNREGISTERING;
4955         }
4956
4957         synchronize_net();
4958
4959         list_for_each_entry(dev, head, unreg_list) {
4960                 /* Shutdown queueing discipline. */
4961                 dev_shutdown(dev);
4962
4963
4964                 /* Notify protocols, that we are about to destroy
4965                    this device. They should clean all the things.
4966                 */
4967                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4968
4969                 if (!dev->rtnl_link_ops ||
4970                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4971                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4972
4973                 /*
4974                  *      Flush the unicast and multicast chains
4975                  */
4976                 dev_uc_flush(dev);
4977                 dev_mc_flush(dev);
4978
4979                 if (dev->netdev_ops->ndo_uninit)
4980                         dev->netdev_ops->ndo_uninit(dev);
4981
4982                 /* Notifier chain MUST detach us all upper devices. */
4983                 WARN_ON(netdev_has_any_upper_dev(dev));
4984
4985                 /* Remove entries from kobject tree */
4986                 netdev_unregister_kobject(dev);
4987 #ifdef CONFIG_XPS
4988                 /* Remove XPS queueing entries */
4989                 netif_reset_xps_queues_gt(dev, 0);
4990 #endif
4991         }
4992
4993         synchronize_net();
4994
4995         list_for_each_entry(dev, head, unreg_list)
4996                 dev_put(dev);
4997 }
4998
4999 static void rollback_registered(struct net_device *dev)
5000 {
5001         LIST_HEAD(single);
5002
5003         list_add(&dev->unreg_list, &single);
5004         rollback_registered_many(&single);
5005         list_del(&single);
5006 }
5007
5008 static netdev_features_t netdev_fix_features(struct net_device *dev,
5009         netdev_features_t features)
5010 {
5011         /* Fix illegal checksum combinations */
5012         if ((features & NETIF_F_HW_CSUM) &&
5013             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5014                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5015                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5016         }
5017
5018         /* TSO requires that SG is present as well. */
5019         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5020                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5021                 features &= ~NETIF_F_ALL_TSO;
5022         }
5023
5024         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5025                                         !(features & NETIF_F_IP_CSUM)) {
5026                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5027                 features &= ~NETIF_F_TSO;
5028                 features &= ~NETIF_F_TSO_ECN;
5029         }
5030
5031         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5032                                          !(features & NETIF_F_IPV6_CSUM)) {
5033                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5034                 features &= ~NETIF_F_TSO6;
5035         }
5036
5037         /* TSO ECN requires that TSO is present as well. */
5038         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5039                 features &= ~NETIF_F_TSO_ECN;
5040
5041         /* Software GSO depends on SG. */
5042         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5043                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5044                 features &= ~NETIF_F_GSO;
5045         }
5046
5047         /* UFO needs SG and checksumming */
5048         if (features & NETIF_F_UFO) {
5049                 /* maybe split UFO into V4 and V6? */
5050                 if (!((features & NETIF_F_GEN_CSUM) ||
5051                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5052                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5053                         netdev_dbg(dev,
5054                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5055                         features &= ~NETIF_F_UFO;
5056                 }
5057
5058                 if (!(features & NETIF_F_SG)) {
5059                         netdev_dbg(dev,
5060                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5061                         features &= ~NETIF_F_UFO;
5062                 }
5063         }
5064
5065         return features;
5066 }
5067
5068 int __netdev_update_features(struct net_device *dev)
5069 {
5070         netdev_features_t features;
5071         int err = 0;
5072
5073         ASSERT_RTNL();
5074
5075         features = netdev_get_wanted_features(dev);
5076
5077         if (dev->netdev_ops->ndo_fix_features)
5078                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5079
5080         /* driver might be less strict about feature dependencies */
5081         features = netdev_fix_features(dev, features);
5082
5083         if (dev->features == features)
5084                 return 0;
5085
5086         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5087                 &dev->features, &features);
5088
5089         if (dev->netdev_ops->ndo_set_features)
5090                 err = dev->netdev_ops->ndo_set_features(dev, features);
5091
5092         if (unlikely(err < 0)) {
5093                 netdev_err(dev,
5094                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5095                         err, &features, &dev->features);
5096                 return -1;
5097         }
5098
5099         if (!err)
5100                 dev->features = features;
5101
5102         return 1;
5103 }
5104
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if
 *	__netdev_update_features() returned a nonzero value. Should be
 *	called after driver or hardware dependent conditions might have
 *	changed that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
	if (__netdev_update_features(dev) != 0)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5119
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications
 *	unconditionally, whatever the outcome of the recalculation. Should
 *	be called instead of netdev_update_features() if also
 *	dev->vlan_features might have changed to allow the changes to be
 *	propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* the result is deliberately not looked at */
	__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5136
5137 /**
5138  *      netif_stacked_transfer_operstate -      transfer operstate
5139  *      @rootdev: the root or lower level device to transfer state from
5140  *      @dev: the device to transfer operstate to
5141  *
5142  *      Transfer operational state from root to device. This is normally
5143  *      called when a stacking relationship exists between the root
5144  *      device and the device(a leaf device).
5145  */
5146 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5147                                         struct net_device *dev)
5148 {
5149         if (rootdev->operstate == IF_OPER_DORMANT)
5150                 netif_dormant_on(dev);
5151         else
5152                 netif_dormant_off(dev);
5153
5154         if (netif_carrier_ok(rootdev)) {
5155                 if (!netif_carrier_ok(dev))
5156                         netif_carrier_on(dev);
5157         } else {
5158                 if (netif_carrier_ok(dev))
5159                         netif_carrier_off(dev);
5160         }
5161 }
5162 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5163
#ifdef CONFIG_RPS
/*
 * Allocate the zero-initialised array of dev->num_rx_queues RX queue
 * structures, publish it as dev->_rx and point every entry back at @dev.
 *
 * Returns 0 on success or -ENOMEM if the array cannot be allocated.
 * A queue count of zero is a caller bug and trips BUG_ON().
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nr_queues = dev->num_rx_queues;
	struct netdev_rx_queue *queues, *q;

	BUG_ON(nr_queues < 1);

	queues = kcalloc(nr_queues, sizeof(*queues), GFP_KERNEL);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;

	/* Give each queue its back-pointer to the owning device. */
	for (q = queues; q < queues + nr_queues; q++)
		q->dev = dev;

	return 0;
}
#endif
5183
5184 static void netdev_init_one_queue(struct net_device *dev,
5185                                   struct netdev_queue *queue, void *_unused)
5186 {
5187         /* Initialize queue lock */
5188         spin_lock_init(&queue->_xmit_lock);
5189         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5190         queue->xmit_lock_owner = -1;
5191         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5192         queue->dev = dev;
5193 #ifdef CONFIG_BQL
5194         dql_init(&queue->dql, HZ);
5195 #endif
5196 }
5197
5198 static int netif_alloc_netdev_queues(struct net_device *dev)
5199 {
5200         unsigned int count = dev->num_tx_queues;
5201         struct netdev_queue *tx;
5202
5203         BUG_ON(count < 1);
5204
5205         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5206         if (!tx)
5207                 return -ENOMEM;
5208
5209         dev->_tx = tx;
5210
5211         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5212         spin_lock_init(&dev->tx_global_lock);
5213
5214         return 0;
5215 }
5216
5217 /**
5218  *      register_netdevice      - register a network device
5219  *      @dev: device to register
5220  *
5221  *      Take a completed network device structure and add it to the kernel
5222  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5223  *      chain. 0 is returned on success. A negative errno code is returned
5224  *      on a failure to set up the device, or if the name is a duplicate.
5225  *
5226  *      Callers must hold the rtnl semaphore. You may want
5227  *      register_netdev() instead of this.
5228  *
5229  *      BUGS:
5230  *      The locking appears insufficient to guarantee two parallel registers
5231  *      will not get the same name.
5232  */
5233
5234 int register_netdevice(struct net_device *dev)
5235 {
5236         int ret;
5237         struct net *net = dev_net(dev);
5238
5239         BUG_ON(dev_boot_phase);
5240         ASSERT_RTNL();
5241
5242         might_sleep();
5243
5244         /* When net_device's are persistent, this will be fatal. */
5245         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5246         BUG_ON(!net);
5247
5248         spin_lock_init(&dev->addr_list_lock);
5249         netdev_set_addr_lockdep_class(dev);
5250
5251         dev->iflink = -1;
5252
5253         ret = dev_get_valid_name(net, dev, dev->name);
5254         if (ret < 0)
5255                 goto out;
5256
5257         /* Init, if this function is available */
5258         if (dev->netdev_ops->ndo_init) {
5259                 ret = dev->netdev_ops->ndo_init(dev);
5260                 if (ret) {
5261                         if (ret > 0)
5262                                 ret = -EIO;
5263                         goto out;
5264                 }
5265         }
5266
5267         if (((dev->hw_features | dev->features) &
5268              NETIF_F_HW_VLAN_CTAG_FILTER) &&
5269             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5270              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5271                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5272                 ret = -EINVAL;
5273                 goto err_uninit;
5274         }
5275
5276         ret = -EBUSY;
5277         if (!dev->ifindex)
5278                 dev->ifindex = dev_new_index(net);
5279         else if (__dev_get_by_index(net, dev->ifindex))
5280                 goto err_uninit;
5281
5282         if (dev->iflink == -1)
5283                 dev->iflink = dev->ifindex;
5284
5285         /* Transfer changeable features to wanted_features and enable
5286          * software offloads (GSO and GRO).
5287          */
5288         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5289         dev->features |= NETIF_F_SOFT_FEATURES;
5290         dev->wanted_features = dev->features & dev->hw_features;
5291
5292         /* Turn on no cache copy if HW is doing checksum */
5293         if (!(dev->flags & IFF_LOOPBACK)) {
5294                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5295                 if (dev->features & NETIF_F_ALL_CSUM) {
5296                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5297                         dev->features |= NETIF_F_NOCACHE_COPY;
5298                 }
5299         }
5300
5301         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5302          */
5303         dev->vlan_features |= NETIF_F_HIGHDMA;
5304
5305         /* Make NETIF_F_SG inheritable to tunnel devices.
5306          */
5307         dev->hw_enc_features |= NETIF_F_SG;
5308
5309         /* Make NETIF_F_SG inheritable to MPLS.
5310          */
5311         dev->mpls_features |= NETIF_F_SG;
5312
5313         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5314         ret = notifier_to_errno(ret);
5315         if (ret)
5316                 goto err_uninit;
5317
5318         ret = netdev_register_kobject(dev);
5319         if (ret)
5320                 goto err_uninit;
5321         dev->reg_state = NETREG_REGISTERED;
5322
5323         __netdev_update_features(dev);
5324
5325         /*
5326          *      Default initial state at registry is that the
5327          *      device is present.
5328          */
5329
5330         set_bit(__LINK_STATE_PRESENT, &dev->state);
5331
5332         linkwatch_init_dev(dev);
5333
5334         dev_init_scheduler(dev);
5335         dev_hold(dev);
5336         list_netdevice(dev);
5337         add_device_randomness(dev->dev_addr, dev->addr_len);
5338
5339         /* If the device has permanent device address, driver should
5340          * set dev_addr and also addr_assign_type should be set to
5341          * NET_ADDR_PERM (default value).
5342          */
5343         if (dev->addr_assign_type == NET_ADDR_PERM)
5344                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5345
5346         /* Notify protocols, that a new device appeared. */
5347         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5348         ret = notifier_to_errno(ret);
5349         if (ret) {
5350                 rollback_registered(dev);
5351                 dev->reg_state = NETREG_UNREGISTERED;
5352         }
5353         /*
5354          *      Prevent userspace races by waiting until the network
5355          *      device is fully setup before sending notifications.
5356          */
5357         if (!dev->rtnl_link_ops ||
5358             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5359                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5360
5361 out:
5362         return ret;
5363
5364 err_uninit:
5365         if (dev->netdev_ops->ndo_uninit)
5366                 dev->netdev_ops->ndo_uninit(dev);
5367         goto out;
5368 }
5369 EXPORT_SYMBOL(register_netdevice);
5370
5371 /**
5372  *      init_dummy_netdev       - init a dummy network device for NAPI
5373  *      @dev: device to init
5374  *
5375  *      This takes a network device structure and initialize the minimum
5376  *      amount of fields so it can be used to schedule NAPI polls without
5377  *      registering a full blown interface. This is to be used by drivers
5378  *      that need to tie several hardware interfaces to a single NAPI
5379  *      poll scheduler due to HW limitations.
5380  */
5381 int init_dummy_netdev(struct net_device *dev)
5382 {
5383         /* Clear everything. Note we don't initialize spinlocks
5384          * are they aren't supposed to be taken by any of the
5385          * NAPI code and this dummy netdev is supposed to be
5386          * only ever used for NAPI polls
5387          */
5388         memset(dev, 0, sizeof(struct net_device));
5389
5390         /* make sure we BUG if trying to hit standard
5391          * register/unregister code path
5392          */
5393         dev->reg_state = NETREG_DUMMY;
5394
5395         /* NAPI wants this */
5396         INIT_LIST_HEAD(&dev->napi_list);
5397
5398         /* a dummy interface is started by default */
5399         set_bit(__LINK_STATE_PRESENT, &dev->state);
5400         set_bit(__LINK_STATE_START, &dev->state);
5401
5402         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5403          * because users of this 'device' dont need to change
5404          * its refcount.
5405          */
5406
5407         return 0;
5408 }
5409 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5410
5411
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Locked convenience wrapper: takes the rtnl semaphore, calls
 *	register_netdevice() and drops the semaphore again. The return
 *	value is that of register_netdevice(): 0 on success, a negative
 *	errno code on a failure to set up the device or if the name is a
 *	duplicate. A %NETDEV_REGISTER message is sent to the netdev
 *	notifier chain as part of the registration.
 *
 *	A device name that was passed to alloc_netdev as a format string
 *	is expanded during registration.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
5435
5436 int netdev_refcnt_read(const struct net_device *dev)
5437 {
5438         int i, refcnt = 0;
5439
5440         for_each_possible_cpu(i)
5441                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5442         return refcnt;
5443 }
5444 EXPORT_SYMBOL(netdev_refcnt_read);
5445
5446 /**
5447  * netdev_wait_allrefs - wait until all references are gone.
5448  * @dev: target net_device
5449  *
5450  * This is called when unregistering network devices.
5451  *
5452  * Any protocol or device that holds a reference should register
5453  * for netdevice notification, and cleanup and put back the
5454  * reference if they receive an UNREGISTER event.
5455  * We can get stuck here if buggy protocols don't correctly
5456  * call dev_put.
5457  */
5458 static void netdev_wait_allrefs(struct net_device *dev)
5459 {
5460         unsigned long rebroadcast_time, warning_time;
5461         int refcnt;
5462
5463         linkwatch_forget_dev(dev);
5464
5465         rebroadcast_time = warning_time = jiffies;
5466         refcnt = netdev_refcnt_read(dev);
5467
5468         while (refcnt != 0) {
5469                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5470                         rtnl_lock();
5471
5472                         /* Rebroadcast unregister notification */
5473                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5474
5475                         __rtnl_unlock();
5476                         rcu_barrier();
5477                         rtnl_lock();
5478
5479                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5480                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5481                                      &dev->state)) {
5482                                 /* We must not have linkwatch events
5483                                  * pending on unregister. If this
5484                                  * happens, we simply run the queue
5485                                  * unscheduled, resulting in a noop
5486                                  * for this device.
5487                                  */
5488                                 linkwatch_run_queue();
5489                         }
5490
5491                         __rtnl_unlock();
5492
5493                         rebroadcast_time = jiffies;
5494                 }
5495
5496                 msleep(250);
5497
5498                 refcnt = netdev_refcnt_read(dev);
5499
5500                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5501                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5502                                  dev->name, refcnt);
5503                         warning_time = jiffies;
5504                 }
5505         }
5506 }
5507
5508 /* The sequence is:
5509  *
5510  *      rtnl_lock();
5511  *      ...
5512  *      register_netdevice(x1);
5513  *      register_netdevice(x2);
5514  *      ...
5515  *      unregister_netdevice(y1);
5516  *      unregister_netdevice(y2);
5517  *      ...
5518  *      rtnl_unlock();
5519  *      free_netdev(y1);
5520  *      free_netdev(y2);
5521  *
5522  * We are invoked by rtnl_unlock().
5523  * This allows us to deal with problems:
5524  * 1) We can delete sysfs objects which invoke hotplug
5525  *    without deadlocking with linkwatch via keventd.
5526  * 2) Since we run with the RTNL semaphore not held, we can sleep
5527  *    safely in order to wait for the netdev refcnt to drop to zero.
5528  *
5529  * We must not return until all unregister events added during
5530  * the interval the lock was held have been completed.
5531  */
5532 void netdev_run_todo(void)
5533 {
5534         struct list_head list;
5535
5536         /* Snapshot list, allow later requests */
5537         list_replace_init(&net_todo_list, &list);
5538
5539         __rtnl_unlock();
5540
5541
5542         /* Wait for rcu callbacks to finish before next phase */
5543         if (!list_empty(&list))
5544                 rcu_barrier();
5545
5546         while (!list_empty(&list)) {
5547                 struct net_device *dev
5548                         = list_first_entry(&list, struct net_device, todo_list);
5549                 list_del(&dev->todo_list);
5550
5551                 rtnl_lock();
5552                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5553                 __rtnl_unlock();
5554
5555                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5556                         pr_err("network todo '%s' but state %d\n",
5557                                dev->name, dev->reg_state);
5558                         dump_stack();
5559                         continue;
5560                 }
5561
5562                 dev->reg_state = NETREG_UNREGISTERED;
5563
5564                 on_each_cpu(flush_backlog, dev, 1);
5565
5566                 netdev_wait_allrefs(dev);
5567
5568                 /* paranoia */
5569                 BUG_ON(netdev_refcnt_read(dev));
5570                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5571                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5572                 WARN_ON(dev->dn_ptr);
5573
5574                 if (dev->destructor)
5575                         dev->destructor(dev);
5576
5577                 /* Free network device */
5578                 kobject_put(&dev->dev.kobj);
5579         }
5580 }
5581
5582 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5583  * fields in the same order, with only the type differing.
5584  */
5585 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5586                              const struct net_device_stats *netdev_stats)
5587 {
5588 #if BITS_PER_LONG == 64
5589         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5590         memcpy(stats64, netdev_stats, sizeof(*stats64));
5591 #else
5592         size_t i, n = sizeof(*stats64) / sizeof(u64);
5593         const unsigned long *src = (const unsigned long *)netdev_stats;
5594         u64 *dst = (u64 *)stats64;
5595
5596         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5597                      sizeof(*stats64) / sizeof(u64));
5598         for (i = 0; i < n; i++)
5599                 dst[i] = src[i];
5600 #endif
5601 }
5602 EXPORT_SYMBOL(netdev_stats_to_stats64);
5603
5604 /**
5605  *      dev_get_stats   - get network device statistics
5606  *      @dev: device to get statistics from
5607  *      @storage: place to store stats
5608  *
5609  *      Get network statistics from device. Return @storage.
5610  *      The device driver may provide its own method by setting
5611  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5612  *      otherwise the internal statistics structure is used.
5613  */
5614 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5615                                         struct rtnl_link_stats64 *storage)
5616 {
5617         const struct net_device_ops *ops = dev->netdev_ops;
5618
5619         if (ops->ndo_get_stats64) {
5620                 memset(storage, 0, sizeof(*storage));
5621                 ops->ndo_get_stats64(dev, storage);
5622         } else if (ops->ndo_get_stats) {
5623                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5624         } else {
5625                 netdev_stats_to_stats64(storage, &dev->stats);
5626         }
5627         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5628         return storage;
5629 }
5630 EXPORT_SYMBOL(dev_get_stats);
5631
/*
 * Return the ingress queue of @dev. With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, initialised with both qdisc pointers set to
 * noop_qdisc, and published in dev->ingress_queue; NULL is returned if
 * that allocation fails. Without CONFIG_NET_CLS_ACT the result of
 * dev_ingress_queue() is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			queue->qdisc = &noop_qdisc;
			queue->qdisc_sleeping = &noop_qdisc;
			/* Publish only once fully initialised. */
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
5649
5650 static const struct ethtool_ops default_ethtool_ops;
5651
5652 void netdev_set_default_ethtool_ops(struct net_device *dev,
5653                                     const struct ethtool_ops *ops)
5654 {
5655         if (dev->ethtool_ops == &default_ethtool_ops)
5656                 dev->ethtool_ops = ops;
5657 }
5658 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
5659
5660 /**
5661  *      alloc_netdev_mqs - allocate network device
5662  *      @sizeof_priv:   size of private data to allocate space for
5663  *      @name:          device name format string
5664  *      @setup:         callback to initialize device
5665  *      @txqs:          the number of TX subqueues to allocate
5666  *      @rxqs:          the number of RX subqueues to allocate
5667  *
5668  *      Allocates a struct net_device with private data area for driver use
5669  *      and performs basic initialization.  Also allocates subquue structs
5670  *      for each queue on the device.
5671  */
5672 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5673                 void (*setup)(struct net_device *),
5674                 unsigned int txqs, unsigned int rxqs)
5675 {
5676         struct net_device *dev;
5677         size_t alloc_size;
5678         struct net_device *p;
5679
5680         BUG_ON(strlen(name) >= sizeof(dev->name));
5681
5682         if (txqs < 1) {
5683                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5684                 return NULL;
5685         }
5686
5687 #ifdef CONFIG_RPS
5688         if (rxqs < 1) {
5689                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5690                 return NULL;
5691         }
5692 #endif
5693
5694         alloc_size = sizeof(struct net_device);
5695         if (sizeof_priv) {
5696                 /* ensure 32-byte alignment of private area */
5697                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5698                 alloc_size += sizeof_priv;
5699         }
5700         /* ensure 32-byte alignment of whole construct */
5701         alloc_size += NETDEV_ALIGN - 1;
5702
5703         p = kzalloc(alloc_size, GFP_KERNEL);
5704         if (!p)
5705                 return NULL;
5706
5707         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5708         dev->padded = (char *)dev - (char *)p;
5709
5710         dev->pcpu_refcnt = alloc_percpu(int);
5711         if (!dev->pcpu_refcnt)
5712                 goto free_p;
5713
5714         if (dev_addr_init(dev))
5715                 goto free_pcpu;
5716
5717         dev_mc_init(dev);
5718         dev_uc_init(dev);
5719
5720         dev_net_set(dev, &init_net);
5721
5722         dev->gso_max_size = GSO_MAX_SIZE;
5723         dev->gso_max_segs = GSO_MAX_SEGS;
5724
5725         INIT_LIST_HEAD(&dev->napi_list);
5726         INIT_LIST_HEAD(&dev->unreg_list);
5727         INIT_LIST_HEAD(&dev->link_watch_list);
5728         INIT_LIST_HEAD(&dev->upper_dev_list);
5729         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5730         setup(dev);
5731
5732         dev->num_tx_queues = txqs;
5733         dev->real_num_tx_queues = txqs;
5734         if (netif_alloc_netdev_queues(dev))
5735                 goto free_all;
5736
5737 #ifdef CONFIG_RPS
5738         dev->num_rx_queues = rxqs;
5739         dev->real_num_rx_queues = rxqs;
5740         if (netif_alloc_rx_queues(dev))
5741                 goto free_all;
5742 #endif
5743
5744         strcpy(dev->name, name);
5745         dev->group = INIT_NETDEV_GROUP;
5746         if (!dev->ethtool_ops)
5747                 dev->ethtool_ops = &default_ethtool_ops;
5748         return dev;
5749
5750 free_all:
5751         free_netdev(dev);
5752         return NULL;
5753
5754 free_pcpu:
5755         free_percpu(dev->pcpu_refcnt);
5756         kfree(dev->_tx);
5757 #ifdef CONFIG_RPS
5758         kfree(dev->_rx);
5759 #endif
5760
5761 free_p:
5762         kfree(p);
5763         return NULL;
5764 }
5765 EXPORT_SYMBOL(alloc_netdev_mqs);
5766
5767 /**
5768  *      free_netdev - free network device
5769  *      @dev: device
5770  *
5771  *      This function does the last stage of destroying an allocated device
5772  *      interface. The reference to the device object is released.
5773  *      If this is the last reference then it will be freed.
5774  */
5775 void free_netdev(struct net_device *dev)
5776 {
5777         struct napi_struct *p, *n;
5778
5779         release_net(dev_net(dev));
5780
5781         kfree(dev->_tx);
5782 #ifdef CONFIG_RPS
5783         kfree(dev->_rx);
5784 #endif
5785
5786         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5787
5788         /* Flush device addresses */
5789         dev_addr_flush(dev);
5790
5791         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5792                 netif_napi_del(p);
5793
5794         free_percpu(dev->pcpu_refcnt);
5795         dev->pcpu_refcnt = NULL;
5796
5797         /*  Compatibility with error handling in drivers */
5798         if (dev->reg_state == NETREG_UNINITIALIZED) {
5799                 kfree((char *)dev - dev->padded);
5800                 return;
5801         }
5802
5803         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5804         dev->reg_state = NETREG_RELEASED;
5805
5806         /* will free via device release */
5807         put_device(&dev->dev);
5808 }
5809 EXPORT_SYMBOL(free_netdev);
5810
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	May sleep. The expedited RCU grace period is used when the rtnl
 *	lock is held, the normal one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
5826
5827 /**
5828  *      unregister_netdevice_queue - remove device from the kernel
5829  *      @dev: device
5830  *      @head: list
5831  *
5832  *      This function shuts down a device interface and removes it
5833  *      from the kernel tables.
5834  *      If head not NULL, device is queued to be unregistered later.
5835  *
5836  *      Callers must hold the rtnl semaphore.  You may want
5837  *      unregister_netdev() instead of this.
5838  */
5839
5840 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5841 {
5842         ASSERT_RTNL();
5843
5844         if (head) {
5845                 list_move_tail(&dev->unreg_list, head);
5846         } else {
5847                 rollback_registered(dev);
5848                 /* Finish processing unregister after unlock */
5849                 net_set_todo(dev);
5850         }
5851 }
5852 EXPORT_SYMBOL(unregister_netdevice_queue);
5853
5854 /**
5855  *      unregister_netdevice_many - unregister many devices
5856  *      @head: list of devices
5857  */
5858 void unregister_netdevice_many(struct list_head *head)
5859 {
5860         struct net_device *dev;
5861
5862         if (!list_empty(head)) {
5863                 rollback_registered_many(head);
5864                 list_for_each_entry(dev, head, unreg_list)
5865                         net_set_todo(dev);
5866         }
5867 }
5868 EXPORT_SYMBOL(unregister_netdevice_many);
5869
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	Locked convenience wrapper: takes the rtnl semaphore, calls
 *	unregister_netdevice() and releases the semaphore again. In
 *	general you want to use this and not unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
5888
5889 /**
5890  *      dev_change_net_namespace - move device to different nethost namespace
5891  *      @dev: device
5892  *      @net: network namespace
5893  *      @pat: If not NULL name pattern to try if the current device name
5894  *            is already taken in the destination network namespace.
5895  *
5896  *      This function shuts down a device interface and moves it
5897  *      to a new network namespace. On success 0 is returned, on
5898  *      a failure a netagive errno code is returned.
5899  *
5900  *      Callers must hold the rtnl semaphore.
5901  */
5902
5903 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5904 {
5905         int err;
5906
5907         ASSERT_RTNL();
5908
5909         /* Don't allow namespace local devices to be moved. */
5910         err = -EINVAL;
5911         if (dev->features & NETIF_F_NETNS_LOCAL)
5912                 goto out;
5913
5914         /* Ensure the device has been registrered */
5915         if (dev->reg_state != NETREG_REGISTERED)
5916                 goto out;
5917
5918         /* Get out if there is nothing todo */
5919         err = 0;
5920         if (net_eq(dev_net(dev), net))
5921                 goto out;
5922
5923         /* Pick the destination device name, and ensure
5924          * we can use it in the destination network namespace.
5925          */
5926         err = -EEXIST;
5927         if (__dev_get_by_name(net, dev->name)) {
5928                 /* We get here if we can't use the current device name */
5929                 if (!pat)
5930                         goto out;
5931                 if (dev_get_valid_name(net, dev, pat) < 0)
5932                         goto out;
5933         }
5934
5935         /*
5936          * And now a mini version of register_netdevice unregister_netdevice.
5937          */
5938
5939         /* If device is running close it first. */
5940         dev_close(dev);
5941
5942         /* And unlink it from device chain */
5943         err = -ENODEV;
5944         unlist_netdevice(dev);
5945
5946         synchronize_net();
5947
5948         /* Shutdown queueing discipline. */
5949         dev_shutdown(dev);
5950
5951         /* Notify protocols, that we are about to destroy
5952            this device. They should clean all the things.
5953
5954            Note that dev->reg_state stays at NETREG_REGISTERED.
5955            This is wanted because this way 8021q and macvlan know
5956            the device is just moving and can keep their slaves up.
5957         */
5958         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5959         rcu_barrier();
5960         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5961         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5962
5963         /*
5964          *      Flush the unicast and multicast chains
5965          */
5966         dev_uc_flush(dev);
5967         dev_mc_flush(dev);
5968
5969         /* Send a netdev-removed uevent to the old namespace */
5970         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
5971
5972         /* Actually switch the network namespace */
5973         dev_net_set(dev, net);
5974
5975         /* If there is an ifindex conflict assign a new one */
5976         if (__dev_get_by_index(net, dev->ifindex)) {
5977                 int iflink = (dev->iflink == dev->ifindex);
5978                 dev->ifindex = dev_new_index(net);
5979                 if (iflink)
5980                         dev->iflink = dev->ifindex;
5981         }
5982
5983         /* Send a netdev-add uevent to the new namespace */
5984         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
5985
5986         /* Fixup kobjects */
5987         err = device_rename(&dev->dev, dev->name);
5988         WARN_ON(err);
5989
5990         /* Add the device back in the hashes */
5991         list_netdevice(dev);
5992
5993         /* Notify protocols, that a new device appeared. */
5994         call_netdevice_notifiers(NETDEV_REGISTER, dev);
5995
5996         /*
5997          *      Prevent userspace races by waiting until the network
5998          *      device is fully setup before sending notifications.
5999          */
6000         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6001
6002         synchronize_net();
6003         err = 0;
6004 out:
6005         return err;
6006 }
6007 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6008
6009 static int dev_cpu_callback(struct notifier_block *nfb,
6010                             unsigned long action,
6011                             void *ocpu)
6012 {
6013         struct sk_buff **list_skb;
6014         struct sk_buff *skb;
6015         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6016         struct softnet_data *sd, *oldsd;
6017
6018         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6019                 return NOTIFY_OK;
6020
6021         local_irq_disable();
6022         cpu = smp_processor_id();
6023         sd = &per_cpu(softnet_data, cpu);
6024         oldsd = &per_cpu(softnet_data, oldcpu);
6025
6026         /* Find end of our completion_queue. */
6027         list_skb = &sd->completion_queue;
6028         while (*list_skb)
6029                 list_skb = &(*list_skb)->next;
6030         /* Append completion queue from offline CPU. */
6031         *list_skb = oldsd->completion_queue;
6032         oldsd->completion_queue = NULL;
6033
6034         /* Append output queue from offline CPU. */
6035         if (oldsd->output_queue) {
6036                 *sd->output_queue_tailp = oldsd->output_queue;
6037                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6038                 oldsd->output_queue = NULL;
6039                 oldsd->output_queue_tailp = &oldsd->output_queue;
6040         }
6041         /* Append NAPI poll list from offline CPU. */
6042         if (!list_empty(&oldsd->poll_list)) {
6043                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6044                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6045         }
6046
6047         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6048         local_irq_enable();
6049
6050         /* Process offline CPU's input_pkt_queue */
6051         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6052                 netif_rx(skb);
6053                 input_queue_head_incr(oldsd);
6054         }
6055         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6056                 netif_rx(skb);
6057                 input_queue_head_incr(oldsd);
6058         }
6059
6060         return NOTIFY_OK;
6061 }
6062
6063
6064 /**
6065  *      netdev_increment_features - increment feature set by one
6066  *      @all: current feature set
6067  *      @one: new feature set
6068  *      @mask: mask feature set
6069  *
6070  *      Computes a new feature set after adding a device with feature set
6071  *      @one to the master device with current feature set @all.  Will not
6072  *      enable anything that is off in @mask. Returns the new feature set.
6073  */
6074 netdev_features_t netdev_increment_features(netdev_features_t all,
6075         netdev_features_t one, netdev_features_t mask)
6076 {
6077         if (mask & NETIF_F_GEN_CSUM)
6078                 mask |= NETIF_F_ALL_CSUM;
6079         mask |= NETIF_F_VLAN_CHALLENGED;
6080
6081         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6082         all &= one | ~NETIF_F_ALL_FOR_ALL;
6083
6084         /* If one device supports hw checksumming, set for all. */
6085         if (all & NETIF_F_GEN_CSUM)
6086                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6087
6088         return all;
6089 }
6090 EXPORT_SYMBOL(netdev_increment_features);
6091
6092 static struct hlist_head *netdev_create_hash(void)
6093 {
6094         int i;
6095         struct hlist_head *hash;
6096
6097         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6098         if (hash != NULL)
6099                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6100                         INIT_HLIST_HEAD(&hash[i]);
6101
6102         return hash;
6103 }
6104
6105 /* Initialize per network namespace state */
6106 static int __net_init netdev_init(struct net *net)
6107 {
6108         if (net != &init_net)
6109                 INIT_LIST_HEAD(&net->dev_base_head);
6110
6111         net->dev_name_head = netdev_create_hash();
6112         if (net->dev_name_head == NULL)
6113                 goto err_name;
6114
6115         net->dev_index_head = netdev_create_hash();
6116         if (net->dev_index_head == NULL)
6117                 goto err_idx;
6118
6119         return 0;
6120
6121 err_idx:
6122         kfree(net->dev_name_head);
6123 err_name:
6124         return -ENOMEM;
6125 }
6126
6127 /**
6128  *      netdev_drivername - network driver for the device
6129  *      @dev: network device
6130  *
6131  *      Determine network driver for device.
6132  */
6133 const char *netdev_drivername(const struct net_device *dev)
6134 {
6135         const struct device_driver *driver;
6136         const struct device *parent;
6137         const char *empty = "";
6138
6139         parent = dev->dev.parent;
6140         if (!parent)
6141                 return empty;
6142
6143         driver = parent->driver;
6144         if (driver && driver->name)
6145                 return driver->name;
6146         return empty;
6147 }
6148
6149 static int __netdev_printk(const char *level, const struct net_device *dev,
6150                            struct va_format *vaf)
6151 {
6152         int r;
6153
6154         if (dev && dev->dev.parent) {
6155                 r = dev_printk_emit(level[1] - '0',
6156                                     dev->dev.parent,
6157                                     "%s %s %s: %pV",
6158                                     dev_driver_string(dev->dev.parent),
6159                                     dev_name(dev->dev.parent),
6160                                     netdev_name(dev), vaf);
6161         } else if (dev) {
6162                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6163         } else {
6164                 r = printk("%s(NULL net_device): %pV", level, vaf);
6165         }
6166
6167         return r;
6168 }
6169
6170 int netdev_printk(const char *level, const struct net_device *dev,
6171                   const char *format, ...)
6172 {
6173         struct va_format vaf;
6174         va_list args;
6175         int r;
6176
6177         va_start(args, format);
6178
6179         vaf.fmt = format;
6180         vaf.va = &args;
6181
6182         r = __netdev_printk(level, dev, &vaf);
6183
6184         va_end(args);
6185
6186         return r;
6187 }
6188 EXPORT_SYMBOL(netdev_printk);
6189
6190 #define define_netdev_printk_level(func, level)                 \
6191 int func(const struct net_device *dev, const char *fmt, ...)    \
6192 {                                                               \
6193         int r;                                                  \
6194         struct va_format vaf;                                   \
6195         va_list args;                                           \
6196                                                                 \
6197         va_start(args, fmt);                                    \
6198                                                                 \
6199         vaf.fmt = fmt;                                          \
6200         vaf.va = &args;                                         \
6201                                                                 \
6202         r = __netdev_printk(level, dev, &vaf);                  \
6203                                                                 \
6204         va_end(args);                                           \
6205                                                                 \
6206         return r;                                               \
6207 }                                                               \
6208 EXPORT_SYMBOL(func);
6209
6210 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6211 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6212 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6213 define_netdev_printk_level(netdev_err, KERN_ERR);
6214 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6215 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6216 define_netdev_printk_level(netdev_info, KERN_INFO);
6217
6218 static void __net_exit netdev_exit(struct net *net)
6219 {
6220         kfree(net->dev_name_head);
6221         kfree(net->dev_index_head);
6222 }
6223
6224 static struct pernet_operations __net_initdata netdev_net_ops = {
6225         .init = netdev_init,
6226         .exit = netdev_exit,
6227 };
6228
6229 static void __net_exit default_device_exit(struct net *net)
6230 {
6231         struct net_device *dev, *aux;
6232         /*
6233          * Push all migratable network devices back to the
6234          * initial network namespace
6235          */
6236         rtnl_lock();
6237         for_each_netdev_safe(net, dev, aux) {
6238                 int err;
6239                 char fb_name[IFNAMSIZ];
6240
6241                 /* Ignore unmoveable devices (i.e. loopback) */
6242                 if (dev->features & NETIF_F_NETNS_LOCAL)
6243                         continue;
6244
6245                 /* Leave virtual devices for the generic cleanup */
6246                 if (dev->rtnl_link_ops)
6247                         continue;
6248
6249                 /* Push remaining network devices to init_net */
6250                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6251                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6252                 if (err) {
6253                         pr_emerg("%s: failed to move %s to init_net: %d\n",
6254                                  __func__, dev->name, err);
6255                         BUG();
6256                 }
6257         }
6258         rtnl_unlock();
6259 }
6260
6261 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6262 {
6263         /* At exit all network devices most be removed from a network
6264          * namespace.  Do this in the reverse order of registration.
6265          * Do this across as many network namespaces as possible to
6266          * improve batching efficiency.
6267          */
6268         struct net_device *dev;
6269         struct net *net;
6270         LIST_HEAD(dev_kill_list);
6271
6272         rtnl_lock();
6273         list_for_each_entry(net, net_list, exit_list) {
6274                 for_each_netdev_reverse(net, dev) {
6275                         if (dev->rtnl_link_ops)
6276                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6277                         else
6278                                 unregister_netdevice_queue(dev, &dev_kill_list);
6279                 }
6280         }
6281         unregister_netdevice_many(&dev_kill_list);
6282         list_del(&dev_kill_list);
6283         rtnl_unlock();
6284 }
6285
6286 static struct pernet_operations __net_initdata default_device_ops = {
6287         .exit = default_device_exit,
6288         .exit_batch = default_device_exit_batch,
6289 };
6290
6291 /*
6292  *      Initialize the DEV module. At boot time this walks the device list and
6293  *      unhooks any devices that fail to initialise (normally hardware not
6294  *      present) and leaves us with a valid list of present and active devices.
6295  *
6296  */
6297
6298 /*
6299  *       This is called single threaded during boot, so no need
6300  *       to take the rtnl semaphore.
6301  */
6302 static int __init net_dev_init(void)
6303 {
6304         int i, rc = -ENOMEM;
6305
6306         BUG_ON(!dev_boot_phase);
6307
6308         if (dev_proc_init())
6309                 goto out;
6310
6311         if (netdev_kobject_init())
6312                 goto out;
6313
6314         INIT_LIST_HEAD(&ptype_all);
6315         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6316                 INIT_LIST_HEAD(&ptype_base[i]);
6317
6318         INIT_LIST_HEAD(&offload_base);
6319
6320         if (register_pernet_subsys(&netdev_net_ops))
6321                 goto out;
6322
6323         /*
6324          *      Initialise the packet receive queues.
6325          */
6326
6327         for_each_possible_cpu(i) {
6328                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6329
6330                 memset(sd, 0, sizeof(*sd));
6331                 skb_queue_head_init(&sd->input_pkt_queue);
6332                 skb_queue_head_init(&sd->process_queue);
6333                 sd->completion_queue = NULL;
6334                 INIT_LIST_HEAD(&sd->poll_list);
6335                 sd->output_queue = NULL;
6336                 sd->output_queue_tailp = &sd->output_queue;
6337 #ifdef CONFIG_RPS
6338                 sd->csd.func = rps_trigger_softirq;
6339                 sd->csd.info = sd;
6340                 sd->csd.flags = 0;
6341                 sd->cpu = i;
6342 #endif
6343
6344                 sd->backlog.poll = process_backlog;
6345                 sd->backlog.weight = weight_p;
6346                 sd->backlog.gro_list = NULL;
6347                 sd->backlog.gro_count = 0;
6348
6349 #ifdef CONFIG_NET_FLOW_LIMIT
6350                 sd->flow_limit = NULL;
6351 #endif
6352         }
6353
6354         dev_boot_phase = 0;
6355
6356         /* The loopback device is special if any other network devices
6357          * is present in a network namespace the loopback device must
6358          * be present. Since we now dynamically allocate and free the
6359          * loopback device ensure this invariant is maintained by
6360          * keeping the loopback device as the first device on the
6361          * list of network devices.  Ensuring the loopback devices
6362          * is the first device that appears and the last network device
6363          * that disappears.
6364          */
6365         if (register_pernet_device(&loopback_net_ops))
6366                 goto out;
6367
6368         if (register_pernet_device(&default_device_ops))
6369                 goto out;
6370
6371         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6372         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6373
6374         hotcpu_notifier(dev_cpu_callback, 0);
6375         dst_init();
6376         rc = 0;
6377 out:
6378         return rc;
6379 }
6380
6381 subsys_initcall(net_dev_init);