net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/proc_fs.h>
 101 #include <linux/seq_file.h>
 102 #include <linux/stat.h>
 103 #include <net/dst.h>
 104 #include <net/pkt_sched.h>
 105 #include <net/checksum.h>
 106 #include <net/xfrm.h>
 107 #include <linux/highmem.h>
 108 #include <linux/init.h>
 109 #include <linux/kmod.h>
 110 #include <linux/module.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <net/wext.h>
 115 #include <net/iw_handler.h>
 116 #include <asm/current.h>
 117 #include <linux/audit.h>
 118 #include <linux/dmaengine.h>
 119 #include <linux/err.h>
 120 #include <linux/ctype.h>
 121 #include <linux/if_arp.h>
 122 #include <linux/if_vlan.h>
 123 #include <linux/ip.h>
 124 #include <net/ip.h>
 125 #include <linux/ipv6.h>
 126 #include <linux/in.h>
 127 #include <linux/jhash.h>
 128 #include <linux/random.h>
 129 #include <trace/events/napi.h>
 130 #include <trace/events/net.h>
 131 #include <trace/events/skb.h>
 132 #include <linux/pci.h>
 133 #include <linux/inetdevice.h>
 134 #include <linux/cpu_rmap.h>
 135 #include <linux/net_tstamp.h>
 136 #include <linux/static_key.h>
 137 #include <net/flow_keys.h>
 138
 139 #include "net-sysfs.h"
 140
 141 /* Instead of increasing this, you should create a hash table. */
 142 #define MAX_GRO_SKBS 8
 143
 144 /* This should be increased if a protocol with a bigger head is added. */
 145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
 147 /*
 148  *      The list of packet types we will receive (as opposed to discard)
 149  *      and the routines to invoke.
 150  *
 151  *      Why 16. Because with 16 the only overlap we get on a hash of the
 152  *      low nibble of the protocol value is RARP/SNAP/X.25.
 153  *
 154  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 155  *             sure which should go first, but I bet it won't make much
 156  *             difference if we are running VLANs.  The good news is that
 157  *             this protocol won't be in the list unless compiled in, so
 158  *             the average user (w/out VLANs) will not be adversely affected.
 159  *             --BLG
 160  *
 161  *              0800    IP
 162  *              8100    802.1Q VLAN
 163  *              0001    802.3
 164  *              0002    AX.25
 165  *              0004    802.2
 166  *              8035    RARP
 167  *              0005    SNAP
 168  *              0805    X.25
 169  *              0806    ARP
 170  *              8137    IPX
 171  *              0009    Localtalk
 172  *              86DD    IPv6
 173  */
 174
 175 #define PTYPE_HASH_SIZE (16)
 176 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 177
 178 static DEFINE_SPINLOCK(ptype_lock);
 179 static DEFINE_SPINLOCK(offload_lock);
 180 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 181 static struct list_head ptype_all __read_mostly;        /* Taps */
 182 static struct list_head offload_base __read_mostly;
 183
 184 /*
 185  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 186  * semaphore.
 187  *
 188  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 189  *
 190  * Writers must hold the rtnl semaphore while they loop through the
 191  * dev_base_head list, and hold dev_base_lock for writing when they do the
 192  * actual updates.  This allows pure readers to access the list even
 193  * while a writer is preparing to update it.
 194  *
 195  * To put it another way, dev_base_lock is held for writing only to
 196  * protect against pure readers; the rtnl semaphore provides the
 197  * protection against other writers.
 198  *
 199  * See, for example usages, register_netdevice() and
 200  * unregister_netdevice(), which must be called with the rtnl
 201  * semaphore held.
 202  */
 203 DEFINE_RWLOCK(dev_base_lock);
 204 EXPORT_SYMBOL(dev_base_lock);
 205
 206 static inline void dev_base_seq_inc(struct net *net)
 207 {
 208         while (++net->dev_base_seq == 0);
 209 }
 210
 211 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 212 {
 213         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 214
 215         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 216 }
 217
 218 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 219 {
 220         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 221 }
 222
 223 static inline void rps_lock(struct softnet_data *sd)
 224 {
 225 #ifdef CONFIG_RPS
 226         spin_lock(&sd->input_pkt_queue.lock);
 227 #endif
 228 }
 229
 230 static inline void rps_unlock(struct softnet_data *sd)
 231 {
 232 #ifdef CONFIG_RPS
 233         spin_unlock(&sd->input_pkt_queue.lock);
 234 #endif
 235 }
 236
 237 /* Device list insertion */
 238 static int list_netdevice(struct net_device *dev)
 239 {
 240         struct net *net = dev_net(dev);
 241
 242         ASSERT_RTNL();
 243
 244         write_lock_bh(&dev_base_lock);
 245         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 246         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 247         hlist_add_head_rcu(&dev->index_hlist,
 248                            dev_index_hash(net, dev->ifindex));
 249         write_unlock_bh(&dev_base_lock);
 250
 251         dev_base_seq_inc(net);
 252
 253         return 0;
 254 }
 255
 256 /* Device list removal
 257  * caller must respect a RCU grace period before freeing/reusing dev
 258  */
 259 static void unlist_netdevice(struct net_device *dev)
 260 {
 261         ASSERT_RTNL();
 262
 263         /* Unlink dev from the device chain */
 264         write_lock_bh(&dev_base_lock);
 265         list_del_rcu(&dev->dev_list);
 266         hlist_del_rcu(&dev->name_hlist);
 267         hlist_del_rcu(&dev->index_hlist);
 268         write_unlock_bh(&dev_base_lock);
 269
 270         dev_base_seq_inc(dev_net(dev));
 271 }
 272
 273 /*
 274  *      Our notifier list
 275  */
 276
 277 static RAW_NOTIFIER_HEAD(netdev_chain);
 278
 279 /*
 280  *      Device drivers call our routines to queue packets here. We empty the
 281  *      queue in the local softnet handler.
 282  */
 283
 284 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 285 EXPORT_PER_CPU_SYMBOL(softnet_data);
 286
 287 #ifdef CONFIG_LOCKDEP
 288 /*
 289  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 290  * according to dev->type
 291  */
 292 static const unsigned short netdev_lock_type[] =
 293         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 294          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 295          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 296          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 297          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 298          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 299          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 300          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 301          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 302          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 303          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 304          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 305          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 306          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 307          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 308
 309 static const char *const netdev_lock_name[] =
 310         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 311          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 312          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 313          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 314          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 315          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 316          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 317          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 318          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 319          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 320          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 321          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 322          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 323          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 324          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 325
 326 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 327 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 328
 329 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 330 {
 331         int i;
 332
 333         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 334                 if (netdev_lock_type[i] == dev_type)
 335                         return i;
 336         /* the last key is used by default */
 337         return ARRAY_SIZE(netdev_lock_type) - 1;
 338 }
 339
 340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 341                                                  unsigned short dev_type)
 342 {
 343         int i;
 344
 345         i = netdev_lock_pos(dev_type);
 346         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 347                                    netdev_lock_name[i]);
 348 }
 349
 350 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 351 {
 352         int i;
 353
 354         i = netdev_lock_pos(dev->type);
 355         lockdep_set_class_and_name(&dev->addr_list_lock,
 356                                    &netdev_addr_lock_key[i],
 357                                    netdev_lock_name[i]);
 358 }
 359 #else
 360 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 361                                                  unsigned short dev_type)
 362 {
 363 }
 364 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 365 {
 366 }
 367 #endif
 368
 369 /*******************************************************************************
 370
 371                 Protocol management and registration routines
 372
 373 *******************************************************************************/
 374
 375 /*
 376  *      Add a protocol ID to the list. Now that the input handler is
 377  *      smarter we can dispense with all the messy stuff that used to be
 378  *      here.
 379  *
 380  *      BEWARE!!! Protocol handlers, mangling input packets,
 381  *      MUST BE last in hash buckets and checking protocol handlers
 382  *      MUST start from promiscuous ptype_all chain in net_bh.
 383  *      It is true now, do not change it.
 384  *      Explanation follows: if protocol handler, mangling packet, will
 385  *      be the first on list, it is not able to sense, that packet
 386  *      is cloned and should be copied-on-write, so that it will
 387  *      change it and subsequent readers will get broken packet.
 388  *                                                      --ANK (980803)
 389  */
 390
 391 static inline struct list_head *ptype_head(const struct packet_type *pt)
 392 {
 393         if (pt->type == htons(ETH_P_ALL))
 394                 return &ptype_all;
 395         else
 396                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 397 }
 398
 399 /**
 400  *      dev_add_pack - add packet handler
 401  *      @pt: packet type declaration
 402  *
 403  *      Add a protocol handler to the networking stack. The passed &packet_type
 404  *      is linked into kernel lists and may not be freed until it has been
 405  *      removed from the kernel lists.
 406  *
 407  *      This call does not sleep therefore it can not
 408  *      guarantee all CPU's that are in middle of receiving packets
 409  *      will see the new packet type (until the next received packet).
 410  */
 411
 412 void dev_add_pack(struct packet_type *pt)
 413 {
 414         struct list_head *head = ptype_head(pt);
 415
 416         spin_lock(&ptype_lock);
 417         list_add_rcu(&pt->list, head);
 418         spin_unlock(&ptype_lock);
 419 }
 420 EXPORT_SYMBOL(dev_add_pack);
 421
 422 /**
 423  *      __dev_remove_pack        - remove packet handler
 424  *      @pt: packet type declaration
 425  *
 426  *      Remove a protocol handler that was previously added to the kernel
 427  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 428  *      from the kernel lists and can be freed or reused once this function
 429  *      returns.
 430  *
 431  *      The packet type might still be in use by receivers
 432  *      and must not be freed until after all the CPU's have gone
 433  *      through a quiescent state.
 434  */
 435 void __dev_remove_pack(struct packet_type *pt)
 436 {
 437         struct list_head *head = ptype_head(pt);
 438         struct packet_type *pt1;
 439
 440         spin_lock(&ptype_lock);
 441
 442         list_for_each_entry(pt1, head, list) {
 443                 if (pt == pt1) {
 444                         list_del_rcu(&pt->list);
 445                         goto out;
 446                 }
 447         }
 448
 449         pr_warn("dev_remove_pack: %p not found\n", pt);
 450 out:
 451         spin_unlock(&ptype_lock);
 452 }
 453 EXPORT_SYMBOL(__dev_remove_pack);
 454
 455 /**
 456  *      dev_remove_pack  - remove packet handler
 457  *      @pt: packet type declaration
 458  *
 459  *      Remove a protocol handler that was previously added to the kernel
 460  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 461  *      from the kernel lists and can be freed or reused once this function
 462  *      returns.
 463  *
 464  *      This call sleeps to guarantee that no CPU is looking at the packet
 465  *      type after return.
 466  */
 467 void dev_remove_pack(struct packet_type *pt)
 468 {
 469         __dev_remove_pack(pt);
 470
 471         synchronize_net();
 472 }
 473 EXPORT_SYMBOL(dev_remove_pack);
 474
 475
 476 /**
 477  *      dev_add_offload - register offload handlers
 478  *      @po: protocol offload declaration
 479  *
 480  *      Add protocol offload handlers to the networking stack. The passed
 481  *      &proto_offload is linked into kernel lists and may not be freed until
 482  *      it has been removed from the kernel lists.
 483  *
 484  *      This call does not sleep therefore it can not
 485  *      guarantee all CPU's that are in middle of receiving packets
 486  *      will see the new offload handlers (until the next received packet).
 487  */
 488 void dev_add_offload(struct packet_offload *po)
 489 {
 490         struct list_head *head = &offload_base;
 491
 492         spin_lock(&offload_lock);
 493         list_add_rcu(&po->list, head);
 494         spin_unlock(&offload_lock);
 495 }
 496 EXPORT_SYMBOL(dev_add_offload);
 497
 498 /**
 499  *      __dev_remove_offload     - remove offload handler
 500  *      @po: packet offload declaration
 501  *
 502  *      Remove a protocol offload handler that was previously added to the
 503  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 504  *      is removed from the kernel lists and can be freed or reused once this
 505  *      function returns.
 506  *
 507  *      The packet type might still be in use by receivers
 508  *      and must not be freed until after all the CPU's have gone
 509  *      through a quiescent state.
 510  */
 511 void __dev_remove_offload(struct packet_offload *po)
 512 {
 513         struct list_head *head = &offload_base;
 514         struct packet_offload *po1;
 515
 516         spin_lock(&offload_lock);
 517
 518         list_for_each_entry(po1, head, list) {
 519                 if (po == po1) {
 520                         list_del_rcu(&po->list);
 521                         goto out;
 522                 }
 523         }
 524
 525         pr_warn("dev_remove_offload: %p not found\n", po);
 526 out:
 527         spin_unlock(&offload_lock);
 528 }
 529 EXPORT_SYMBOL(__dev_remove_offload);
 530
 531 /**
 532  *      dev_remove_offload       - remove packet offload handler
 533  *      @po: packet offload declaration
 534  *
 535  *      Remove a packet offload handler that was previously added to the kernel
 536  *      offload handlers by dev_add_offload(). The passed &offload_type is
 537  *      removed from the kernel lists and can be freed or reused once this
 538  *      function returns.
 539  *
 540  *      This call sleeps to guarantee that no CPU is looking at the packet
 541  *      type after return.
 542  */
 543 void dev_remove_offload(struct packet_offload *po)
 544 {
 545         __dev_remove_offload(po);
 546
 547         synchronize_net();
 548 }
 549 EXPORT_SYMBOL(dev_remove_offload);
 550
 551 /******************************************************************************
 552
 553                       Device Boot-time Settings Routines
 554
 555 *******************************************************************************/
 556
 557 /* Boot time configuration table */
 558 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 559
 560 /**
 561  *      netdev_boot_setup_add   - add new setup entry
 562  *      @name: name of the device
 563  *      @map: configured settings for the device
 564  *
 565  *      Adds new setup entry to the dev_boot_setup list.  The function
 566  *      returns 0 on error and 1 on success.  This is a generic routine to
 567  *      all netdevices.
 568  */
 569 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 570 {
 571         struct netdev_boot_setup *s;
 572         int i;
 573
 574         s = dev_boot_setup;
 575         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 576                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 577                         memset(s[i].name, 0, sizeof(s[i].name));
 578                         strlcpy(s[i].name, name, IFNAMSIZ);
 579                         memcpy(&s[i].map, map, sizeof(s[i].map));
 580                         break;
 581                 }
 582         }
 583
 584         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 585 }
 586
 587 /**
 588  *      netdev_boot_setup_check - check boot time settings
 589  *      @dev: the netdevice
 590  *
 591  *      Check boot time settings for the device.
 592  *      The found settings are set for the device to be used
 593  *      later in the device probing.
 594  *      Returns 0 if no settings found, 1 if they are.
 595  */
 596 int netdev_boot_setup_check(struct net_device *dev)
 597 {
 598         struct netdev_boot_setup *s = dev_boot_setup;
 599         int i;
 600
 601         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 602                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 603                     !strcmp(dev->name, s[i].name)) {
 604                         dev->irq        = s[i].map.irq;
 605                         dev->base_addr  = s[i].map.base_addr;
 606                         dev->mem_start  = s[i].map.mem_start;
 607                         dev->mem_end    = s[i].map.mem_end;
 608                         return 1;
 609                 }
 610         }
 611         return 0;
 612 }
 613 EXPORT_SYMBOL(netdev_boot_setup_check);
 614
 615
 616 /**
 617  *      netdev_boot_base        - get address from boot time settings
 618  *      @prefix: prefix for network device
 619  *      @unit: id for network device
 620  *
 621  *      Check boot time settings for the base address of device.
 622  *      The found settings are set for the device to be used
 623  *      later in the device probing.
 624  *      Returns 0 if no settings found.
 625  */
 626 unsigned long netdev_boot_base(const char *prefix, int unit)
 627 {
 628         const struct netdev_boot_setup *s = dev_boot_setup;
 629         char name[IFNAMSIZ];
 630         int i;
 631
 632         sprintf(name, "%s%d", prefix, unit);
 633
 634         /*
 635          * If device already registered then return base of 1
 636          * to indicate not to probe for this interface
 637          */
 638         if (__dev_get_by_name(&init_net, name))
 639                 return 1;
 640
 641         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 642                 if (!strcmp(name, s[i].name))
 643                         return s[i].map.base_addr;
 644         return 0;
 645 }
 646
 647 /*
 648  * Saves at boot time configured settings for any netdevice.
 649  */
 650 int __init netdev_boot_setup(char *str)
 651 {
 652         int ints[5];
 653         struct ifmap map;
 654
 655         str = get_options(str, ARRAY_SIZE(ints), ints);
 656         if (!str || !*str)
 657                 return 0;
 658
 659         /* Save settings */
 660         memset(&map, 0, sizeof(map));
 661         if (ints[0] > 0)
 662                 map.irq = ints[1];
 663         if (ints[0] > 1)
 664                 map.base_addr = ints[2];
 665         if (ints[0] > 2)
 666                 map.mem_start = ints[3];
 667         if (ints[0] > 3)
 668                 map.mem_end = ints[4];
 669
 670         /* Add new entry to the list */
 671         return netdev_boot_setup_add(str, &map);
 672 }
 673
 674 __setup("netdev=", netdev_boot_setup);
 675
 676 /*******************************************************************************
 677
 678                             Device Interface Subroutines
 679
 680 *******************************************************************************/
 681
 682 /**
 683  *      __dev_get_by_name       - find a device by its name
 684  *      @net: the applicable net namespace
 685  *      @name: name to find
 686  *
 687  *      Find an interface by name. Must be called under RTNL semaphore
 688  *      or @dev_base_lock. If the name is found a pointer to the device
 689  *      is returned. If the name is not found then %NULL is returned. The
 690  *      reference counters are not incremented so the caller must be
 691  *      careful with locks.
 692  */
 693
 694 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 695 {
 696         struct hlist_node *p;
 697         struct net_device *dev;
 698         struct hlist_head *head = dev_name_hash(net, name);
 699
 700         hlist_for_each_entry(dev, p, head, name_hlist)
 701                 if (!strncmp(dev->name, name, IFNAMSIZ))
 702                         return dev;
 703
 704         return NULL;
 705 }
 706 EXPORT_SYMBOL(__dev_get_by_name);
 707
 708 /**
 709  *      dev_get_by_name_rcu     - find a device by its name
 710  *      @net: the applicable net namespace
 711  *      @name: name to find
 712  *
 713  *      Find an interface by name.
 714  *      If the name is found a pointer to the device is returned.
 715  *      If the name is not found then %NULL is returned.
 716  *      The reference counters are not incremented so the caller must be
 717  *      careful with locks. The caller must hold RCU lock.
 718  */
 719
 720 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 721 {
 722         struct hlist_node *p;
 723         struct net_device *dev;
 724         struct hlist_head *head = dev_name_hash(net, name);
 725
 726         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 727                 if (!strncmp(dev->name, name, IFNAMSIZ))
 728                         return dev;
 729
 730         return NULL;
 731 }
 732 EXPORT_SYMBOL(dev_get_by_name_rcu);
 733
 734 /**
 735  *      dev_get_by_name         - find a device by its name
 736  *      @net: the applicable net namespace
 737  *      @name: name to find
 738  *
 739  *      Find an interface by name. This can be called from any
 740  *      context and does its own locking. The returned handle has
 741  *      the usage count incremented and the caller must use dev_put() to
 742  *      release it when it is no longer needed. %NULL is returned if no
 743  *      matching device is found.
 744  */
 745
 746 struct net_device *dev_get_by_name(struct net *net, const char *name)
 747 {
 748         struct net_device *dev;
 749
 750         rcu_read_lock();
 751         dev = dev_get_by_name_rcu(net, name);
 752         if (dev)
 753                 dev_hold(dev);
 754         rcu_read_unlock();
 755         return dev;
 756 }
 757 EXPORT_SYMBOL(dev_get_by_name);
 758
 759 /**
 760  *      __dev_get_by_index - find a device by its ifindex
 761  *      @net: the applicable net namespace
 762  *      @ifindex: index of device
 763  *
 764  *      Search for an interface by index. Returns %NULL if the device
 765  *      is not found or a pointer to the device. The device has not
 766  *      had its reference counter increased so the caller must be careful
 767  *      about locking. The caller must hold either the RTNL semaphore
 768  *      or @dev_base_lock.
 769  */
 770
 771 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 772 {
 773         struct hlist_node *p;
 774         struct net_device *dev;
 775         struct hlist_head *head = dev_index_hash(net, ifindex);
 776
 777         hlist_for_each_entry(dev, p, head, index_hlist)
 778                 if (dev->ifindex == ifindex)
 779                         return dev;
 780
 781         return NULL;
 782 }
 783 EXPORT_SYMBOL(__dev_get_by_index);
 784
 785 /**
 786  *      dev_get_by_index_rcu - find a device by its ifindex
 787  *      @net: the applicable net namespace
 788  *      @ifindex: index of device
 789  *
 790  *      Search for an interface by index. Returns %NULL if the device
 791  *      is not found or a pointer to the device. The device has not
 792  *      had its reference counter increased so the caller must be careful
 793  *      about locking. The caller must hold RCU lock.
 794  */
 795
 796 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 797 {
 798         struct hlist_node *p;
 799         struct net_device *dev;
 800         struct hlist_head *head = dev_index_hash(net, ifindex);
 801
 802         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 803                 if (dev->ifindex == ifindex)
 804                         return dev;
 805
 806         return NULL;
 807 }
 808 EXPORT_SYMBOL(dev_get_by_index_rcu);
 809
 810
 811 /**
 812  *      dev_get_by_index - find a device by its ifindex
 813  *      @net: the applicable net namespace
 814  *      @ifindex: index of device
 815  *
 816  *      Search for an interface by index. Returns NULL if the device
 817  *      is not found or a pointer to the device. The device returned has
 818  *      had a reference added and the pointer is safe until the user calls
 819  *      dev_put to indicate they have finished with it.
 820  */
 821
 822 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 823 {
 824         struct net_device *dev;
 825
 826         rcu_read_lock();
 827         dev = dev_get_by_index_rcu(net, ifindex);
 828         if (dev)
 829                 dev_hold(dev);
 830         rcu_read_unlock();
 831         return dev;
 832 }
 833 EXPORT_SYMBOL(dev_get_by_index);
 834
 835 /**
 836  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 837  *      @net: the applicable net namespace
 838  *      @type: media type of device
 839  *      @ha: hardware address
 840  *
 841  *      Search for an interface by MAC address. Returns NULL if the device
 842  *      is not found or a pointer to the device.
 843  *      The caller must hold RCU or RTNL.
 844  *      The returned device has not had its ref count increased
 845  *      and the caller must therefore be careful about locking
 846  *
 847  */
 848
 849 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 850                                        const char *ha)
 851 {
 852         struct net_device *dev;
 853
 854         for_each_netdev_rcu(net, dev)
 855                 if (dev->type == type &&
 856                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 857                         return dev;
 858
 859         return NULL;
 860 }
 861 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 862
 863 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 864 {
 865         struct net_device *dev;
 866
 867         ASSERT_RTNL();
 868         for_each_netdev(net, dev)
 869                 if (dev->type == type)
 870                         return dev;
 871
 872         return NULL;
 873 }
 874 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 875
 876 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 877 {
 878         struct net_device *dev, *ret = NULL;
 879
 880         rcu_read_lock();
 881         for_each_netdev_rcu(net, dev)
 882                 if (dev->type == type) {
 883                         dev_hold(dev);
 884                         ret = dev;
 885                         break;
 886                 }
 887         rcu_read_unlock();
 888         return ret;
 889 }
 890 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 891
 892 /**
 893  *      dev_get_by_flags_rcu - find any device with given flags
 894  *      @net: the applicable net namespace
 895  *      @if_flags: IFF_* values
 896  *      @mask: bitmask of bits in if_flags to check
 897  *
 898  *      Search for any interface with the given flags. Returns NULL if a device
 899  *      is not found or a pointer to the device. Must be called inside
 900  *      rcu_read_lock(), and result refcount is unchanged.
 901  */
 902
 903 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 904                                     unsigned short mask)
 905 {
 906         struct net_device *dev, *ret;
 907
 908         ret = NULL;
 909         for_each_netdev_rcu(net, dev) {
 910                 if (((dev->flags ^ if_flags) & mask) == 0) {
 911                         ret = dev;
 912                         break;
 913                 }
 914         }
 915         return ret;
 916 }
 917 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 918
 919 /**
 920  *      dev_valid_name - check if name is okay for network device
 921  *      @name: name string
 922  *
 923  *      Network device names need to be valid file names to
 924  *      to allow sysfs to work.  We also disallow any kind of
 925  *      whitespace.
 926  */
 927 bool dev_valid_name(const char *name)
 928 {
 929         if (*name == '\0')
 930                 return false;
 931         if (strlen(name) >= IFNAMSIZ)
 932                 return false;
 933         if (!strcmp(name, ".") || !strcmp(name, ".."))
 934                 return false;
 935
 936         while (*name) {
 937                 if (*name == '/' || isspace(*name))
 938                         return false;
 939                 name++;
 940         }
 941         return true;
 942 }
 943 EXPORT_SYMBOL(dev_valid_name);
 944
 945 /**
 946  *      __dev_alloc_name - allocate a name for a device
 947  *      @net: network namespace to allocate the device name in
 948  *      @name: name format string
 949  *      @buf:  scratch buffer and result name string
 950  *
 951  *      Passed a format string - eg "lt%d" it will try and find a suitable
 952  *      id. It scans list of devices to build up a free map, then chooses
 953  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 954  *      while allocating the name and adding the device in order to avoid
 955  *      duplicates.
 956  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 957  *      Returns the number of the unit assigned or a negative errno code.
 958  */
 959
 960 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 961 {
 962         int i = 0;
 963         const char *p;
 964         const int max_netdevices = 8*PAGE_SIZE;
 965         unsigned long *inuse;
 966         struct net_device *d;
 967
 968         p = strnchr(name, IFNAMSIZ-1, '%');
 969         if (p) {
 970                 /*
 971                  * Verify the string as this thing may have come from
 972                  * the user.  There must be either one "%d" and no other "%"
 973                  * characters.
 974                  */
 975                 if (p[1] != 'd' || strchr(p + 2, '%'))
 976                         return -EINVAL;
 977
 978                 /* Use one page as a bit array of possible slots */
 979                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 980                 if (!inuse)
 981                         return -ENOMEM;
 982
 983                 for_each_netdev(net, d) {
 984                         if (!sscanf(d->name, name, &i))
 985                                 continue;
 986                         if (i < 0 || i >= max_netdevices)
 987                                 continue;
 988
 989                         /*  avoid cases where sscanf is not exact inverse of printf */
 990                         snprintf(buf, IFNAMSIZ, name, i);
 991                         if (!strncmp(buf, d->name, IFNAMSIZ))
 992                                 set_bit(i, inuse);
 993                 }
 994
 995                 i = find_first_zero_bit(inuse, max_netdevices);
 996                 free_page((unsigned long) inuse);
 997         }
 998
 999         if (buf != name)
1000                 snprintf(buf, IFNAMSIZ, name, i);
1001         if (!__dev_get_by_name(net, buf))
1002                 return i;
1003
1004         /* It is possible to run out of possible slots
1005          * when the name is long and there isn't enough space left
1006          * for the digits, or if all bits are used.
1007          */
1008         return -ENFILE;
1009 }
1010
1011 /**
1012  *      dev_alloc_name - allocate a name for a device
1013  *      @dev: device
1014  *      @name: name format string
1015  *
1016  *      Passed a format string - eg "lt%d" it will try and find a suitable
1017  *      id. It scans list of devices to build up a free map, then chooses
1018  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1019  *      while allocating the name and adding the device in order to avoid
1020  *      duplicates.
1021  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1022  *      Returns the number of the unit assigned or a negative errno code.
1023  */
1024
1025 int dev_alloc_name(struct net_device *dev, const char *name)
1026 {
1027         char buf[IFNAMSIZ];
1028         struct net *net;
1029         int ret;
1030
1031         BUG_ON(!dev_net(dev));
1032         net = dev_net(dev);
1033         ret = __dev_alloc_name(net, name, buf);
1034         if (ret >= 0)
1035                 strlcpy(dev->name, buf, IFNAMSIZ);
1036         return ret;
1037 }
1038 EXPORT_SYMBOL(dev_alloc_name);
1039
1040 static int dev_alloc_name_ns(struct net *net,
1041                              struct net_device *dev,
1042                              const char *name)
1043 {
1044         char buf[IFNAMSIZ];
1045         int ret;
1046
1047         ret = __dev_alloc_name(net, name, buf);
1048         if (ret >= 0)
1049                 strlcpy(dev->name, buf, IFNAMSIZ);
1050         return ret;
1051 }
1052
1053 static int dev_get_valid_name(struct net *net,
1054                               struct net_device *dev,
1055                               const char *name)
1056 {
1057         BUG_ON(!net);
1058
1059         if (!dev_valid_name(name))
1060                 return -EINVAL;
1061
1062         if (strchr(name, '%'))
1063                 return dev_alloc_name_ns(net, dev, name);
1064         else if (__dev_get_by_name(net, name))
1065                 return -EEXIST;
1066         else if (dev->name != name)
1067                 strlcpy(dev->name, name, IFNAMSIZ);
1068
1069         return 0;
1070 }
1071
1072 /**
1073  *      dev_change_name - change name of a device
1074  *      @dev: device
1075  *      @newname: name (or format string) must be at least IFNAMSIZ
1076  *
1077  *      Change name of a device, can pass format strings "eth%d".
1078  *      for wildcarding.
1079  */
1080 int dev_change_name(struct net_device *dev, const char *newname)
1081 {
1082         char oldname[IFNAMSIZ];
1083         int err = 0;
1084         int ret;
1085         struct net *net;
1086
1087         ASSERT_RTNL();
1088         BUG_ON(!dev_net(dev));
1089
1090         net = dev_net(dev);
1091         if (dev->flags & IFF_UP)
1092                 return -EBUSY;
1093
1094         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1095                 return 0;
1096
1097         memcpy(oldname, dev->name, IFNAMSIZ);
1098
1099         err = dev_get_valid_name(net, dev, newname);
1100         if (err < 0)
1101                 return err;
1102
1103 rollback:
1104         ret = device_rename(&dev->dev, dev->name);
1105         if (ret) {
1106                 memcpy(dev->name, oldname, IFNAMSIZ);
1107                 return ret;
1108         }
1109
1110         write_lock_bh(&dev_base_lock);
1111         hlist_del_rcu(&dev->name_hlist);
1112         write_unlock_bh(&dev_base_lock);
1113
1114         synchronize_rcu();
1115
1116         write_lock_bh(&dev_base_lock);
1117         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1118         write_unlock_bh(&dev_base_lock);
1119
1120         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1121         ret = notifier_to_errno(ret);
1122
1123         if (ret) {
1124                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1125                 if (err >= 0) {
1126                         err = ret;
1127                         memcpy(dev->name, oldname, IFNAMSIZ);
1128                         goto rollback;
1129                 } else {
1130                         pr_err("%s: name change rollback failed: %d\n",
1131                                dev->name, ret);
1132                 }
1133         }
1134
1135         return err;
1136 }
1137
1138 /**
1139  *      dev_set_alias - change ifalias of a device
1140  *      @dev: device
1141  *      @alias: name up to IFALIASZ
1142  *      @len: limit of bytes to copy from info
1143  *
1144  *      Set ifalias for a device,
1145  */
1146 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1147 {
1148         char *new_ifalias;
1149
1150         ASSERT_RTNL();
1151
1152         if (len >= IFALIASZ)
1153                 return -EINVAL;
1154
1155         if (!len) {
1156                 kfree(dev->ifalias);
1157                 dev->ifalias = NULL;
1158                 return 0;
1159         }
1160
1161         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1162         if (!new_ifalias)
1163                 return -ENOMEM;
1164         dev->ifalias = new_ifalias;
1165
1166         strlcpy(dev->ifalias, alias, len+1);
1167         return len;
1168 }
1169
1170
1171 /**
1172  *      netdev_features_change - device changes features
1173  *      @dev: device to cause notification
1174  *
1175  *      Called to indicate a device has changed features.
1176  */
1177 void netdev_features_change(struct net_device *dev)
1178 {
1179         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1180 }
1181 EXPORT_SYMBOL(netdev_features_change);
1182
1183 /**
1184  *      netdev_state_change - device changes state
1185  *      @dev: device to cause notification
1186  *
1187  *      Called to indicate a device has changed state. This function calls
1188  *      the notifier chains for netdev_chain and sends a NEWLINK message
1189  *      to the routing socket.
1190  */
1191 void netdev_state_change(struct net_device *dev)
1192 {
1193         if (dev->flags & IFF_UP) {
1194                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1195                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1196         }
1197 }
1198 EXPORT_SYMBOL(netdev_state_change);
1199
1200 /**
1201  *      netdev_notify_peers - notify network peers about existence of @dev
1202  *      @dev: network device
1203  *
1204  * Generate traffic such that interested network peers are aware of
1205  * @dev, such as by generating a gratuitous ARP. This may be used when
1206  * a device wants to inform the rest of the network about some sort of
1207  * reconfiguration such as a failover event or virtual machine
1208  * migration.
1209  */
1210 void netdev_notify_peers(struct net_device *dev)
1211 {
1212         rtnl_lock();
1213         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1214         rtnl_unlock();
1215 }
1216 EXPORT_SYMBOL(netdev_notify_peers);
1217
1218 /**
1219  *      dev_load        - load a network module
1220  *      @net: the applicable net namespace
1221  *      @name: name of interface
1222  *
1223  *      If a network interface is not present and the process has suitable
1224  *      privileges this function loads the module. If module loading is not
1225  *      available in this kernel then it becomes a nop.
1226  */
1227
1228 void dev_load(struct net *net, const char *name)
1229 {
1230         struct net_device *dev;
1231         int no_module;
1232
1233         rcu_read_lock();
1234         dev = dev_get_by_name_rcu(net, name);
1235         rcu_read_unlock();
1236
1237         no_module = !dev;
1238         if (no_module && capable(CAP_NET_ADMIN))
1239                 no_module = request_module("netdev-%s", name);
1240         if (no_module && capable(CAP_SYS_MODULE)) {
1241                 if (!request_module("%s", name))
1242                         pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1243                                 name);
1244         }
1245 }
1246 EXPORT_SYMBOL(dev_load);
1247
1248 static int __dev_open(struct net_device *dev)
1249 {
1250         const struct net_device_ops *ops = dev->netdev_ops;
1251         int ret;
1252
1253         ASSERT_RTNL();
1254
1255         if (!netif_device_present(dev))
1256                 return -ENODEV;
1257
1258         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1259         ret = notifier_to_errno(ret);
1260         if (ret)
1261                 return ret;
1262
1263         set_bit(__LINK_STATE_START, &dev->state);
1264
1265         if (ops->ndo_validate_addr)
1266                 ret = ops->ndo_validate_addr(dev);
1267
1268         if (!ret && ops->ndo_open)
1269                 ret = ops->ndo_open(dev);
1270
1271         if (ret)
1272                 clear_bit(__LINK_STATE_START, &dev->state);
1273         else {
1274                 dev->flags |= IFF_UP;
1275                 net_dmaengine_get();
1276                 dev_set_rx_mode(dev);
1277                 dev_activate(dev);
1278                 add_device_randomness(dev->dev_addr, dev->addr_len);
1279         }
1280
1281         return ret;
1282 }
1283
1284 /**
1285  *      dev_open        - prepare an interface for use.
1286  *      @dev:   device to open
1287  *
1288  *      Takes a device from down to up state. The device's private open
1289  *      function is invoked and then the multicast lists are loaded. Finally
1290  *      the device is moved into the up state and a %NETDEV_UP message is
1291  *      sent to the netdev notifier chain.
1292  *
1293  *      Calling this function on an active interface is a nop. On a failure
1294  *      a negative errno code is returned.
1295  */
1296 int dev_open(struct net_device *dev)
1297 {
1298         int ret;
1299
1300         if (dev->flags & IFF_UP)
1301                 return 0;
1302
1303         ret = __dev_open(dev);
1304         if (ret < 0)
1305                 return ret;
1306
1307         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1308         call_netdevice_notifiers(NETDEV_UP, dev);
1309
1310         return ret;
1311 }
1312 EXPORT_SYMBOL(dev_open);
1313
1314 static int __dev_close_many(struct list_head *head)
1315 {
1316         struct net_device *dev;
1317
1318         ASSERT_RTNL();
1319         might_sleep();
1320
1321         list_for_each_entry(dev, head, unreg_list) {
1322                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1323
1324                 clear_bit(__LINK_STATE_START, &dev->state);
1325
1326                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1327                  * can be even on different cpu. So just clear netif_running().
1328                  *
1329                  * dev->stop() will invoke napi_disable() on all of it's
1330                  * napi_struct instances on this device.
1331                  */
1332                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1333         }
1334
1335         dev_deactivate_many(head);
1336
1337         list_for_each_entry(dev, head, unreg_list) {
1338                 const struct net_device_ops *ops = dev->netdev_ops;
1339
1340                 /*
1341                  *      Call the device specific close. This cannot fail.
1342                  *      Only if device is UP
1343                  *
1344                  *      We allow it to be called even after a DETACH hot-plug
1345                  *      event.
1346                  */
1347                 if (ops->ndo_stop)
1348                         ops->ndo_stop(dev);
1349
1350                 dev->flags &= ~IFF_UP;
1351                 net_dmaengine_put();
1352         }
1353
1354         return 0;
1355 }
1356
1357 static int __dev_close(struct net_device *dev)
1358 {
1359         int retval;
1360         LIST_HEAD(single);
1361
1362         list_add(&dev->unreg_list, &single);
1363         retval = __dev_close_many(&single);
1364         list_del(&single);
1365         return retval;
1366 }
1367
1368 static int dev_close_many(struct list_head *head)
1369 {
1370         struct net_device *dev, *tmp;
1371         LIST_HEAD(tmp_list);
1372
1373         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1374                 if (!(dev->flags & IFF_UP))
1375                         list_move(&dev->unreg_list, &tmp_list);
1376
1377         __dev_close_many(head);
1378
1379         list_for_each_entry(dev, head, unreg_list) {
1380                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1381                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1382         }
1383
1384         /* rollback_registered_many needs the complete original list */
1385         list_splice(&tmp_list, head);
1386         return 0;
1387 }
1388
1389 /**
1390  *      dev_close - shutdown an interface.
1391  *      @dev: device to shutdown
1392  *
1393  *      This function moves an active device into down state. A
1394  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1395  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1396  *      chain.
1397  */
1398 int dev_close(struct net_device *dev)
1399 {
1400         if (dev->flags & IFF_UP) {
1401                 LIST_HEAD(single);
1402
1403                 list_add(&dev->unreg_list, &single);
1404                 dev_close_many(&single);
1405                 list_del(&single);
1406         }
1407         return 0;
1408 }
1409 EXPORT_SYMBOL(dev_close);
1410
1411
1412 /**
1413  *      dev_disable_lro - disable Large Receive Offload on a device
1414  *      @dev: device
1415  *
1416  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1417  *      called under RTNL.  This is needed if received packets may be
1418  *      forwarded to another interface.
1419  */
1420 void dev_disable_lro(struct net_device *dev)
1421 {
1422         /*
1423          * If we're trying to disable lro on a vlan device
1424          * use the underlying physical device instead
1425          */
1426         if (is_vlan_dev(dev))
1427                 dev = vlan_dev_real_dev(dev);
1428
1429         dev->wanted_features &= ~NETIF_F_LRO;
1430         netdev_update_features(dev);
1431
1432         if (unlikely(dev->features & NETIF_F_LRO))
1433                 netdev_WARN(dev, "failed to disable LRO!\n");
1434 }
1435 EXPORT_SYMBOL(dev_disable_lro);
1436
1437
/*
 * Starts out as 1. While it is nonzero, register_netdevice_notifier()
 * registers the notifier but does not replay REGISTER/UP events to it.
 */
static int dev_boot_phase = 1;
1439
1440 /**
1441  *      register_netdevice_notifier - register a network notifier block
1442  *      @nb: notifier
1443  *
1444  *      Register a notifier to be called when network device events occur.
1445  *      The notifier passed is linked into the kernel structures and must
1446  *      not be reused until it has been unregistered. A negative errno code
1447  *      is returned on a failure.
1448  *
1449  *      When registered all registration and up events are replayed
1450  *      to the new notifier to allow device to have a race free
1451  *      view of the network device list.
1452  */
1453
1454 int register_netdevice_notifier(struct notifier_block *nb)
1455 {
1456         struct net_device *dev;
1457         struct net_device *last;
1458         struct net *net;
1459         int err;
1460
1461         rtnl_lock();
1462         err = raw_notifier_chain_register(&netdev_chain, nb);
1463         if (err)
1464                 goto unlock;
1465         if (dev_boot_phase)
1466                 goto unlock;
1467         for_each_net(net) {
1468                 for_each_netdev(net, dev) {
1469                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1470                         err = notifier_to_errno(err);
1471                         if (err)
1472                                 goto rollback;
1473
1474                         if (!(dev->flags & IFF_UP))
1475                                 continue;
1476
1477                         nb->notifier_call(nb, NETDEV_UP, dev);
1478                 }
1479         }
1480
1481 unlock:
1482         rtnl_unlock();
1483         return err;
1484
1485 rollback:
1486         last = dev;
1487         for_each_net(net) {
1488                 for_each_netdev(net, dev) {
1489                         if (dev == last)
1490                                 goto outroll;
1491
1492                         if (dev->flags & IFF_UP) {
1493                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1494                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1495                         }
1496                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1497                 }
1498         }
1499
1500 outroll:
1501         raw_notifier_chain_unregister(&netdev_chain, nb);
1502         goto unlock;
1503 }
1504 EXPORT_SYMBOL(register_netdevice_notifier);
1505
1506 /**
1507  *      unregister_netdevice_notifier - unregister a network notifier block
1508  *      @nb: notifier
1509  *
1510  *      Unregister a notifier previously registered by
1511  *      register_netdevice_notifier(). The notifier is unlinked into the
1512  *      kernel structures and may then be reused. A negative errno code
1513  *      is returned on a failure.
1514  *
1515  *      After unregistering unregister and down device events are synthesized
1516  *      for all devices on the device list to the removed notifier to remove
1517  *      the need for special case cleanup code.
1518  */
1519
1520 int unregister_netdevice_notifier(struct notifier_block *nb)
1521 {
1522         struct net_device *dev;
1523         struct net *net;
1524         int err;
1525
1526         rtnl_lock();
1527         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1528         if (err)
1529                 goto unlock;
1530
1531         for_each_net(net) {
1532                 for_each_netdev(net, dev) {
1533                         if (dev->flags & IFF_UP) {
1534                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1535                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1536                         }
1537                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1538                 }
1539         }
1540 unlock:
1541         rtnl_unlock();
1542         return err;
1543 }
1544 EXPORT_SYMBOL(unregister_netdevice_notifier);
1545
1546 /**
1547  *      call_netdevice_notifiers - call all network notifier blocks
1548  *      @val: value passed unmodified to notifier function
1549  *      @dev: net_device pointer passed unmodified to notifier function
1550  *
1551  *      Call all network notifier blocks.  Parameters and return value
1552  *      are as for raw_notifier_call_chain().
1553  */
1554
1555 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1556 {
1557         ASSERT_RTNL();
1558         return raw_notifier_call_chain(&netdev_chain, val, dev);
1559 }
1560 EXPORT_SYMBOL(call_netdevice_notifiers);
1561
/*
 * Static key tested before an skb is timestamped. It is incremented by
 * net_enable_timestamp() and decremented by net_disable_timestamp().
 */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 *
 * This counter holds the number of such deferred decrements; it is
 * consumed by the next net_enable_timestamp() call.
 */
static atomic_t netstamp_needed_deferred;
#endif
1570
1571 void net_enable_timestamp(void)
1572 {
1573 #ifdef HAVE_JUMP_LABEL
1574         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1575
1576         if (deferred) {
1577                 while (--deferred)
1578                         static_key_slow_dec(&netstamp_needed);
1579                 return;
1580         }
1581 #endif
1582         WARN_ON(in_interrupt());
1583         static_key_slow_inc(&netstamp_needed);
1584 }
1585 EXPORT_SYMBOL(net_enable_timestamp);
1586
1587 void net_disable_timestamp(void)
1588 {
1589 #ifdef HAVE_JUMP_LABEL
1590         if (in_interrupt()) {
1591                 atomic_inc(&netstamp_needed_deferred);
1592                 return;
1593         }
1594 #endif
1595         static_key_slow_dec(&netstamp_needed);
1596 }
1597 EXPORT_SYMBOL(net_disable_timestamp);
1598
1599 static inline void net_timestamp_set(struct sk_buff *skb)
1600 {
1601         skb->tstamp.tv64 = 0;
1602         if (static_key_false(&netstamp_needed))
1603                 __net_timestamp(skb);
1604 }
1605
/*
 * net_timestamp_check - conditionally timestamp an skb
 * @COND: extra condition that must hold for the skb to be stamped
 * @SKB:  the skb to stamp
 *
 * If the netstamp_needed static key test passes, @COND is true and
 * @SKB carries no timestamp yet (tstamp.tv64 == 0), the skb is stamped
 * with __net_timestamp(). @SKB is evaluated more than once.
 *
 * Wrapped in do { } while (0) so the macro behaves as one statement
 * (no dangling-else surprises) and must be followed by a semicolon.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1611
1612 static int net_hwtstamp_validate(struct ifreq *ifr)
1613 {
1614         struct hwtstamp_config cfg;
1615         enum hwtstamp_tx_types tx_type;
1616         enum hwtstamp_rx_filters rx_filter;
1617         int tx_type_valid = 0;
1618         int rx_filter_valid = 0;
1619
1620         if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1621                 return -EFAULT;
1622
1623         if (cfg.flags) /* reserved for future extensions */
1624                 return -EINVAL;
1625
1626         tx_type = cfg.tx_type;
1627         rx_filter = cfg.rx_filter;
1628
1629         switch (tx_type) {
1630         case HWTSTAMP_TX_OFF:
1631         case HWTSTAMP_TX_ON:
1632         case HWTSTAMP_TX_ONESTEP_SYNC:
1633                 tx_type_valid = 1;
1634                 break;
1635         }
1636
1637         switch (rx_filter) {
1638         case HWTSTAMP_FILTER_NONE:
1639         case HWTSTAMP_FILTER_ALL:
1640         case HWTSTAMP_FILTER_SOME:
1641         case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1642         case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1643         case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1644         case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1645         case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1646         case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1647         case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1648         case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1649         case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1650         case HWTSTAMP_FILTER_PTP_V2_EVENT:
1651         case HWTSTAMP_FILTER_PTP_V2_SYNC:
1652         case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1653                 rx_filter_valid = 1;
1654                 break;
1655         }
1656
1657         if (!tx_type_valid || !rx_filter_valid)
1658                 return -ERANGE;
1659
1660         return 0;
1661 }
1662
1663 static inline bool is_skb_forwardable(struct net_device *dev,
1664                                       struct sk_buff *skb)
1665 {
1666         unsigned int len;
1667
1668         if (!(dev->flags & IFF_UP))
1669                 return false;
1670
1671         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1672         if (skb->len <= len)
1673                 return true;
1674
1675         /* if TSO is enabled, we don't care about the length as the packet
1676          * could be forwarded without being segmented before
1677          */
1678         if (skb_is_gso(skb))
1679                 return true;
1680
1681         return false;
1682 }
1683
1684 /**
1685  * dev_forward_skb - loopback an skb to another netif
1686  *
1687  * @dev: destination network device
1688  * @skb: buffer to forward
1689  *
1690  * return values:
1691  *      NET_RX_SUCCESS  (no congestion)
1692  *      NET_RX_DROP     (packet was dropped, but freed)
1693  *
1694  * dev_forward_skb can be used for injecting an skb from the
1695  * start_xmit function of one device into the receive queue
1696  * of another device.
1697  *
1698  * The receiving device may be in another namespace, so
1699  * we have to clear all information in the skb that could
1700  * impact namespace isolation.
1701  */
1702 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1703 {
1704         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1705                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1706                         atomic_long_inc(&dev->rx_dropped);
1707                         kfree_skb(skb);
1708                         return NET_RX_DROP;
1709                 }
1710         }
1711
1712         skb_orphan(skb);
1713         nf_reset(skb);
1714
1715         if (unlikely(!is_skb_forwardable(dev, skb))) {
1716                 atomic_long_inc(&dev->rx_dropped);
1717                 kfree_skb(skb);
1718                 return NET_RX_DROP;
1719         }
1720         skb->skb_iif = 0;
1721         skb->dev = dev;
1722         skb_dst_drop(skb);
1723         skb->tstamp.tv64 = 0;
1724         skb->pkt_type = PACKET_HOST;
1725         skb->protocol = eth_type_trans(skb, dev);
1726         skb->mark = 0;
1727         secpath_reset(skb);
1728         nf_reset(skb);
1729         return netif_rx(skb);
1730 }
1731 EXPORT_SYMBOL_GPL(dev_forward_skb);
1732
1733 static inline int deliver_skb(struct sk_buff *skb,
1734                               struct packet_type *pt_prev,
1735                               struct net_device *orig_dev)
1736 {
1737         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1738                 return -ENOMEM;
1739         atomic_inc(&skb->users);
1740         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1741 }
1742
1743 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1744 {
1745         if (!ptype->af_packet_priv || !skb->sk)
1746                 return false;
1747
1748         if (ptype->id_match)
1749                 return ptype->id_match(ptype, skb->sk);
1750         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1751                 return true;
1752
1753         return false;
1754 }
1755
1756 /*
1757  *      Support routine. Sends outgoing frames to any network
1758  *      taps currently in use.
1759  */
1760
1761 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1762 {
1763         struct packet_type *ptype;
1764         struct sk_buff *skb2 = NULL;
1765         struct packet_type *pt_prev = NULL;
1766
1767         rcu_read_lock();
1768         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1769                 /* Never send packets back to the socket
1770                  * they originated from - MvS (miquels@drinkel.ow.org)
1771                  */
1772                 if ((ptype->dev == dev || !ptype->dev) &&
1773                     (!skb_loop_sk(ptype, skb))) {
1774                         if (pt_prev) {
1775                                 deliver_skb(skb2, pt_prev, skb->dev);
1776                                 pt_prev = ptype;
1777                                 continue;
1778                         }
1779
1780                         skb2 = skb_clone(skb, GFP_ATOMIC);
1781                         if (!skb2)
1782                                 break;
1783
1784                         net_timestamp_set(skb2);
1785
1786                         /* skb->nh should be correctly
1787                            set by sender, so that the second statement is
1788                            just protection against buggy protocols.
1789                          */
1790                         skb_reset_mac_header(skb2);
1791
1792                         if (skb_network_header(skb2) < skb2->data ||
1793                             skb2->network_header > skb2->tail) {
1794                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1795                                                      ntohs(skb2->protocol),
1796                                                      dev->name);
1797                                 skb_reset_network_header(skb2);
1798                         }
1799
1800                         skb2->transport_header = skb2->network_header;
1801                         skb2->pkt_type = PACKET_OUTGOING;
1802                         pt_prev = ptype;
1803                 }
1804         }
1805         if (pt_prev)
1806                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1807         rcu_read_unlock();
1808 }
1809
1810 /**
1811  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1812  * @dev: Network device
1813  * @txq: number of queues available
1814  *
1815  * If real_num_tx_queues is changed the tc mappings may no longer be
1816  * valid. To resolve this verify the tc mapping remains valid and if
1817  * not NULL the mapping. With no priorities mapping to this
1818  * offset/count pair it will no longer be used. In the worst case TC0
1819  * is invalid nothing can be done so disable priority mappings. If is
1820  * expected that drivers will fix this mapping if they can before
1821  * calling netif_set_real_num_tx_queues.
1822  */
1823 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1824 {
1825         int i;
1826         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1827
1828         /* If TC0 is invalidated disable TC mapping */
1829         if (tc->offset + tc->count > txq) {
1830                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1831                 dev->num_tc = 0;
1832                 return;
1833         }
1834
1835         /* Invalidated prio to tc mappings set to TC0 */
1836         for (i = 1; i < TC_BITMASK + 1; i++) {
1837                 int q = netdev_get_prio_tc_map(dev, i);
1838
1839                 tc = &dev->tc_to_txq[q];
1840                 if (tc->offset + tc->count > txq) {
1841                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1842                                 i, q);
1843                         netdev_set_prio_tc_map(dev, i, 0);
1844                 }
1845         }
1846 }
1847
1848 /*
1849  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1850  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1851  */
1852 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1853 {
1854         int rc;
1855
1856         if (txq < 1 || txq > dev->num_tx_queues)
1857                 return -EINVAL;
1858
1859         if (dev->reg_state == NETREG_REGISTERED ||
1860             dev->reg_state == NETREG_UNREGISTERING) {
1861                 ASSERT_RTNL();
1862
1863                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1864                                                   txq);
1865                 if (rc)
1866                         return rc;
1867
1868                 if (dev->num_tc)
1869                         netif_setup_tc(dev, txq);
1870
1871                 if (txq < dev->real_num_tx_queues)
1872                         qdisc_reset_all_tx_gt(dev, txq);
1873         }
1874
1875         dev->real_num_tx_queues = txq;
1876         return 0;
1877 }
1878 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1879
#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	Must be called either with the rtnl_lock held or before the net
 *	device is registered.  Returns 0 on success or a negative error
 *	code: -EINVAL when @rxq is zero or exceeds dev->num_rx_queues, or
 *	the error from updating the queue kobjects of a registered device.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (!rxq || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		int err;

		ASSERT_RTNL();

		err = net_rx_queue_update_kobjects(dev,
						   dev->real_num_rx_queues,
						   rxq);
		if (err)
			return err;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1912
1913 /**
1914  * netif_get_num_default_rss_queues - default number of RSS queues
1915  *
1916  * This routine should set an upper limit on the number of RSS queues
1917  * used by default by multiqueue devices.
1918  */
1919 int netif_get_num_default_rss_queues(void)
1920 {
1921         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
1922 }
1923 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
1924
1925 static inline void __netif_reschedule(struct Qdisc *q)
1926 {
1927         struct softnet_data *sd;
1928         unsigned long flags;
1929
1930         local_irq_save(flags);
1931         sd = &__get_cpu_var(softnet_data);
1932         q->next_sched = NULL;
1933         *sd->output_queue_tailp = q;
1934         sd->output_queue_tailp = &q->next_sched;
1935         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1936         local_irq_restore(flags);
1937 }
1938
1939 void __netif_schedule(struct Qdisc *q)
1940 {
1941         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1942                 __netif_reschedule(q);
1943 }
1944 EXPORT_SYMBOL(__netif_schedule);
1945
1946 void dev_kfree_skb_irq(struct sk_buff *skb)
1947 {
1948         if (atomic_dec_and_test(&skb->users)) {
1949                 struct softnet_data *sd;
1950                 unsigned long flags;
1951
1952                 local_irq_save(flags);
1953                 sd = &__get_cpu_var(softnet_data);
1954                 skb->next = sd->completion_queue;
1955                 sd->completion_queue = skb;
1956                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1957                 local_irq_restore(flags);
1958         }
1959 }
1960 EXPORT_SYMBOL(dev_kfree_skb_irq);
1961
/*
 * Free @skb from any context: dev_kfree_skb() is used directly unless we
 * are in hard-irq context or interrupts are disabled, in which case the
 * work goes through dev_kfree_skb_irq().
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1970
1971
1972 /**
1973  * netif_device_detach - mark device as removed
1974  * @dev: network device
1975  *
1976  * Mark device as removed from system and therefore no longer available.
1977  */
1978 void netif_device_detach(struct net_device *dev)
1979 {
1980         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1981             netif_running(dev)) {
1982                 netif_tx_stop_all_queues(dev);
1983         }
1984 }
1985 EXPORT_SYMBOL(netif_device_detach);
1986
1987 /**
1988  * netif_device_attach - mark device as attached
1989  * @dev: network device
1990  *
1991  * Mark device as attached from system and restart if needed.
1992  */
1993 void netif_device_attach(struct net_device *dev)
1994 {
1995         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1996             netif_running(dev)) {
1997                 netif_tx_wake_all_queues(dev);
1998                 __netdev_watchdog_up(dev);
1999         }
2000 }
2001 EXPORT_SYMBOL(netif_device_attach);
2002
2003 static void skb_warn_bad_offload(const struct sk_buff *skb)
2004 {
2005         static const netdev_features_t null_features = 0;
2006         struct net_device *dev = skb->dev;
2007         const char *driver = "";
2008
2009         if (dev && dev->dev.parent)
2010                 driver = dev_driver_string(dev->dev.parent);
2011
2012         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2013              "gso_type=%d ip_summed=%d\n",
2014              driver, dev ? &dev->features : &null_features,
2015              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2016              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2017              skb_shinfo(skb)->gso_type, skb->ip_summed);
2018 }
2019
2020 /*
2021  * Invalidate hardware checksum when packet is to be mangled, and
2022  * complete checksum manually on outgoing path.
2023  */
2024 int skb_checksum_help(struct sk_buff *skb)
2025 {
2026         __wsum csum;
2027         int ret = 0, offset;
2028
2029         if (skb->ip_summed == CHECKSUM_COMPLETE)
2030                 goto out_set_summed;
2031
2032         if (unlikely(skb_shinfo(skb)->gso_size)) {
2033                 skb_warn_bad_offload(skb);
2034                 return -EINVAL;
2035         }
2036
2037         offset = skb_checksum_start_offset(skb);
2038         BUG_ON(offset >= skb_headlen(skb));
2039         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2040
2041         offset += skb->csum_offset;
2042         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2043
2044         if (skb_cloned(skb) &&
2045             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2046                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2047                 if (ret)
2048                         goto out;
2049         }
2050
2051         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2052 out_set_summed:
2053         skb->ip_summed = CHECKSUM_NONE;
2054 out:
2055         return ret;
2056 }
2057 EXPORT_SYMBOL(skb_checksum_help);
2058
2059 /**
2060  *      skb_gso_segment - Perform segmentation on skb.
2061  *      @skb: buffer to segment
2062  *      @features: features for the output path (see dev->features)
2063  *
2064  *      This function segments the given skb and returns a list of segments.
2065  *
2066  *      It may return NULL if the skb requires no segmentation.  This is
2067  *      only possible when GSO is used for verifying header integrity.
      *
      *      Errors are reported as ERR_PTR() values: -EINVAL when a stacked
      *      VLAN header cannot be pulled, -EPROTONOSUPPORT when no offload
      *      handler with a gso_segment callback matches the protocol, or
      *      whatever pskb_expand_head(), gso_send_check() or gso_segment()
      *      report.
2068  */
2069 struct sk_buff *skb_gso_segment(struct sk_buff *skb,
2070         netdev_features_t features)
2071 {
2072         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2073         struct packet_offload *ptype;
2074         __be16 type = skb->protocol;
2075         int vlan_depth = ETH_HLEN;
2076         int err;
2077
             /* Step over any stacked 802.1Q headers to find the encapsulated
              * protocol; each header must be pullable before it is read.
              */
2078         while (type == htons(ETH_P_8021Q)) {
2079                 struct vlan_hdr *vh;
2080
2081                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2082                         return ERR_PTR(-EINVAL);
2083
2084                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2085                 type = vh->h_vlan_encapsulated_proto;
2086                 vlan_depth += VLAN_HLEN;
2087         }
2088
             /* Mark the MAC header at the current data pointer and pull
              * mac_len bytes so that data points at the network header.
              */
2089         skb_reset_mac_header(skb);
2090         skb->mac_len = skb->network_header - skb->mac_header;
2091         __skb_pull(skb, skb->mac_len);
2092
             /* A GSO skb that is not CHECKSUM_PARTIAL is unexpected: warn,
              * and expand the head if the header is cloned
              * (NOTE(review): presumably so the callbacks below may write
              * to the headers - confirm).
              */
2093         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2094                 skb_warn_bad_offload(skb);
2095
2096                 if (skb_header_cloned(skb) &&
2097                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2098                         return ERR_PTR(err);
2099         }
2100
             /* Only the first offload handler that matches the protocol and
              * provides gso_segment is used.
              */
2101         rcu_read_lock();
2102         list_for_each_entry_rcu(ptype, &offload_base, list) {
2103                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2104                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
                                     /* On a gso_send_check() error, or when
                                      * skb_gso_ok() accepts the skb as is,
                                      * stop here; with err == 0 the
                                      * ERR_PTR(err) result is NULL.
                                      */
2105                                 err = ptype->callbacks.gso_send_check(skb);
2106                                 segs = ERR_PTR(err);
2107                                 if (err || skb_gso_ok(skb, features))
2108                                         break;
                                     /* Move data back to the network header
                                      * before segmenting.
                                      */
2109                                 __skb_push(skb, (skb->data -
2110                                                  skb_network_header(skb)));
2111                         }
2112                         segs = ptype->callbacks.gso_segment(skb, features);
2113                         break;
2114                 }
2115         }
2116         rcu_read_unlock();
2117
             /* Undo the earlier pull: data points at the MAC header again. */
2118         __skb_push(skb, skb->data - skb_mac_header(skb));
2119
2120         return segs;
2121 }
2122 EXPORT_SYMBOL(skb_gso_segment);
2123
/* Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>") and dump the stack, rate limited.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2135
/* Actually, we should eliminate this check as soon as we know that:
 * 1. An IOMMU is present and allows mapping all of memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when some page fragment of @skb cannot be reached by @dev
 * (a highmem page without NETIF_F_HIGHDMA, or a page beyond the parent
 * device's DMA mask), 0 otherwise.  Always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *parent;
	skb_frag_t *frag;
	int idx;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
			frag = &skb_shinfo(skb)->frags[idx];
			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		parent = dev->dev.parent;
		if (!parent)
			return 0;

		for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
			dma_addr_t addr;

			frag = &skb_shinfo(skb)->frags[idx];
			addr = page_to_phys(skb_frag_page(frag));

			/* No mask at all, or the page ends above the mask. */
			if (!parent->dma_mask ||
			    addr + PAGE_SIZE - 1 > *parent->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2168
2169 struct dev_gso_cb {
2170         void (*destructor)(struct sk_buff *skb);
2171 };
2172
2173 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2174
2175 static void dev_gso_skb_destructor(struct sk_buff *skb)
2176 {
2177         struct dev_gso_cb *cb;
2178
2179         do {
2180                 struct sk_buff *nskb = skb->next;
2181
2182                 skb->next = nskb->next;
2183                 nskb->next = NULL;
2184                 kfree_skb(nskb);
2185         } while (skb->next);
2186
2187         cb = DEV_GSO_CB(skb);
2188         if (cb->destructor)
2189                 cb->destructor(skb);
2190 }
2191
2192 /**
2193  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2194  *      @skb: buffer to segment
2195  *      @features: device features as applicable to this skb
2196  *
2197  *      This function segments the given skb and stores the list of segments
2198  *      in skb->next.
2199  */
2200 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2201 {
2202         struct sk_buff *segs;
2203
2204         segs = skb_gso_segment(skb, features);
2205
2206         /* Verifying header integrity only. */
2207         if (!segs)
2208                 return 0;
2209
2210         if (IS_ERR(segs))
2211                 return PTR_ERR(segs);
2212
2213         skb->next = segs;
2214         DEV_GSO_CB(skb)->destructor = skb->destructor;
2215         skb->destructor = dev_gso_skb_destructor;
2216
2217         return 0;
2218 }
2219
2220 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2221 {
2222         return ((features & NETIF_F_GEN_CSUM) ||
2223                 ((features & NETIF_F_V4_CSUM) &&
2224                  protocol == htons(ETH_P_IP)) ||
2225                 ((features & NETIF_F_V6_CSUM) &&
2226                  protocol == htons(ETH_P_IPV6)) ||
2227                 ((features & NETIF_F_FCOE_CRC) &&
2228                  protocol == htons(ETH_P_FCOE)));
2229 }
2230
2231 static netdev_features_t harmonize_features(struct sk_buff *skb,
2232         __be16 protocol, netdev_features_t features)
2233 {
2234         if (skb->ip_summed != CHECKSUM_NONE &&
2235             !can_checksum_protocol(features, protocol)) {
2236                 features &= ~NETIF_F_ALL_CSUM;
2237                 features &= ~NETIF_F_SG;
2238         } else if (illegal_highdma(skb->dev, skb)) {
2239                 features &= ~NETIF_F_SG;
2240         }
2241
2242         return features;
2243 }
2244
2245 netdev_features_t netif_skb_features(struct sk_buff *skb)
2246 {
2247         __be16 protocol = skb->protocol;
2248         netdev_features_t features = skb->dev->features;
2249
2250         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2251                 features &= ~NETIF_F_GSO_MASK;
2252
2253         if (protocol == htons(ETH_P_8021Q)) {
2254                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2255                 protocol = veh->h_vlan_encapsulated_proto;
2256         } else if (!vlan_tx_tag_present(skb)) {
2257                 return harmonize_features(skb, protocol, features);
2258         }
2259
2260         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2261
2262         if (protocol != htons(ETH_P_8021Q)) {
2263                 return harmonize_features(skb, protocol, features);
2264         } else {
2265                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2266                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2267                 return harmonize_features(skb, protocol, features);
2268         }
2269 }
2270 EXPORT_SYMBOL(netif_skb_features);
2271
2272 /*
2273  * Returns true if either:
2274  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2275  *      2. skb is fragmented and the device does not support SG.
2276  */
2277 static inline int skb_needs_linearize(struct sk_buff *skb,
2278                                       int features)
2279 {
2280         return skb_is_nonlinear(skb) &&
2281                         ((skb_has_frag_list(skb) &&
2282                                 !(features & NETIF_F_FRAGLIST)) ||
2283                         (skb_shinfo(skb)->nr_frags &&
2284                                 !(features & NETIF_F_SG)));
2285 }
2286
/*
 * dev_hard_start_xmit - hand an skb, or its pending list of GSO segments,
 * to the driver's ndo_start_xmit.
 * @skb: buffer to transmit; if skb->next is already set on entry the skb
 *       is treated as carrying a segment list and only the loop at the
 *       gso label runs
 * @dev: device to transmit on
 * @txq: TX queue, used for txq_trans_update() and the stopped check
 *
 * Returns the driver's return code.  NETDEV_TX_BUSY is returned when the
 * queue is stopped while segments remain.  When the skb is dropped here
 * (GSO, linearize or checksum failure) or __vlan_put_tag() returns NULL,
 * rc still holds its initial NETDEV_TX_OK.
 */
2287 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2288                         struct netdev_queue *txq)
2289 {
2290         const struct net_device_ops *ops = dev->netdev_ops;
2291         int rc = NETDEV_TX_OK;
2292         unsigned int skb_len;
2293
2294         if (likely(!skb->next)) {
2295                 netdev_features_t features;
2296
2297                 /*
2298                  * If device doesn't need skb->dst, release it right now while
2299                  * its hot in this cpu cache
2300                  */
2301                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2302                         skb_dst_drop(skb);
2303
2304                 features = netif_skb_features(skb);
2305
                     /* No hardware VLAN tag insertion for this skb: put the
                      * tag into the packet data and clear vlan_tci.
                      */
2306                 if (vlan_tx_tag_present(skb) &&
2307                     !(features & NETIF_F_HW_VLAN_TX)) {
2308                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2309                         if (unlikely(!skb))
2310                                 goto out;
2311
2312                         skb->vlan_tci = 0;
2313                 }
2314
2315                 if (netif_needs_gso(skb, features)) {
2316                         if (unlikely(dev_gso_segment(skb, features)))
2317                                 goto out_kfree_skb;
                             /* Segments were attached: send them one by one.
                              * Otherwise fall through and send skb as is.
                              */
2318                         if (skb->next)
2319                                 goto gso;
2320                 } else {
2321                         if (skb_needs_linearize(skb, features) &&
2322                             __skb_linearize(skb))
2323                                 goto out_kfree_skb;
2324
2325                         /* If packet is not checksummed and device does not
2326                          * support checksumming for this protocol, complete
2327                          * checksumming here.
2328                          */
2329                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2330                                 skb_set_transport_header(skb,
2331                                         skb_checksum_start_offset(skb));
2332                                 if (!(features & NETIF_F_ALL_CSUM) &&
2333                                      skb_checksum_help(skb))
2334                                         goto out_kfree_skb;
2335                         }
2336                 }
2337
                     /* Show the frame to taps, if any are registered. */
2338                 if (!list_empty(&ptype_all))
2339                         dev_queue_xmit_nit(skb, dev);
2340
                     /* The length is sampled before the driver call; the
                      * tracepoint is given that saved value.
                      */
2341                 skb_len = skb->len;
2342                 rc = ops->ndo_start_xmit(skb, dev);
2343                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2344                 if (rc == NETDEV_TX_OK)
2345                         txq_trans_update(txq);
2346                 return rc;
2347         }
2348
2349 gso:
2350         do {
                     /* Detach the first pending segment from the list. */
2351                 struct sk_buff *nskb = skb->next;
2352
2353                 skb->next = nskb->next;
2354                 nskb->next = NULL;
2355
2356                 /*
2357                  * If device doesn't need nskb->dst, release it right now while
2358                  * its hot in this cpu cache
2359                  */
2360                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2361                         skb_dst_drop(nskb);
2362
2363                 if (!list_empty(&ptype_all))
2364                         dev_queue_xmit_nit(nskb, dev);
2365
2366                 skb_len = nskb->len;
2367                 rc = ops->ndo_start_xmit(nskb, dev);
2368                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2369                 if (unlikely(rc != NETDEV_TX_OK)) {
                             /* A code with bits outside NETDEV_TX_MASK ends
                              * the transmission; any other code puts the
                              * segment back at the head of the list and is
                              * returned to the caller.
                              */
2370                         if (rc & ~NETDEV_TX_MASK)
2371                                 goto out_kfree_gso_skb;
2372                         nskb->next = skb->next;
2373                         skb->next = nskb;
2374                         return rc;
2375                 }
2376                 txq_trans_update(txq);
2377                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2378                         return NETDEV_TX_BUSY;
2379         } while (skb->next);
2380
2381 out_kfree_gso_skb:
             /* With no segment left, restore the saved destructor so that
              * kfree_skb() does not run dev_gso_skb_destructor(), which
              * dereferences skb->next unconditionally.
              */
2382         if (likely(skb->next == NULL))
2383                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2384 out_kfree_skb:
2385         kfree_skb(skb);
2386 out:
2387         return rc;
2388 }
2389
2390 static u32 hashrnd __read_mostly;
2391
2392 /*
2393  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2394  * to be used as a distribution range.
2395  */
2396 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2397                   unsigned int num_tx_queues)
2398 {
2399         u32 hash;
2400         u16 qoffset = 0;
2401         u16 qcount = num_tx_queues;
2402
2403         if (skb_rx_queue_recorded(skb)) {
2404                 hash = skb_get_rx_queue(skb);
2405                 while (unlikely(hash >= num_tx_queues))
2406                         hash -= num_tx_queues;
2407                 return hash;
2408         }
2409
2410         if (dev->num_tc) {
2411                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2412                 qoffset = dev->tc_to_txq[tc].offset;
2413                 qcount = dev->tc_to_txq[tc].count;
2414         }
2415
2416         if (skb->sk && skb->sk->sk_hash)
2417                 hash = skb->sk->sk_hash;
2418         else
2419                 hash = (__force u16) skb->protocol;
2420         hash = jhash_1word(hash, hashrnd);
2421
2422         return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2423 }
2424 EXPORT_SYMBOL(__skb_tx_hash);
2425
2426 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2427 {
2428         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2429                 net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2430                                      dev->name, queue_index,
2431                                      dev->real_num_tx_queues);
2432                 return 0;
2433         }
2434         return queue_index;
2435 }
2436
/*
 * Look up a TX queue for @skb in the XPS map of the current CPU.
 *
 * Returns the queue index, or -1 when XPS is not built in, the device
 * or CPU has no map, or the mapped queue is at or beyond
 * real_num_tx_queues.  A map with several queues is indexed by a hash
 * of the socket hash, or of protocol ^ rxhash without one.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map;
	int queue_index = -1;

	rcu_read_lock();

	dev_maps = rcu_dereference(dev->xps_maps);
	if (!dev_maps)
		goto unlock;

	map = rcu_dereference(dev_maps->cpu_map[raw_smp_processor_id()]);
	if (!map)
		goto unlock;

	if (map->len == 1) {
		queue_index = map->queues[0];
	} else {
		u32 hash;

		if (skb->sk && skb->sk->sk_hash)
			hash = skb->sk->sk_hash;
		else
			hash = (__force u16) skb->protocol ^
			    skb->rxhash;
		hash = jhash_1word(hash, hashrnd);
		queue_index = map->queues[((u64)hash * map->len) >> 32];
	}

	if (unlikely(queue_index >= dev->real_num_tx_queues))
		queue_index = -1;

unlock:
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}
2474
2475 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
2476                                     struct sk_buff *skb)
2477 {
2478         int queue_index;
2479         const struct net_device_ops *ops = dev->netdev_ops;
2480
2481         if (dev->real_num_tx_queues == 1)
2482                 queue_index = 0;
2483         else if (ops->ndo_select_queue) {
2484                 queue_index = ops->ndo_select_queue(dev, skb);
2485                 queue_index = dev_cap_txqueue(dev, queue_index);
2486         } else {
2487                 struct sock *sk = skb->sk;
2488                 queue_index = sk_tx_queue_get(sk);
2489
2490                 if (queue_index < 0 || skb->ooo_okay ||
2491                     queue_index >= dev->real_num_tx_queues) {
2492                         int old_index = queue_index;
2493
2494                         queue_index = get_xps_queue(dev, skb);
2495                         if (queue_index < 0)
2496                                 queue_index = skb_tx_hash(dev, skb);
2497
2498                         if (queue_index != old_index && sk) {
2499                                 struct dst_entry *dst =
2500                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2501
2502                                 if (dst && skb_dst(skb) == dst)
2503                                         sk_tx_queue_set(sk, queue_index);
2504                         }
2505                 }
2506         }
2507
2508         skb_set_queue_mapping(skb, queue_index);
2509         return netdev_get_tx_queue(dev, queue_index);
2510 }
2511
/*
 * Queue @skb on qdisc @q, or transmit it directly when the qdisc allows
 * bypassing, is empty and is not already running.
 *
 * Returns NET_XMIT_DROP when the qdisc is deactivated (the skb is freed),
 * NET_XMIT_SUCCESS on the direct-transmit path, and otherwise the result
 * of q->enqueue() masked with NET_XMIT_MASK.
 */
2512 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2513                                  struct net_device *dev,
2514                                  struct netdev_queue *txq)
2515 {
2516         spinlock_t *root_lock = qdisc_lock(q);
2517         bool contended;
2518         int rc;
2519
2520         qdisc_skb_cb(skb)->pkt_len = skb->len;
2521         qdisc_calculate_pkt_len(skb, q);
2522         /*
2523          * Heuristic to force contended enqueues to serialize on a
2524          * separate lock before trying to get qdisc main lock.
2525          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2526          * and dequeue packets faster.
2527          */
2528         contended = qdisc_is_running(q);
2529         if (unlikely(contended))
2530                 spin_lock(&q->busylock);
2531
2532         spin_lock(root_lock);
2533         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2534                 kfree_skb(skb);
2535                 rc = NET_XMIT_DROP;
2536         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2537                    qdisc_run_begin(q)) {
2538                 /*
2539                  * This is a work-conserving queue; there are no old skbs
2540                  * waiting to be sent out; and the qdisc is not running -
2541                  * xmit the skb directly.
2542                  */
2543                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2544                         skb_dst_force(skb);
2545
2546                 qdisc_bstats_update(q, skb);
2547
                     /* A non-zero result from sch_direct_xmit() leads to a
                      * full __qdisc_run(); zero just ends the run.  busylock
                      * is released before __qdisc_run(), and contended is
                      * cleared so it is not unlocked a second time below.
                      */
2548                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2549                         if (unlikely(contended)) {
2550                                 spin_unlock(&q->busylock);
2551                                 contended = false;
2552                         }
2553                         __qdisc_run(q);
2554                 } else
2555                         qdisc_run_end(q);
2556
2557                 rc = NET_XMIT_SUCCESS;
2558         } else {
                     /* Regular path: enqueue, then run the qdisc if nobody
                      * else is already running it.
                      */
2559                 skb_dst_force(skb);
2560                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2561                 if (qdisc_run_begin(q)) {
2562                         if (unlikely(contended)) {
2563                                 spin_unlock(&q->busylock);
2564                                 contended = false;
2565                         }
2566                         __qdisc_run(q);
2567                 }
2568         }
2569         spin_unlock(root_lock);
             /* Still holding busylock only if it was not dropped above. */
2570         if (unlikely(contended))
2571                 spin_unlock(&q->busylock);
2572         return rc;
2573 }
2574
2575 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
     /*
      * skb_update_prio - fill in skb->priority from the device's netprio map.
      *
      * Only acts when the skb has no priority yet, is attached to a socket and
      * the device has a priomap.  The socket's sk_cgrp_prioidx selects the map
      * entry; an index at or beyond priomap_len leaves the priority untouched.
      * Without CONFIG_NETPRIO_CGROUP this expands to nothing.
      */
2576 static void skb_update_prio(struct sk_buff *skb)
2577 {
2578         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2579
2580         if (!skb->priority && skb->sk && map) {
2581                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2582
2583                 if (prioidx < map->priomap_len)
2584                         skb->priority = map->priomap[prioidx];
2585         }
2586 }
2587 #else
2588 #define skb_update_prio(skb)
2589 #endif
2590
2591 static DEFINE_PER_CPU(int, xmit_recursion);
     /*
      * xmit_recursion is incremented around dev_hard_start_xmit() on the
      * no-queue path of dev_queue_xmit(); once it exceeds RECURSION_LIMIT on
      * this CPU, further nested transmits are refused.
      */
2592 #define RECURSION_LIMIT 10
2593
2594 /**
2595  *      dev_loopback_xmit - loop back @skb
2596  *      @skb: buffer to transmit
      *
      *      Resets the MAC header, pulls the buffer by its network offset,
      *      marks it PACKET_LOOPBACK with CHECKSUM_UNNECESSARY and feeds it
      *      back into the receive path through netif_rx_ni().  Warns if the
      *      buffer carries no dst.
      *
      *      Always returns 0; the netif_rx_ni() result is ignored.
2597  */
2598 int dev_loopback_xmit(struct sk_buff *skb)
2599 {
2600         skb_reset_mac_header(skb);
2601         __skb_pull(skb, skb_network_offset(skb));
2602         skb->pkt_type = PACKET_LOOPBACK;
2603         skb->ip_summed = CHECKSUM_UNNECESSARY;
2604         WARN_ON(!skb_dst(skb));
2605         skb_dst_force(skb);
2606         netif_rx_ni(skb);
2607         return 0;
2608 }
2609 EXPORT_SYMBOL(dev_loopback_xmit);
2610
2611 /**
2612  *      dev_queue_xmit - transmit a buffer
2613  *      @skb: buffer to transmit
2614  *
2615  *      Queue a buffer for transmission to a network device. The caller must
2616  *      have set the device and priority and built the buffer before calling
2617  *      this function. The function can be called from an interrupt.
2618  *
2619  *      A negative errno code is returned on a failure. A success does not
2620  *      guarantee the frame will be transmitted as it may be dropped due
2621  *      to congestion or traffic shaping.
2622  *
2623  * -----------------------------------------------------------------------------------
2624  *      I notice this method can also return errors from the queue disciplines,
2625  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2626  *      be positive.
2627  *
2628  *      Regardless of the return value, the skb is consumed, so it is currently
2629  *      difficult to retry a send to this method.  (You can bump the ref count
2630  *      before sending to hold a reference for retry if you are careful.)
2631  *
2632  *      When calling this method, interrupts MUST be enabled.  This is because
2633  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2634  *          --BLG
      *
      *      Two paths exist: if the selected queue's qdisc has an enqueue
      *      operation the skb goes through __dev_xmit_skb(); otherwise the
      *      device has no queue and the skb is handed straight to
      *      dev_hard_start_xmit() under the tx lock.  If the no-queue path
      *      cannot complete the transmit, the skb is freed and -ENETDOWN is
      *      returned.
2635  */
2636 int dev_queue_xmit(struct sk_buff *skb)
2637 {
2638         struct net_device *dev = skb->dev;
2639         struct netdev_queue *txq;
2640         struct Qdisc *q;
             /* Initial value only; every return path below assigns rc again. */
2641         int rc = -ENOMEM;
2642
2643         /* Disable soft irqs for various locks below. Also
2644          * stops preemption for RCU.
2645          */
2646         rcu_read_lock_bh();
2647
2648         skb_update_prio(skb);
2649
2650         txq = netdev_pick_tx(dev, skb);
2651         q = rcu_dereference_bh(txq->qdisc);
2652
2653 #ifdef CONFIG_NET_CLS_ACT
2654         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2655 #endif
2656         trace_net_dev_queue(skb);
2657         if (q->enqueue) {
2658                 rc = __dev_xmit_skb(skb, q, dev, txq);
2659                 goto out;
2660         }
2661
2662         /* The device has no queue. Common case for software devices:
2663            loopback, all the sorts of tunnels...
2664
2665            Really, it is unlikely that netif_tx_lock protection is necessary
2666            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2667            counters.)
2668            However, it is possible, that they rely on protection
2669            made by us here.
2670
2671            Check this and shot the lock. It is not prone from deadlocks.
2672            Either shot noqueue qdisc, it is even simpler 8)
2673          */
2674         if (dev->flags & IFF_UP) {
2675                 int cpu = smp_processor_id(); /* ok because BHs are off */
2676
                     /* If this CPU already owns the tx lock we are being
                      * re-entered from the device's own transmit path.
                      */
2677                 if (txq->xmit_lock_owner != cpu) {
2678
2679                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2680                                 goto recursion_alert;
2681
2682                         HARD_TX_LOCK(dev, txq, cpu);
2683
2684                         if (!netif_xmit_stopped(txq)) {
2685                                 __this_cpu_inc(xmit_recursion);
2686                                 rc = dev_hard_start_xmit(skb, dev, txq);
2687                                 __this_cpu_dec(xmit_recursion);
2688                                 if (dev_xmit_complete(rc)) {
2689                                         HARD_TX_UNLOCK(dev, txq);
2690                                         goto out;
2691                                 }
2692                         }
2693                         HARD_TX_UNLOCK(dev, txq);
2694                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2695                                              dev->name);
2696                 } else {
2697                         /* Recursion is detected! It is possible,
2698                          * unfortunately
2699                          */
2700 recursion_alert:
2701                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2702                                              dev->name);
2703                 }
2704         }
2705
             /* Reached when the device is down, the queue is stopped, the
              * transmit did not complete, or recursion was detected: the skb
              * is freed here and -ENETDOWN reported.
              */
2706         rc = -ENETDOWN;
2707         rcu_read_unlock_bh();
2708
2709         kfree_skb(skb);
2710         return rc;
2711 out:
             /* The skb was handed off above; it must not be freed here. */
2712         rcu_read_unlock_bh();
2713         return rc;
2714 }
2715 EXPORT_SYMBOL(dev_queue_xmit);
2716
2717
2718 /*=======================================================================
2719                         Receiver routines
2720   =======================================================================*/
2721
     /*
      * Limit on the per-CPU input_pkt_queue length: enqueue_to_backlog() drops
      * a packet once the queue already holds more than this many skbs.
      */
2722 int netdev_max_backlog __read_mostly = 1000;
2723 EXPORT_SYMBOL(netdev_max_backlog);
2724
     /*
      * Selects where net_timestamp_check() is enabled: netif_rx() passes this
      * value, __netif_receive_skb() passes its negation.
      */
2725 int netdev_tstamp_prequeue __read_mostly = 1;
     /* NOTE(review): netdev_budget is not used in this section of the file. */
2726 int netdev_budget __read_mostly = 300;
2727 int weight_p __read_mostly = 64;            /* old backlog weight */
2728
2729 /* Called with irq disabled */
     /*
      * Appends @napi to the tail of @sd's poll_list and raises NET_RX_SOFTIRQ
      * using the irqs-off variant of raise_softirq.
      */
2730 static inline void ____napi_schedule(struct softnet_data *sd,
2731                                      struct napi_struct *napi)
2732 {
2733         list_add_tail(&napi->poll_list, &sd->poll_list);
2734         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2735 }
2736
2737 /*
2738  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2739  * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2740  * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2741  * if hash is a canonical 4-tuple hash over transport ports.
      *
      * If skb_flow_dissect() fails the function returns without writing
      * skb->rxhash or skb->l4_rxhash, so they keep whatever value they had.
2742  */
2743 void __skb_get_rxhash(struct sk_buff *skb)
2744 {
2745         struct flow_keys keys;
2746         u32 hash;
2747
2748         if (!skb_flow_dissect(skb, &keys))
2749                 return;
2750
2751         if (keys.ports)
2752                 skb->l4_rxhash = 1;
2753
2754         /* get a consistent hash (same value on both flow directions) */
             /* Order the pair so that dst >= src, breaking ties on the ports. */
2755         if (((__force u32)keys.dst < (__force u32)keys.src) ||
2756             (((__force u32)keys.dst == (__force u32)keys.src) &&
2757              ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) {
2758                 swap(keys.dst, keys.src);
2759                 swap(keys.port16[0], keys.port16[1]);
2760         }
2761
2762         hash = jhash_3words((__force u32)keys.dst,
2763                             (__force u32)keys.src,
2764                             (__force u32)keys.ports, hashrnd);
             /* Zero is reserved for "no hash", so map it to 1. */
2765         if (!hash)
2766                 hash = 1;
2767
2768         skb->rxhash = hash;
2769 }
2770 EXPORT_SYMBOL(__skb_get_rxhash);
2771
2772 #ifdef CONFIG_RPS
2773
2774 /* One global table that all flow-based protocols share. */
2775 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2776 EXPORT_SYMBOL(rps_sock_flow_table);
2777
     /* Static key tested by netif_rx() to decide whether to take the RPS path. */
2778 struct static_key rps_needed __read_mostly;
2779
     /*
      * set_rps_cpu - record @next_cpu as the CPU for a flow.
      *
      * When @next_cpu is a real CPU (not RPS_NO_CPU), last_qtail of the flow
      * entry is set to that CPU's current input_queue_head.  With
      * CONFIG_RFS_ACCEL the driver is first asked, via ndo_rx_flow_steer(), to
      * steer the flow to the rx queue that the device's cpu_rmap maps to
      * @next_cpu; on success the entry in that queue's flow table replaces
      * @rflow and receives the returned filter id.
      *
      * Returns the flow entry that was finally updated.
      */
2780 static struct rps_dev_flow *
2781 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2782             struct rps_dev_flow *rflow, u16 next_cpu)
2783 {
2784         if (next_cpu != RPS_NO_CPU) {
2785 #ifdef CONFIG_RFS_ACCEL
2786                 struct netdev_rx_queue *rxqueue;
2787                 struct rps_dev_flow_table *flow_table;
2788                 struct rps_dev_flow *old_rflow;
2789                 u32 flow_id;
2790                 u16 rxq_index;
2791                 int rc;
2792
2793                 /* Should we steer this flow to a different hardware queue? */
2794                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2795                     !(dev->features & NETIF_F_NTUPLE))
2796                         goto out;
2797                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
                     /* Already arriving on the wanted queue: nothing to steer. */
2798                 if (rxq_index == skb_get_rx_queue(skb))
2799                         goto out;
2800
2801                 rxqueue = dev->_rx + rxq_index;
2802                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2803                 if (!flow_table)
2804                         goto out;
2805                 flow_id = skb->rxhash & flow_table->mask;
2806                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2807                                                         rxq_index, flow_id);
2808                 if (rc < 0)
2809                         goto out;
                     /* Switch to the entry in the new queue's table; a
                      * non-negative rc is stored as its filter id.
                      */
2810                 old_rflow = rflow;
2811                 rflow = &flow_table->flows[flow_id];
2812                 rflow->filter = rc;
                     /* Clear the old entry's filter id if it is the same one. */
2813                 if (old_rflow->filter == rflow->filter)
2814                         old_rflow->filter = RPS_NO_FILTER;
2815         out:
2816 #endif
2817                 rflow->last_qtail =
2818                         per_cpu(softnet_data, next_cpu).input_queue_head;
2819         }
2820
2821         rflow->cpu = next_cpu;
2822         return rflow;
2823 }
2824
2825 /*
2826  * get_rps_cpu is called from netif_receive_skb and returns the target
2827  * CPU from the RPS map of the receiving queue for a given skb.
2828  * rcu_read_lock must be held on entry.
      *
      * Returns -1 when no CPU is selected.  *rflowp is written only when the
      * CPU comes from the flow-table (RFS) lookup; on every other path the
      * caller's value is left unchanged.
2829  */
2830 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2831                        struct rps_dev_flow **rflowp)
2832 {
2833         struct netdev_rx_queue *rxqueue;
2834         struct rps_map *map;
2835         struct rps_dev_flow_table *flow_table;
2836         struct rps_sock_flow_table *sock_flow_table;
2837         int cpu = -1;
2838         u16 tcpu;
2839
             /* Find the rx queue: the recorded one, else queue 0. */
2840         if (skb_rx_queue_recorded(skb)) {
2841                 u16 index = skb_get_rx_queue(skb);
2842                 if (unlikely(index >= dev->real_num_rx_queues)) {
2843                         WARN_ONCE(dev->real_num_rx_queues > 1,
2844                                   "%s received packet on queue %u, but number "
2845                                   "of RX queues is %u\n",
2846                                   dev->name, index, dev->real_num_rx_queues);
2847                         goto done;
2848                 }
2849                 rxqueue = dev->_rx + index;
2850         } else
2851                 rxqueue = dev->_rx;
2852
2853         map = rcu_dereference(rxqueue->rps_map);
2854         if (map) {
                     /* Single-CPU map and no flow table: no hash is needed. */
2855                 if (map->len == 1 &&
2856                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
2857                         tcpu = map->cpus[0];
2858                         if (cpu_online(tcpu))
2859                                 cpu = tcpu;
2860                         goto done;
2861                 }
2862         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
                     /* Neither a map nor a flow table on this queue. */
2863                 goto done;
2864         }
2865
2866         skb_reset_network_header(skb);
2867         if (!skb_get_rxhash(skb))
2868                 goto done;
2869
2870         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2871         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2872         if (flow_table && sock_flow_table) {
2873                 u16 next_cpu;
2874                 struct rps_dev_flow *rflow;
2875
2876                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2877                 tcpu = rflow->cpu;
2878
2879                 next_cpu = sock_flow_table->ents[skb->rxhash &
2880                     sock_flow_table->mask];
2881
2882                 /*
2883                  * If the desired CPU (where last recvmsg was done) is
2884                  * different from current CPU (one in the rx-queue flow
2885                  * table entry), switch if one of the following holds:
2886                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2887                  *   - Current CPU is offline.
2888                  *   - The current CPU's queue tail has advanced beyond the
2889                  *     last packet that was enqueued using this table entry.
2890                  *     This guarantees that all previous packets for the flow
2891                  *     have been dequeued, thus preserving in order delivery.
2892                  */
2893                 if (unlikely(tcpu != next_cpu) &&
2894                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2895                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2896                       rflow->last_qtail)) >= 0)) {
2897                         tcpu = next_cpu;
2898                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2899                 }
2900
2901                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2902                         *rflowp = rflow;
2903                         cpu = tcpu;
2904                         goto done;
2905                 }
2906         }
2907
             /* Fall back to the plain RPS map, indexed by the scaled hash. */
2908         if (map) {
                     /* Assuming rxhash is 32 bits wide, this scales it into
                      * the range [0, map->len).
                      */
2909                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2910
2911                 if (cpu_online(tcpu)) {
2912                         cpu = tcpu;
2913                         goto done;
2914                 }
2915         }
2916
2917 done:
2918         return cpu;
2919 }
2920
2921 #ifdef CONFIG_RFS_ACCEL
2922
2923 /**
2924  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2925  * @dev: Device on which the filter was set
2926  * @rxq_index: RX queue index
2927  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2928  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2929  *
2930  * Drivers that implement ndo_rx_flow_steer() should periodically call
2931  * this function for each installed filter and remove the filters for
2932  * which it returns %true.
      *
      * Returns %false only when the queue has a flow table, @flow_id is within
      * its mask, the entry still carries @filter_id and a CPU, and that CPU's
      * input_queue_head is fewer than 10 * mask packets past the entry's
      * last_qtail.  In every other case it returns %true.
2933  */
2934 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2935                          u32 flow_id, u16 filter_id)
2936 {
2937         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2938         struct rps_dev_flow_table *flow_table;
2939         struct rps_dev_flow *rflow;
2940         bool expire = true;
2941         int cpu;
2942
2943         rcu_read_lock();
2944         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2945         if (flow_table && flow_id <= flow_table->mask) {
2946                 rflow = &flow_table->flows[flow_id];
                     /* Read the cpu field once; it is tested and then used
                      * as a per-cpu index below.
                      */
2947                 cpu = ACCESS_ONCE(rflow->cpu);
2948                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2949                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2950                            rflow->last_qtail) <
2951                      (int)(10 * flow_table->mask)))
2952                         expire = false;
2953         }
2954         rcu_read_unlock();
2955         return expire;
2956 }
2957 EXPORT_SYMBOL(rps_may_expire_flow);
2958
2959 #endif /* CONFIG_RFS_ACCEL */
2960
2961 /* Called from hardirq (IPI) context */
     /*
      * @data is the softnet_data whose backlog NAPI should be scheduled; the
      * received_rps counter of that structure is bumped as well.
      */
2962 static void rps_trigger_softirq(void *data)
2963 {
2964         struct softnet_data *sd = data;
2965
2966         ____napi_schedule(sd, &sd->backlog);
2967         sd->received_rps++;
2968 }
2969
2970 #endif /* CONFIG_RPS */
2971
2972 /*
2973  * Check whether this softnet_data structure belongs to another CPU.
2974  * If yes, push it onto this CPU's rps_ipi_list, raise NET_RX_SOFTIRQ
      * locally and return 1.
2975  * If no (or if CONFIG_RPS is off), return 0.
2976  */
2977 static int rps_ipi_queued(struct softnet_data *sd)
2978 {
2979 #ifdef CONFIG_RPS
2980         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2981
2982         if (sd != mysd) {
                     /* Link sd at the head of the local IPI list. */
2983                 sd->rps_ipi_next = mysd->rps_ipi_list;
2984                 mysd->rps_ipi_list = sd;
2985
2986                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2987                 return 1;
2988         }
2989 #endif /* CONFIG_RPS */
2990         return 0;
2991 }
2992
2993 /*
2994  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2995  * queue (may be a remote CPU queue).
      *
      * Runs with local interrupts disabled and the rps lock of the target
      * softnet_data held.  Returns NET_RX_SUCCESS once the skb is queued, or
      * NET_RX_DROP after freeing it when the queue already holds more than
      * netdev_max_backlog packets.  On success the queue tail is reported
      * through @qtail by input_queue_tail_incr_save().
2996  */
2997 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2998                               unsigned int *qtail)
2999 {
3000         struct softnet_data *sd;
3001         unsigned long flags;
3002
3003         sd = &per_cpu(softnet_data, cpu);
3004
3005         local_irq_save(flags);
3006
3007         rps_lock(sd);
3008         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
                     /* A non-empty queue needs no scheduling here: just
                      * append.
                      */
3009                 if (skb_queue_len(&sd->input_pkt_queue)) {
3010 enqueue:
3011                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3012                         input_queue_tail_incr_save(sd, qtail);
3013                         rps_unlock(sd);
3014                         local_irq_restore(flags);
3015                         return NET_RX_SUCCESS;
3016                 }
3017
3018                 /* Schedule NAPI for backlog device
3019                  * We can use non atomic operation since we own the queue lock
3020                  */
3021                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
                             /* Remote sd: queued for an IPI by the helper;
                              * local sd: schedule the backlog directly.
                              */
3022                         if (!rps_ipi_queued(sd))
3023                                 ____napi_schedule(sd, &sd->backlog);
3024                 }
3025                 goto enqueue;
3026         }
3027
             /* Queue over the limit: account the drop and free the skb. */
3028         sd->dropped++;
3029         rps_unlock(sd);
3030
3031         local_irq_restore(flags);
3032
3033         atomic_long_inc(&skb->dev->rx_dropped);
3034         kfree_skb(skb);
3035         return NET_RX_DROP;
3036 }
3037
3038 /**
3039  *      netif_rx        -       post buffer to the network code
3040  *      @skb: buffer to post
3041  *
3042  *      This function receives a packet from a device driver and queues it for
3043  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3044  *      may be dropped during processing for congestion control or by the
3045  *      protocol layers.
      *
      *      With RPS active (rps_needed static key) the target CPU comes from
      *      get_rps_cpu(), falling back to the current CPU when that returns a
      *      negative value; otherwise the packet is queued on the current CPU.
3046  *
3047  *      return values:
3048  *      NET_RX_SUCCESS  (no congestion)
3049  *      NET_RX_DROP     (packet was dropped)
3050  *
3051  */
3052
3053 int netif_rx(struct sk_buff *skb)
3054 {
3055         int ret;
3056
3057         /* if netpoll wants it, pretend we never saw it */
3058         if (netpoll_rx(skb))
3059                 return NET_RX_DROP;
3060
3061         net_timestamp_check(netdev_tstamp_prequeue, skb);
3062
3063         trace_netif_rx(skb);
3064 #ifdef CONFIG_RPS
3065         if (static_key_false(&rps_needed)) {
                     /* voidflow is a scratch entry: it receives last_qtail
                      * whenever get_rps_cpu() leaves rflow untouched.
                      */
3066                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3067                 int cpu;
3068
3069                 preempt_disable();
3070                 rcu_read_lock();
3071
3072                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3073                 if (cpu < 0)
3074                         cpu = smp_processor_id();
3075
3076                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3077
3078                 rcu_read_unlock();
3079                 preempt_enable();
3080         } else
3081 #endif
3082         {
                     /* qtail is required by the callee but not used here. */
3083                 unsigned int qtail;
3084                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3085                 put_cpu();
3086         }
3087         return ret;
3088 }
3089 EXPORT_SYMBOL(netif_rx);
3090
     /*
      * netif_rx_ni - netif_rx() followed by immediate softirq processing.
      *
      * With preemption disabled, queues @skb through netif_rx() and then runs
      * do_softirq() if any softirq is pending on this CPU.  Returns the
      * netif_rx() result.
      * NOTE(review): presumably meant for callers outside interrupt context.
      */
3091 int netif_rx_ni(struct sk_buff *skb)
3092 {
3093         int err;
3094
3095         preempt_disable();
3096         err = netif_rx(skb);
3097         if (local_softirq_pending())
3098                 do_softirq();
3099         preempt_enable();
3100
3101         return err;
3102 }
3103 EXPORT_SYMBOL(netif_rx_ni);
3104
     /*
      * net_tx_action - softirq handler for the transmit side.
      *
      * Does two things for the current CPU's softnet_data:
      *   1. detaches completion_queue and frees every skb on it;
      *   2. detaches output_queue and, for each qdisc on it, either runs the
      *      qdisc under its root lock or, if the lock is busy, reschedules it
      *      (unless the qdisc has been deactivated).
      * Both lists are detached with local interrupts disabled.
      */
3105 static void net_tx_action(struct softirq_action *h)
3106 {
3107         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3108
3109         if (sd->completion_queue) {
3110                 struct sk_buff *clist;
3111
3112                 local_irq_disable();
3113                 clist = sd->completion_queue;
3114                 sd->completion_queue = NULL;
3115                 local_irq_enable();
3116
3117                 while (clist) {
3118                         struct sk_buff *skb = clist;
3119                         clist = clist->next;
3120
                             /* Warn if the skb still has users. */
3121                         WARN_ON(atomic_read(&skb->users));
3122                         trace_kfree_skb(skb, net_tx_action);
3123                         __kfree_skb(skb);
3124                 }
3125         }
3126
3127         if (sd->output_queue) {
3128                 struct Qdisc *head;
3129
3130                 local_irq_disable();
3131                 head = sd->output_queue;
3132                 sd->output_queue = NULL;
3133                 sd->output_queue_tailp = &sd->output_queue;
3134                 local_irq_enable();
3135
3136                 while (head) {
3137                         struct Qdisc *q = head;
3138                         spinlock_t *root_lock;
3139
                             /* Advance first: q may be put on a queue again
                              * below.
                              */
3140                         head = head->next_sched;
3141
3142                         root_lock = qdisc_lock(q);
3143                         if (spin_trylock(root_lock)) {
3144                                 smp_mb__before_clear_bit();
3145                                 clear_bit(__QDISC_STATE_SCHED,
3146                                           &q->state);
3147                                 qdisc_run(q);
3148                                 spin_unlock(root_lock);
3149                         } else {
                                     /* Lock busy: retry later, or just clear
                                      * SCHED for a deactivated qdisc.
                                      */
3150                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3151                                               &q->state)) {
3152                                         __netif_reschedule(q);
3153                                 } else {
3154                                         smp_mb__before_clear_bit();
3155                                         clear_bit(__QDISC_STATE_SCHED,
3156                                                   &q->state);
3157                                 }
3158                         }
3159                 }
3160         }
3161 }
3162
3163 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3164     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3165 /* This hook is defined here for ATM LANE */
     /*
      * Function pointer taking a device and an address.  It is only built
      * when both bridging and ATM LANE are configured (built-in or modular),
      * has no initializer here and is never assigned in this section.
      */
3166 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3167                              unsigned char *addr) __read_mostly;
3168 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3169 #endif
3170
3171 #ifdef CONFIG_NET_CLS_ACT
3172 /* TODO: Maybe we should just force sch_ingress to be compiled in
3173  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3174  * a compare and 2 stores extra right now if we dont have it on
3175  * but have CONFIG_NET_CLS_ACT
3176  * NOTE: This doesn't stop any functionality; if you dont have
3177  * the ingress scheduler, you just can't add policies on ingress.
3178  *
3179  */
     /*
      * ing_filter - run @skb through the qdisc attached to ingress queue @rxq.
      *
      * Bumps the redirect TTL kept in skb->tc_verd and returns TC_ACT_SHOT
      * when it was already above MAX_RED_LOOP.  Otherwise marks the skb as
      * AT_INGRESS and, unless the qdisc is noop_qdisc or deactivated, returns
      * the qdisc_enqueue_root() result obtained under the qdisc lock.
      * Returns TC_ACT_OK when no enqueue takes place.
      */
3180 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3181 {
3182         struct net_device *dev = skb->dev;
3183         u32 ttl = G_TC_RTTL(skb->tc_verd);
3184         int result = TC_ACT_OK;
3185         struct Qdisc *q;
3186
             /* The test uses the old ttl value; the ++ takes effect after. */
3187         if (unlikely(MAX_RED_LOOP < ttl++)) {
3188                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3189                                      skb->skb_iif, dev->ifindex);
3190                 return TC_ACT_SHOT;
3191         }
3192
3193         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3194         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3195
3196         q = rxq->qdisc;
3197         if (q != &noop_qdisc) {
3198                 spin_lock(qdisc_lock(q));
3199                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3200                         result = qdisc_enqueue_root(skb, q);
3201                 spin_unlock(qdisc_lock(q));
3202         }
3203
3204         return result;
3205 }
3206
     /*
      * handle_ing - apply ingress classification to @skb.
      *
      * If the device has an ingress queue whose qdisc is not noop_qdisc, any
      * pending *pt_prev is delivered first (its result is stored in *ret and
      * *pt_prev is cleared), then ing_filter() runs.  A TC_ACT_SHOT or
      * TC_ACT_STOLEN verdict frees the skb and returns NULL.  Otherwise
      * skb->tc_verd is reset to 0 and the skb is returned.
      */
3207 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3208                                          struct packet_type **pt_prev,
3209                                          int *ret, struct net_device *orig_dev)
3210 {
3211         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3212
3213         if (!rxq || rxq->qdisc == &noop_qdisc)
3214                 goto out;
3215
3216         if (*pt_prev) {
3217                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3218                 *pt_prev = NULL;
3219         }
3220
             /* Any other verdict falls through to "out". */
3221         switch (ing_filter(skb, rxq)) {
3222         case TC_ACT_SHOT:
3223         case TC_ACT_STOLEN:
3224                 kfree_skb(skb);
3225                 return NULL;
3226         }
3227
3228 out:
3229         skb->tc_verd = 0;
3230         return skb;
3231 }
3232 #endif
3233
3234 /**
3235  *      netdev_rx_handler_register - register receive handler
3236  *      @dev: device to register a handler for
3237  *      @rx_handler: receive handler to register
3238  *      @rx_handler_data: data pointer that is used by rx handler
3239  *
3240  *      Register a receive handler for a device. This handler will then be
3241  *      called from __netif_receive_skb. A negative errno code is returned
3242  *      on a failure.
      *
      *      Only one handler can be registered per device: -EBUSY is returned
      *      if the device already has one.  Returns 0 on success.
3243  *
3244  *      The caller must hold the rtnl_mutex.
3245  *
3246  *      For a general description of rx_handler, see enum rx_handler_result.
3247  */
3248 int netdev_rx_handler_register(struct net_device *dev,
3249                                rx_handler_func_t *rx_handler,
3250                                void *rx_handler_data)
3251 {
3252         ASSERT_RTNL();
3253
3254         if (dev->rx_handler)
3255                 return -EBUSY;
3256
             /* The data pointer is published before the handler itself,
              * presumably so that a reader which sees the handler also sees
              * its data.
              */
3257         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3258         rcu_assign_pointer(dev->rx_handler, rx_handler);
3259
3260         return 0;
3261 }
3262 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3263
3264 /**
3265  *      netdev_rx_handler_unregister - unregister receive handler
3266  *      @dev: device to unregister a handler from
3267  *
3268  *      Unregister a receive handler from a device.
      *
      *      The handler pointer is cleared first and the data pointer only
      *      after synchronize_net() has returned, so the function may block
      *      and must not be called from atomic context.
3269  *
3270  *      The caller must hold the rtnl_mutex.
3271  */
3272 void netdev_rx_handler_unregister(struct net_device *dev)
3273 {
3274
3275         ASSERT_RTNL();
3276         RCU_INIT_POINTER(dev->rx_handler, NULL);
             /* A reader that saw a non-NULL rx_handler inside its
              * rcu_read_lock() section must also see a non-NULL
              * rx_handler_data, so wait for such readers to finish before
              * clearing the data pointer.
              */
             synchronize_net();
3277         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3278 }
3279 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3280
3281 /*
3282  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3283  * the special handling of PFMEMALLOC skbs.
      *
      * Returns true when skb->protocol is ARP, IPv4, IPv6 or 802.1Q, false
      * for every other protocol.
3284  */
3285 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3286 {
3287         switch (skb->protocol) {
3288         case __constant_htons(ETH_P_ARP):
3289         case __constant_htons(ETH_P_IP):
3290         case __constant_htons(ETH_P_IPV6):
3291         case __constant_htons(ETH_P_8021Q):
3292                 return true;
3293         default:
3294                 return false;
3295         }
3296 }
3297
3298 static int __netif_receive_skb(struct sk_buff *skb)
3299 {
3300         struct packet_type *ptype, *pt_prev;
3301         rx_handler_func_t *rx_handler;
3302         struct net_device *orig_dev;
3303         struct net_device *null_or_dev;
3304         bool deliver_exact = false;
3305         int ret = NET_RX_DROP;
3306         __be16 type;
3307         unsigned long pflags = current->flags;
3308
3309         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3310
3311         trace_netif_receive_skb(skb);
3312
3313         /*
3314          * PFMEMALLOC skbs are special, they should
3315          * - be delivered to SOCK_MEMALLOC sockets only
3316          * - stay away from userspace
3317          * - have bounded memory usage
3318          *
3319          * Use PF_MEMALLOC as this saves us from propagating the allocation
3320          * context down to all allocation sites.
3321          */
3322         if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3323                 current->flags |= PF_MEMALLOC;
3324
3325         /* if we've gotten here through NAPI, check netpoll */
3326         if (netpoll_receive_skb(skb))
3327                 goto out;
3328
3329         orig_dev = skb->dev;
3330
3331         skb_reset_network_header(skb);
3332         skb_reset_transport_header(skb);
3333         skb_reset_mac_len(skb);
3334
3335         pt_prev = NULL;
3336
3337         rcu_read_lock();
3338
3339 another_round:
3340         skb->skb_iif = skb->dev->ifindex;
3341
3342         __this_cpu_inc(softnet_data.processed);
3343
3344         if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3345                 skb = vlan_untag(skb);
3346                 if (unlikely(!skb))
3347                         goto unlock;
3348         }
3349
3350 #ifdef CONFIG_NET_CLS_ACT
3351         if (skb->tc_verd & TC_NCLS) {
3352                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3353                 goto ncls;
3354         }
3355 #endif
3356
3357         if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3358                 goto skip_taps;
3359
3360         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3361                 if (!ptype->dev || ptype->dev == skb->dev) {
3362                         if (pt_prev)
3363                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3364                         pt_prev = ptype;
3365                 }
3366         }
3367
3368 skip_taps:
3369 #ifdef CONFIG_NET_CLS_ACT
3370         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3371         if (!skb)
3372                 goto unlock;
3373 ncls:
3374 #endif
3375
3376         if (sk_memalloc_socks() && skb_pfmemalloc(skb)
3377                                 && !skb_pfmemalloc_protocol(skb))
3378                 goto drop;
3379
3380         if (vlan_tx_tag_present(skb)) {
3381                 if (pt_prev) {
3382                         ret = deliver_skb(skb, pt_prev, orig_dev);
3383                         pt_prev = NULL;
3384                 }
3385                 if (vlan_do_receive(&skb))
3386                         goto another_round;
3387                 else if (unlikely(!skb))
3388                         goto unlock;
3389         }
3390
3391         rx_handler = rcu_dereference(skb->dev->rx_handler);
3392         if (rx_handler) {
3393                 if (pt_prev) {
3394                         ret = deliver_skb(skb, pt_prev, orig_dev);
3395                         pt_prev = NULL;
3396                 }
3397                 switch (rx_handler(&skb)) {
3398                 case RX_HANDLER_CONSUMED:
3399                         goto unlock;
3400                 case RX_HANDLER_ANOTHER:
3401                         goto another_round;
3402                 case RX_HANDLER_EXACT:
3403                         deliver_exact = true;
3404                 case RX_HANDLER_PASS:
3405                         break;
3406                 default:
3407                         BUG();
3408                 }
3409         }
3410
3411         if (vlan_tx_nonzero_tag_present(skb))
3412                 skb->pkt_type = PACKET_OTHERHOST;
3413
3414         /* deliver only exact match when indicated */
3415         null_or_dev = deliver_exact ? skb->dev : NULL;
3416
3417         type = skb->protocol;
3418         list_for_each_entry_rcu(ptype,
3419                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3420                 if (ptype->type == type &&
3421                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3422                      ptype->dev == orig_dev)) {
3423                         if (pt_prev)
3424                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3425                         pt_prev = ptype;
3426                 }
3427         }
3428
3429         if (pt_prev) {
3430                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3431                         goto drop;
3432                 else
3433                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3434         } else {
3435 drop:
3436                 atomic_long_inc(&skb->dev->rx_dropped);
3437                 kfree_skb(skb);
3438                 /* Jamal, now you will not able to escape explaining
3439                  * me how you were going to use this. :-)
3440                  */
3441                 ret = NET_RX_DROP;
3442         }
3443
3444 unlock:
3445         rcu_read_unlock();
3446 out:
3447         tsk_restore_flags(current, pflags, PF_MEMALLOC);
3448         return ret;
3449 }
3450
3451 /**
3452  *      netif_receive_skb - process receive buffer from network
3453  *      @skb: buffer to process
3454  *
3455  *      netif_receive_skb() is the main receive data processing function.
3456  *      It always succeeds. The buffer may be dropped during processing
3457  *      for congestion control or by the protocol layers.
3458  *
3459  *      This function may only be called from softirq context and interrupts
3460  *      should be enabled.
3461  *
3462  *      Return values (usually ignored):
3463  *      NET_RX_SUCCESS: no congestion
3464  *      NET_RX_DROP: packet was dropped
3465  */
3466 int netif_receive_skb(struct sk_buff *skb)
3467 {
3468         net_timestamp_check(netdev_tstamp_prequeue, skb);
3469
3470         if (skb_defer_rx_timestamp(skb))
3471                 return NET_RX_SUCCESS;
3472
3473 #ifdef CONFIG_RPS
3474         if (static_key_false(&rps_needed)) {
3475                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3476                 int cpu, ret;
3477
3478                 rcu_read_lock();
3479
3480                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3481
3482                 if (cpu >= 0) {
3483                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3484                         rcu_read_unlock();
3485                         return ret;
3486                 }
3487                 rcu_read_unlock();
3488         }
3489 #endif
3490         return __netif_receive_skb(skb);
3491 }
3492 EXPORT_SYMBOL(netif_receive_skb);
3493
3494 /* Network device is going away, flush any packets still pending
3495  * Called with irqs disabled.
3496  */
3497 static void flush_backlog(void *arg)
3498 {
3499         struct net_device *dev = arg;
3500         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3501         struct sk_buff *skb, *tmp;
3502
3503         rps_lock(sd);
3504         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3505                 if (skb->dev == dev) {
3506                         __skb_unlink(skb, &sd->input_pkt_queue);
3507                         kfree_skb(skb);
3508                         input_queue_head_incr(sd);
3509                 }
3510         }
3511         rps_unlock(sd);
3512
3513         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3514                 if (skb->dev == dev) {
3515                         __skb_unlink(skb, &sd->process_queue);
3516                         kfree_skb(skb);
3517                         input_queue_head_incr(sd);
3518                 }
3519         }
3520 }
3521
3522 static int napi_gro_complete(struct sk_buff *skb)
3523 {
3524         struct packet_offload *ptype;
3525         __be16 type = skb->protocol;
3526         struct list_head *head = &offload_base;
3527         int err = -ENOENT;
3528
3529         if (NAPI_GRO_CB(skb)->count == 1) {
3530                 skb_shinfo(skb)->gso_size = 0;
3531                 goto out;
3532         }
3533
3534         rcu_read_lock();
3535         list_for_each_entry_rcu(ptype, head, list) {
3536                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3537                         continue;
3538
3539                 err = ptype->callbacks.gro_complete(skb);
3540                 break;
3541         }
3542         rcu_read_unlock();
3543
3544         if (err) {
3545                 WARN_ON(&ptype->list == head);
3546                 kfree_skb(skb);
3547                 return NET_RX_SUCCESS;
3548         }
3549
3550 out:
3551         return netif_receive_skb(skb);
3552 }
3553
3554 /* napi->gro_list contains packets ordered by age.
3555  * youngest packets at the head of it.
3556  * Complete skbs in reverse order to reduce latencies.
3557  */
3558 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3559 {
3560         struct sk_buff *skb, *prev = NULL;
3561
3562         /* scan list and build reverse chain */
3563         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3564                 skb->prev = prev;
3565                 prev = skb;
3566         }
3567
3568         for (skb = prev; skb; skb = prev) {
3569                 skb->next = NULL;
3570
3571                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3572                         return;
3573
3574                 prev = skb->prev;
3575                 napi_gro_complete(skb);
3576                 napi->gro_count--;
3577         }
3578
3579         napi->gro_list = NULL;
3580 }
3581 EXPORT_SYMBOL(napi_gro_flush);
3582
3583 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3584 {
3585         struct sk_buff **pp = NULL;
3586         struct packet_offload *ptype;
3587         __be16 type = skb->protocol;
3588         struct list_head *head = &offload_base;
3589         int same_flow;
3590         int mac_len;
3591         enum gro_result ret;
3592
3593         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3594                 goto normal;
3595
3596         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3597                 goto normal;
3598
3599         rcu_read_lock();
3600         list_for_each_entry_rcu(ptype, head, list) {
3601                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3602                         continue;
3603
3604                 skb_set_network_header(skb, skb_gro_offset(skb));
3605                 mac_len = skb->network_header - skb->mac_header;
3606                 skb->mac_len = mac_len;
3607                 NAPI_GRO_CB(skb)->same_flow = 0;
3608                 NAPI_GRO_CB(skb)->flush = 0;
3609                 NAPI_GRO_CB(skb)->free = 0;
3610
3611                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3612                 break;
3613         }
3614         rcu_read_unlock();
3615
3616         if (&ptype->list == head)
3617                 goto normal;
3618
3619         same_flow = NAPI_GRO_CB(skb)->same_flow;
3620         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3621
3622         if (pp) {
3623                 struct sk_buff *nskb = *pp;
3624
3625                 *pp = nskb->next;
3626                 nskb->next = NULL;
3627                 napi_gro_complete(nskb);
3628                 napi->gro_count--;
3629         }
3630
3631         if (same_flow)
3632                 goto ok;
3633
3634         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3635                 goto normal;
3636
3637         napi->gro_count++;
3638         NAPI_GRO_CB(skb)->count = 1;
3639         NAPI_GRO_CB(skb)->age = jiffies;
3640         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3641         skb->next = napi->gro_list;
3642         napi->gro_list = skb;
3643         ret = GRO_HELD;
3644
3645 pull:
3646         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3647                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3648
3649                 BUG_ON(skb->end - skb->tail < grow);
3650
3651                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3652
3653                 skb->tail += grow;
3654                 skb->data_len -= grow;
3655
3656                 skb_shinfo(skb)->frags[0].page_offset += grow;
3657                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3658
3659                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3660                         skb_frag_unref(skb, 0);
3661                         memmove(skb_shinfo(skb)->frags,
3662                                 skb_shinfo(skb)->frags + 1,
3663                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3664                 }
3665         }
3666
3667 ok:
3668         return ret;
3669
3670 normal:
3671         ret = GRO_NORMAL;
3672         goto pull;
3673 }
3674 EXPORT_SYMBOL(dev_gro_receive);
3675
3676 static inline gro_result_t
3677 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3678 {
3679         struct sk_buff *p;
3680         unsigned int maclen = skb->dev->hard_header_len;
3681
3682         for (p = napi->gro_list; p; p = p->next) {
3683                 unsigned long diffs;
3684
3685                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3686                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3687                 if (maclen == ETH_HLEN)
3688                         diffs |= compare_ether_header(skb_mac_header(p),
3689                                                       skb_gro_mac_header(skb));
3690                 else if (!diffs)
3691                         diffs = memcmp(skb_mac_header(p),
3692                                        skb_gro_mac_header(skb),
3693                                        maclen);
3694                 NAPI_GRO_CB(p)->same_flow = !diffs;
3695                 NAPI_GRO_CB(p)->flush = 0;
3696         }
3697
3698         return dev_gro_receive(napi, skb);
3699 }
3700
3701 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3702 {
3703         switch (ret) {
3704         case GRO_NORMAL:
3705                 if (netif_receive_skb(skb))
3706                         ret = GRO_DROP;
3707                 break;
3708
3709         case GRO_DROP:
3710                 kfree_skb(skb);
3711                 break;
3712
3713         case GRO_MERGED_FREE:
3714                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3715                         kmem_cache_free(skbuff_head_cache, skb);
3716                 else
3717                         __kfree_skb(skb);
3718                 break;
3719
3720         case GRO_HELD:
3721         case GRO_MERGED:
3722                 break;
3723         }
3724
3725         return ret;
3726 }
3727 EXPORT_SYMBOL(napi_skb_finish);
3728
3729 static void skb_gro_reset_offset(struct sk_buff *skb)
3730 {
3731         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3732         const skb_frag_t *frag0 = &pinfo->frags[0];
3733
3734         NAPI_GRO_CB(skb)->data_offset = 0;
3735         NAPI_GRO_CB(skb)->frag0 = NULL;
3736         NAPI_GRO_CB(skb)->frag0_len = 0;
3737
3738         if (skb->mac_header == skb->tail &&
3739             pinfo->nr_frags &&
3740             !PageHighMem(skb_frag_page(frag0))) {
3741                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3742                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3743         }
3744 }
3745
3746 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3747 {
3748         skb_gro_reset_offset(skb);
3749
3750         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3751 }
3752 EXPORT_SYMBOL(napi_gro_receive);
3753
3754 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3755 {
3756         __skb_pull(skb, skb_headlen(skb));
3757         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3758         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3759         skb->vlan_tci = 0;
3760         skb->dev = napi->dev;
3761         skb->skb_iif = 0;
3762
3763         napi->skb = skb;
3764 }
3765
3766 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3767 {
3768         struct sk_buff *skb = napi->skb;
3769
3770         if (!skb) {
3771                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3772                 if (skb)
3773                         napi->skb = skb;
3774         }
3775         return skb;
3776 }
3777 EXPORT_SYMBOL(napi_get_frags);
3778
3779 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3780                                gro_result_t ret)
3781 {
3782         switch (ret) {
3783         case GRO_NORMAL:
3784         case GRO_HELD:
3785                 skb->protocol = eth_type_trans(skb, skb->dev);
3786
3787                 if (ret == GRO_HELD)
3788                         skb_gro_pull(skb, -ETH_HLEN);
3789                 else if (netif_receive_skb(skb))
3790                         ret = GRO_DROP;
3791                 break;
3792
3793         case GRO_DROP:
3794         case GRO_MERGED_FREE:
3795                 napi_reuse_skb(napi, skb);
3796                 break;
3797
3798         case GRO_MERGED:
3799                 break;
3800         }
3801
3802         return ret;
3803 }
3804 EXPORT_SYMBOL(napi_frags_finish);
3805
3806 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3807 {
3808         struct sk_buff *skb = napi->skb;
3809         struct ethhdr *eth;
3810         unsigned int hlen;
3811         unsigned int off;
3812
3813         napi->skb = NULL;
3814
3815         skb_reset_mac_header(skb);
3816         skb_gro_reset_offset(skb);
3817
3818         off = skb_gro_offset(skb);
3819         hlen = off + sizeof(*eth);
3820         eth = skb_gro_header_fast(skb, off);
3821         if (skb_gro_header_hard(skb, hlen)) {
3822                 eth = skb_gro_header_slow(skb, hlen, off);
3823                 if (unlikely(!eth)) {
3824                         napi_reuse_skb(napi, skb);
3825                         skb = NULL;
3826                         goto out;
3827                 }
3828         }
3829
3830         skb_gro_pull(skb, sizeof(*eth));
3831
3832         /*
3833          * This works because the only protocols we care about don't require
3834          * special handling.  We'll fix it up properly at the end.
3835          */
3836         skb->protocol = eth->h_proto;
3837
3838 out:
3839         return skb;
3840 }
3841
3842 gro_result_t napi_gro_frags(struct napi_struct *napi)
3843 {
3844         struct sk_buff *skb = napi_frags_skb(napi);
3845
3846         if (!skb)
3847                 return GRO_DROP;
3848
3849         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3850 }
3851 EXPORT_SYMBOL(napi_gro_frags);
3852
/*
 * net_rps_action sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 *
 * With CONFIG_RPS the pending list is detached from @sd before irqs are
 * enabled, and the IPIs are then sent with irqs on.  Without CONFIG_RPS
 * this only enables local irqs.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (!remsd) {
		local_irq_enable();
		return;
	}

	sd->rps_ipi_list = NULL;
	local_irq_enable();

	/* Send pending IPI's to kick RPS processing on remote cpus. */
	while (remsd) {
		/* Read the link before issuing the call for this entry. */
		struct softnet_data *next = remsd->rps_ipi_next;

		if (cpu_online(remsd->cpu))
			__smp_call_function_single(remsd->cpu,
						   &remsd->csd, 0);
		remsd = next;
	}
#else
	local_irq_enable();
#endif
}
3880
3881 static int process_backlog(struct napi_struct *napi, int quota)
3882 {
3883         int work = 0;
3884         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3885
3886 #ifdef CONFIG_RPS
3887         /* Check if we have pending ipi, its better to send them now,
3888          * not waiting net_rx_action() end.
3889          */
3890         if (sd->rps_ipi_list) {
3891                 local_irq_disable();
3892                 net_rps_action_and_irq_enable(sd);
3893         }
3894 #endif
3895         napi->weight = weight_p;
3896         local_irq_disable();
3897         while (work < quota) {
3898                 struct sk_buff *skb;
3899                 unsigned int qlen;
3900
3901                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3902                         local_irq_enable();
3903                         __netif_receive_skb(skb);
3904                         local_irq_disable();
3905                         input_queue_head_incr(sd);
3906                         if (++work >= quota) {
3907                                 local_irq_enable();
3908                                 return work;
3909                         }
3910                 }
3911
3912                 rps_lock(sd);
3913                 qlen = skb_queue_len(&sd->input_pkt_queue);
3914                 if (qlen)
3915                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
3916                                                    &sd->process_queue);
3917
3918                 if (qlen < quota - work) {
3919                         /*
3920                          * Inline a custom version of __napi_complete().
3921                          * only current cpu owns and manipulates this napi,
3922                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3923                          * we can use a plain write instead of clear_bit(),
3924                          * and we dont need an smp_mb() memory barrier.
3925                          */
3926                         list_del(&napi->poll_list);
3927                         napi->state = 0;
3928
3929                         quota = work + qlen;
3930                 }
3931                 rps_unlock(sd);
3932         }
3933         local_irq_enable();
3934
3935         return work;
3936 }
3937
3938 /**
3939  * __napi_schedule - schedule for receive
3940  * @n: entry to schedule
3941  *
3942  * The entry's receive function will be scheduled to run
3943  */
3944 void __napi_schedule(struct napi_struct *n)
3945 {
3946         unsigned long flags;
3947
3948         local_irq_save(flags);
3949         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3950         local_irq_restore(flags);
3951 }
3952 EXPORT_SYMBOL(__napi_schedule);
3953
3954 void __napi_complete(struct napi_struct *n)
3955 {
3956         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3957         BUG_ON(n->gro_list);
3958
3959         list_del(&n->poll_list);
3960         smp_mb__before_clear_bit();
3961         clear_bit(NAPI_STATE_SCHED, &n->state);
3962 }
3963 EXPORT_SYMBOL(__napi_complete);
3964
3965 void napi_complete(struct napi_struct *n)
3966 {
3967         unsigned long flags;
3968
3969         /*
3970          * don't let napi dequeue from the cpu poll list
3971          * just in case its running on a different cpu
3972          */
3973         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3974                 return;
3975
3976         napi_gro_flush(n, false);
3977         local_irq_save(flags);
3978         __napi_complete(n);
3979         local_irq_restore(flags);
3980 }
3981 EXPORT_SYMBOL(napi_complete);
3982
3983 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3984                     int (*poll)(struct napi_struct *, int), int weight)
3985 {
3986         INIT_LIST_HEAD(&napi->poll_list);
3987         napi->gro_count = 0;
3988         napi->gro_list = NULL;
3989         napi->skb = NULL;
3990         napi->poll = poll;
3991         napi->weight = weight;
3992         list_add(&napi->dev_list, &dev->napi_list);
3993         napi->dev = dev;
3994 #ifdef CONFIG_NETPOLL
3995         spin_lock_init(&napi->poll_lock);
3996         napi->poll_owner = -1;
3997 #endif
3998         set_bit(NAPI_STATE_SCHED, &napi->state);
3999 }
4000 EXPORT_SYMBOL(netif_napi_add);
4001
4002 void netif_napi_del(struct napi_struct *napi)
4003 {
4004         struct sk_buff *skb, *next;
4005
4006         list_del_init(&napi->dev_list);
4007         napi_free_frags(napi);
4008
4009         for (skb = napi->gro_list; skb; skb = next) {
4010                 next = skb->next;
4011                 skb->next = NULL;
4012                 kfree_skb(skb);
4013         }
4014
4015         napi->gro_list = NULL;
4016         napi->gro_count = 0;
4017 }
4018 EXPORT_SYMBOL(netif_napi_del);
4019
4020 static void net_rx_action(struct softirq_action *h)
4021 {
4022         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4023         unsigned long time_limit = jiffies + 2;
4024         int budget = netdev_budget;
4025         void *have;
4026
4027         local_irq_disable();
4028
4029         while (!list_empty(&sd->poll_list)) {
4030                 struct napi_struct *n;
4031                 int work, weight;
4032
4033                 /* If softirq window is exhuasted then punt.
4034                  * Allow this to run for 2 jiffies since which will allow
4035                  * an average latency of 1.5/HZ.
4036                  */
4037                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
4038                         goto softnet_break;
4039
4040                 local_irq_enable();
4041
4042                 /* Even though interrupts have been re-enabled, this
4043                  * access is safe because interrupts can only add new
4044                  * entries to the tail of this list, and only ->poll()
4045                  * calls can remove this head entry from the list.
4046                  */
4047                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4048
4049                 have = netpoll_poll_lock(n);
4050
4051                 weight = n->weight;
4052
4053                 /* This NAPI_STATE_SCHED test is for avoiding a race
4054                  * with netpoll's poll_napi().  Only the entity which
4055                  * obtains the lock and sees NAPI_STATE_SCHED set will
4056                  * actually make the ->poll() call.  Therefore we avoid
4057                  * accidentally calling ->poll() when NAPI is not scheduled.
4058                  */
4059                 work = 0;
4060                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4061                         work = n->poll(n, weight);
4062                         trace_napi_poll(n);
4063                 }
4064
4065                 WARN_ON_ONCE(work > weight);
4066
4067                 budget -= work;
4068
4069                 local_irq_disable();
4070
4071                 /* Drivers must not modify the NAPI state if they
4072                  * consume the entire weight.  In such cases this code
4073                  * still "owns" the NAPI instance and therefore can
4074                  * move the instance around on the list at-will.
4075                  */
4076                 if (unlikely(work == weight)) {
4077                         if (unlikely(napi_disable_pending(n))) {
4078                                 local_irq_enable();
4079                                 napi_complete(n);
4080                                 local_irq_disable();
4081                         } else {
4082                                 if (n->gro_list) {
4083                                         /* flush too old packets
4084                                          * If HZ < 1000, flush all packets.
4085                                          */
4086                                         local_irq_enable();
4087                                         napi_gro_flush(n, HZ >= 1000);
4088                                         local_irq_disable();
4089                                 }
4090                                 list_move_tail(&n->poll_list, &sd->poll_list);
4091                         }
4092                 }
4093
4094                 netpoll_poll_unlock(have);
4095         }
4096 out:
4097         net_rps_action_and_irq_enable(sd);
4098
4099 #ifdef CONFIG_NET_DMA
4100         /*
4101          * There may not be any more sk_buffs coming right now, so push
4102          * any pending DMA copies to hardware
4103          */
4104         dma_issue_pending_all();
4105 #endif
4106
4107         return;
4108
4109 softnet_break:
4110         sd->time_squeeze++;
4111         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4112         goto out;
4113 }
4114
4115 static gifconf_func_t *gifconf_list[NPROTO];
4116
4117 /**
4118  *      register_gifconf        -       register a SIOCGIF handler
4119  *      @family: Address family
4120  *      @gifconf: Function handler
4121  *
4122  *      Register protocol dependent address dumping routines. The handler
4123  *      that is passed must not be freed or reused until it has been replaced
4124  *      by another handler.
4125  */
4126 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
4127 {
4128         if (family >= NPROTO)
4129                 return -EINVAL;
4130         gifconf_list[family] = gifconf;
4131         return 0;
4132 }
4133 EXPORT_SYMBOL(register_gifconf);
4134
4135
4136 /*
4137  *      Map an interface index to its name (SIOCGIFNAME)
4138  */
4139
4140 /*
4141  *      We need this ioctl for efficient implementation of the
4142  *      if_indextoname() function required by the IPv6 API.  Without
4143  *      it, we would have to search all the interfaces to find a
4144  *      match.  --pb
4145  */
4146
4147 static int dev_ifname(struct net *net, struct ifreq __user *arg)
4148 {
4149         struct net_device *dev;
4150         struct ifreq ifr;
4151
4152         /*
4153          *      Fetch the caller's info block.
4154          */
4155
4156         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4157                 return -EFAULT;
4158
4159         rcu_read_lock();
4160         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4161         if (!dev) {
4162                 rcu_read_unlock();
4163                 return -ENODEV;
4164         }
4165
4166         strcpy(ifr.ifr_name, dev->name);
4167         rcu_read_unlock();
4168
4169         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4170                 return -EFAULT;
4171         return 0;
4172 }
4173
4174 /*
4175  *      Perform a SIOCGIFCONF call. This structure will change
4176  *      size eventually, and there is nothing I can do about it.
4177  *      Thus we will need a 'compatibility mode'.
4178  */
4179
4180 static int dev_ifconf(struct net *net, char __user *arg)
4181 {
4182         struct ifconf ifc;
4183         struct net_device *dev;
4184         char __user *pos;
4185         int len;
4186         int total;
4187         int i;
4188
4189         /*
4190          *      Fetch the caller's info block.
4191          */
4192
4193         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4194                 return -EFAULT;
4195
4196         pos = ifc.ifc_buf;
4197         len = ifc.ifc_len;
4198
4199         /*
4200          *      Loop over the interfaces, and write an info block for each.
4201          */
4202
4203         total = 0;
4204         for_each_netdev(net, dev) {
4205                 for (i = 0; i < NPROTO; i++) {
4206                         if (gifconf_list[i]) {
4207                                 int done;
4208                                 if (!pos)
4209                                         done = gifconf_list[i](dev, NULL, 0);
4210                                 else
4211                                         done = gifconf_list[i](dev, pos + total,
4212                                                                len - total);
4213                                 if (done < 0)
4214                                         return -EFAULT;
4215                                 total += done;
4216                         }
4217                 }
4218         }
4219
4220         /*
4221          *      All done.  Write the updated control block back to the caller.
4222          */
4223         ifc.ifc_len = total;
4224
4225         /*
4226          *      Both BSD and Solaris return 0 here, so we do too.
4227          */
4228         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4229 }
4230
4231 #ifdef CONFIG_PROC_FS
4232
/*
 * A seq_file position packs a hash bucket number in the high bits and an
 * offset within that bucket in the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE		(32 - NETDEV_HASHBITS - 1)

/* Extract the bucket number / the in-bucket offset from a position. */
#define get_bucket(x)		((x) >> BUCKET_SPACE)
#define get_offset(x)		((x) & ((1 << BUCKET_SPACE) - 1))

/* Build a position from bucket number @b and in-bucket offset @o. */
#define set_bucket_offset(b, o)	(((b) << BUCKET_SPACE) | (o))
4238
4239 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4240 {
4241         struct net *net = seq_file_net(seq);
4242         struct net_device *dev;
4243         struct hlist_node *p;
4244         struct hlist_head *h;
4245         unsigned int count = 0, offset = get_offset(*pos);
4246
4247         h = &net->dev_name_head[get_bucket(*pos)];
4248         hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4249                 if (++count == offset)
4250                         return dev;
4251         }
4252
4253         return NULL;
4254 }
4255
4256 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4257 {
4258         struct net_device *dev;
4259         unsigned int bucket;
4260
4261         do {
4262                 dev = dev_from_same_bucket(seq, pos);
4263                 if (dev)
4264                         return dev;
4265
4266                 bucket = get_bucket(*pos) + 1;
4267                 *pos = set_bucket_offset(bucket, 1);
4268         } while (bucket < NETDEV_HASHENTRIES);
4269
4270         return NULL;
4271 }
4272
4273 /*
4274  *      This is invoked by the /proc filesystem handler to display a device
4275  *      in detail.
4276  */
4277 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4278         __acquires(RCU)
4279 {
4280         rcu_read_lock();
4281         if (!*pos)
4282                 return SEQ_START_TOKEN;
4283
4284         if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4285                 return NULL;
4286
4287         return dev_from_bucket(seq, pos);
4288 }
4289
4290 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4291 {
4292         ++*pos;
4293         return dev_from_bucket(seq, pos);
4294 }
4295
4296 void dev_seq_stop(struct seq_file *seq, void *v)
4297         __releases(RCU)
4298 {
4299         rcu_read_unlock();
4300 }
4301
4302 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4303 {
4304         struct rtnl_link_stats64 temp;
4305         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4306
4307         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4308                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4309                    dev->name, stats->rx_bytes, stats->rx_packets,
4310                    stats->rx_errors,
4311                    stats->rx_dropped + stats->rx_missed_errors,
4312                    stats->rx_fifo_errors,
4313                    stats->rx_length_errors + stats->rx_over_errors +
4314                     stats->rx_crc_errors + stats->rx_frame_errors,
4315                    stats->rx_compressed, stats->multicast,
4316                    stats->tx_bytes, stats->tx_packets,
4317                    stats->tx_errors, stats->tx_dropped,
4318                    stats->tx_fifo_errors, stats->collisions,
4319                    stats->tx_carrier_errors +
4320                     stats->tx_aborted_errors +
4321                     stats->tx_window_errors +
4322                     stats->tx_heartbeat_errors,
4323                    stats->tx_compressed);
4324 }
4325
4326 /*
4327  *      Called from the PROCfs module. This now uses the new arbitrary sized
4328  *      /proc/net interface to create /proc/net/dev
4329  */
4330 static int dev_seq_show(struct seq_file *seq, void *v)
4331 {
4332         if (v == SEQ_START_TOKEN)
4333                 seq_puts(seq, "Inter-|   Receive                            "
4334                               "                    |  Transmit\n"
4335                               " face |bytes    packets errs drop fifo frame "
4336                               "compressed multicast|bytes    packets errs "
4337                               "drop fifo colls carrier compressed\n");
4338         else
4339                 dev_seq_printf_stats(seq, v);
4340         return 0;
4341 }
4342
4343 static struct softnet_data *softnet_get_online(loff_t *pos)
4344 {
4345         struct softnet_data *sd = NULL;
4346
4347         while (*pos < nr_cpu_ids)
4348                 if (cpu_online(*pos)) {
4349                         sd = &per_cpu(softnet_data, *pos);
4350                         break;
4351                 } else
4352                         ++*pos;
4353         return sd;
4354 }
4355
4356 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4357 {
4358         return softnet_get_online(pos);
4359 }
4360
4361 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4362 {
4363         ++*pos;
4364         return softnet_get_online(pos);
4365 }
4366
/*
 *      seq_file ->stop callback for the softnet statistics.  The matching
 *      ->start takes no lock, so there is nothing to release.
 */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
        /* intentionally empty */
}
4370
4371 static int softnet_seq_show(struct seq_file *seq, void *v)
4372 {
4373         struct softnet_data *sd = v;
4374
4375         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4376                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4377                    0, 0, 0, 0, /* was fastroute */
4378                    sd->cpu_collision, sd->received_rps);
4379         return 0;
4380 }
4381
4382 static const struct seq_operations dev_seq_ops = {
4383         .start = dev_seq_start,
4384         .next  = dev_seq_next,
4385         .stop  = dev_seq_stop,
4386         .show  = dev_seq_show,
4387 };
4388
4389 static int dev_seq_open(struct inode *inode, struct file *file)
4390 {
4391         return seq_open_net(inode, file, &dev_seq_ops,
4392                             sizeof(struct seq_net_private));
4393 }
4394
4395 static const struct file_operations dev_seq_fops = {
4396         .owner   = THIS_MODULE,
4397         .open    = dev_seq_open,
4398         .read    = seq_read,
4399         .llseek  = seq_lseek,
4400         .release = seq_release_net,
4401 };
4402
4403 static const struct seq_operations softnet_seq_ops = {
4404         .start = softnet_seq_start,
4405         .next  = softnet_seq_next,
4406         .stop  = softnet_seq_stop,
4407         .show  = softnet_seq_show,
4408 };
4409
4410 static int softnet_seq_open(struct inode *inode, struct file *file)
4411 {
4412         return seq_open(file, &softnet_seq_ops);
4413 }
4414
4415 static const struct file_operations softnet_seq_fops = {
4416         .owner   = THIS_MODULE,
4417         .open    = softnet_seq_open,
4418         .read    = seq_read,
4419         .llseek  = seq_lseek,
4420         .release = seq_release,
4421 };
4422
4423 static void *ptype_get_idx(loff_t pos)
4424 {
4425         struct packet_type *pt = NULL;
4426         loff_t i = 0;
4427         int t;
4428
4429         list_for_each_entry_rcu(pt, &ptype_all, list) {
4430                 if (i == pos)
4431                         return pt;
4432                 ++i;
4433         }
4434
4435         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4436                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4437                         if (i == pos)
4438                                 return pt;
4439                         ++i;
4440                 }
4441         }
4442         return NULL;
4443 }
4444
4445 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4446         __acquires(RCU)
4447 {
4448         rcu_read_lock();
4449         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4450 }
4451
4452 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4453 {
4454         struct packet_type *pt;
4455         struct list_head *nxt;
4456         int hash;
4457
4458         ++*pos;
4459         if (v == SEQ_START_TOKEN)
4460                 return ptype_get_idx(0);
4461
4462         pt = v;
4463         nxt = pt->list.next;
4464         if (pt->type == htons(ETH_P_ALL)) {
4465                 if (nxt != &ptype_all)
4466                         goto found;
4467                 hash = 0;
4468                 nxt = ptype_base[0].next;
4469         } else
4470                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4471
4472         while (nxt == &ptype_base[hash]) {
4473                 if (++hash >= PTYPE_HASH_SIZE)
4474                         return NULL;
4475                 nxt = ptype_base[hash].next;
4476         }
4477 found:
4478         return list_entry(nxt, struct packet_type, list);
4479 }
4480
4481 static void ptype_seq_stop(struct seq_file *seq, void *v)
4482         __releases(RCU)
4483 {
4484         rcu_read_unlock();
4485 }
4486
4487 static int ptype_seq_show(struct seq_file *seq, void *v)
4488 {
4489         struct packet_type *pt = v;
4490
4491         if (v == SEQ_START_TOKEN)
4492                 seq_puts(seq, "Type Device      Function\n");
4493         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4494                 if (pt->type == htons(ETH_P_ALL))
4495                         seq_puts(seq, "ALL ");
4496                 else
4497                         seq_printf(seq, "%04x", ntohs(pt->type));
4498
4499                 seq_printf(seq, " %-8s %pF\n",
4500                            pt->dev ? pt->dev->name : "", pt->func);
4501         }
4502
4503         return 0;
4504 }
4505
4506 static const struct seq_operations ptype_seq_ops = {
4507         .start = ptype_seq_start,
4508         .next  = ptype_seq_next,
4509         .stop  = ptype_seq_stop,
4510         .show  = ptype_seq_show,
4511 };
4512
4513 static int ptype_seq_open(struct inode *inode, struct file *file)
4514 {
4515         return seq_open_net(inode, file, &ptype_seq_ops,
4516                         sizeof(struct seq_net_private));
4517 }
4518
4519 static const struct file_operations ptype_seq_fops = {
4520         .owner   = THIS_MODULE,
4521         .open    = ptype_seq_open,
4522         .read    = seq_read,
4523         .llseek  = seq_lseek,
4524         .release = seq_release_net,
4525 };
4526
4527
4528 static int __net_init dev_proc_net_init(struct net *net)
4529 {
4530         int rc = -ENOMEM;
4531
4532         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4533                 goto out;
4534         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4535                 goto out_dev;
4536         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4537                 goto out_softnet;
4538
4539         if (wext_proc_init(net))
4540                 goto out_ptype;
4541         rc = 0;
4542 out:
4543         return rc;
4544 out_ptype:
4545         proc_net_remove(net, "ptype");
4546 out_softnet:
4547         proc_net_remove(net, "softnet_stat");
4548 out_dev:
4549         proc_net_remove(net, "dev");
4550         goto out;
4551 }
4552
4553 static void __net_exit dev_proc_net_exit(struct net *net)
4554 {
4555         wext_proc_exit(net);
4556
4557         proc_net_remove(net, "ptype");
4558         proc_net_remove(net, "softnet_stat");
4559         proc_net_remove(net, "dev");
4560 }
4561
4562 static struct pernet_operations __net_initdata dev_proc_ops = {
4563         .init = dev_proc_net_init,
4564         .exit = dev_proc_net_exit,
4565 };
4566
4567 static int __init dev_proc_init(void)
4568 {
4569         return register_pernet_subsys(&dev_proc_ops);
4570 }
4571 #else
4572 #define dev_proc_init() 0
4573 #endif  /* CONFIG_PROC_FS */
4574
4575
4576 /**
4577  *      netdev_set_master       -       set up master pointer
4578  *      @slave: slave device
4579  *      @master: new master device
4580  *
4581  *      Changes the master device of the slave. Pass %NULL to break the
4582  *      bonding. The caller must hold the RTNL semaphore. On a failure
4583  *      a negative errno code is returned. On success the reference counts
4584  *      are adjusted and the function returns zero.
4585  */
4586 int netdev_set_master(struct net_device *slave, struct net_device *master)
4587 {
4588         struct net_device *old = slave->master;
4589
4590         ASSERT_RTNL();
4591
4592         if (master) {
4593                 if (old)
4594                         return -EBUSY;
4595                 dev_hold(master);
4596         }
4597
4598         slave->master = master;
4599
4600         if (old)
4601                 dev_put(old);
4602         return 0;
4603 }
4604 EXPORT_SYMBOL(netdev_set_master);
4605
4606 /**
4607  *      netdev_set_bond_master  -       set up bonding master/slave pair
4608  *      @slave: slave device
4609  *      @master: new master device
4610  *
4611  *      Changes the master device of the slave. Pass %NULL to break the
4612  *      bonding. The caller must hold the RTNL semaphore. On a failure
4613  *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4614  *      to the routing socket and the function returns zero.
4615  */
4616 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4617 {
4618         int err;
4619
4620         ASSERT_RTNL();
4621
4622         err = netdev_set_master(slave, master);
4623         if (err)
4624                 return err;
4625         if (master)
4626                 slave->flags |= IFF_SLAVE;
4627         else
4628                 slave->flags &= ~IFF_SLAVE;
4629
4630         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4631         return 0;
4632 }
4633 EXPORT_SYMBOL(netdev_set_bond_master);
4634
4635 static void dev_change_rx_flags(struct net_device *dev, int flags)
4636 {
4637         const struct net_device_ops *ops = dev->netdev_ops;
4638
4639         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4640                 ops->ndo_change_rx_flags(dev, flags);
4641 }
4642
4643 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4644 {
4645         unsigned int old_flags = dev->flags;
4646         kuid_t uid;
4647         kgid_t gid;
4648
4649         ASSERT_RTNL();
4650
4651         dev->flags |= IFF_PROMISC;
4652         dev->promiscuity += inc;
4653         if (dev->promiscuity == 0) {
4654                 /*
4655                  * Avoid overflow.
4656                  * If inc causes overflow, untouch promisc and return error.
4657                  */
4658                 if (inc < 0)
4659                         dev->flags &= ~IFF_PROMISC;
4660                 else {
4661                         dev->promiscuity -= inc;
4662                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4663                                 dev->name);
4664                         return -EOVERFLOW;
4665                 }
4666         }
4667         if (dev->flags != old_flags) {
4668                 pr_info("device %s %s promiscuous mode\n",
4669                         dev->name,
4670                         dev->flags & IFF_PROMISC ? "entered" : "left");
4671                 if (audit_enabled) {
4672                         current_uid_gid(&uid, &gid);
4673                         audit_log(current->audit_context, GFP_ATOMIC,
4674                                 AUDIT_ANOM_PROMISCUOUS,
4675                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4676                                 dev->name, (dev->flags & IFF_PROMISC),
4677                                 (old_flags & IFF_PROMISC),
4678                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
4679                                 from_kuid(&init_user_ns, uid),
4680                                 from_kgid(&init_user_ns, gid),
4681                                 audit_get_sessionid(current));
4682                 }
4683
4684                 dev_change_rx_flags(dev, IFF_PROMISC);
4685         }
4686         return 0;
4687 }
4688
4689 /**
4690  *      dev_set_promiscuity     - update promiscuity count on a device
4691  *      @dev: device
4692  *      @inc: modifier
4693  *
4694  *      Add or remove promiscuity from a device. While the count in the device
4695  *      remains above zero the interface remains promiscuous. Once it hits zero
4696  *      the device reverts back to normal filtering operation. A negative inc
4697  *      value is used to drop promiscuity on the device.
4698  *      Return 0 if successful or a negative errno code on error.
4699  */
4700 int dev_set_promiscuity(struct net_device *dev, int inc)
4701 {
4702         unsigned int old_flags = dev->flags;
4703         int err;
4704
4705         err = __dev_set_promiscuity(dev, inc);
4706         if (err < 0)
4707                 return err;
4708         if (dev->flags != old_flags)
4709                 dev_set_rx_mode(dev);
4710         return err;
4711 }
4712 EXPORT_SYMBOL(dev_set_promiscuity);
4713
4714 /**
4715  *      dev_set_allmulti        - update allmulti count on a device
4716  *      @dev: device
4717  *      @inc: modifier
4718  *
4719  *      Add or remove reception of all multicast frames to a device. While the
4720  *      count in the device remains above zero the interface remains listening
4721  *      to all interfaces. Once it hits zero the device reverts back to normal
4722  *      filtering operation. A negative @inc value is used to drop the counter
4723  *      when releasing a resource needing all multicasts.
4724  *      Return 0 if successful or a negative errno code on error.
4725  */
4726
4727 int dev_set_allmulti(struct net_device *dev, int inc)
4728 {
4729         unsigned int old_flags = dev->flags;
4730
4731         ASSERT_RTNL();
4732
4733         dev->flags |= IFF_ALLMULTI;
4734         dev->allmulti += inc;
4735         if (dev->allmulti == 0) {
4736                 /*
4737                  * Avoid overflow.
4738                  * If inc causes overflow, untouch allmulti and return error.
4739                  */
4740                 if (inc < 0)
4741                         dev->flags &= ~IFF_ALLMULTI;
4742                 else {
4743                         dev->allmulti -= inc;
4744                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4745                                 dev->name);
4746                         return -EOVERFLOW;
4747                 }
4748         }
4749         if (dev->flags ^ old_flags) {
4750                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4751                 dev_set_rx_mode(dev);
4752         }
4753         return 0;
4754 }
4755 EXPORT_SYMBOL(dev_set_allmulti);
4756
4757 /*
4758  *      Upload unicast and multicast address lists to device and
4759  *      configure RX filtering. When the device doesn't support unicast
4760  *      filtering it is put in promiscuous mode while unicast addresses
4761  *      are present.
4762  */
4763 void __dev_set_rx_mode(struct net_device *dev)
4764 {
4765         const struct net_device_ops *ops = dev->netdev_ops;
4766
4767         /* dev_open will call this function so the list will stay sane. */
4768         if (!(dev->flags&IFF_UP))
4769                 return;
4770
4771         if (!netif_device_present(dev))
4772                 return;
4773
4774         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4775                 /* Unicast addresses changes may only happen under the rtnl,
4776                  * therefore calling __dev_set_promiscuity here is safe.
4777                  */
4778                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4779                         __dev_set_promiscuity(dev, 1);
4780                         dev->uc_promisc = true;
4781                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4782                         __dev_set_promiscuity(dev, -1);
4783                         dev->uc_promisc = false;
4784                 }
4785         }
4786
4787         if (ops->ndo_set_rx_mode)
4788                 ops->ndo_set_rx_mode(dev);
4789 }
4790
/*
 *      Locked wrapper: run __dev_set_rx_mode() with the device address lock
 *      held (taken and released with the _bh variants).
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);

        __dev_set_rx_mode(dev);

        netif_addr_unlock_bh(dev);
}
4797
4798 /**
4799  *      dev_get_flags - get flags reported to userspace
4800  *      @dev: device
4801  *
4802  *      Get the combination of flag bits exported through APIs to userspace.
4803  */
4804 unsigned int dev_get_flags(const struct net_device *dev)
4805 {
4806         unsigned int flags;
4807
4808         flags = (dev->flags & ~(IFF_PROMISC |
4809                                 IFF_ALLMULTI |
4810                                 IFF_RUNNING |
4811                                 IFF_LOWER_UP |
4812                                 IFF_DORMANT)) |
4813                 (dev->gflags & (IFF_PROMISC |
4814                                 IFF_ALLMULTI));
4815
4816         if (netif_running(dev)) {
4817                 if (netif_oper_up(dev))
4818                         flags |= IFF_RUNNING;
4819                 if (netif_carrier_ok(dev))
4820                         flags |= IFF_LOWER_UP;
4821                 if (netif_dormant(dev))
4822                         flags |= IFF_DORMANT;
4823         }
4824
4825         return flags;
4826 }
4827 EXPORT_SYMBOL(dev_get_flags);
4828
4829 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4830 {
4831         unsigned int old_flags = dev->flags;
4832         int ret;
4833
4834         ASSERT_RTNL();
4835
4836         /*
4837          *      Set the flags on our device.
4838          */
4839
4840         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4841                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4842                                IFF_AUTOMEDIA)) |
4843                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4844                                     IFF_ALLMULTI));
4845
4846         /*
4847          *      Load in the correct multicast list now the flags have changed.
4848          */
4849
4850         if ((old_flags ^ flags) & IFF_MULTICAST)
4851                 dev_change_rx_flags(dev, IFF_MULTICAST);
4852
4853         dev_set_rx_mode(dev);
4854
4855         /*
4856          *      Have we downed the interface. We handle IFF_UP ourselves
4857          *      according to user attempts to set it, rather than blindly
4858          *      setting it.
4859          */
4860
4861         ret = 0;
4862         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4863                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4864
4865                 if (!ret)
4866                         dev_set_rx_mode(dev);
4867         }
4868
4869         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4870                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4871
4872                 dev->gflags ^= IFF_PROMISC;
4873                 dev_set_promiscuity(dev, inc);
4874         }
4875
4876         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4877            is important. Some (broken) drivers set IFF_PROMISC, when
4878            IFF_ALLMULTI is requested not asking us and not reporting.
4879          */
4880         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4881                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4882
4883                 dev->gflags ^= IFF_ALLMULTI;
4884                 dev_set_allmulti(dev, inc);
4885         }
4886
4887         return ret;
4888 }
4889
4890 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4891 {
4892         unsigned int changes = dev->flags ^ old_flags;
4893
4894         if (changes & IFF_UP) {
4895                 if (dev->flags & IFF_UP)
4896                         call_netdevice_notifiers(NETDEV_UP, dev);
4897                 else
4898                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4899         }
4900
4901         if (dev->flags & IFF_UP &&
4902             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4903                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4904 }
4905
4906 /**
4907  *      dev_change_flags - change device settings
4908  *      @dev: device
4909  *      @flags: device state flags
4910  *
4911  *      Change settings on device based state flags. The flags are
4912  *      in the userspace exported format.
4913  */
4914 int dev_change_flags(struct net_device *dev, unsigned int flags)
4915 {
4916         int ret;
4917         unsigned int changes, old_flags = dev->flags;
4918
4919         ret = __dev_change_flags(dev, flags);
4920         if (ret < 0)
4921                 return ret;
4922
4923         changes = old_flags ^ dev->flags;
4924         if (changes)
4925                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4926
4927         __dev_notify_flags(dev, old_flags);
4928         return ret;
4929 }
4930 EXPORT_SYMBOL(dev_change_flags);
4931
4932 /**
4933  *      dev_set_mtu - Change maximum transfer unit
4934  *      @dev: device
4935  *      @new_mtu: new transfer unit
4936  *
4937  *      Change the maximum transfer size of the network device.
4938  */
4939 int dev_set_mtu(struct net_device *dev, int new_mtu)
4940 {
4941         const struct net_device_ops *ops = dev->netdev_ops;
4942         int err;
4943
4944         if (new_mtu == dev->mtu)
4945                 return 0;
4946
4947         /*      MTU must be positive.    */
4948         if (new_mtu < 0)
4949                 return -EINVAL;
4950
4951         if (!netif_device_present(dev))
4952                 return -ENODEV;
4953
4954         err = 0;
4955         if (ops->ndo_change_mtu)
4956                 err = ops->ndo_change_mtu(dev, new_mtu);
4957         else
4958                 dev->mtu = new_mtu;
4959
4960         if (!err && dev->flags & IFF_UP)
4961                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4962         return err;
4963 }
4964 EXPORT_SYMBOL(dev_set_mtu);
4965
4966 /**
4967  *      dev_set_group - Change group this device belongs to
4968  *      @dev: device
4969  *      @new_group: group this device should belong to
4970  */
4971 void dev_set_group(struct net_device *dev, int new_group)
4972 {
4973         dev->group = new_group;
4974 }
4975 EXPORT_SYMBOL(dev_set_group);
4976
4977 /**
4978  *      dev_set_mac_address - Change Media Access Control Address
4979  *      @dev: device
4980  *      @sa: new address
4981  *
4982  *      Change the hardware (MAC) address of the device
4983  */
4984 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4985 {
4986         const struct net_device_ops *ops = dev->netdev_ops;
4987         int err;
4988
4989         if (!ops->ndo_set_mac_address)
4990                 return -EOPNOTSUPP;
4991         if (sa->sa_family != dev->type)
4992                 return -EINVAL;
4993         if (!netif_device_present(dev))
4994                 return -ENODEV;
4995         err = ops->ndo_set_mac_address(dev, sa);
4996         if (!err)
4997                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4998         add_device_randomness(dev->dev_addr, dev->addr_len);
4999         return err;
5000 }
5001 EXPORT_SYMBOL(dev_set_mac_address);
5002
5003 /*
5004  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
5005  */
5006 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
5007 {
5008         int err;
5009         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
5010
5011         if (!dev)
5012                 return -ENODEV;
5013
5014         switch (cmd) {
5015         case SIOCGIFFLAGS:      /* Get interface flags */
5016                 ifr->ifr_flags = (short) dev_get_flags(dev);
5017                 return 0;
5018
5019         case SIOCGIFMETRIC:     /* Get the metric on the interface
5020                                    (currently unused) */
5021                 ifr->ifr_metric = 0;
5022                 return 0;
5023
5024         case SIOCGIFMTU:        /* Get the MTU of a device */
5025                 ifr->ifr_mtu = dev->mtu;
5026                 return 0;
5027
5028         case SIOCGIFHWADDR:
5029                 if (!dev->addr_len)
5030                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
5031                 else
5032                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
5033                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5034                 ifr->ifr_hwaddr.sa_family = dev->type;
5035                 return 0;
5036
5037         case SIOCGIFSLAVE:
5038                 err = -EINVAL;
5039                 break;
5040
5041         case SIOCGIFMAP:
5042                 ifr->ifr_map.mem_start = dev->mem_start;
5043                 ifr->ifr_map.mem_end   = dev->mem_end;
5044                 ifr->ifr_map.base_addr = dev->base_addr;
5045                 ifr->ifr_map.irq       = dev->irq;
5046                 ifr->ifr_map.dma       = dev->dma;
5047                 ifr->ifr_map.port      = dev->if_port;
5048                 return 0;
5049
5050         case SIOCGIFINDEX:
5051                 ifr->ifr_ifindex = dev->ifindex;
5052                 return 0;
5053
5054         case SIOCGIFTXQLEN:
5055                 ifr->ifr_qlen = dev->tx_queue_len;
5056                 return 0;
5057
5058         default:
5059                 /* dev_ioctl() should ensure this case
5060                  * is never reached
5061                  */
5062                 WARN_ON(1);
5063                 err = -ENOTTY;
5064                 break;
5065
5066         }
5067         return err;
5068 }
5069
5070 /*
5071  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
5072  */
5073 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
5074 {
5075         int err;
5076         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
5077         const struct net_device_ops *ops;
5078
5079         if (!dev)
5080                 return -ENODEV;
5081
5082         ops = dev->netdev_ops;
5083
5084         switch (cmd) {
5085         case SIOCSIFFLAGS:      /* Set interface flags */
5086                 return dev_change_flags(dev, ifr->ifr_flags);
5087
5088         case SIOCSIFMETRIC:     /* Set the metric on the interface
5089                                    (currently unused) */
5090                 return -EOPNOTSUPP;
5091
5092         case SIOCSIFMTU:        /* Set the MTU of a device */
5093                 return dev_set_mtu(dev, ifr->ifr_mtu);
5094
5095         case SIOCSIFHWADDR:
5096                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
5097
5098         case SIOCSIFHWBROADCAST:
5099                 if (ifr->ifr_hwaddr.sa_family != dev->type)
5100                         return -EINVAL;
5101                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
5102                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5103                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5104                 return 0;
5105
5106         case SIOCSIFMAP:
5107                 if (ops->ndo_set_config) {
5108                         if (!netif_device_present(dev))
5109                                 return -ENODEV;
5110                         return ops->ndo_set_config(dev, &ifr->ifr_map);
5111                 }
5112                 return -EOPNOTSUPP;
5113
5114         case SIOCADDMULTI:
5115                 if (!ops->ndo_set_rx_mode ||
5116                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5117                         return -EINVAL;
5118                 if (!netif_device_present(dev))
5119                         return -ENODEV;
5120                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5121
5122         case SIOCDELMULTI:
5123                 if (!ops->ndo_set_rx_mode ||
5124                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5125                         return -EINVAL;
5126                 if (!netif_device_present(dev))
5127                         return -ENODEV;
5128                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5129
5130         case SIOCSIFTXQLEN:
5131                 if (ifr->ifr_qlen < 0)
5132                         return -EINVAL;
5133                 dev->tx_queue_len = ifr->ifr_qlen;
5134                 return 0;
5135
5136         case SIOCSIFNAME:
5137                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5138                 return dev_change_name(dev, ifr->ifr_newname);
5139
5140         case SIOCSHWTSTAMP:
5141                 err = net_hwtstamp_validate(ifr);
5142                 if (err)
5143                         return err;
5144                 /* fall through */
5145
5146         /*
5147          *      Unknown or private ioctl
5148          */
5149         default:
5150                 if ((cmd >= SIOCDEVPRIVATE &&
5151                     cmd <= SIOCDEVPRIVATE + 15) ||
5152                     cmd == SIOCBONDENSLAVE ||
5153                     cmd == SIOCBONDRELEASE ||
5154                     cmd == SIOCBONDSETHWADDR ||
5155                     cmd == SIOCBONDSLAVEINFOQUERY ||
5156                     cmd == SIOCBONDINFOQUERY ||
5157                     cmd == SIOCBONDCHANGEACTIVE ||
5158                     cmd == SIOCGMIIPHY ||
5159                     cmd == SIOCGMIIREG ||
5160                     cmd == SIOCSMIIREG ||
5161                     cmd == SIOCBRADDIF ||
5162                     cmd == SIOCBRDELIF ||
5163                     cmd == SIOCSHWTSTAMP ||
5164                     cmd == SIOCWANDEV) {
5165                         err = -EOPNOTSUPP;
5166                         if (ops->ndo_do_ioctl) {
5167                                 if (netif_device_present(dev))
5168                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
5169                                 else
5170                                         err = -ENODEV;
5171                         }
5172                 } else
5173                         err = -EINVAL;
5174
5175         }
5176         return err;
5177 }
5178
5179 /*
5180  *      This function handles all "interface"-type I/O control requests. The actual
5181  *      'doing' part of this is dev_ifsioc above.
5182  */
5183
5184 /**
5185  *      dev_ioctl       -       network device ioctl
5186  *      @net: the applicable net namespace
5187  *      @cmd: command to issue
5188  *      @arg: pointer to a struct ifreq in user space
5189  *
5190  *      Issue ioctl functions to devices. This is normally called by the
5191  *      user space syscall interfaces but can sometimes be useful for
5192  *      other purposes. The return value is the return from the syscall if
5193  *      positive or a negative errno code on error.
5194  */
5195
5196 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5197 {
5198         struct ifreq ifr;
5199         int ret;
5200         char *colon;
5201
5202         /* One special case: SIOCGIFCONF takes ifconf argument
5203            and requires shared lock, because it sleeps writing
5204            to user space.
5205          */
5206
5207         if (cmd == SIOCGIFCONF) {
5208                 rtnl_lock();
5209                 ret = dev_ifconf(net, (char __user *) arg);
5210                 rtnl_unlock();
5211                 return ret;
5212         }
5213         if (cmd == SIOCGIFNAME)
5214                 return dev_ifname(net, (struct ifreq __user *)arg);
5215
5216         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5217                 return -EFAULT;
5218
5219         ifr.ifr_name[IFNAMSIZ-1] = 0;
5220
5221         colon = strchr(ifr.ifr_name, ':');
5222         if (colon)
5223                 *colon = 0;
5224
5225         /*
5226          *      See which interface the caller is talking about.
5227          */
5228
5229         switch (cmd) {
5230         /*
5231          *      These ioctl calls:
5232          *      - can be done by all.
5233          *      - atomic and do not require locking.
5234          *      - return a value
5235          */
5236         case SIOCGIFFLAGS:
5237         case SIOCGIFMETRIC:
5238         case SIOCGIFMTU:
5239         case SIOCGIFHWADDR:
5240         case SIOCGIFSLAVE:
5241         case SIOCGIFMAP:
5242         case SIOCGIFINDEX:
5243         case SIOCGIFTXQLEN:
5244                 dev_load(net, ifr.ifr_name);
5245                 rcu_read_lock();
5246                 ret = dev_ifsioc_locked(net, &ifr, cmd);
5247                 rcu_read_unlock();
5248                 if (!ret) {
5249                         if (colon)
5250                                 *colon = ':';
5251                         if (copy_to_user(arg, &ifr,
5252                                          sizeof(struct ifreq)))
5253                                 ret = -EFAULT;
5254                 }
5255                 return ret;
5256
5257         case SIOCETHTOOL:
5258                 dev_load(net, ifr.ifr_name);
5259                 rtnl_lock();
5260                 ret = dev_ethtool(net, &ifr);
5261                 rtnl_unlock();
5262                 if (!ret) {
5263                         if (colon)
5264                                 *colon = ':';
5265                         if (copy_to_user(arg, &ifr,
5266                                          sizeof(struct ifreq)))
5267                                 ret = -EFAULT;
5268                 }
5269                 return ret;
5270
5271         /*
5272          *      These ioctl calls:
5273          *      - require superuser power.
5274          *      - require strict serialization.
5275          *      - return a value
5276          */
5277         case SIOCGMIIPHY:
5278         case SIOCGMIIREG:
5279         case SIOCSIFNAME:
5280                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
5281                         return -EPERM;
5282                 dev_load(net, ifr.ifr_name);
5283                 rtnl_lock();
5284                 ret = dev_ifsioc(net, &ifr, cmd);
5285                 rtnl_unlock();
5286                 if (!ret) {
5287                         if (colon)
5288                                 *colon = ':';
5289                         if (copy_to_user(arg, &ifr,
5290                                          sizeof(struct ifreq)))
5291                                 ret = -EFAULT;
5292                 }
5293                 return ret;
5294
5295         /*
5296          *      These ioctl calls:
5297          *      - require superuser power.
5298          *      - require strict serialization.
5299          *      - do not return a value
5300          */
5301         case SIOCSIFMAP:
5302         case SIOCSIFTXQLEN:
5303                 if (!capable(CAP_NET_ADMIN))
5304                         return -EPERM;
5305                 /* fall through */
5306         /*
5307          *      These ioctl calls:
5308          *      - require local superuser power.
5309          *      - require strict serialization.
5310          *      - do not return a value
5311          */
5312         case SIOCSIFFLAGS:
5313         case SIOCSIFMETRIC:
5314         case SIOCSIFMTU:
5315         case SIOCSIFHWADDR:
5316         case SIOCSIFSLAVE:
5317         case SIOCADDMULTI:
5318         case SIOCDELMULTI:
5319         case SIOCSIFHWBROADCAST:
5320         case SIOCSMIIREG:
5321         case SIOCBONDENSLAVE:
5322         case SIOCBONDRELEASE:
5323         case SIOCBONDSETHWADDR:
5324         case SIOCBONDCHANGEACTIVE:
5325         case SIOCBRADDIF:
5326         case SIOCBRDELIF:
5327         case SIOCSHWTSTAMP:
5328                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
5329                         return -EPERM;
5330                 /* fall through */
5331         case SIOCBONDSLAVEINFOQUERY:
5332         case SIOCBONDINFOQUERY:
5333                 dev_load(net, ifr.ifr_name);
5334                 rtnl_lock();
5335                 ret = dev_ifsioc(net, &ifr, cmd);
5336                 rtnl_unlock();
5337                 return ret;
5338
5339         case SIOCGIFMEM:
5340                 /* Get the per device memory space. We can add this but
5341                  * currently do not support it */
5342         case SIOCSIFMEM:
5343                 /* Set the per device memory buffer space.
5344                  * Not applicable in our case */
5345         case SIOCSIFLINK:
5346                 return -ENOTTY;
5347
5348         /*
5349          *      Unknown or private ioctl.
5350          */
5351         default:
5352                 if (cmd == SIOCWANDEV ||
5353                     (cmd >= SIOCDEVPRIVATE &&
5354                      cmd <= SIOCDEVPRIVATE + 15)) {
5355                         dev_load(net, ifr.ifr_name);
5356                         rtnl_lock();
5357                         ret = dev_ifsioc(net, &ifr, cmd);
5358                         rtnl_unlock();
5359                         if (!ret && copy_to_user(arg, &ifr,
5360                                                  sizeof(struct ifreq)))
5361                                 ret = -EFAULT;
5362                         return ret;
5363                 }
5364                 /* Take care of Wireless Extensions */
5365                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5366                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5367                 return -ENOTTY;
5368         }
5369 }
5370
5371
5372 /**
5373  *      dev_new_index   -       allocate an ifindex
5374  *      @net: the applicable net namespace
5375  *
5376  *      Returns a suitable unique value for a new device interface
5377  *      number.  The caller must hold the rtnl semaphore or the
5378  *      dev_base_lock to be sure it remains unique.
5379  */
5380 static int dev_new_index(struct net *net)
5381 {
5382         int ifindex = net->ifindex;
5383         for (;;) {
5384                 if (++ifindex <= 0)
5385                         ifindex = 1;
5386                 if (!__dev_get_by_index(net, ifindex))
5387                         return net->ifindex = ifindex;
5388         }
5389 }
5390
/*
 * Delayed registration/unregistration.
 *
 * net_set_todo() appends devices to this list and netdev_run_todo()
 * takes a snapshot of it and processes the entries.
 */
static LIST_HEAD(net_todo_list);
5393
/* Queue @dev at the tail of net_todo_list through its todo_list link. */
static void net_set_todo(struct net_device *dev)
{
        list_add_tail(&dev->todo_list, &net_todo_list);
}
5398
5399 static void rollback_registered_many(struct list_head *head)
5400 {
5401         struct net_device *dev, *tmp;
5402
5403         BUG_ON(dev_boot_phase);
5404         ASSERT_RTNL();
5405
5406         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5407                 /* Some devices call without registering
5408                  * for initialization unwind. Remove those
5409                  * devices and proceed with the remaining.
5410                  */
5411                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5412                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5413                                  dev->name, dev);
5414
5415                         WARN_ON(1);
5416                         list_del(&dev->unreg_list);
5417                         continue;
5418                 }
5419                 dev->dismantle = true;
5420                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5421         }
5422
5423         /* If device is running, close it first. */
5424         dev_close_many(head);
5425
5426         list_for_each_entry(dev, head, unreg_list) {
5427                 /* And unlink it from device chain. */
5428                 unlist_netdevice(dev);
5429
5430                 dev->reg_state = NETREG_UNREGISTERING;
5431         }
5432
5433         synchronize_net();
5434
5435         list_for_each_entry(dev, head, unreg_list) {
5436                 /* Shutdown queueing discipline. */
5437                 dev_shutdown(dev);
5438
5439
5440                 /* Notify protocols, that we are about to destroy
5441                    this device. They should clean all the things.
5442                 */
5443                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5444
5445                 if (!dev->rtnl_link_ops ||
5446                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5447                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5448
5449                 /*
5450                  *      Flush the unicast and multicast chains
5451                  */
5452                 dev_uc_flush(dev);
5453                 dev_mc_flush(dev);
5454
5455                 if (dev->netdev_ops->ndo_uninit)
5456                         dev->netdev_ops->ndo_uninit(dev);
5457
5458                 /* Notifier chain MUST detach us from master device. */
5459                 WARN_ON(dev->master);
5460
5461                 /* Remove entries from kobject tree */
5462                 netdev_unregister_kobject(dev);
5463         }
5464
5465         synchronize_net();
5466
5467         list_for_each_entry(dev, head, unreg_list)
5468                 dev_put(dev);
5469 }
5470
5471 static void rollback_registered(struct net_device *dev)
5472 {
5473         LIST_HEAD(single);
5474
5475         list_add(&dev->unreg_list, &single);
5476         rollback_registered_many(&single);
5477         list_del(&single);
5478 }
5479
5480 static netdev_features_t netdev_fix_features(struct net_device *dev,
5481         netdev_features_t features)
5482 {
5483         /* Fix illegal checksum combinations */
5484         if ((features & NETIF_F_HW_CSUM) &&
5485             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5486                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5487                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5488         }
5489
5490         /* Fix illegal SG+CSUM combinations. */
5491         if ((features & NETIF_F_SG) &&
5492             !(features & NETIF_F_ALL_CSUM)) {
5493                 netdev_dbg(dev,
5494                         "Dropping NETIF_F_SG since no checksum feature.\n");
5495                 features &= ~NETIF_F_SG;
5496         }
5497
5498         /* TSO requires that SG is present as well. */
5499         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5500                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5501                 features &= ~NETIF_F_ALL_TSO;
5502         }
5503
5504         /* TSO ECN requires that TSO is present as well. */
5505         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5506                 features &= ~NETIF_F_TSO_ECN;
5507
5508         /* Software GSO depends on SG. */
5509         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5510                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5511                 features &= ~NETIF_F_GSO;
5512         }
5513
5514         /* UFO needs SG and checksumming */
5515         if (features & NETIF_F_UFO) {
5516                 /* maybe split UFO into V4 and V6? */
5517                 if (!((features & NETIF_F_GEN_CSUM) ||
5518                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5519                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5520                         netdev_dbg(dev,
5521                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5522                         features &= ~NETIF_F_UFO;
5523                 }
5524
5525                 if (!(features & NETIF_F_SG)) {
5526                         netdev_dbg(dev,
5527                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5528                         features &= ~NETIF_F_UFO;
5529                 }
5530         }
5531
5532         return features;
5533 }
5534
5535 int __netdev_update_features(struct net_device *dev)
5536 {
5537         netdev_features_t features;
5538         int err = 0;
5539
5540         ASSERT_RTNL();
5541
5542         features = netdev_get_wanted_features(dev);
5543
5544         if (dev->netdev_ops->ndo_fix_features)
5545                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5546
5547         /* driver might be less strict about feature dependencies */
5548         features = netdev_fix_features(dev, features);
5549
5550         if (dev->features == features)
5551                 return 0;
5552
5553         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5554                 &dev->features, &features);
5555
5556         if (dev->netdev_ops->ndo_set_features)
5557                 err = dev->netdev_ops->ndo_set_features(dev, features);
5558
5559         if (unlikely(err < 0)) {
5560                 netdev_err(dev,
5561                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5562                         err, &features, &dev->features);
5563                 return -1;
5564         }
5565
5566         if (!err)
5567                 dev->features = features;
5568
5569         return 1;
5570 }
5571
/**
 *      netdev_update_features - recalculate device features
 *      @dev: the device to check
 *
 *      Recalculate the dev->features set and send notifications when
 *      __netdev_update_features() returns non-zero. Should be called
 *      after driver or hardware dependent conditions that influence
 *      the features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
        /* Zero means the recalculated set matched the current one. */
        if (!__netdev_update_features(dev))
                return;

        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5586
/**
 *      netdev_change_features - recalculate device features
 *      @dev: the device to check
 *
 *      Recalculate the dev->features set and send notifications even
 *      if the set has not changed. Should be called instead of
 *      netdev_update_features() if dev->vlan_features might also
 *      have changed, so that the change is propagated to stacked
 *      VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
        /* The result is deliberately ignored: notify unconditionally. */
        __netdev_update_features(dev);
        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5603
5604 /**
5605  *      netif_stacked_transfer_operstate -      transfer operstate
5606  *      @rootdev: the root or lower level device to transfer state from
5607  *      @dev: the device to transfer operstate to
5608  *
5609  *      Transfer operational state from root to device. This is normally
5610  *      called when a stacking relationship exists between the root
5611  *      device and the device(a leaf device).
5612  */
5613 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5614                                         struct net_device *dev)
5615 {
5616         if (rootdev->operstate == IF_OPER_DORMANT)
5617                 netif_dormant_on(dev);
5618         else
5619                 netif_dormant_off(dev);
5620
5621         if (netif_carrier_ok(rootdev)) {
5622                 if (!netif_carrier_ok(dev))
5623                         netif_carrier_on(dev);
5624         } else {
5625                 if (netif_carrier_ok(dev))
5626                         netif_carrier_off(dev);
5627         }
5628 }
5629 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5630
#ifdef CONFIG_RPS
/*
 * Allocate the zeroed array of dev->num_rx_queues receive queues, hang
 * it off dev->_rx and point every queue back at @dev.  Returns 0 on
 * success or -ENOMEM if the allocation fails.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
        unsigned int nr_queues = dev->num_rx_queues;
        struct netdev_rx_queue *queues;
        unsigned int idx;

        BUG_ON(nr_queues < 1);

        queues = kcalloc(nr_queues, sizeof(*queues), GFP_KERNEL);
        if (!queues) {
                pr_err("netdev: Unable to allocate %u rx queues\n", nr_queues);
                return -ENOMEM;
        }
        dev->_rx = queues;

        for (idx = 0; idx < nr_queues; idx++)
                queues[idx].dev = dev;

        return 0;
}
#endif
5651
5652 static void netdev_init_one_queue(struct net_device *dev,
5653                                   struct netdev_queue *queue, void *_unused)
5654 {
5655         /* Initialize queue lock */
5656         spin_lock_init(&queue->_xmit_lock);
5657         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5658         queue->xmit_lock_owner = -1;
5659         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5660         queue->dev = dev;
5661 #ifdef CONFIG_BQL
5662         dql_init(&queue->dql, HZ);
5663 #endif
5664 }
5665
5666 static int netif_alloc_netdev_queues(struct net_device *dev)
5667 {
5668         unsigned int count = dev->num_tx_queues;
5669         struct netdev_queue *tx;
5670
5671         BUG_ON(count < 1);
5672
5673         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5674         if (!tx) {
5675                 pr_err("netdev: Unable to allocate %u tx queues\n", count);
5676                 return -ENOMEM;
5677         }
5678         dev->_tx = tx;
5679
5680         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5681         spin_lock_init(&dev->tx_global_lock);
5682
5683         return 0;
5684 }
5685
5686 /**
5687  *      register_netdevice      - register a network device
5688  *      @dev: device to register
5689  *
5690  *      Take a completed network device structure and add it to the kernel
5691  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5692  *      chain. 0 is returned on success. A negative errno code is returned
5693  *      on a failure to set up the device, or if the name is a duplicate.
5694  *
5695  *      Callers must hold the rtnl semaphore. You may want
5696  *      register_netdev() instead of this.
5697  *
5698  *      BUGS:
5699  *      The locking appears insufficient to guarantee two parallel registers
5700  *      will not get the same name.
5701  */
5702
5703 int register_netdevice(struct net_device *dev)
5704 {
5705         int ret;
5706         struct net *net = dev_net(dev);
5707
5708         BUG_ON(dev_boot_phase);
5709         ASSERT_RTNL();
5710
5711         might_sleep();
5712
5713         /* When net_device's are persistent, this will be fatal. */
5714         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5715         BUG_ON(!net);
5716
5717         spin_lock_init(&dev->addr_list_lock);
5718         netdev_set_addr_lockdep_class(dev);
5719
5720         dev->iflink = -1;
5721
5722         ret = dev_get_valid_name(net, dev, dev->name);
5723         if (ret < 0)
5724                 goto out;
5725
5726         /* Init, if this function is available */
5727         if (dev->netdev_ops->ndo_init) {
5728                 ret = dev->netdev_ops->ndo_init(dev);
5729                 if (ret) {
5730                         if (ret > 0)
5731                                 ret = -EIO;
5732                         goto out;
5733                 }
5734         }
5735
5736         ret = -EBUSY;
5737         if (!dev->ifindex)
5738                 dev->ifindex = dev_new_index(net);
5739         else if (__dev_get_by_index(net, dev->ifindex))
5740                 goto err_uninit;
5741
5742         if (dev->iflink == -1)
5743                 dev->iflink = dev->ifindex;
5744
5745         /* Transfer changeable features to wanted_features and enable
5746          * software offloads (GSO and GRO).
5747          */
5748         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5749         dev->features |= NETIF_F_SOFT_FEATURES;
5750         dev->wanted_features = dev->features & dev->hw_features;
5751
5752         /* Turn on no cache copy if HW is doing checksum */
5753         if (!(dev->flags & IFF_LOOPBACK)) {
5754                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5755                 if (dev->features & NETIF_F_ALL_CSUM) {
5756                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5757                         dev->features |= NETIF_F_NOCACHE_COPY;
5758                 }
5759         }
5760
5761         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5762          */
5763         dev->vlan_features |= NETIF_F_HIGHDMA;
5764
5765         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5766         ret = notifier_to_errno(ret);
5767         if (ret)
5768                 goto err_uninit;
5769
5770         ret = netdev_register_kobject(dev);
5771         if (ret)
5772                 goto err_uninit;
5773         dev->reg_state = NETREG_REGISTERED;
5774
5775         __netdev_update_features(dev);
5776
5777         /*
5778          *      Default initial state at registry is that the
5779          *      device is present.
5780          */
5781
5782         set_bit(__LINK_STATE_PRESENT, &dev->state);
5783
5784         linkwatch_init_dev(dev);
5785
5786         dev_init_scheduler(dev);
5787         dev_hold(dev);
5788         list_netdevice(dev);
5789         add_device_randomness(dev->dev_addr, dev->addr_len);
5790
5791         /* Notify protocols, that a new device appeared. */
5792         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5793         ret = notifier_to_errno(ret);
5794         if (ret) {
5795                 rollback_registered(dev);
5796                 dev->reg_state = NETREG_UNREGISTERED;
5797         }
5798         /*
5799          *      Prevent userspace races by waiting until the network
5800          *      device is fully setup before sending notifications.
5801          */
5802         if (!dev->rtnl_link_ops ||
5803             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5804                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5805
5806 out:
5807         return ret;
5808
5809 err_uninit:
5810         if (dev->netdev_ops->ndo_uninit)
5811                 dev->netdev_ops->ndo_uninit(dev);
5812         goto out;
5813 }
5814 EXPORT_SYMBOL(register_netdevice);
5815
5816 /**
5817  *      init_dummy_netdev       - init a dummy network device for NAPI
5818  *      @dev: device to init
5819  *
5820  *      This takes a network device structure and initialize the minimum
5821  *      amount of fields so it can be used to schedule NAPI polls without
5822  *      registering a full blown interface. This is to be used by drivers
5823  *      that need to tie several hardware interfaces to a single NAPI
5824  *      poll scheduler due to HW limitations.
5825  */
5826 int init_dummy_netdev(struct net_device *dev)
5827 {
5828         /* Clear everything. Note we don't initialize spinlocks
5829          * are they aren't supposed to be taken by any of the
5830          * NAPI code and this dummy netdev is supposed to be
5831          * only ever used for NAPI polls
5832          */
5833         memset(dev, 0, sizeof(struct net_device));
5834
5835         /* make sure we BUG if trying to hit standard
5836          * register/unregister code path
5837          */
5838         dev->reg_state = NETREG_DUMMY;
5839
5840         /* NAPI wants this */
5841         INIT_LIST_HEAD(&dev->napi_list);
5842
5843         /* a dummy interface is started by default */
5844         set_bit(__LINK_STATE_PRESENT, &dev->state);
5845         set_bit(__LINK_STATE_START, &dev->state);
5846
5847         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5848          * because users of this 'device' dont need to change
5849          * its refcount.
5850          */
5851
5852         return 0;
5853 }
5854 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5855
5856
/**
 *      register_netdev - register a network device
 *      @dev: device to register
 *
 *      Take a completed network device structure and add it to the kernel
 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *      chain. 0 is returned on success. A negative errno code is returned
 *      on a failure to set up the device, or if the name is a duplicate.
 *
 *      This is a wrapper around register_netdevice() that takes the rtnl
 *      semaphore around the call. The device name is validated inside
 *      register_netdevice() via dev_get_valid_name(); presumably that is
 *      also where a format string passed to alloc_netdev is expanded.
 */
int register_netdev(struct net_device *dev)
{
        int err;

        /* register_netdevice() asserts that RTNL is held. */
        rtnl_lock();
        err = register_netdevice(dev);
        rtnl_unlock();
        return err;
}
EXPORT_SYMBOL(register_netdev);
5880
5881 int netdev_refcnt_read(const struct net_device *dev)
5882 {
5883         int i, refcnt = 0;
5884
5885         for_each_possible_cpu(i)
5886                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5887         return refcnt;
5888 }
5889 EXPORT_SYMBOL(netdev_refcnt_read);
5890
5891 /**
5892  * netdev_wait_allrefs - wait until all references are gone.
5893  * @dev: target net_device
5894  *
5895  * This is called when unregistering network devices.
5896  *
5897  * Any protocol or device that holds a reference should register
5898  * for netdevice notification, and cleanup and put back the
5899  * reference if they receive an UNREGISTER event.
5900  * We can get stuck here if buggy protocols don't correctly
5901  * call dev_put.
5902  */
5903 static void netdev_wait_allrefs(struct net_device *dev)
5904 {
5905         unsigned long rebroadcast_time, warning_time;
5906         int refcnt;
5907
5908         linkwatch_forget_dev(dev);
5909
5910         rebroadcast_time = warning_time = jiffies;
5911         refcnt = netdev_refcnt_read(dev);
5912
5913         while (refcnt != 0) {
5914                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5915                         rtnl_lock();
5916
5917                         /* Rebroadcast unregister notification */
5918                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5919
5920                         __rtnl_unlock();
5921                         rcu_barrier();
5922                         rtnl_lock();
5923
5924                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5925                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5926                                      &dev->state)) {
5927                                 /* We must not have linkwatch events
5928                                  * pending on unregister. If this
5929                                  * happens, we simply run the queue
5930                                  * unscheduled, resulting in a noop
5931                                  * for this device.
5932                                  */
5933                                 linkwatch_run_queue();
5934                         }
5935
5936                         __rtnl_unlock();
5937
5938                         rebroadcast_time = jiffies;
5939                 }
5940
5941                 msleep(250);
5942
5943                 refcnt = netdev_refcnt_read(dev);
5944
5945                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5946                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5947                                  dev->name, refcnt);
5948                         warning_time = jiffies;
5949                 }
5950         }
5951 }
5952
5953 /* The sequence is:
5954  *
5955  *      rtnl_lock();
5956  *      ...
5957  *      register_netdevice(x1);
5958  *      register_netdevice(x2);
5959  *      ...
5960  *      unregister_netdevice(y1);
5961  *      unregister_netdevice(y2);
5962  *      ...
5963  *      rtnl_unlock();
5964  *      free_netdev(y1);
5965  *      free_netdev(y2);
5966  *
5967  * We are invoked by rtnl_unlock().
5968  * This allows us to deal with problems:
5969  * 1) We can delete sysfs objects which invoke hotplug
5970  *    without deadlocking with linkwatch via keventd.
5971  * 2) Since we run with the RTNL semaphore not held, we can sleep
5972  *    safely in order to wait for the netdev refcnt to drop to zero.
5973  *
5974  * We must not return until all unregister events added during
5975  * the interval the lock was held have been completed.
5976  */
5977 void netdev_run_todo(void)
5978 {
5979         struct list_head list;
5980
5981         /* Snapshot list, allow later requests */
5982         list_replace_init(&net_todo_list, &list);
5983
5984         __rtnl_unlock();
5985
5986
5987         /* Wait for rcu callbacks to finish before next phase */
5988         if (!list_empty(&list))
5989                 rcu_barrier();
5990
5991         while (!list_empty(&list)) {
5992                 struct net_device *dev
5993                         = list_first_entry(&list, struct net_device, todo_list);
5994                 list_del(&dev->todo_list);
5995
5996                 rtnl_lock();
5997                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5998                 __rtnl_unlock();
5999
6000                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6001                         pr_err("network todo '%s' but state %d\n",
6002                                dev->name, dev->reg_state);
6003                         dump_stack();
6004                         continue;
6005                 }
6006
6007                 dev->reg_state = NETREG_UNREGISTERED;
6008
6009                 on_each_cpu(flush_backlog, dev, 1);
6010
6011                 netdev_wait_allrefs(dev);
6012
6013                 /* paranoia */
6014                 BUG_ON(netdev_refcnt_read(dev));
6015                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6016                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6017                 WARN_ON(dev->dn_ptr);
6018
6019                 if (dev->destructor)
6020                         dev->destructor(dev);
6021
6022                 /* Free network device */
6023                 kobject_put(&dev->dev.kobj);
6024         }
6025 }
6026
6027 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6028  * fields in the same order, with only the type differing.
6029  */
6030 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6031                              const struct net_device_stats *netdev_stats)
6032 {
6033 #if BITS_PER_LONG == 64
6034         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6035         memcpy(stats64, netdev_stats, sizeof(*stats64));
6036 #else
6037         size_t i, n = sizeof(*stats64) / sizeof(u64);
6038         const unsigned long *src = (const unsigned long *)netdev_stats;
6039         u64 *dst = (u64 *)stats64;
6040
6041         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6042                      sizeof(*stats64) / sizeof(u64));
6043         for (i = 0; i < n; i++)
6044                 dst[i] = src[i];
6045 #endif
6046 }
6047 EXPORT_SYMBOL(netdev_stats_to_stats64);
6048
6049 /**
6050  *      dev_get_stats   - get network device statistics
6051  *      @dev: device to get statistics from
6052  *      @storage: place to store stats
6053  *
6054  *      Get network statistics from device. Return @storage.
6055  *      The device driver may provide its own method by setting
6056  *      dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
6057  *      otherwise dev->stats is used.  dev->rx_dropped is always added in.
6058  */
6059 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6060                                         struct rtnl_link_stats64 *storage)
6061 {
6062         const struct net_device_ops *ops = dev->netdev_ops;
6063
6064         if (ops->ndo_get_stats64) {
6065                 memset(storage, 0, sizeof(*storage)); /* driver fills zeroed storage */
6066                 ops->ndo_get_stats64(dev, storage);
6067         } else if (ops->ndo_get_stats) {
6068                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6069         } else {
6070                 netdev_stats_to_stats64(storage, &dev->stats);
6071         }
6072         storage->rx_dropped += atomic_long_read(&dev->rx_dropped); /* on top of any source */
6073         return storage;
6074 }
6075 EXPORT_SYMBOL(dev_get_stats);
6076
6077 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6078 {
6079         struct netdev_queue *queue = dev_ingress_queue(dev);
6080         /* Return the existing ingress queue; create one only with NET_CLS_ACT. */
6081 #ifdef CONFIG_NET_CLS_ACT
6082         if (queue)
6083                 return queue;
6084         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6085         if (!queue)
6086                 return NULL; /* allocation failed */
6087         netdev_init_one_queue(dev, queue, NULL);
6088         queue->qdisc = &noop_qdisc; /* both qdisc pointers start at noop_qdisc */
6089         queue->qdisc_sleeping = &noop_qdisc;
6090         rcu_assign_pointer(dev->ingress_queue, queue); /* publish only once set up */
6091 #endif /* CONFIG_NET_CLS_ACT */
6092         return queue;
6093 }
6094
6095 static const struct ethtool_ops default_ethtool_ops; /* used by alloc_netdev_mqs() when setup() sets none */
6096
6097 /**
6098  *      alloc_netdev_mqs - allocate network device
6099  *      @sizeof_priv:   size of private data to allocate space for
6100  *      @name:          device name format string
6101  *      @setup:         callback to initialize device
6102  *      @txqs:          the number of TX subqueues to allocate
6103  *      @rxqs:          the number of RX subqueues to allocate
6104  *
6105  *      Allocates a struct net_device with private data area for driver use
6106  *      and performs basic initialization.  Also allocates subqueue structs
6107  *      for each queue on the device.  Returns NULL if a check or allocation fails.
6108  */
6109 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6110                 void (*setup)(struct net_device *),
6111                 unsigned int txqs, unsigned int rxqs)
6112 {
6113         struct net_device *dev;
6114         size_t alloc_size;
6115         struct net_device *p;
6116
6117         BUG_ON(strlen(name) >= sizeof(dev->name)); /* must fit, NUL included */
6118
6119         if (txqs < 1) {
6120                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6121                 return NULL;
6122         }
6123
6124 #ifdef CONFIG_RPS
6125         if (rxqs < 1) {
6126                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6127                 return NULL;
6128         }
6129 #endif
6130
6131         alloc_size = sizeof(struct net_device);
6132         if (sizeof_priv) {
6133                 /* ensure 32-byte alignment of private area */
6134                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6135                 alloc_size += sizeof_priv;
6136         }
6137         /* ensure 32-byte alignment of whole construct */
6138         alloc_size += NETDEV_ALIGN - 1;
6139
6140         p = kzalloc(alloc_size, GFP_KERNEL);
6141         if (!p) {
6142                 pr_err("alloc_netdev: Unable to allocate device\n");
6143                 return NULL;
6144         }
6145
6146         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6147         dev->padded = (char *)dev - (char *)p; /* free_netdev() subtracts this again */
6148
6149         dev->pcpu_refcnt = alloc_percpu(int);
6150         if (!dev->pcpu_refcnt)
6151                 goto free_p;
6152
6153         if (dev_addr_init(dev))
6154                 goto free_pcpu;
6155
6156         dev_mc_init(dev);
6157         dev_uc_init(dev);
6158
6159         dev_net_set(dev, &init_net);
6160
6161         dev->gso_max_size = GSO_MAX_SIZE;
6162         dev->gso_max_segs = GSO_MAX_SEGS;
6163
6164         INIT_LIST_HEAD(&dev->napi_list);
6165         INIT_LIST_HEAD(&dev->unreg_list);
6166         INIT_LIST_HEAD(&dev->link_watch_list);
6167         dev->priv_flags = IFF_XMIT_DST_RELEASE;
6168         setup(dev); /* driver callback runs before the queues are allocated */
6169
6170         dev->num_tx_queues = txqs;
6171         dev->real_num_tx_queues = txqs;
6172         if (netif_alloc_netdev_queues(dev))
6173                 goto free_all;
6174
6175 #ifdef CONFIG_RPS
6176         dev->num_rx_queues = rxqs;
6177         dev->real_num_rx_queues = rxqs;
6178         if (netif_alloc_rx_queues(dev))
6179                 goto free_all;
6180 #endif
6181
6182         strcpy(dev->name, name); /* length was checked by the BUG_ON above */
6183         dev->group = INIT_NETDEV_GROUP;
6184         if (!dev->ethtool_ops)
6185                 dev->ethtool_ops = &default_ethtool_ops;
6186         return dev;
6187
6188 free_all:
6189         free_netdev(dev); /* also releases queues, addresses and pcpu_refcnt */
6190         return NULL;
6191
6192 free_pcpu:
6193         free_percpu(dev->pcpu_refcnt);
6194         kfree(dev->_tx); /* NOTE(review): no queue is allocated yet on this path */
6195 #ifdef CONFIG_RPS
6196         kfree(dev->_rx);
6197 #endif
6198
6199 free_p:
6200         kfree(p); /* free the unaligned kzalloc() pointer, not dev */
6201         return NULL;
6202 }
6203 EXPORT_SYMBOL(alloc_netdev_mqs);
6204
6205 /**
6206  *      free_netdev - free network device
6207  *      @dev: device
6208  *
6209  *      This function does the last stage of destroying an allocated device
6210  *      interface. The reference to the device object is released.
6211  *      If this is the last reference then it will be freed.
6212  */
6213 void free_netdev(struct net_device *dev)
6214 {
6215         struct napi_struct *p, *n;
6216
6217         release_net(dev_net(dev));
6218
6219         kfree(dev->_tx);
6220 #ifdef CONFIG_RPS
6221         kfree(dev->_rx);
6222 #endif
6223
6224         kfree(rcu_dereference_protected(dev->ingress_queue, 1)); /* condition hard-wired to 1 */
6225
6226         /* Flush device addresses */
6227         dev_addr_flush(dev);
6228
6229         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6230                 netif_napi_del(p);
6231
6232         free_percpu(dev->pcpu_refcnt);
6233         dev->pcpu_refcnt = NULL;
6234
6235         /* Compatibility with error handling in drivers: still UNINITIALIZED */
6236         if (dev->reg_state == NETREG_UNINITIALIZED) {
6237                 kfree((char *)dev - dev->padded); /* step back to the allocated pointer */
6238                 return;
6239         }
6240
6241         BUG_ON(dev->reg_state != NETREG_UNREGISTERED); /* any other state is a bug here */
6242         dev->reg_state = NETREG_RELEASED;
6243
6244         /* will free via device release */
6245         put_device(&dev->dev);
6246 }
6247 EXPORT_SYMBOL(free_netdev);
6248
6249 /**
6250  *      synchronize_net -  Synchronize with packet receive processing
6251  *
6252  *      Wait for packets currently being received to be done.
6253  *      Does not block later packets from starting.  May sleep.
6254  */
6255 void synchronize_net(void)
6256 {
6257         might_sleep();
6258         if (rtnl_is_locked())
6259                 synchronize_rcu_expedited(); /* RTNL held: use the expedited variant */
6260         else
6261                 synchronize_rcu();
6262 }
6263 EXPORT_SYMBOL(synchronize_net);
6264
6265 /**
6266  *      unregister_netdevice_queue - remove device from the kernel
6267  *      @dev: device
6268  *      @head: list to queue @dev on (via unreg_list), or NULL to act now
6269  *
6270  *      This function shuts down a device interface and removes it
6271  *      from the kernel tables.
6272  *      If @head is not NULL, device is only queued to be unregistered later.
6273  *
6274  *      Callers must hold the rtnl semaphore.  You may want
6275  *      unregister_netdev() instead of this.
6276  */
6277
6278 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6279 {
6280         ASSERT_RTNL();
6281
6282         if (head) {
6283                 list_move_tail(&dev->unreg_list, head); /* queued only; nothing torn down yet */
6284         } else {
6285                 rollback_registered(dev);
6286                 /* Finish processing unregister after unlock */
6287                 net_set_todo(dev);
6288         }
6289 }
6290 EXPORT_SYMBOL(unregister_netdevice_queue);
6291
6292 /**
6293  *      unregister_netdevice_many - unregister many devices
6294  *      @head: list of devices, linked through unreg_list; no-op if empty
6295  */
6296 void unregister_netdevice_many(struct list_head *head)
6297 {
6298         struct net_device *dev;
6299
6300         if (!list_empty(head)) {
6301                 rollback_registered_many(head);
6302                 list_for_each_entry(dev, head, unreg_list)
6303                         net_set_todo(dev); /* same final step as unregister_netdevice_queue() */
6304         }
6305 }
6306 EXPORT_SYMBOL(unregister_netdevice_many);
6307
6308 /**
6309  *      unregister_netdev - remove device from the kernel
6310  *      @dev: device
6311  *
6312  *      This function shuts down a device interface and removes it
6313  *      from the kernel tables.
6314  *
6315  *      This is just a wrapper for unregister_netdevice that takes
6316  *      the rtnl semaphore around the call.  In general you want to use
6317  *      this and not unregister_netdevice.
6318  */
6319 void unregister_netdev(struct net_device *dev)
6320 {
6321         rtnl_lock();
6322         unregister_netdevice(dev);
6323         rtnl_unlock();
6324 }
6325 EXPORT_SYMBOL(unregister_netdev);
6326
6327 /**
6328  *      dev_change_net_namespace - move device to different network namespace
6329  *      @dev: device
6330  *      @net: network namespace
6331  *      @pat: If not NULL name pattern to try if the current device name
6332  *            is already taken in the destination network namespace.
6333  *
6334  *      This function shuts down a device interface and moves it
6335  *      to a new network namespace. On success 0 is returned, on
6336  *      a failure a negative errno code is returned.
6337  *
6338  *      Callers must hold the rtnl semaphore.
6339  */
6340
6341 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6342 {
6343         int err;
6344
6345         ASSERT_RTNL();
6346
6347         /* Don't allow namespace local devices to be moved. */
6348         err = -EINVAL;
6349         if (dev->features & NETIF_F_NETNS_LOCAL)
6350                 goto out;
6351
6352         /* Ensure the device has been registered (else still -EINVAL) */
6353         if (dev->reg_state != NETREG_REGISTERED)
6354                 goto out;
6355
6356         /* Get out if there is nothing to do */
6357         err = 0;
6358         if (net_eq(dev_net(dev), net))
6359                 goto out;
6360
6361         /* Pick the destination device name, and ensure
6362          * we can use it in the destination network namespace.
6363          */
6364         err = -EEXIST;
6365         if (__dev_get_by_name(net, dev->name)) {
6366                 /* We get here if we can't use the current device name */
6367                 if (!pat)
6368                         goto out;
6369                 if (dev_get_valid_name(net, dev, pat) < 0)
6370                         goto out;
6371         }
6372
6373         /*
6374          * And now a mini version of register_netdevice unregister_netdevice.
6375          */
6376
6377         /* If device is running close it first. */
6378         dev_close(dev);
6379
6380         /* And unlink it from device chain */
6381         err = -ENODEV; /* NOTE(review): overwritten below before any exit uses it */
6382         unlist_netdevice(dev);
6383
6384         synchronize_net();
6385
6386         /* Shutdown queueing discipline. */
6387         dev_shutdown(dev);
6388
6389         /* Notify protocols, that we are about to destroy
6390            this device. They should clean all the things.
6391
6392            Note that dev->reg_state stays at NETREG_REGISTERED.
6393            This is wanted because this way 8021q and macvlan know
6394            the device is just moving and can keep their slaves up.
6395         */
6396         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6397         rcu_barrier();
6398         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6399         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6400
6401         /*
6402          *      Flush the unicast and multicast chains
6403          */
6404         dev_uc_flush(dev);
6405         dev_mc_flush(dev);
6406
6407         /* Actually switch the network namespace */
6408         dev_net_set(dev, net);
6409
6410         /* If there is an ifindex conflict assign a new one */
6411         if (__dev_get_by_index(net, dev->ifindex)) {
6412                 int iflink = (dev->iflink == dev->ifindex); /* iflink referred to itself */
6413                 dev->ifindex = dev_new_index(net);
6414                 if (iflink)
6415                         dev->iflink = dev->ifindex; /* keep it self-referencing */
6416         }
6417
6418         /* Fixup kobjects */
6419         err = device_rename(&dev->dev, dev->name);
6420         WARN_ON(err); /* a rename failure is only warned about; 0 is still returned */
6421
6422         /* Add the device back in the hashes */
6423         list_netdevice(dev);
6424
6425         /* Notify protocols, that a new device appeared. */
6426         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6427
6428         /*
6429          *      Prevent userspace races by waiting until the network
6430          *      device is fully setup before sending notifications.
6431          */
6432         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6433
6434         synchronize_net();
6435         err = 0;
6436 out:
6437         return err;
6438 }
6439 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6440
6441 static int dev_cpu_callback(struct notifier_block *nfb,
6442                             unsigned long action,
6443                             void *ocpu)
6444 {
6445         struct sk_buff **list_skb;
6446         struct sk_buff *skb;
6447         unsigned int cpu, oldcpu = (unsigned long)ocpu; /* @ocpu carries the CPU number */
6448         struct softnet_data *sd, *oldsd;
6449         /* CPU notifier: move a dead CPU's softnet work onto the current CPU. */
6450         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6451                 return NOTIFY_OK;
6452
6453         local_irq_disable();
6454         cpu = smp_processor_id();
6455         sd = &per_cpu(softnet_data, cpu);
6456         oldsd = &per_cpu(softnet_data, oldcpu);
6457
6458         /* Find end of our completion_queue. */
6459         list_skb = &sd->completion_queue;
6460         while (*list_skb)
6461                 list_skb = &(*list_skb)->next;
6462         /* Append completion queue from offline CPU. */
6463         *list_skb = oldsd->completion_queue;
6464         oldsd->completion_queue = NULL;
6465
6466         /* Append output queue from offline CPU. */
6467         if (oldsd->output_queue) {
6468                 *sd->output_queue_tailp = oldsd->output_queue;
6469                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6470                 oldsd->output_queue = NULL;
6471                 oldsd->output_queue_tailp = &oldsd->output_queue; /* reset to empty */
6472         }
6473         /* Append NAPI poll list from offline CPU. */
6474         if (!list_empty(&oldsd->poll_list)) {
6475                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6476                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6477         }
6478
6479         raise_softirq_irqoff(NET_TX_SOFTIRQ); /* raised unconditionally */
6480         local_irq_enable();
6481
6482         /* Re-inject offline CPU's process_queue, then its input_pkt_queue */
6483         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6484                 netif_rx(skb);
6485                 input_queue_head_incr(oldsd);
6486         }
6487         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6488                 netif_rx(skb);
6489                 input_queue_head_incr(oldsd);
6490         }
6491
6492         return NOTIFY_OK;
6493 }
6494
6495
6496 /**
6497  *      netdev_increment_features - increment feature set by one
6498  *      @all: current feature set
6499  *      @one: new feature set
6500  *      @mask: mask feature set
6501  *
6502  *      Computes a new feature set after adding a device with feature set
6503  *      @one to the master device with current feature set @all.  Will not
6504  *      enable anything that is off in @mask. Returns the new feature set.
6505  */
6506 netdev_features_t netdev_increment_features(netdev_features_t all,
6507         netdev_features_t one, netdev_features_t mask)
6508 {
6509         if (mask & NETIF_F_GEN_CSUM)
6510                 mask |= NETIF_F_ALL_CSUM; /* GEN_CSUM in @mask admits every csum bit */
6511         mask |= NETIF_F_VLAN_CHALLENGED; /* never masked out */
6512         /* ONE_FOR_ALL/csum bits are OR-ed in; ALL_FOR_ALL bits need @one too. */
6513         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6514         all &= one | ~NETIF_F_ALL_FOR_ALL;
6515
6516         /* If one device supports hw checksumming, set for all. */
6517         if (all & NETIF_F_GEN_CSUM)
6518                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); /* keep only GEN_CSUM */
6519
6520         return all;
6521 }
6522 EXPORT_SYMBOL(netdev_increment_features);
6523
6524 static struct hlist_head *netdev_create_hash(void)
6525 {
6526         int i;
6527         struct hlist_head *hash;
6528         /* Allocate NETDEV_HASHENTRIES empty list heads; NULL if kmalloc fails. */
6529         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6530         if (hash != NULL)
6531                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6532                         INIT_HLIST_HEAD(&hash[i]);
6533
6534         return hash;
6535 }
6536
6537 /* Initialize per network namespace state: dev_name_head and dev_index_head */
6538 static int __net_init netdev_init(struct net *net)
6539 {
6540         if (net != &init_net) /* NOTE(review): init_net's list presumably set up elsewhere */
6541                 INIT_LIST_HEAD(&net->dev_base_head);
6542
6543         net->dev_name_head = netdev_create_hash();
6544         if (net->dev_name_head == NULL)
6545                 goto err_name;
6546
6547         net->dev_index_head = netdev_create_hash();
6548         if (net->dev_index_head == NULL)
6549                 goto err_idx;
6550
6551         return 0;
6552
6553 err_idx:
6554         kfree(net->dev_name_head); /* undo the first allocation */
6555 err_name:
6556         return -ENOMEM;
6557 }
6558
6559 /**
6560  *      netdev_drivername - network driver for the device
6561  *      @dev: network device
6562  *
6563  *      Returns the parent device's driver name, or "" if none is available.
6564  */
6565 const char *netdev_drivername(const struct net_device *dev)
6566 {
6567         const struct device_driver *driver;
6568         const struct device *parent;
6569         const char *empty = "";
6570
6571         parent = dev->dev.parent;
6572         if (!parent)
6573                 return empty; /* no parent device */
6574
6575         driver = parent->driver;
6576         if (driver && driver->name)
6577                 return driver->name;
6578         return empty; /* no driver, or driver without a name */
6579 }
6580
6581 static int __netdev_printk(const char *level, const struct net_device *dev,
6582                            struct va_format *vaf)
6583 {
6584         int r;
6585         /* Use dev_printk_emit() when @dev has a parent device, else plain printk(). */
6586         if (dev && dev->dev.parent) {
6587                 r = dev_printk_emit(level[1] - '0', /* NOTE(review): assumes level[1] is a digit */
6588                                     dev->dev.parent,
6589                                     "%s %s %s: %pV",
6590                                     dev_driver_string(dev->dev.parent),
6591                                     dev_name(dev->dev.parent),
6592                                     netdev_name(dev), vaf);
6593         } else if (dev) {
6594                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6595         } else {
6596                 r = printk("%s(NULL net_device): %pV", level, vaf); /* @dev may be NULL */
6597         }
6598
6599         return r;
6600 }
6601
6602 int netdev_printk(const char *level, const struct net_device *dev,
6603                   const char *format, ...)
6604 {
6605         struct va_format vaf;
6606         va_list args;
6607         int r;
6608         /* Wrap @format and its varargs in a va_format for __netdev_printk(). */
6609         va_start(args, format);
6610
6611         vaf.fmt = format;
6612         vaf.va = &args;
6613
6614         r = __netdev_printk(level, dev, &vaf);
6615
6616         va_end(args);
6617
6618         return r; /* whatever __netdev_printk() returned */
6619 }
6620 EXPORT_SYMBOL(netdev_printk);
6621
6622 #define define_netdev_printk_level(func, level)                 \
6623 int func(const struct net_device *dev, const char *fmt, ...)    \
6624 {                                                               \
6625         int r;                                                  \
6626         struct va_format vaf;                                   \
6627         va_list args;                                           \
6628                                                                 \
6629         va_start(args, fmt);                                    \
6630                                                                 \
6631         vaf.fmt = fmt;                                          \
6632         vaf.va = &args;                                         \
6633                                                                 \
6634         r = __netdev_printk(level, dev, &vaf);                  \
6635                                                                 \
6636         va_end(args);                                           \
6637                                                                 \
6638         return r;                                               \
6639 }                                                               \
6640 EXPORT_SYMBOL(func);
6641 /* One exported wrapper per log level; each has netdev_printk()'s body. */
6642 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6643 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6644 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6645 define_netdev_printk_level(netdev_err, KERN_ERR);
6646 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6647 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6648 define_netdev_printk_level(netdev_info, KERN_INFO);
6649
6650 static void __net_exit netdev_exit(struct net *net)
6651 {
6652         kfree(net->dev_name_head); /* both hashes were allocated by netdev_init() */
6653         kfree(net->dev_index_head);
6654 }
6655
6656 static struct pernet_operations __net_initdata netdev_net_ops = {
6657         .init = netdev_init, /* allocates the two hash tables */
6658         .exit = netdev_exit, /* frees them again */
6659 };
6660
6661 static void __net_exit default_device_exit(struct net *net)
6662 {
6663         struct net_device *dev, *aux;
6664         /*
6665          * Push all migratable network devices back to the
6666          * initial network namespace (done under the rtnl lock)
6667          */
6668         rtnl_lock();
6669         for_each_netdev_safe(net, dev, aux) {
6670                 int err;
6671                 char fb_name[IFNAMSIZ];
6672
6673                 /* Ignore unmoveable devices (e.g. loopback): NETIF_F_NETNS_LOCAL */
6674                 if (dev->features & NETIF_F_NETNS_LOCAL)
6675                         continue;
6676
6677                 /* Leave virtual devices for the generic cleanup */
6678                 if (dev->rtnl_link_ops)
6679                         continue;
6680                 /* "dev<ifindex>" is the fallback name pattern for the move. */
6681                 /* Push remaining network devices to init_net */
6682                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6683                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6684                 if (err) {
6685                         pr_emerg("%s: failed to move %s to init_net: %d\n",
6686                                  __func__, dev->name, err);
6687                         BUG(); /* a failed move is treated as fatal */
6688                 }
6689         }
6690         rtnl_unlock();
6691 }
6692
6693 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6694 {
6695         /* At exit all network devices must be removed from a network
6696          * namespace.  Do this in the reverse order of registration.
6697          * Do this across as many network namespaces as possible to
6698          * improve batching efficiency.
6699          */
6700         struct net_device *dev;
6701         struct net *net;
6702         LIST_HEAD(dev_kill_list);
6703
6704         rtnl_lock();
6705         list_for_each_entry(net, net_list, exit_list) {
6706                 for_each_netdev_reverse(net, dev) {
6707                         if (dev->rtnl_link_ops)
6708                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list); /* link type queues it */
6709                         else
6710                                 unregister_netdevice_queue(dev, &dev_kill_list); /* head given: queue only */
6711                 }
6712         }
6713         unregister_netdevice_many(&dev_kill_list); /* one batch for every namespace */
6714         list_del(&dev_kill_list);
6715         rtnl_unlock();
6716 }
6717
6718 static struct pernet_operations __net_initdata default_device_ops = {
6719         .exit = default_device_exit, /* moves plain devices back to init_net */
6720         .exit_batch = default_device_exit_batch, /* unregisters what is left, batched */
6721 };
6722
6723 /*
6724  *      Initialize the DEV module. At boot time this walks the device list and
6725  *      unhooks any devices that fail to initialise (normally hardware not
6726  *      present) and leaves us with a valid list of present and active devices.
6727  *      NOTE(review): no device-list walk is visible in net_dev_init() below.
6728  */
6729
6730 /*
6731  *       This is called single threaded during boot, so no need
6732  *       to take the rtnl semaphore.
6733  */
6734 static int __init net_dev_init(void)
6735 {
6736         int i, rc = -ENOMEM; /* every failure below reports -ENOMEM */
6737
6738         BUG_ON(!dev_boot_phase);
6739
6740         if (dev_proc_init())
6741                 goto out;
6742
6743         if (netdev_kobject_init())
6744                 goto out;
6745
6746         INIT_LIST_HEAD(&ptype_all);
6747         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6748                 INIT_LIST_HEAD(&ptype_base[i]);
6749
6750         INIT_LIST_HEAD(&offload_base);
6751
6752         if (register_pernet_subsys(&netdev_net_ops))
6753                 goto out;
6754
6755         /*
6756          *      Initialise the packet receive queues.
6757          */
6758
6759         for_each_possible_cpu(i) {
6760                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6761
6762                 memset(sd, 0, sizeof(*sd));
6763                 skb_queue_head_init(&sd->input_pkt_queue);
6764                 skb_queue_head_init(&sd->process_queue);
6765                 sd->completion_queue = NULL;
6766                 INIT_LIST_HEAD(&sd->poll_list);
6767                 sd->output_queue = NULL;
6768                 sd->output_queue_tailp = &sd->output_queue; /* empty: tail points at head */
6769 #ifdef CONFIG_RPS
6770                 sd->csd.func = rps_trigger_softirq;
6771                 sd->csd.info = sd;
6772                 sd->csd.flags = 0;
6773                 sd->cpu = i;
6774 #endif
6775
6776                 sd->backlog.poll = process_backlog;
6777                 sd->backlog.weight = weight_p;
6778                 sd->backlog.gro_list = NULL;
6779                 sd->backlog.gro_count = 0;
6780         }
6781
6782         dev_boot_phase = 0;
6783
6784         /* The loopback device is special: if any other network device
6785          * is present in a network namespace, the loopback device must
6786          * be present too. Since we now dynamically allocate and free the
6787          * loopback device, ensure this invariant is maintained by
6788          * keeping the loopback device as the first device on the
6789          * list of network devices, so that the loopback device
6790          * is the first device that appears and the last network device
6791          * that disappears.
6792          */
6793         if (register_pernet_device(&loopback_net_ops))
6794                 goto out;
6795
6796         if (register_pernet_device(&default_device_ops))
6797                 goto out;
6798
6799         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6800         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6801
6802         hotcpu_notifier(dev_cpu_callback, 0);
6803         dst_init();
6804         dev_mcast_init();
6805         rc = 0;
6806 out:
6807         return rc; /* NOTE(review): earlier steps are not unwound on failure */
6808 }
6809
6810 subsys_initcall(net_dev_init);
6811
6812 static int __init initialize_hashrnd(void)
6813 {
6814         get_random_bytes(&hashrnd, sizeof(hashrnd)); /* fill hashrnd with random bytes */
6815         return 0;
6816 }
6817
6818 late_initcall_sync(initialize_hashrnd);
6819