net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131 #include <trace/events/net.h>
 132 #include <trace/events/skb.h>
 133 #include <linux/pci.h>
 134 #include <linux/inetdevice.h>
 135 #include <linux/cpu_rmap.h>
 136 #include <linux/if_tunnel.h>
 137 #include <linux/if_pppox.h>
 138 #include <linux/ppp_defs.h>
 139 #include <linux/net_tstamp.h>
 140 #include <linux/jump_label.h>
 141
 142 #include "net-sysfs.h"
 143
 144 /* Instead of increasing this, you should create a hash table. */
 145 #define MAX_GRO_SKBS 8
 146
 147 /* This should be increased if a protocol with a bigger head is added. */
 148 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 149
 150 /*
 151  *      The list of packet types we will receive (as opposed to discard)
 152  *      and the routines to invoke.
 153  *
 154  *      Why 16. Because with 16 the only overlap we get on a hash of the
 155  *      low nibble of the protocol value is RARP/SNAP/X.25.
 156  *
 157  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 158  *             sure which should go first, but I bet it won't make much
 159  *             difference if we are running VLANs.  The good news is that
 160  *             this protocol won't be in the list unless compiled in, so
 161  *             the average user (w/out VLANs) will not be adversely affected.
 162  *             --BLG
 163  *
 164  *              0800    IP
 165  *              8100    802.1Q VLAN
 166  *              0001    802.3
 167  *              0002    AX.25
 168  *              0004    802.2
 169  *              8035    RARP
 170  *              0005    SNAP
 171  *              0805    X.25
 172  *              0806    ARP
 173  *              8137    IPX
 174  *              0009    Localtalk
 175  *              86DD    IPv6
 176  */
 177
 178 #define PTYPE_HASH_SIZE (16)
 179 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 180
 181 static DEFINE_SPINLOCK(ptype_lock);
 182 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 183 static struct list_head ptype_all __read_mostly;        /* Taps */
 184
 185 /*
 186  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 187  * semaphore.
 188  *
 189  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 190  *
 191  * Writers must hold the rtnl semaphore while they loop through the
 192  * dev_base_head list, and hold dev_base_lock for writing when they do the
 193  * actual updates.  This allows pure readers to access the list even
 194  * while a writer is preparing to update it.
 195  *
 196  * To put it another way, dev_base_lock is held for writing only to
 197  * protect against pure readers; the rtnl semaphore provides the
 198  * protection against other writers.
 199  *
 200  * See, for example usages, register_netdevice() and
 201  * unregister_netdevice(), which must be called with the rtnl
 202  * semaphore held.
 203  */
 204 DEFINE_RWLOCK(dev_base_lock);
 205 EXPORT_SYMBOL(dev_base_lock);
 206
 207 static inline void dev_base_seq_inc(struct net *net)
 208 {
 209         while (++net->dev_base_seq == 0);
 210 }
 211
 212 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 213 {
 214         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 215         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 216 }
 217
 218 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 219 {
 220         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 221 }
 222
 223 static inline void rps_lock(struct softnet_data *sd)
 224 {
 225 #ifdef CONFIG_RPS
 226         spin_lock(&sd->input_pkt_queue.lock);
 227 #endif
 228 }
 229
 230 static inline void rps_unlock(struct softnet_data *sd)
 231 {
 232 #ifdef CONFIG_RPS
 233         spin_unlock(&sd->input_pkt_queue.lock);
 234 #endif
 235 }
 236
 237 /* Device list insertion */
 238 static int list_netdevice(struct net_device *dev)
 239 {
 240         struct net *net = dev_net(dev);
 241
 242         ASSERT_RTNL();
 243
 244         write_lock_bh(&dev_base_lock);
 245         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 246         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 247         hlist_add_head_rcu(&dev->index_hlist,
 248                            dev_index_hash(net, dev->ifindex));
 249         write_unlock_bh(&dev_base_lock);
 250
 251         dev_base_seq_inc(net);
 252
 253         return 0;
 254 }
 255
 256 /* Device list removal
 257  * caller must respect a RCU grace period before freeing/reusing dev
 258  */
 259 static void unlist_netdevice(struct net_device *dev)
 260 {
 261         ASSERT_RTNL();
 262
 263         /* Unlink dev from the device chain */
 264         write_lock_bh(&dev_base_lock);
 265         list_del_rcu(&dev->dev_list);
 266         hlist_del_rcu(&dev->name_hlist);
 267         hlist_del_rcu(&dev->index_hlist);
 268         write_unlock_bh(&dev_base_lock);
 269
 270         dev_base_seq_inc(dev_net(dev));
 271 }
 272
 273 /*
 274  *      Our notifier list
 275  */
 276
 277 static RAW_NOTIFIER_HEAD(netdev_chain);
 278
 279 /*
 280  *      Device drivers call our routines to queue packets here. We empty the
 281  *      queue in the local softnet handler.
 282  */
 283
 284 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 285 EXPORT_PER_CPU_SYMBOL(softnet_data);
 286
 287 #ifdef CONFIG_LOCKDEP
 288 /*
 289  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 290  * according to dev->type
 291  */
 292 static const unsigned short netdev_lock_type[] =
 293         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 294          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 295          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 296          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 297          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 298          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 299          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 300          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 301          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 302          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 303          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 304          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 305          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 306          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 307          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 308          ARPHRD_VOID, ARPHRD_NONE};
 309
 310 static const char *const netdev_lock_name[] =
 311         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 312          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 313          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 314          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 315          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 316          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 317          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 318          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 319          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 320          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 321          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 322          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 323          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 324          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 325          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 326          "_xmit_VOID", "_xmit_NONE"};
 327
 328 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 329 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 330
 331 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 332 {
 333         int i;
 334
 335         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 336                 if (netdev_lock_type[i] == dev_type)
 337                         return i;
 338         /* the last key is used by default */
 339         return ARRAY_SIZE(netdev_lock_type) - 1;
 340 }
 341
 342 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 343                                                  unsigned short dev_type)
 344 {
 345         int i;
 346
 347         i = netdev_lock_pos(dev_type);
 348         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 349                                    netdev_lock_name[i]);
 350 }
 351
 352 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 353 {
 354         int i;
 355
 356         i = netdev_lock_pos(dev->type);
 357         lockdep_set_class_and_name(&dev->addr_list_lock,
 358                                    &netdev_addr_lock_key[i],
 359                                    netdev_lock_name[i]);
 360 }
 361 #else
 362 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 363                                                  unsigned short dev_type)
 364 {
 365 }
 366 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 367 {
 368 }
 369 #endif
 370
 371 /*******************************************************************************
 372
 373                 Protocol management and registration routines
 374
 375 *******************************************************************************/
 376
 377 /*
 378  *      Add a protocol ID to the list. Now that the input handler is
 379  *      smarter we can dispense with all the messy stuff that used to be
 380  *      here.
 381  *
 382  *      BEWARE!!! Protocol handlers, mangling input packets,
 383  *      MUST BE last in hash buckets and checking protocol handlers
 384  *      MUST start from promiscuous ptype_all chain in net_bh.
 385  *      It is true now, do not change it.
 386  *      Explanation follows: if protocol handler, mangling packet, will
 387  *      be the first on list, it is not able to sense, that packet
 388  *      is cloned and should be copied-on-write, so that it will
 389  *      change it and subsequent readers will get broken packet.
 390  *                                                      --ANK (980803)
 391  */
 392
 393 static inline struct list_head *ptype_head(const struct packet_type *pt)
 394 {
 395         if (pt->type == htons(ETH_P_ALL))
 396                 return &ptype_all;
 397         else
 398                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 399 }
 400
 401 /**
 402  *      dev_add_pack - add packet handler
 403  *      @pt: packet type declaration
 404  *
 405  *      Add a protocol handler to the networking stack. The passed &packet_type
 406  *      is linked into kernel lists and may not be freed until it has been
 407  *      removed from the kernel lists.
 408  *
 409  *      This call does not sleep therefore it can not
 410  *      guarantee all CPU's that are in middle of receiving packets
 411  *      will see the new packet type (until the next received packet).
 412  */
 413
 414 void dev_add_pack(struct packet_type *pt)
 415 {
 416         struct list_head *head = ptype_head(pt);
 417
 418         spin_lock(&ptype_lock);
 419         list_add_rcu(&pt->list, head);
 420         spin_unlock(&ptype_lock);
 421 }
 422 EXPORT_SYMBOL(dev_add_pack);
 423
 424 /**
 425  *      __dev_remove_pack        - remove packet handler
 426  *      @pt: packet type declaration
 427  *
 428  *      Remove a protocol handler that was previously added to the kernel
 429  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 430  *      from the kernel lists and can be freed or reused once this function
 431  *      returns.
 432  *
 433  *      The packet type might still be in use by receivers
 434  *      and must not be freed until after all the CPU's have gone
 435  *      through a quiescent state.
 436  */
 437 void __dev_remove_pack(struct packet_type *pt)
 438 {
 439         struct list_head *head = ptype_head(pt);
 440         struct packet_type *pt1;
 441
 442         spin_lock(&ptype_lock);
 443
 444         list_for_each_entry(pt1, head, list) {
 445                 if (pt == pt1) {
 446                         list_del_rcu(&pt->list);
 447                         goto out;
 448                 }
 449         }
 450
 451         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 452 out:
 453         spin_unlock(&ptype_lock);
 454 }
 455 EXPORT_SYMBOL(__dev_remove_pack);
 456
 457 /**
 458  *      dev_remove_pack  - remove packet handler
 459  *      @pt: packet type declaration
 460  *
 461  *      Remove a protocol handler that was previously added to the kernel
 462  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 463  *      from the kernel lists and can be freed or reused once this function
 464  *      returns.
 465  *
 466  *      This call sleeps to guarantee that no CPU is looking at the packet
 467  *      type after return.
 468  */
 469 void dev_remove_pack(struct packet_type *pt)
 470 {
 471         __dev_remove_pack(pt);
 472
 473         synchronize_net();
 474 }
 475 EXPORT_SYMBOL(dev_remove_pack);
 476
 477 /******************************************************************************
 478
 479                       Device Boot-time Settings Routines
 480
 481 *******************************************************************************/
 482
 483 /* Boot time configuration table */
 484 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 485
 486 /**
 487  *      netdev_boot_setup_add   - add new setup entry
 488  *      @name: name of the device
 489  *      @map: configured settings for the device
 490  *
 491  *      Adds new setup entry to the dev_boot_setup list.  The function
 492  *      returns 0 on error and 1 on success.  This is a generic routine to
 493  *      all netdevices.
 494  */
 495 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 496 {
 497         struct netdev_boot_setup *s;
 498         int i;
 499
 500         s = dev_boot_setup;
 501         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 502                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 503                         memset(s[i].name, 0, sizeof(s[i].name));
 504                         strlcpy(s[i].name, name, IFNAMSIZ);
 505                         memcpy(&s[i].map, map, sizeof(s[i].map));
 506                         break;
 507                 }
 508         }
 509
 510         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 511 }
 512
 513 /**
 514  *      netdev_boot_setup_check - check boot time settings
 515  *      @dev: the netdevice
 516  *
 517  *      Check boot time settings for the device.
 518  *      The found settings are set for the device to be used
 519  *      later in the device probing.
 520  *      Returns 0 if no settings found, 1 if they are.
 521  */
 522 int netdev_boot_setup_check(struct net_device *dev)
 523 {
 524         struct netdev_boot_setup *s = dev_boot_setup;
 525         int i;
 526
 527         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 528                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 529                     !strcmp(dev->name, s[i].name)) {
 530                         dev->irq        = s[i].map.irq;
 531                         dev->base_addr  = s[i].map.base_addr;
 532                         dev->mem_start  = s[i].map.mem_start;
 533                         dev->mem_end    = s[i].map.mem_end;
 534                         return 1;
 535                 }
 536         }
 537         return 0;
 538 }
 539 EXPORT_SYMBOL(netdev_boot_setup_check);
 540
 541
 542 /**
 543  *      netdev_boot_base        - get address from boot time settings
 544  *      @prefix: prefix for network device
 545  *      @unit: id for network device
 546  *
 547  *      Check boot time settings for the base address of device.
 548  *      The found settings are set for the device to be used
 549  *      later in the device probing.
 550  *      Returns 0 if no settings found.
 551  */
 552 unsigned long netdev_boot_base(const char *prefix, int unit)
 553 {
 554         const struct netdev_boot_setup *s = dev_boot_setup;
 555         char name[IFNAMSIZ];
 556         int i;
 557
 558         sprintf(name, "%s%d", prefix, unit);
 559
 560         /*
 561          * If device already registered then return base of 1
 562          * to indicate not to probe for this interface
 563          */
 564         if (__dev_get_by_name(&init_net, name))
 565                 return 1;
 566
 567         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 568                 if (!strcmp(name, s[i].name))
 569                         return s[i].map.base_addr;
 570         return 0;
 571 }
 572
 573 /*
 574  * Saves at boot time configured settings for any netdevice.
 575  */
 576 int __init netdev_boot_setup(char *str)
 577 {
 578         int ints[5];
 579         struct ifmap map;
 580
 581         str = get_options(str, ARRAY_SIZE(ints), ints);
 582         if (!str || !*str)
 583                 return 0;
 584
 585         /* Save settings */
 586         memset(&map, 0, sizeof(map));
 587         if (ints[0] > 0)
 588                 map.irq = ints[1];
 589         if (ints[0] > 1)
 590                 map.base_addr = ints[2];
 591         if (ints[0] > 2)
 592                 map.mem_start = ints[3];
 593         if (ints[0] > 3)
 594                 map.mem_end = ints[4];
 595
 596         /* Add new entry to the list */
 597         return netdev_boot_setup_add(str, &map);
 598 }
 599
 600 __setup("netdev=", netdev_boot_setup);
 601
 602 /*******************************************************************************
 603
 604                             Device Interface Subroutines
 605
 606 *******************************************************************************/
 607
 608 /**
 609  *      __dev_get_by_name       - find a device by its name
 610  *      @net: the applicable net namespace
 611  *      @name: name to find
 612  *
 613  *      Find an interface by name. Must be called under RTNL semaphore
 614  *      or @dev_base_lock. If the name is found a pointer to the device
 615  *      is returned. If the name is not found then %NULL is returned. The
 616  *      reference counters are not incremented so the caller must be
 617  *      careful with locks.
 618  */
 619
 620 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 621 {
 622         struct hlist_node *p;
 623         struct net_device *dev;
 624         struct hlist_head *head = dev_name_hash(net, name);
 625
 626         hlist_for_each_entry(dev, p, head, name_hlist)
 627                 if (!strncmp(dev->name, name, IFNAMSIZ))
 628                         return dev;
 629
 630         return NULL;
 631 }
 632 EXPORT_SYMBOL(__dev_get_by_name);
 633
 634 /**
 635  *      dev_get_by_name_rcu     - find a device by its name
 636  *      @net: the applicable net namespace
 637  *      @name: name to find
 638  *
 639  *      Find an interface by name.
 640  *      If the name is found a pointer to the device is returned.
 641  *      If the name is not found then %NULL is returned.
 642  *      The reference counters are not incremented so the caller must be
 643  *      careful with locks. The caller must hold RCU lock.
 644  */
 645
 646 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 647 {
 648         struct hlist_node *p;
 649         struct net_device *dev;
 650         struct hlist_head *head = dev_name_hash(net, name);
 651
 652         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 653                 if (!strncmp(dev->name, name, IFNAMSIZ))
 654                         return dev;
 655
 656         return NULL;
 657 }
 658 EXPORT_SYMBOL(dev_get_by_name_rcu);
 659
 660 /**
 661  *      dev_get_by_name         - find a device by its name
 662  *      @net: the applicable net namespace
 663  *      @name: name to find
 664  *
 665  *      Find an interface by name. This can be called from any
 666  *      context and does its own locking. The returned handle has
 667  *      the usage count incremented and the caller must use dev_put() to
 668  *      release it when it is no longer needed. %NULL is returned if no
 669  *      matching device is found.
 670  */
 671
 672 struct net_device *dev_get_by_name(struct net *net, const char *name)
 673 {
 674         struct net_device *dev;
 675
 676         rcu_read_lock();
 677         dev = dev_get_by_name_rcu(net, name);
 678         if (dev)
 679                 dev_hold(dev);
 680         rcu_read_unlock();
 681         return dev;
 682 }
 683 EXPORT_SYMBOL(dev_get_by_name);
 684
 685 /**
 686  *      __dev_get_by_index - find a device by its ifindex
 687  *      @net: the applicable net namespace
 688  *      @ifindex: index of device
 689  *
 690  *      Search for an interface by index. Returns %NULL if the device
 691  *      is not found or a pointer to the device. The device has not
 692  *      had its reference counter increased so the caller must be careful
 693  *      about locking. The caller must hold either the RTNL semaphore
 694  *      or @dev_base_lock.
 695  */
 696
 697 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 698 {
 699         struct hlist_node *p;
 700         struct net_device *dev;
 701         struct hlist_head *head = dev_index_hash(net, ifindex);
 702
 703         hlist_for_each_entry(dev, p, head, index_hlist)
 704                 if (dev->ifindex == ifindex)
 705                         return dev;
 706
 707         return NULL;
 708 }
 709 EXPORT_SYMBOL(__dev_get_by_index);
 710
 711 /**
 712  *      dev_get_by_index_rcu - find a device by its ifindex
 713  *      @net: the applicable net namespace
 714  *      @ifindex: index of device
 715  *
 716  *      Search for an interface by index. Returns %NULL if the device
 717  *      is not found or a pointer to the device. The device has not
 718  *      had its reference counter increased so the caller must be careful
 719  *      about locking. The caller must hold RCU lock.
 720  */
 721
 722 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 723 {
 724         struct hlist_node *p;
 725         struct net_device *dev;
 726         struct hlist_head *head = dev_index_hash(net, ifindex);
 727
 728         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 729                 if (dev->ifindex == ifindex)
 730                         return dev;
 731
 732         return NULL;
 733 }
 734 EXPORT_SYMBOL(dev_get_by_index_rcu);
 735
 736
 737 /**
 738  *      dev_get_by_index - find a device by its ifindex
 739  *      @net: the applicable net namespace
 740  *      @ifindex: index of device
 741  *
 742  *      Search for an interface by index. Returns NULL if the device
 743  *      is not found or a pointer to the device. The device returned has
 744  *      had a reference added and the pointer is safe until the user calls
 745  *      dev_put to indicate they have finished with it.
 746  */
 747
 748 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 749 {
 750         struct net_device *dev;
 751
 752         rcu_read_lock();
 753         dev = dev_get_by_index_rcu(net, ifindex);
 754         if (dev)
 755                 dev_hold(dev);
 756         rcu_read_unlock();
 757         return dev;
 758 }
 759 EXPORT_SYMBOL(dev_get_by_index);
 760
 761 /**
 762  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 763  *      @net: the applicable net namespace
 764  *      @type: media type of device
 765  *      @ha: hardware address
 766  *
 767  *      Search for an interface by MAC address. Returns NULL if the device
 768  *      is not found or a pointer to the device.
 769  *      The caller must hold RCU or RTNL.
 770  *      The returned device has not had its ref count increased
 771  *      and the caller must therefore be careful about locking
 772  *
 773  */
 774
 775 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 776                                        const char *ha)
 777 {
 778         struct net_device *dev;
 779
 780         for_each_netdev_rcu(net, dev)
 781                 if (dev->type == type &&
 782                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 783                         return dev;
 784
 785         return NULL;
 786 }
 787 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 788
 789 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 790 {
 791         struct net_device *dev;
 792
 793         ASSERT_RTNL();
 794         for_each_netdev(net, dev)
 795                 if (dev->type == type)
 796                         return dev;
 797
 798         return NULL;
 799 }
 800 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 801
 802 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 803 {
 804         struct net_device *dev, *ret = NULL;
 805
 806         rcu_read_lock();
 807         for_each_netdev_rcu(net, dev)
 808                 if (dev->type == type) {
 809                         dev_hold(dev);
 810                         ret = dev;
 811                         break;
 812                 }
 813         rcu_read_unlock();
 814         return ret;
 815 }
 816 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 817
 818 /**
 819  *      dev_get_by_flags_rcu - find any device with given flags
 820  *      @net: the applicable net namespace
 821  *      @if_flags: IFF_* values
 822  *      @mask: bitmask of bits in if_flags to check
 823  *
 824  *      Search for any interface with the given flags. Returns NULL if a device
 825  *      is not found or a pointer to the device. Must be called inside
 826  *      rcu_read_lock(), and result refcount is unchanged.
 827  */
 828
 829 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 830                                     unsigned short mask)
 831 {
 832         struct net_device *dev, *ret;
 833
 834         ret = NULL;
 835         for_each_netdev_rcu(net, dev) {
 836                 if (((dev->flags ^ if_flags) & mask) == 0) {
 837                         ret = dev;
 838                         break;
 839                 }
 840         }
 841         return ret;
 842 }
 843 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 844
 845 /**
 846  *      dev_valid_name - check if name is okay for network device
 847  *      @name: name string
 848  *
 849  *      Network device names need to be valid file names to
 850  *      to allow sysfs to work.  We also disallow any kind of
 851  *      whitespace.
 852  */
 853 int dev_valid_name(const char *name)
 854 {
 855         if (*name == '\0')
 856                 return 0;
 857         if (strlen(name) >= IFNAMSIZ)
 858                 return 0;
 859         if (!strcmp(name, ".") || !strcmp(name, ".."))
 860                 return 0;
 861
 862         while (*name) {
 863                 if (*name == '/' || isspace(*name))
 864                         return 0;
 865                 name++;
 866         }
 867         return 1;
 868 }
 869 EXPORT_SYMBOL(dev_valid_name);
 870
 871 /**
 872  *      __dev_alloc_name - allocate a name for a device
 873  *      @net: network namespace to allocate the device name in
 874  *      @name: name format string
 875  *      @buf:  scratch buffer and result name string
 876  *
 877  *      Passed a format string - eg "lt%d" it will try and find a suitable
 878  *      id. It scans list of devices to build up a free map, then chooses
 879  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 880  *      while allocating the name and adding the device in order to avoid
 881  *      duplicates.
 882  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 883  *      Returns the number of the unit assigned or a negative errno code.
 884  */
 885
 886 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 887 {
 888         int i = 0;
 889         const char *p;
 890         const int max_netdevices = 8*PAGE_SIZE;
 891         unsigned long *inuse;
 892         struct net_device *d;
 893
 894         p = strnchr(name, IFNAMSIZ-1, '%');
 895         if (p) {
 896                 /*
 897                  * Verify the string as this thing may have come from
 898                  * the user.  There must be either one "%d" and no other "%"
 899                  * characters.
 900                  */
 901                 if (p[1] != 'd' || strchr(p + 2, '%'))
 902                         return -EINVAL;
 903
 904                 /* Use one page as a bit array of possible slots */
 905                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 906                 if (!inuse)
 907                         return -ENOMEM;
 908
 909                 for_each_netdev(net, d) {
 910                         if (!sscanf(d->name, name, &i))
 911                                 continue;
 912                         if (i < 0 || i >= max_netdevices)
 913                                 continue;
 914
 915                         /*  avoid cases where sscanf is not exact inverse of printf */
 916                         snprintf(buf, IFNAMSIZ, name, i);
 917                         if (!strncmp(buf, d->name, IFNAMSIZ))
 918                                 set_bit(i, inuse);
 919                 }
 920
 921                 i = find_first_zero_bit(inuse, max_netdevices);
 922                 free_page((unsigned long) inuse);
 923         }
 924
 925         if (buf != name)
 926                 snprintf(buf, IFNAMSIZ, name, i);
 927         if (!__dev_get_by_name(net, buf))
 928                 return i;
 929
 930         /* It is possible to run out of possible slots
 931          * when the name is long and there isn't enough space left
 932          * for the digits, or if all bits are used.
 933          */
 934         return -ENFILE;
 935 }
 936
 937 /**
 938  *      dev_alloc_name - allocate a name for a device
 939  *      @dev: device
 940  *      @name: name format string
 941  *
 942  *      Passed a format string - eg "lt%d" it will try and find a suitable
 943  *      id. It scans list of devices to build up a free map, then chooses
 944  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 945  *      while allocating the name and adding the device in order to avoid
 946  *      duplicates.
 947  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 948  *      Returns the number of the unit assigned or a negative errno code.
 949  */
 950
 951 int dev_alloc_name(struct net_device *dev, const char *name)
 952 {
 953         char buf[IFNAMSIZ];
 954         struct net *net;
 955         int ret;
 956
 957         BUG_ON(!dev_net(dev));
 958         net = dev_net(dev);
 959         ret = __dev_alloc_name(net, name, buf);
 960         if (ret >= 0)
 961                 strlcpy(dev->name, buf, IFNAMSIZ);
 962         return ret;
 963 }
 964 EXPORT_SYMBOL(dev_alloc_name);
 965
 966 static int dev_get_valid_name(struct net_device *dev, const char *name)
 967 {
 968         struct net *net;
 969
 970         BUG_ON(!dev_net(dev));
 971         net = dev_net(dev);
 972
 973         if (!dev_valid_name(name))
 974                 return -EINVAL;
 975
 976         if (strchr(name, '%'))
 977                 return dev_alloc_name(dev, name);
 978         else if (__dev_get_by_name(net, name))
 979                 return -EEXIST;
 980         else if (dev->name != name)
 981                 strlcpy(dev->name, name, IFNAMSIZ);
 982
 983         return 0;
 984 }
 985
 986 /**
 987  *      dev_change_name - change name of a device
 988  *      @dev: device
 989  *      @newname: name (or format string) must be at least IFNAMSIZ
 990  *
 991  *      Change name of a device, can pass format strings "eth%d".
 992  *      for wildcarding.
 993  */
 994 int dev_change_name(struct net_device *dev, const char *newname)
 995 {
 996         char oldname[IFNAMSIZ];
 997         int err = 0;
 998         int ret;
 999         struct net *net;
1000
1001         ASSERT_RTNL();
1002         BUG_ON(!dev_net(dev));
1003
1004         net = dev_net(dev);
1005         if (dev->flags & IFF_UP)
1006                 return -EBUSY;
1007
1008         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1009                 return 0;
1010
1011         memcpy(oldname, dev->name, IFNAMSIZ);
1012
1013         err = dev_get_valid_name(dev, newname);
1014         if (err < 0)
1015                 return err;
1016
1017 rollback:
1018         ret = device_rename(&dev->dev, dev->name);
1019         if (ret) {
1020                 memcpy(dev->name, oldname, IFNAMSIZ);
1021                 return ret;
1022         }
1023
1024         write_lock_bh(&dev_base_lock);
1025         hlist_del_rcu(&dev->name_hlist);
1026         write_unlock_bh(&dev_base_lock);
1027
1028         synchronize_rcu();
1029
1030         write_lock_bh(&dev_base_lock);
1031         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1032         write_unlock_bh(&dev_base_lock);
1033
1034         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1035         ret = notifier_to_errno(ret);
1036
1037         if (ret) {
1038                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1039                 if (err >= 0) {
1040                         err = ret;
1041                         memcpy(dev->name, oldname, IFNAMSIZ);
1042                         goto rollback;
1043                 } else {
1044                         printk(KERN_ERR
1045                                "%s: name change rollback failed: %d.\n",
1046                                dev->name, ret);
1047                 }
1048         }
1049
1050         return err;
1051 }
1052
1053 /**
1054  *      dev_set_alias - change ifalias of a device
1055  *      @dev: device
1056  *      @alias: name up to IFALIASZ
1057  *      @len: limit of bytes to copy from info
1058  *
1059  *      Set ifalias for a device,
1060  */
1061 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1062 {
1063         ASSERT_RTNL();
1064
1065         if (len >= IFALIASZ)
1066                 return -EINVAL;
1067
1068         if (!len) {
1069                 if (dev->ifalias) {
1070                         kfree(dev->ifalias);
1071                         dev->ifalias = NULL;
1072                 }
1073                 return 0;
1074         }
1075
1076         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1077         if (!dev->ifalias)
1078                 return -ENOMEM;
1079
1080         strlcpy(dev->ifalias, alias, len+1);
1081         return len;
1082 }
1083
1084
1085 /**
1086  *      netdev_features_change - device changes features
1087  *      @dev: device to cause notification
1088  *
1089  *      Called to indicate a device has changed features.
1090  */
1091 void netdev_features_change(struct net_device *dev)
1092 {
1093         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1094 }
1095 EXPORT_SYMBOL(netdev_features_change);
1096
1097 /**
1098  *      netdev_state_change - device changes state
1099  *      @dev: device to cause notification
1100  *
1101  *      Called to indicate a device has changed state. This function calls
1102  *      the notifier chains for netdev_chain and sends a NEWLINK message
1103  *      to the routing socket.
1104  */
1105 void netdev_state_change(struct net_device *dev)
1106 {
1107         if (dev->flags & IFF_UP) {
1108                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1109                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1110         }
1111 }
1112 EXPORT_SYMBOL(netdev_state_change);
1113
/*
 * Send @event for @dev down the netdev notifier chain and hand back the
 * chain's return value unchanged.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	int chain_ret;

	chain_ret = call_netdevice_notifiers(event, dev);
	return chain_ret;
}
EXPORT_SYMBOL(netdev_bonding_change);
1119
1120 /**
1121  *      dev_load        - load a network module
1122  *      @net: the applicable net namespace
1123  *      @name: name of interface
1124  *
1125  *      If a network interface is not present and the process has suitable
1126  *      privileges this function loads the module. If module loading is not
1127  *      available in this kernel then it becomes a nop.
1128  */
1129
1130 void dev_load(struct net *net, const char *name)
1131 {
1132         struct net_device *dev;
1133         int no_module;
1134
1135         rcu_read_lock();
1136         dev = dev_get_by_name_rcu(net, name);
1137         rcu_read_unlock();
1138
1139         no_module = !dev;
1140         if (no_module && capable(CAP_NET_ADMIN))
1141                 no_module = request_module("netdev-%s", name);
1142         if (no_module && capable(CAP_SYS_MODULE)) {
1143                 if (!request_module("%s", name))
1144                         pr_err("Loading kernel module for a network device "
1145 "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1146 "instead\n", name);
1147         }
1148 }
1149 EXPORT_SYMBOL(dev_load);
1150
1151 static int __dev_open(struct net_device *dev)
1152 {
1153         const struct net_device_ops *ops = dev->netdev_ops;
1154         int ret;
1155
1156         ASSERT_RTNL();
1157
1158         if (!netif_device_present(dev))
1159                 return -ENODEV;
1160
1161         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1162         ret = notifier_to_errno(ret);
1163         if (ret)
1164                 return ret;
1165
1166         set_bit(__LINK_STATE_START, &dev->state);
1167
1168         if (ops->ndo_validate_addr)
1169                 ret = ops->ndo_validate_addr(dev);
1170
1171         if (!ret && ops->ndo_open)
1172                 ret = ops->ndo_open(dev);
1173
1174         if (ret)
1175                 clear_bit(__LINK_STATE_START, &dev->state);
1176         else {
1177                 dev->flags |= IFF_UP;
1178                 net_dmaengine_get();
1179                 dev_set_rx_mode(dev);
1180                 dev_activate(dev);
1181         }
1182
1183         return ret;
1184 }
1185
1186 /**
1187  *      dev_open        - prepare an interface for use.
1188  *      @dev:   device to open
1189  *
1190  *      Takes a device from down to up state. The device's private open
1191  *      function is invoked and then the multicast lists are loaded. Finally
1192  *      the device is moved into the up state and a %NETDEV_UP message is
1193  *      sent to the netdev notifier chain.
1194  *
1195  *      Calling this function on an active interface is a nop. On a failure
1196  *      a negative errno code is returned.
1197  */
1198 int dev_open(struct net_device *dev)
1199 {
1200         int ret;
1201
1202         if (dev->flags & IFF_UP)
1203                 return 0;
1204
1205         ret = __dev_open(dev);
1206         if (ret < 0)
1207                 return ret;
1208
1209         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1210         call_netdevice_notifiers(NETDEV_UP, dev);
1211
1212         return ret;
1213 }
1214 EXPORT_SYMBOL(dev_open);
1215
1216 static int __dev_close_many(struct list_head *head)
1217 {
1218         struct net_device *dev;
1219
1220         ASSERT_RTNL();
1221         might_sleep();
1222
1223         list_for_each_entry(dev, head, unreg_list) {
1224                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1225
1226                 clear_bit(__LINK_STATE_START, &dev->state);
1227
1228                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1229                  * can be even on different cpu. So just clear netif_running().
1230                  *
1231                  * dev->stop() will invoke napi_disable() on all of it's
1232                  * napi_struct instances on this device.
1233                  */
1234                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1235         }
1236
1237         dev_deactivate_many(head);
1238
1239         list_for_each_entry(dev, head, unreg_list) {
1240                 const struct net_device_ops *ops = dev->netdev_ops;
1241
1242                 /*
1243                  *      Call the device specific close. This cannot fail.
1244                  *      Only if device is UP
1245                  *
1246                  *      We allow it to be called even after a DETACH hot-plug
1247                  *      event.
1248                  */
1249                 if (ops->ndo_stop)
1250                         ops->ndo_stop(dev);
1251
1252                 dev->flags &= ~IFF_UP;
1253                 net_dmaengine_put();
1254         }
1255
1256         return 0;
1257 }
1258
1259 static int __dev_close(struct net_device *dev)
1260 {
1261         int retval;
1262         LIST_HEAD(single);
1263
1264         list_add(&dev->unreg_list, &single);
1265         retval = __dev_close_many(&single);
1266         list_del(&single);
1267         return retval;
1268 }
1269
1270 static int dev_close_many(struct list_head *head)
1271 {
1272         struct net_device *dev, *tmp;
1273         LIST_HEAD(tmp_list);
1274
1275         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1276                 if (!(dev->flags & IFF_UP))
1277                         list_move(&dev->unreg_list, &tmp_list);
1278
1279         __dev_close_many(head);
1280
1281         list_for_each_entry(dev, head, unreg_list) {
1282                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1283                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1284         }
1285
1286         /* rollback_registered_many needs the complete original list */
1287         list_splice(&tmp_list, head);
1288         return 0;
1289 }
1290
1291 /**
1292  *      dev_close - shutdown an interface.
1293  *      @dev: device to shutdown
1294  *
1295  *      This function moves an active device into down state. A
1296  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1297  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1298  *      chain.
1299  */
1300 int dev_close(struct net_device *dev)
1301 {
1302         if (dev->flags & IFF_UP) {
1303                 LIST_HEAD(single);
1304
1305                 list_add(&dev->unreg_list, &single);
1306                 dev_close_many(&single);
1307                 list_del(&single);
1308         }
1309         return 0;
1310 }
1311 EXPORT_SYMBOL(dev_close);
1312
1313
1314 /**
1315  *      dev_disable_lro - disable Large Receive Offload on a device
1316  *      @dev: device
1317  *
1318  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1319  *      called under RTNL.  This is needed if received packets may be
1320  *      forwarded to another interface.
1321  */
1322 void dev_disable_lro(struct net_device *dev)
1323 {
1324         /*
1325          * If we're trying to disable lro on a vlan device
1326          * use the underlying physical device instead
1327          */
1328         if (is_vlan_dev(dev))
1329                 dev = vlan_dev_real_dev(dev);
1330
1331         dev->wanted_features &= ~NETIF_F_LRO;
1332         netdev_update_features(dev);
1333
1334         if (unlikely(dev->features & NETIF_F_LRO))
1335                 netdev_WARN(dev, "failed to disable LRO!\n");
1336 }
1337 EXPORT_SYMBOL(dev_disable_lro);
1338
1339
1340 static int dev_boot_phase = 1;
1341
1342 /**
1343  *      register_netdevice_notifier - register a network notifier block
1344  *      @nb: notifier
1345  *
1346  *      Register a notifier to be called when network device events occur.
1347  *      The notifier passed is linked into the kernel structures and must
1348  *      not be reused until it has been unregistered. A negative errno code
1349  *      is returned on a failure.
1350  *
1351  *      When registered all registration and up events are replayed
1352  *      to the new notifier to allow device to have a race free
1353  *      view of the network device list.
1354  */
1355
1356 int register_netdevice_notifier(struct notifier_block *nb)
1357 {
1358         struct net_device *dev;
1359         struct net_device *last;
1360         struct net *net;
1361         int err;
1362
1363         rtnl_lock();
1364         err = raw_notifier_chain_register(&netdev_chain, nb);
1365         if (err)
1366                 goto unlock;
1367         if (dev_boot_phase)
1368                 goto unlock;
1369         for_each_net(net) {
1370                 for_each_netdev(net, dev) {
1371                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1372                         err = notifier_to_errno(err);
1373                         if (err)
1374                                 goto rollback;
1375
1376                         if (!(dev->flags & IFF_UP))
1377                                 continue;
1378
1379                         nb->notifier_call(nb, NETDEV_UP, dev);
1380                 }
1381         }
1382
1383 unlock:
1384         rtnl_unlock();
1385         return err;
1386
1387 rollback:
1388         last = dev;
1389         for_each_net(net) {
1390                 for_each_netdev(net, dev) {
1391                         if (dev == last)
1392                                 break;
1393
1394                         if (dev->flags & IFF_UP) {
1395                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1396                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1397                         }
1398                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1399                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1400                 }
1401         }
1402
1403         raw_notifier_chain_unregister(&netdev_chain, nb);
1404         goto unlock;
1405 }
1406 EXPORT_SYMBOL(register_netdevice_notifier);
1407
1408 /**
1409  *      unregister_netdevice_notifier - unregister a network notifier block
1410  *      @nb: notifier
1411  *
1412  *      Unregister a notifier previously registered by
1413  *      register_netdevice_notifier(). The notifier is unlinked into the
1414  *      kernel structures and may then be reused. A negative errno code
1415  *      is returned on a failure.
1416  */
1417
1418 int unregister_netdevice_notifier(struct notifier_block *nb)
1419 {
1420         int err;
1421
1422         rtnl_lock();
1423         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1424         rtnl_unlock();
1425         return err;
1426 }
1427 EXPORT_SYMBOL(unregister_netdevice_notifier);
1428
1429 /**
1430  *      call_netdevice_notifiers - call all network notifier blocks
1431  *      @val: value passed unmodified to notifier function
1432  *      @dev: net_device pointer passed unmodified to notifier function
1433  *
1434  *      Call all network notifier blocks.  Parameters and return value
1435  *      are as for raw_notifier_call_chain().
1436  */
1437
1438 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1439 {
1440         ASSERT_RTNL();
1441         return raw_notifier_call_chain(&netdev_chain, val, dev);
1442 }
1443 EXPORT_SYMBOL(call_netdevice_notifiers);
1444
1445 static struct jump_label_key netstamp_needed __read_mostly;
1446
1447 void net_enable_timestamp(void)
1448 {
1449         jump_label_inc(&netstamp_needed);
1450 }
1451 EXPORT_SYMBOL(net_enable_timestamp);
1452
1453 void net_disable_timestamp(void)
1454 {
1455         jump_label_dec(&netstamp_needed);
1456 }
1457 EXPORT_SYMBOL(net_disable_timestamp);
1458
1459 static inline void net_timestamp_set(struct sk_buff *skb)
1460 {
1461         skb->tstamp.tv64 = 0;
1462         if (static_branch(&netstamp_needed))
1463                 __net_timestamp(skb);
1464 }
1465
/*
 * Stamp SKB through __net_timestamp() when the netstamp_needed branch is
 * enabled, COND holds and SKB carries no timestamp yet (tstamp.tv64 == 0).
 *
 * Wrapped in do { } while (0) so that the macro expands to exactly one
 * statement: it is safe in an unbraced if/else and cannot capture a
 * following "else".  Use it with a trailing semicolon.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_branch(&netstamp_needed)) {		\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1471
1472 static int net_hwtstamp_validate(struct ifreq *ifr)
1473 {
1474         struct hwtstamp_config cfg;
1475         enum hwtstamp_tx_types tx_type;
1476         enum hwtstamp_rx_filters rx_filter;
1477         int tx_type_valid = 0;
1478         int rx_filter_valid = 0;
1479
1480         if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1481                 return -EFAULT;
1482
1483         if (cfg.flags) /* reserved for future extensions */
1484                 return -EINVAL;
1485
1486         tx_type = cfg.tx_type;
1487         rx_filter = cfg.rx_filter;
1488
1489         switch (tx_type) {
1490         case HWTSTAMP_TX_OFF:
1491         case HWTSTAMP_TX_ON:
1492         case HWTSTAMP_TX_ONESTEP_SYNC:
1493                 tx_type_valid = 1;
1494                 break;
1495         }
1496
1497         switch (rx_filter) {
1498         case HWTSTAMP_FILTER_NONE:
1499         case HWTSTAMP_FILTER_ALL:
1500         case HWTSTAMP_FILTER_SOME:
1501         case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1502         case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1503         case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1504         case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1505         case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1506         case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1507         case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1508         case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1509         case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1510         case HWTSTAMP_FILTER_PTP_V2_EVENT:
1511         case HWTSTAMP_FILTER_PTP_V2_SYNC:
1512         case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1513                 rx_filter_valid = 1;
1514                 break;
1515         }
1516
1517         if (!tx_type_valid || !rx_filter_valid)
1518                 return -ERANGE;
1519
1520         return 0;
1521 }
1522
1523 static inline bool is_skb_forwardable(struct net_device *dev,
1524                                       struct sk_buff *skb)
1525 {
1526         unsigned int len;
1527
1528         if (!(dev->flags & IFF_UP))
1529                 return false;
1530
1531         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1532         if (skb->len <= len)
1533                 return true;
1534
1535         /* if TSO is enabled, we don't care about the length as the packet
1536          * could be forwarded without being segmented before
1537          */
1538         if (skb_is_gso(skb))
1539                 return true;
1540
1541         return false;
1542 }
1543
1544 /**
1545  * dev_forward_skb - loopback an skb to another netif
1546  *
1547  * @dev: destination network device
1548  * @skb: buffer to forward
1549  *
1550  * return values:
1551  *      NET_RX_SUCCESS  (no congestion)
1552  *      NET_RX_DROP     (packet was dropped, but freed)
1553  *
1554  * dev_forward_skb can be used for injecting an skb from the
1555  * start_xmit function of one device into the receive queue
1556  * of another device.
1557  *
1558  * The receiving device may be in another namespace, so
1559  * we have to clear all information in the skb that could
1560  * impact namespace isolation.
1561  */
1562 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1563 {
1564         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1565                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1566                         atomic_long_inc(&dev->rx_dropped);
1567                         kfree_skb(skb);
1568                         return NET_RX_DROP;
1569                 }
1570         }
1571
1572         skb_orphan(skb);
1573         nf_reset(skb);
1574
1575         if (unlikely(!is_skb_forwardable(dev, skb))) {
1576                 atomic_long_inc(&dev->rx_dropped);
1577                 kfree_skb(skb);
1578                 return NET_RX_DROP;
1579         }
1580         skb_set_dev(skb, dev);
1581         skb->tstamp.tv64 = 0;
1582         skb->pkt_type = PACKET_HOST;
1583         skb->protocol = eth_type_trans(skb, dev);
1584         return netif_rx(skb);
1585 }
1586 EXPORT_SYMBOL_GPL(dev_forward_skb);
1587
1588 static inline int deliver_skb(struct sk_buff *skb,
1589                               struct packet_type *pt_prev,
1590                               struct net_device *orig_dev)
1591 {
1592         atomic_inc(&skb->users);
1593         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1594 }
1595
1596 /*
1597  *      Support routine. Sends outgoing frames to any network
1598  *      taps currently in use.
1599  */
1600
1601 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1602 {
1603         struct packet_type *ptype;
1604         struct sk_buff *skb2 = NULL;
1605         struct packet_type *pt_prev = NULL;
1606
1607         rcu_read_lock();
1608         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1609                 /* Never send packets back to the socket
1610                  * they originated from - MvS (miquels@drinkel.ow.org)
1611                  */
1612                 if ((ptype->dev == dev || !ptype->dev) &&
1613                     (ptype->af_packet_priv == NULL ||
1614                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1615                         if (pt_prev) {
1616                                 deliver_skb(skb2, pt_prev, skb->dev);
1617                                 pt_prev = ptype;
1618                                 continue;
1619                         }
1620
1621                         skb2 = skb_clone(skb, GFP_ATOMIC);
1622                         if (!skb2)
1623                                 break;
1624
1625                         net_timestamp_set(skb2);
1626
1627                         /* skb->nh should be correctly
1628                            set by sender, so that the second statement is
1629                            just protection against buggy protocols.
1630                          */
1631                         skb_reset_mac_header(skb2);
1632
1633                         if (skb_network_header(skb2) < skb2->data ||
1634                             skb2->network_header > skb2->tail) {
1635                                 if (net_ratelimit())
1636                                         printk(KERN_CRIT "protocol %04x is "
1637                                                "buggy, dev %s\n",
1638                                                ntohs(skb2->protocol),
1639                                                dev->name);
1640                                 skb_reset_network_header(skb2);
1641                         }
1642
1643                         skb2->transport_header = skb2->network_header;
1644                         skb2->pkt_type = PACKET_OUTGOING;
1645                         pt_prev = ptype;
1646                 }
1647         }
1648         if (pt_prev)
1649                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1650         rcu_read_unlock();
1651 }
1652
1653 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1654  * @dev: Network device
1655  * @txq: number of queues available
1656  *
1657  * If real_num_tx_queues is changed the tc mappings may no longer be
1658  * valid. To resolve this verify the tc mapping remains valid and if
1659  * not NULL the mapping. With no priorities mapping to this
1660  * offset/count pair it will no longer be used. In the worst case TC0
1661  * is invalid nothing can be done so disable priority mappings. If is
1662  * expected that drivers will fix this mapping if they can before
1663  * calling netif_set_real_num_tx_queues.
1664  */
1665 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1666 {
1667         int i;
1668         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1669
1670         /* If TC0 is invalidated disable TC mapping */
1671         if (tc->offset + tc->count > txq) {
1672                 pr_warning("Number of in use tx queues changed "
1673                            "invalidating tc mappings. Priority "
1674                            "traffic classification disabled!\n");
1675                 dev->num_tc = 0;
1676                 return;
1677         }
1678
1679         /* Invalidated prio to tc mappings set to TC0 */
1680         for (i = 1; i < TC_BITMASK + 1; i++) {
1681                 int q = netdev_get_prio_tc_map(dev, i);
1682
1683                 tc = &dev->tc_to_txq[q];
1684                 if (tc->offset + tc->count > txq) {
1685                         pr_warning("Number of in use tx queues "
1686                                    "changed. Priority %i to tc "
1687                                    "mapping %i is no longer valid "
1688                                    "setting map to 0\n",
1689                                    i, q);
1690                         netdev_set_prio_tc_map(dev, i, 0);
1691                 }
1692         }
1693 }
1694
1695 /*
1696  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1697  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1698  */
1699 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1700 {
1701         int rc;
1702
1703         if (txq < 1 || txq > dev->num_tx_queues)
1704                 return -EINVAL;
1705
1706         if (dev->reg_state == NETREG_REGISTERED ||
1707             dev->reg_state == NETREG_UNREGISTERING) {
1708                 ASSERT_RTNL();
1709
1710                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1711                                                   txq);
1712                 if (rc)
1713                         return rc;
1714
1715                 if (dev->num_tc)
1716                         netif_setup_tc(dev, txq);
1717
1718                 if (txq < dev->real_num_tx_queues)
1719                         qdisc_reset_all_tx_gt(dev, txq);
1720         }
1721
1722         dev->real_num_tx_queues = txq;
1723         return 0;
1724 }
1725 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1726
#ifdef CONFIG_RPS
/**
 *      netif_set_real_num_rx_queues - set actual number of RX queues used
 *      @dev: Network device
 *      @rxq: Actual number of RX queues
 *
 *      Must be called either with the rtnl_lock held or before the net
 *      device is registered.  Returns 0 on success or a negative error
 *      code: -EINVAL when @rxq is outside 1..dev->num_rx_queues, or the
 *      error reported while updating the rx queue kobjects of a
 *      registered device.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
        if (rxq < 1 || rxq > dev->num_rx_queues)
                return -EINVAL;

        /* Only a registered device has kobjects that need resizing */
        if (dev->reg_state == NETREG_REGISTERED) {
                int err;

                ASSERT_RTNL();

                err = net_rx_queue_update_kobjects(dev,
                                                   dev->real_num_rx_queues,
                                                   rxq);
                if (err)
                        return err;
        }

        dev->real_num_rx_queues = rxq;
        return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1759
1760 static inline void __netif_reschedule(struct Qdisc *q)
1761 {
1762         struct softnet_data *sd;
1763         unsigned long flags;
1764
1765         local_irq_save(flags);
1766         sd = &__get_cpu_var(softnet_data);
1767         q->next_sched = NULL;
1768         *sd->output_queue_tailp = q;
1769         sd->output_queue_tailp = &q->next_sched;
1770         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1771         local_irq_restore(flags);
1772 }
1773
1774 void __netif_schedule(struct Qdisc *q)
1775 {
1776         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1777                 __netif_reschedule(q);
1778 }
1779 EXPORT_SYMBOL(__netif_schedule);
1780
1781 void dev_kfree_skb_irq(struct sk_buff *skb)
1782 {
1783         if (atomic_dec_and_test(&skb->users)) {
1784                 struct softnet_data *sd;
1785                 unsigned long flags;
1786
1787                 local_irq_save(flags);
1788                 sd = &__get_cpu_var(softnet_data);
1789                 skb->next = sd->completion_queue;
1790                 sd->completion_queue = skb;
1791                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1792                 local_irq_restore(flags);
1793         }
1794 }
1795 EXPORT_SYMBOL(dev_kfree_skb_irq);
1796
/* Free @skb from any context: dev_kfree_skb_irq() is used in hard irq
 * context or with interrupts disabled, dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
        if (!in_irq() && !irqs_disabled()) {
                dev_kfree_skb(skb);
                return;
        }

        dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1805
1806
1807 /**
1808  * netif_device_detach - mark device as removed
1809  * @dev: network device
1810  *
1811  * Mark device as removed from system and therefore no longer available.
1812  */
1813 void netif_device_detach(struct net_device *dev)
1814 {
1815         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1816             netif_running(dev)) {
1817                 netif_tx_stop_all_queues(dev);
1818         }
1819 }
1820 EXPORT_SYMBOL(netif_device_detach);
1821
1822 /**
1823  * netif_device_attach - mark device as attached
1824  * @dev: network device
1825  *
1826  * Mark device as attached from system and restart if needed.
1827  */
1828 void netif_device_attach(struct net_device *dev)
1829 {
1830         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1831             netif_running(dev)) {
1832                 netif_tx_wake_all_queues(dev);
1833                 __netdev_watchdog_up(dev);
1834         }
1835 }
1836 EXPORT_SYMBOL(netif_device_attach);
1837
/**
 * skb_set_dev - assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * The dst reference of @skb is always dropped.  If @skb is already owned
 * by a device that lives in a different network namespace than @dev, all
 * namespace-private state carried by the skb is reset before the new
 * device is assigned.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
        bool crosses_netns;

        skb_dst_drop(skb);

        crosses_netns = skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev));
        if (crosses_netns) {
                secpath_reset(skb);
                nf_reset(skb);
                skb_init_secmark(skb);
                skb->mark = 0;
                skb->priority = 0;
                skb->nf_trace = 0;
                skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
                skb->tc_index = 0;
#endif
        }

        skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1867
1868 /*
1869  * Invalidate hardware checksum when packet is to be mangled, and
1870  * complete checksum manually on outgoing path.
1871  */
1872 int skb_checksum_help(struct sk_buff *skb)
1873 {
1874         __wsum csum;
1875         int ret = 0, offset;
1876
1877         if (skb->ip_summed == CHECKSUM_COMPLETE)
1878                 goto out_set_summed;
1879
1880         if (unlikely(skb_shinfo(skb)->gso_size)) {
1881                 /* Let GSO fix up the checksum. */
1882                 goto out_set_summed;
1883         }
1884
1885         offset = skb_checksum_start_offset(skb);
1886         BUG_ON(offset >= skb_headlen(skb));
1887         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1888
1889         offset += skb->csum_offset;
1890         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1891
1892         if (skb_cloned(skb) &&
1893             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1894                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1895                 if (ret)
1896                         goto out;
1897         }
1898
1899         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1900 out_set_summed:
1901         skb->ip_summed = CHECKSUM_NONE;
1902 out:
1903         return ret;
1904 }
1905 EXPORT_SYMBOL(skb_checksum_help);
1906
/**
 *      skb_gso_segment - Perform segmentation on skb.
 *      @skb: buffer to segment
 *      @features: features for the output path (see dev->features)
 *
 *      This function segments the given skb and returns a list of segments.
 *
 *      It may return NULL if the skb requires no segmentation.  This is
 *      only possible when GSO is used for verifying header integrity.
 *
 *      Otherwise the result is whatever the matching protocol handler's
 *      gso_segment callback returns, or an ERR_PTR() value: -EINVAL when
 *      a VLAN header cannot be pulled, the pskb_expand_head() or
 *      gso_send_check() error, or -EPROTONOSUPPORT when no handler with
 *      a gso_segment callback matches the protocol.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb,
        netdev_features_t features)
{
        struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
        struct packet_type *ptype;
        __be16 type = skb->protocol;
        int vlan_depth = ETH_HLEN;
        int err;

        /* Step over stacked 802.1Q headers until the encapsulated
         * protocol is something else.
         */
        while (type == htons(ETH_P_8021Q)) {
                struct vlan_hdr *vh;

                if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
                        return ERR_PTR(-EINVAL);

                vh = (struct vlan_hdr *)(skb->data + vlan_depth);
                type = vh->h_vlan_encapsulated_proto;
                vlan_depth += VLAN_HLEN;
        }

        /* Record the mac header at the current data pointer and advance
         * data past it; the final __skb_push() below undoes this.
         */
        skb_reset_mac_header(skb);
        skb->mac_len = skb->network_header - skb->mac_header;
        __skb_pull(skb, skb->mac_len);

        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
                struct net_device *dev = skb->dev;
                struct ethtool_drvinfo info = {};

                /* Fetch the driver name (if available) for the warning */
                if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
                        dev->ethtool_ops->get_drvinfo(dev, &info);

                WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d ip_summed=%d\n",
                     info.driver, dev ? &dev->features : NULL,
                     skb->sk ? &skb->sk->sk_route_caps : NULL,
                     skb->len, skb->data_len, skb->ip_summed);

                if (skb_header_cloned(skb) &&
                    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
                        return ERR_PTR(err);
        }

        /* Use the first handler in the hash bucket that matches the
         * protocol, is not bound to a device and has a gso_segment hook.
         */
        rcu_read_lock();
        list_for_each_entry_rcu(ptype,
                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
                if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
                        if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
                                err = ptype->gso_send_check(skb);
                                /* ERR_PTR(0) is NULL: "no segmentation" */
                                segs = ERR_PTR(err);
                                if (err || skb_gso_ok(skb, features))
                                        break;
                                /* Move data back to the network header
                                 * before segmenting.
                                 */
                                __skb_push(skb, (skb->data -
                                                 skb_network_header(skb)));
                        }
                        segs = ptype->gso_segment(skb, features);
                        break;
                }
        }
        rcu_read_unlock();

        /* Put the data pointer back on the mac header */
        __skb_push(skb, skb->data - skb_mac_header(skb));

        return segs;
}
EXPORT_SYMBOL(skb_gso_segment);
1981
/* Report a hardware receive checksum error: a rate limited log line naming
 * the device (or "<unknown>" when @dev is NULL) followed by a stack dump.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
        const char *ifname;

        if (!net_ratelimit())
                return;

        ifname = dev ? dev->name : "<unknown>";
        printk(KERN_ERR "%s: hw csum failure.\n", ifname);
        dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1994
/* Actually, this check should go away as soon as we know that:
 * 1. an IOMMU is present and allows mapping all of memory, and
 * 2. no high memory really exists on this machine.
 *
 * Returns 1 when some page fragment of @skb cannot be handed to @dev for
 * DMA, 0 otherwise (always 0 without CONFIG_HIGHMEM).
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        struct skb_shared_info *shinfo = skb_shinfo(skb);
        int n;

        /* Device cannot DMA from highmem: any highmem fragment is illegal */
        if (!(dev->features & NETIF_F_HIGHDMA)) {
                for (n = 0; n < shinfo->nr_frags; n++) {
                        if (PageHighMem(skb_frag_page(&shinfo->frags[n])))
                                return 1;
                }
        }

        if (PCI_DMA_BUS_IS_PHYS) {
                struct device *parent = dev->dev.parent;

                if (!parent)
                        return 0;

                /* Each fragment page must end at or below the parent's
                 * DMA mask; a missing mask rejects any fragment.
                 */
                for (n = 0; n < shinfo->nr_frags; n++) {
                        skb_frag_t *frag = &shinfo->frags[n];
                        dma_addr_t addr = page_to_phys(skb_frag_page(frag));

                        if (!parent->dma_mask ||
                            addr + PAGE_SIZE - 1 > *parent->dma_mask)
                                return 1;
                }
        }
#endif
        return 0;
}
2027
/* State kept in skb->cb while a GSO segment list hangs off skb->next:
 * the destructor the skb had before dev_gso_segment() installed
 * dev_gso_skb_destructor in its place.
 */
struct dev_gso_cb {
        void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of @skb as a struct dev_gso_cb */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2033
2034 static void dev_gso_skb_destructor(struct sk_buff *skb)
2035 {
2036         struct dev_gso_cb *cb;
2037
2038         do {
2039                 struct sk_buff *nskb = skb->next;
2040
2041                 skb->next = nskb->next;
2042                 nskb->next = NULL;
2043                 kfree_skb(nskb);
2044         } while (skb->next);
2045
2046         cb = DEV_GSO_CB(skb);
2047         if (cb->destructor)
2048                 cb->destructor(skb);
2049 }
2050
2051 /**
2052  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2053  *      @skb: buffer to segment
2054  *      @features: device features as applicable to this skb
2055  *
2056  *      This function segments the given skb and stores the list of segments
2057  *      in skb->next.
2058  */
2059 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2060 {
2061         struct sk_buff *segs;
2062
2063         segs = skb_gso_segment(skb, features);
2064
2065         /* Verifying header integrity only. */
2066         if (!segs)
2067                 return 0;
2068
2069         if (IS_ERR(segs))
2070                 return PTR_ERR(segs);
2071
2072         skb->next = segs;
2073         DEV_GSO_CB(skb)->destructor = skb->destructor;
2074         skb->destructor = dev_gso_skb_destructor;
2075
2076         return 0;
2077 }
2078
2079 /*
2080  * Try to orphan skb early, right before transmission by the device.
2081  * We cannot orphan skb if tx timestamp is requested or the sk-reference
2082  * is needed on driver level for other reasons, e.g. see net/can/raw.c
2083  */
2084 static inline void skb_orphan_try(struct sk_buff *skb)
2085 {
2086         struct sock *sk = skb->sk;
2087
2088         if (sk && !skb_shinfo(skb)->tx_flags) {
2089                 /* skb_tx_hash() wont be able to get sk.
2090                  * We copy sk_hash into skb->rxhash
2091                  */
2092                 if (!skb->rxhash)
2093                         skb->rxhash = sk->sk_hash;
2094                 skb_orphan(skb);
2095         }
2096 }
2097
2098 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2099 {
2100         return ((features & NETIF_F_GEN_CSUM) ||
2101                 ((features & NETIF_F_V4_CSUM) &&
2102                  protocol == htons(ETH_P_IP)) ||
2103                 ((features & NETIF_F_V6_CSUM) &&
2104                  protocol == htons(ETH_P_IPV6)) ||
2105                 ((features & NETIF_F_FCOE_CRC) &&
2106                  protocol == htons(ETH_P_FCOE)));
2107 }
2108
2109 static netdev_features_t harmonize_features(struct sk_buff *skb,
2110         __be16 protocol, netdev_features_t features)
2111 {
2112         if (!can_checksum_protocol(features, protocol)) {
2113                 features &= ~NETIF_F_ALL_CSUM;
2114                 features &= ~NETIF_F_SG;
2115         } else if (illegal_highdma(skb->dev, skb)) {
2116                 features &= ~NETIF_F_SG;
2117         }
2118
2119         return features;
2120 }
2121
2122 netdev_features_t netif_skb_features(struct sk_buff *skb)
2123 {
2124         __be16 protocol = skb->protocol;
2125         netdev_features_t features = skb->dev->features;
2126
2127         if (protocol == htons(ETH_P_8021Q)) {
2128                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2129                 protocol = veh->h_vlan_encapsulated_proto;
2130         } else if (!vlan_tx_tag_present(skb)) {
2131                 return harmonize_features(skb, protocol, features);
2132         }
2133
2134         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2135
2136         if (protocol != htons(ETH_P_8021Q)) {
2137                 return harmonize_features(skb, protocol, features);
2138         } else {
2139                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2140                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2141                 return harmonize_features(skb, protocol, features);
2142         }
2143 }
2144 EXPORT_SYMBOL(netif_skb_features);
2145
2146 /*
2147  * Returns true if either:
2148  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2149  *      2. skb is fragmented and the device does not support SG, or if
2150  *         at least one of fragments is in highmem and device does not
2151  *         support DMA from it.
2152  */
2153 static inline int skb_needs_linearize(struct sk_buff *skb,
2154                                       int features)
2155 {
2156         return skb_is_nonlinear(skb) &&
2157                         ((skb_has_frag_list(skb) &&
2158                                 !(features & NETIF_F_FRAGLIST)) ||
2159                         (skb_shinfo(skb)->nr_frags &&
2160                                 !(features & NETIF_F_SG)));
2161 }
2162
/*
 * Hand @skb, or the list of GSO segments chained on skb->next, to the
 * ndo_start_xmit hook of @dev, calling txq_trans_update(@txq) after each
 * successful transmit.
 *
 * When called with skb->next already set, the preparation steps are
 * skipped and transmission continues with the remaining segments.
 *
 * Returns the last ndo_start_xmit result, NETDEV_TX_BUSY when the queue
 * got stopped with segments still pending, or NETDEV_TX_OK when the skb
 * was dropped during preparation.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
                        struct netdev_queue *txq)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        int rc = NETDEV_TX_OK;
        unsigned int skb_len;

        /* No segment list chained yet: prepare the skb for the device */
        if (likely(!skb->next)) {
                netdev_features_t features;

                /*
                 * If device doesn't need skb->dst, release it right now while
                 * its hot in this cpu cache
                 */
                if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
                        skb_dst_drop(skb);

                if (!list_empty(&ptype_all))
                        dev_queue_xmit_nit(skb, dev);

                skb_orphan_try(skb);

                features = netif_skb_features(skb);

                /* Out of band VLAN tag but no hardware tagging for this
                 * skb: put the tag into the packet.  On failure
                 * __vlan_put_tag() returns NULL and rc is still
                 * NETDEV_TX_OK.
                 */
                if (vlan_tx_tag_present(skb) &&
                    !(features & NETIF_F_HW_VLAN_TX)) {
                        skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
                        if (unlikely(!skb))
                                goto out;

                        skb->vlan_tci = 0;
                }

                if (netif_needs_gso(skb, features)) {
                        if (unlikely(dev_gso_segment(skb, features)))
                                goto out_kfree_skb;
                        /* A segment list was produced: send it piecewise.
                         * Otherwise fall through and send the skb itself.
                         */
                        if (skb->next)
                                goto gso;
                } else {
                        if (skb_needs_linearize(skb, features) &&
                            __skb_linearize(skb))
                                goto out_kfree_skb;

                        /* If packet is not checksummed and device does not
                         * support checksumming for this protocol, complete
                         * checksumming here.
                         */
                        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                                skb_set_transport_header(skb,
                                        skb_checksum_start_offset(skb));
                                if (!(features & NETIF_F_ALL_CSUM) &&
                                     skb_checksum_help(skb))
                                        goto out_kfree_skb;
                        }
                }

                /* Length is sampled before the driver gets the skb */
                skb_len = skb->len;
                rc = ops->ndo_start_xmit(skb, dev);
                trace_net_dev_xmit(skb, rc, dev, skb_len);
                if (rc == NETDEV_TX_OK)
                        txq_trans_update(txq);
                return rc;
        }

gso:
        do {
                /* Unlink the first pending segment from the list */
                struct sk_buff *nskb = skb->next;

                skb->next = nskb->next;
                nskb->next = NULL;

                /*
                 * If device doesn't need nskb->dst, release it right now while
                 * its hot in this cpu cache
                 */
                if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
                        skb_dst_drop(nskb);

                skb_len = nskb->len;
                rc = ops->ndo_start_xmit(nskb, dev);
                trace_net_dev_xmit(nskb, rc, dev, skb_len);
                if (unlikely(rc != NETDEV_TX_OK)) {
                        /* Codes with bits outside NETDEV_TX_MASK end the
                         * transmission; for the rest the segment is put
                         * back at the head of the list so that a later
                         * call can resume with it.
                         */
                        if (rc & ~NETDEV_TX_MASK)
                                goto out_kfree_gso_skb;
                        nskb->next = skb->next;
                        skb->next = nskb;
                        return rc;
                }
                txq_trans_update(txq);
                if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
                        return NETDEV_TX_BUSY;
        } while (skb->next);

out_kfree_gso_skb:
        /* With the list drained, restore the saved destructor so that
         * kfree_skb() does not run dev_gso_skb_destructor on an empty list.
         */
        if (likely(skb->next == NULL))
                skb->destructor = DEV_GSO_CB(skb)->destructor;
out_kfree_skb:
        kfree_skb(skb);
out:
        return rc;
}
2264
2265 static u32 hashrnd __read_mostly;
2266
2267 /*
2268  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2269  * to be used as a distribution range.
2270  */
2271 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2272                   unsigned int num_tx_queues)
2273 {
2274         u32 hash;
2275         u16 qoffset = 0;
2276         u16 qcount = num_tx_queues;
2277
2278         if (skb_rx_queue_recorded(skb)) {
2279                 hash = skb_get_rx_queue(skb);
2280                 while (unlikely(hash >= num_tx_queues))
2281                         hash -= num_tx_queues;
2282                 return hash;
2283         }
2284
2285         if (dev->num_tc) {
2286                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2287                 qoffset = dev->tc_to_txq[tc].offset;
2288                 qcount = dev->tc_to_txq[tc].count;
2289         }
2290
2291         if (skb->sk && skb->sk->sk_hash)
2292                 hash = skb->sk->sk_hash;
2293         else
2294                 hash = (__force u16) skb->protocol ^ skb->rxhash;
2295         hash = jhash_1word(hash, hashrnd);
2296
2297         return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2298 }
2299 EXPORT_SYMBOL(__skb_tx_hash);
2300
2301 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2302 {
2303         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2304                 if (net_ratelimit()) {
2305                         pr_warning("%s selects TX queue %d, but "
2306                                 "real number of TX queues is %d\n",
2307                                 dev->name, queue_index, dev->real_num_tx_queues);
2308                 }
2309                 return 0;
2310         }
2311         return queue_index;
2312 }
2313
/* Look up a tx queue for @skb in the XPS map of the current CPU.
 * Returns the queue index, or -1 when XPS is not built in, no map applies
 * or the mapped queue is not below dev->real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
        struct xps_dev_maps *maps;
        struct xps_map *cpu_map;
        int picked = -1;

        rcu_read_lock();

        maps = rcu_dereference(dev->xps_maps);
        if (!maps)
                goto unlock;

        cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
        if (!cpu_map)
                goto unlock;

        if (cpu_map->len == 1) {
                picked = cpu_map->queues[0];
        } else {
                /* Several candidates: spread by socket or packet hash */
                u32 hash;

                if (skb->sk && skb->sk->sk_hash)
                        hash = skb->sk->sk_hash;
                else
                        hash = (__force u16) skb->protocol ^ skb->rxhash;
                hash = jhash_1word(hash, hashrnd);
                picked = cpu_map->queues[((u64)hash * cpu_map->len) >> 32];
        }

        if (unlikely(picked >= dev->real_num_tx_queues))
                picked = -1;

unlock:
        rcu_read_unlock();

        return picked;
#else
        return -1;
#endif
}
2351
2352 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2353                                         struct sk_buff *skb)
2354 {
2355         int queue_index;
2356         const struct net_device_ops *ops = dev->netdev_ops;
2357
2358         if (dev->real_num_tx_queues == 1)
2359                 queue_index = 0;
2360         else if (ops->ndo_select_queue) {
2361                 queue_index = ops->ndo_select_queue(dev, skb);
2362                 queue_index = dev_cap_txqueue(dev, queue_index);
2363         } else {
2364                 struct sock *sk = skb->sk;
2365                 queue_index = sk_tx_queue_get(sk);
2366
2367                 if (queue_index < 0 || skb->ooo_okay ||
2368                     queue_index >= dev->real_num_tx_queues) {
2369                         int old_index = queue_index;
2370
2371                         queue_index = get_xps_queue(dev, skb);
2372                         if (queue_index < 0)
2373                                 queue_index = skb_tx_hash(dev, skb);
2374
2375                         if (queue_index != old_index && sk) {
2376                                 struct dst_entry *dst =
2377                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2378
2379                                 if (dst && skb_dst(skb) == dst)
2380                                         sk_tx_queue_set(sk, queue_index);
2381                         }
2382                 }
2383         }
2384
2385         skb_set_queue_mapping(skb, queue_index);
2386         return netdev_get_tx_queue(dev, queue_index);
2387 }
2388
/*
 * Send @skb through qdisc @q of @txq on @dev.
 *
 * Under the qdisc root lock the skb is either dropped (qdisc
 * deactivated), transmitted directly (bypass-capable qdisc that is empty
 * and not running) or enqueued, after which the qdisc is run if nobody
 * else is running it.
 *
 * Returns NET_XMIT_DROP, NET_XMIT_SUCCESS, or the enqueue result masked
 * with NET_XMIT_MASK.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                                 struct net_device *dev,
                                 struct netdev_queue *txq)
{
        spinlock_t *root_lock = qdisc_lock(q);
        bool contended;
        int rc;

        qdisc_skb_cb(skb)->pkt_len = skb->len;
        qdisc_calculate_pkt_len(skb, q);
        /*
         * Heuristic to force contended enqueues to serialize on a
         * separate lock before trying to get qdisc main lock.
         * This permits __QDISC_STATE_RUNNING owner to get the lock more often
         * and dequeue packets faster.
         */
        contended = qdisc_is_running(q);
        if (unlikely(contended))
                spin_lock(&q->busylock);

        spin_lock(root_lock);
        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
                kfree_skb(skb);
                rc = NET_XMIT_DROP;
        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
                   qdisc_run_begin(q)) {
                /*
                 * This is a work-conserving queue; there are no old skbs
                 * waiting to be sent out; and the qdisc is not running -
                 * xmit the skb directly.
                 */
                if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
                        skb_dst_force(skb);

                qdisc_bstats_update(q, skb);

                if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
                        /* busylock is dropped before running the qdisc;
                         * clearing the flag avoids a second unlock below.
                         */
                        if (unlikely(contended)) {
                                spin_unlock(&q->busylock);
                                contended = false;
                        }
                        __qdisc_run(q);
                } else
                        qdisc_run_end(q);

                rc = NET_XMIT_SUCCESS;
        } else {
                skb_dst_force(skb);
                rc = q->enqueue(skb, q) & NET_XMIT_MASK;
                if (qdisc_run_begin(q)) {
                        if (unlikely(contended)) {
                                spin_unlock(&q->busylock);
                                contended = false;
                        }
                        __qdisc_run(q);
                }
        }
        spin_unlock(root_lock);
        /* Still holding busylock if no branch above released it */
        if (unlikely(contended))
                spin_unlock(&q->busylock);
        return rc;
}
2451
/* Per-CPU depth of nested dev_hard_start_xmit() calls made from
 * dev_queue_xmit() on queueless devices, and the depth beyond which a
 * further nested transmit is refused.
 */
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10

/**
 *      dev_queue_xmit - transmit a buffer
 *      @skb: buffer to transmit
 *
 *      Queue a buffer for transmission to a network device. The caller must
 *      have set the device and priority and built the buffer before calling
 *      this function. The function can be called from an interrupt.
 *
 *      A negative errno code is returned on a failure. A success does not
 *      guarantee the frame will be transmitted as it may be dropped due
 *      to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
int dev_queue_xmit(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        struct netdev_queue *txq;
        struct Qdisc *q;
        int rc = -ENOMEM;

        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
        rcu_read_lock_bh();

        txq = dev_pick_tx(dev, skb);
        q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
        trace_net_dev_queue(skb);
        /* A qdisc with an enqueue hook takes the skb from here */
        if (q->enqueue) {
                rc = __dev_xmit_skb(skb, q, dev, txq);
                goto out;
        }

        /* The device has no queue. Common case for software devices:
           loopback, all the sorts of tunnels...

           Really, it is unlikely that netif_tx_lock protection is necessary
           here.  (f.e. loopback and IP tunnels are clean ignoring statistics
           counters.)
           However, it is possible, that they rely on protection
           made by us here.

           Check this and shot the lock. It is not prone from deadlocks.
           Either shot noqueue qdisc, it is even simpler 8)
         */
        if (dev->flags & IFF_UP) {
                int cpu = smp_processor_id(); /* ok because BHs are off */

                /* This CPU already owning the tx lock means we re-entered
                 * for the same queue: handled in the else branch below.
                 */
                if (txq->xmit_lock_owner != cpu) {

                        if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
                                goto recursion_alert;

                        HARD_TX_LOCK(dev, txq, cpu);

                        if (!netif_tx_queue_stopped(txq)) {
                                __this_cpu_inc(xmit_recursion);
                                rc = dev_hard_start_xmit(skb, dev, txq);
                                __this_cpu_dec(xmit_recursion);
                                if (dev_xmit_complete(rc)) {
                                        HARD_TX_UNLOCK(dev, txq);
                                        goto out;
                                }
                        }
                        HARD_TX_UNLOCK(dev, txq);
                        if (net_ratelimit())
                                printk(KERN_CRIT "Virtual device %s asks to "
                                       "queue packet!\n", dev->name);
                } else {
                        /* Recursion is detected! It is possible,
                         * unfortunately
                         */
recursion_alert:
                        if (net_ratelimit())
                                printk(KERN_CRIT "Dead loop on virtual device "
                                       "%s, fix it urgently!\n", dev->name);
                }
        }

        /* Every path reaching this point failed to hand the skb on:
         * report -ENETDOWN and free it here.
         */
        rc = -ENETDOWN;
        rcu_read_unlock_bh();

        kfree_skb(skb);
        return rc;
out:
        rcu_read_unlock_bh();
        return rc;
}
EXPORT_SYMBOL(dev_queue_xmit);
2560
2561
2562 /*=======================================================================
2563                         Receiver routines
2564   =======================================================================*/
2565
2566 int netdev_max_backlog __read_mostly = 1000; /* enqueue_to_backlog() drops a packet once the target input_pkt_queue is longer than this */
2567 int netdev_tstamp_prequeue __read_mostly = 1; /* condition handed to net_timestamp_check() in netif_rx()/netif_receive_skb(); negated in __netif_receive_skb() */
2568 int netdev_budget __read_mostly = 300; /* NOTE(review): not referenced in this part of the file; presumably the per-softirq rx budget - confirm */
2569 int weight_p __read_mostly = 64;            /* old backlog weight */
2570
2571 /* Called with irq disabled */
2572 static inline void ____napi_schedule(struct softnet_data *sd,
2573                                      struct napi_struct *napi)
2574 {
2575         list_add_tail(&napi->poll_list, &sd->poll_list);
2576         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2577 }
2578
2579 /*
2580  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2581  * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2582  * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2583  * if hash is a canonical 4-tuple hash over transport ports.
2584  */
2585 void __skb_get_rxhash(struct sk_buff *skb)
2586 {
2587         int nhoff, hash = 0, poff;
2588         const struct ipv6hdr *ip6;
2589         const struct iphdr *ip;
2590         const struct vlan_hdr *vlan;
2591         u8 ip_proto;
2592         u32 addr1, addr2;
2593         u16 proto;
2594         union {
2595                 u32 v32;
2596                 u16 v16[2];
2597         } ports;
2598
2599         nhoff = skb_network_offset(skb);
2600         proto = skb->protocol;
2601
2602 again:
2603         switch (proto) {
2604         case __constant_htons(ETH_P_IP):
2605 ip:
2606                 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2607                         goto done;
2608
2609                 ip = (const struct iphdr *) (skb->data + nhoff);
2610                 if (ip_is_fragment(ip))
2611                         ip_proto = 0;
2612                 else
2613                         ip_proto = ip->protocol;
2614                 addr1 = (__force u32) ip->saddr;
2615                 addr2 = (__force u32) ip->daddr;
2616                 nhoff += ip->ihl * 4;
2617                 break;
2618         case __constant_htons(ETH_P_IPV6):
2619 ipv6:
2620                 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2621                         goto done;
2622
2623                 ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2624                 ip_proto = ip6->nexthdr;
2625                 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2626                 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2627                 nhoff += 40;
2628                 break;
2629         case __constant_htons(ETH_P_8021Q):
2630                 if (!pskb_may_pull(skb, sizeof(*vlan) + nhoff))
2631                         goto done;
2632                 vlan = (const struct vlan_hdr *) (skb->data + nhoff);
2633                 proto = vlan->h_vlan_encapsulated_proto;
2634                 nhoff += sizeof(*vlan);
2635                 goto again;
2636         case __constant_htons(ETH_P_PPP_SES):
2637                 if (!pskb_may_pull(skb, PPPOE_SES_HLEN + nhoff))
2638                         goto done;
2639                 proto = *((__be16 *) (skb->data + nhoff +
2640                                       sizeof(struct pppoe_hdr)));
2641                 nhoff += PPPOE_SES_HLEN;
2642                 switch (proto) {
2643                 case __constant_htons(PPP_IP):
2644                         goto ip;
2645                 case __constant_htons(PPP_IPV6):
2646                         goto ipv6;
2647                 default:
2648                         goto done;
2649                 }
2650         default:
2651                 goto done;
2652         }
2653
2654         switch (ip_proto) {
2655         case IPPROTO_GRE:
2656                 if (pskb_may_pull(skb, nhoff + 16)) {
2657                         u8 *h = skb->data + nhoff;
2658                         __be16 flags = *(__be16 *)h;
2659
2660                         /*
2661                          * Only look inside GRE if version zero and no
2662                          * routing
2663                          */
2664                         if (!(flags & (GRE_VERSION|GRE_ROUTING))) {
2665                                 proto = *(__be16 *)(h + 2);
2666                                 nhoff += 4;
2667                                 if (flags & GRE_CSUM)
2668                                         nhoff += 4;
2669                                 if (flags & GRE_KEY)
2670                                         nhoff += 4;
2671                                 if (flags & GRE_SEQ)
2672                                         nhoff += 4;
2673                                 goto again;
2674                         }
2675                 }
2676                 break;
2677         case IPPROTO_IPIP:
2678                 goto again;
2679         default:
2680                 break;
2681         }
2682
2683         ports.v32 = 0;
2684         poff = proto_ports_offset(ip_proto);
2685         if (poff >= 0) {
2686                 nhoff += poff;
2687                 if (pskb_may_pull(skb, nhoff + 4)) {
2688                         ports.v32 = * (__force u32 *) (skb->data + nhoff);
2689                         if (ports.v16[1] < ports.v16[0])
2690                                 swap(ports.v16[0], ports.v16[1]);
2691                         skb->l4_rxhash = 1;
2692                 }
2693         }
2694
2695         /* get a consistent hash (same value on both flow directions) */
2696         if (addr2 < addr1)
2697                 swap(addr1, addr2);
2698
2699         hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2700         if (!hash)
2701                 hash = 1;
2702
2703 done:
2704         skb->rxhash = hash;
2705 }
2706 EXPORT_SYMBOL(__skb_get_rxhash);
2707
2708 #ifdef CONFIG_RPS
2709
2710 /* One global table that all flow-based protocols share; get_rps_cpu() reads it under RCU to find the CPU wanted for a flow hash. */
2711 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2712 EXPORT_SYMBOL(rps_sock_flow_table);
2713
2714 struct jump_label_key rps_needed __read_mostly; /* tested with static_branch() in netif_rx() and netif_receive_skb() to gate the RPS path */
2715
2716 static struct rps_dev_flow *
2717 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2718             struct rps_dev_flow *rflow, u16 next_cpu)
2719 {
2720         if (next_cpu != RPS_NO_CPU) {
2721 #ifdef CONFIG_RFS_ACCEL
2722                 struct netdev_rx_queue *rxqueue;
2723                 struct rps_dev_flow_table *flow_table;
2724                 struct rps_dev_flow *old_rflow;
2725                 u32 flow_id;
2726                 u16 rxq_index;
2727                 int rc;
2728
2729                 /* Should we steer this flow to a different hardware queue? */
2730                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2731                     !(dev->features & NETIF_F_NTUPLE))
2732                         goto out;
2733                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2734                 if (rxq_index == skb_get_rx_queue(skb))
2735                         goto out;
2736
2737                 rxqueue = dev->_rx + rxq_index;
2738                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2739                 if (!flow_table)
2740                         goto out;
2741                 flow_id = skb->rxhash & flow_table->mask;
2742                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2743                                                         rxq_index, flow_id);
2744                 if (rc < 0)
2745                         goto out;
2746                 old_rflow = rflow;
2747                 rflow = &flow_table->flows[flow_id];
2748                 rflow->filter = rc;
2749                 if (old_rflow->filter == rflow->filter)
2750                         old_rflow->filter = RPS_NO_FILTER;
2751         out:
2752 #endif
2753                 rflow->last_qtail =
2754                         per_cpu(softnet_data, next_cpu).input_queue_head;
2755         }
2756
2757         rflow->cpu = next_cpu;
2758         return rflow;
2759 }
2760
2761 /*
2762  * get_rps_cpu is called from netif_receive_skb and returns the target
2763  * CPU from the RPS map of the receiving queue for a given skb.
2764  * rcu_read_lock must be held on entry.
2765  */
2766 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2767                        struct rps_dev_flow **rflowp)
2768 {
2769         struct netdev_rx_queue *rxqueue;
2770         struct rps_map *map;
2771         struct rps_dev_flow_table *flow_table;
2772         struct rps_sock_flow_table *sock_flow_table;
2773         int cpu = -1;
2774         u16 tcpu;
2775
2776         if (skb_rx_queue_recorded(skb)) {
2777                 u16 index = skb_get_rx_queue(skb);
2778                 if (unlikely(index >= dev->real_num_rx_queues)) {
2779                         WARN_ONCE(dev->real_num_rx_queues > 1,
2780                                   "%s received packet on queue %u, but number "
2781                                   "of RX queues is %u\n",
2782                                   dev->name, index, dev->real_num_rx_queues);
2783                         goto done;
2784                 }
2785                 rxqueue = dev->_rx + index;
2786         } else
2787                 rxqueue = dev->_rx;
2788
2789         map = rcu_dereference(rxqueue->rps_map);
2790         if (map) {
2791                 if (map->len == 1 &&
2792                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
2793                         tcpu = map->cpus[0];
2794                         if (cpu_online(tcpu))
2795                                 cpu = tcpu;
2796                         goto done;
2797                 }
2798         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2799                 goto done;
2800         }
2801
2802         skb_reset_network_header(skb);
2803         if (!skb_get_rxhash(skb))
2804                 goto done;
2805
2806         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2807         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2808         if (flow_table && sock_flow_table) {
2809                 u16 next_cpu;
2810                 struct rps_dev_flow *rflow;
2811
2812                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2813                 tcpu = rflow->cpu;
2814
2815                 next_cpu = sock_flow_table->ents[skb->rxhash &
2816                     sock_flow_table->mask];
2817
2818                 /*
2819                  * If the desired CPU (where last recvmsg was done) is
2820                  * different from current CPU (one in the rx-queue flow
2821                  * table entry), switch if one of the following holds:
2822                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2823                  *   - Current CPU is offline.
2824                  *   - The current CPU's queue tail has advanced beyond the
2825                  *     last packet that was enqueued using this table entry.
2826                  *     This guarantees that all previous packets for the flow
2827                  *     have been dequeued, thus preserving in order delivery.
2828                  */
2829                 if (unlikely(tcpu != next_cpu) &&
2830                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2831                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2832                       rflow->last_qtail)) >= 0))
2833                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2834
2835                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2836                         *rflowp = rflow;
2837                         cpu = tcpu;
2838                         goto done;
2839                 }
2840         }
2841
2842         if (map) {
2843                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2844
2845                 if (cpu_online(tcpu)) {
2846                         cpu = tcpu;
2847                         goto done;
2848                 }
2849         }
2850
2851 done:
2852         return cpu;
2853 }
2854
2855 #ifdef CONFIG_RFS_ACCEL
2856
2857 /**
2858  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2859  * @dev: Device on which the filter was set
2860  * @rxq_index: RX queue index
2861  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2862  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2863  *
2864  * Drivers that implement ndo_rx_flow_steer() should periodically call
2865  * this function for each installed filter and remove the filters for
2866  * which it returns %true.
2867  */
2868 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2869                          u32 flow_id, u16 filter_id)
2870 {
2871         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2872         struct rps_dev_flow_table *flow_table;
2873         struct rps_dev_flow *rflow;
2874         bool expire = true;
2875         int cpu;
2876
2877         rcu_read_lock();
2878         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2879         if (flow_table && flow_id <= flow_table->mask) {
2880                 rflow = &flow_table->flows[flow_id];
2881                 cpu = ACCESS_ONCE(rflow->cpu);
2882                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2883                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2884                            rflow->last_qtail) <
2885                      (int)(10 * flow_table->mask)))
2886                         expire = false;
2887         }
2888         rcu_read_unlock();
2889         return expire;
2890 }
2891 EXPORT_SYMBOL(rps_may_expire_flow);
2892
2893 #endif /* CONFIG_RFS_ACCEL */
2894
2895 /* Called from hardirq (IPI) context */
2896 static void rps_trigger_softirq(void *data)
2897 {
2898         struct softnet_data *sd = data;
2899
2900         ____napi_schedule(sd, &sd->backlog);
2901         sd->received_rps++;
2902 }
2903
2904 #endif /* CONFIG_RPS */
2905
/*
 * If @sd belongs to another CPU, push it onto this CPU's rps_ipi_list,
 * raise NET_RX_SOFTIRQ and return 1.  Return 0 when @sd is this CPU's
 * own softnet_data, or when RPS is not configured.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *local = &__get_cpu_var(softnet_data);

        if (sd == local)
                return 0;

        /* link sd in at the head of the pending-IPI list */
        sd->rps_ipi_next = local->rps_ipi_list;
        local->rps_ipi_list = sd;

        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        return 1;
#else
        return 0;
#endif /* CONFIG_RPS */
}
2926
2927 /*
2928  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2929  * queue (may be a remote CPU queue).
2930  */
2931 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2932                               unsigned int *qtail)
2933 {
2934         struct softnet_data *sd;
2935         unsigned long flags;
2936
2937         sd = &per_cpu(softnet_data, cpu);
2938
2939         local_irq_save(flags);
2940
2941         rps_lock(sd);
2942         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2943                 if (skb_queue_len(&sd->input_pkt_queue)) {
2944 enqueue:
2945                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2946                         input_queue_tail_incr_save(sd, qtail);
2947                         rps_unlock(sd);
2948                         local_irq_restore(flags);
2949                         return NET_RX_SUCCESS;
2950                 }
2951
2952                 /* Schedule NAPI for backlog device
2953                  * We can use non atomic operation since we own the queue lock
2954                  */
2955                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2956                         if (!rps_ipi_queued(sd))
2957                                 ____napi_schedule(sd, &sd->backlog);
2958                 }
2959                 goto enqueue;
2960         }
2961
2962         sd->dropped++;
2963         rps_unlock(sd);
2964
2965         local_irq_restore(flags);
2966
2967         atomic_long_inc(&skb->dev->rx_dropped);
2968         kfree_skb(skb);
2969         return NET_RX_DROP;
2970 }
2971
2972 /**
2973  *      netif_rx        -       post buffer to the network code
2974  *      @skb: buffer to post
2975  *
2976  *      This function receives a packet from a device driver and queues it for
2977  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2978  *      may be dropped during processing for congestion control or by the
2979  *      protocol layers.
2980  *
2981  *      return values:
2982  *      NET_RX_SUCCESS  (no congestion)
2983  *      NET_RX_DROP     (packet was dropped)
2984  *
2985  */
2986
2987 int netif_rx(struct sk_buff *skb)
2988 {
2989         int ret;
2990
2991         /* if netpoll wants it, pretend we never saw it */
2992         if (netpoll_rx(skb))
2993                 return NET_RX_DROP;
2994
2995         net_timestamp_check(netdev_tstamp_prequeue, skb);
2996
2997         trace_netif_rx(skb);
2998 #ifdef CONFIG_RPS
2999         if (static_branch(&rps_needed)) {
3000                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3001                 int cpu;
3002
3003                 preempt_disable();
3004                 rcu_read_lock();
3005
3006                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3007                 if (cpu < 0)
3008                         cpu = smp_processor_id();
3009
3010                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3011
3012                 rcu_read_unlock();
3013                 preempt_enable();
3014         } else
3015 #endif
3016         {
3017                 unsigned int qtail;
3018                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3019                 put_cpu();
3020         }
3021         return ret;
3022 }
3023 EXPORT_SYMBOL(netif_rx);
3024
/*
 * netif_rx_ni - netif_rx() followed by running any softirq that is
 * pending on this CPU, all with preemption disabled.  Returns what
 * netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int ret;

        preempt_disable();

        ret = netif_rx(skb);
        if (local_softirq_pending())
                do_softirq();

        preempt_enable();
        return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3038
3039 static void net_tx_action(struct softirq_action *h)
3040 {
3041         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3042
3043         if (sd->completion_queue) {
3044                 struct sk_buff *clist;
3045
3046                 local_irq_disable();
3047                 clist = sd->completion_queue;
3048                 sd->completion_queue = NULL;
3049                 local_irq_enable();
3050
3051                 while (clist) {
3052                         struct sk_buff *skb = clist;
3053                         clist = clist->next;
3054
3055                         WARN_ON(atomic_read(&skb->users));
3056                         trace_kfree_skb(skb, net_tx_action);
3057                         __kfree_skb(skb);
3058                 }
3059         }
3060
3061         if (sd->output_queue) {
3062                 struct Qdisc *head;
3063
3064                 local_irq_disable();
3065                 head = sd->output_queue;
3066                 sd->output_queue = NULL;
3067                 sd->output_queue_tailp = &sd->output_queue;
3068                 local_irq_enable();
3069
3070                 while (head) {
3071                         struct Qdisc *q = head;
3072                         spinlock_t *root_lock;
3073
3074                         head = head->next_sched;
3075
3076                         root_lock = qdisc_lock(q);
3077                         if (spin_trylock(root_lock)) {
3078                                 smp_mb__before_clear_bit();
3079                                 clear_bit(__QDISC_STATE_SCHED,
3080                                           &q->state);
3081                                 qdisc_run(q);
3082                                 spin_unlock(root_lock);
3083                         } else {
3084                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3085                                               &q->state)) {
3086                                         __netif_reschedule(q);
3087                                 } else {
3088                                         smp_mb__before_clear_bit();
3089                                         clear_bit(__QDISC_STATE_SCHED,
3090                                                   &q->state);
3091                                 }
3092                         }
3093                 }
3094         }
3095 }
3096
3097 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3098     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3099 /* This hook is defined here for ATM LANE; it is only built when both bridging and ATM LANE are configured (built in or modular). */
3100 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3101                              unsigned char *addr) __read_mostly;
3102 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3103 #endif
3104
3105 #ifdef CONFIG_NET_CLS_ACT
3106 /* TODO: Maybe we should just force sch_ingress to be compiled in
3107  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3108  * a compare and 2 stores extra right now if we dont have it on
3109  * but have CONFIG_NET_CLS_ACT
3110  * NOTE: This doesn't stop any functionality; if you dont have
3111  * the ingress scheduler, you just can't add policies on ingress.
3112  *
3113  */
3114 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3115 {
3116         struct net_device *dev = skb->dev;
3117         u32 ttl = G_TC_RTTL(skb->tc_verd);
3118         int result = TC_ACT_OK;
3119         struct Qdisc *q;
3120
3121         if (unlikely(MAX_RED_LOOP < ttl++)) {
3122                 if (net_ratelimit())
3123                         pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
3124                                skb->skb_iif, dev->ifindex);
3125                 return TC_ACT_SHOT;
3126         }
3127
3128         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3129         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3130
3131         q = rxq->qdisc;
3132         if (q != &noop_qdisc) {
3133                 spin_lock(qdisc_lock(q));
3134                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3135                         result = qdisc_enqueue_root(skb, q);
3136                 spin_unlock(qdisc_lock(q));
3137         }
3138
3139         return result;
3140 }
3141
3142 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3143                                          struct packet_type **pt_prev,
3144                                          int *ret, struct net_device *orig_dev)
3145 {
3146         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3147
3148         if (!rxq || rxq->qdisc == &noop_qdisc)
3149                 goto out;
3150
3151         if (*pt_prev) {
3152                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3153                 *pt_prev = NULL;
3154         }
3155
3156         switch (ing_filter(skb, rxq)) {
3157         case TC_ACT_SHOT:
3158         case TC_ACT_STOLEN:
3159                 kfree_skb(skb);
3160                 return NULL;
3161         }
3162
3163 out:
3164         skb->tc_verd = 0;
3165         return skb;
3166 }
3167 #endif
3168
3169 /**
3170  *      netdev_rx_handler_register - register receive handler
3171  *      @dev: device to register a handler for
3172  *      @rx_handler: receive handler to register
3173  *      @rx_handler_data: data pointer that is used by rx handler
3174  *
3175  *      Register a receive handler for a device. This handler will then be
3176  *      called from __netif_receive_skb. A negative errno code is returned
3177  *      on a failure: -EBUSY when the device already has a handler.
3178  *
3179  *      The caller must hold the rtnl_mutex.
3180  *
3181  *      For a general description of rx_handler, see enum rx_handler_result.
3182  */
3183 int netdev_rx_handler_register(struct net_device *dev,
3184                                rx_handler_func_t *rx_handler,
3185                                void *rx_handler_data)
3186 {
3187         ASSERT_RTNL();
3188
3189         if (dev->rx_handler)
3190                 return -EBUSY;
3191
3192         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); /* the data is published before the handler itself */
3193         rcu_assign_pointer(dev->rx_handler, rx_handler);
3194
3195         return 0;
3196 }
3197 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3198
3199 /**
3200  *      netdev_rx_handler_unregister - unregister receive handler
3201  *      @dev: device to unregister a handler from
3202  *
3203  *      Unregister a receive handler from a device.
3204  *
3205  *      The caller must hold the rtnl_mutex.
3206  */
3207 void netdev_rx_handler_unregister(struct net_device *dev)
3208 {
3209
3210         ASSERT_RTNL();
3211         RCU_INIT_POINTER(dev->rx_handler, NULL); /* the handler is cleared first, then its data */
3212         RCU_INIT_POINTER(dev->rx_handler_data, NULL); /* NOTE(review): nothing waits for readers between the two stores - confirm handlers tolerate NULL data */
3213 }
3214 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3215
/*
 * __netif_receive_skb - run one received packet through the taps, the
 * optional ingress filter, VLAN handling, the device rx_handler and
 * finally the protocol handlers registered for skb->protocol.
 *
 * Delivery is deferred by one step: each matching handler is remembered
 * in pt_prev and handed the skb through deliver_skb() only when the
 * next match (or the next stage) comes along; the last match is called
 * directly via pt_prev->func().  With no match at all the skb is freed
 * and counted in the device's rx_dropped.
 *
 * Returns the result of the last handler that was called, or
 * NET_RX_DROP when netpoll takes the packet or nobody wanted it.
 */
3216 static int __netif_receive_skb(struct sk_buff *skb)
3217 {
3218         struct packet_type *ptype, *pt_prev;
3219         rx_handler_func_t *rx_handler;
3220         struct net_device *orig_dev;
3221         struct net_device *null_or_dev;
3222         bool deliver_exact = false;
3223         int ret = NET_RX_DROP;
3224         __be16 type;
3225
3226         net_timestamp_check(!netdev_tstamp_prequeue, skb); /* condition is the inverse of the one used in netif_rx()/netif_receive_skb() */
3227
3228         trace_netif_receive_skb(skb);
3229
3230         /* if we've gotten here through NAPI, check netpoll */
3231         if (netpoll_receive_skb(skb))
3232                 return NET_RX_DROP;
3233
3234         if (!skb->skb_iif)
3235                 skb->skb_iif = skb->dev->ifindex;
3236         orig_dev = skb->dev; /* remembered because skb->dev may differ after a further round */
3237
3238         skb_reset_network_header(skb);
3239         skb_reset_transport_header(skb);
3240         skb_reset_mac_len(skb);
3241
3242         pt_prev = NULL;
3243
3244         rcu_read_lock();
3245
3246 another_round: /* re-entered after vlan_do_receive() succeeds or a handler returns RX_HANDLER_ANOTHER */
3247
3248         __this_cpu_inc(softnet_data.processed);
3249
3250         if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3251                 skb = vlan_untag(skb);
3252                 if (unlikely(!skb))
3253                         goto out;
3254         }
3255
3256 #ifdef CONFIG_NET_CLS_ACT
3257         if (skb->tc_verd & TC_NCLS) { /* flag set: clear it and skip both the taps and the ingress filter */
3258                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3259                 goto ncls;
3260         }
3261 #endif
3262
3263         list_for_each_entry_rcu(ptype, &ptype_all, list) { /* taps: entries bound to no device or to skb->dev */
3264                 if (!ptype->dev || ptype->dev == skb->dev) {
3265                         if (pt_prev)
3266                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3267                         pt_prev = ptype;
3268                 }
3269         }
3270
3271 #ifdef CONFIG_NET_CLS_ACT
3272         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3273         if (!skb) /* handle_ing() freed the packet */
3274                 goto out;
3275 ncls:
3276 #endif
3277
3278         rx_handler = rcu_dereference(skb->dev->rx_handler);
3279         if (vlan_tx_tag_present(skb)) {
3280                 if (pt_prev) {
3281                         ret = deliver_skb(skb, pt_prev, orig_dev);
3282                         pt_prev = NULL;
3283                 }
3284                 if (vlan_do_receive(&skb, !rx_handler))
3285                         goto another_round;
3286                 else if (unlikely(!skb))
3287                         goto out;
3288         }
3289
3290         if (rx_handler) {
3291                 if (pt_prev) {
3292                         ret = deliver_skb(skb, pt_prev, orig_dev);
3293                         pt_prev = NULL;
3294                 }
3295                 switch (rx_handler(&skb)) {
3296                 case RX_HANDLER_CONSUMED:
3297                         goto out;
3298                 case RX_HANDLER_ANOTHER:
3299                         goto another_round;
3300                 case RX_HANDLER_EXACT:
3301                         deliver_exact = true; /* deliberate fall through to RX_HANDLER_PASS */
3302                 case RX_HANDLER_PASS:
3303                         break;
3304                 default:
3305                         BUG();
3306                 }
3307         }
3308
3309         /* deliver only exact match when indicated */
3310         null_or_dev = deliver_exact ? skb->dev : NULL;
3311
3312         type = skb->protocol;
3313         list_for_each_entry_rcu(ptype,
3314                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3315                 if (ptype->type == type &&
3316                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3317                      ptype->dev == orig_dev)) {
3318                         if (pt_prev)
3319                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3320                         pt_prev = ptype;
3321                 }
3322         }
3323
3324         if (pt_prev) {
3325                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); /* last match is called directly, not through deliver_skb() */
3326         } else {
3327                 atomic_long_inc(&skb->dev->rx_dropped);
3328                 kfree_skb(skb);
3329                 /* Jamal, now you will not able to escape explaining
3330                  * me how you were going to use this. :-)
3331                  */
3332                 ret = NET_RX_DROP;
3333         }
3334
3335 out:
3336         rcu_read_unlock();
3337         return ret;
3338 }
3339
3340 /**
3341  *      netif_receive_skb - process receive buffer from network
3342  *      @skb: buffer to process
3343  *
3344  *      netif_receive_skb() is the main receive data processing function.
3345  *      It always succeeds. The buffer may be dropped during processing
3346  *      for congestion control or by the protocol layers.
3347  *
3348  *      This function may only be called from softirq context and interrupts
3349  *      should be enabled.
3350  *
3351  *      Return values (usually ignored):
3352  *      NET_RX_SUCCESS: no congestion
3353  *      NET_RX_DROP: packet was dropped
3354  */
3355 int netif_receive_skb(struct sk_buff *skb)
3356 {
3357         net_timestamp_check(netdev_tstamp_prequeue, skb);
3358
3359         if (skb_defer_rx_timestamp(skb))
3360                 return NET_RX_SUCCESS;
3361
3362 #ifdef CONFIG_RPS
3363         if (static_branch(&rps_needed)) {
3364                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3365                 int cpu, ret;
3366
3367                 rcu_read_lock();
3368
3369                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3370
3371                 if (cpu >= 0) {
3372                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3373                         rcu_read_unlock();
3374                         return ret;
3375                 }
3376                 rcu_read_unlock();
3377         }
3378 #endif
3379         return __netif_receive_skb(skb);
3380 }
3381 EXPORT_SYMBOL(netif_receive_skb);
3382
3383 /* Network device is going away, flush any packets still pending
3384  * Called with irqs disabled.
3385  */
3386 static void flush_backlog(void *arg)
3387 {
3388         struct net_device *dev = arg;
3389         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3390         struct sk_buff *skb, *tmp;
3391
3392         rps_lock(sd);
3393         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3394                 if (skb->dev == dev) {
3395                         __skb_unlink(skb, &sd->input_pkt_queue);
3396                         kfree_skb(skb);
3397                         input_queue_head_incr(sd);
3398                 }
3399         }
3400         rps_unlock(sd);
3401
3402         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3403                 if (skb->dev == dev) {
3404                         __skb_unlink(skb, &sd->process_queue);
3405                         kfree_skb(skb);
3406                         input_queue_head_incr(sd);
3407                 }
3408         }
3409 }
3410
3411 static int napi_gro_complete(struct sk_buff *skb)
3412 {
3413         struct packet_type *ptype;
3414         __be16 type = skb->protocol;
3415         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3416         int err = -ENOENT;
3417
3418         if (NAPI_GRO_CB(skb)->count == 1) {
3419                 skb_shinfo(skb)->gso_size = 0;
3420                 goto out;
3421         }
3422
3423         rcu_read_lock();
3424         list_for_each_entry_rcu(ptype, head, list) {
3425                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3426                         continue;
3427
3428                 err = ptype->gro_complete(skb);
3429                 break;
3430         }
3431         rcu_read_unlock();
3432
3433         if (err) {
3434                 WARN_ON(&ptype->list == head);
3435                 kfree_skb(skb);
3436                 return NET_RX_SUCCESS;
3437         }
3438
3439 out:
3440         return netif_receive_skb(skb);
3441 }
3442
3443 inline void napi_gro_flush(struct napi_struct *napi)
3444 {
3445         struct sk_buff *skb, *next;
3446
3447         for (skb = napi->gro_list; skb; skb = next) {
3448                 next = skb->next;
3449                 skb->next = NULL;
3450                 napi_gro_complete(skb);
3451         }
3452
3453         napi->gro_count = 0;
3454         napi->gro_list = NULL;
3455 }
3456 EXPORT_SYMBOL(napi_gro_flush);
3457
/*
 * dev_gro_receive - try to merge @skb into one of the flows held by @napi
 *
 * Returns GRO_NORMAL when the buffer must take the regular receive path,
 * GRO_HELD when it was added to napi->gro_list, or GRO_MERGED /
 * GRO_MERGED_FREE when the protocol handler reported it as belonging to
 * an existing flow.  Except for the same-flow case, the function leaves
 * through the "pull" section, which copies any header bytes GRO parsed
 * out of frag0 into the linear area.
 */
enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
        struct sk_buff **pp = NULL;
        struct packet_type *ptype;
        __be16 type = skb->protocol;
        struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
        int same_flow;
        int mac_len;
        enum gro_result ret;

        /* GRO disabled on the device, or netpoll active for this skb. */
        if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
                goto normal;

        /* Already-GSO buffers and buffers with a frag list are not merged. */
        if (skb_is_gso(skb) || skb_has_frag_list(skb))
                goto normal;

        rcu_read_lock();
        list_for_each_entry_rcu(ptype, head, list) {
                /* Want: same protocol, not bound to a device, GRO capable. */
                if (ptype->type != type || ptype->dev || !ptype->gro_receive)
                        continue;

                skb_set_network_header(skb, skb_gro_offset(skb));
                mac_len = skb->network_header - skb->mac_header;
                skb->mac_len = mac_len;
                NAPI_GRO_CB(skb)->same_flow = 0;
                NAPI_GRO_CB(skb)->flush = 0;
                NAPI_GRO_CB(skb)->free = 0;

                pp = ptype->gro_receive(&napi->gro_list, skb);
                break;
        }
        rcu_read_unlock();

        /* The walk reached the list head again: no handler was found. */
        if (&ptype->list == head)
                goto normal;

        same_flow = NAPI_GRO_CB(skb)->same_flow;
        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

        /* The handler returned a held buffer: unlink and complete it. */
        if (pp) {
                struct sk_buff *nskb = *pp;

                *pp = nskb->next;
                nskb->next = NULL;
                napi_gro_complete(nskb);
                napi->gro_count--;
        }

        if (same_flow)
                goto ok;

        /* Flush requested, or too many flows held already. */
        if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
                goto normal;

        /* Start a new held flow with this buffer at the head of the list. */
        napi->gro_count++;
        NAPI_GRO_CB(skb)->count = 1;
        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
        skb->next = napi->gro_list;
        napi->gro_list = skb;
        ret = GRO_HELD;

pull:
        /*
         * GRO consumed more bytes than the linear area holds: copy the
         * missing "grow" bytes from frag0 to the tail and shrink frag 0.
         */
        if (skb_headlen(skb) < skb_gro_offset(skb)) {
                int grow = skb_gro_offset(skb) - skb_headlen(skb);

                BUG_ON(skb->end - skb->tail < grow);

                memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

                skb->tail += grow;
                skb->data_len -= grow;

                skb_shinfo(skb)->frags[0].page_offset += grow;
                skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);

                /* Frag 0 is now empty: release it and shift the rest down. */
                if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
                        skb_frag_unref(skb, 0);
                        memmove(skb_shinfo(skb)->frags,
                                skb_shinfo(skb)->frags + 1,
                                --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
                }
        }

ok:
        return ret;

normal:
        ret = GRO_NORMAL;
        goto pull;
}
EXPORT_SYMBOL(dev_gro_receive);
3549
3550 static inline gro_result_t
3551 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3552 {
3553         struct sk_buff *p;
3554
3555         for (p = napi->gro_list; p; p = p->next) {
3556                 unsigned long diffs;
3557
3558                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3559                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3560                 diffs |= compare_ether_header(skb_mac_header(p),
3561                                               skb_gro_mac_header(skb));
3562                 NAPI_GRO_CB(p)->same_flow = !diffs;
3563                 NAPI_GRO_CB(p)->flush = 0;
3564         }
3565
3566         return dev_gro_receive(napi, skb);
3567 }
3568
3569 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3570 {
3571         switch (ret) {
3572         case GRO_NORMAL:
3573                 if (netif_receive_skb(skb))
3574                         ret = GRO_DROP;
3575                 break;
3576
3577         case GRO_DROP:
3578         case GRO_MERGED_FREE:
3579                 kfree_skb(skb);
3580                 break;
3581
3582         case GRO_HELD:
3583         case GRO_MERGED:
3584                 break;
3585         }
3586
3587         return ret;
3588 }
3589 EXPORT_SYMBOL(napi_skb_finish);
3590
3591 void skb_gro_reset_offset(struct sk_buff *skb)
3592 {
3593         NAPI_GRO_CB(skb)->data_offset = 0;
3594         NAPI_GRO_CB(skb)->frag0 = NULL;
3595         NAPI_GRO_CB(skb)->frag0_len = 0;
3596
3597         if (skb->mac_header == skb->tail &&
3598             !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3599                 NAPI_GRO_CB(skb)->frag0 =
3600                         skb_frag_address(&skb_shinfo(skb)->frags[0]);
3601                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3602         }
3603 }
3604 EXPORT_SYMBOL(skb_gro_reset_offset);
3605
3606 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3607 {
3608         skb_gro_reset_offset(skb);
3609
3610         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3611 }
3612 EXPORT_SYMBOL(napi_gro_receive);
3613
3614 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3615 {
3616         __skb_pull(skb, skb_headlen(skb));
3617         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3618         skb->vlan_tci = 0;
3619         skb->dev = napi->dev;
3620         skb->skb_iif = 0;
3621
3622         napi->skb = skb;
3623 }
3624
3625 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3626 {
3627         struct sk_buff *skb = napi->skb;
3628
3629         if (!skb) {
3630                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3631                 if (skb)
3632                         napi->skb = skb;
3633         }
3634         return skb;
3635 }
3636 EXPORT_SYMBOL(napi_get_frags);
3637
3638 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3639                                gro_result_t ret)
3640 {
3641         switch (ret) {
3642         case GRO_NORMAL:
3643         case GRO_HELD:
3644                 skb->protocol = eth_type_trans(skb, skb->dev);
3645
3646                 if (ret == GRO_HELD)
3647                         skb_gro_pull(skb, -ETH_HLEN);
3648                 else if (netif_receive_skb(skb))
3649                         ret = GRO_DROP;
3650                 break;
3651
3652         case GRO_DROP:
3653         case GRO_MERGED_FREE:
3654                 napi_reuse_skb(napi, skb);
3655                 break;
3656
3657         case GRO_MERGED:
3658                 break;
3659         }
3660
3661         return ret;
3662 }
3663 EXPORT_SYMBOL(napi_frags_finish);
3664
3665 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3666 {
3667         struct sk_buff *skb = napi->skb;
3668         struct ethhdr *eth;
3669         unsigned int hlen;
3670         unsigned int off;
3671
3672         napi->skb = NULL;
3673
3674         skb_reset_mac_header(skb);
3675         skb_gro_reset_offset(skb);
3676
3677         off = skb_gro_offset(skb);
3678         hlen = off + sizeof(*eth);
3679         eth = skb_gro_header_fast(skb, off);
3680         if (skb_gro_header_hard(skb, hlen)) {
3681                 eth = skb_gro_header_slow(skb, hlen, off);
3682                 if (unlikely(!eth)) {
3683                         napi_reuse_skb(napi, skb);
3684                         skb = NULL;
3685                         goto out;
3686                 }
3687         }
3688
3689         skb_gro_pull(skb, sizeof(*eth));
3690
3691         /*
3692          * This works because the only protocols we care about don't require
3693          * special handling.  We'll fix it up properly at the end.
3694          */
3695         skb->protocol = eth->h_proto;
3696
3697 out:
3698         return skb;
3699 }
3700 EXPORT_SYMBOL(napi_frags_skb);
3701
3702 gro_result_t napi_gro_frags(struct napi_struct *napi)
3703 {
3704         struct sk_buff *skb = napi_frags_skb(napi);
3705
3706         if (!skb)
3707                 return GRO_DROP;
3708
3709         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3710 }
3711 EXPORT_SYMBOL(napi_gro_frags);
3712
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: called with local irqs disabled, returns with local irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *pending = sd->rps_ipi_list;

        if (!pending) {
                local_irq_enable();
                return;
        }

        /* Detach the whole list while irqs are still off. */
        sd->rps_ipi_list = NULL;
        local_irq_enable();

        /* Kick RPS processing on every remote cpu that is still online. */
        while (pending) {
                struct softnet_data *following = pending->rps_ipi_next;

                if (cpu_online(pending->cpu))
                        __smp_call_function_single(pending->cpu,
                                                   &pending->csd, 0);
                pending = following;
        }
#else
        local_irq_enable();
#endif
}
3740
/*
 * process_backlog - NAPI poll handler of the per-cpu backlog
 * @napi:  the backlog napi_struct embedded in a struct softnet_data
 * @quota: maximum number of packets to process in this call
 *
 * Delivers packets from sd->process_queue through __netif_receive_skb(),
 * refilling that queue from sd->input_pkt_queue when it runs empty.
 * Returns the number of packets processed.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
        int work = 0;
        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

#ifdef CONFIG_RPS
        /* Check if we have pending ipi, its better to send them now,
         * not waiting net_rx_action() end.
         */
        if (sd->rps_ipi_list) {
                local_irq_disable();
                net_rps_action_and_irq_enable(sd);
        }
#endif
        napi->weight = weight_p;
        local_irq_disable();
        while (work < quota) {
                struct sk_buff *skb;
                unsigned int qlen;

                /* Irqs are on only while a packet is being delivered. */
                while ((skb = __skb_dequeue(&sd->process_queue))) {
                        local_irq_enable();
                        __netif_receive_skb(skb);
                        local_irq_disable();
                        input_queue_head_incr(sd);
                        if (++work >= quota) {
                                local_irq_enable();
                                return work;
                        }
                }

                /* process_queue is empty: move over everything queued so far. */
                rps_lock(sd);
                qlen = skb_queue_len(&sd->input_pkt_queue);
                if (qlen)
                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
                                                   &sd->process_queue);

                /* What was just moved fits in the remaining quota. */
                if (qlen < quota - work) {
                        /*
                         * Inline a custom version of __napi_complete().
                         * only current cpu owns and manipulates this napi,
                         * and NAPI_STATE_SCHED is the only possible flag set on backlog.
                         * we can use a plain write instead of clear_bit(),
                         * and we dont need an smp_mb() memory barrier.
                         */
                        list_del(&napi->poll_list);
                        napi->state = 0;

                        /* Shrink the quota so the outer loop stops after these. */
                        quota = work + qlen;
                }
                rps_unlock(sd);
        }
        local_irq_enable();

        return work;
}
3797
3798 /**
3799  * __napi_schedule - schedule for receive
3800  * @n: entry to schedule
3801  *
3802  * The entry's receive function will be scheduled to run
3803  */
3804 void __napi_schedule(struct napi_struct *n)
3805 {
3806         unsigned long flags;
3807
3808         local_irq_save(flags);
3809         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3810         local_irq_restore(flags);
3811 }
3812 EXPORT_SYMBOL(__napi_schedule);
3813
3814 void __napi_complete(struct napi_struct *n)
3815 {
3816         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3817         BUG_ON(n->gro_list);
3818
3819         list_del(&n->poll_list);
3820         smp_mb__before_clear_bit();
3821         clear_bit(NAPI_STATE_SCHED, &n->state);
3822 }
3823 EXPORT_SYMBOL(__napi_complete);
3824
3825 void napi_complete(struct napi_struct *n)
3826 {
3827         unsigned long flags;
3828
3829         /*
3830          * don't let napi dequeue from the cpu poll list
3831          * just in case its running on a different cpu
3832          */
3833         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3834                 return;
3835
3836         napi_gro_flush(n);
3837         local_irq_save(flags);
3838         __napi_complete(n);
3839         local_irq_restore(flags);
3840 }
3841 EXPORT_SYMBOL(napi_complete);
3842
3843 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3844                     int (*poll)(struct napi_struct *, int), int weight)
3845 {
3846         INIT_LIST_HEAD(&napi->poll_list);
3847         napi->gro_count = 0;
3848         napi->gro_list = NULL;
3849         napi->skb = NULL;
3850         napi->poll = poll;
3851         napi->weight = weight;
3852         list_add(&napi->dev_list, &dev->napi_list);
3853         napi->dev = dev;
3854 #ifdef CONFIG_NETPOLL
3855         spin_lock_init(&napi->poll_lock);
3856         napi->poll_owner = -1;
3857 #endif
3858         set_bit(NAPI_STATE_SCHED, &napi->state);
3859 }
3860 EXPORT_SYMBOL(netif_napi_add);
3861
3862 void netif_napi_del(struct napi_struct *napi)
3863 {
3864         struct sk_buff *skb, *next;
3865
3866         list_del_init(&napi->dev_list);
3867         napi_free_frags(napi);
3868
3869         for (skb = napi->gro_list; skb; skb = next) {
3870                 next = skb->next;
3871                 skb->next = NULL;
3872                 kfree_skb(skb);
3873         }
3874
3875         napi->gro_list = NULL;
3876         napi->gro_count = 0;
3877 }
3878 EXPORT_SYMBOL(netif_napi_del);
3879
/*
 * net_rx_action - poll the NAPI instances queued on this cpu
 *
 * Polls the head of this cpu's sd->poll_list until the list is empty,
 * the budget (netdev_budget) is used up, or more than 2 jiffies have
 * passed.  In the latter two cases time_squeeze is bumped and
 * NET_RX_SOFTIRQ is raised again.  Pending RPS IPIs are sent on the
 * way out.
 */
static void net_rx_action(struct softirq_action *h)
{
        struct softnet_data *sd = &__get_cpu_var(softnet_data);
        unsigned long time_limit = jiffies + 2;
        int budget = netdev_budget;
        void *have;

        local_irq_disable();

        while (!list_empty(&sd->poll_list)) {
                struct napi_struct *n;
                int work, weight;

                /* If softirq window is exhausted then punt.
                 * Allow this to run for 2 jiffies since which will allow
                 * an average latency of 1.5/HZ.
                 */
                if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
                        goto softnet_break;

                local_irq_enable();

                /* Even though interrupts have been re-enabled, this
                 * access is safe because interrupts can only add new
                 * entries to the tail of this list, and only ->poll()
                 * calls can remove this head entry from the list.
                 */
                n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

                have = netpoll_poll_lock(n);

                weight = n->weight;

                /* This NAPI_STATE_SCHED test is for avoiding a race
                 * with netpoll's poll_napi().  Only the entity which
                 * obtains the lock and sees NAPI_STATE_SCHED set will
                 * actually make the ->poll() call.  Therefore we avoid
                 * accidentally calling ->poll() when NAPI is not scheduled.
                 */
                work = 0;
                if (test_bit(NAPI_STATE_SCHED, &n->state)) {
                        work = n->poll(n, weight);
                        trace_napi_poll(n);
                }

                /* A poll callback must not report more work than its weight. */
                WARN_ON_ONCE(work > weight);

                budget -= work;

                local_irq_disable();

                /* Drivers must not modify the NAPI state if they
                 * consume the entire weight.  In such cases this code
                 * still "owns" the NAPI instance and therefore can
                 * move the instance around on the list at-will.
                 */
                if (unlikely(work == weight)) {
                        if (unlikely(napi_disable_pending(n))) {
                                /* napi_complete() is called with irqs on. */
                                local_irq_enable();
                                napi_complete(n);
                                local_irq_disable();
                        } else
                                list_move_tail(&n->poll_list, &sd->poll_list);
                }

                netpoll_poll_unlock(have);
        }
out:
        /* Sends pending RPS IPIs and re-enables local irqs. */
        net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
        /*
         * There may not be any more sk_buffs coming right now, so push
         * any pending DMA copies to hardware
         */
        dma_issue_pending_all();
#endif

        return;

softnet_break:
        /* Out of budget or time: count it and ask for another softirq run. */
        sd->time_squeeze++;
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        goto out;
}
3965
3966 static gifconf_func_t *gifconf_list[NPROTO];
3967
3968 /**
3969  *      register_gifconf        -       register a SIOCGIF handler
3970  *      @family: Address family
3971  *      @gifconf: Function handler
3972  *
3973  *      Register protocol dependent address dumping routines. The handler
3974  *      that is passed must not be freed or reused until it has been replaced
3975  *      by another handler.
3976  */
3977 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3978 {
3979         if (family >= NPROTO)
3980                 return -EINVAL;
3981         gifconf_list[family] = gifconf;
3982         return 0;
3983 }
3984 EXPORT_SYMBOL(register_gifconf);
3985
3986
3987 /*
3988  *      Map an interface index to its name (SIOCGIFNAME)
3989  */
3990
3991 /*
3992  *      We need this ioctl for efficient implementation of the
3993  *      if_indextoname() function required by the IPv6 API.  Without
3994  *      it, we would have to search all the interfaces to find a
3995  *      match.  --pb
3996  */
3997
3998 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3999 {
4000         struct net_device *dev;
4001         struct ifreq ifr;
4002
4003         /*
4004          *      Fetch the caller's info block.
4005          */
4006
4007         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4008                 return -EFAULT;
4009
4010         rcu_read_lock();
4011         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4012         if (!dev) {
4013                 rcu_read_unlock();
4014                 return -ENODEV;
4015         }
4016
4017         strcpy(ifr.ifr_name, dev->name);
4018         rcu_read_unlock();
4019
4020         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4021                 return -EFAULT;
4022         return 0;
4023 }
4024
4025 /*
4026  *      Perform a SIOCGIFCONF call. This structure will change
4027  *      size eventually, and there is nothing I can do about it.
4028  *      Thus we will need a 'compatibility mode'.
4029  */
4030
4031 static int dev_ifconf(struct net *net, char __user *arg)
4032 {
4033         struct ifconf ifc;
4034         struct net_device *dev;
4035         char __user *pos;
4036         int len;
4037         int total;
4038         int i;
4039
4040         /*
4041          *      Fetch the caller's info block.
4042          */
4043
4044         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4045                 return -EFAULT;
4046
4047         pos = ifc.ifc_buf;
4048         len = ifc.ifc_len;
4049
4050         /*
4051          *      Loop over the interfaces, and write an info block for each.
4052          */
4053
4054         total = 0;
4055         for_each_netdev(net, dev) {
4056                 for (i = 0; i < NPROTO; i++) {
4057                         if (gifconf_list[i]) {
4058                                 int done;
4059                                 if (!pos)
4060                                         done = gifconf_list[i](dev, NULL, 0);
4061                                 else
4062                                         done = gifconf_list[i](dev, pos + total,
4063                                                                len - total);
4064                                 if (done < 0)
4065                                         return -EFAULT;
4066                                 total += done;
4067                         }
4068                 }
4069         }
4070
4071         /*
4072          *      All done.  Write the updated control block back to the caller.
4073          */
4074         ifc.ifc_len = total;
4075
4076         /*
4077          *      Both BSD and Solaris return 0 here, so we do too.
4078          */
4079         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4080 }
4081
4082 #ifdef CONFIG_PROC_FS
4083
4084 #define BUCKET_SPACE (32 - NETDEV_HASHBITS)
4085
4086 struct dev_iter_state {
4087         struct seq_net_private p;
4088         unsigned int pos; /* bucket << BUCKET_SPACE + offset */
4089 };
4090
4091 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4092 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4093 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4094
4095 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq)
4096 {
4097         struct dev_iter_state *state = seq->private;
4098         struct net *net = seq_file_net(seq);
4099         struct net_device *dev;
4100         struct hlist_node *p;
4101         struct hlist_head *h;
4102         unsigned int count, bucket, offset;
4103
4104         bucket = get_bucket(state->pos);
4105         offset = get_offset(state->pos);
4106         h = &net->dev_name_head[bucket];
4107         count = 0;
4108         hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4109                 if (count++ == offset) {
4110                         state->pos = set_bucket_offset(bucket, count);
4111                         return dev;
4112                 }
4113         }
4114
4115         return NULL;
4116 }
4117
4118 static inline struct net_device *dev_from_new_bucket(struct seq_file *seq)
4119 {
4120         struct dev_iter_state *state = seq->private;
4121         struct net_device *dev;
4122         unsigned int bucket;
4123
4124         bucket = get_bucket(state->pos);
4125         do {
4126                 dev = dev_from_same_bucket(seq);
4127                 if (dev)
4128                         return dev;
4129
4130                 bucket++;
4131                 state->pos = set_bucket_offset(bucket, 0);
4132         } while (bucket < NETDEV_HASHENTRIES);
4133
4134         return NULL;
4135 }
4136
4137 /*
4138  *      This is invoked by the /proc filesystem handler to display a device
4139  *      in detail.
4140  */
4141 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4142         __acquires(RCU)
4143 {
4144         struct dev_iter_state *state = seq->private;
4145
4146         rcu_read_lock();
4147         if (!*pos)
4148                 return SEQ_START_TOKEN;
4149
4150         /* check for end of the hash */
4151         if (state->pos == 0 && *pos > 1)
4152                 return NULL;
4153
4154         return dev_from_new_bucket(seq);
4155 }
4156
4157 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4158 {
4159         struct net_device *dev;
4160
4161         ++*pos;
4162
4163         if (v == SEQ_START_TOKEN)
4164                 return dev_from_new_bucket(seq);
4165
4166         dev = dev_from_same_bucket(seq);
4167         if (dev)
4168                 return dev;
4169
4170         return dev_from_new_bucket(seq);
4171 }
4172
4173 void dev_seq_stop(struct seq_file *seq, void *v)
4174         __releases(RCU)
4175 {
4176         rcu_read_unlock();
4177 }
4178
4179 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4180 {
4181         struct rtnl_link_stats64 temp;
4182         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4183
4184         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4185                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4186                    dev->name, stats->rx_bytes, stats->rx_packets,
4187                    stats->rx_errors,
4188                    stats->rx_dropped + stats->rx_missed_errors,
4189                    stats->rx_fifo_errors,
4190                    stats->rx_length_errors + stats->rx_over_errors +
4191                     stats->rx_crc_errors + stats->rx_frame_errors,
4192                    stats->rx_compressed, stats->multicast,
4193                    stats->tx_bytes, stats->tx_packets,
4194                    stats->tx_errors, stats->tx_dropped,
4195                    stats->tx_fifo_errors, stats->collisions,
4196                    stats->tx_carrier_errors +
4197                     stats->tx_aborted_errors +
4198                     stats->tx_window_errors +
4199                     stats->tx_heartbeat_errors,
4200                    stats->tx_compressed);
4201 }
4202
4203 /*
4204  *      Called from the PROCfs module. This now uses the new arbitrary sized
4205  *      /proc/net interface to create /proc/net/dev
4206  */
4207 static int dev_seq_show(struct seq_file *seq, void *v)
4208 {
4209         if (v == SEQ_START_TOKEN)
4210                 seq_puts(seq, "Inter-|   Receive                            "
4211                               "                    |  Transmit\n"
4212                               " face |bytes    packets errs drop fifo frame "
4213                               "compressed multicast|bytes    packets errs "
4214                               "drop fifo colls carrier compressed\n");
4215         else
4216                 dev_seq_printf_stats(seq, v);
4217         return 0;
4218 }
4219
4220 static struct softnet_data *softnet_get_online(loff_t *pos)
4221 {
4222         struct softnet_data *sd = NULL;
4223
4224         while (*pos < nr_cpu_ids)
4225                 if (cpu_online(*pos)) {
4226                         sd = &per_cpu(softnet_data, *pos);
4227                         break;
4228                 } else
4229                         ++*pos;
4230         return sd;
4231 }
4232
4233 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4234 {
4235         return softnet_get_online(pos);
4236 }
4237
4238 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4239 {
4240         ++*pos;
4241         return softnet_get_online(pos);
4242 }
4243
/* seq_file ->stop: nothing was acquired in ->start, so nothing to release. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4247
4248 static int softnet_seq_show(struct seq_file *seq, void *v)
4249 {
4250         struct softnet_data *sd = v;
4251
4252         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4253                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4254                    0, 0, 0, 0, /* was fastroute */
4255                    sd->cpu_collision, sd->received_rps);
4256         return 0;
4257 }
4258
4259 static const struct seq_operations dev_seq_ops = {
4260         .start = dev_seq_start,
4261         .next  = dev_seq_next,
4262         .stop  = dev_seq_stop,
4263         .show  = dev_seq_show,
4264 };
4265
4266 static int dev_seq_open(struct inode *inode, struct file *file)
4267 {
4268         return seq_open_net(inode, file, &dev_seq_ops,
4269                             sizeof(struct dev_iter_state));
4270 }
4271
4272 static const struct file_operations dev_seq_fops = {
4273         .owner   = THIS_MODULE,
4274         .open    = dev_seq_open,
4275         .read    = seq_read,
4276         .llseek  = seq_lseek,
4277         .release = seq_release_net,
4278 };
4279
4280 static const struct seq_operations softnet_seq_ops = {
4281         .start = softnet_seq_start,
4282         .next  = softnet_seq_next,
4283         .stop  = softnet_seq_stop,
4284         .show  = softnet_seq_show,
4285 };
4286
4287 static int softnet_seq_open(struct inode *inode, struct file *file)
4288 {
4289         return seq_open(file, &softnet_seq_ops);
4290 }
4291
4292 static const struct file_operations softnet_seq_fops = {
4293         .owner   = THIS_MODULE,
4294         .open    = softnet_seq_open,
4295         .read    = seq_read,
4296         .llseek  = seq_lseek,
4297         .release = seq_release,
4298 };
4299
4300 static void *ptype_get_idx(loff_t pos)
4301 {
4302         struct packet_type *pt = NULL;
4303         loff_t i = 0;
4304         int t;
4305
4306         list_for_each_entry_rcu(pt, &ptype_all, list) {
4307                 if (i == pos)
4308                         return pt;
4309                 ++i;
4310         }
4311
4312         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4313                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4314                         if (i == pos)
4315                                 return pt;
4316                         ++i;
4317                 }
4318         }
4319         return NULL;
4320 }
4321
4322 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4323         __acquires(RCU)
4324 {
4325         rcu_read_lock();
4326         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4327 }
4328
4329 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4330 {
4331         struct packet_type *pt;
4332         struct list_head *nxt;
4333         int hash;
4334
4335         ++*pos;
4336         if (v == SEQ_START_TOKEN)
4337                 return ptype_get_idx(0);
4338
4339         pt = v;
4340         nxt = pt->list.next;
4341         if (pt->type == htons(ETH_P_ALL)) {
4342                 if (nxt != &ptype_all)
4343                         goto found;
4344                 hash = 0;
4345                 nxt = ptype_base[0].next;
4346         } else
4347                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4348
4349         while (nxt == &ptype_base[hash]) {
4350                 if (++hash >= PTYPE_HASH_SIZE)
4351                         return NULL;
4352                 nxt = ptype_base[hash].next;
4353         }
4354 found:
4355         return list_entry(nxt, struct packet_type, list);
4356 }
4357
/*
 * seq_file ->stop callback for ptype_seq_ops: drops the RCU read lock
 * taken in ptype_seq_start().
 */
static void ptype_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        rcu_read_unlock();
}
4363
4364 static int ptype_seq_show(struct seq_file *seq, void *v)
4365 {
4366         struct packet_type *pt = v;
4367
4368         if (v == SEQ_START_TOKEN)
4369                 seq_puts(seq, "Type Device      Function\n");
4370         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4371                 if (pt->type == htons(ETH_P_ALL))
4372                         seq_puts(seq, "ALL ");
4373                 else
4374                         seq_printf(seq, "%04x", ntohs(pt->type));
4375
4376                 seq_printf(seq, " %-8s %pF\n",
4377                            pt->dev ? pt->dev->name : "", pt->func);
4378         }
4379
4380         return 0;
4381 }
4382
4383 static const struct seq_operations ptype_seq_ops = {
4384         .start = ptype_seq_start,
4385         .next  = ptype_seq_next,
4386         .stop  = ptype_seq_stop,
4387         .show  = ptype_seq_show,
4388 };
4389
4390 static int ptype_seq_open(struct inode *inode, struct file *file)
4391 {
4392         return seq_open_net(inode, file, &ptype_seq_ops,
4393                         sizeof(struct seq_net_private));
4394 }
4395
4396 static const struct file_operations ptype_seq_fops = {
4397         .owner   = THIS_MODULE,
4398         .open    = ptype_seq_open,
4399         .read    = seq_read,
4400         .llseek  = seq_lseek,
4401         .release = seq_release_net,
4402 };
4403
4404
4405 static int __net_init dev_proc_net_init(struct net *net)
4406 {
4407         int rc = -ENOMEM;
4408
4409         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4410                 goto out;
4411         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4412                 goto out_dev;
4413         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4414                 goto out_softnet;
4415
4416         if (wext_proc_init(net))
4417                 goto out_ptype;
4418         rc = 0;
4419 out:
4420         return rc;
4421 out_ptype:
4422         proc_net_remove(net, "ptype");
4423 out_softnet:
4424         proc_net_remove(net, "softnet_stat");
4425 out_dev:
4426         proc_net_remove(net, "dev");
4427         goto out;
4428 }
4429
/*
 * Per-namespace exit: undo dev_proc_net_init() in reverse order --
 * wext_proc_exit() first, then the "ptype", "softnet_stat" and "dev"
 * proc entries.
 */
static void __net_exit dev_proc_net_exit(struct net *net)
{
        wext_proc_exit(net);

        proc_net_remove(net, "ptype");
        proc_net_remove(net, "softnet_stat");
        proc_net_remove(net, "dev");
}
4438
/* Per-namespace init/exit hooks registered by dev_proc_init(). */
static struct pernet_operations __net_initdata dev_proc_ops = {
        .init = dev_proc_net_init,
        .exit = dev_proc_net_exit,
};
4443
/*
 * Register dev_proc_ops with register_pernet_subsys() and return its
 * result.
 */
static int __init dev_proc_init(void)
{
        return register_pernet_subsys(&dev_proc_ops);
}
4448 #else
4449 #define dev_proc_init() 0
4450 #endif  /* CONFIG_PROC_FS */
4451
4452
4453 /**
4454  *      netdev_set_master       -       set up master pointer
4455  *      @slave: slave device
4456  *      @master: new master device
4457  *
4458  *      Changes the master device of the slave. Pass %NULL to break the
4459  *      bonding. The caller must hold the RTNL semaphore. On a failure
4460  *      a negative errno code is returned. On success the reference counts
4461  *      are adjusted and the function returns zero.
4462  */
4463 int netdev_set_master(struct net_device *slave, struct net_device *master)
4464 {
4465         struct net_device *old = slave->master;
4466
4467         ASSERT_RTNL();
4468
4469         if (master) {
4470                 if (old)
4471                         return -EBUSY;
4472                 dev_hold(master);
4473         }
4474
4475         slave->master = master;
4476
4477         if (old)
4478                 dev_put(old);
4479         return 0;
4480 }
4481 EXPORT_SYMBOL(netdev_set_master);
4482
4483 /**
4484  *      netdev_set_bond_master  -       set up bonding master/slave pair
4485  *      @slave: slave device
4486  *      @master: new master device
4487  *
4488  *      Changes the master device of the slave. Pass %NULL to break the
4489  *      bonding. The caller must hold the RTNL semaphore. On a failure
4490  *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4491  *      to the routing socket and the function returns zero.
4492  */
4493 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4494 {
4495         int err;
4496
4497         ASSERT_RTNL();
4498
4499         err = netdev_set_master(slave, master);
4500         if (err)
4501                 return err;
4502         if (master)
4503                 slave->flags |= IFF_SLAVE;
4504         else
4505                 slave->flags &= ~IFF_SLAVE;
4506
4507         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4508         return 0;
4509 }
4510 EXPORT_SYMBOL(netdev_set_bond_master);
4511
4512 static void dev_change_rx_flags(struct net_device *dev, int flags)
4513 {
4514         const struct net_device_ops *ops = dev->netdev_ops;
4515
4516         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4517                 ops->ndo_change_rx_flags(dev, flags);
4518 }
4519
4520 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4521 {
4522         unsigned short old_flags = dev->flags;
4523         uid_t uid;
4524         gid_t gid;
4525
4526         ASSERT_RTNL();
4527
4528         dev->flags |= IFF_PROMISC;
4529         dev->promiscuity += inc;
4530         if (dev->promiscuity == 0) {
4531                 /*
4532                  * Avoid overflow.
4533                  * If inc causes overflow, untouch promisc and return error.
4534                  */
4535                 if (inc < 0)
4536                         dev->flags &= ~IFF_PROMISC;
4537                 else {
4538                         dev->promiscuity -= inc;
4539                         printk(KERN_WARNING "%s: promiscuity touches roof, "
4540                                 "set promiscuity failed, promiscuity feature "
4541                                 "of device might be broken.\n", dev->name);
4542                         return -EOVERFLOW;
4543                 }
4544         }
4545         if (dev->flags != old_flags) {
4546                 printk(KERN_INFO "device %s %s promiscuous mode\n",
4547                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4548                                                                "left");
4549                 if (audit_enabled) {
4550                         current_uid_gid(&uid, &gid);
4551                         audit_log(current->audit_context, GFP_ATOMIC,
4552                                 AUDIT_ANOM_PROMISCUOUS,
4553                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4554                                 dev->name, (dev->flags & IFF_PROMISC),
4555                                 (old_flags & IFF_PROMISC),
4556                                 audit_get_loginuid(current),
4557                                 uid, gid,
4558                                 audit_get_sessionid(current));
4559                 }
4560
4561                 dev_change_rx_flags(dev, IFF_PROMISC);
4562         }
4563         return 0;
4564 }
4565
4566 /**
4567  *      dev_set_promiscuity     - update promiscuity count on a device
4568  *      @dev: device
4569  *      @inc: modifier
4570  *
4571  *      Add or remove promiscuity from a device. While the count in the device
4572  *      remains above zero the interface remains promiscuous. Once it hits zero
4573  *      the device reverts back to normal filtering operation. A negative inc
4574  *      value is used to drop promiscuity on the device.
4575  *      Return 0 if successful or a negative errno code on error.
4576  */
4577 int dev_set_promiscuity(struct net_device *dev, int inc)
4578 {
4579         unsigned short old_flags = dev->flags;
4580         int err;
4581
4582         err = __dev_set_promiscuity(dev, inc);
4583         if (err < 0)
4584                 return err;
4585         if (dev->flags != old_flags)
4586                 dev_set_rx_mode(dev);
4587         return err;
4588 }
4589 EXPORT_SYMBOL(dev_set_promiscuity);
4590
4591 /**
4592  *      dev_set_allmulti        - update allmulti count on a device
4593  *      @dev: device
4594  *      @inc: modifier
4595  *
4596  *      Add or remove reception of all multicast frames to a device. While the
4597  *      count in the device remains above zero the interface remains listening
4598  *      to all interfaces. Once it hits zero the device reverts back to normal
4599  *      filtering operation. A negative @inc value is used to drop the counter
4600  *      when releasing a resource needing all multicasts.
4601  *      Return 0 if successful or a negative errno code on error.
4602  */
4603
4604 int dev_set_allmulti(struct net_device *dev, int inc)
4605 {
4606         unsigned short old_flags = dev->flags;
4607
4608         ASSERT_RTNL();
4609
4610         dev->flags |= IFF_ALLMULTI;
4611         dev->allmulti += inc;
4612         if (dev->allmulti == 0) {
4613                 /*
4614                  * Avoid overflow.
4615                  * If inc causes overflow, untouch allmulti and return error.
4616                  */
4617                 if (inc < 0)
4618                         dev->flags &= ~IFF_ALLMULTI;
4619                 else {
4620                         dev->allmulti -= inc;
4621                         printk(KERN_WARNING "%s: allmulti touches roof, "
4622                                 "set allmulti failed, allmulti feature of "
4623                                 "device might be broken.\n", dev->name);
4624                         return -EOVERFLOW;
4625                 }
4626         }
4627         if (dev->flags ^ old_flags) {
4628                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4629                 dev_set_rx_mode(dev);
4630         }
4631         return 0;
4632 }
4633 EXPORT_SYMBOL(dev_set_allmulti);
4634
4635 /*
4636  *      Upload unicast and multicast address lists to device and
4637  *      configure RX filtering. When the device doesn't support unicast
4638  *      filtering it is put in promiscuous mode while unicast addresses
4639  *      are present.
4640  */
4641 void __dev_set_rx_mode(struct net_device *dev)
4642 {
4643         const struct net_device_ops *ops = dev->netdev_ops;
4644
4645         /* dev_open will call this function so the list will stay sane. */
4646         if (!(dev->flags&IFF_UP))
4647                 return;
4648
4649         if (!netif_device_present(dev))
4650                 return;
4651
4652         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4653                 /* Unicast addresses changes may only happen under the rtnl,
4654                  * therefore calling __dev_set_promiscuity here is safe.
4655                  */
4656                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4657                         __dev_set_promiscuity(dev, 1);
4658                         dev->uc_promisc = true;
4659                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4660                         __dev_set_promiscuity(dev, -1);
4661                         dev->uc_promisc = false;
4662                 }
4663         }
4664
4665         if (ops->ndo_set_rx_mode)
4666                 ops->ndo_set_rx_mode(dev);
4667 }
4668
/*
 * Locked wrapper around __dev_set_rx_mode(): runs it between
 * netif_addr_lock_bh() and netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);
        __dev_set_rx_mode(dev);
        netif_addr_unlock_bh(dev);
}
4675
4676 /**
4677  *      dev_get_flags - get flags reported to userspace
4678  *      @dev: device
4679  *
4680  *      Get the combination of flag bits exported through APIs to userspace.
4681  */
4682 unsigned dev_get_flags(const struct net_device *dev)
4683 {
4684         unsigned flags;
4685
4686         flags = (dev->flags & ~(IFF_PROMISC |
4687                                 IFF_ALLMULTI |
4688                                 IFF_RUNNING |
4689                                 IFF_LOWER_UP |
4690                                 IFF_DORMANT)) |
4691                 (dev->gflags & (IFF_PROMISC |
4692                                 IFF_ALLMULTI));
4693
4694         if (netif_running(dev)) {
4695                 if (netif_oper_up(dev))
4696                         flags |= IFF_RUNNING;
4697                 if (netif_carrier_ok(dev))
4698                         flags |= IFF_LOWER_UP;
4699                 if (netif_dormant(dev))
4700                         flags |= IFF_DORMANT;
4701         }
4702
4703         return flags;
4704 }
4705 EXPORT_SYMBOL(dev_get_flags);
4706
/*
 *      Apply the userspace-format flag word @flags to @dev without sending
 *      any notifications (dev_change_flags() does that afterwards).
 *
 *      Only the directly settable bits are copied from @flags; IFF_UP,
 *      IFF_VOLATILE, IFF_PROMISC and IFF_ALLMULTI keep their current value
 *      at this point.  A change of IFF_UP is carried out through
 *      __dev_open()/__dev_close(); changes of IFF_PROMISC and IFF_ALLMULTI
 *      (compared against dev->gflags) go through the reference-counted
 *      dev_set_promiscuity()/dev_set_allmulti().
 *
 *      Returns the result of __dev_open()/__dev_close(), or 0 when IFF_UP
 *      did not change.  Return values of dev_set_promiscuity() and
 *      dev_set_allmulti() are not propagated.
 *      Caller must hold the RTNL semaphore.
 */
int __dev_change_flags(struct net_device *dev, unsigned int flags)
{
        int old_flags = dev->flags;
        int ret;

        ASSERT_RTNL();

        /*
         *      Set the flags on our device.
         */

        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
                               IFF_AUTOMEDIA)) |
                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
                                    IFF_ALLMULTI));

        /*
         *      Load in the correct multicast list now the flags have changed.
         */

        if ((old_flags ^ flags) & IFF_MULTICAST)
                dev_change_rx_flags(dev, IFF_MULTICAST);

        dev_set_rx_mode(dev);

        /*
         *      Have we downed the interface. We handle IFF_UP ourselves
         *      according to user attempts to set it, rather than blindly
         *      setting it.
         */

        ret = 0;
        if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
                /* Was up -> close it; was down -> open it. */
                ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);

                if (!ret)
                        dev_set_rx_mode(dev);
        }

        /* gflags records the user's request; toggle it, then adjust the count. */
        if ((flags ^ dev->gflags) & IFF_PROMISC) {
                int inc = (flags & IFF_PROMISC) ? 1 : -1;

                dev->gflags ^= IFF_PROMISC;
                dev_set_promiscuity(dev, inc);
        }

        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
           is important. Some (broken) drivers set IFF_PROMISC, when
           IFF_ALLMULTI is requested not asking us and not reporting.
         */
        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;

                dev->gflags ^= IFF_ALLMULTI;
                dev_set_allmulti(dev, inc);
        }

        return ret;
}
4767
4768 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4769 {
4770         unsigned int changes = dev->flags ^ old_flags;
4771
4772         if (changes & IFF_UP) {
4773                 if (dev->flags & IFF_UP)
4774                         call_netdevice_notifiers(NETDEV_UP, dev);
4775                 else
4776                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4777         }
4778
4779         if (dev->flags & IFF_UP &&
4780             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4781                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4782 }
4783
4784 /**
4785  *      dev_change_flags - change device settings
4786  *      @dev: device
4787  *      @flags: device state flags
4788  *
4789  *      Change settings on device based state flags. The flags are
4790  *      in the userspace exported format.
4791  */
4792 int dev_change_flags(struct net_device *dev, unsigned flags)
4793 {
4794         int ret, changes;
4795         int old_flags = dev->flags;
4796
4797         ret = __dev_change_flags(dev, flags);
4798         if (ret < 0)
4799                 return ret;
4800
4801         changes = old_flags ^ dev->flags;
4802         if (changes)
4803                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4804
4805         __dev_notify_flags(dev, old_flags);
4806         return ret;
4807 }
4808 EXPORT_SYMBOL(dev_change_flags);
4809
4810 /**
4811  *      dev_set_mtu - Change maximum transfer unit
4812  *      @dev: device
4813  *      @new_mtu: new transfer unit
4814  *
4815  *      Change the maximum transfer size of the network device.
4816  */
4817 int dev_set_mtu(struct net_device *dev, int new_mtu)
4818 {
4819         const struct net_device_ops *ops = dev->netdev_ops;
4820         int err;
4821
4822         if (new_mtu == dev->mtu)
4823                 return 0;
4824
4825         /*      MTU must be positive.    */
4826         if (new_mtu < 0)
4827                 return -EINVAL;
4828
4829         if (!netif_device_present(dev))
4830                 return -ENODEV;
4831
4832         err = 0;
4833         if (ops->ndo_change_mtu)
4834                 err = ops->ndo_change_mtu(dev, new_mtu);
4835         else
4836                 dev->mtu = new_mtu;
4837
4838         if (!err && dev->flags & IFF_UP)
4839                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4840         return err;
4841 }
4842 EXPORT_SYMBOL(dev_set_mtu);
4843
/**
 *      dev_set_group - Change group this device belongs to
 *      @dev: device
 *      @new_group: group this device should belong to
 *
 *      Stores @new_group in dev->group. No validation is done and no
 *      notification is sent.
 */
void dev_set_group(struct net_device *dev, int new_group)
{
        dev->group = new_group;
}
EXPORT_SYMBOL(dev_set_group);
4854
4855 /**
4856  *      dev_set_mac_address - Change Media Access Control Address
4857  *      @dev: device
4858  *      @sa: new address
4859  *
4860  *      Change the hardware (MAC) address of the device
4861  */
4862 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4863 {
4864         const struct net_device_ops *ops = dev->netdev_ops;
4865         int err;
4866
4867         if (!ops->ndo_set_mac_address)
4868                 return -EOPNOTSUPP;
4869         if (sa->sa_family != dev->type)
4870                 return -EINVAL;
4871         if (!netif_device_present(dev))
4872                 return -ENODEV;
4873         err = ops->ndo_set_mac_address(dev, sa);
4874         if (!err)
4875                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4876         return err;
4877 }
4878 EXPORT_SYMBOL(dev_set_mac_address);
4879
4880 /*
4881  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4882  */
4883 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4884 {
4885         int err;
4886         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4887
4888         if (!dev)
4889                 return -ENODEV;
4890
4891         switch (cmd) {
4892         case SIOCGIFFLAGS:      /* Get interface flags */
4893                 ifr->ifr_flags = (short) dev_get_flags(dev);
4894                 return 0;
4895
4896         case SIOCGIFMETRIC:     /* Get the metric on the interface
4897                                    (currently unused) */
4898                 ifr->ifr_metric = 0;
4899                 return 0;
4900
4901         case SIOCGIFMTU:        /* Get the MTU of a device */
4902                 ifr->ifr_mtu = dev->mtu;
4903                 return 0;
4904
4905         case SIOCGIFHWADDR:
4906                 if (!dev->addr_len)
4907                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4908                 else
4909                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4910                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4911                 ifr->ifr_hwaddr.sa_family = dev->type;
4912                 return 0;
4913
4914         case SIOCGIFSLAVE:
4915                 err = -EINVAL;
4916                 break;
4917
4918         case SIOCGIFMAP:
4919                 ifr->ifr_map.mem_start = dev->mem_start;
4920                 ifr->ifr_map.mem_end   = dev->mem_end;
4921                 ifr->ifr_map.base_addr = dev->base_addr;
4922                 ifr->ifr_map.irq       = dev->irq;
4923                 ifr->ifr_map.dma       = dev->dma;
4924                 ifr->ifr_map.port      = dev->if_port;
4925                 return 0;
4926
4927         case SIOCGIFINDEX:
4928                 ifr->ifr_ifindex = dev->ifindex;
4929                 return 0;
4930
4931         case SIOCGIFTXQLEN:
4932                 ifr->ifr_qlen = dev->tx_queue_len;
4933                 return 0;
4934
4935         default:
4936                 /* dev_ioctl() should ensure this case
4937                  * is never reached
4938                  */
4939                 WARN_ON(1);
4940                 err = -ENOTTY;
4941                 break;
4942
4943         }
4944         return err;
4945 }
4946
4947 /*
4948  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4949  */
4950 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4951 {
4952         int err;
4953         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4954         const struct net_device_ops *ops;
4955
4956         if (!dev)
4957                 return -ENODEV;
4958
4959         ops = dev->netdev_ops;
4960
4961         switch (cmd) {
4962         case SIOCSIFFLAGS:      /* Set interface flags */
4963                 return dev_change_flags(dev, ifr->ifr_flags);
4964
4965         case SIOCSIFMETRIC:     /* Set the metric on the interface
4966                                    (currently unused) */
4967                 return -EOPNOTSUPP;
4968
4969         case SIOCSIFMTU:        /* Set the MTU of a device */
4970                 return dev_set_mtu(dev, ifr->ifr_mtu);
4971
4972         case SIOCSIFHWADDR:
4973                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4974
4975         case SIOCSIFHWBROADCAST:
4976                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4977                         return -EINVAL;
4978                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4979                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4980                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4981                 return 0;
4982
4983         case SIOCSIFMAP:
4984                 if (ops->ndo_set_config) {
4985                         if (!netif_device_present(dev))
4986                                 return -ENODEV;
4987                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4988                 }
4989                 return -EOPNOTSUPP;
4990
4991         case SIOCADDMULTI:
4992                 if (!ops->ndo_set_rx_mode ||
4993                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4994                         return -EINVAL;
4995                 if (!netif_device_present(dev))
4996                         return -ENODEV;
4997                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4998
4999         case SIOCDELMULTI:
5000                 if (!ops->ndo_set_rx_mode ||
5001                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5002                         return -EINVAL;
5003                 if (!netif_device_present(dev))
5004                         return -ENODEV;
5005                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5006
5007         case SIOCSIFTXQLEN:
5008                 if (ifr->ifr_qlen < 0)
5009                         return -EINVAL;
5010                 dev->tx_queue_len = ifr->ifr_qlen;
5011                 return 0;
5012
5013         case SIOCSIFNAME:
5014                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5015                 return dev_change_name(dev, ifr->ifr_newname);
5016
5017         case SIOCSHWTSTAMP:
5018                 err = net_hwtstamp_validate(ifr);
5019                 if (err)
5020                         return err;
5021                 /* fall through */
5022
5023         /*
5024          *      Unknown or private ioctl
5025          */
5026         default:
5027                 if ((cmd >= SIOCDEVPRIVATE &&
5028                     cmd <= SIOCDEVPRIVATE + 15) ||
5029                     cmd == SIOCBONDENSLAVE ||
5030                     cmd == SIOCBONDRELEASE ||
5031                     cmd == SIOCBONDSETHWADDR ||
5032                     cmd == SIOCBONDSLAVEINFOQUERY ||
5033                     cmd == SIOCBONDINFOQUERY ||
5034                     cmd == SIOCBONDCHANGEACTIVE ||
5035                     cmd == SIOCGMIIPHY ||
5036                     cmd == SIOCGMIIREG ||
5037                     cmd == SIOCSMIIREG ||
5038                     cmd == SIOCBRADDIF ||
5039                     cmd == SIOCBRDELIF ||
5040                     cmd == SIOCSHWTSTAMP ||
5041                     cmd == SIOCWANDEV) {
5042                         err = -EOPNOTSUPP;
5043                         if (ops->ndo_do_ioctl) {
5044                                 if (netif_device_present(dev))
5045                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
5046                                 else
5047                                         err = -ENODEV;
5048                         }
5049                 } else
5050                         err = -EINVAL;
5051
5052         }
5053         return err;
5054 }
5055
5056 /*
5057  *      This function handles all "interface"-type I/O control requests. The actual
5058  *      'doing' part of this is dev_ifsioc above.
5059  */
5060
5061 /**
5062  *      dev_ioctl       -       network device ioctl
5063  *      @net: the applicable net namespace
5064  *      @cmd: command to issue
5065  *      @arg: pointer to a struct ifreq in user space
5066  *
5067  *      Issue ioctl functions to devices. This is normally called by the
5068  *      user space syscall interfaces but can sometimes be useful for
5069  *      other purposes. The return value is the return from the syscall if
5070  *      positive or a negative errno code on error.
5071  */
5072
5073 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5074 {
5075         struct ifreq ifr;
5076         int ret;
5077         char *colon;
5078
5079         /* One special case: SIOCGIFCONF takes ifconf argument
5080            and requires shared lock, because it sleeps writing
5081            to user space.
5082          */
5083
5084         if (cmd == SIOCGIFCONF) {
5085                 rtnl_lock();
5086                 ret = dev_ifconf(net, (char __user *) arg);
5087                 rtnl_unlock();
5088                 return ret;
5089         }
5090         if (cmd == SIOCGIFNAME)
5091                 return dev_ifname(net, (struct ifreq __user *)arg);
5092
5093         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5094                 return -EFAULT;
5095
5096         ifr.ifr_name[IFNAMSIZ-1] = 0;
5097
5098         colon = strchr(ifr.ifr_name, ':');
5099         if (colon)
5100                 *colon = 0;
5101
5102         /*
5103          *      See which interface the caller is talking about.
5104          */
5105
5106         switch (cmd) {
5107         /*
5108          *      These ioctl calls:
5109          *      - can be done by all.
5110          *      - atomic and do not require locking.
5111          *      - return a value
5112          */
5113         case SIOCGIFFLAGS:
5114         case SIOCGIFMETRIC:
5115         case SIOCGIFMTU:
5116         case SIOCGIFHWADDR:
5117         case SIOCGIFSLAVE:
5118         case SIOCGIFMAP:
5119         case SIOCGIFINDEX:
5120         case SIOCGIFTXQLEN:
5121                 dev_load(net, ifr.ifr_name);
5122                 rcu_read_lock();
5123                 ret = dev_ifsioc_locked(net, &ifr, cmd);
5124                 rcu_read_unlock();
5125                 if (!ret) {
5126                         if (colon)
5127                                 *colon = ':';
5128                         if (copy_to_user(arg, &ifr,
5129                                          sizeof(struct ifreq)))
5130                                 ret = -EFAULT;
5131                 }
5132                 return ret;
5133
5134         case SIOCETHTOOL:
5135                 dev_load(net, ifr.ifr_name);
5136                 rtnl_lock();
5137                 ret = dev_ethtool(net, &ifr);
5138                 rtnl_unlock();
5139                 if (!ret) {
5140                         if (colon)
5141                                 *colon = ':';
5142                         if (copy_to_user(arg, &ifr,
5143                                          sizeof(struct ifreq)))
5144                                 ret = -EFAULT;
5145                 }
5146                 return ret;
5147
5148         /*
5149          *      These ioctl calls:
5150          *      - require superuser power.
5151          *      - require strict serialization.
5152          *      - return a value
5153          */
5154         case SIOCGMIIPHY:
5155         case SIOCGMIIREG:
5156         case SIOCSIFNAME:
5157                 if (!capable(CAP_NET_ADMIN))
5158                         return -EPERM;
5159                 dev_load(net, ifr.ifr_name);
5160                 rtnl_lock();
5161                 ret = dev_ifsioc(net, &ifr, cmd);
5162                 rtnl_unlock();
5163                 if (!ret) {
5164                         if (colon)
5165                                 *colon = ':';
5166                         if (copy_to_user(arg, &ifr,
5167                                          sizeof(struct ifreq)))
5168                                 ret = -EFAULT;
5169                 }
5170                 return ret;
5171
5172         /*
5173          *      These ioctl calls:
5174          *      - require superuser power.
5175          *      - require strict serialization.
5176          *      - do not return a value
5177          */
5178         case SIOCSIFFLAGS:
5179         case SIOCSIFMETRIC:
5180         case SIOCSIFMTU:
5181         case SIOCSIFMAP:
5182         case SIOCSIFHWADDR:
5183         case SIOCSIFSLAVE:
5184         case SIOCADDMULTI:
5185         case SIOCDELMULTI:
5186         case SIOCSIFHWBROADCAST:
5187         case SIOCSIFTXQLEN:
5188         case SIOCSMIIREG:
5189         case SIOCBONDENSLAVE:
5190         case SIOCBONDRELEASE:
5191         case SIOCBONDSETHWADDR:
5192         case SIOCBONDCHANGEACTIVE:
5193         case SIOCBRADDIF:
5194         case SIOCBRDELIF:
5195         case SIOCSHWTSTAMP:
5196                 if (!capable(CAP_NET_ADMIN))
5197                         return -EPERM;
5198                 /* fall through */
5199         case SIOCBONDSLAVEINFOQUERY:
5200         case SIOCBONDINFOQUERY:
5201                 dev_load(net, ifr.ifr_name);
5202                 rtnl_lock();
5203                 ret = dev_ifsioc(net, &ifr, cmd);
5204                 rtnl_unlock();
5205                 return ret;
5206
5207         case SIOCGIFMEM:
5208                 /* Get the per device memory space. We can add this but
5209                  * currently do not support it */
5210         case SIOCSIFMEM:
5211                 /* Set the per device memory buffer space.
5212                  * Not applicable in our case */
5213         case SIOCSIFLINK:
5214                 return -ENOTTY;
5215
5216         /*
5217          *      Unknown or private ioctl.
5218          */
5219         default:
5220                 if (cmd == SIOCWANDEV ||
5221                     (cmd >= SIOCDEVPRIVATE &&
5222                      cmd <= SIOCDEVPRIVATE + 15)) {
5223                         dev_load(net, ifr.ifr_name);
5224                         rtnl_lock();
5225                         ret = dev_ifsioc(net, &ifr, cmd);
5226                         rtnl_unlock();
5227                         if (!ret && copy_to_user(arg, &ifr,
5228                                                  sizeof(struct ifreq)))
5229                                 ret = -EFAULT;
5230                         return ret;
5231                 }
5232                 /* Take care of Wireless Extensions */
5233                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5234                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5235                 return -ENOTTY;
5236         }
5237 }
5238
5239
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Hands out the next interface index that no device in @net is
 *	currently using.  Candidates come from a function-local static
 *	counter which restarts at 1 whenever it stops being positive.
 *	The caller must hold the rtnl semaphore or the dev_base_lock so
 *	that the returned value stays unique.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		ifindex++;
		/* Zero and negative indices are never handed out. */
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
5258
5259 /* Delayed registration/unregisteration */
5260 static LIST_HEAD(net_todo_list);
5261
5262 static void net_set_todo(struct net_device *dev)
5263 {
5264         list_add_tail(&dev->todo_list, &net_todo_list);
5265 }
5266
5267 static void rollback_registered_many(struct list_head *head)
5268 {
5269         struct net_device *dev, *tmp;
5270
5271         BUG_ON(dev_boot_phase);
5272         ASSERT_RTNL();
5273
5274         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5275                 /* Some devices call without registering
5276                  * for initialization unwind. Remove those
5277                  * devices and proceed with the remaining.
5278                  */
5279                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5280                         pr_debug("unregister_netdevice: device %s/%p never "
5281                                  "was registered\n", dev->name, dev);
5282
5283                         WARN_ON(1);
5284                         list_del(&dev->unreg_list);
5285                         continue;
5286                 }
5287                 dev->dismantle = true;
5288                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5289         }
5290
5291         /* If device is running, close it first. */
5292         dev_close_many(head);
5293
5294         list_for_each_entry(dev, head, unreg_list) {
5295                 /* And unlink it from device chain. */
5296                 unlist_netdevice(dev);
5297
5298                 dev->reg_state = NETREG_UNREGISTERING;
5299         }
5300
5301         synchronize_net();
5302
5303         list_for_each_entry(dev, head, unreg_list) {
5304                 /* Shutdown queueing discipline. */
5305                 dev_shutdown(dev);
5306
5307
5308                 /* Notify protocols, that we are about to destroy
5309                    this device. They should clean all the things.
5310                 */
5311                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5312
5313                 if (!dev->rtnl_link_ops ||
5314                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5315                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5316
5317                 /*
5318                  *      Flush the unicast and multicast chains
5319                  */
5320                 dev_uc_flush(dev);
5321                 dev_mc_flush(dev);
5322
5323                 if (dev->netdev_ops->ndo_uninit)
5324                         dev->netdev_ops->ndo_uninit(dev);
5325
5326                 /* Notifier chain MUST detach us from master device. */
5327                 WARN_ON(dev->master);
5328
5329                 /* Remove entries from kobject tree */
5330                 netdev_unregister_kobject(dev);
5331         }
5332
5333         /* Process any work delayed until the end of the batch */
5334         dev = list_first_entry(head, struct net_device, unreg_list);
5335         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5336
5337         synchronize_net();
5338
5339         list_for_each_entry(dev, head, unreg_list)
5340                 dev_put(dev);
5341 }
5342
5343 static void rollback_registered(struct net_device *dev)
5344 {
5345         LIST_HEAD(single);
5346
5347         list_add(&dev->unreg_list, &single);
5348         rollback_registered_many(&single);
5349         list_del(&single);
5350 }
5351
5352 static netdev_features_t netdev_fix_features(struct net_device *dev,
5353         netdev_features_t features)
5354 {
5355         /* Fix illegal checksum combinations */
5356         if ((features & NETIF_F_HW_CSUM) &&
5357             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5358                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5359                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5360         }
5361
5362         /* Fix illegal SG+CSUM combinations. */
5363         if ((features & NETIF_F_SG) &&
5364             !(features & NETIF_F_ALL_CSUM)) {
5365                 netdev_dbg(dev,
5366                         "Dropping NETIF_F_SG since no checksum feature.\n");
5367                 features &= ~NETIF_F_SG;
5368         }
5369
5370         /* TSO requires that SG is present as well. */
5371         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5372                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5373                 features &= ~NETIF_F_ALL_TSO;
5374         }
5375
5376         /* TSO ECN requires that TSO is present as well. */
5377         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5378                 features &= ~NETIF_F_TSO_ECN;
5379
5380         /* Software GSO depends on SG. */
5381         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5382                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5383                 features &= ~NETIF_F_GSO;
5384         }
5385
5386         /* UFO needs SG and checksumming */
5387         if (features & NETIF_F_UFO) {
5388                 /* maybe split UFO into V4 and V6? */
5389                 if (!((features & NETIF_F_GEN_CSUM) ||
5390                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5391                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5392                         netdev_dbg(dev,
5393                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5394                         features &= ~NETIF_F_UFO;
5395                 }
5396
5397                 if (!(features & NETIF_F_SG)) {
5398                         netdev_dbg(dev,
5399                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5400                         features &= ~NETIF_F_UFO;
5401                 }
5402         }
5403
5404         return features;
5405 }
5406
5407 int __netdev_update_features(struct net_device *dev)
5408 {
5409         netdev_features_t features;
5410         int err = 0;
5411
5412         ASSERT_RTNL();
5413
5414         features = netdev_get_wanted_features(dev);
5415
5416         if (dev->netdev_ops->ndo_fix_features)
5417                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5418
5419         /* driver might be less strict about feature dependencies */
5420         features = netdev_fix_features(dev, features);
5421
5422         if (dev->features == features)
5423                 return 0;
5424
5425         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5426                 &dev->features, &features);
5427
5428         if (dev->netdev_ops->ndo_set_features)
5429                 err = dev->netdev_ops->ndo_set_features(dev, features);
5430
5431         if (unlikely(err < 0)) {
5432                 netdev_err(dev,
5433                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5434                         err, &features, &dev->features);
5435                 return -1;
5436         }
5437
5438         if (!err)
5439                 dev->features = features;
5440
5441         return 1;
5442 }
5443
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recomputes the dev->features set through
 *	__netdev_update_features() and sends a feature-change
 *	notification whenever that helper returns a non-zero value.
 *	Should be called after driver or hardware dependent conditions
 *	that influence the features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	if (changed == 0)
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5458
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recomputes the dev->features set and sends a notification
 *	unconditionally, even when nothing changed.  Use this instead
 *	of netdev_update_features() when dev->vlan_features might also
 *	have changed, so the change can be propagated to stacked VLAN
 *	devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: we always notify. */
	(void)__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5475
5476 /**
5477  *      netif_stacked_transfer_operstate -      transfer operstate
5478  *      @rootdev: the root or lower level device to transfer state from
5479  *      @dev: the device to transfer operstate to
5480  *
5481  *      Transfer operational state from root to device. This is normally
5482  *      called when a stacking relationship exists between the root
5483  *      device and the device(a leaf device).
5484  */
5485 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5486                                         struct net_device *dev)
5487 {
5488         if (rootdev->operstate == IF_OPER_DORMANT)
5489                 netif_dormant_on(dev);
5490         else
5491                 netif_dormant_off(dev);
5492
5493         if (netif_carrier_ok(rootdev)) {
5494                 if (!netif_carrier_ok(dev))
5495                         netif_carrier_on(dev);
5496         } else {
5497                 if (netif_carrier_ok(dev))
5498                         netif_carrier_off(dev);
5499         }
5500 }
5501 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5502
#ifdef CONFIG_RPS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every element back at @dev.
 * Returns 0 on success or -ENOMEM when the allocation fails.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nqueues = dev->num_rx_queues;
	struct netdev_rx_queue *queues, *q;

	BUG_ON(nqueues < 1);

	queues = kcalloc(nqueues, sizeof(*queues), GFP_KERNEL);
	if (queues == NULL) {
		pr_err("netdev: Unable to allocate %u rx queues.\n", nqueues);
		return -ENOMEM;
	}
	dev->_rx = queues;

	for (q = queues; q < queues + nqueues; q++)
		q->dev = dev;

	return 0;
}
#endif
5523
5524 static void netdev_init_one_queue(struct net_device *dev,
5525                                   struct netdev_queue *queue, void *_unused)
5526 {
5527         /* Initialize queue lock */
5528         spin_lock_init(&queue->_xmit_lock);
5529         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5530         queue->xmit_lock_owner = -1;
5531         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5532         queue->dev = dev;
5533 }
5534
5535 static int netif_alloc_netdev_queues(struct net_device *dev)
5536 {
5537         unsigned int count = dev->num_tx_queues;
5538         struct netdev_queue *tx;
5539
5540         BUG_ON(count < 1);
5541
5542         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5543         if (!tx) {
5544                 pr_err("netdev: Unable to allocate %u tx queues.\n",
5545                        count);
5546                 return -ENOMEM;
5547         }
5548         dev->_tx = tx;
5549
5550         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5551         spin_lock_init(&dev->tx_global_lock);
5552
5553         return 0;
5554 }
5555
5556 /**
5557  *      register_netdevice      - register a network device
5558  *      @dev: device to register
5559  *
5560  *      Take a completed network device structure and add it to the kernel
5561  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5562  *      chain. 0 is returned on success. A negative errno code is returned
5563  *      on a failure to set up the device, or if the name is a duplicate.
5564  *
5565  *      Callers must hold the rtnl semaphore. You may want
5566  *      register_netdev() instead of this.
5567  *
5568  *      BUGS:
5569  *      The locking appears insufficient to guarantee two parallel registers
5570  *      will not get the same name.
5571  */
5572
5573 int register_netdevice(struct net_device *dev)
5574 {
5575         int ret;
5576         struct net *net = dev_net(dev);
5577
5578         BUG_ON(dev_boot_phase);
5579         ASSERT_RTNL();
5580
5581         might_sleep();
5582
5583         /* When net_device's are persistent, this will be fatal. */
5584         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5585         BUG_ON(!net);
5586
5587         spin_lock_init(&dev->addr_list_lock);
5588         netdev_set_addr_lockdep_class(dev);
5589
5590         dev->iflink = -1;
5591
5592         ret = dev_get_valid_name(dev, dev->name);
5593         if (ret < 0)
5594                 goto out;
5595
5596         /* Init, if this function is available */
5597         if (dev->netdev_ops->ndo_init) {
5598                 ret = dev->netdev_ops->ndo_init(dev);
5599                 if (ret) {
5600                         if (ret > 0)
5601                                 ret = -EIO;
5602                         goto out;
5603                 }
5604         }
5605
5606         dev->ifindex = dev_new_index(net);
5607         if (dev->iflink == -1)
5608                 dev->iflink = dev->ifindex;
5609
5610         /* Transfer changeable features to wanted_features and enable
5611          * software offloads (GSO and GRO).
5612          */
5613         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5614         dev->features |= NETIF_F_SOFT_FEATURES;
5615         dev->wanted_features = dev->features & dev->hw_features;
5616
5617         /* Turn on no cache copy if HW is doing checksum */
5618         if (!(dev->flags & IFF_LOOPBACK)) {
5619                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5620                 if (dev->features & NETIF_F_ALL_CSUM) {
5621                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5622                         dev->features |= NETIF_F_NOCACHE_COPY;
5623                 }
5624         }
5625
5626         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5627          */
5628         dev->vlan_features |= NETIF_F_HIGHDMA;
5629
5630         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5631         ret = notifier_to_errno(ret);
5632         if (ret)
5633                 goto err_uninit;
5634
5635         ret = netdev_register_kobject(dev);
5636         if (ret)
5637                 goto err_uninit;
5638         dev->reg_state = NETREG_REGISTERED;
5639
5640         __netdev_update_features(dev);
5641
5642         /*
5643          *      Default initial state at registry is that the
5644          *      device is present.
5645          */
5646
5647         set_bit(__LINK_STATE_PRESENT, &dev->state);
5648
5649         dev_init_scheduler(dev);
5650         dev_hold(dev);
5651         list_netdevice(dev);
5652
5653         /* Notify protocols, that a new device appeared. */
5654         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5655         ret = notifier_to_errno(ret);
5656         if (ret) {
5657                 rollback_registered(dev);
5658                 dev->reg_state = NETREG_UNREGISTERED;
5659         }
5660         /*
5661          *      Prevent userspace races by waiting until the network
5662          *      device is fully setup before sending notifications.
5663          */
5664         if (!dev->rtnl_link_ops ||
5665             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5666                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5667
5668 out:
5669         return ret;
5670
5671 err_uninit:
5672         if (dev->netdev_ops->ndo_uninit)
5673                 dev->netdev_ops->ndo_uninit(dev);
5674         goto out;
5675 }
5676 EXPORT_SYMBOL(register_netdevice);
5677
5678 /**
5679  *      init_dummy_netdev       - init a dummy network device for NAPI
5680  *      @dev: device to init
5681  *
5682  *      This takes a network device structure and initialize the minimum
5683  *      amount of fields so it can be used to schedule NAPI polls without
5684  *      registering a full blown interface. This is to be used by drivers
5685  *      that need to tie several hardware interfaces to a single NAPI
5686  *      poll scheduler due to HW limitations.
5687  */
5688 int init_dummy_netdev(struct net_device *dev)
5689 {
5690         /* Clear everything. Note we don't initialize spinlocks
5691          * are they aren't supposed to be taken by any of the
5692          * NAPI code and this dummy netdev is supposed to be
5693          * only ever used for NAPI polls
5694          */
5695         memset(dev, 0, sizeof(struct net_device));
5696
5697         /* make sure we BUG if trying to hit standard
5698          * register/unregister code path
5699          */
5700         dev->reg_state = NETREG_DUMMY;
5701
5702         /* NAPI wants this */
5703         INIT_LIST_HEAD(&dev->napi_list);
5704
5705         /* a dummy interface is started by default */
5706         set_bit(__LINK_STATE_PRESENT, &dev->state);
5707         set_bit(__LINK_STATE_START, &dev->state);
5708
5709         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5710          * because users of this 'device' dont need to change
5711          * its refcount.
5712          */
5713
5714         return 0;
5715 }
5716 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5717
5718
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice() that takes the rtnl
 *	semaphore for the duration of the call; the device name is expanded
 *	if a format string was passed to alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
5742
5743 int netdev_refcnt_read(const struct net_device *dev)
5744 {
5745         int i, refcnt = 0;
5746
5747         for_each_possible_cpu(i)
5748                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5749         return refcnt;
5750 }
5751 EXPORT_SYMBOL(netdev_refcnt_read);
5752
5753 /*
5754  * netdev_wait_allrefs - wait until all references are gone.
5755  *
5756  * This is called when unregistering network devices.
5757  *
5758  * Any protocol or device that holds a reference should register
5759  * for netdevice notification, and cleanup and put back the
5760  * reference if they receive an UNREGISTER event.
5761  * We can get stuck here if buggy protocols don't correctly
5762  * call dev_put.
5763  */
5764 static void netdev_wait_allrefs(struct net_device *dev)
5765 {
5766         unsigned long rebroadcast_time, warning_time;
5767         int refcnt;
5768
5769         linkwatch_forget_dev(dev);
5770
5771         rebroadcast_time = warning_time = jiffies;
5772         refcnt = netdev_refcnt_read(dev);
5773
5774         while (refcnt != 0) {
5775                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5776                         rtnl_lock();
5777
5778                         /* Rebroadcast unregister notification */
5779                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5780                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5781                          * should have already handle it the first time */
5782
5783                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5784                                      &dev->state)) {
5785                                 /* We must not have linkwatch events
5786                                  * pending on unregister. If this
5787                                  * happens, we simply run the queue
5788                                  * unscheduled, resulting in a noop
5789                                  * for this device.
5790                                  */
5791                                 linkwatch_run_queue();
5792                         }
5793
5794                         __rtnl_unlock();
5795
5796                         rebroadcast_time = jiffies;
5797                 }
5798
5799                 msleep(250);
5800
5801                 refcnt = netdev_refcnt_read(dev);
5802
5803                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5804                         printk(KERN_EMERG "unregister_netdevice: "
5805                                "waiting for %s to become free. Usage "
5806                                "count = %d\n",
5807                                dev->name, refcnt);
5808                         warning_time = jiffies;
5809                 }
5810         }
5811 }
5812
5813 /* The sequence is:
5814  *
5815  *      rtnl_lock();
5816  *      ...
5817  *      register_netdevice(x1);
5818  *      register_netdevice(x2);
5819  *      ...
5820  *      unregister_netdevice(y1);
5821  *      unregister_netdevice(y2);
5822  *      ...
5823  *      rtnl_unlock();
5824  *      free_netdev(y1);
5825  *      free_netdev(y2);
5826  *
5827  * We are invoked by rtnl_unlock().
5828  * This allows us to deal with problems:
5829  * 1) We can delete sysfs objects which invoke hotplug
5830  *    without deadlocking with linkwatch via keventd.
5831  * 2) Since we run with the RTNL semaphore not held, we can sleep
5832  *    safely in order to wait for the netdev refcnt to drop to zero.
5833  *
5834  * We must not return until all unregister events added during
5835  * the interval the lock was held have been completed.
5836  */
5837 void netdev_run_todo(void)
5838 {
5839         struct list_head list;
5840
5841         /* Snapshot list, allow later requests */
5842         list_replace_init(&net_todo_list, &list);
5843
5844         __rtnl_unlock();
5845
5846         /* Wait for rcu callbacks to finish before attempting to drain
5847          * the device list.  This usually avoids a 250ms wait.
5848          */
5849         if (!list_empty(&list))
5850                 rcu_barrier();
5851
5852         while (!list_empty(&list)) {
5853                 struct net_device *dev
5854                         = list_first_entry(&list, struct net_device, todo_list);
5855                 list_del(&dev->todo_list);
5856
5857                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5858                         printk(KERN_ERR "network todo '%s' but state %d\n",
5859                                dev->name, dev->reg_state);
5860                         dump_stack();
5861                         continue;
5862                 }
5863
5864                 dev->reg_state = NETREG_UNREGISTERED;
5865
5866                 on_each_cpu(flush_backlog, dev, 1);
5867
5868                 netdev_wait_allrefs(dev);
5869
5870                 /* paranoia */
5871                 BUG_ON(netdev_refcnt_read(dev));
5872                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5873                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5874                 WARN_ON(dev->dn_ptr);
5875
5876                 if (dev->destructor)
5877                         dev->destructor(dev);
5878
5879                 /* Free network device */
5880                 kobject_put(&dev->dev.kobj);
5881         }
5882 }
5883
5884 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5885  * fields in the same order, with only the type differing.
5886  */
5887 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5888                                     const struct net_device_stats *netdev_stats)
5889 {
5890 #if BITS_PER_LONG == 64
5891         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5892         memcpy(stats64, netdev_stats, sizeof(*stats64));
5893 #else
5894         size_t i, n = sizeof(*stats64) / sizeof(u64);
5895         const unsigned long *src = (const unsigned long *)netdev_stats;
5896         u64 *dst = (u64 *)stats64;
5897
5898         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5899                      sizeof(*stats64) / sizeof(u64));
5900         for (i = 0; i < n; i++)
5901                 dst[i] = src[i];
5902 #endif
5903 }
5904
5905 /**
5906  *      dev_get_stats   - get network device statistics
5907  *      @dev: device to get statistics from
5908  *      @storage: place to store stats
5909  *
5910  *      Get network statistics from device. Return @storage.
5911  *      The device driver may provide its own method by setting
5912  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5913  *      otherwise the internal statistics structure is used.
5914  */
5915 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5916                                         struct rtnl_link_stats64 *storage)
5917 {
5918         const struct net_device_ops *ops = dev->netdev_ops;
5919
5920         if (ops->ndo_get_stats64) {
5921                 memset(storage, 0, sizeof(*storage));
5922                 ops->ndo_get_stats64(dev, storage);
5923         } else if (ops->ndo_get_stats) {
5924                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5925         } else {
5926                 netdev_stats_to_stats64(storage, &dev->stats);
5927         }
5928         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5929         return storage;
5930 }
5931 EXPORT_SYMBOL(dev_get_stats);
5932
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT enabled a
 * missing queue is allocated, initialised with noop_qdisc and published
 * in dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(struct netdev_queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			queue->qdisc = &noop_qdisc;
			queue->qdisc_sleeping = &noop_qdisc;
			/* Publish only after the queue is fully set up. */
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
5950
5951 /**
5952  *      alloc_netdev_mqs - allocate network device
5953  *      @sizeof_priv:   size of private data to allocate space for
5954  *      @name:          device name format string
5955  *      @setup:         callback to initialize device
5956  *      @txqs:          the number of TX subqueues to allocate
5957  *      @rxqs:          the number of RX subqueues to allocate
5958  *
5959  *      Allocates a struct net_device with private data area for driver use
5960  *      and performs basic initialization.  Also allocates subquue structs
5961  *      for each queue on the device.
5962  */
5963 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5964                 void (*setup)(struct net_device *),
5965                 unsigned int txqs, unsigned int rxqs)
5966 {
5967         struct net_device *dev;
5968         size_t alloc_size;
5969         struct net_device *p;
5970
5971         BUG_ON(strlen(name) >= sizeof(dev->name));
5972
5973         if (txqs < 1) {
5974                 pr_err("alloc_netdev: Unable to allocate device "
5975                        "with zero queues.\n");
5976                 return NULL;
5977         }
5978
5979 #ifdef CONFIG_RPS
5980         if (rxqs < 1) {
5981                 pr_err("alloc_netdev: Unable to allocate device "
5982                        "with zero RX queues.\n");
5983                 return NULL;
5984         }
5985 #endif
5986
5987         alloc_size = sizeof(struct net_device);
5988         if (sizeof_priv) {
5989                 /* ensure 32-byte alignment of private area */
5990                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5991                 alloc_size += sizeof_priv;
5992         }
5993         /* ensure 32-byte alignment of whole construct */
5994         alloc_size += NETDEV_ALIGN - 1;
5995
5996         p = kzalloc(alloc_size, GFP_KERNEL);
5997         if (!p) {
5998                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5999                 return NULL;
6000         }
6001
6002         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6003         dev->padded = (char *)dev - (char *)p;
6004
6005         dev->pcpu_refcnt = alloc_percpu(int);
6006         if (!dev->pcpu_refcnt)
6007                 goto free_p;
6008
6009         if (dev_addr_init(dev))
6010                 goto free_pcpu;
6011
6012         dev_mc_init(dev);
6013         dev_uc_init(dev);
6014
6015         dev_net_set(dev, &init_net);
6016
6017         dev->gso_max_size = GSO_MAX_SIZE;
6018
6019         INIT_LIST_HEAD(&dev->napi_list);
6020         INIT_LIST_HEAD(&dev->unreg_list);
6021         INIT_LIST_HEAD(&dev->link_watch_list);
6022         dev->priv_flags = IFF_XMIT_DST_RELEASE;
6023         setup(dev);
6024
6025         dev->num_tx_queues = txqs;
6026         dev->real_num_tx_queues = txqs;
6027         if (netif_alloc_netdev_queues(dev))
6028                 goto free_all;
6029
6030 #ifdef CONFIG_RPS
6031         dev->num_rx_queues = rxqs;
6032         dev->real_num_rx_queues = rxqs;
6033         if (netif_alloc_rx_queues(dev))
6034                 goto free_all;
6035 #endif
6036
6037         strcpy(dev->name, name);
6038         dev->group = INIT_NETDEV_GROUP;
6039         return dev;
6040
6041 free_all:
6042         free_netdev(dev);
6043         return NULL;
6044
6045 free_pcpu:
6046         free_percpu(dev->pcpu_refcnt);
6047         kfree(dev->_tx);
6048 #ifdef CONFIG_RPS
6049         kfree(dev->_rx);
6050 #endif
6051
6052 free_p:
6053         kfree(p);
6054         return NULL;
6055 }
6056 EXPORT_SYMBOL(alloc_netdev_mqs);
6057
6058 /**
6059  *      free_netdev - free network device
6060  *      @dev: device
6061  *
6062  *      This function does the last stage of destroying an allocated device
6063  *      interface. The reference to the device object is released.
6064  *      If this is the last reference then it will be freed.
6065  */
6066 void free_netdev(struct net_device *dev)
6067 {
6068         struct napi_struct *p, *n;
6069
6070         release_net(dev_net(dev));
6071
6072         kfree(dev->_tx);
6073 #ifdef CONFIG_RPS
6074         kfree(dev->_rx);
6075 #endif
6076
6077         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6078
6079         /* Flush device addresses */
6080         dev_addr_flush(dev);
6081
6082         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6083                 netif_napi_del(p);
6084
6085         free_percpu(dev->pcpu_refcnt);
6086         dev->pcpu_refcnt = NULL;
6087
6088         /*  Compatibility with error handling in drivers */
6089         if (dev->reg_state == NETREG_UNINITIALIZED) {
6090                 kfree((char *)dev - dev->padded);
6091                 return;
6092         }
6093
6094         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6095         dev->reg_state = NETREG_RELEASED;
6096
6097         /* will free via device release */
6098         put_device(&dev->dev);
6099 }
6100 EXPORT_SYMBOL(free_netdev);
6101
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	May sleep.  When the RTNL lock is held the expedited RCU grace
 *	period is used; otherwise a normal grace period is waited for.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6117
6118 /**
6119  *      unregister_netdevice_queue - remove device from the kernel
6120  *      @dev: device
6121  *      @head: list
6122  *
6123  *      This function shuts down a device interface and removes it
6124  *      from the kernel tables.
6125  *      If head not NULL, device is queued to be unregistered later.
6126  *
6127  *      Callers must hold the rtnl semaphore.  You may want
6128  *      unregister_netdev() instead of this.
6129  */
6130
6131 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6132 {
6133         ASSERT_RTNL();
6134
6135         if (head) {
6136                 list_move_tail(&dev->unreg_list, head);
6137         } else {
6138                 rollback_registered(dev);
6139                 /* Finish processing unregister after unlock */
6140                 net_set_todo(dev);
6141         }
6142 }
6143 EXPORT_SYMBOL(unregister_netdevice_queue);
6144
6145 /**
6146  *      unregister_netdevice_many - unregister many devices
6147  *      @head: list of devices
6148  */
6149 void unregister_netdevice_many(struct list_head *head)
6150 {
6151         struct net_device *dev;
6152
6153         if (!list_empty(head)) {
6154                 rollback_registered_many(head);
6155                 list_for_each_entry(dev, head, unreg_list)
6156                         net_set_todo(dev);
6157         }
6158 }
6159 EXPORT_SYMBOL(unregister_netdevice_many);
6160
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	Convenience wrapper: takes the rtnl semaphore, calls
 *	unregister_netdevice() and releases the semaphore again.  In
 *	general this is the function to use rather than
 *	unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6179
6180 /**
6181  *      dev_change_net_namespace - move device to different nethost namespace
6182  *      @dev: device
6183  *      @net: network namespace
6184  *      @pat: If not NULL name pattern to try if the current device name
6185  *            is already taken in the destination network namespace.
6186  *
6187  *      This function shuts down a device interface and moves it
6188  *      to a new network namespace. On success 0 is returned, on
6189  *      a failure a netagive errno code is returned.
6190  *
6191  *      Callers must hold the rtnl semaphore.
6192  */
6193
6194 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6195 {
6196         int err;
6197
6198         ASSERT_RTNL();
6199
6200         /* Don't allow namespace local devices to be moved. */
6201         err = -EINVAL;
6202         if (dev->features & NETIF_F_NETNS_LOCAL)
6203                 goto out;
6204
6205         /* Ensure the device has been registrered */
6206         err = -EINVAL;
6207         if (dev->reg_state != NETREG_REGISTERED)
6208                 goto out;
6209
6210         /* Get out if there is nothing todo */
6211         err = 0;
6212         if (net_eq(dev_net(dev), net))
6213                 goto out;
6214
6215         /* Pick the destination device name, and ensure
6216          * we can use it in the destination network namespace.
6217          */
6218         err = -EEXIST;
6219         if (__dev_get_by_name(net, dev->name)) {
6220                 /* We get here if we can't use the current device name */
6221                 if (!pat)
6222                         goto out;
6223                 if (dev_get_valid_name(dev, pat) < 0)
6224                         goto out;
6225         }
6226
6227         /*
6228          * And now a mini version of register_netdevice unregister_netdevice.
6229          */
6230
6231         /* If device is running close it first. */
6232         dev_close(dev);
6233
6234         /* And unlink it from device chain */
6235         err = -ENODEV;
6236         unlist_netdevice(dev);
6237
6238         synchronize_net();
6239
6240         /* Shutdown queueing discipline. */
6241         dev_shutdown(dev);
6242
6243         /* Notify protocols, that we are about to destroy
6244            this device. They should clean all the things.
6245
6246            Note that dev->reg_state stays at NETREG_REGISTERED.
6247            This is wanted because this way 8021q and macvlan know
6248            the device is just moving and can keep their slaves up.
6249         */
6250         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6251         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6252         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6253
6254         /*
6255          *      Flush the unicast and multicast chains
6256          */
6257         dev_uc_flush(dev);
6258         dev_mc_flush(dev);
6259
6260         /* Actually switch the network namespace */
6261         dev_net_set(dev, net);
6262
6263         /* If there is an ifindex conflict assign a new one */
6264         if (__dev_get_by_index(net, dev->ifindex)) {
6265                 int iflink = (dev->iflink == dev->ifindex);
6266                 dev->ifindex = dev_new_index(net);
6267                 if (iflink)
6268                         dev->iflink = dev->ifindex;
6269         }
6270
6271         /* Fixup kobjects */
6272         err = device_rename(&dev->dev, dev->name);
6273         WARN_ON(err);
6274
6275         /* Add the device back in the hashes */
6276         list_netdevice(dev);
6277
6278         /* Notify protocols, that a new device appeared. */
6279         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6280
6281         /*
6282          *      Prevent userspace races by waiting until the network
6283          *      device is fully setup before sending notifications.
6284          */
6285         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6286
6287         synchronize_net();
6288         err = 0;
6289 out:
6290         return err;
6291 }
6292 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6293
6294 static int dev_cpu_callback(struct notifier_block *nfb,
6295                             unsigned long action,
6296                             void *ocpu)
6297 {
6298         struct sk_buff **list_skb;
6299         struct sk_buff *skb;
6300         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6301         struct softnet_data *sd, *oldsd;
6302
6303         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6304                 return NOTIFY_OK;
6305
6306         local_irq_disable();
6307         cpu = smp_processor_id();
6308         sd = &per_cpu(softnet_data, cpu);
6309         oldsd = &per_cpu(softnet_data, oldcpu);
6310
6311         /* Find end of our completion_queue. */
6312         list_skb = &sd->completion_queue;
6313         while (*list_skb)
6314                 list_skb = &(*list_skb)->next;
6315         /* Append completion queue from offline CPU. */
6316         *list_skb = oldsd->completion_queue;
6317         oldsd->completion_queue = NULL;
6318
6319         /* Append output queue from offline CPU. */
6320         if (oldsd->output_queue) {
6321                 *sd->output_queue_tailp = oldsd->output_queue;
6322                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6323                 oldsd->output_queue = NULL;
6324                 oldsd->output_queue_tailp = &oldsd->output_queue;
6325         }
6326         /* Append NAPI poll list from offline CPU. */
6327         if (!list_empty(&oldsd->poll_list)) {
6328                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6329                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6330         }
6331
6332         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6333         local_irq_enable();
6334
6335         /* Process offline CPU's input_pkt_queue */
6336         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6337                 netif_rx(skb);
6338                 input_queue_head_incr(oldsd);
6339         }
6340         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6341                 netif_rx(skb);
6342                 input_queue_head_incr(oldsd);
6343         }
6344
6345         return NOTIFY_OK;
6346 }
6347
6348
6349 /**
6350  *      netdev_increment_features - increment feature set by one
6351  *      @all: current feature set
6352  *      @one: new feature set
6353  *      @mask: mask feature set
6354  *
6355  *      Computes a new feature set after adding a device with feature set
6356  *      @one to the master device with current feature set @all.  Will not
6357  *      enable anything that is off in @mask. Returns the new feature set.
6358  */
6359 netdev_features_t netdev_increment_features(netdev_features_t all,
6360         netdev_features_t one, netdev_features_t mask)
6361 {
6362         if (mask & NETIF_F_GEN_CSUM)
6363                 mask |= NETIF_F_ALL_CSUM;
6364         mask |= NETIF_F_VLAN_CHALLENGED;
6365
6366         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6367         all &= one | ~NETIF_F_ALL_FOR_ALL;
6368
6369         /* If one device supports hw checksumming, set for all. */
6370         if (all & NETIF_F_GEN_CSUM)
6371                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6372
6373         return all;
6374 }
6375 EXPORT_SYMBOL(netdev_increment_features);
6376
6377 static struct hlist_head *netdev_create_hash(void)
6378 {
6379         int i;
6380         struct hlist_head *hash;
6381
6382         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6383         if (hash != NULL)
6384                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6385                         INIT_HLIST_HEAD(&hash[i]);
6386
6387         return hash;
6388 }
6389
6390 /* Initialize per network namespace state */
6391 static int __net_init netdev_init(struct net *net)
6392 {
6393         INIT_LIST_HEAD(&net->dev_base_head);
6394
6395         net->dev_name_head = netdev_create_hash();
6396         if (net->dev_name_head == NULL)
6397                 goto err_name;
6398
6399         net->dev_index_head = netdev_create_hash();
6400         if (net->dev_index_head == NULL)
6401                 goto err_idx;
6402
6403         return 0;
6404
6405 err_idx:
6406         kfree(net->dev_name_head);
6407 err_name:
6408         return -ENOMEM;
6409 }
6410
6411 /**
6412  *      netdev_drivername - network driver for the device
6413  *      @dev: network device
6414  *
6415  *      Determine network driver for device.
6416  */
6417 const char *netdev_drivername(const struct net_device *dev)
6418 {
6419         const struct device_driver *driver;
6420         const struct device *parent;
6421         const char *empty = "";
6422
6423         parent = dev->dev.parent;
6424         if (!parent)
6425                 return empty;
6426
6427         driver = parent->driver;
6428         if (driver && driver->name)
6429                 return driver->name;
6430         return empty;
6431 }
6432
6433 int __netdev_printk(const char *level, const struct net_device *dev,
6434                            struct va_format *vaf)
6435 {
6436         int r;
6437
6438         if (dev && dev->dev.parent)
6439                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6440                                netdev_name(dev), vaf);
6441         else if (dev)
6442                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6443         else
6444                 r = printk("%s(NULL net_device): %pV", level, vaf);
6445
6446         return r;
6447 }
6448 EXPORT_SYMBOL(__netdev_printk);
6449
6450 int netdev_printk(const char *level, const struct net_device *dev,
6451                   const char *format, ...)
6452 {
6453         struct va_format vaf;
6454         va_list args;
6455         int r;
6456
6457         va_start(args, format);
6458
6459         vaf.fmt = format;
6460         vaf.va = &args;
6461
6462         r = __netdev_printk(level, dev, &vaf);
6463         va_end(args);
6464
6465         return r;
6466 }
6467 EXPORT_SYMBOL(netdev_printk);
6468
6469 #define define_netdev_printk_level(func, level)                 \
6470 int func(const struct net_device *dev, const char *fmt, ...)    \
6471 {                                                               \
6472         int r;                                                  \
6473         struct va_format vaf;                                   \
6474         va_list args;                                           \
6475                                                                 \
6476         va_start(args, fmt);                                    \
6477                                                                 \
6478         vaf.fmt = fmt;                                          \
6479         vaf.va = &args;                                         \
6480                                                                 \
6481         r = __netdev_printk(level, dev, &vaf);                  \
6482         va_end(args);                                           \
6483                                                                 \
6484         return r;                                               \
6485 }                                                               \
6486 EXPORT_SYMBOL(func);
6487
6488 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6489 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6490 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6491 define_netdev_printk_level(netdev_err, KERN_ERR);
6492 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6493 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6494 define_netdev_printk_level(netdev_info, KERN_INFO);
6495
6496 static void __net_exit netdev_exit(struct net *net)
6497 {
6498         kfree(net->dev_name_head);
6499         kfree(net->dev_index_head);
6500 }
6501
6502 static struct pernet_operations __net_initdata netdev_net_ops = {
6503         .init = netdev_init,
6504         .exit = netdev_exit,
6505 };
6506
6507 static void __net_exit default_device_exit(struct net *net)
6508 {
6509         struct net_device *dev, *aux;
6510         /*
6511          * Push all migratable network devices back to the
6512          * initial network namespace
6513          */
6514         rtnl_lock();
6515         for_each_netdev_safe(net, dev, aux) {
6516                 int err;
6517                 char fb_name[IFNAMSIZ];
6518
6519                 /* Ignore unmoveable devices (i.e. loopback) */
6520                 if (dev->features & NETIF_F_NETNS_LOCAL)
6521                         continue;
6522
6523                 /* Leave virtual devices for the generic cleanup */
6524                 if (dev->rtnl_link_ops)
6525                         continue;
6526
6527                 /* Push remaining network devices to init_net */
6528                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6529                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6530                 if (err) {
6531                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6532                                 __func__, dev->name, err);
6533                         BUG();
6534                 }
6535         }
6536         rtnl_unlock();
6537 }
6538
6539 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6540 {
6541         /* At exit all network devices most be removed from a network
6542          * namespace.  Do this in the reverse order of registration.
6543          * Do this across as many network namespaces as possible to
6544          * improve batching efficiency.
6545          */
6546         struct net_device *dev;
6547         struct net *net;
6548         LIST_HEAD(dev_kill_list);
6549
6550         rtnl_lock();
6551         list_for_each_entry(net, net_list, exit_list) {
6552                 for_each_netdev_reverse(net, dev) {
6553                         if (dev->rtnl_link_ops)
6554                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6555                         else
6556                                 unregister_netdevice_queue(dev, &dev_kill_list);
6557                 }
6558         }
6559         unregister_netdevice_many(&dev_kill_list);
6560         list_del(&dev_kill_list);
6561         rtnl_unlock();
6562 }
6563
6564 static struct pernet_operations __net_initdata default_device_ops = {
6565         .exit = default_device_exit,
6566         .exit_batch = default_device_exit_batch,
6567 };
6568
6569 /*
6570  *      Initialize the DEV module. At boot time this walks the device list and
6571  *      unhooks any devices that fail to initialise (normally hardware not
6572  *      present) and leaves us with a valid list of present and active devices.
6573  *
6574  */
6575
6576 /*
6577  *       This is called single threaded during boot, so no need
6578  *       to take the rtnl semaphore.
6579  */
6580 static int __init net_dev_init(void)
6581 {
6582         int i, rc = -ENOMEM;
6583
6584         BUG_ON(!dev_boot_phase);
6585
6586         if (dev_proc_init())
6587                 goto out;
6588
6589         if (netdev_kobject_init())
6590                 goto out;
6591
6592         INIT_LIST_HEAD(&ptype_all);
6593         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6594                 INIT_LIST_HEAD(&ptype_base[i]);
6595
6596         if (register_pernet_subsys(&netdev_net_ops))
6597                 goto out;
6598
6599         /*
6600          *      Initialise the packet receive queues.
6601          */
6602
6603         for_each_possible_cpu(i) {
6604                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6605
6606                 memset(sd, 0, sizeof(*sd));
6607                 skb_queue_head_init(&sd->input_pkt_queue);
6608                 skb_queue_head_init(&sd->process_queue);
6609                 sd->completion_queue = NULL;
6610                 INIT_LIST_HEAD(&sd->poll_list);
6611                 sd->output_queue = NULL;
6612                 sd->output_queue_tailp = &sd->output_queue;
6613 #ifdef CONFIG_RPS
6614                 sd->csd.func = rps_trigger_softirq;
6615                 sd->csd.info = sd;
6616                 sd->csd.flags = 0;
6617                 sd->cpu = i;
6618 #endif
6619
6620                 sd->backlog.poll = process_backlog;
6621                 sd->backlog.weight = weight_p;
6622                 sd->backlog.gro_list = NULL;
6623                 sd->backlog.gro_count = 0;
6624         }
6625
6626         dev_boot_phase = 0;
6627
6628         /* The loopback device is special if any other network devices
6629          * is present in a network namespace the loopback device must
6630          * be present. Since we now dynamically allocate and free the
6631          * loopback device ensure this invariant is maintained by
6632          * keeping the loopback device as the first device on the
6633          * list of network devices.  Ensuring the loopback devices
6634          * is the first device that appears and the last network device
6635          * that disappears.
6636          */
6637         if (register_pernet_device(&loopback_net_ops))
6638                 goto out;
6639
6640         if (register_pernet_device(&default_device_ops))
6641                 goto out;
6642
6643         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6644         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6645
6646         hotcpu_notifier(dev_cpu_callback, 0);
6647         dst_init();
6648         dev_mcast_init();
6649         rc = 0;
6650 out:
6651         return rc;
6652 }
6653
6654 subsys_initcall(net_dev_init);
6655
6656 static int __init initialize_hashrnd(void)
6657 {
6658         get_random_bytes(&hashrnd, sizeof(hashrnd));
6659         return 0;
6660 }
6661
6662 late_initcall_sync(initialize_hashrnd);
6663