/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell :	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <linux/if_bridge.h>
#include <linux/if_macvlan.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/*
 *	The list of packet types we will receive (as opposed to discard)
 *	and the routines to invoke.
 *
 *	Why 16. Because with 16 the only overlap we get on a hash of the
 *	low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *	NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *	       sure which should go first, but I bet it won't make much
 *	       difference if we are running VLANs.  The good news is that
 *	       this protocol won't be in the list unless compiled in, so
 *	       the average user (w/out VLANs) will not be adversely affected.
 *	       --BLG
 *
 *		0800	IP
 *		8100	802.1Q VLAN
 *		0001	802.3
 *		0002	AX.25
 *		0004	802.2
 *		8035	RARP
 *		0005	SNAP
 *		0805	X.25
 *		0806	ARP
 *		8137	IPX
 *		0009	Localtalk
 *		86DD	IPv6
 */

#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;	/* Taps */

#ifdef CONFIG_NET_DMA
struct net_dma {
	struct dma_client	client;
	spinlock_t		lock;
	cpumask_t		channel_mask;
	struct dma_chan		**channels;
};

static enum dma_state_client
netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
		 enum dma_state state);

static struct net_dma net_dma = {
	.client = {
		.event_callback = netdev_dma_event,
	},
};
#endif

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading.
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example, the usages in register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);

EXPORT_SYMBOL(dev_base_lock);

#define NETDEV_HASHBITS	8
#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
}

/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail(&dev->dev_list, &net->dev_base_head);
	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);
	return 0;
}

/* Device list removal */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del(&dev->dev_list);
	hlist_del(&dev->name_hlist);
	hlist_del(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU(struct softnet_data, softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE};

static const char *netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in the hash buckets, and checking of protocol handlers
 *	MUST start from the promiscuous ptype_all chain in net_bh.
 *	This is true now; do not change it.
 *	Explanation: if a packet-mangling protocol handler were first on
 *	the list, it could not tell that the packet is cloned and should
 *	be copied-on-write, so it would modify the clone and subsequent
 *	readers would see a broken packet.
 *						--ANK (980803)
 */

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep and therefore cannot guarantee that all
 *	CPUs that are in the middle of receiving packets will see the new
 *	packet type (until the next packet is received).
 */

void dev_add_pack(struct packet_type *pt)
{
	int hash;

	spin_lock_bh(&ptype_lock);
	if (pt->type == htons(ETH_P_ALL))
		list_add_rcu(&pt->list, &ptype_all);
	else {
		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
		list_add_rcu(&pt->list, &ptype_base[hash]);
	}
	spin_unlock_bh(&ptype_lock);
}
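
/*
 * Example (illustrative only, not part of this file): a minimal sketch of
 * how a protocol might register a tap with dev_add_pack().  The handler
 * my_pt_rcv and the my_ptype instance below are hypothetical names.
 *
 *	static int my_pt_rcv(struct sk_buff *skb, struct net_device *dev,
 *			     struct packet_type *pt,
 *			     struct net_device *orig_dev)
 *	{
 *		// ... inspect skb ...
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_ptype = {
 *		.type = htons(ETH_P_ALL),	// tap: see every packet
 *		.func = my_pt_rcv,
 *	};
 *
 *	dev_add_pack(&my_ptype);
 *	...
 *	dev_remove_pack(&my_ptype);	// sleeps; see below
 */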

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head;
	struct packet_type *pt1;

	spin_lock_bh(&ptype_lock);

	if (pt->type == htons(ETH_P_ALL))
		head = &ptype_all;
	else
		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
out:
	spin_unlock_bh(&ptype_lock);
}

/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}

/*******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add - add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine
 *	for all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check - check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings are found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq	= s[i].map.irq;
			dev->base_addr	= s[i].map.base_addr;
			dev->mem_start	= s[i].map.mem_start;
			dev->mem_end	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}


/**
 *	netdev_boot_base - get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings are found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If the device is already registered, then return a base of 1
	 * to indicate not to probe for this interface.
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
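
/*
 * Example (illustrative, made-up values): booting with
 *
 *	netdev=5,0x340,0,0,eth0
 *
 * stores irq=5 and base_addr=0x340 for "eth0" in dev_boot_setup.  The
 * leading integers are consumed in the order irq, base_addr, mem_start,
 * mem_end, and the trailing string left by get_options() is the name.
 */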

/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	__dev_get_by_name - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct hlist_node *p;

	hlist_for_each(p, dev_name_hash(net, name)) {
		struct net_device *dev
			= hlist_entry(p, struct net_device, name_hlist);
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;
	}
	return NULL;
}

/**
 *	dev_get_by_name - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	read_lock(&dev_base_lock);
	dev = __dev_get_by_name(net, name);
	if (dev)
		dev_hold(dev);
	read_unlock(&dev_base_lock);
	return dev;
}
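
/*
 * Example (illustrative): a typical dev_get_by_name() call site.  The
 * surrounding code is hypothetical; the pattern to note is the mandatory
 * dev_put() once the reference is no longer needed.
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *	if (dev) {
 *		// ... use dev ...
 *		dev_put(dev);
 *	}
 */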

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct hlist_node *p;

	hlist_for_each(p, dev_index_hash(net, ifindex)) {
		struct net_device *dev
			= hlist_entry(p, struct net_device, index_hlist);
		if (dev->ifindex == ifindex)
			return dev;
	}
	return NULL;
}


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	read_lock(&dev_base_lock);
	dev = __dev_get_by_index(net, ifindex);
	if (dev)
		dev_hold(dev);
	read_unlock(&dev_base_lock);
	return dev;
}

/**
 *	dev_getbyhwaddr - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device. The caller must hold the
 *	rtnl semaphore. The returned device has not had its ref count
 *	increased and the caller must therefore be careful about locking.
 *
 *	BUGS:
 *	If the API was consistent this would be __dev_get_by_hwaddr
 */

struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
{
	struct net_device *dev;

	ASSERT_RTNL();

	for_each_netdev(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}

EXPORT_SYMBOL(dev_getbyhwaddr);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}

EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	rtnl_lock();
	dev = __dev_getfirstbyhwtype(net, type);
	if (dev)
		dev_hold(dev);
	rtnl_unlock();
	return dev;
}

EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if no
 *	device is found, otherwise a pointer to the device. The device
 *	returned has had a reference added and the pointer is safe until the
 *	user calls dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	read_lock(&dev_base_lock);
	for_each_netdev(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	}
	read_unlock(&dev_base_lock);
	return ret;
}

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
int dev_valid_name(const char *name)
{
	if (*name == '\0')
		return 0;
	if (strlen(name) >= IFNAMSIZ)
		return 0;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return 0;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return 0;
		name++;
	}
	return 1;
}

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string, e.g. "lt%d", it will try to find a suitable
 *	id. It scans the list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be exactly one "%d" and no other
		 * "%" characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string, e.g. "lt%d", it will try to find a suitable
 *	id. It scans the list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
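
/*
 * Example (illustrative): a driver using a made-up "foo%d" naming scheme
 * would call
 *
 *	err = dev_alloc_name(dev, "foo%d");
 *
 * With foo0 and foo1 already registered, dev->name becomes "foo2" and err
 * is 2 (the unit number); on failure a negative errno is returned instead.
 */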


/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change the name of a device.  A format string such as "eth%d"
 *	can be passed for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	if (!dev_valid_name(newname))
		return -EINVAL;

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
		return 0;

	memcpy(oldname, dev->name, IFNAMSIZ);

	if (strchr(newname, '%')) {
		err = dev_alloc_name(dev, newname);
		if (err < 0)
			return err;
	} else if (__dev_get_by_name(net, newname))
		return -EEXIST;
	else
		strlcpy(dev->name, newname, IFNAMSIZ);

rollback:
	/* For now only devices in the initial network namespace
	 * are in sysfs.
	 */
	if (net == &init_net) {
		ret = device_rename(&dev->dev, dev->name);
		if (ret) {
			memcpy(dev->name, oldname, IFNAMSIZ);
			return ret;
		}
	}

	write_lock_bh(&dev_base_lock);
	hlist_del(&dev->name_hlist);
	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		if (err) {
			printk(KERN_ERR
			       "%s: name change rollback failed: %d.\n",
			       dev->name, ret);
		} else {
			err = ret;
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	char *new_ifalias;

	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		if (dev->ifalias) {
			kfree(dev->ifalias);
			dev->ifalias = NULL;
		}
		return 0;
	}

	/* Use a temporary so the old buffer is not leaked if krealloc fails. */
	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!new_ifalias)
		return -ENOMEM;
	dev->ifalias = new_ifalias;

	strlcpy(dev->ifalias, alias, len + 1);
	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
	}
}

void netdev_bonding_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);

/**
 *	dev_load - load a network module
 *	@net: the applicable net namespace
 *	@name: name of interface
 *
 *	If a network interface is not present and the process has suitable
 *	privileges, this function loads the module. If module loading is not
 *	available in this kernel then it becomes a nop.
 */

void dev_load(struct net *net, const char *name)
{
	struct net_device *dev;

	read_lock(&dev_base_lock);
	dev = __dev_get_by_name(net, name);
	read_unlock(&dev_base_lock);

	if (!dev && capable(CAP_SYS_MODULE))
		request_module("%s", name);
}

/**
 *	dev_open - prepare an interface for use.
 *	@dev: device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret = 0;

	ASSERT_RTNL();

	/*
	 *	Is it already up?
	 */

	if (dev->flags & IFF_UP)
		return 0;

	/*
	 *	Is it even present?
	 */
	if (!netif_device_present(dev))
		return -ENODEV;

	/*
	 *	Call device private open method
	 */
	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	/*
	 *	If it went open OK then:
	 */

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		/*
		 *	Set the flags.
		 */
		dev->flags |= IFF_UP;

		/*
		 *	Initialize multicasting status
		 */
		dev_set_rx_mode(dev);

		/*
		 *	Wake up the transmit queue engine
		 */
		dev_activate(dev);

		/*
		 *	... and announce new interface.
		 */
		call_netdevice_notifiers(NETDEV_UP, dev);
	}

	return ret;
}
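
/*
 * Example (illustrative): bringing an interface up from kernel code.  The
 * caller must hold the RTNL lock, as dev_open() asserts:
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */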

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	ASSERT_RTNL();

	might_sleep();

	if (!(dev->flags & IFF_UP))
		return 0;

	/*
	 *	Tell people we are going down, so that they can
	 *	prepare for death while the device is still operating.
	 */
	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

	clear_bit(__LINK_STATE_START, &dev->state);

	/* Synchronize to scheduled poll. We cannot touch the poll list;
	 * it can even be on a different cpu. So just clear netif_running().
	 *
	 * dev->stop() will invoke napi_disable() on all of its
	 * napi_struct instances on this device.
	 */
	smp_mb__after_clear_bit(); /* Commit netif_running(). */

	dev_deactivate(dev);

	/*
	 *	Call the device specific close. This cannot fail and is
	 *	done only when the device is UP.
	 *
	 *	We allow it to be called even after a DETACH hot-plug
	 *	event.
	 */
	if (ops->ndo_stop)
		ops->ndo_stop(dev);

	/*
	 *	Device is now down.
	 */

	dev->flags &= ~IFF_UP;

	/*
	 *	Tell people we are down
	 */
	call_netdevice_notifiers(NETDEV_DOWN, dev);

	return 0;
}


/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
	    dev->ethtool_ops->set_flags) {
		u32 flags = dev->ethtool_ops->get_flags(dev);
		if (flags & ETH_FLAG_LRO) {
			flags &= ~ETH_FLAG_LRO;
			dev->ethtool_ops->set_flags(dev, flags);
		}
	}
	WARN_ON(dev->features & NETIF_F_LRO);
}
EXPORT_SYMBOL(dev_disable_lro);


static int dev_boot_phase = 1;

/*
 *	Device change register/unregister. These are not inline or static
 *	as we export them to the world.
 */

/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered, all registration and up events are replayed
 *	to the new notifier to allow it to have a race-free
 *	view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			nb->notifier_call(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				break;

			if (dev->flags & IFF_UP) {
				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
				nb->notifier_call(nb, NETDEV_DOWN, dev);
			}
			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
		}
	}

	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
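
/*
 * Example (illustrative): a minimal notifier.  The callback and the
 * my_netdev_nb block are hypothetical names; in this kernel the void
 * pointer passed to the callback is the struct net_device itself.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UP)
 *			printk(KERN_INFO "%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 */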

/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	rtnl_unlock();
	return err;
}

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	return raw_notifier_call_chain(&netdev_chain, val, dev);
}

/* When > 0 there are consumers of rx skb time stamps */
static atomic_t netstamp_needed = ATOMIC_INIT(0);

void net_enable_timestamp(void)
{
	atomic_inc(&netstamp_needed);
}

void net_disable_timestamp(void)
{
	atomic_dec(&netstamp_needed);
}

static inline void net_timestamp(struct sk_buff *skb)
{
	if (atomic_read(&netstamp_needed))
		__net_timestamp(skb);
	else
		skb->tstamp.tv64 = 0;
}

/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;

	net_timestamp(skb);

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if ((ptype->dev == dev || !ptype->dev) &&
		    (ptype->af_packet_priv == NULL ||
		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			if (!skb2)
				break;

			/* skb->nh should be correctly
			 * set by the sender, so that the second statement
			 * is just protection against buggy protocols.
			 */
			skb_reset_mac_header(skb2);

			if (skb_network_header(skb2) < skb2->data ||
			    skb2->network_header > skb2->tail) {
				if (net_ratelimit())
					printk(KERN_CRIT "protocol %04x is "
					       "buggy, dev %s\n",
					       ntohs(skb2->protocol),
					       dev->name);
				skb_reset_network_header(skb2);
			}

			skb2->transport_header = skb2->network_header;
			skb2->pkt_type = PACKET_OUTGOING;
			ptype->func(skb2, skb->dev, ptype, skb->dev);
		}
	}
	rcu_read_unlock();
}


static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = &__get_cpu_var(softnet_data);
	q->next_sched = sd->output_queue;
	sd->output_queue = q;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}

void __netif_schedule(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);

void dev_kfree_skb_irq(struct sk_buff *skb)
{
	if (atomic_dec_and_test(&skb->users)) {
		struct softnet_data *sd;
		unsigned long flags;

		local_irq_save(flags);
		sd = &__get_cpu_var(softnet_data);
		skb->next = sd->completion_queue;
		sd->completion_queue = skb;
		raise_softirq_irqoff(NET_TX_SOFTIRQ);
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL(dev_kfree_skb_irq);

void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled())
		dev_kfree_skb_irq(skb);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);


/**
 *	netif_device_detach - mark device as removed
 *	@dev: network device
 *
 *	Mark device as removed from system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_stop_queue(dev);
	}
}
EXPORT_SYMBOL(netif_device_detach);

/**
 *	netif_device_attach - mark device as attached
 *	@dev: network device
 *
 *	Mark device as attached to the system and restart it if needed.
 */
void netif_device_attach(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_wake_queue(dev);
		__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_device_attach);

static bool can_checksum_protocol(unsigned long features, __be16 protocol)
{
	return ((features & NETIF_F_GEN_CSUM) ||
		((features & NETIF_F_IP_CSUM) &&
		 protocol == htons(ETH_P_IP)) ||
		((features & NETIF_F_IPV6_CSUM) &&
		 protocol == htons(ETH_P_IPV6)));
}

static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
{
	if (can_checksum_protocol(dev->features, skb->protocol))
		return true;

	if (skb->protocol == htons(ETH_P_8021Q)) {
		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
		if (can_checksum_protocol(dev->features & dev->vlan_features,
					  veh->h_vlan_encapsulated_proto))
			return true;
	}

	return false;
}

/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		/* Let GSO fix up the checksum. */
		goto out_set_summed;
	}

	offset = skb->csum_start - skb_headroom(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}

/**
 *	skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int err;

	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) {
		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				err = ptype->gso_send_check(skb);
				segs = ERR_PTR(err);
				if (err || skb_gso_ok(skb, features))
					break;
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}

EXPORT_SYMBOL(skb_gso_segment);

/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (net_ratelimit()) {
		printk(KERN_ERR "%s: hw csum failure.\n",
			dev ? dev->name : "<unknown>");
		dump_stack();
	}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif

/* Actually, we should eliminate this check as soon as we know that:
 * 1. The IOMMU is present and allows mapping of all the memory.
 * 2. No high memory really exists on this machine.
 */

static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;

	if (dev->features & NETIF_F_HIGHDMA)
		return 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		if (PageHighMem(skb_shinfo(skb)->frags[i].page))
			return 1;

#endif
	return 0;
}

struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)

static void dev_gso_skb_destructor(struct sk_buff *skb)
{
	struct dev_gso_cb *cb;

	do {
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;
		kfree_skb(nskb);
	} while (skb->next);

	cb = DEV_GSO_CB(skb);
	if (cb->destructor)
		cb->destructor(skb);
}

/**
 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
 *	@skb: buffer to segment
 *
 *	This function segments the given skb and stores the list of segments
 *	in skb->next.
 */
static int dev_gso_segment(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct sk_buff *segs;
	int features = dev->features & ~(illegal_highdma(dev, skb) ?
					 NETIF_F_SG : 0);

	segs = skb_gso_segment(skb, features);

	/* Verifying header integrity only. */
	if (!segs)
		return 0;

	if (IS_ERR(segs))
		return PTR_ERR(segs);

	skb->next = segs;
	DEV_GSO_CB(skb)->destructor = skb->destructor;
	skb->destructor = dev_gso_skb_destructor;

	return 0;
}

int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	prefetch(&dev->netdev_ops->ndo_start_xmit);
	if (likely(!skb->next)) {
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		if (netif_needs_gso(dev, skb)) {
			if (unlikely(dev_gso_segment(skb)))
				goto out_kfree_skb;
			if (skb->next)
				goto gso;
		}

		return ops->ndo_start_xmit(skb, dev);
	}

gso:
	do {
		struct sk_buff *nskb = skb->next;
		int rc;

		skb->next = nskb->next;
		nskb->next = NULL;
		rc = ops->ndo_start_xmit(nskb, dev);
		if (unlikely(rc)) {
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

	skb->destructor = DEV_GSO_CB(skb)->destructor;

out_kfree_skb:
	kfree_skb(skb);
	return 0;
}

static u32 simple_tx_hashrnd;
static int simple_tx_hashrnd_initialized = 0;

static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb)
{
	u32 addr1, addr2, ports;
	u32 hash, ihl;
	u8 ip_proto = 0;

	if (unlikely(!simple_tx_hashrnd_initialized)) {
		get_random_bytes(&simple_tx_hashrnd, 4);
		simple_tx_hashrnd_initialized = 1;
	}

	switch (skb->protocol) {
	case htons(ETH_P_IP):
		if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
			ip_proto = ip_hdr(skb)->protocol;
		addr1 = ip_hdr(skb)->saddr;
		addr2 = ip_hdr(skb)->daddr;
		ihl = ip_hdr(skb)->ihl;
		break;
	case htons(ETH_P_IPV6):
		ip_proto = ipv6_hdr(skb)->nexthdr;
		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
		ihl = (40 >> 2);
		break;
	default:
		return 0;
	}


	switch (ip_proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_DCCP:
	case IPPROTO_ESP:
	case IPPROTO_AH:
	case IPPROTO_SCTP:
	case IPPROTO_UDPLITE:
		ports = *((u32 *) (skb_network_header(skb) + (ihl * 4)));
		break;

	default:
		ports = 0;
		break;
	}

	hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd);

	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
}
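
/*
 * The final line maps the 32-bit jhash value onto [0, real_num_tx_queues)
 * without a modulo: for a uniformly distributed 32-bit hash,
 * (hash * n) / 2^32 is uniform over 0..n-1.  For example (made-up numbers),
 * hash = 0x80000000 with n = 4 queues yields ((u64)0x80000000 * 4) >> 32 = 2.
 */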

static struct netdev_queue *dev_pick_tx(struct net_device *dev,
					struct sk_buff *skb)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	u16 queue_index = 0;

	if (ops->ndo_select_queue)
		queue_index = ops->ndo_select_queue(dev, skb);
	else if (dev->real_num_tx_queues > 1)
		queue_index = simple_tx_hash(dev, skb);

	skb_set_queue_mapping(skb, queue_index);
	return netdev_get_tx_queue(dev, queue_index);
}

/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	/* GSO will handle the following emulations directly. */
	if (netif_needs_gso(dev, skb))
		goto gso;

	if (skb_shinfo(skb)->frag_list &&
	    !(dev->features & NETIF_F_FRAGLIST) &&
	    __skb_linearize(skb))
		goto out_kfree_skb;

	/* Fragmented skb is linearized if device does not support SG,
	 * or if at least one of fragments is in highmem and device
	 * does not support DMA from it.
	 */
	if (skb_shinfo(skb)->nr_frags &&
	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
	    __skb_linearize(skb))
		goto out_kfree_skb;

	/* If packet is not checksummed and device does not support
	 * checksumming for this protocol, complete checksumming here.
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		skb_set_transport_header(skb, skb->csum_start -
					      skb_headroom(skb));
		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
			goto out_kfree_skb;
	}

gso:
	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	txq = dev_pick_tx(dev, skb);
	q = rcu_dereference(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	if (q->enqueue) {
		spinlock_t *root_lock = qdisc_lock(q);

		spin_lock(root_lock);

		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
			kfree_skb(skb);
			rc = NET_XMIT_DROP;
		} else {
			rc = qdisc_enqueue_root(skb, q);
			qdisc_run(q);
		}
		spin_unlock(root_lock);

		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible that they rely on the protection
	   we provide here.

	   Check this and take the lock: it is not prone to deadlocks.
	   Or take the noqueue qdisc path instead; it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		if (txq->xmit_lock_owner != cpu) {

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_tx_queue_stopped(txq)) {
				rc = 0;
				if (!dev_hard_start_xmit(skb, dev, txq)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately */
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	rc = -ENETDOWN;
	rcu_read_unlock_bh();

out_kfree_skb:
	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
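
/*
 * Example (illustrative): the usual calling pattern from a protocol.  The
 * skb construction is elided; the point is that the skb is consumed
 * regardless of the return value, so it must not be touched afterwards:
 *
 *	skb->dev = dev;
 *	rc = dev_queue_xmit(skb);
 *	if (rc)
 *		// count the error, but do not free or reuse skb
 */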


/*=======================================================================
			Receiver routines
  =======================================================================*/

int netdev_max_backlog __read_mostly = 1000;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;		/* old backlog weight */

DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1926
1927
1928 /**
1929 * netif_rx - post buffer to the network code
1930 * @skb: buffer to post
1931 *
1932 * This function receives a packet from a device driver and queues it for
1933 * the upper (protocol) levels to process. It always succeeds. The buffer
1934 * may be dropped during processing for congestion control or by the
1935 * protocol layers.
1936 *
1937 * return values:
1938 * NET_RX_SUCCESS (no congestion)
1939 * NET_RX_DROP (packet was dropped)
1940 *
1941 */
1942
1943 int netif_rx(struct sk_buff *skb)
1944 {
1945 struct softnet_data *queue;
1946 unsigned long flags;
1947
1948 /* if netpoll wants it, pretend we never saw it */
1949 if (netpoll_rx(skb))
1950 return NET_RX_DROP;
1951
1952 if (!skb->tstamp.tv64)
1953 net_timestamp(skb);
1954
1955 /*
1956 * The code is rearranged so that the path is the most
1957 * short when CPU is congested, but is still operating.
1958 */
1959 local_irq_save(flags);
1960 queue = &__get_cpu_var(softnet_data);
1961
1962 __get_cpu_var(netdev_rx_stat).total++;
1963 if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1964 if (queue->input_pkt_queue.qlen) {
1965 enqueue:
1966 __skb_queue_tail(&queue->input_pkt_queue, skb);
1967 local_irq_restore(flags);
1968 return NET_RX_SUCCESS;
1969 }
1970
1971 napi_schedule(&queue->backlog);
1972 goto enqueue;
1973 }
1974
1975 __get_cpu_var(netdev_rx_stat).dropped++;
1976 local_irq_restore(flags);
1977
1978 kfree_skb(skb);
1979 return NET_RX_DROP;
1980 }
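
/*
 * Example (illustrative sketch, not part of this file): how a non-NAPI
 * driver typically hands a received frame to netif_rx() from its
 * interrupt handler; process-context callers use netif_rx_ni() below
 * instead.  The "my_*" names and the copy from a device buffer are
 * hypothetical.
 */
#if 0
static void my_card_rx(struct net_device *dev, const void *buf, int len)
{
	struct sk_buff *skb = dev_alloc_skb(len + NET_IP_ALIGN);

	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}
	skb_reserve(skb, NET_IP_ALIGN);		/* align the IP header */
	memcpy(skb_put(skb, len), buf, len);	/* copy out of the device */
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);				/* queue for softirq delivery */
}
#endif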
1981
1982 int netif_rx_ni(struct sk_buff *skb)
1983 {
1984 int err;
1985
1986 preempt_disable();
1987 err = netif_rx(skb);
1988 if (local_softirq_pending())
1989 do_softirq();
1990 preempt_enable();
1991
1992 return err;
1993 }
1994
1995 EXPORT_SYMBOL(netif_rx_ni);
1996
1997 static void net_tx_action(struct softirq_action *h)
1998 {
1999 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2000
2001 if (sd->completion_queue) {
2002 struct sk_buff *clist;
2003
2004 local_irq_disable();
2005 clist = sd->completion_queue;
2006 sd->completion_queue = NULL;
2007 local_irq_enable();
2008
2009 while (clist) {
2010 struct sk_buff *skb = clist;
2011 clist = clist->next;
2012
2013 WARN_ON(atomic_read(&skb->users));
2014 __kfree_skb(skb);
2015 }
2016 }
2017
2018 if (sd->output_queue) {
2019 struct Qdisc *head;
2020
2021 local_irq_disable();
2022 head = sd->output_queue;
2023 sd->output_queue = NULL;
2024 local_irq_enable();
2025
2026 while (head) {
2027 struct Qdisc *q = head;
2028 spinlock_t *root_lock;
2029
2030 head = head->next_sched;
2031
2032 root_lock = qdisc_lock(q);
2033 if (spin_trylock(root_lock)) {
2034 smp_mb__before_clear_bit();
2035 clear_bit(__QDISC_STATE_SCHED,
2036 &q->state);
2037 qdisc_run(q);
2038 spin_unlock(root_lock);
2039 } else {
2040 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2041 &q->state)) {
2042 __netif_reschedule(q);
2043 } else {
2044 smp_mb__before_clear_bit();
2045 clear_bit(__QDISC_STATE_SCHED,
2046 &q->state);
2047 }
2048 }
2049 }
2050 }
2051 }
2052
2053 static inline int deliver_skb(struct sk_buff *skb,
2054 struct packet_type *pt_prev,
2055 struct net_device *orig_dev)
2056 {
2057 atomic_inc(&skb->users);
2058 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2059 }
2060
2061 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
2062 /* These hooks are defined here for ATM */
2063 struct net_bridge;
2064 struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
2065 unsigned char *addr);
2066 void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;
2067
2068 /*
2069  * If the bridge module is loaded, call the bridging hook.
2070  * Returns NULL if the packet was consumed.
2071  */
2072 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2073 struct sk_buff *skb) __read_mostly;
2074 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2075 struct packet_type **pt_prev, int *ret,
2076 struct net_device *orig_dev)
2077 {
2078 struct net_bridge_port *port;
2079
2080 if (skb->pkt_type == PACKET_LOOPBACK ||
2081 (port = rcu_dereference(skb->dev->br_port)) == NULL)
2082 return skb;
2083
2084 if (*pt_prev) {
2085 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2086 *pt_prev = NULL;
2087 }
2088
2089 return br_handle_frame_hook(port, skb);
2090 }
2091 #else
2092 #define handle_bridge(skb, pt_prev, ret, orig_dev) (skb)
2093 #endif
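
/*
 * Example (illustrative sketch): the bridge module installs its frame
 * hook at module init and clears it again on unload, roughly the way
 * net/bridge/br.c does it:
 *
 *	br_handle_frame_hook = br_handle_frame;		(module init)
 *	br_handle_frame_hook = NULL;			(module exit)
 */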
2094
2095 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2096 struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
2097 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2098
2099 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2100 struct packet_type **pt_prev,
2101 int *ret,
2102 struct net_device *orig_dev)
2103 {
2104 if (skb->dev->macvlan_port == NULL)
2105 return skb;
2106
2107 if (*pt_prev) {
2108 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2109 *pt_prev = NULL;
2110 }
2111 return macvlan_handle_frame_hook(skb);
2112 }
2113 #else
2114 #define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb)
2115 #endif
2116
2117 #ifdef CONFIG_NET_CLS_ACT
2118 /* TODO: Maybe we should just force sch_ingress to be compiled in
2119  * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
2120  * instructions (a compare and two extra stores) when it is off
2121  * but CONFIG_NET_CLS_ACT is on.
2122  * NOTE: This doesn't remove any functionality; if you don't have
2123  * the ingress scheduler, you just can't add policies on ingress.
2124  *
2125  */
2126 static int ing_filter(struct sk_buff *skb)
2127 {
2128 struct net_device *dev = skb->dev;
2129 u32 ttl = G_TC_RTTL(skb->tc_verd);
2130 struct netdev_queue *rxq;
2131 int result = TC_ACT_OK;
2132 struct Qdisc *q;
2133
2134 if (MAX_RED_LOOP < ttl++) {
2135 printk(KERN_WARNING
2136 "Redir loop detected Dropping packet (%d->%d)\n",
2137 skb->iif, dev->ifindex);
2138 return TC_ACT_SHOT;
2139 }
2140
2141 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2142 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2143
2144 rxq = &dev->rx_queue;
2145
2146 q = rxq->qdisc;
2147 if (q != &noop_qdisc) {
2148 spin_lock(qdisc_lock(q));
2149 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2150 result = qdisc_enqueue_root(skb, q);
2151 spin_unlock(qdisc_lock(q));
2152 }
2153
2154 return result;
2155 }
2156
2157 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2158 struct packet_type **pt_prev,
2159 int *ret, struct net_device *orig_dev)
2160 {
2161 if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2162 goto out;
2163
2164 if (*pt_prev) {
2165 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2166 *pt_prev = NULL;
2167 } else {
2168 /* Huh? Why does turning on AF_PACKET affect this? */
2169 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2170 }
2171
2172 switch (ing_filter(skb)) {
2173 case TC_ACT_SHOT:
2174 case TC_ACT_STOLEN:
2175 kfree_skb(skb);
2176 return NULL;
2177 }
2178
2179 out:
2180 skb->tc_verd = 0;
2181 return skb;
2182 }
2183 #endif
2184
2185 /**
2186 * netif_nit_deliver - deliver received packets to network taps
2187 * @skb: buffer
2188 *
2189 * This function is used to deliver incoming packets to network
2190 * taps. It should be used when the normal netif_receive_skb path
2191 * is bypassed, for example because of VLAN acceleration.
2192 */
2193 void netif_nit_deliver(struct sk_buff *skb)
2194 {
2195 struct packet_type *ptype;
2196
2197 if (list_empty(&ptype_all))
2198 return;
2199
2200 skb_reset_network_header(skb);
2201 skb_reset_transport_header(skb);
2202 skb->mac_len = skb->network_header - skb->mac_header;
2203
2204 rcu_read_lock();
2205 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2206 if (!ptype->dev || ptype->dev == skb->dev)
2207 deliver_skb(skb, ptype, skb->dev);
2208 }
2209 rcu_read_unlock();
2210 }
2211
2212 /**
2213 * netif_receive_skb - process receive buffer from network
2214 * @skb: buffer to process
2215 *
2216 * netif_receive_skb() is the main receive data processing function.
2217 * It always succeeds. The buffer may be dropped during processing
2218 * for congestion control or by the protocol layers.
2219 *
2220 * This function may only be called from softirq context and interrupts
2221 * should be enabled.
2222 *
2223 * Return values (usually ignored):
2224 * NET_RX_SUCCESS: no congestion
2225 * NET_RX_DROP: packet was dropped
2226 */
2227 int netif_receive_skb(struct sk_buff *skb)
2228 {
2229 struct packet_type *ptype, *pt_prev;
2230 struct net_device *orig_dev;
2231 struct net_device *null_or_orig;
2232 int ret = NET_RX_DROP;
2233 __be16 type;
2234
2235 if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
2236 return NET_RX_SUCCESS;
2237
2238 /* if we've gotten here through NAPI, check netpoll */
2239 if (netpoll_receive_skb(skb))
2240 return NET_RX_DROP;
2241
2242 if (!skb->tstamp.tv64)
2243 net_timestamp(skb);
2244
2245 if (!skb->iif)
2246 skb->iif = skb->dev->ifindex;
2247
2248 null_or_orig = NULL;
2249 orig_dev = skb->dev;
2250 if (orig_dev->master) {
2251 if (skb_bond_should_drop(skb))
2252 null_or_orig = orig_dev; /* deliver only exact match */
2253 else
2254 skb->dev = orig_dev->master;
2255 }
2256
2257 __get_cpu_var(netdev_rx_stat).total++;
2258
2259 skb_reset_network_header(skb);
2260 skb_reset_transport_header(skb);
2261 skb->mac_len = skb->network_header - skb->mac_header;
2262
2263 pt_prev = NULL;
2264
2265 rcu_read_lock();
2266
2267 /* Don't receive packets in an exiting network namespace */
2268 if (!net_alive(dev_net(skb->dev))) {
2269 kfree_skb(skb);
2270 goto out;
2271 }
2272
2273 #ifdef CONFIG_NET_CLS_ACT
2274 if (skb->tc_verd & TC_NCLS) {
2275 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2276 goto ncls;
2277 }
2278 #endif
2279
2280 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2281 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2282 ptype->dev == orig_dev) {
2283 if (pt_prev)
2284 ret = deliver_skb(skb, pt_prev, orig_dev);
2285 pt_prev = ptype;
2286 }
2287 }
2288
2289 #ifdef CONFIG_NET_CLS_ACT
2290 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2291 if (!skb)
2292 goto out;
2293 ncls:
2294 #endif
2295
2296 skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2297 if (!skb)
2298 goto out;
2299 skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2300 if (!skb)
2301 goto out;
2302
2303 type = skb->protocol;
2304 list_for_each_entry_rcu(ptype,
2305 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2306 if (ptype->type == type &&
2307 (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2308 ptype->dev == orig_dev)) {
2309 if (pt_prev)
2310 ret = deliver_skb(skb, pt_prev, orig_dev);
2311 pt_prev = ptype;
2312 }
2313 }
2314
2315 if (pt_prev) {
2316 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2317 } else {
2318 kfree_skb(skb);
2319 		/* Jamal, now you will not be able to escape explaining
2320 		 * to me how you were going to use this. :-)
2321 */
2322 ret = NET_RX_DROP;
2323 }
2324
2325 out:
2326 rcu_read_unlock();
2327 return ret;
2328 }
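
/*
 * Example (illustrative sketch, not part of this file): a minimal NAPI
 * poll routine delivering frames with netif_receive_skb().  The
 * "my_dev_*" helpers, the priv layout and the irq unmasking are
 * hypothetical driver details.
 */
#if 0
static int my_dev_poll(struct napi_struct *napi, int budget)
{
	struct my_dev_priv *priv = container_of(napi, struct my_dev_priv, napi);
	struct sk_buff *skb;
	int work = 0;

	while (work < budget && (skb = my_dev_next_rx_skb(priv)) != NULL) {
		skb->protocol = eth_type_trans(skb, priv->netdev);
		netif_receive_skb(skb);		/* deliver in softirq context */
		work++;
	}

	if (work < budget) {
		napi_complete(napi);		/* leave polled mode */
		my_dev_unmask_rx_irq(priv);	/* allow RX interrupts again */
	}
	return work;
}
#endif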
2329
2330 /* Network device is going away, flush any packets still pending */
2331 static void flush_backlog(void *arg)
2332 {
2333 struct net_device *dev = arg;
2334 struct softnet_data *queue = &__get_cpu_var(softnet_data);
2335 struct sk_buff *skb, *tmp;
2336
2337 skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2338 if (skb->dev == dev) {
2339 __skb_unlink(skb, &queue->input_pkt_queue);
2340 kfree_skb(skb);
2341 }
2342 }
2343
2344 static int napi_gro_complete(struct sk_buff *skb)
2345 {
2346 struct packet_type *ptype;
2347 __be16 type = skb->protocol;
2348 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2349 int err = -ENOENT;
2350
2351 if (NAPI_GRO_CB(skb)->count == 1)
2352 goto out;
2353
2354 rcu_read_lock();
2355 list_for_each_entry_rcu(ptype, head, list) {
2356 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2357 continue;
2358
2359 err = ptype->gro_complete(skb);
2360 break;
2361 }
2362 rcu_read_unlock();
2363
2364 if (err) {
2365 WARN_ON(&ptype->list == head);
2366 kfree_skb(skb);
2367 return NET_RX_SUCCESS;
2368 }
2369
2370 out:
2371 skb_shinfo(skb)->gso_size = 0;
2372 __skb_push(skb, -skb_network_offset(skb));
2373 return netif_receive_skb(skb);
2374 }
2375
2376 void napi_gro_flush(struct napi_struct *napi)
2377 {
2378 struct sk_buff *skb, *next;
2379
2380 for (skb = napi->gro_list; skb; skb = next) {
2381 next = skb->next;
2382 skb->next = NULL;
2383 napi_gro_complete(skb);
2384 }
2385
2386 napi->gro_list = NULL;
2387 }
2388 EXPORT_SYMBOL(napi_gro_flush);
2389
2390 static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2391 {
2392 struct sk_buff **pp = NULL;
2393 struct packet_type *ptype;
2394 __be16 type = skb->protocol;
2395 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2396 int count = 0;
2397 int same_flow;
2398 int mac_len;
2399 int free;
2400
2401 if (!(skb->dev->features & NETIF_F_GRO))
2402 goto normal;
2403
2404 rcu_read_lock();
2405 list_for_each_entry_rcu(ptype, head, list) {
2406 struct sk_buff *p;
2407
2408 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2409 continue;
2410
2411 skb_reset_network_header(skb);
2412 mac_len = skb->network_header - skb->mac_header;
2413 skb->mac_len = mac_len;
2414 NAPI_GRO_CB(skb)->same_flow = 0;
2415 NAPI_GRO_CB(skb)->flush = 0;
2416 NAPI_GRO_CB(skb)->free = 0;
2417
2418 for (p = napi->gro_list; p; p = p->next) {
2419 count++;
2420 NAPI_GRO_CB(p)->same_flow =
2421 p->mac_len == mac_len &&
2422 !memcmp(skb_mac_header(p), skb_mac_header(skb),
2423 mac_len);
2424 NAPI_GRO_CB(p)->flush = 0;
2425 }
2426
2427 pp = ptype->gro_receive(&napi->gro_list, skb);
2428 break;
2429 }
2430 rcu_read_unlock();
2431
2432 if (&ptype->list == head)
2433 goto normal;
2434
2435 same_flow = NAPI_GRO_CB(skb)->same_flow;
2436 free = NAPI_GRO_CB(skb)->free;
2437
2438 if (pp) {
2439 struct sk_buff *nskb = *pp;
2440
2441 *pp = nskb->next;
2442 nskb->next = NULL;
2443 napi_gro_complete(nskb);
2444 count--;
2445 }
2446
2447 if (same_flow)
2448 goto ok;
2449
2450 if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) {
2451 __skb_push(skb, -skb_network_offset(skb));
2452 goto normal;
2453 }
2454
2455 NAPI_GRO_CB(skb)->count = 1;
2456 skb_shinfo(skb)->gso_size = skb->len;
2457 skb->next = napi->gro_list;
2458 napi->gro_list = skb;
2459
2460 ok:
2461 return free;
2462
2463 normal:
2464 return -1;
2465 }
2466
2467 int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2468 {
2469 switch (__napi_gro_receive(napi, skb)) {
2470 case -1:
2471 return netif_receive_skb(skb);
2472
2473 case 1:
2474 kfree_skb(skb);
2475 break;
2476 }
2477
2478 return NET_RX_SUCCESS;
2479 }
2480 EXPORT_SYMBOL(napi_gro_receive);
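
/*
 * Example (illustrative sketch, not part of this file): a GRO-capable
 * driver simply substitutes napi_gro_receive() for netif_receive_skb()
 * in its poll routine; nothing else changes, because napi_complete()
 * below flushes whatever is still held on napi->gro_list.  "priv" and
 * "my_dev_next_rx_skb" are hypothetical.
 */
#if 0
	while (work < budget && (skb = my_dev_next_rx_skb(priv)) != NULL) {
		skb->protocol = eth_type_trans(skb, priv->netdev);
		napi_gro_receive(napi, skb);	/* may merge into a held flow */
		work++;
	}
#endif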
2481
2482 int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
2483 {
2484 struct net_device *dev = napi->dev;
2485 struct sk_buff *skb = napi->skb;
2486 int err = NET_RX_DROP;
2487
2488 napi->skb = NULL;
2489
2490 if (!skb) {
2491 skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
2492 if (!skb)
2493 goto out;
2494
2495 skb_reserve(skb, NET_IP_ALIGN);
2496 }
2497
2498 BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
2499 skb_shinfo(skb)->nr_frags = info->nr_frags;
2500 memcpy(skb_shinfo(skb)->frags, info->frags, sizeof(info->frags));
2501
2502 skb->data_len = info->len;
2503 skb->len += info->len;
2504 skb->truesize += info->len;
2505
2506 if (!pskb_may_pull(skb, ETH_HLEN))
2507 goto reuse;
2508
2509 err = NET_RX_SUCCESS;
2510
2511 skb->protocol = eth_type_trans(skb, dev);
2512
2513 skb->ip_summed = info->ip_summed;
2514 skb->csum = info->csum;
2515
2516 switch (__napi_gro_receive(napi, skb)) {
2517 case -1:
2518 return netif_receive_skb(skb);
2519
2520 case 0:
2521 goto out;
2522 }
2523
2524 reuse:
2525 skb_shinfo(skb)->nr_frags = 0;
2526
2527 skb->len -= skb->data_len;
2528 skb->truesize -= skb->data_len;
2529 skb->data_len = 0;
2530
2531 __skb_pull(skb, skb_headlen(skb));
2532 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2533
2534 napi->skb = skb;
2535
2536 out:
2537 return err;
2538 }
2539 EXPORT_SYMBOL(napi_gro_frags);
2540
2541 static int process_backlog(struct napi_struct *napi, int quota)
2542 {
2543 int work = 0;
2544 struct softnet_data *queue = &__get_cpu_var(softnet_data);
2545 unsigned long start_time = jiffies;
2546
2547 napi->weight = weight_p;
2548 do {
2549 struct sk_buff *skb;
2550
2551 local_irq_disable();
2552 skb = __skb_dequeue(&queue->input_pkt_queue);
2553 if (!skb) {
2554 __napi_complete(napi);
2555 local_irq_enable();
2556 break;
2557 }
2558 local_irq_enable();
2559
2560 napi_gro_receive(napi, skb);
2561 } while (++work < quota && jiffies == start_time);
2562
2563 napi_gro_flush(napi);
2564
2565 return work;
2566 }
2567
2568 /**
2569 * __napi_schedule - schedule for receive
2570 * @n: entry to schedule
2571 *
2572 * The entry's receive function will be scheduled to run
2573 */
2574 void __napi_schedule(struct napi_struct *n)
2575 {
2576 unsigned long flags;
2577
2578 local_irq_save(flags);
2579 list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2580 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2581 local_irq_restore(flags);
2582 }
2583 EXPORT_SYMBOL(__napi_schedule);
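
/*
 * Example (illustrative sketch, not part of this file): the interrupt
 * handler of a NAPI driver masks its RX interrupt and then schedules
 * the poll routine.  napi_schedule_prep()/__napi_schedule() split the
 * work so the irq is only masked when we really own the NAPI instance.
 * "my_dev_priv" and "my_dev_mask_rx_irq" are hypothetical.
 */
#if 0
static irqreturn_t my_dev_interrupt(int irq, void *dev_id)
{
	struct my_dev_priv *priv = dev_id;

	if (napi_schedule_prep(&priv->napi)) {
		my_dev_mask_rx_irq(priv);	/* no more RX irqs until poll */
		__napi_schedule(&priv->napi);
	}
	return IRQ_HANDLED;
}
#endif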
2584
2585 void __napi_complete(struct napi_struct *n)
2586 {
2587 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2588 BUG_ON(n->gro_list);
2589
2590 list_del(&n->poll_list);
2591 smp_mb__before_clear_bit();
2592 clear_bit(NAPI_STATE_SCHED, &n->state);
2593 }
2594 EXPORT_SYMBOL(__napi_complete);
2595
2596 void napi_complete(struct napi_struct *n)
2597 {
2598 unsigned long flags;
2599
2600 /*
2601 * don't let napi dequeue from the cpu poll list
2602 	 * just in case it's running on a different cpu
2603 */
2604 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2605 return;
2606
2607 napi_gro_flush(n);
2608 local_irq_save(flags);
2609 __napi_complete(n);
2610 local_irq_restore(flags);
2611 }
2612 EXPORT_SYMBOL(napi_complete);
2613
2614 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2615 int (*poll)(struct napi_struct *, int), int weight)
2616 {
2617 INIT_LIST_HEAD(&napi->poll_list);
2618 napi->gro_list = NULL;
2619 napi->skb = NULL;
2620 napi->poll = poll;
2621 napi->weight = weight;
2622 list_add(&napi->dev_list, &dev->napi_list);
2623 napi->dev = dev;
2624 #ifdef CONFIG_NETPOLL
2625 spin_lock_init(&napi->poll_lock);
2626 napi->poll_owner = -1;
2627 #endif
2628 set_bit(NAPI_STATE_SCHED, &napi->state);
2629 }
2630 EXPORT_SYMBOL(netif_napi_add);
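
/*
 * Example (illustrative sketch, not part of this file): a driver wires
 * up its NAPI context at probe time and enables it when the interface
 * is opened.  The weight of 64 matches weight_p above; "priv" and
 * "my_dev_poll" are hypothetical.
 */
#if 0
	/* at probe time: */
	netif_napi_add(netdev, &priv->napi, my_dev_poll, 64);

	/* in the ndo_open() handler: */
	napi_enable(&priv->napi);

	/* in the ndo_stop() handler: */
	napi_disable(&priv->napi);
#endif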
2631
2632 void netif_napi_del(struct napi_struct *napi)
2633 {
2634 struct sk_buff *skb, *next;
2635
2636 list_del_init(&napi->dev_list);
2637 kfree(napi->skb);
2638
2639 for (skb = napi->gro_list; skb; skb = next) {
2640 next = skb->next;
2641 skb->next = NULL;
2642 kfree_skb(skb);
2643 }
2644
2645 napi->gro_list = NULL;
2646 }
2647 EXPORT_SYMBOL(netif_napi_del);
2648
2649
2650 static void net_rx_action(struct softirq_action *h)
2651 {
2652 struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2653 unsigned long time_limit = jiffies + 2;
2654 int budget = netdev_budget;
2655 void *have;
2656
2657 local_irq_disable();
2658
2659 while (!list_empty(list)) {
2660 struct napi_struct *n;
2661 int work, weight;
2662
2663 		/* If the softirq window is exhausted then punt.
2664 		 * Allow this to run for 2 jiffies, which allows
2665 		 * an average latency of 1.5/HZ.
2666 */
2667 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
2668 goto softnet_break;
2669
2670 local_irq_enable();
2671
2672 /* Even though interrupts have been re-enabled, this
2673 * access is safe because interrupts can only add new
2674 * entries to the tail of this list, and only ->poll()
2675 * calls can remove this head entry from the list.
2676 */
2677 n = list_entry(list->next, struct napi_struct, poll_list);
2678
2679 have = netpoll_poll_lock(n);
2680
2681 weight = n->weight;
2682
2683 /* This NAPI_STATE_SCHED test is for avoiding a race
2684 * with netpoll's poll_napi(). Only the entity which
2685 * obtains the lock and sees NAPI_STATE_SCHED set will
2686 * actually make the ->poll() call. Therefore we avoid
2687 		 * accidentally calling ->poll() when NAPI is not scheduled.
2688 */
2689 work = 0;
2690 if (test_bit(NAPI_STATE_SCHED, &n->state))
2691 work = n->poll(n, weight);
2692
2693 WARN_ON_ONCE(work > weight);
2694
2695 budget -= work;
2696
2697 local_irq_disable();
2698
2699 /* Drivers must not modify the NAPI state if they
2700 * consume the entire weight. In such cases this code
2701 * still "owns" the NAPI instance and therefore can
2702 * move the instance around on the list at-will.
2703 */
2704 if (unlikely(work == weight)) {
2705 if (unlikely(napi_disable_pending(n)))
2706 __napi_complete(n);
2707 else
2708 list_move_tail(&n->poll_list, list);
2709 }
2710
2711 netpoll_poll_unlock(have);
2712 }
2713 out:
2714 local_irq_enable();
2715
2716 #ifdef CONFIG_NET_DMA
2717 /*
2718 * There may not be any more sk_buffs coming right now, so push
2719 * any pending DMA copies to hardware
2720 */
2721 if (!cpus_empty(net_dma.channel_mask)) {
2722 int chan_idx;
2723 for_each_cpu_mask_nr(chan_idx, net_dma.channel_mask) {
2724 struct dma_chan *chan = net_dma.channels[chan_idx];
2725 if (chan)
2726 dma_async_memcpy_issue_pending(chan);
2727 }
2728 }
2729 #endif
2730
2731 return;
2732
2733 softnet_break:
2734 __get_cpu_var(netdev_rx_stat).time_squeeze++;
2735 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2736 goto out;
2737 }
2738
2739 static gifconf_func_t *gifconf_list[NPROTO];
2740
2741 /**
2742 * register_gifconf - register a SIOCGIF handler
2743 * @family: Address family
2744 * @gifconf: Function handler
2745 *
2746 * Register protocol dependent address dumping routines. The handler
2747 * that is passed must not be freed or reused until it has been replaced
2748 * by another handler.
2749 */
2750 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
2751 {
2752 if (family >= NPROTO)
2753 return -EINVAL;
2754 gifconf_list[family] = gifconf;
2755 return 0;
2756 }
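
/*
 * Example (illustrative sketch): the IPv4 stack registers its
 * SIOCGIFCONF handler this way at init time (see devinet.c), with
 * inet_gifconf() dumping the addresses of one interface:
 *
 *	register_gifconf(PF_INET, inet_gifconf);
 */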
2757
2758
2759 /*
2760 * Map an interface index to its name (SIOCGIFNAME)
2761 */
2762
2763 /*
2764 * We need this ioctl for efficient implementation of the
2765 * if_indextoname() function required by the IPv6 API. Without
2766 * it, we would have to search all the interfaces to find a
2767 * match. --pb
2768 */
2769
2770 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2771 {
2772 struct net_device *dev;
2773 struct ifreq ifr;
2774
2775 /*
2776 * Fetch the caller's info block.
2777 */
2778
2779 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2780 return -EFAULT;
2781
2782 read_lock(&dev_base_lock);
2783 dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2784 if (!dev) {
2785 read_unlock(&dev_base_lock);
2786 return -ENODEV;
2787 }
2788
2789 strcpy(ifr.ifr_name, dev->name);
2790 read_unlock(&dev_base_lock);
2791
2792 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2793 return -EFAULT;
2794 return 0;
2795 }
2796
2797 /*
2798 * Perform a SIOCGIFCONF call. This structure will change
2799 * size eventually, and there is nothing I can do about it.
2800 * Thus we will need a 'compatibility mode'.
2801 */
2802
2803 static int dev_ifconf(struct net *net, char __user *arg)
2804 {
2805 struct ifconf ifc;
2806 struct net_device *dev;
2807 char __user *pos;
2808 int len;
2809 int total;
2810 int i;
2811
2812 /*
2813 * Fetch the caller's info block.
2814 */
2815
2816 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2817 return -EFAULT;
2818
2819 pos = ifc.ifc_buf;
2820 len = ifc.ifc_len;
2821
2822 /*
2823 * Loop over the interfaces, and write an info block for each.
2824 */
2825
2826 total = 0;
2827 for_each_netdev(net, dev) {
2828 for (i = 0; i < NPROTO; i++) {
2829 if (gifconf_list[i]) {
2830 int done;
2831 if (!pos)
2832 done = gifconf_list[i](dev, NULL, 0);
2833 else
2834 done = gifconf_list[i](dev, pos + total,
2835 len - total);
2836 if (done < 0)
2837 return -EFAULT;
2838 total += done;
2839 }
2840 }
2841 }
2842
2843 /*
2844 * All done. Write the updated control block back to the caller.
2845 */
2846 ifc.ifc_len = total;
2847
2848 /*
2849 * Both BSD and Solaris return 0 here, so we do too.
2850 */
2851 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2852 }
2853
2854 #ifdef CONFIG_PROC_FS
2855 /*
2856 * This is invoked by the /proc filesystem handler to display a device
2857 * in detail.
2858 */
2859 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2860 __acquires(dev_base_lock)
2861 {
2862 struct net *net = seq_file_net(seq);
2863 loff_t off;
2864 struct net_device *dev;
2865
2866 read_lock(&dev_base_lock);
2867 if (!*pos)
2868 return SEQ_START_TOKEN;
2869
2870 off = 1;
2871 for_each_netdev(net, dev)
2872 if (off++ == *pos)
2873 return dev;
2874
2875 return NULL;
2876 }
2877
2878 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2879 {
2880 struct net *net = seq_file_net(seq);
2881 ++*pos;
2882 return v == SEQ_START_TOKEN ?
2883 first_net_device(net) : next_net_device((struct net_device *)v);
2884 }
2885
2886 void dev_seq_stop(struct seq_file *seq, void *v)
2887 __releases(dev_base_lock)
2888 {
2889 read_unlock(&dev_base_lock);
2890 }
2891
2892 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2893 {
2894 const struct net_device_stats *stats = dev_get_stats(dev);
2895
2896 seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2897 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2898 dev->name, stats->rx_bytes, stats->rx_packets,
2899 stats->rx_errors,
2900 stats->rx_dropped + stats->rx_missed_errors,
2901 stats->rx_fifo_errors,
2902 stats->rx_length_errors + stats->rx_over_errors +
2903 stats->rx_crc_errors + stats->rx_frame_errors,
2904 stats->rx_compressed, stats->multicast,
2905 stats->tx_bytes, stats->tx_packets,
2906 stats->tx_errors, stats->tx_dropped,
2907 stats->tx_fifo_errors, stats->collisions,
2908 stats->tx_carrier_errors +
2909 stats->tx_aborted_errors +
2910 stats->tx_window_errors +
2911 stats->tx_heartbeat_errors,
2912 stats->tx_compressed);
2913 }
2914
2915 /*
2916  *	Called from the PROCfs module. This now uses the new arbitrary-sized
2917  *	/proc/net interface to create /proc/net/dev.
2918 */
2919 static int dev_seq_show(struct seq_file *seq, void *v)
2920 {
2921 if (v == SEQ_START_TOKEN)
2922 seq_puts(seq, "Inter-| Receive "
2923 " | Transmit\n"
2924 " face |bytes packets errs drop fifo frame "
2925 "compressed multicast|bytes packets errs "
2926 "drop fifo colls carrier compressed\n");
2927 else
2928 dev_seq_printf_stats(seq, v);
2929 return 0;
2930 }
2931
2932 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2933 {
2934 struct netif_rx_stats *rc = NULL;
2935
2936 while (*pos < nr_cpu_ids)
2937 if (cpu_online(*pos)) {
2938 rc = &per_cpu(netdev_rx_stat, *pos);
2939 break;
2940 } else
2941 ++*pos;
2942 return rc;
2943 }
2944
2945 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2946 {
2947 return softnet_get_online(pos);
2948 }
2949
2950 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2951 {
2952 ++*pos;
2953 return softnet_get_online(pos);
2954 }
2955
2956 static void softnet_seq_stop(struct seq_file *seq, void *v)
2957 {
2958 }
2959
2960 static int softnet_seq_show(struct seq_file *seq, void *v)
2961 {
2962 struct netif_rx_stats *s = v;
2963
2964 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2965 s->total, s->dropped, s->time_squeeze, 0,
2966 0, 0, 0, 0, /* was fastroute */
2967 		   s->cpu_collision);
2968 return 0;
2969 }
2970
2971 static const struct seq_operations dev_seq_ops = {
2972 .start = dev_seq_start,
2973 .next = dev_seq_next,
2974 .stop = dev_seq_stop,
2975 .show = dev_seq_show,
2976 };
2977
2978 static int dev_seq_open(struct inode *inode, struct file *file)
2979 {
2980 return seq_open_net(inode, file, &dev_seq_ops,
2981 sizeof(struct seq_net_private));
2982 }
2983
2984 static const struct file_operations dev_seq_fops = {
2985 .owner = THIS_MODULE,
2986 .open = dev_seq_open,
2987 .read = seq_read,
2988 .llseek = seq_lseek,
2989 .release = seq_release_net,
2990 };
2991
2992 static const struct seq_operations softnet_seq_ops = {
2993 .start = softnet_seq_start,
2994 .next = softnet_seq_next,
2995 .stop = softnet_seq_stop,
2996 .show = softnet_seq_show,
2997 };
2998
2999 static int softnet_seq_open(struct inode *inode, struct file *file)
3000 {
3001 return seq_open(file, &softnet_seq_ops);
3002 }
3003
3004 static const struct file_operations softnet_seq_fops = {
3005 .owner = THIS_MODULE,
3006 .open = softnet_seq_open,
3007 .read = seq_read,
3008 .llseek = seq_lseek,
3009 .release = seq_release,
3010 };
3011
3012 static void *ptype_get_idx(loff_t pos)
3013 {
3014 struct packet_type *pt = NULL;
3015 loff_t i = 0;
3016 int t;
3017
3018 list_for_each_entry_rcu(pt, &ptype_all, list) {
3019 if (i == pos)
3020 return pt;
3021 ++i;
3022 }
3023
3024 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3025 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3026 if (i == pos)
3027 return pt;
3028 ++i;
3029 }
3030 }
3031 return NULL;
3032 }
3033
3034 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3035 __acquires(RCU)
3036 {
3037 rcu_read_lock();
3038 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3039 }
3040
3041 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3042 {
3043 struct packet_type *pt;
3044 struct list_head *nxt;
3045 int hash;
3046
3047 ++*pos;
3048 if (v == SEQ_START_TOKEN)
3049 return ptype_get_idx(0);
3050
3051 pt = v;
3052 nxt = pt->list.next;
3053 if (pt->type == htons(ETH_P_ALL)) {
3054 if (nxt != &ptype_all)
3055 goto found;
3056 hash = 0;
3057 nxt = ptype_base[0].next;
3058 } else
3059 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3060
3061 while (nxt == &ptype_base[hash]) {
3062 if (++hash >= PTYPE_HASH_SIZE)
3063 return NULL;
3064 nxt = ptype_base[hash].next;
3065 }
3066 found:
3067 return list_entry(nxt, struct packet_type, list);
3068 }
3069
3070 static void ptype_seq_stop(struct seq_file *seq, void *v)
3071 __releases(RCU)
3072 {
3073 rcu_read_unlock();
3074 }
3075
3076 static int ptype_seq_show(struct seq_file *seq, void *v)
3077 {
3078 struct packet_type *pt = v;
3079
3080 if (v == SEQ_START_TOKEN)
3081 seq_puts(seq, "Type Device Function\n");
3082 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3083 if (pt->type == htons(ETH_P_ALL))
3084 seq_puts(seq, "ALL ");
3085 else
3086 seq_printf(seq, "%04x", ntohs(pt->type));
3087
3088 seq_printf(seq, " %-8s %pF\n",
3089 pt->dev ? pt->dev->name : "", pt->func);
3090 }
3091
3092 return 0;
3093 }
3094
3095 static const struct seq_operations ptype_seq_ops = {
3096 .start = ptype_seq_start,
3097 .next = ptype_seq_next,
3098 .stop = ptype_seq_stop,
3099 .show = ptype_seq_show,
3100 };
3101
3102 static int ptype_seq_open(struct inode *inode, struct file *file)
3103 {
3104 return seq_open_net(inode, file, &ptype_seq_ops,
3105 sizeof(struct seq_net_private));
3106 }
3107
3108 static const struct file_operations ptype_seq_fops = {
3109 .owner = THIS_MODULE,
3110 .open = ptype_seq_open,
3111 .read = seq_read,
3112 .llseek = seq_lseek,
3113 .release = seq_release_net,
3114 };
3115
3116
3117 static int __net_init dev_proc_net_init(struct net *net)
3118 {
3119 int rc = -ENOMEM;
3120
3121 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3122 goto out;
3123 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3124 goto out_dev;
3125 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3126 goto out_softnet;
3127
3128 if (wext_proc_init(net))
3129 goto out_ptype;
3130 rc = 0;
3131 out:
3132 return rc;
3133 out_ptype:
3134 proc_net_remove(net, "ptype");
3135 out_softnet:
3136 proc_net_remove(net, "softnet_stat");
3137 out_dev:
3138 proc_net_remove(net, "dev");
3139 goto out;
3140 }
3141
3142 static void __net_exit dev_proc_net_exit(struct net *net)
3143 {
3144 wext_proc_exit(net);
3145
3146 proc_net_remove(net, "ptype");
3147 proc_net_remove(net, "softnet_stat");
3148 proc_net_remove(net, "dev");
3149 }
3150
3151 static struct pernet_operations __net_initdata dev_proc_ops = {
3152 .init = dev_proc_net_init,
3153 .exit = dev_proc_net_exit,
3154 };
3155
3156 static int __init dev_proc_init(void)
3157 {
3158 return register_pernet_subsys(&dev_proc_ops);
3159 }
3160 #else
3161 #define dev_proc_init() 0
3162 #endif /* CONFIG_PROC_FS */
3163
3164
3165 /**
3166 * netdev_set_master - set up master/slave pair
3167 * @slave: slave device
3168 * @master: new master device
3169 *
3170 * Changes the master device of the slave. Pass %NULL to break the
3171 * bonding. The caller must hold the RTNL semaphore. On a failure
3172 * a negative errno code is returned. On success the reference counts
3173 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3174 * function returns zero.
3175 */
3176 int netdev_set_master(struct net_device *slave, struct net_device *master)
3177 {
3178 struct net_device *old = slave->master;
3179
3180 ASSERT_RTNL();
3181
3182 if (master) {
3183 if (old)
3184 return -EBUSY;
3185 dev_hold(master);
3186 }
3187
3188 slave->master = master;
3189
3190 synchronize_net();
3191
3192 if (old)
3193 dev_put(old);
3194
3195 if (master)
3196 slave->flags |= IFF_SLAVE;
3197 else
3198 slave->flags &= ~IFF_SLAVE;
3199
3200 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3201 return 0;
3202 }
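
/*
 * Example (illustrative sketch): enslaving a device the way the
 * bonding driver does, and breaking the pair again later.  Both calls
 * run under the RTNL semaphore; "bond_dev" and "slave_dev" are
 * hypothetical.
 */
#if 0
	err = netdev_set_master(slave_dev, bond_dev);	/* enslave */
	if (err)
		return err;

	netdev_set_master(slave_dev, NULL);		/* break the bond */
#endif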
3203
3204 static void dev_change_rx_flags(struct net_device *dev, int flags)
3205 {
3206 const struct net_device_ops *ops = dev->netdev_ops;
3207
3208 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3209 ops->ndo_change_rx_flags(dev, flags);
3210 }
3211
3212 static int __dev_set_promiscuity(struct net_device *dev, int inc)
3213 {
3214 unsigned short old_flags = dev->flags;
3215 uid_t uid;
3216 gid_t gid;
3217
3218 ASSERT_RTNL();
3219
3220 dev->flags |= IFF_PROMISC;
3221 dev->promiscuity += inc;
3222 if (dev->promiscuity == 0) {
3223 /*
3224 * Avoid overflow.
3225 * If inc causes overflow, untouch promisc and return error.
3226 */
3227 if (inc < 0)
3228 dev->flags &= ~IFF_PROMISC;
3229 else {
3230 dev->promiscuity -= inc;
3231 printk(KERN_WARNING "%s: promiscuity touches roof, "
3232 "set promiscuity failed, promiscuity feature "
3233 "of device might be broken.\n", dev->name);
3234 return -EOVERFLOW;
3235 }
3236 }
3237 if (dev->flags != old_flags) {
3238 printk(KERN_INFO "device %s %s promiscuous mode\n",
3239 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3240 "left");
3241 if (audit_enabled) {
3242 current_uid_gid(&uid, &gid);
3243 audit_log(current->audit_context, GFP_ATOMIC,
3244 AUDIT_ANOM_PROMISCUOUS,
3245 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3246 dev->name, (dev->flags & IFF_PROMISC),
3247 (old_flags & IFF_PROMISC),
3248 audit_get_loginuid(current),
3249 uid, gid,
3250 audit_get_sessionid(current));
3251 }
3252
3253 dev_change_rx_flags(dev, IFF_PROMISC);
3254 }
3255 return 0;
3256 }
3257
3258 /**
3259 * dev_set_promiscuity - update promiscuity count on a device
3260 * @dev: device
3261 * @inc: modifier
3262 *
3263 * Add or remove promiscuity from a device. While the count in the device
3264 * remains above zero the interface remains promiscuous. Once it hits zero
3265 * the device reverts back to normal filtering operation. A negative inc
3266 * value is used to drop promiscuity on the device.
3267 * Return 0 if successful or a negative errno code on error.
3268 */
3269 int dev_set_promiscuity(struct net_device *dev, int inc)
3270 {
3271 unsigned short old_flags = dev->flags;
3272 int err;
3273
3274 err = __dev_set_promiscuity(dev, inc);
3275 if (err < 0)
3276 return err;
3277 if (dev->flags != old_flags)
3278 dev_set_rx_mode(dev);
3279 return err;
3280 }
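
/*
 * Example (illustrative sketch): a packet-capture style user takes one
 * promiscuous reference while capturing and drops it when done; the
 * count keeps nested users from fighting over IFF_PROMISC.  Both calls
 * must run under the RTNL semaphore (see ASSERT_RTNL above).
 */
#if 0
	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);	/* enter promiscuous mode */
	rtnl_unlock();

	/* capture traffic here */

	rtnl_lock();
	dev_set_promiscuity(dev, -1);		/* drop our reference */
	rtnl_unlock();
#endif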
3281
3282 /**
3283 * dev_set_allmulti - update allmulti count on a device
3284 * @dev: device
3285 * @inc: modifier
3286 *
3287  *	Add or remove reception of all multicast frames on a device. While the
3288  *	count in the device remains above zero the interface keeps listening
3289  *	to all multicast frames. Once it hits zero the device reverts back to normal
3290 * filtering operation. A negative @inc value is used to drop the counter
3291 * when releasing a resource needing all multicasts.
3292 * Return 0 if successful or a negative errno code on error.
3293 */
3294
3295 int dev_set_allmulti(struct net_device *dev, int inc)
3296 {
3297 unsigned short old_flags = dev->flags;
3298
3299 ASSERT_RTNL();
3300
3301 dev->flags |= IFF_ALLMULTI;
3302 dev->allmulti += inc;
3303 if (dev->allmulti == 0) {
3304 /*
3305 * Avoid overflow.
3306 * If inc causes overflow, untouch allmulti and return error.
3307 */
3308 if (inc < 0)
3309 dev->flags &= ~IFF_ALLMULTI;
3310 else {
3311 dev->allmulti -= inc;
3312 printk(KERN_WARNING "%s: allmulti touches roof, "
3313 "set allmulti failed, allmulti feature of "
3314 "device might be broken.\n", dev->name);
3315 return -EOVERFLOW;
3316 }
3317 }
3318 if (dev->flags ^ old_flags) {
3319 dev_change_rx_flags(dev, IFF_ALLMULTI);
3320 dev_set_rx_mode(dev);
3321 }
3322 return 0;
3323 }
3324
3325 /*
3326 * Upload unicast and multicast address lists to device and
3327 * configure RX filtering. When the device doesn't support unicast
3328 * filtering it is put in promiscuous mode while unicast addresses
3329 * are present.
3330 */
3331 void __dev_set_rx_mode(struct net_device *dev)
3332 {
3333 const struct net_device_ops *ops = dev->netdev_ops;
3334
3335 /* dev_open will call this function so the list will stay sane. */
3336 if (!(dev->flags&IFF_UP))
3337 return;
3338
3339 if (!netif_device_present(dev))
3340 return;
3341
3342 if (ops->ndo_set_rx_mode)
3343 ops->ndo_set_rx_mode(dev);
3344 else {
3345 		/* Unicast address changes may only happen under the rtnl,
3346 * therefore calling __dev_set_promiscuity here is safe.
3347 */
3348 if (dev->uc_count > 0 && !dev->uc_promisc) {
3349 __dev_set_promiscuity(dev, 1);
3350 dev->uc_promisc = 1;
3351 } else if (dev->uc_count == 0 && dev->uc_promisc) {
3352 __dev_set_promiscuity(dev, -1);
3353 dev->uc_promisc = 0;
3354 }
3355
3356 if (ops->ndo_set_multicast_list)
3357 ops->ndo_set_multicast_list(dev);
3358 }
3359 }
3360
3361 void dev_set_rx_mode(struct net_device *dev)
3362 {
3363 netif_addr_lock_bh(dev);
3364 __dev_set_rx_mode(dev);
3365 netif_addr_unlock_bh(dev);
3366 }
3367
3368 int __dev_addr_delete(struct dev_addr_list **list, int *count,
3369 void *addr, int alen, int glbl)
3370 {
3371 struct dev_addr_list *da;
3372
3373 for (; (da = *list) != NULL; list = &da->next) {
3374 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3375 alen == da->da_addrlen) {
3376 if (glbl) {
3377 int old_glbl = da->da_gusers;
3378 da->da_gusers = 0;
3379 if (old_glbl == 0)
3380 break;
3381 }
3382 if (--da->da_users)
3383 return 0;
3384
3385 *list = da->next;
3386 kfree(da);
3387 (*count)--;
3388 return 0;
3389 }
3390 }
3391 return -ENOENT;
3392 }
3393
3394 int __dev_addr_add(struct dev_addr_list **list, int *count,
3395 void *addr, int alen, int glbl)
3396 {
3397 struct dev_addr_list *da;
3398
3399 for (da = *list; da != NULL; da = da->next) {
3400 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3401 da->da_addrlen == alen) {
3402 if (glbl) {
3403 int old_glbl = da->da_gusers;
3404 da->da_gusers = 1;
3405 if (old_glbl)
3406 return 0;
3407 }
3408 da->da_users++;
3409 return 0;
3410 }
3411 }
3412
3413 da = kzalloc(sizeof(*da), GFP_ATOMIC);
3414 if (da == NULL)
3415 return -ENOMEM;
3416 memcpy(da->da_addr, addr, alen);
3417 da->da_addrlen = alen;
3418 da->da_users = 1;
3419 da->da_gusers = glbl ? 1 : 0;
3420 da->next = *list;
3421 *list = da;
3422 (*count)++;
3423 return 0;
3424 }
3425
3426 /**
3427 * dev_unicast_delete - Release secondary unicast address.
3428 * @dev: device
3429 * @addr: address to delete
3430 * @alen: length of @addr
3431 *
3432 * Release reference to a secondary unicast address and remove it
3433 * from the device if the reference count drops to zero.
3434 *
3435 * The caller must hold the rtnl_mutex.
3436 */
3437 int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
3438 {
3439 int err;
3440
3441 ASSERT_RTNL();
3442
3443 netif_addr_lock_bh(dev);
3444 err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3445 if (!err)
3446 __dev_set_rx_mode(dev);
3447 netif_addr_unlock_bh(dev);
3448 return err;
3449 }
3450 EXPORT_SYMBOL(dev_unicast_delete);
3451
3452 /**
3453 * dev_unicast_add - add a secondary unicast address
3454 * @dev: device
3455 * @addr: address to add
3456 * @alen: length of @addr
3457 *
3458 * Add a secondary unicast address to the device or increase
3459 * the reference count if it already exists.
3460 *
3461 * The caller must hold the rtnl_mutex.
3462 */
3463 int dev_unicast_add(struct net_device *dev, void *addr, int alen)
3464 {
3465 int err;
3466
3467 ASSERT_RTNL();
3468
3469 netif_addr_lock_bh(dev);
3470 err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3471 if (!err)
3472 __dev_set_rx_mode(dev);
3473 netif_addr_unlock_bh(dev);
3474 return err;
3475 }
3476 EXPORT_SYMBOL(dev_unicast_add);
3477
3478 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3479 struct dev_addr_list **from, int *from_count)
3480 {
3481 struct dev_addr_list *da, *next;
3482 int err = 0;
3483
3484 da = *from;
3485 while (da != NULL) {
3486 next = da->next;
3487 if (!da->da_synced) {
3488 err = __dev_addr_add(to, to_count,
3489 da->da_addr, da->da_addrlen, 0);
3490 if (err < 0)
3491 break;
3492 da->da_synced = 1;
3493 da->da_users++;
3494 } else if (da->da_users == 1) {
3495 __dev_addr_delete(to, to_count,
3496 da->da_addr, da->da_addrlen, 0);
3497 __dev_addr_delete(from, from_count,
3498 da->da_addr, da->da_addrlen, 0);
3499 }
3500 da = next;
3501 }
3502 return err;
3503 }
3504
3505 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3506 struct dev_addr_list **from, int *from_count)
3507 {
3508 struct dev_addr_list *da, *next;
3509
3510 da = *from;
3511 while (da != NULL) {
3512 next = da->next;
3513 if (da->da_synced) {
3514 __dev_addr_delete(to, to_count,
3515 da->da_addr, da->da_addrlen, 0);
3516 da->da_synced = 0;
3517 __dev_addr_delete(from, from_count,
3518 da->da_addr, da->da_addrlen, 0);
3519 }
3520 da = next;
3521 }
3522 }
3523
3524 /**
3525 * dev_unicast_sync - Synchronize device's unicast list to another device
3526 * @to: destination device
3527 * @from: source device
3528 *
3529 * Add newly added addresses to the destination device and release
3530 * addresses that have no users left. The source device must be
3531  *	locked by netif_addr_lock_bh.
3532 *
3533 * This function is intended to be called from the dev->set_rx_mode
3534 * function of layered software devices.
3535 */
3536 int dev_unicast_sync(struct net_device *to, struct net_device *from)
3537 {
3538 int err = 0;
3539
3540 netif_addr_lock_bh(to);
3541 err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3542 &from->uc_list, &from->uc_count);
3543 if (!err)
3544 __dev_set_rx_mode(to);
3545 netif_addr_unlock_bh(to);
3546 return err;
3547 }
3548 EXPORT_SYMBOL(dev_unicast_sync);
3549
3550 /**
3551 * dev_unicast_unsync - Remove synchronized addresses from the destination device
3552 * @to: destination device
3553 * @from: source device
3554 *
3555 * Remove all addresses that were added to the destination device by
3556 * dev_unicast_sync(). This function is intended to be called from the
3557 * dev->stop function of layered software devices.
3558 */
3559 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
3560 {
3561 netif_addr_lock_bh(from);
3562 netif_addr_lock(to);
3563
3564 __dev_addr_unsync(&to->uc_list, &to->uc_count,
3565 &from->uc_list, &from->uc_count);
3566 __dev_set_rx_mode(to);
3567
3568 netif_addr_unlock(to);
3569 netif_addr_unlock_bh(from);
3570 }
3571 EXPORT_SYMBOL(dev_unicast_unsync);
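
/*
 * Example (illustrative sketch): a layered device such as a VLAN or
 * macvlan propagates its own unicast list down to the real device from
 * its rx_mode hook and removes those entries again on stop.  The
 * "my_*" names are hypothetical; the first argument is the underlying
 * (destination) device.
 */
#if 0
static void my_vlan_set_rx_mode(struct net_device *dev)
{
	dev_unicast_sync(my_vlan_real_dev(dev), dev);
}

static int my_vlan_stop(struct net_device *dev)
{
	dev_unicast_unsync(my_vlan_real_dev(dev), dev);
	return 0;
}
#endif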
3572
3573 static void __dev_addr_discard(struct dev_addr_list **list)
3574 {
3575 struct dev_addr_list *tmp;
3576
3577 while (*list != NULL) {
3578 tmp = *list;
3579 *list = tmp->next;
3580 if (tmp->da_users > tmp->da_gusers)
3581 printk("__dev_addr_discard: address leakage! "
3582 "da_users=%d\n", tmp->da_users);
3583 kfree(tmp);
3584 }
3585 }
3586
3587 static void dev_addr_discard(struct net_device *dev)
3588 {
3589 netif_addr_lock_bh(dev);
3590
3591 __dev_addr_discard(&dev->uc_list);
3592 dev->uc_count = 0;
3593
3594 __dev_addr_discard(&dev->mc_list);
3595 dev->mc_count = 0;
3596
3597 netif_addr_unlock_bh(dev);
3598 }
3599
3600 /**
3601 * dev_get_flags - get flags reported to userspace
3602 * @dev: device
3603 *
3604 * Get the combination of flag bits exported through APIs to userspace.
3605 */
3606 unsigned dev_get_flags(const struct net_device *dev)
3607 {
3608 unsigned flags;
3609
3610 flags = (dev->flags & ~(IFF_PROMISC |
3611 IFF_ALLMULTI |
3612 IFF_RUNNING |
3613 IFF_LOWER_UP |
3614 IFF_DORMANT)) |
3615 (dev->gflags & (IFF_PROMISC |
3616 IFF_ALLMULTI));
3617
3618 if (netif_running(dev)) {
3619 if (netif_oper_up(dev))
3620 flags |= IFF_RUNNING;
3621 if (netif_carrier_ok(dev))
3622 flags |= IFF_LOWER_UP;
3623 if (netif_dormant(dev))
3624 flags |= IFF_DORMANT;
3625 }
3626
3627 return flags;
3628 }
3629
3630 /**
3631 * dev_change_flags - change device settings
3632 * @dev: device
3633 * @flags: device state flags
3634 *
3635 * Change settings on device based state flags. The flags are
3636 * in the userspace exported format.
3637 */
3638 int dev_change_flags(struct net_device *dev, unsigned flags)
3639 {
3640 int ret, changes;
3641 int old_flags = dev->flags;
3642
3643 ASSERT_RTNL();
3644
3645 /*
3646 * Set the flags on our device.
3647 */
3648
3649 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
3650 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
3651 IFF_AUTOMEDIA)) |
3652 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
3653 IFF_ALLMULTI));
3654
3655 /*
3656 * Load in the correct multicast list now the flags have changed.
3657 */
3658
3659 if ((old_flags ^ flags) & IFF_MULTICAST)
3660 dev_change_rx_flags(dev, IFF_MULTICAST);
3661
3662 dev_set_rx_mode(dev);
3663
3664 /*
3665 	 *	Have we downed the interface? We handle IFF_UP ourselves
3666 * according to user attempts to set it, rather than blindly
3667 * setting it.
3668 */
3669
3670 ret = 0;
3671 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
3672 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3673
3674 if (!ret)
3675 dev_set_rx_mode(dev);
3676 }
3677
3678 if (dev->flags & IFF_UP &&
3679 ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3680 IFF_VOLATILE)))
3681 call_netdevice_notifiers(NETDEV_CHANGE, dev);
3682
3683 if ((flags ^ dev->gflags) & IFF_PROMISC) {
3684 int inc = (flags & IFF_PROMISC) ? +1 : -1;
3685 dev->gflags ^= IFF_PROMISC;
3686 dev_set_promiscuity(dev, inc);
3687 }
3688
3689 	/* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
3690 	   is important. Some (broken) drivers set IFF_PROMISC when
3691 	   IFF_ALLMULTI is requested, without asking us and without reporting.
3692 	 */
3693 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3694 int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3695 dev->gflags ^= IFF_ALLMULTI;
3696 dev_set_allmulti(dev, inc);
3697 }
3698
3699 /* Exclude state transition flags, already notified */
3700 changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3701 if (changes)
3702 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
3703
3704 return ret;
3705 }
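
/*
 * Example (illustrative sketch): bringing an interface administratively
 * up from kernel code, the in-kernel equivalent of "ifconfig eth0 up".
 * The RTNL semaphore must be held, as asserted above.
 */
#if 0
	rtnl_lock();
	err = dev_change_flags(dev, dev->flags | IFF_UP);
	rtnl_unlock();
#endif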
3706
3707 /**
3708 * dev_set_mtu - Change maximum transfer unit
3709 * @dev: device
3710 * @new_mtu: new transfer unit
3711 *
3712 * Change the maximum transfer size of the network device.
3713 */
3714 int dev_set_mtu(struct net_device *dev, int new_mtu)
3715 {
3716 const struct net_device_ops *ops = dev->netdev_ops;
3717 int err;
3718
3719 if (new_mtu == dev->mtu)
3720 return 0;
3721
3722 	/* MTU must not be negative. */
3723 if (new_mtu < 0)
3724 return -EINVAL;
3725
3726 if (!netif_device_present(dev))
3727 return -ENODEV;
3728
3729 err = 0;
3730 if (ops->ndo_change_mtu)
3731 err = ops->ndo_change_mtu(dev, new_mtu);
3732 else
3733 dev->mtu = new_mtu;
3734
3735 if (!err && dev->flags & IFF_UP)
3736 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3737 return err;
3738 }
3739
3740 /**
3741 * dev_set_mac_address - Change Media Access Control Address
3742 * @dev: device
3743 * @sa: new address
3744 *
3745 * Change the hardware (MAC) address of the device
3746 */
3747 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3748 {
3749 const struct net_device_ops *ops = dev->netdev_ops;
3750 int err;
3751
3752 if (!ops->ndo_set_mac_address)
3753 return -EOPNOTSUPP;
3754 if (sa->sa_family != dev->type)
3755 return -EINVAL;
3756 if (!netif_device_present(dev))
3757 return -ENODEV;
3758 err = ops->ndo_set_mac_address(dev, sa);
3759 if (!err)
3760 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3761 return err;
3762 }
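
/*
 * Example (illustrative sketch): changing a MAC address from kernel
 * code.  The sockaddr family must match dev->type (ARPHRD_ETHER for
 * Ethernet); "new_mac" is a hypothetical u8[ETH_ALEN] buffer and the
 * caller holds the RTNL semaphore.
 */
#if 0
	struct sockaddr sa;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, new_mac, dev->addr_len);
	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
#endif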
3763
3764 /*
3765 * Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3766 */
3767 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3768 {
3769 int err;
3770 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3771
3772 if (!dev)
3773 return -ENODEV;
3774
3775 switch (cmd) {
3776 case SIOCGIFFLAGS: /* Get interface flags */
3777 ifr->ifr_flags = dev_get_flags(dev);
3778 return 0;
3779
3780 case SIOCGIFMETRIC: /* Get the metric on the interface
3781 (currently unused) */
3782 ifr->ifr_metric = 0;
3783 return 0;
3784
3785 case SIOCGIFMTU: /* Get the MTU of a device */
3786 ifr->ifr_mtu = dev->mtu;
3787 return 0;
3788
3789 case SIOCGIFHWADDR:
3790 if (!dev->addr_len)
3791 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3792 else
3793 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3794 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3795 ifr->ifr_hwaddr.sa_family = dev->type;
3796 return 0;
3797
3798 case SIOCGIFSLAVE:
3799 err = -EINVAL;
3800 break;
3801
3802 case SIOCGIFMAP:
3803 ifr->ifr_map.mem_start = dev->mem_start;
3804 ifr->ifr_map.mem_end = dev->mem_end;
3805 ifr->ifr_map.base_addr = dev->base_addr;
3806 ifr->ifr_map.irq = dev->irq;
3807 ifr->ifr_map.dma = dev->dma;
3808 ifr->ifr_map.port = dev->if_port;
3809 return 0;
3810
3811 case SIOCGIFINDEX:
3812 ifr->ifr_ifindex = dev->ifindex;
3813 return 0;
3814
3815 case SIOCGIFTXQLEN:
3816 ifr->ifr_qlen = dev->tx_queue_len;
3817 return 0;
3818
3819 default:
3820 /* dev_ioctl() should ensure this case
3821 * is never reached
3822 */
3823 WARN_ON(1);
3824 err = -EINVAL;
3825 break;
3826
3827 }
3828 return err;
3829 }
3830
3831 /*
3832 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
3833 */
3834 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3835 {
3836 int err;
3837 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3838 const struct net_device_ops *ops;
3839
3840 if (!dev)
3841 return -ENODEV;
3842
3843 ops = dev->netdev_ops;
3844
3845 switch (cmd) {
3846 case SIOCSIFFLAGS: /* Set interface flags */
3847 return dev_change_flags(dev, ifr->ifr_flags);
3848
3849 case SIOCSIFMETRIC: /* Set the metric on the interface
3850 (currently unused) */
3851 return -EOPNOTSUPP;
3852
3853 case SIOCSIFMTU: /* Set the MTU of a device */
3854 return dev_set_mtu(dev, ifr->ifr_mtu);
3855
3856 case SIOCSIFHWADDR:
3857 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3858
3859 case SIOCSIFHWBROADCAST:
3860 if (ifr->ifr_hwaddr.sa_family != dev->type)
3861 return -EINVAL;
3862 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3863 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3864 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3865 return 0;
3866
3867 case SIOCSIFMAP:
3868 if (ops->ndo_set_config) {
3869 if (!netif_device_present(dev))
3870 return -ENODEV;
3871 return ops->ndo_set_config(dev, &ifr->ifr_map);
3872 }
3873 return -EOPNOTSUPP;
3874
3875 case SIOCADDMULTI:
3876 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3877 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3878 return -EINVAL;
3879 if (!netif_device_present(dev))
3880 return -ENODEV;
3881 return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3882 dev->addr_len, 1);
3883
3884 case SIOCDELMULTI:
3885 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3886 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3887 return -EINVAL;
3888 if (!netif_device_present(dev))
3889 return -ENODEV;
3890 return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3891 dev->addr_len, 1);
3892
3893 case SIOCSIFTXQLEN:
3894 if (ifr->ifr_qlen < 0)
3895 return -EINVAL;
3896 dev->tx_queue_len = ifr->ifr_qlen;
3897 return 0;
3898
3899 case SIOCSIFNAME:
3900 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3901 return dev_change_name(dev, ifr->ifr_newname);
3902
3903 /*
3904 * Unknown or private ioctl
3905 */
3906
3907 default:
3908 if ((cmd >= SIOCDEVPRIVATE &&
3909 cmd <= SIOCDEVPRIVATE + 15) ||
3910 cmd == SIOCBONDENSLAVE ||
3911 cmd == SIOCBONDRELEASE ||
3912 cmd == SIOCBONDSETHWADDR ||
3913 cmd == SIOCBONDSLAVEINFOQUERY ||
3914 cmd == SIOCBONDINFOQUERY ||
3915 cmd == SIOCBONDCHANGEACTIVE ||
3916 cmd == SIOCGMIIPHY ||
3917 cmd == SIOCGMIIREG ||
3918 cmd == SIOCSMIIREG ||
3919 cmd == SIOCBRADDIF ||
3920 cmd == SIOCBRDELIF ||
3921 cmd == SIOCWANDEV) {
3922 err = -EOPNOTSUPP;
3923 if (ops->ndo_do_ioctl) {
3924 if (netif_device_present(dev))
3925 err = ops->ndo_do_ioctl(dev, ifr, cmd);
3926 else
3927 err = -ENODEV;
3928 }
3929 } else
3930 err = -EINVAL;
3931
3932 }
3933 return err;
3934 }
3935
3936 /*
3937 * This function handles all "interface"-type I/O control requests. The actual
3938 * 'doing' part of this is dev_ifsioc above.
3939 */
3940
3941 /**
3942 * dev_ioctl - network device ioctl
3943 * @net: the applicable net namespace
3944 * @cmd: command to issue
3945 * @arg: pointer to a struct ifreq in user space
3946 *
3947 * Issue ioctl functions to devices. This is normally called by the
3948 * user space syscall interfaces but can sometimes be useful for
3949 * other purposes. The return value is the return from the syscall if
3950 * positive or a negative errno code on error.
3951 */
3952
3953 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3954 {
3955 struct ifreq ifr;
3956 int ret;
3957 char *colon;
3958
3959 	/* One special case: SIOCGIFCONF takes an ifconf argument
3960 	   and requires a shared lock, because it sleeps while writing
3961 	   to user space.
3962 	 */
3963
3964 if (cmd == SIOCGIFCONF) {
3965 rtnl_lock();
3966 ret = dev_ifconf(net, (char __user *) arg);
3967 rtnl_unlock();
3968 return ret;
3969 }
3970 if (cmd == SIOCGIFNAME)
3971 return dev_ifname(net, (struct ifreq __user *)arg);
3972
3973 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3974 return -EFAULT;
3975
3976 ifr.ifr_name[IFNAMSIZ-1] = 0;
3977
3978 colon = strchr(ifr.ifr_name, ':');
3979 if (colon)
3980 *colon = 0;
3981
3982 /*
3983 * See which interface the caller is talking about.
3984 */
3985
3986 switch (cmd) {
3987 /*
3988 * These ioctl calls:
3989 * - can be done by all.
3990 * - atomic and do not require locking.
3991 * - return a value
3992 */
3993 case SIOCGIFFLAGS:
3994 case SIOCGIFMETRIC:
3995 case SIOCGIFMTU:
3996 case SIOCGIFHWADDR:
3997 case SIOCGIFSLAVE:
3998 case SIOCGIFMAP:
3999 case SIOCGIFINDEX:
4000 case SIOCGIFTXQLEN:
4001 dev_load(net, ifr.ifr_name);
4002 read_lock(&dev_base_lock);
4003 ret = dev_ifsioc_locked(net, &ifr, cmd);
4004 read_unlock(&dev_base_lock);
4005 if (!ret) {
4006 if (colon)
4007 *colon = ':';
4008 if (copy_to_user(arg, &ifr,
4009 sizeof(struct ifreq)))
4010 ret = -EFAULT;
4011 }
4012 return ret;
4013
4014 case SIOCETHTOOL:
4015 dev_load(net, ifr.ifr_name);
4016 rtnl_lock();
4017 ret = dev_ethtool(net, &ifr);
4018 rtnl_unlock();
4019 if (!ret) {
4020 if (colon)
4021 *colon = ':';
4022 if (copy_to_user(arg, &ifr,
4023 sizeof(struct ifreq)))
4024 ret = -EFAULT;
4025 }
4026 return ret;
4027
4028 /*
4029 * These ioctl calls:
4030 * - require superuser power.
4031 * - require strict serialization.
4032 * - return a value
4033 */
4034 case SIOCGMIIPHY:
4035 case SIOCGMIIREG:
4036 case SIOCSIFNAME:
4037 if (!capable(CAP_NET_ADMIN))
4038 return -EPERM;
4039 dev_load(net, ifr.ifr_name);
4040 rtnl_lock();
4041 ret = dev_ifsioc(net, &ifr, cmd);
4042 rtnl_unlock();
4043 if (!ret) {
4044 if (colon)
4045 *colon = ':';
4046 if (copy_to_user(arg, &ifr,
4047 sizeof(struct ifreq)))
4048 ret = -EFAULT;
4049 }
4050 return ret;
4051
4052 /*
4053 * These ioctl calls:
4054 * - require superuser power.
4055 * - require strict serialization.
4056 * - do not return a value
4057 */
4058 case SIOCSIFFLAGS:
4059 case SIOCSIFMETRIC:
4060 case SIOCSIFMTU:
4061 case SIOCSIFMAP:
4062 case SIOCSIFHWADDR:
4063 case SIOCSIFSLAVE:
4064 case SIOCADDMULTI:
4065 case SIOCDELMULTI:
4066 case SIOCSIFHWBROADCAST:
4067 case SIOCSIFTXQLEN:
4068 case SIOCSMIIREG:
4069 case SIOCBONDENSLAVE:
4070 case SIOCBONDRELEASE:
4071 case SIOCBONDSETHWADDR:
4072 case SIOCBONDCHANGEACTIVE:
4073 case SIOCBRADDIF:
4074 case SIOCBRDELIF:
4075 if (!capable(CAP_NET_ADMIN))
4076 return -EPERM;
4077 /* fall through */
4078 case SIOCBONDSLAVEINFOQUERY:
4079 case SIOCBONDINFOQUERY:
4080 dev_load(net, ifr.ifr_name);
4081 rtnl_lock();
4082 ret = dev_ifsioc(net, &ifr, cmd);
4083 rtnl_unlock();
4084 return ret;
4085
4086 case SIOCGIFMEM:
4087 			/* Get the per-device memory space. We can add this but
4088 			 * currently do not support it */
4089 		case SIOCSIFMEM:
4090 			/* Set the per-device memory buffer space.
4091 			 * Not applicable in our case */
4092 case SIOCSIFLINK:
4093 return -EINVAL;
4094
4095 /*
4096 * Unknown or private ioctl.
4097 */
4098 default:
4099 if (cmd == SIOCWANDEV ||
4100 (cmd >= SIOCDEVPRIVATE &&
4101 cmd <= SIOCDEVPRIVATE + 15)) {
4102 dev_load(net, ifr.ifr_name);
4103 rtnl_lock();
4104 ret = dev_ifsioc(net, &ifr, cmd);
4105 rtnl_unlock();
4106 if (!ret && copy_to_user(arg, &ifr,
4107 sizeof(struct ifreq)))
4108 ret = -EFAULT;
4109 return ret;
4110 }
4111 /* Take care of Wireless Extensions */
4112 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4113 return wext_handle_ioctl(net, &ifr, cmd, arg);
4114 return -EINVAL;
4115 }
4116 }
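
/*
 * Example (illustrative sketch): the usual caller of this path is the
 * socket ioctl from user space.  Reading an MTU, for instance, under
 * the assumption that an "eth0" interface exists:
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ);
 *	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *		printf("mtu %d\n", ifr.ifr_mtu);
 */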
4117
4118
4119 /**
4120 * dev_new_index - allocate an ifindex
4121 * @net: the applicable net namespace
4122 *
4123 * Returns a suitable unique value for a new device interface
4124 * number. The caller must hold the rtnl semaphore or the
4125 * dev_base_lock to be sure it remains unique.
4126 */
4127 static int dev_new_index(struct net *net)
4128 {
4129 static int ifindex;
4130 for (;;) {
4131 if (++ifindex <= 0)
4132 ifindex = 1;
4133 if (!__dev_get_by_index(net, ifindex))
4134 return ifindex;
4135 }
4136 }
4137
4138 /* Delayed registration/unregistration */
4139 static LIST_HEAD(net_todo_list);
4140
4141 static void net_set_todo(struct net_device *dev)
4142 {
4143 list_add_tail(&dev->todo_list, &net_todo_list);
4144 }
4145
4146 static void rollback_registered(struct net_device *dev)
4147 {
4148 BUG_ON(dev_boot_phase);
4149 ASSERT_RTNL();
4150
4151 /* Some devices call this without having registered, to unwind a failed initialization. */
4152 if (dev->reg_state == NETREG_UNINITIALIZED) {
4153 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
4154 "was registered\n", dev->name, dev);
4155
4156 WARN_ON(1);
4157 return;
4158 }
4159
4160 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4161
4162 /* If device is running, close it first. */
4163 dev_close(dev);
4164
4165 /* And unlink it from device chain. */
4166 unlist_netdevice(dev);
4167
4168 dev->reg_state = NETREG_UNREGISTERING;
4169
4170 synchronize_net();
4171
4172 /* Shutdown queueing discipline. */
4173 dev_shutdown(dev);
4174
4175
4176 /* Notify protocols that we are about to destroy
4177 this device. They should clean up all of their state.
4178 */
4179 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4180
4181 /*
4182 * Flush the unicast and multicast chains
4183 */
4184 dev_addr_discard(dev);
4185
4186 if (dev->netdev_ops->ndo_uninit)
4187 dev->netdev_ops->ndo_uninit(dev);
4188
4189 /* Notifier chain MUST detach us from master device. */
4190 WARN_ON(dev->master);
4191
4192 /* Remove entries from kobject tree */
4193 netdev_unregister_kobject(dev);
4194
4195 synchronize_net();
4196
4197 dev_put(dev);
4198 }
4199
4200 static void __netdev_init_queue_locks_one(struct net_device *dev,
4201 struct netdev_queue *dev_queue,
4202 void *_unused)
4203 {
4204 spin_lock_init(&dev_queue->_xmit_lock);
4205 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4206 dev_queue->xmit_lock_owner = -1;
4207 }
4208
4209 static void netdev_init_queue_locks(struct net_device *dev)
4210 {
4211 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4212 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4213 }
4214
4215 unsigned long netdev_fix_features(unsigned long features, const char *name)
4216 {
4217 /* Fix illegal SG+CSUM combinations. */
4218 if ((features & NETIF_F_SG) &&
4219 !(features & NETIF_F_ALL_CSUM)) {
4220 if (name)
4221 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4222 "checksum feature.\n", name);
4223 features &= ~NETIF_F_SG;
4224 }
4225
4226 /* TSO requires that SG is present as well. */
4227 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4228 if (name)
4229 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4230 "SG feature.\n", name);
4231 features &= ~NETIF_F_TSO;
4232 }
4233
4234 if (features & NETIF_F_UFO) {
4235 if (!(features & NETIF_F_GEN_CSUM)) {
4236 if (name)
4237 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4238 "since no NETIF_F_HW_CSUM feature.\n",
4239 name);
4240 features &= ~NETIF_F_UFO;
4241 }
4242
4243 if (!(features & NETIF_F_SG)) {
4244 if (name)
4245 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4246 "since no NETIF_F_SG feature.\n", name);
4247 features &= ~NETIF_F_UFO;
4248 }
4249 }
4250
4251 return features;
4252 }
4253 EXPORT_SYMBOL(netdev_fix_features);
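/*
 * Example (hypothetical driver code, not part of this file): a driver
 * may sanitize the features it advertises before registration, so that
 * illegal combinations such as TSO-without-SG never reach the stack;
 * register_netdevice() applies the same fixup again as a safety net:
 *
 *	dev->features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
 *	dev->features = netdev_fix_features(dev->features, dev->name);
 */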
4254
4255 /**
4256 * register_netdevice - register a network device
4257 * @dev: device to register
4258 *
4259 * Take a completed network device structure and add it to the kernel
4260 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4261 * chain. 0 is returned on success. A negative errno code is returned
4262 * on a failure to set up the device, or if the name is a duplicate.
4263 *
4264 * Callers must hold the rtnl semaphore. You may want
4265 * register_netdev() instead of this.
4266 *
4267 * BUGS:
4268 * The locking appears insufficient to guarantee two parallel registers
4269 * will not get the same name.
4270 */
4271
4272 int register_netdevice(struct net_device *dev)
4273 {
4274 struct hlist_head *head;
4275 struct hlist_node *p;
4276 int ret;
4277 struct net *net = dev_net(dev);
4278
4279 BUG_ON(dev_boot_phase);
4280 ASSERT_RTNL();
4281
4282 might_sleep();
4283
4284 /* When net_devices are persistent, this will be fatal. */
4285 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4286 BUG_ON(!net);
4287
4288 spin_lock_init(&dev->addr_list_lock);
4289 netdev_set_addr_lockdep_class(dev);
4290 netdev_init_queue_locks(dev);
4291
4292 dev->iflink = -1;
4293
4294 #ifdef CONFIG_COMPAT_NET_DEV_OPS
4295 /* Netdevice_ops API compatibility support.
4296 * This is temporary until all network devices are converted.
4297 */
4298 if (dev->netdev_ops) {
4299 const struct net_device_ops *ops = dev->netdev_ops;
4300
4301 dev->init = ops->ndo_init;
4302 dev->uninit = ops->ndo_uninit;
4303 dev->open = ops->ndo_open;
4304 dev->change_rx_flags = ops->ndo_change_rx_flags;
4305 dev->set_rx_mode = ops->ndo_set_rx_mode;
4306 dev->set_multicast_list = ops->ndo_set_multicast_list;
4307 dev->set_mac_address = ops->ndo_set_mac_address;
4308 dev->validate_addr = ops->ndo_validate_addr;
4309 dev->do_ioctl = ops->ndo_do_ioctl;
4310 dev->set_config = ops->ndo_set_config;
4311 dev->change_mtu = ops->ndo_change_mtu;
4312 dev->tx_timeout = ops->ndo_tx_timeout;
4313 dev->get_stats = ops->ndo_get_stats;
4314 dev->vlan_rx_register = ops->ndo_vlan_rx_register;
4315 dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
4316 dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
4317 #ifdef CONFIG_NET_POLL_CONTROLLER
4318 dev->poll_controller = ops->ndo_poll_controller;
4319 #endif
4320 } else {
4321 char drivername[64];
4322 pr_info("%s (%s): not using net_device_ops yet\n",
4323 dev->name, netdev_drivername(dev, drivername, 64));
4324
4325 /* This works only because net_device_ops and the
4326 compatibility structure are the same. */
4327 dev->netdev_ops = (void *) &(dev->init);
4328 }
4329 #endif
4330
4331 /* Init, if this function is available */
4332 if (dev->netdev_ops->ndo_init) {
4333 ret = dev->netdev_ops->ndo_init(dev);
4334 if (ret) {
4335 if (ret > 0)
4336 ret = -EIO;
4337 goto out;
4338 }
4339 }
4340
4341 if (!dev_valid_name(dev->name)) {
4342 ret = -EINVAL;
4343 goto err_uninit;
4344 }
4345
4346 dev->ifindex = dev_new_index(net);
4347 if (dev->iflink == -1)
4348 dev->iflink = dev->ifindex;
4349
4350 /* Check for existence of name */
4351 head = dev_name_hash(net, dev->name);
4352 hlist_for_each(p, head) {
4353 struct net_device *d
4354 = hlist_entry(p, struct net_device, name_hlist);
4355 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
4356 ret = -EEXIST;
4357 goto err_uninit;
4358 }
4359 }
4360
4361 /* Fix illegal checksum combinations */
4362 if ((dev->features & NETIF_F_HW_CSUM) &&
4363 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4364 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4365 dev->name);
4366 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4367 }
4368
4369 if ((dev->features & NETIF_F_NO_CSUM) &&
4370 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4371 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4372 dev->name);
4373 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4374 }
4375
4376 dev->features = netdev_fix_features(dev->features, dev->name);
4377
4378 /* Enable software GSO if SG is supported. */
4379 if (dev->features & NETIF_F_SG)
4380 dev->features |= NETIF_F_GSO;
4381
4382 netdev_initialize_kobject(dev);
4383 ret = netdev_register_kobject(dev);
4384 if (ret)
4385 goto err_uninit;
4386 dev->reg_state = NETREG_REGISTERED;
4387
4388 /*
4389 * Default initial state at registration is that the
4390 * device is present.
4391 */
4392
4393 set_bit(__LINK_STATE_PRESENT, &dev->state);
4394
4395 dev_init_scheduler(dev);
4396 dev_hold(dev);
4397 list_netdevice(dev);
4398
4399 /* Notify protocols, that a new device appeared. */
4400 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
4401 ret = notifier_to_errno(ret);
4402 if (ret) {
4403 rollback_registered(dev);
4404 dev->reg_state = NETREG_UNREGISTERED;
4405 }
4406
4407 out:
4408 return ret;
4409
4410 err_uninit:
4411 if (dev->netdev_ops->ndo_uninit)
4412 dev->netdev_ops->ndo_uninit(dev);
4413 goto out;
4414 }
4415
4416 /**
4417 * register_netdev - register a network device
4418 * @dev: device to register
4419 *
4420 * Take a completed network device structure and add it to the kernel
4421 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4422 * chain. 0 is returned on success. A negative errno code is returned
4423 * on a failure to set up the device, or if the name is a duplicate.
4424 *
4425 * This is a wrapper around register_netdevice that takes the rtnl semaphore
4426 * and expands the device name if you passed a format string to
4427 * alloc_netdev.
4428 */
4429 int register_netdev(struct net_device *dev)
4430 {
4431 int err;
4432
4433 rtnl_lock();
4434
4435 /*
4436 * If the name is a format string the caller wants us to do a
4437 * name allocation.
4438 */
4439 if (strchr(dev->name, '%')) {
4440 err = dev_alloc_name(dev, dev->name);
4441 if (err < 0)
4442 goto out;
4443 }
4444
4445 err = register_netdevice(dev);
4446 out:
4447 rtnl_unlock();
4448 return err;
4449 }
4450 EXPORT_SYMBOL(register_netdev);
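/*
 * Example (hypothetical driver probe, not part of this file; the "foo"
 * names are placeholders): the usual pattern is allocate, fill in the
 * ops, register, and free again on failure:
 *
 *	struct net_device *dev = alloc_etherdev(sizeof(struct foo_priv));
 *	int err;
 *
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &foo_netdev_ops;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */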
4451
4452 /*
4453 * netdev_wait_allrefs - wait until all references are gone.
4454 *
4455 * This is called when unregistering network devices.
4456 *
4457 * Any protocol or device that holds a reference should register
4458 * for netdevice notification, and cleanup and put back the
4459 * reference if they receive an UNREGISTER event.
4460 * We can get stuck here if buggy protocols don't correctly
4461 * call dev_put.
4462 */
4463 static void netdev_wait_allrefs(struct net_device *dev)
4464 {
4465 unsigned long rebroadcast_time, warning_time;
4466
4467 rebroadcast_time = warning_time = jiffies;
4468 while (atomic_read(&dev->refcnt) != 0) {
4469 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
4470 rtnl_lock();
4471
4472 /* Rebroadcast unregister notification */
4473 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4474
4475 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4476 &dev->state)) {
4477 /* We must not have linkwatch events
4478 * pending on unregister. If this
4479 * happens, we simply run the queue
4480 * unscheduled, resulting in a noop
4481 * for this device.
4482 */
4483 linkwatch_run_queue();
4484 }
4485
4486 __rtnl_unlock();
4487
4488 rebroadcast_time = jiffies;
4489 }
4490
4491 msleep(250);
4492
4493 if (time_after(jiffies, warning_time + 10 * HZ)) {
4494 printk(KERN_EMERG "unregister_netdevice: "
4495 "waiting for %s to become free. Usage "
4496 "count = %d\n",
4497 dev->name, atomic_read(&dev->refcnt));
4498 warning_time = jiffies;
4499 }
4500 }
4501 }
4502
4503 /* The sequence is:
4504 *
4505 * rtnl_lock();
4506 * ...
4507 * register_netdevice(x1);
4508 * register_netdevice(x2);
4509 * ...
4510 * unregister_netdevice(y1);
4511 * unregister_netdevice(y2);
4512 * ...
4513 * rtnl_unlock();
4514 * free_netdev(y1);
4515 * free_netdev(y2);
4516 *
4517 * We are invoked by rtnl_unlock().
4518 * This allows us to deal with problems:
4519 * 1) We can delete sysfs objects which invoke hotplug
4520 * without deadlocking with linkwatch via keventd.
4521 * 2) Since we run with the RTNL semaphore not held, we can sleep
4522 * safely in order to wait for the netdev refcnt to drop to zero.
4523 *
4524 * We must not return until all unregister events added during
4525 * the interval the lock was held have been completed.
4526 */
4527 void netdev_run_todo(void)
4528 {
4529 struct list_head list;
4530
4531 /* Snapshot list, allow later requests */
4532 list_replace_init(&net_todo_list, &list);
4533
4534 __rtnl_unlock();
4535
4536 while (!list_empty(&list)) {
4537 struct net_device *dev
4538 = list_entry(list.next, struct net_device, todo_list);
4539 list_del(&dev->todo_list);
4540
4541 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
4542 printk(KERN_ERR "network todo '%s' but state %d\n",
4543 dev->name, dev->reg_state);
4544 dump_stack();
4545 continue;
4546 }
4547
4548 dev->reg_state = NETREG_UNREGISTERED;
4549
4550 on_each_cpu(flush_backlog, dev, 1);
4551
4552 netdev_wait_allrefs(dev);
4553
4554 /* paranoia */
4555 BUG_ON(atomic_read(&dev->refcnt));
4556 WARN_ON(dev->ip_ptr);
4557 WARN_ON(dev->ip6_ptr);
4558 WARN_ON(dev->dn_ptr);
4559
4560 if (dev->destructor)
4561 dev->destructor(dev);
4562
4563 /* Free network device */
4564 kobject_put(&dev->dev.kobj);
4565 }
4566 }
4567
4568 /**
4569 * dev_get_stats - get network device statistics
4570 * @dev: device to get statistics from
4571 *
4572 * Get network statistics from device. The device driver may provide
4573 * its own method by implementing dev->netdev_ops->ndo_get_stats; otherwise
4574 * the internal statistics structure is used.
4575 */
4576 const struct net_device_stats *dev_get_stats(struct net_device *dev)
4577 {
4578 const struct net_device_ops *ops = dev->netdev_ops;
4579
4580 if (ops->ndo_get_stats)
4581 return ops->ndo_get_stats(dev);
4582 else
4583 return &dev->stats;
4584 }
4585 EXPORT_SYMBOL(dev_get_stats);
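/*
 * Example (hypothetical driver code, not part of this file; "foo" names
 * are placeholders): a driver with hardware counters can provide its
 * own ndo_get_stats hook; most drivers instead just update dev->stats
 * and omit the hook entirely:
 *
 *	static struct net_device_stats *foo_get_stats(struct net_device *dev)
 *	{
 *		struct foo_priv *priv = netdev_priv(dev);
 *
 *		dev->stats.rx_packets = priv->hw_rx_packets;
 *		dev->stats.tx_packets = priv->hw_tx_packets;
 *		return &dev->stats;
 *	}
 */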
4586
4587 static void netdev_init_one_queue(struct net_device *dev,
4588 struct netdev_queue *queue,
4589 void *_unused)
4590 {
4591 queue->dev = dev;
4592 }
4593
4594 static void netdev_init_queues(struct net_device *dev)
4595 {
4596 netdev_init_one_queue(dev, &dev->rx_queue, NULL);
4597 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
4598 spin_lock_init(&dev->tx_global_lock);
4599 }
4600
4601 /**
4602 * alloc_netdev_mq - allocate network device
4603 * @sizeof_priv: size of private data to allocate space for
4604 * @name: device name format string
4605 * @setup: callback to initialize device
4606 * @queue_count: the number of subqueues to allocate
4607 *
4608 * Allocates a struct net_device with private data area for driver use
4609 * and performs basic initialization. Also allocates subqueue structs
4610 * for each queue on the device at the end of the netdevice.
4611 */
4612 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4613 void (*setup)(struct net_device *), unsigned int queue_count)
4614 {
4615 struct netdev_queue *tx;
4616 struct net_device *dev;
4617 size_t alloc_size;
4618 void *p;
4619
4620 BUG_ON(strlen(name) >= sizeof(dev->name));
4621
4622 alloc_size = sizeof(struct net_device);
4623 if (sizeof_priv) {
4624 /* ensure 32-byte alignment of private area */
4625 alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
4626 alloc_size += sizeof_priv;
4627 }
4628 /* ensure 32-byte alignment of whole construct */
4629 alloc_size += NETDEV_ALIGN_CONST;
4630
4631 p = kzalloc(alloc_size, GFP_KERNEL);
4632 if (!p) {
4633 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
4634 return NULL;
4635 }
4636
4637 tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
4638 if (!tx) {
4639 printk(KERN_ERR "alloc_netdev: Unable to allocate "
4640 "tx qdiscs.\n");
4641 kfree(p);
4642 return NULL;
4643 }
4644
4645 dev = (struct net_device *)
4646 (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4647 dev->padded = (char *)dev - (char *)p;
4648 dev_net_set(dev, &init_net);
4649
4650 dev->_tx = tx;
4651 dev->num_tx_queues = queue_count;
4652 dev->real_num_tx_queues = queue_count;
4653
4654 dev->gso_max_size = GSO_MAX_SIZE;
4655
4656 netdev_init_queues(dev);
4657
4658 INIT_LIST_HEAD(&dev->napi_list);
4659 setup(dev);
4660 strcpy(dev->name, name);
4661 return dev;
4662 }
4663 EXPORT_SYMBOL(alloc_netdev_mq);
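/*
 * Example (hypothetical, not part of this file; "foo_priv" is a
 * placeholder): allocating an Ethernet device with four transmit
 * queues and room for a private struct; the "eth%d" format string is
 * expanded later by register_netdev():
 *
 *	dev = alloc_netdev_mq(sizeof(struct foo_priv), "eth%d",
 *			      ether_setup, 4);
 *	if (!dev)
 *		return -ENOMEM;
 */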
4664
4665 /**
4666 * free_netdev - free network device
4667 * @dev: device
4668 *
4669 * This function does the last stage of destroying an allocated device
4670 * interface. The reference to the device object is released.
4671 * If this is the last reference then it will be freed.
4672 */
4673 void free_netdev(struct net_device *dev)
4674 {
4675 struct napi_struct *p, *n;
4676
4677 release_net(dev_net(dev));
4678
4679 kfree(dev->_tx);
4680
4681 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
4682 netif_napi_del(p);
4683
4684 /* Compatibility with error handling in drivers */
4685 if (dev->reg_state == NETREG_UNINITIALIZED) {
4686 kfree((char *)dev - dev->padded);
4687 return;
4688 }
4689
4690 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
4691 dev->reg_state = NETREG_RELEASED;
4692
4693 /* will free via device release */
4694 put_device(&dev->dev);
4695 }
4696
4697 /**
4698 * synchronize_net - Synchronize with packet receive processing
4699 *
4700 * Wait for packets currently being received to be done.
4701 * Does not block later packets from starting.
4702 */
4703 void synchronize_net(void)
4704 {
4705 might_sleep();
4706 synchronize_rcu();
4707 }
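/*
 * Example (hypothetical, not part of this file; pointer names are
 * placeholders): the usual pattern is to unpublish an object, wait for
 * in-flight receive paths to finish, and only then free it:
 *
 *	rcu_assign_pointer(some_global_ptr, NULL);
 *	synchronize_net();
 *	kfree(old_obj);
 */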
4708
4709 /**
4710 * unregister_netdevice - remove device from the kernel
4711 * @dev: device
4712 *
4713 * This function shuts down a device interface and removes it
4714 * from the kernel tables.
4715 *
4716 * Callers must hold the rtnl semaphore. You may want
4717 * unregister_netdev() instead of this.
4718 */
4719
4720 void unregister_netdevice(struct net_device *dev)
4721 {
4722 ASSERT_RTNL();
4723
4724 rollback_registered(dev);
4725 /* Finish processing unregister after unlock */
4726 net_set_todo(dev);
4727 }
4728
4729 /**
4730 * unregister_netdev - remove device from the kernel
4731 * @dev: device
4732 *
4733 * This function shuts down a device interface and removes it
4734 * from the kernel tables.
4735 *
4736 * This is just a wrapper for unregister_netdevice that takes
4737 * the rtnl semaphore. In general you want to use this and not
4738 * unregister_netdevice.
4739 */
4740 void unregister_netdev(struct net_device *dev)
4741 {
4742 rtnl_lock();
4743 unregister_netdevice(dev);
4744 rtnl_unlock();
4745 }
4746
4747 EXPORT_SYMBOL(unregister_netdev);
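/*
 * Example (hypothetical driver removal, not part of this file):
 * teardown mirrors registration; unregister_netdev() returns only
 * after the todo list has run, after which the last reference can be
 * dropped:
 *
 *	unregister_netdev(dev);
 *	free_netdev(dev);
 */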
4748
4749 /**
4750 * dev_change_net_namespace - move device to a different network namespace
4751 * @dev: device
4752 * @net: network namespace
4753 * @pat: If not NULL name pattern to try if the current device name
4754 * is already taken in the destination network namespace.
4755 *
4756 * This function shuts down a device interface and moves it
4757 * to a new network namespace. On success 0 is returned, on
4758 * a failure a negative errno code is returned.
4759 *
4760 * Callers must hold the rtnl semaphore.
4761 */
4762
4763 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
4764 {
4765 char buf[IFNAMSIZ];
4766 const char *destname;
4767 int err;
4768
4769 ASSERT_RTNL();
4770
4771 /* Don't allow namespace local devices to be moved. */
4772 err = -EINVAL;
4773 if (dev->features & NETIF_F_NETNS_LOCAL)
4774 goto out;
4775
4776 #ifdef CONFIG_SYSFS
4777 /* Don't allow real devices to be moved when sysfs
4778 * is enabled.
4779 */
4780 err = -EINVAL;
4781 if (dev->dev.parent)
4782 goto out;
4783 #endif
4784
4785 /* Ensure the device has been registered */
4786 err = -EINVAL;
4787 if (dev->reg_state != NETREG_REGISTERED)
4788 goto out;
4789
4790 /* Get out if there is nothing to do */
4791 err = 0;
4792 if (net_eq(dev_net(dev), net))
4793 goto out;
4794
4795 /* Pick the destination device name, and ensure
4796 * we can use it in the destination network namespace.
4797 */
4798 err = -EEXIST;
4799 destname = dev->name;
4800 if (__dev_get_by_name(net, destname)) {
4801 /* We get here if we can't use the current device name */
4802 if (!pat)
4803 goto out;
4804 if (!dev_valid_name(pat))
4805 goto out;
4806 if (strchr(pat, '%')) {
4807 if (__dev_alloc_name(net, pat, buf) < 0)
4808 goto out;
4809 destname = buf;
4810 } else
4811 destname = pat;
4812 if (__dev_get_by_name(net, destname))
4813 goto out;
4814 }
4815
4816 /*
4817 * And now a mini version of register_netdevice and unregister_netdevice.
4818 */
4819
4820 /* If device is running close it first. */
4821 dev_close(dev);
4822
4823 /* And unlink it from device chain */
4824 err = -ENODEV;
4825 unlist_netdevice(dev);
4826
4827 synchronize_net();
4828
4829 /* Shutdown queueing discipline. */
4830 dev_shutdown(dev);
4831
4832 /* Notify protocols that we are about to destroy
4833 this device. They should clean up all of their state.
4834 */
4835 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4836
4837 /*
4838 * Flush the unicast and multicast chains
4839 */
4840 dev_addr_discard(dev);
4841
4842 netdev_unregister_kobject(dev);
4843
4844 /* Actually switch the network namespace */
4845 dev_net_set(dev, net);
4846
4847 /* Assign the new device name */
4848 if (destname != dev->name)
4849 strcpy(dev->name, destname);
4850
4851 /* If there is an ifindex conflict assign a new one */
4852 if (__dev_get_by_index(net, dev->ifindex)) {
4853 int iflink = (dev->iflink == dev->ifindex);
4854 dev->ifindex = dev_new_index(net);
4855 if (iflink)
4856 dev->iflink = dev->ifindex;
4857 }
4858
4859 /* Fixup kobjects */
4860 err = netdev_register_kobject(dev);
4861 WARN_ON(err);
4862
4863 /* Add the device back in the hashes */
4864 list_netdevice(dev);
4865
4866 /* Notify protocols, that a new device appeared. */
4867 call_netdevice_notifiers(NETDEV_REGISTER, dev);
4868
4869 synchronize_net();
4870 err = 0;
4871 out:
4872 return err;
4873 }
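/*
 * Example (hypothetical, not part of this file; "newnet" is a
 * placeholder): moving a device into another namespace under the rtnl
 * lock, with a "dev%d" fallback pattern in case the current name is
 * already taken in the destination namespace:
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, newnet, "dev%d");
 *	rtnl_unlock();
 */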
4874
4875 static int dev_cpu_callback(struct notifier_block *nfb,
4876 unsigned long action,
4877 void *ocpu)
4878 {
4879 struct sk_buff **list_skb;
4880 struct Qdisc **list_net;
4881 struct sk_buff *skb;
4882 unsigned int cpu, oldcpu = (unsigned long)ocpu;
4883 struct softnet_data *sd, *oldsd;
4884
4885 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
4886 return NOTIFY_OK;
4887
4888 local_irq_disable();
4889 cpu = smp_processor_id();
4890 sd = &per_cpu(softnet_data, cpu);
4891 oldsd = &per_cpu(softnet_data, oldcpu);
4892
4893 /* Find end of our completion_queue. */
4894 list_skb = &sd->completion_queue;
4895 while (*list_skb)
4896 list_skb = &(*list_skb)->next;
4897 /* Append completion queue from offline CPU. */
4898 *list_skb = oldsd->completion_queue;
4899 oldsd->completion_queue = NULL;
4900
4901 /* Find end of our output_queue. */
4902 list_net = &sd->output_queue;
4903 while (*list_net)
4904 list_net = &(*list_net)->next_sched;
4905 /* Append output queue from offline CPU. */
4906 *list_net = oldsd->output_queue;
4907 oldsd->output_queue = NULL;
4908
4909 raise_softirq_irqoff(NET_TX_SOFTIRQ);
4910 local_irq_enable();
4911
4912 /* Process offline CPU's input_pkt_queue */
4913 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
4914 netif_rx(skb);
4915
4916 return NOTIFY_OK;
4917 }
4918
4919 #ifdef CONFIG_NET_DMA
4920 /**
4921 * net_dma_rebalance - try to maintain one DMA channel per CPU
4922 * @net_dma: DMA client and associated data (lock, channels, channel_mask)
4923 *
4924 * This is called when the number of channels allocated to the net_dma client
4925 * changes. The net_dma client tries to have one DMA channel per CPU.
4926 */
4927
4928 static void net_dma_rebalance(struct net_dma *net_dma)
4929 {
4930 unsigned int cpu, i, n, chan_idx;
4931 struct dma_chan *chan;
4932
4933 if (cpus_empty(net_dma->channel_mask)) {
4934 for_each_online_cpu(cpu)
4935 rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
4936 return;
4937 }
4938
4939 i = 0;
4940 cpu = first_cpu(cpu_online_map);
4941
4942 for_each_cpu_mask_nr(chan_idx, net_dma->channel_mask) {
4943 chan = net_dma->channels[chan_idx];
4944
4945 n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask))
4946 + (i < (num_online_cpus() %
4947 cpus_weight(net_dma->channel_mask)) ? 1 : 0));
4948
4949 while (n) {
4950 per_cpu(softnet_data, cpu).net_dma = chan;
4951 cpu = next_cpu(cpu, cpu_online_map);
4952 n--;
4953 }
4954 i++;
4955 }
4956 }
4957
4958 /**
4959 * netdev_dma_event - event callback for the net_dma_client
4960 * @client: should always be net_dma_client
4961 * @chan: DMA channel for the event
4962 * @state: DMA state to be handled
4963 */
4964 static enum dma_state_client
4965 netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
4966 enum dma_state state)
4967 {
4968 int i, found = 0, pos = -1;
4969 struct net_dma *net_dma =
4970 container_of(client, struct net_dma, client);
4971 enum dma_state_client ack = DMA_DUP; /* default: take no action */
4972
4973 spin_lock(&net_dma->lock);
4974 switch (state) {
4975 case DMA_RESOURCE_AVAILABLE:
4976 for (i = 0; i < nr_cpu_ids; i++)
4977 if (net_dma->channels[i] == chan) {
4978 found = 1;
4979 break;
4980 } else if (net_dma->channels[i] == NULL && pos < 0)
4981 pos = i;
4982
4983 if (!found && pos >= 0) {
4984 ack = DMA_ACK;
4985 net_dma->channels[pos] = chan;
4986 cpu_set(pos, net_dma->channel_mask);
4987 net_dma_rebalance(net_dma);
4988 }
4989 break;
4990 case DMA_RESOURCE_REMOVED:
4991 for (i = 0; i < nr_cpu_ids; i++)
4992 if (net_dma->channels[i] == chan) {
4993 found = 1;
4994 pos = i;
4995 break;
4996 }
4997
4998 if (found) {
4999 ack = DMA_ACK;
5000 cpu_clear(pos, net_dma->channel_mask);
5001 net_dma->channels[pos] = NULL;
5002 net_dma_rebalance(net_dma);
5003 }
5004 break;
5005 default:
5006 break;
5007 }
5008 spin_unlock(&net_dma->lock);
5009
5010 return ack;
5011 }
5012
5013 /**
5014 * netdev_dma_register - register the networking subsystem as a DMA client
5015 */
5016 static int __init netdev_dma_register(void)
5017 {
5018 net_dma.channels = kzalloc(nr_cpu_ids * sizeof(struct dma_chan *),
5019 GFP_KERNEL);
5020 if (unlikely(!net_dma.channels)) {
5021 printk(KERN_NOTICE
5022 "netdev_dma: no memory for net_dma.channels\n");
5023 return -ENOMEM;
5024 }
5025 spin_lock_init(&net_dma.lock);
5026 dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask);
5027 dma_async_client_register(&net_dma.client);
5028 dma_async_client_chan_request(&net_dma.client);
5029 return 0;
5030 }
5031
5032 #else
5033 static int __init netdev_dma_register(void) { return -ENODEV; }
5034 #endif /* CONFIG_NET_DMA */
5035
5036 /**
5037 * netdev_increment_features - increment feature set by one
5038 * @all: current feature set
5039 * @one: new feature set
5040 * @mask: mask feature set
5041 *
5042 * Computes a new feature set after adding a device with feature set
5043 * @one to the master device with current feature set @all. Will not
5044 * enable anything that is off in @mask. Returns the new feature set.
5045 */
5046 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5047 unsigned long mask)
5048 {
5049 /* If device needs checksumming, downgrade to it. */
5050 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5051 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5052 else if (mask & NETIF_F_ALL_CSUM) {
5053 /* If one device supports v4/v6 checksumming, set for all. */
5054 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5055 !(all & NETIF_F_GEN_CSUM)) {
5056 all &= ~NETIF_F_ALL_CSUM;
5057 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5058 }
5059
5060 /* If one device supports hw checksumming, set for all. */
5061 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5062 all &= ~NETIF_F_ALL_CSUM;
5063 all |= NETIF_F_HW_CSUM;
5064 }
5065 }
5066
5067 one |= NETIF_F_ALL_CSUM;
5068
5069 one |= all & NETIF_F_ONE_FOR_ALL;
5070 all &= one | NETIF_F_LLTX | NETIF_F_GSO;
5071 all |= one & mask & NETIF_F_ONE_FOR_ALL;
5072
5073 return all;
5074 }
5075 EXPORT_SYMBOL(netdev_increment_features);
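/*
 * Example (hypothetical, not part of this file; "master", "slaves" and
 * "nslaves" are placeholders): a master driver such as bonding or
 * bridging can recompute its feature set by folding in each slave:
 *
 *	unsigned long features = mask;
 *	int i;
 *
 *	for (i = 0; i < nslaves; i++)
 *		features = netdev_increment_features(features,
 *						     slaves[i]->features,
 *						     mask);
 *	master->features = features;
 */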
5076
5077 static struct hlist_head *netdev_create_hash(void)
5078 {
5079 int i;
5080 struct hlist_head *hash;
5081
5082 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5083 if (hash != NULL)
5084 for (i = 0; i < NETDEV_HASHENTRIES; i++)
5085 INIT_HLIST_HEAD(&hash[i]);
5086
5087 return hash;
5088 }
5089
5090 /* Initialize per network namespace state */
5091 static int __net_init netdev_init(struct net *net)
5092 {
5093 INIT_LIST_HEAD(&net->dev_base_head);
5094
5095 net->dev_name_head = netdev_create_hash();
5096 if (net->dev_name_head == NULL)
5097 goto err_name;
5098
5099 net->dev_index_head = netdev_create_hash();
5100 if (net->dev_index_head == NULL)
5101 goto err_idx;
5102
5103 return 0;
5104
5105 err_idx:
5106 kfree(net->dev_name_head);
5107 err_name:
5108 return -ENOMEM;
5109 }
5110
5111 /**
5112 * netdev_drivername - network driver for the device
5113 * @dev: network device
5114 * @buffer: buffer for resulting name
5115 * @len: size of buffer
5116 *
5117 * Determine network driver for device.
5118 */
5119 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5120 {
5121 const struct device_driver *driver;
5122 const struct device *parent;
5123
5124 if (len <= 0 || !buffer)
5125 return buffer;
5126 buffer[0] = 0;
5127
5128 parent = dev->dev.parent;
5129
5130 if (!parent)
5131 return buffer;
5132
5133 driver = parent->driver;
5134 if (driver && driver->name)
5135 strlcpy(buffer, driver->name, len);
5136 return buffer;
5137 }
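/*
 * Example (hypothetical, not part of this file): callers use this for
 * diagnostics, e.g. naming the offending driver in a warning:
 *
 *	char drivername[64];
 *
 *	printk(KERN_WARNING "%s: driver %s reported a transmit timeout\n",
 *	       dev->name,
 *	       netdev_drivername(dev, drivername, sizeof(drivername)));
 */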
5138
5139 static void __net_exit netdev_exit(struct net *net)
5140 {
5141 kfree(net->dev_name_head);
5142 kfree(net->dev_index_head);
5143 }
5144
5145 static struct pernet_operations __net_initdata netdev_net_ops = {
5146 .init = netdev_init,
5147 .exit = netdev_exit,
5148 };
5149
5150 static void __net_exit default_device_exit(struct net *net)
5151 {
5152 struct net_device *dev;
5153 /*
5154 * Push all migratable network devices back to the
5155 * initial network namespace
5156 */
5157 rtnl_lock();
5158 restart:
5159 for_each_netdev(net, dev) {
5160 int err;
5161 char fb_name[IFNAMSIZ];
5162
5163 /* Ignore unmovable devices (e.g. the loopback device) */
5164 if (dev->features & NETIF_F_NETNS_LOCAL)
5165 continue;
5166
5167 /* Delete virtual devices */
5168 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
5169 dev->rtnl_link_ops->dellink(dev);
5170 goto restart;
5171 }
5172
5173 /* Push remaining network devices to init_net */
5174 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5175 err = dev_change_net_namespace(dev, &init_net, fb_name);
5176 if (err) {
5177 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5178 __func__, dev->name, err);
5179 BUG();
5180 }
5181 goto restart;
5182 }
5183 rtnl_unlock();
5184 }
5185
5186 static struct pernet_operations __net_initdata default_device_ops = {
5187 .exit = default_device_exit,
5188 };
5189
5190 /*
5191 * Initialize the DEV module. At boot time this walks the device list and
5192 * unhooks any devices that fail to initialise (normally hardware not
5193 * present) and leaves us with a valid list of present and active devices.
5194 *
5195 */
5196
5197 /*
5198 * This is called single threaded during boot, so no need
5199 * to take the rtnl semaphore.
5200 */
5201 static int __init net_dev_init(void)
5202 {
5203 int i, rc = -ENOMEM;
5204
5205 BUG_ON(!dev_boot_phase);
5206
5207 if (dev_proc_init())
5208 goto out;
5209
5210 if (netdev_kobject_init())
5211 goto out;
5212
5213 INIT_LIST_HEAD(&ptype_all);
5214 for (i = 0; i < PTYPE_HASH_SIZE; i++)
5215 INIT_LIST_HEAD(&ptype_base[i]);
5216
5217 if (register_pernet_subsys(&netdev_net_ops))
5218 goto out;
5219
5220 /*
5221 * Initialise the packet receive queues.
5222 */
5223
5224 for_each_possible_cpu(i) {
5225 struct softnet_data *queue;
5226
5227 queue = &per_cpu(softnet_data, i);
5228 skb_queue_head_init(&queue->input_pkt_queue);
5229 queue->completion_queue = NULL;
5230 INIT_LIST_HEAD(&queue->poll_list);
5231
5232 queue->backlog.poll = process_backlog;
5233 queue->backlog.weight = weight_p;
5234 queue->backlog.gro_list = NULL;
5235 }
5236
5237 dev_boot_phase = 0;
5238
5239 /* The loopback device is special: if any other network device
5240 * is present in a network namespace, the loopback device must
5241 * be present too. Since we now dynamically allocate and free
5242 * the loopback device, maintain this invariant by keeping
5243 * the loopback device as the first device on the list of
5244 * network devices. This ensures that the loopback device
5245 * is the first device that appears and the last network
5246 * device that disappears.
5247 */
5248 if (register_pernet_device(&loopback_net_ops))
5249 goto out;
5250
5251 if (register_pernet_device(&default_device_ops))
5252 goto out;
5253
5254 netdev_dma_register();
5255
5256 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5257 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
5258
5259 hotcpu_notifier(dev_cpu_callback, 0);
5260 dst_init();
5261 dev_mcast_init();
5262 rc = 0;
5263 out:
5264 return rc;
5265 }
5266
5267 subsys_initcall(net_dev_init);
5268
5269 EXPORT_SYMBOL(__dev_get_by_index);
5270 EXPORT_SYMBOL(__dev_get_by_name);
5271 EXPORT_SYMBOL(__dev_remove_pack);
5272 EXPORT_SYMBOL(dev_valid_name);
5273 EXPORT_SYMBOL(dev_add_pack);
5274 EXPORT_SYMBOL(dev_alloc_name);
5275 EXPORT_SYMBOL(dev_close);
5276 EXPORT_SYMBOL(dev_get_by_flags);
5277 EXPORT_SYMBOL(dev_get_by_index);
5278 EXPORT_SYMBOL(dev_get_by_name);
5279 EXPORT_SYMBOL(dev_open);
5280 EXPORT_SYMBOL(dev_queue_xmit);
5281 EXPORT_SYMBOL(dev_remove_pack);
5282 EXPORT_SYMBOL(dev_set_allmulti);
5283 EXPORT_SYMBOL(dev_set_promiscuity);
5284 EXPORT_SYMBOL(dev_change_flags);
5285 EXPORT_SYMBOL(dev_set_mtu);
5286 EXPORT_SYMBOL(dev_set_mac_address);
5287 EXPORT_SYMBOL(free_netdev);
5288 EXPORT_SYMBOL(netdev_boot_setup_check);
5289 EXPORT_SYMBOL(netdev_set_master);
5290 EXPORT_SYMBOL(netdev_state_change);
5291 EXPORT_SYMBOL(netif_receive_skb);
5292 EXPORT_SYMBOL(netif_rx);
5293 EXPORT_SYMBOL(register_gifconf);
5294 EXPORT_SYMBOL(register_netdevice);
5295 EXPORT_SYMBOL(register_netdevice_notifier);
5296 EXPORT_SYMBOL(skb_checksum_help);
5297 EXPORT_SYMBOL(synchronize_net);
5298 EXPORT_SYMBOL(unregister_netdevice);
5299 EXPORT_SYMBOL(unregister_netdevice_notifier);
5300 EXPORT_SYMBOL(net_enable_timestamp);
5301 EXPORT_SYMBOL(net_disable_timestamp);
5302 EXPORT_SYMBOL(dev_get_flags);
5303
5304 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
5305 EXPORT_SYMBOL(br_handle_frame_hook);
5306 EXPORT_SYMBOL(br_fdb_get_hook);
5307 EXPORT_SYMBOL(br_fdb_put_hook);
5308 #endif
5309
5310 EXPORT_SYMBOL(dev_load);
5311
5312 EXPORT_PER_CPU_SYMBOL(softnet_data);