net/core/skbuff.c

   1 /*
   2  *      Routines having to do with the 'struct sk_buff' memory handlers.
   3  *
   4  *      Authors:        Alan Cox <alan@lxorguk.ukuu.org.uk>
   5  *                      Florian La Roche <rzsfl@rz.uni-sb.de>
   6  *
   7  *      Fixes:
   8  *              Alan Cox        :       Fixed the worst of the load
   9  *                                      balancer bugs.
  10  *              Dave Platt      :       Interrupt stacking fix.
  11  *      Richard Kooijman        :       Timestamp fixes.
  12  *              Alan Cox        :       Changed buffer format.
  13  *              Alan Cox        :       destructor hook for AF_UNIX etc.
  14  *              Linus Torvalds  :       Better skb_clone.
  15  *              Alan Cox        :       Added skb_copy.
  16  *              Alan Cox        :       Added all the changed routines Linus
  17  *                                      only put in the headers
  18  *              Ray VanTassle   :       Fixed --skb->lock in free
  19  *              Alan Cox        :       skb_copy copy arp field
  20  *              Andi Kleen      :       slabified it.
  21  *              Robert Olsson   :       Removed skb_head_pool
  22  *
  23  *      NOTE:
  24  *              The __skb_ routines should be called with interrupts
  25  *      disabled, or you better be *real* sure that the operation is atomic
  26  *      with respect to whatever list is being frobbed (e.g. via lock_sock()
  27  *      or via disabling bottom half handlers, etc).
  28  *
  29  *      This program is free software; you can redistribute it and/or
  30  *      modify it under the terms of the GNU General Public License
  31  *      as published by the Free Software Foundation; either version
  32  *      2 of the License, or (at your option) any later version.
  33  */
  34
  35 /*
  36  *      The functions in this file will not compile correctly with gcc 2.4.x
  37  */
  38
  39 #include <linux/module.h>
  40 #include <linux/types.h>
  41 #include <linux/kernel.h>
  42 #include <linux/kmemcheck.h>
  43 #include <linux/mm.h>
  44 #include <linux/interrupt.h>
  45 #include <linux/in.h>
  46 #include <linux/inet.h>
  47 #include <linux/slab.h>
  48 #include <linux/netdevice.h>
  49 #ifdef CONFIG_NET_CLS_ACT
  50 #include <net/pkt_sched.h>
  51 #endif
  52 #include <linux/string.h>
  53 #include <linux/skbuff.h>
  54 #include <linux/splice.h>
  55 #include <linux/cache.h>
  56 #include <linux/rtnetlink.h>
  57 #include <linux/init.h>
  58 #include <linux/scatterlist.h>
  59 #include <linux/errqueue.h>
  60 #include <linux/prefetch.h>
  61
  62 #include <net/protocol.h>
  63 #include <net/dst.h>
  64 #include <net/sock.h>
  65 #include <net/checksum.h>
  66 #include <net/xfrm.h>
  67
  68 #include <asm/uaccess.h>
  69 #include <trace/events/skb.h>
  70 #include <linux/highmem.h>
  71
  72 struct kmem_cache *skbuff_head_cache __read_mostly;
  73 static struct kmem_cache *skbuff_fclone_cache __read_mostly;
  74
  75 static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
  76                                   struct pipe_buffer *buf)
  77 {
  78         put_page(buf->page);
  79 }
  80
  81 static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
  82                                 struct pipe_buffer *buf)
  83 {
  84         get_page(buf->page);
  85 }
  86
  87 static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
  88                                struct pipe_buffer *buf)
  89 {
  90         return 1;
  91 }
  92
  93
  94 /* Pipe buffer operations for a socket. */
  95 static const struct pipe_buf_operations sock_pipe_buf_ops = {
  96         .can_merge = 0,
  97         .map = generic_pipe_buf_map,
  98         .unmap = generic_pipe_buf_unmap,
  99         .confirm = generic_pipe_buf_confirm,
 100         .release = sock_pipe_buf_release,
 101         .steal = sock_pipe_buf_steal,
 102         .get = sock_pipe_buf_get,
 103 };
 104
 105 /*
 106  *      Keep out-of-line to prevent kernel bloat.
 107  *      __builtin_return_address is not used because it is not always
 108  *      reliable.
 109  */
 110
 111 /**
 112  *      skb_over_panic  -       private function
 113  *      @skb: buffer
 114  *      @sz: size
 115  *      @here: address
 116  *
 117  *      Out of line support code for skb_put(). Not user callable.
 118  */
 119 static void skb_over_panic(struct sk_buff *skb, int sz, void *here)
 120 {
 121         printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p "
 122                           "data:%p tail:%#lx end:%#lx dev:%s\n",
 123                here, skb->len, sz, skb->head, skb->data,
 124                (unsigned long)skb->tail, (unsigned long)skb->end,
 125                skb->dev ? skb->dev->name : "<NULL>");
 126         BUG();
 127 }
 128
 129 /**
 130  *      skb_under_panic -       private function
 131  *      @skb: buffer
 132  *      @sz: size
 133  *      @here: address
 134  *
 135  *      Out of line support code for skb_push(). Not user callable.
 136  */
 137
 138 static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
 139 {
 140         printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p "
 141                           "data:%p tail:%#lx end:%#lx dev:%s\n",
 142                here, skb->len, sz, skb->head, skb->data,
 143                (unsigned long)skb->tail, (unsigned long)skb->end,
 144                skb->dev ? skb->dev->name : "<NULL>");
 145         BUG();
 146 }
 147
 148 /*      Allocate a new skbuff. We do this ourselves so we can fill in a few
 149  *      'private' fields and also do memory statistics to find all the
 150  *      [BEEP] leaks.
 151  *
 152  */
 153
 154 /**
 155  *      __alloc_skb     -       allocate a network buffer
 156  *      @size: size to allocate
 157  *      @gfp_mask: allocation mask
 158  *      @fclone: allocate from fclone cache instead of head cache
 159  *              and allocate a cloned (child) skb
 160  *      @node: numa node to allocate memory on
 161  *
 162  *      Allocate a new &sk_buff. The returned buffer has no headroom and a
 163  *      tail room of size bytes. The object has a reference count of one.
 164  *      The return is the buffer. On a failure the return is %NULL.
 165  *
 166  *      Buffers may only be allocated from interrupts using a @gfp_mask of
 167  *      %GFP_ATOMIC.
 168  */
 169 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 170                             int fclone, int node)
 171 {
 172         struct kmem_cache *cache;
 173         struct skb_shared_info *shinfo;
 174         struct sk_buff *skb;
 175         u8 *data;
 176
 177         cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
 178
 179         /* Get the HEAD */
 180         skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
 181         if (!skb)
 182                 goto out;
 183         prefetchw(skb);
 184
 185         /* We do our best to align skb_shared_info on a separate cache
 186          * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
 187          * aligned memory blocks, unless SLUB/SLAB debug is enabled.
 188          * Both skb->head and skb_shared_info are cache line aligned.
 189          */
 190         size = SKB_DATA_ALIGN(size);
 191         size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 192         data = kmalloc_node_track_caller(size, gfp_mask, node);
 193         if (!data)
 194                 goto nodata;
 195         /* kmalloc(size) might give us more room than requested.
 196          * Put skb_shared_info exactly at the end of allocated zone,
 197          * to allow max possible filling before reallocation.
 198          */
 199         size = SKB_WITH_OVERHEAD(ksize(data));
 200         prefetchw(data + size);
 201
 202         /*
 203          * Only clear those fields we need to clear, not those that we will
 204          * actually initialise below. Hence, don't put any more fields after
 205          * the tail pointer in struct sk_buff!
 206          */
 207         memset(skb, 0, offsetof(struct sk_buff, tail));
 208         /* Account for allocated memory : skb + skb->head */
 209         skb->truesize = SKB_TRUESIZE(size);
 210         atomic_set(&skb->users, 1);
 211         skb->head = data;
 212         skb->data = data;
 213         skb_reset_tail_pointer(skb);
 214         skb->end = skb->tail + size;
 215 #ifdef NET_SKBUFF_DATA_USES_OFFSET
 216         skb->mac_header = ~0U;
 217 #endif
 218
 219         /* make sure we initialize shinfo sequentially */
 220         shinfo = skb_shinfo(skb);
 221         memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 222         atomic_set(&shinfo->dataref, 1);
 223         kmemcheck_annotate_variable(shinfo->destructor_arg);
 224
 225         if (fclone) {
 226                 struct sk_buff *child = skb + 1;
 227                 atomic_t *fclone_ref = (atomic_t *) (child + 1);
 228
 229                 kmemcheck_annotate_bitfield(child, flags1);
 230                 kmemcheck_annotate_bitfield(child, flags2);
 231                 skb->fclone = SKB_FCLONE_ORIG;
 232                 atomic_set(fclone_ref, 1);
 233
 234                 child->fclone = SKB_FCLONE_UNAVAILABLE;
 235         }
 236 out:
 237         return skb;
 238 nodata:
 239         kmem_cache_free(cache, skb);
 240         skb = NULL;
 241         goto out;
 242 }
 243 EXPORT_SYMBOL(__alloc_skb);
 244
 245 /**
 246  * build_skb - build a network buffer
 247  * @data: data buffer provided by caller
 248  * @frag_size: size of fragment, or 0 if head was kmalloced
 249  *
 250  * Allocate a new &sk_buff. Caller provides space holding head and
 251  * skb_shared_info. @data must have been allocated by kmalloc()
 252  * The return is the new skb buffer.
 253  * On a failure the return is %NULL, and @data is not freed.
 254  * Notes :
 255  *  Before IO, driver allocates only data buffer where NIC put incoming frame
 256  *  Driver should add room at head (NET_SKB_PAD) and
 257  *  MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
 258  *  After IO, driver calls build_skb(), to allocate sk_buff and populate it
 259  *  before giving packet to stack.
 260  *  RX rings only contains data buffers, not full skbs.
 261  */
 262 struct sk_buff *build_skb(void *data, unsigned int frag_size)
 263 {
 264         struct skb_shared_info *shinfo;
 265         struct sk_buff *skb;
 266         unsigned int size = frag_size ? : ksize(data);
 267
 268         skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
 269         if (!skb)
 270                 return NULL;
 271
 272         size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 273
 274         memset(skb, 0, offsetof(struct sk_buff, tail));
 275         skb->truesize = SKB_TRUESIZE(size);
 276         skb->head_frag = frag_size != 0;
 277         atomic_set(&skb->users, 1);
 278         skb->head = data;
 279         skb->data = data;
 280         skb_reset_tail_pointer(skb);
 281         skb->end = skb->tail + size;
 282 #ifdef NET_SKBUFF_DATA_USES_OFFSET
 283         skb->mac_header = ~0U;
 284 #endif
 285
 286         /* make sure we initialize shinfo sequentially */
 287         shinfo = skb_shinfo(skb);
 288         memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 289         atomic_set(&shinfo->dataref, 1);
 290         kmemcheck_annotate_variable(shinfo->destructor_arg);
 291
 292         return skb;
 293 }
 294 EXPORT_SYMBOL(build_skb);
 295
 296 /**
 297  *      __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 298  *      @dev: network device to receive on
 299  *      @length: length to allocate
 300  *      @gfp_mask: get_free_pages mask, passed to alloc_skb
 301  *
 302  *      Allocate a new &sk_buff and assign it a usage count of one. The
 303  *      buffer has unspecified headroom built in. Users should allocate
 304  *      the headroom they think they need without accounting for the
 305  *      built in space. The built in space is used for optimisations.
 306  *
 307  *      %NULL is returned if there is no free memory.
 308  */
 309 struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
 310                 unsigned int length, gfp_t gfp_mask)
 311 {
 312         struct sk_buff *skb;
 313
 314         skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE);
 315         if (likely(skb)) {
 316                 skb_reserve(skb, NET_SKB_PAD);
 317                 skb->dev = dev;
 318         }
 319         return skb;
 320 }
 321 EXPORT_SYMBOL(__netdev_alloc_skb);
 322
 323 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
 324                      int size, unsigned int truesize)
 325 {
 326         skb_fill_page_desc(skb, i, page, off, size);
 327         skb->len += size;
 328         skb->data_len += size;
 329         skb->truesize += truesize;
 330 }
 331 EXPORT_SYMBOL(skb_add_rx_frag);
 332
 333 /**
 334  *      dev_alloc_skb - allocate an skbuff for receiving
 335  *      @length: length to allocate
 336  *
 337  *      Allocate a new &sk_buff and assign it a usage count of one. The
 338  *      buffer has unspecified headroom built in. Users should allocate
 339  *      the headroom they think they need without accounting for the
 340  *      built in space. The built in space is used for optimisations.
 341  *
 342  *      %NULL is returned if there is no free memory. Although this function
 343  *      allocates memory it can be called from an interrupt.
 344  */
 345 struct sk_buff *dev_alloc_skb(unsigned int length)
 346 {
 347         /*
 348          * There is more code here than it seems:
 349          * __dev_alloc_skb is an inline
 350          */
 351         return __dev_alloc_skb(length, GFP_ATOMIC);
 352 }
 353 EXPORT_SYMBOL(dev_alloc_skb);
 354
 355 static void skb_drop_list(struct sk_buff **listp)
 356 {
 357         struct sk_buff *list = *listp;
 358
 359         *listp = NULL;
 360
 361         do {
 362                 struct sk_buff *this = list;
 363                 list = list->next;
 364                 kfree_skb(this);
 365         } while (list);
 366 }
 367
 368 static inline void skb_drop_fraglist(struct sk_buff *skb)
 369 {
 370         skb_drop_list(&skb_shinfo(skb)->frag_list);
 371 }
 372
 373 static void skb_clone_fraglist(struct sk_buff *skb)
 374 {
 375         struct sk_buff *list;
 376
 377         skb_walk_frags(skb, list)
 378                 skb_get(list);
 379 }
 380
 381 static void skb_free_head(struct sk_buff *skb)
 382 {
 383         if (skb->head_frag)
 384                 put_page(virt_to_head_page(skb->head));
 385         else
 386                 kfree(skb->head);
 387 }
 388
 389 static void skb_release_data(struct sk_buff *skb)
 390 {
 391         if (!skb->cloned ||
 392             !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
 393                                &skb_shinfo(skb)->dataref)) {
 394                 if (skb_shinfo(skb)->nr_frags) {
 395                         int i;
 396                         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 397                                 skb_frag_unref(skb, i);
 398                 }
 399
 400                 /*
 401                  * If skb buf is from userspace, we need to notify the caller
 402                  * the lower device DMA has done;
 403                  */
 404                 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
 405                         struct ubuf_info *uarg;
 406
 407                         uarg = skb_shinfo(skb)->destructor_arg;
 408                         if (uarg->callback)
 409                                 uarg->callback(uarg);
 410                 }
 411
 412                 if (skb_has_frag_list(skb))
 413                         skb_drop_fraglist(skb);
 414
 415                 skb_free_head(skb);
 416         }
 417 }
 418
 419 /*
 420  *      Free an skbuff by memory without cleaning the state.
 421  */
 422 static void kfree_skbmem(struct sk_buff *skb)
 423 {
 424         struct sk_buff *other;
 425         atomic_t *fclone_ref;
 426
 427         switch (skb->fclone) {
 428         case SKB_FCLONE_UNAVAILABLE:
 429                 kmem_cache_free(skbuff_head_cache, skb);
 430                 break;
 431
 432         case SKB_FCLONE_ORIG:
 433                 fclone_ref = (atomic_t *) (skb + 2);
 434                 if (atomic_dec_and_test(fclone_ref))
 435                         kmem_cache_free(skbuff_fclone_cache, skb);
 436                 break;
 437
 438         case SKB_FCLONE_CLONE:
 439                 fclone_ref = (atomic_t *) (skb + 1);
 440                 other = skb - 1;
 441
 442                 /* The clone portion is available for
 443                  * fast-cloning again.
 444                  */
 445                 skb->fclone = SKB_FCLONE_UNAVAILABLE;
 446
 447                 if (atomic_dec_and_test(fclone_ref))
 448                         kmem_cache_free(skbuff_fclone_cache, other);
 449                 break;
 450         }
 451 }
 452
 453 static void skb_release_head_state(struct sk_buff *skb)
 454 {
 455         skb_dst_drop(skb);
 456 #ifdef CONFIG_XFRM
 457         secpath_put(skb->sp);
 458 #endif
 459         if (skb->destructor) {
 460                 WARN_ON(in_irq());
 461                 skb->destructor(skb);
 462         }
 463 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 464         nf_conntrack_put(skb->nfct);
 465 #endif
 466 #ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
 467         nf_conntrack_put_reasm(skb->nfct_reasm);
 468 #endif
 469 #ifdef CONFIG_BRIDGE_NETFILTER
 470         nf_bridge_put(skb->nf_bridge);
 471 #endif
 472 /* XXX: IS this still necessary? - JHS */
 473 #ifdef CONFIG_NET_SCHED
 474         skb->tc_index = 0;
 475 #ifdef CONFIG_NET_CLS_ACT
 476         skb->tc_verd = 0;
 477 #endif
 478 #endif
 479 }
 480
 481 /* Free everything but the sk_buff shell. */
 482 static void skb_release_all(struct sk_buff *skb)
 483 {
 484         skb_release_head_state(skb);
 485         skb_release_data(skb);
 486 }
 487
 488 /**
 489  *      __kfree_skb - private function
 490  *      @skb: buffer
 491  *
 492  *      Free an sk_buff. Release anything attached to the buffer.
 493  *      Clean the state. This is an internal helper function. Users should
 494  *      always call kfree_skb
 495  */
 496
 497 void __kfree_skb(struct sk_buff *skb)
 498 {
 499         skb_release_all(skb);
 500         kfree_skbmem(skb);
 501 }
 502 EXPORT_SYMBOL(__kfree_skb);
 503
 504 /**
 505  *      kfree_skb - free an sk_buff
 506  *      @skb: buffer to free
 507  *
 508  *      Drop a reference to the buffer and free it if the usage count has
 509  *      hit zero.
 510  */
 511 void kfree_skb(struct sk_buff *skb)
 512 {
 513         if (unlikely(!skb))
 514                 return;
 515         if (likely(atomic_read(&skb->users) == 1))
 516                 smp_rmb();
 517         else if (likely(!atomic_dec_and_test(&skb->users)))
 518                 return;
 519         trace_kfree_skb(skb, __builtin_return_address(0));
 520         __kfree_skb(skb);
 521 }
 522 EXPORT_SYMBOL(kfree_skb);
 523
 524 /**
 525  *      consume_skb - free an skbuff
 526  *      @skb: buffer to free
 527  *
 528  *      Drop a ref to the buffer and free it if the usage count has hit zero
 529  *      Functions identically to kfree_skb, but kfree_skb assumes that the frame
 530  *      is being dropped after a failure and notes that
 531  */
 532 void consume_skb(struct sk_buff *skb)
 533 {
 534         if (unlikely(!skb))
 535                 return;
 536         if (likely(atomic_read(&skb->users) == 1))
 537                 smp_rmb();
 538         else if (likely(!atomic_dec_and_test(&skb->users)))
 539                 return;
 540         trace_consume_skb(skb);
 541         __kfree_skb(skb);
 542 }
 543 EXPORT_SYMBOL(consume_skb);
 544
 545 /**
 546  *      skb_recycle - clean up an skb for reuse
 547  *      @skb: buffer
 548  *
 549  *      Recycles the skb to be reused as a receive buffer. This
 550  *      function does any necessary reference count dropping, and
 551  *      cleans up the skbuff as if it just came from __alloc_skb().
 552  */
 553 void skb_recycle(struct sk_buff *skb)
 554 {
 555         struct skb_shared_info *shinfo;
 556
 557         skb_release_head_state(skb);
 558
 559         shinfo = skb_shinfo(skb);
 560         memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 561         atomic_set(&shinfo->dataref, 1);
 562
 563         memset(skb, 0, offsetof(struct sk_buff, tail));
 564         skb->data = skb->head + NET_SKB_PAD;
 565         skb_reset_tail_pointer(skb);
 566 }
 567 EXPORT_SYMBOL(skb_recycle);
 568
 569 /**
 570  *      skb_recycle_check - check if skb can be reused for receive
 571  *      @skb: buffer
 572  *      @skb_size: minimum receive buffer size
 573  *
 574  *      Checks that the skb passed in is not shared or cloned, and
 575  *      that it is linear and its head portion at least as large as
 576  *      skb_size so that it can be recycled as a receive buffer.
 577  *      If these conditions are met, this function does any necessary
 578  *      reference count dropping and cleans up the skbuff as if it
 579  *      just came from __alloc_skb().
 580  */
 581 bool skb_recycle_check(struct sk_buff *skb, int skb_size)
 582 {
 583         if (!skb_is_recycleable(skb, skb_size))
 584                 return false;
 585
 586         skb_recycle(skb);
 587
 588         return true;
 589 }
 590 EXPORT_SYMBOL(skb_recycle_check);
 591
 592 static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 593 {
 594         new->tstamp             = old->tstamp;
 595         new->dev                = old->dev;
 596         new->transport_header   = old->transport_header;
 597         new->network_header     = old->network_header;
 598         new->mac_header         = old->mac_header;
 599         skb_dst_copy(new, old);
 600         new->rxhash             = old->rxhash;
 601         new->ooo_okay           = old->ooo_okay;
 602         new->l4_rxhash          = old->l4_rxhash;
 603         new->no_fcs             = old->no_fcs;
 604 #ifdef CONFIG_XFRM
 605         new->sp                 = secpath_get(old->sp);
 606 #endif
 607         memcpy(new->cb, old->cb, sizeof(old->cb));
 608         new->csum               = old->csum;
 609         new->local_df           = old->local_df;
 610         new->pkt_type           = old->pkt_type;
 611         new->ip_summed          = old->ip_summed;
 612         skb_copy_queue_mapping(new, old);
 613         new->priority           = old->priority;
 614 #if IS_ENABLED(CONFIG_IP_VS)
 615         new->ipvs_property      = old->ipvs_property;
 616 #endif
 617         new->protocol           = old->protocol;
 618         new->mark               = old->mark;
 619         new->skb_iif            = old->skb_iif;
 620         __nf_copy(new, old);
 621 #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
 622         new->nf_trace           = old->nf_trace;
 623 #endif
 624 #ifdef CONFIG_NET_SCHED
 625         new->tc_index           = old->tc_index;
 626 #ifdef CONFIG_NET_CLS_ACT
 627         new->tc_verd            = old->tc_verd;
 628 #endif
 629 #endif
 630         new->vlan_tci           = old->vlan_tci;
 631
 632         skb_copy_secmark(new, old);
 633 }
 634
 635 /*
 636  * You should not add any new code to this function.  Add it to
 637  * __copy_skb_header above instead.
 638  */
 639 static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 640 {
 641 #define C(x) n->x = skb->x
 642
 643         n->next = n->prev = NULL;
 644         n->sk = NULL;
 645         __copy_skb_header(n, skb);
 646
 647         C(len);
 648         C(data_len);
 649         C(mac_len);
 650         n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
 651         n->cloned = 1;
 652         n->nohdr = 0;
 653         n->destructor = NULL;
 654         C(tail);
 655         C(end);
 656         C(head);
 657         C(head_frag);
 658         C(data);
 659         C(truesize);
 660         atomic_set(&n->users, 1);
 661
 662         atomic_inc(&(skb_shinfo(skb)->dataref));
 663         skb->cloned = 1;
 664
 665         return n;
 666 #undef C
 667 }
 668
 669 /**
 670  *      skb_morph       -       morph one skb into another
 671  *      @dst: the skb to receive the contents
 672  *      @src: the skb to supply the contents
 673  *
 674  *      This is identical to skb_clone except that the target skb is
 675  *      supplied by the user.
 676  *
 677  *      The target skb is returned upon exit.
 678  */
 679 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
 680 {
 681         skb_release_all(dst);
 682         return __skb_clone(dst, src);
 683 }
 684 EXPORT_SYMBOL_GPL(skb_morph);
 685
 686 /*      skb_copy_ubufs  -       copy userspace skb frags buffers to kernel
 687  *      @skb: the skb to modify
 688  *      @gfp_mask: allocation priority
 689  *
 690  *      This must be called on SKBTX_DEV_ZEROCOPY skb.
 691  *      It will copy all frags into kernel and drop the reference
 692  *      to userspace pages.
 693  *
 694  *      If this function is called from an interrupt gfp_mask() must be
 695  *      %GFP_ATOMIC.
 696  *
 697  *      Returns 0 on success or a negative error code on failure
 698  *      to allocate kernel memory to copy to.
 699  */
 700 int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 701 {
 702         int i;
 703         int num_frags = skb_shinfo(skb)->nr_frags;
 704         struct page *page, *head = NULL;
 705         struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
 706
 707         for (i = 0; i < num_frags; i++) {
 708                 u8 *vaddr;
 709                 skb_frag_t *f = &skb_shinfo(skb)->frags[i];
 710
 711                 page = alloc_page(GFP_ATOMIC);
 712                 if (!page) {
 713                         while (head) {
 714                                 struct page *next = (struct page *)head->private;
 715                                 put_page(head);
 716                                 head = next;
 717                         }
 718                         return -ENOMEM;
 719                 }
 720                 vaddr = kmap_atomic(skb_frag_page(f));
 721                 memcpy(page_address(page),
 722                        vaddr + f->page_offset, skb_frag_size(f));
 723                 kunmap_atomic(vaddr);
 724                 page->private = (unsigned long)head;
 725                 head = page;
 726         }
 727
 728         /* skb frags release userspace buffers */
 729         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 730                 skb_frag_unref(skb, i);
 731
 732         uarg->callback(uarg);
 733
 734         /* skb frags point to kernel buffers */
 735         for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) {
 736                 __skb_fill_page_desc(skb, i-1, head, 0,
 737                                      skb_shinfo(skb)->frags[i - 1].size);
 738                 head = (struct page *)head->private;
 739         }
 740
 741         skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
 742         return 0;
 743 }
 744
 745
 746 /**
 747  *      skb_clone       -       duplicate an sk_buff
 748  *      @skb: buffer to clone
 749  *      @gfp_mask: allocation priority
 750  *
 751  *      Duplicate an &sk_buff. The new one is not owned by a socket. Both
 752  *      copies share the same packet data but not structure. The new
 753  *      buffer has a reference count of 1. If the allocation fails the
 754  *      function returns %NULL otherwise the new buffer is returned.
 755  *
 756  *      If this function is called from an interrupt gfp_mask() must be
 757  *      %GFP_ATOMIC.
 758  */
 759
 760 struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 761 {
 762         struct sk_buff *n;
 763
 764         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
 765                 if (skb_copy_ubufs(skb, gfp_mask))
 766                         return NULL;
 767         }
 768
 769         n = skb + 1;
 770         if (skb->fclone == SKB_FCLONE_ORIG &&
 771             n->fclone == SKB_FCLONE_UNAVAILABLE) {
 772                 atomic_t *fclone_ref = (atomic_t *) (n + 1);
 773                 n->fclone = SKB_FCLONE_CLONE;
 774                 atomic_inc(fclone_ref);
 775         } else {
 776                 n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
 777                 if (!n)
 778                         return NULL;
 779
 780                 kmemcheck_annotate_bitfield(n, flags1);
 781                 kmemcheck_annotate_bitfield(n, flags2);
 782                 n->fclone = SKB_FCLONE_UNAVAILABLE;
 783         }
 784
 785         return __skb_clone(n, skb);
 786 }
 787 EXPORT_SYMBOL(skb_clone);
 788
 789 static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 790 {
 791 #ifndef NET_SKBUFF_DATA_USES_OFFSET
 792         /*
 793          *      Shift between the two data areas in bytes
 794          */
 795         unsigned long offset = new->data - old->data;
 796 #endif
 797
 798         __copy_skb_header(new, old);
 799
 800 #ifndef NET_SKBUFF_DATA_USES_OFFSET
 801         /* {transport,network,mac}_header are relative to skb->head */
 802         new->transport_header += offset;
 803         new->network_header   += offset;
 804         if (skb_mac_header_was_set(new))
 805                 new->mac_header       += offset;
 806 #endif
 807         skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
 808         skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
 809         skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
 810 }
 811
 812 /**
 813  *      skb_copy        -       create private copy of an sk_buff
 814  *      @skb: buffer to copy
 815  *      @gfp_mask: allocation priority
 816  *
 817  *      Make a copy of both an &sk_buff and its data. This is used when the
 818  *      caller wishes to modify the data and needs a private copy of the
 819  *      data to alter. Returns %NULL on failure or the pointer to the buffer
 820  *      on success. The returned buffer has a reference count of 1.
 821  *
 822  *      As by-product this function converts non-linear &sk_buff to linear
 823  *      one, so that &sk_buff becomes completely private and caller is allowed
 824  *      to modify all the data of returned buffer. This means that this
 825  *      function is not recommended for use in circumstances when only
 826  *      header is going to be modified. Use pskb_copy() instead.
 827  */
 828
 829 struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
 830 {
 831         int headerlen = skb_headroom(skb);
 832         unsigned int size = skb_end_offset(skb) + skb->data_len;
 833         struct sk_buff *n = alloc_skb(size, gfp_mask);
 834
 835         if (!n)
 836                 return NULL;
 837
 838         /* Set the data pointer */
 839         skb_reserve(n, headerlen);
 840         /* Set the tail pointer and length */
 841         skb_put(n, skb->len);
 842
 843         if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
 844                 BUG();
 845
 846         copy_skb_header(n, skb);
 847         return n;
 848 }
 849 EXPORT_SYMBOL(skb_copy);
 850
 851 /**
 852  *      __pskb_copy     -       create copy of an sk_buff with private head.
 853  *      @skb: buffer to copy
 854  *      @headroom: headroom of new skb
 855  *      @gfp_mask: allocation priority
 856  *
 857  *      Make a copy of both an &sk_buff and part of its data, located
 858  *      in header. Fragmented data remain shared. This is used when
 859  *      the caller wishes to modify only header of &sk_buff and needs
 860  *      private copy of the header to alter. Returns %NULL on failure
 861  *      or the pointer to the buffer on success.
 862  *      The returned buffer has a reference count of 1.
 863  */
 864
 865 struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)
 866 {
 867         unsigned int size = skb_headlen(skb) + headroom;
 868         struct sk_buff *n = alloc_skb(size, gfp_mask);
 869
 870         if (!n)
 871                 goto out;
 872
 873         /* Set the data pointer */
 874         skb_reserve(n, headroom);
 875         /* Set the tail pointer and length */
 876         skb_put(n, skb_headlen(skb));
 877         /* Copy the bytes */
 878         skb_copy_from_linear_data(skb, n->data, n->len);
 879
 880         n->truesize += skb->data_len;
 881         n->data_len  = skb->data_len;
 882         n->len       = skb->len;
 883
 884         if (skb_shinfo(skb)->nr_frags) {
 885                 int i;
 886
 887                 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
 888                         if (skb_copy_ubufs(skb, gfp_mask)) {
 889                                 kfree_skb(n);
 890                                 n = NULL;
 891                                 goto out;
 892                         }
 893                 }
 894                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 895                         skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
 896                         skb_frag_ref(skb, i);
 897                 }
 898                 skb_shinfo(n)->nr_frags = i;
 899         }
 900
 901         if (skb_has_frag_list(skb)) {
 902                 skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
 903                 skb_clone_fraglist(n);
 904         }
 905
 906         copy_skb_header(n, skb);
 907 out:
 908         return n;
 909 }
 910 EXPORT_SYMBOL(__pskb_copy);
 911
 912 /**
 913  *      pskb_expand_head - reallocate header of &sk_buff
 914  *      @skb: buffer to reallocate
 915  *      @nhead: room to add at head
 916  *      @ntail: room to add at tail
 917  *      @gfp_mask: allocation priority
 918  *
 919  *      Expands (or creates identical copy, if &nhead and &ntail are zero)
 920  *      header of skb. &sk_buff itself is not changed. &sk_buff MUST have
 921  *      reference count of 1. Returns zero in the case of success or error,
 922  *      if expansion failed. In the last case, &sk_buff is not changed.
 923  *
 924  *      All the pointers pointing into skb header may change and must be
 925  *      reloaded after call to this function.
 926  */
 927
 928 int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 929                      gfp_t gfp_mask)
 930 {
 931         int i;
 932         u8 *data;
 933         int size = nhead + skb_end_offset(skb) + ntail;
 934         long off;
 935
 936         BUG_ON(nhead < 0);
 937
 938         if (skb_shared(skb))
 939                 BUG();
 940
 941         size = SKB_DATA_ALIGN(size);
 942
 943         data = kmalloc(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
 944                        gfp_mask);
 945         if (!data)
 946                 goto nodata;
 947         size = SKB_WITH_OVERHEAD(ksize(data));
 948
 949         /* Copy only real data... and, alas, header. This should be
 950          * optimized for the cases when header is void.
 951          */
 952         memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
 953
 954         memcpy((struct skb_shared_info *)(data + size),
 955                skb_shinfo(skb),
 956                offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
 957
 958         /*
 959          * if shinfo is shared we must drop the old head gracefully, but if it
 960          * is not we can just drop the old head and let the existing refcount
 961          * be since all we did is relocate the values
 962          */
 963         if (skb_cloned(skb)) {
 964                 /* copy this zero copy skb frags */
 965                 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
 966                         if (skb_copy_ubufs(skb, gfp_mask))
 967                                 goto nofrags;
 968                 }
 969                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 970                         skb_frag_ref(skb, i);
 971
 972                 if (skb_has_frag_list(skb))
 973                         skb_clone_fraglist(skb);
 974
 975                 skb_release_data(skb);
 976         } else {
 977                 skb_free_head(skb);
 978         }
 979         off = (data + nhead) - skb->head;
 980
 981         skb->head     = data;
 982         skb->head_frag = 0;
 983         skb->data    += off;
 984 #ifdef NET_SKBUFF_DATA_USES_OFFSET
 985         skb->end      = size;
 986         off           = nhead;
 987 #else
 988         skb->end      = skb->head + size;
 989 #endif
 990         /* {transport,network,mac}_header and tail are relative to skb->head */
 991         skb->tail             += off;
 992         skb->transport_header += off;
 993         skb->network_header   += off;
 994         if (skb_mac_header_was_set(skb))
 995                 skb->mac_header += off;
 996         /* Only adjust this if it actually is csum_start rather than csum */
 997         if (skb->ip_summed == CHECKSUM_PARTIAL)
 998                 skb->csum_start += nhead;
 999         skb->cloned   = 0;
1000         skb->hdr_len  = 0;
1001         skb->nohdr    = 0;
1002         atomic_set(&skb_shinfo(skb)->dataref, 1);
1003         return 0;
1004
1005 nofrags:
1006         kfree(data);
1007 nodata:
1008         return -ENOMEM;
1009 }
1010 EXPORT_SYMBOL(pskb_expand_head);
1011
1012 /* Make private copy of skb with writable head and some headroom */
1013
1014 struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
1015 {
1016         struct sk_buff *skb2;
1017         int delta = headroom - skb_headroom(skb);
1018
1019         if (delta <= 0)
1020                 skb2 = pskb_copy(skb, GFP_ATOMIC);
1021         else {
1022                 skb2 = skb_clone(skb, GFP_ATOMIC);
1023                 if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
1024                                              GFP_ATOMIC)) {
1025                         kfree_skb(skb2);
1026                         skb2 = NULL;
1027                 }
1028         }
1029         return skb2;
1030 }
1031 EXPORT_SYMBOL(skb_realloc_headroom);
1032
1033 /**
1034  *      skb_copy_expand -       copy and expand sk_buff
1035  *      @skb: buffer to copy
1036  *      @newheadroom: new free bytes at head
1037  *      @newtailroom: new free bytes at tail
1038  *      @gfp_mask: allocation priority
1039  *
1040  *      Make a copy of both an &sk_buff and its data and while doing so
1041  *      allocate additional space.
1042  *
1043  *      This is used when the caller wishes to modify the data and needs a
1044  *      private copy of the data to alter as well as more space for new fields.
1045  *      Returns %NULL on failure or the pointer to the buffer
1046  *      on success. The returned buffer has a reference count of 1.
1047  *
1048  *      You must pass %GFP_ATOMIC as the allocation priority if this function
1049  *      is called from an interrupt.
1050  */
1051 struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
1052                                 int newheadroom, int newtailroom,
1053                                 gfp_t gfp_mask)
1054 {
1055         /*
1056          *      Allocate the copy buffer
1057          */
1058         struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
1059                                       gfp_mask);
1060         int oldheadroom = skb_headroom(skb);
1061         int head_copy_len, head_copy_off;
1062         int off;
1063
1064         if (!n)
1065                 return NULL;
1066
1067         skb_reserve(n, newheadroom);
1068
1069         /* Set the tail pointer and length */
1070         skb_put(n, skb->len);
1071
1072         head_copy_len = oldheadroom;
1073         head_copy_off = 0;
1074         if (newheadroom <= head_copy_len)
1075                 head_copy_len = newheadroom;
1076         else
1077                 head_copy_off = newheadroom - head_copy_len;
1078
1079         /* Copy the linear header and data. */
1080         if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
1081                           skb->len + head_copy_len))
1082                 BUG();
1083
1084         copy_skb_header(n, skb);
1085
1086         off                  = newheadroom - oldheadroom;
1087         if (n->ip_summed == CHECKSUM_PARTIAL)
1088                 n->csum_start += off;
1089 #ifdef NET_SKBUFF_DATA_USES_OFFSET
1090         n->transport_header += off;
1091         n->network_header   += off;
1092         if (skb_mac_header_was_set(skb))
1093                 n->mac_header += off;
1094 #endif
1095
1096         return n;
1097 }
1098 EXPORT_SYMBOL(skb_copy_expand);
1099
1100 /**
1101  *      skb_pad                 -       zero pad the tail of an skb
1102  *      @skb: buffer to pad
1103  *      @pad: space to pad
1104  *
1105  *      Ensure that a buffer is followed by a padding area that is zero
1106  *      filled. Used by network drivers which may DMA or transfer data
1107  *      beyond the buffer end onto the wire.
1108  *
1109  *      May return error in out of memory cases. The skb is freed on error.
1110  */
1111
1112 int skb_pad(struct sk_buff *skb, int pad)
1113 {
1114         int err;
1115         int ntail;
1116
1117         /* If the skbuff is non linear tailroom is always zero.. */
1118         if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
1119                 memset(skb->data+skb->len, 0, pad);
1120                 return 0;
1121         }
1122
1123         ntail = skb->data_len + pad - (skb->end - skb->tail);
1124         if (likely(skb_cloned(skb) || ntail > 0)) {
1125                 err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
1126                 if (unlikely(err))
1127                         goto free_skb;
1128         }
1129
1130         /* FIXME: The use of this function with non-linear skb's really needs
1131          * to be audited.
1132          */
1133         err = skb_linearize(skb);
1134         if (unlikely(err))
1135                 goto free_skb;
1136
1137         memset(skb->data + skb->len, 0, pad);
1138         return 0;
1139
1140 free_skb:
1141         kfree_skb(skb);
1142         return err;
1143 }
1144 EXPORT_SYMBOL(skb_pad);
1145
1146 /**
1147  *      skb_put - add data to a buffer
1148  *      @skb: buffer to use
1149  *      @len: amount of data to add
1150  *
1151  *      This function extends the used data area of the buffer. If this would
1152  *      exceed the total buffer size the kernel will panic. A pointer to the
1153  *      first byte of the extra data is returned.
1154  */
1155 unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
1156 {
1157         unsigned char *tmp = skb_tail_pointer(skb);
1158         SKB_LINEAR_ASSERT(skb);
1159         skb->tail += len;
1160         skb->len  += len;
1161         if (unlikely(skb->tail > skb->end))
1162                 skb_over_panic(skb, len, __builtin_return_address(0));
1163         return tmp;
1164 }
1165 EXPORT_SYMBOL(skb_put);
1166
1167 /**
1168  *      skb_push - add data to the start of a buffer
1169  *      @skb: buffer to use
1170  *      @len: amount of data to add
1171  *
1172  *      This function extends the used data area of the buffer at the buffer
1173  *      start. If this would exceed the total buffer headroom the kernel will
1174  *      panic. A pointer to the first byte of the extra data is returned.
1175  */
1176 unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
1177 {
1178         skb->data -= len;
1179         skb->len  += len;
1180         if (unlikely(skb->data<skb->head))
1181                 skb_under_panic(skb, len, __builtin_return_address(0));
1182         return skb->data;
1183 }
1184 EXPORT_SYMBOL(skb_push);
1185
1186 /**
1187  *      skb_pull - remove data from the start of a buffer
1188  *      @skb: buffer to use
1189  *      @len: amount of data to remove
1190  *
1191  *      This function removes data from the start of a buffer, returning
1192  *      the memory to the headroom. A pointer to the next data in the buffer
1193  *      is returned. Once the data has been pulled future pushes will overwrite
1194  *      the old data.
1195  */
1196 unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
1197 {
1198         return skb_pull_inline(skb, len);
1199 }
1200 EXPORT_SYMBOL(skb_pull);
1201
1202 /**
1203  *      skb_trim - remove end from a buffer
1204  *      @skb: buffer to alter
1205  *      @len: new length
1206  *
1207  *      Cut the length of a buffer down by removing data from the tail. If
1208  *      the buffer is already under the length specified it is not modified.
1209  *      The skb must be linear.
1210  */
1211 void skb_trim(struct sk_buff *skb, unsigned int len)
1212 {
1213         if (skb->len > len)
1214                 __skb_trim(skb, len);
1215 }
1216 EXPORT_SYMBOL(skb_trim);
1217
1218 /* Trims skb to length len. It can change skb pointers.
1219  */
1220
1221 int ___pskb_trim(struct sk_buff *skb, unsigned int len)
1222 {
1223         struct sk_buff **fragp;
1224         struct sk_buff *frag;
1225         int offset = skb_headlen(skb);
1226         int nfrags = skb_shinfo(skb)->nr_frags;
1227         int i;
1228         int err;
1229
1230         if (skb_cloned(skb) &&
1231             unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
1232                 return err;
1233
1234         i = 0;
1235         if (offset >= len)
1236                 goto drop_pages;
1237
1238         for (; i < nfrags; i++) {
1239                 int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);
1240
1241                 if (end < len) {
1242                         offset = end;
1243                         continue;
1244                 }
1245
1246                 skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);
1247
1248 drop_pages:
1249                 skb_shinfo(skb)->nr_frags = i;
1250
1251                 for (; i < nfrags; i++)
1252                         skb_frag_unref(skb, i);
1253
1254                 if (skb_has_frag_list(skb))
1255                         skb_drop_fraglist(skb);
1256                 goto done;
1257         }
1258
1259         for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
1260              fragp = &frag->next) {
1261                 int end = offset + frag->len;
1262
1263                 if (skb_shared(frag)) {
1264                         struct sk_buff *nfrag;
1265
1266                         nfrag = skb_clone(frag, GFP_ATOMIC);
1267                         if (unlikely(!nfrag))
1268                                 return -ENOMEM;
1269
1270                         nfrag->next = frag->next;
1271                         consume_skb(frag);
1272                         frag = nfrag;
1273                         *fragp = frag;
1274                 }
1275
1276                 if (end < len) {
1277                         offset = end;
1278                         continue;
1279                 }
1280
1281                 if (end > len &&
1282                     unlikely((err = pskb_trim(frag, len - offset))))
1283                         return err;
1284
1285                 if (frag->next)
1286                         skb_drop_list(&frag->next);
1287                 break;
1288         }
1289
1290 done:
1291         if (len > skb_headlen(skb)) {
1292                 skb->data_len -= skb->len - len;
1293                 skb->len       = len;
1294         } else {
1295                 skb->len       = len;
1296                 skb->data_len  = 0;
1297                 skb_set_tail_pointer(skb, len);
1298         }
1299
1300         return 0;
1301 }
1302 EXPORT_SYMBOL(___pskb_trim);
1303
1304 /**
1305  *      __pskb_pull_tail - advance tail of skb header
1306  *      @skb: buffer to reallocate
1307  *      @delta: number of bytes to advance tail
1308  *
1309  *      The function makes a sense only on a fragmented &sk_buff,
1310  *      it expands header moving its tail forward and copying necessary
1311  *      data from fragmented part.
1312  *
1313  *      &sk_buff MUST have reference count of 1.
1314  *
1315  *      Returns %NULL (and &sk_buff does not change) if pull failed
1316  *      or value of new tail of skb in the case of success.
1317  *
1318  *      All the pointers pointing into skb header may change and must be
1319  *      reloaded after call to this function.
1320  */
1321
1322 /* Moves tail of skb head forward, copying data from fragmented part,
1323  * when it is necessary.
1324  * 1. It may fail due to malloc failure.
1325  * 2. It may change skb pointers.
1326  *
1327  * It is pretty complicated. Luckily, it is called only in exceptional cases.
1328  */
1329 unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
1330 {
1331         /* If skb has not enough free space at tail, get new one
1332          * plus 128 bytes for future expansions. If we have enough
1333          * room at tail, reallocate without expansion only if skb is cloned.
1334          */
1335         int i, k, eat = (skb->tail + delta) - skb->end;
1336
1337         if (eat > 0 || skb_cloned(skb)) {
1338                 if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
1339                                      GFP_ATOMIC))
1340                         return NULL;
1341         }
1342
1343         if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))
1344                 BUG();
1345
1346         /* Optimization: no fragments, no reasons to preestimate
1347          * size of pulled pages. Superb.
1348          */
1349         if (!skb_has_frag_list(skb))
1350                 goto pull_pages;
1351
1352         /* Estimate size of pulled pages. */
1353         eat = delta;
1354         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1355                 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
1356
1357                 if (size >= eat)
1358                         goto pull_pages;
1359                 eat -= size;
1360         }
1361
1362         /* If we need update frag list, we are in troubles.
1363          * Certainly, it possible to add an offset to skb data,
1364          * but taking into account that pulling is expected to
1365          * be very rare operation, it is worth to fight against
1366          * further bloating skb head and crucify ourselves here instead.
1367          * Pure masohism, indeed. 8)8)
1368          */
1369         if (eat) {
1370                 struct sk_buff *list = skb_shinfo(skb)->frag_list;
1371                 struct sk_buff *clone = NULL;
1372                 struct sk_buff *insp = NULL;
1373
1374                 do {
1375                         BUG_ON(!list);
1376
1377                         if (list->len <= eat) {
1378                                 /* Eaten as whole. */
1379                                 eat -= list->len;
1380                                 list = list->next;
1381                                 insp = list;
1382                         } else {
1383                                 /* Eaten partially. */
1384
1385                                 if (skb_shared(list)) {
1386                                         /* Sucks! We need to fork list. :-( */
1387                                         clone = skb_clone(list, GFP_ATOMIC);
1388                                         if (!clone)
1389                                                 return NULL;
1390                                         insp = list->next;
1391                                         list = clone;
1392                                 } else {
1393                                         /* This may be pulled without
1394                                          * problems. */
1395                                         insp = list;
1396                                 }
1397                                 if (!pskb_pull(list, eat)) {
1398                                         kfree_skb(clone);
1399                                         return NULL;
1400                                 }
1401                                 break;
1402                         }
1403                 } while (eat);
1404
1405                 /* Free pulled out fragments. */
1406                 while ((list = skb_shinfo(skb)->frag_list) != insp) {
1407                         skb_shinfo(skb)->frag_list = list->next;
1408                         kfree_skb(list);
1409                 }
1410                 /* And insert new clone at head. */
1411                 if (clone) {
1412                         clone->next = list;
1413                         skb_shinfo(skb)->frag_list = clone;
1414                 }
1415         }
1416         /* Success! Now we may commit changes to skb data. */
1417
1418 pull_pages:
1419         eat = delta;
1420         k = 0;
1421         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1422                 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
1423
1424                 if (size <= eat) {
1425                         skb_frag_unref(skb, i);
1426                         eat -= size;
1427                 } else {
1428                         skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
1429                         if (eat) {
1430                                 skb_shinfo(skb)->frags[k].page_offset += eat;
1431                                 skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
1432                                 eat = 0;
1433                         }
1434                         k++;
1435                 }
1436         }
1437         skb_shinfo(skb)->nr_frags = k;
1438
1439         skb->tail     += delta;
1440         skb->data_len -= delta;
1441
1442         return skb_tail_pointer(skb);
1443 }
1444 EXPORT_SYMBOL(__pskb_pull_tail);
1445
1446 /**
1447  *      skb_copy_bits - copy bits from skb to kernel buffer
1448  *      @skb: source skb
1449  *      @offset: offset in source
1450  *      @to: destination buffer
1451  *      @len: number of bytes to copy
1452  *
1453  *      Copy the specified number of bytes from the source skb to the
1454  *      destination buffer.
1455  *
1456  *      CAUTION ! :
1457  *              If its prototype is ever changed,
1458  *              check arch/{*}/net/{*}.S files,
1459  *              since it is called from BPF assembly code.
1460  */
1461 int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
1462 {
1463         int start = skb_headlen(skb);
1464         struct sk_buff *frag_iter;
1465         int i, copy;
1466
1467         if (offset > (int)skb->len - len)
1468                 goto fault;
1469
1470         /* Copy header. */
1471         if ((copy = start - offset) > 0) {
1472                 if (copy > len)
1473                         copy = len;
1474                 skb_copy_from_linear_data_offset(skb, offset, to, copy);
1475                 if ((len -= copy) == 0)
1476                         return 0;
1477                 offset += copy;
1478                 to     += copy;
1479         }
1480
1481         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1482                 int end;
1483                 skb_frag_t *f = &skb_shinfo(skb)->frags[i];
1484
1485                 WARN_ON(start > offset + len);
1486
1487                 end = start + skb_frag_size(f);
1488                 if ((copy = end - offset) > 0) {
1489                         u8 *vaddr;
1490
1491                         if (copy > len)
1492                                 copy = len;
1493
1494                         vaddr = kmap_atomic(skb_frag_page(f));
1495                         memcpy(to,
1496                                vaddr + f->page_offset + offset - start,
1497                                copy);
1498                         kunmap_atomic(vaddr);
1499
1500                         if ((len -= copy) == 0)
1501                                 return 0;
1502                         offset += copy;
1503                         to     += copy;
1504                 }
1505                 start = end;
1506         }
1507
1508         skb_walk_frags(skb, frag_iter) {
1509                 int end;
1510
1511                 WARN_ON(start > offset + len);
1512
1513                 end = start + frag_iter->len;
1514                 if ((copy = end - offset) > 0) {
1515                         if (copy > len)
1516                                 copy = len;
1517                         if (skb_copy_bits(frag_iter, offset - start, to, copy))
1518                                 goto fault;
1519                         if ((len -= copy) == 0)
1520                                 return 0;
1521                         offset += copy;
1522                         to     += copy;
1523                 }
1524                 start = end;
1525         }
1526
1527         if (!len)
1528                 return 0;
1529
1530 fault:
1531         return -EFAULT;
1532 }
1533 EXPORT_SYMBOL(skb_copy_bits);
1534
1535 /*
1536  * Callback from splice_to_pipe(), if we need to release some pages
1537  * at the end of the spd in case we error'ed out in filling the pipe.
1538  */
1539 static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
1540 {
1541         put_page(spd->pages[i]);
1542 }
1543
1544 static struct page *linear_to_page(struct page *page, unsigned int *len,
1545                                    unsigned int *offset,
1546                                    struct sk_buff *skb, struct sock *sk)
1547 {
1548         struct page *p = sk->sk_sndmsg_page;
1549         unsigned int off;
1550
1551         if (!p) {
1552 new_page:
1553                 p = sk->sk_sndmsg_page = alloc_pages(sk->sk_allocation, 0);
1554                 if (!p)
1555                         return NULL;
1556
1557                 off = sk->sk_sndmsg_off = 0;
1558                 /* hold one ref to this page until it's full */
1559         } else {
1560                 unsigned int mlen;
1561
1562                 /* If we are the only user of the page, we can reset offset */
1563                 if (page_count(p) == 1)
1564                         sk->sk_sndmsg_off = 0;
1565                 off = sk->sk_sndmsg_off;
1566                 mlen = PAGE_SIZE - off;
1567                 if (mlen < 64 && mlen < *len) {
1568                         put_page(p);
1569                         goto new_page;
1570                 }
1571
1572                 *len = min_t(unsigned int, *len, mlen);
1573         }
1574
1575         memcpy(page_address(p) + off, page_address(page) + *offset, *len);
1576         sk->sk_sndmsg_off += *len;
1577         *offset = off;
1578
1579         return p;
1580 }
1581
1582 static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
1583                              struct page *page,
1584                              unsigned int offset)
1585 {
1586         return  spd->nr_pages &&
1587                 spd->pages[spd->nr_pages - 1] == page &&
1588                 (spd->partial[spd->nr_pages - 1].offset +
1589                  spd->partial[spd->nr_pages - 1].len == offset);
1590 }
1591
1592 /*
1593  * Fill page/offset/length into spd, if it can hold more pages.
1594  */
1595 static bool spd_fill_page(struct splice_pipe_desc *spd,
1596                           struct pipe_inode_info *pipe, struct page *page,
1597                           unsigned int *len, unsigned int offset,
1598                           struct sk_buff *skb, bool linear,
1599                           struct sock *sk)
1600 {
1601         if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
1602                 return true;
1603
1604         if (linear) {
1605                 page = linear_to_page(page, len, &offset, skb, sk);
1606                 if (!page)
1607                         return true;
1608         }
1609         if (spd_can_coalesce(spd, page, offset)) {
1610                 spd->partial[spd->nr_pages - 1].len += *len;
1611                 return false;
1612         }
1613         get_page(page);
1614         spd->pages[spd->nr_pages] = page;
1615         spd->partial[spd->nr_pages].len = *len;
1616         spd->partial[spd->nr_pages].offset = offset;
1617         spd->nr_pages++;
1618
1619         return false;
1620 }
1621
1622 static inline void __segment_seek(struct page **page, unsigned int *poff,
1623                                   unsigned int *plen, unsigned int off)
1624 {
1625         unsigned long n;
1626
1627         *poff += off;
1628         n = *poff / PAGE_SIZE;
1629         if (n)
1630                 *page = nth_page(*page, n);
1631
1632         *poff = *poff % PAGE_SIZE;
1633         *plen -= off;
1634 }
1635
1636 static bool __splice_segment(struct page *page, unsigned int poff,
1637                              unsigned int plen, unsigned int *off,
1638                              unsigned int *len, struct sk_buff *skb,
1639                              struct splice_pipe_desc *spd, bool linear,
1640                              struct sock *sk,
1641                              struct pipe_inode_info *pipe)
1642 {
1643         if (!*len)
1644                 return true;
1645
1646         /* skip this segment if already processed */
1647         if (*off >= plen) {
1648                 *off -= plen;
1649                 return false;
1650         }
1651
1652         /* ignore any bits we already processed */
1653         if (*off) {
1654                 __segment_seek(&page, &poff, &plen, *off);
1655                 *off = 0;
1656         }
1657
1658         do {
1659                 unsigned int flen = min(*len, plen);
1660
1661                 /* the linear region may spread across several pages  */
1662                 flen = min_t(unsigned int, flen, PAGE_SIZE - poff);
1663
1664                 if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk))
1665                         return true;
1666
1667                 __segment_seek(&page, &poff, &plen, flen);
1668                 *len -= flen;
1669
1670         } while (*len && plen);
1671
1672         return false;
1673 }
1674
1675 /*
1676  * Map linear and fragment data from the skb to spd. It reports true if the
1677  * pipe is full or if we already spliced the requested length.
1678  */
1679 static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
1680                               unsigned int *offset, unsigned int *len,
1681                               struct splice_pipe_desc *spd, struct sock *sk)
1682 {
1683         int seg;
1684
1685         /* map the linear part :
1686          * If skb->head_frag is set, this 'linear' part is backed by a
1687          * fragment, and if the head is not shared with any clones then
1688          * we can avoid a copy since we own the head portion of this page.
1689          */
1690         if (__splice_segment(virt_to_page(skb->data),
1691                              (unsigned long) skb->data & (PAGE_SIZE - 1),
1692                              skb_headlen(skb),
1693                              offset, len, skb, spd,
1694                              skb_head_is_locked(skb),
1695                              sk, pipe))
1696                 return true;
1697
1698         /*
1699          * then map the fragments
1700          */
1701         for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
1702                 const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
1703
1704                 if (__splice_segment(skb_frag_page(f),
1705                                      f->page_offset, skb_frag_size(f),
1706                                      offset, len, skb, spd, false, sk, pipe))
1707                         return true;
1708         }
1709
1710         return false;
1711 }
1712
1713 /*
1714  * Map data from the skb to a pipe. Should handle both the linear part,
1715  * the fragments, and the frag list. It does NOT handle frag lists within
1716  * the frag list, if such a thing exists. We'd probably need to recurse to
1717  * handle that cleanly.
1718  */
1719 int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
1720                     struct pipe_inode_info *pipe, unsigned int tlen,
1721                     unsigned int flags)
1722 {
1723         struct partial_page partial[MAX_SKB_FRAGS];
1724         struct page *pages[MAX_SKB_FRAGS];
1725         struct splice_pipe_desc spd = {
1726                 .pages = pages,
1727                 .partial = partial,
1728                 .flags = flags,
1729                 .ops = &sock_pipe_buf_ops,
1730                 .spd_release = sock_spd_release,
1731         };
1732         struct sk_buff *frag_iter;
1733         struct sock *sk = skb->sk;
1734         int ret = 0;
1735
1736         /*
1737          * __skb_splice_bits() only fails if the output has no room left,
1738          * so no point in going over the frag_list for the error case.
1739          */
1740         if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
1741                 goto done;
1742         else if (!tlen)
1743                 goto done;
1744
1745         /*
1746          * now see if we have a frag_list to map
1747          */
1748         skb_walk_frags(skb, frag_iter) {
1749                 if (!tlen)
1750                         break;
1751                 if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
1752                         break;
1753         }
1754
1755 done:
1756         if (spd.nr_pages) {
1757                 /*
1758                  * Drop the socket lock, otherwise we have reverse
1759                  * locking dependencies between sk_lock and i_mutex
1760                  * here as compared to sendfile(). We enter here
1761                  * with the socket lock held, and splice_to_pipe() will
1762                  * grab the pipe inode lock. For sendfile() emulation,
1763                  * we call into ->sendpage() with the i_mutex lock held
1764                  * and networking will grab the socket lock.
1765                  */
1766                 release_sock(sk);
1767                 ret = splice_to_pipe(pipe, &spd);
1768                 lock_sock(sk);
1769         }
1770
1771         return ret;
1772 }
1773
1774 /**
1775  *      skb_store_bits - store bits from kernel buffer to skb
1776  *      @skb: destination buffer
1777  *      @offset: offset in destination
1778  *      @from: source buffer
1779  *      @len: number of bytes to copy
1780  *
1781  *      Copy the specified number of bytes from the source buffer to the
1782  *      destination skb.  This function handles all the messy bits of
1783  *      traversing fragment lists and such.
1784  */
1785
1786 int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
1787 {
1788         int start = skb_headlen(skb);
1789         struct sk_buff *frag_iter;
1790         int i, copy;
1791
1792         if (offset > (int)skb->len - len)
1793                 goto fault;
1794
1795         if ((copy = start - offset) > 0) {
1796                 if (copy > len)
1797                         copy = len;
1798                 skb_copy_to_linear_data_offset(skb, offset, from, copy);
1799                 if ((len -= copy) == 0)
1800                         return 0;
1801                 offset += copy;
1802                 from += copy;
1803         }
1804
1805         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1806                 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
1807                 int end;
1808
1809                 WARN_ON(start > offset + len);
1810
1811                 end = start + skb_frag_size(frag);
1812                 if ((copy = end - offset) > 0) {
1813                         u8 *vaddr;
1814
1815                         if (copy > len)
1816                                 copy = len;
1817
1818                         vaddr = kmap_atomic(skb_frag_page(frag));
1819                         memcpy(vaddr + frag->page_offset + offset - start,
1820                                from, copy);
1821                         kunmap_atomic(vaddr);
1822
1823                         if ((len -= copy) == 0)
1824                                 return 0;
1825                         offset += copy;
1826                         from += copy;
1827                 }
1828                 start = end;
1829         }
1830
1831         skb_walk_frags(skb, frag_iter) {
1832                 int end;
1833
1834                 WARN_ON(start > offset + len);
1835
1836                 end = start + frag_iter->len;
1837                 if ((copy = end - offset) > 0) {
1838                         if (copy > len)
1839                                 copy = len;
1840                         if (skb_store_bits(frag_iter, offset - start,
1841                                            from, copy))
1842                                 goto fault;
1843                         if ((len -= copy) == 0)
1844                                 return 0;
1845                         offset += copy;
1846                         from += copy;
1847                 }
1848                 start = end;
1849         }
1850         if (!len)
1851                 return 0;
1852
1853 fault:
1854         return -EFAULT;
1855 }
1856 EXPORT_SYMBOL(skb_store_bits);
1857
1858 /* Checksum skb data. */
1859
1860 __wsum skb_checksum(const struct sk_buff *skb, int offset,
1861                           int len, __wsum csum)
1862 {
1863         int start = skb_headlen(skb);
1864         int i, copy = start - offset;
1865         struct sk_buff *frag_iter;
1866         int pos = 0;
1867
1868         /* Checksum header. */
1869         if (copy > 0) {
1870                 if (copy > len)
1871                         copy = len;
1872                 csum = csum_partial(skb->data + offset, copy, csum);
1873                 if ((len -= copy) == 0)
1874                         return csum;
1875                 offset += copy;
1876                 pos     = copy;
1877         }
1878
1879         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1880                 int end;
1881                 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
1882
1883                 WARN_ON(start > offset + len);
1884
1885                 end = start + skb_frag_size(frag);
1886                 if ((copy = end - offset) > 0) {
1887                         __wsum csum2;
1888                         u8 *vaddr;
1889
1890                         if (copy > len)
1891                                 copy = len;
1892                         vaddr = kmap_atomic(skb_frag_page(frag));
1893                         csum2 = csum_partial(vaddr + frag->page_offset +
1894                                              offset - start, copy, 0);
1895                         kunmap_atomic(vaddr);
1896                         csum = csum_block_add(csum, csum2, pos);
1897                         if (!(len -= copy))
1898                                 return csum;
1899                         offset += copy;
1900                         pos    += copy;
1901                 }
1902                 start = end;
1903         }
1904
1905         skb_walk_frags(skb, frag_iter) {
1906                 int end;
1907
1908                 WARN_ON(start > offset + len);
1909
1910                 end = start + frag_iter->len;
1911                 if ((copy = end - offset) > 0) {
1912                         __wsum csum2;
1913                         if (copy > len)
1914                                 copy = len;
1915                         csum2 = skb_checksum(frag_iter, offset - start,
1916                                              copy, 0);
1917                         csum = csum_block_add(csum, csum2, pos);
1918                         if ((len -= copy) == 0)
1919                                 return csum;
1920                         offset += copy;
1921                         pos    += copy;
1922                 }
1923                 start = end;
1924         }
1925         BUG_ON(len);
1926
1927         return csum;
1928 }
1929 EXPORT_SYMBOL(skb_checksum);
1930
1931 /* Both of above in one bottle. */
1932
1933 __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
1934                                     u8 *to, int len, __wsum csum)
1935 {
1936         int start = skb_headlen(skb);
1937         int i, copy = start - offset;
1938         struct sk_buff *frag_iter;
1939         int pos = 0;
1940
1941         /* Copy header. */
1942         if (copy > 0) {
1943                 if (copy > len)
1944                         copy = len;
1945                 csum = csum_partial_copy_nocheck(skb->data + offset, to,
1946                                                  copy, csum);
1947                 if ((len -= copy) == 0)
1948                         return csum;
1949                 offset += copy;
1950                 to     += copy;
1951                 pos     = copy;
1952         }
1953
1954         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1955                 int end;
1956
1957                 WARN_ON(start > offset + len);
1958
1959                 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
1960                 if ((copy = end - offset) > 0) {
1961                         __wsum csum2;
1962                         u8 *vaddr;
1963                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
1964
1965                         if (copy > len)
1966                                 copy = len;
1967                         vaddr = kmap_atomic(skb_frag_page(frag));
1968                         csum2 = csum_partial_copy_nocheck(vaddr +
1969                                                           frag->page_offset +
1970                                                           offset - start, to,
1971                                                           copy, 0);
1972                         kunmap_atomic(vaddr);
1973                         csum = csum_block_add(csum, csum2, pos);
1974                         if (!(len -= copy))
1975                                 return csum;
1976                         offset += copy;
1977                         to     += copy;
1978                         pos    += copy;
1979                 }
1980                 start = end;
1981         }
1982
1983         skb_walk_frags(skb, frag_iter) {
1984                 __wsum csum2;
1985                 int end;
1986
1987                 WARN_ON(start > offset + len);
1988
1989                 end = start + frag_iter->len;
1990                 if ((copy = end - offset) > 0) {
1991                         if (copy > len)
1992                                 copy = len;
1993                         csum2 = skb_copy_and_csum_bits(frag_iter,
1994                                                        offset - start,
1995                                                        to, copy, 0);
1996                         csum = csum_block_add(csum, csum2, pos);
1997                         if ((len -= copy) == 0)
1998                                 return csum;
1999                         offset += copy;
2000                         to     += copy;
2001                         pos    += copy;
2002                 }
2003                 start = end;
2004         }
2005         BUG_ON(len);
2006         return csum;
2007 }
2008 EXPORT_SYMBOL(skb_copy_and_csum_bits);
2009
2010 void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
2011 {
2012         __wsum csum;
2013         long csstart;
2014
2015         if (skb->ip_summed == CHECKSUM_PARTIAL)
2016                 csstart = skb_checksum_start_offset(skb);
2017         else
2018                 csstart = skb_headlen(skb);
2019
2020         BUG_ON(csstart > skb_headlen(skb));
2021
2022         skb_copy_from_linear_data(skb, to, csstart);
2023
2024         csum = 0;
2025         if (csstart != skb->len)
2026                 csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
2027                                               skb->len - csstart, 0);
2028
2029         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2030                 long csstuff = csstart + skb->csum_offset;
2031
2032                 *((__sum16 *)(to + csstuff)) = csum_fold(csum);
2033         }
2034 }
2035 EXPORT_SYMBOL(skb_copy_and_csum_dev);
2036
2037 /**
2038  *      skb_dequeue - remove from the head of the queue
2039  *      @list: list to dequeue from
2040  *
2041  *      Remove the head of the list. The list lock is taken so the function
2042  *      may be used safely with other locking list functions. The head item is
2043  *      returned or %NULL if the list is empty.
2044  */
2045
2046 struct sk_buff *skb_dequeue(struct sk_buff_head *list)
2047 {
2048         unsigned long flags;
2049         struct sk_buff *result;
2050
2051         spin_lock_irqsave(&list->lock, flags);
2052         result = __skb_dequeue(list);
2053         spin_unlock_irqrestore(&list->lock, flags);
2054         return result;
2055 }
2056 EXPORT_SYMBOL(skb_dequeue);
2057
2058 /**
2059  *      skb_dequeue_tail - remove from the tail of the queue
2060  *      @list: list to dequeue from
2061  *
2062  *      Remove the tail of the list. The list lock is taken so the function
2063  *      may be used safely with other locking list functions. The tail item is
2064  *      returned or %NULL if the list is empty.
2065  */
2066 struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
2067 {
2068         unsigned long flags;
2069         struct sk_buff *result;
2070
2071         spin_lock_irqsave(&list->lock, flags);
2072         result = __skb_dequeue_tail(list);
2073         spin_unlock_irqrestore(&list->lock, flags);
2074         return result;
2075 }
2076 EXPORT_SYMBOL(skb_dequeue_tail);
2077
2078 /**
2079  *      skb_queue_purge - empty a list
2080  *      @list: list to empty
2081  *
2082  *      Delete all buffers on an &sk_buff list. Each buffer is removed from
2083  *      the list and one reference dropped. This function takes the list
2084  *      lock and is atomic with respect to other list locking functions.
2085  */
2086 void skb_queue_purge(struct sk_buff_head *list)
2087 {
2088         struct sk_buff *skb;
2089         while ((skb = skb_dequeue(list)) != NULL)
2090                 kfree_skb(skb);
2091 }
2092 EXPORT_SYMBOL(skb_queue_purge);
2093
2094 /**
2095  *      skb_queue_head - queue a buffer at the list head
2096  *      @list: list to use
2097  *      @newsk: buffer to queue
2098  *
2099  *      Queue a buffer at the start of the list. This function takes the
2100  *      list lock and can be used safely with other locking &sk_buff functions
2101  *      safely.
2102  *
2103  *      A buffer cannot be placed on two lists at the same time.
2104  */
2105 void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
2106 {
2107         unsigned long flags;
2108
2109         spin_lock_irqsave(&list->lock, flags);
2110         __skb_queue_head(list, newsk);
2111         spin_unlock_irqrestore(&list->lock, flags);
2112 }
2113 EXPORT_SYMBOL(skb_queue_head);
2114
2115 /**
2116  *      skb_queue_tail - queue a buffer at the list tail
2117  *      @list: list to use
2118  *      @newsk: buffer to queue
2119  *
2120  *      Queue a buffer at the tail of the list. This function takes the
2121  *      list lock and can be used safely with other locking &sk_buff functions
2122  *      safely.
2123  *
2124  *      A buffer cannot be placed on two lists at the same time.
2125  */
2126 void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
2127 {
2128         unsigned long flags;
2129
2130         spin_lock_irqsave(&list->lock, flags);
2131         __skb_queue_tail(list, newsk);
2132         spin_unlock_irqrestore(&list->lock, flags);
2133 }
2134 EXPORT_SYMBOL(skb_queue_tail);
2135
2136 /**
2137  *      skb_unlink      -       remove a buffer from a list
2138  *      @skb: buffer to remove
2139  *      @list: list to use
2140  *
2141  *      Remove a packet from a list. The list locks are taken and this
2142  *      function is atomic with respect to other list locked calls
2143  *
2144  *      You must know what list the SKB is on.
2145  */
2146 void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
2147 {
2148         unsigned long flags;
2149
2150         spin_lock_irqsave(&list->lock, flags);
2151         __skb_unlink(skb, list);
2152         spin_unlock_irqrestore(&list->lock, flags);
2153 }
2154 EXPORT_SYMBOL(skb_unlink);
2155
2156 /**
2157  *      skb_append      -       append a buffer
2158  *      @old: buffer to insert after
2159  *      @newsk: buffer to insert
2160  *      @list: list to use
2161  *
2162  *      Place a packet after a given packet in a list. The list locks are taken
2163  *      and this function is atomic with respect to other list locked calls.
2164  *      A buffer cannot be placed on two lists at the same time.
2165  */
2166 void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
2167 {
2168         unsigned long flags;
2169
2170         spin_lock_irqsave(&list->lock, flags);
2171         __skb_queue_after(list, old, newsk);
2172         spin_unlock_irqrestore(&list->lock, flags);
2173 }
2174 EXPORT_SYMBOL(skb_append);
2175
2176 /**
2177  *      skb_insert      -       insert a buffer
2178  *      @old: buffer to insert before
2179  *      @newsk: buffer to insert
2180  *      @list: list to use
2181  *
2182  *      Place a packet before a given packet in a list. The list locks are
2183  *      taken and this function is atomic with respect to other list locked
2184  *      calls.
2185  *
2186  *      A buffer cannot be placed on two lists at the same time.
2187  */
2188 void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
2189 {
2190         unsigned long flags;
2191
2192         spin_lock_irqsave(&list->lock, flags);
2193         __skb_insert(newsk, old->prev, old, list);
2194         spin_unlock_irqrestore(&list->lock, flags);
2195 }
2196 EXPORT_SYMBOL(skb_insert);
2197
2198 static inline void skb_split_inside_header(struct sk_buff *skb,
2199                                            struct sk_buff* skb1,
2200                                            const u32 len, const int pos)
2201 {
2202         int i;
2203
2204         skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
2205                                          pos - len);
2206         /* And move data appendix as is. */
2207         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
2208                 skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
2209
2210         skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
2211         skb_shinfo(skb)->nr_frags  = 0;
2212         skb1->data_len             = skb->data_len;
2213         skb1->len                  += skb1->data_len;
2214         skb->data_len              = 0;
2215         skb->len                   = len;
2216         skb_set_tail_pointer(skb, len);
2217 }
2218
2219 static inline void skb_split_no_header(struct sk_buff *skb,
2220                                        struct sk_buff* skb1,
2221                                        const u32 len, int pos)
2222 {
2223         int i, k = 0;
2224         const int nfrags = skb_shinfo(skb)->nr_frags;
2225
2226         skb_shinfo(skb)->nr_frags = 0;
2227         skb1->len                 = skb1->data_len = skb->len - len;
2228         skb->len                  = len;
2229         skb->data_len             = len - pos;
2230
2231         for (i = 0; i < nfrags; i++) {
2232                 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
2233
2234                 if (pos + size > len) {
2235                         skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
2236
2237                         if (pos < len) {
2238                                 /* Split frag.
2239                                  * We have two variants in this case:
2240                                  * 1. Move all the frag to the second
2241                                  *    part, if it is possible. F.e.
2242                                  *    this approach is mandatory for TUX,
2243                                  *    where splitting is expensive.
2244                                  * 2. Split is accurately. We make this.
2245                                  */
2246                                 skb_frag_ref(skb, i);
2247                                 skb_shinfo(skb1)->frags[0].page_offset += len - pos;
2248                                 skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
2249                                 skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
2250                                 skb_shinfo(skb)->nr_frags++;
2251                         }
2252                         k++;
2253                 } else
2254                         skb_shinfo(skb)->nr_frags++;
2255                 pos += size;
2256         }
2257         skb_shinfo(skb1)->nr_frags = k;
2258 }
2259
2260 /**
2261  * skb_split - Split fragmented skb to two parts at length len.
2262  * @skb: the buffer to split
2263  * @skb1: the buffer to receive the second part
2264  * @len: new length for skb
2265  */
2266 void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
2267 {
2268         int pos = skb_headlen(skb);
2269
2270         if (len < pos)  /* Split line is inside header. */
2271                 skb_split_inside_header(skb, skb1, len, pos);
2272         else            /* Second chunk has no header, nothing to copy. */
2273                 skb_split_no_header(skb, skb1, len, pos);
2274 }
2275 EXPORT_SYMBOL(skb_split);
2276
2277 /* Shifting from/to a cloned skb is a no-go.
2278  *
2279  * Caller cannot keep skb_shinfo related pointers past calling here!
2280  */
2281 static int skb_prepare_for_shift(struct sk_buff *skb)
2282 {
2283         return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2284 }
2285
2286 /**
2287  * skb_shift - Shifts paged data partially from skb to another
2288  * @tgt: buffer into which tail data gets added
2289  * @skb: buffer from which the paged data comes from
2290  * @shiftlen: shift up to this many bytes
2291  *
2292  * Attempts to shift up to shiftlen worth of bytes, which may be less than
2293  * the length of the skb, from skb to tgt. Returns number bytes shifted.
2294  * It's up to caller to free skb if everything was shifted.
2295  *
2296  * If @tgt runs out of frags, the whole operation is aborted.
2297  *
2298  * Skb cannot include anything else but paged data while tgt is allowed
2299  * to have non-paged data as well.
2300  *
2301  * TODO: full sized shift could be optimized but that would need
2302  * specialized skb free'er to handle frags without up-to-date nr_frags.
2303  */
2304 int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
2305 {
2306         int from, to, merge, todo;
2307         struct skb_frag_struct *fragfrom, *fragto;
2308
2309         BUG_ON(shiftlen > skb->len);
2310         BUG_ON(skb_headlen(skb));       /* Would corrupt stream */
2311
2312         todo = shiftlen;
2313         from = 0;
2314         to = skb_shinfo(tgt)->nr_frags;
2315         fragfrom = &skb_shinfo(skb)->frags[from];
2316
2317         /* Actual merge is delayed until the point when we know we can
2318          * commit all, so that we don't have to undo partial changes
2319          */
2320         if (!to ||
2321             !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
2322                               fragfrom->page_offset)) {
2323                 merge = -1;
2324         } else {
2325                 merge = to - 1;
2326
2327                 todo -= skb_frag_size(fragfrom);
2328                 if (todo < 0) {
2329                         if (skb_prepare_for_shift(skb) ||
2330                             skb_prepare_for_shift(tgt))
2331                                 return 0;
2332
2333                         /* All previous frag pointers might be stale! */
2334                         fragfrom = &skb_shinfo(skb)->frags[from];
2335                         fragto = &skb_shinfo(tgt)->frags[merge];
2336
2337                         skb_frag_size_add(fragto, shiftlen);
2338                         skb_frag_size_sub(fragfrom, shiftlen);
2339                         fragfrom->page_offset += shiftlen;
2340
2341                         goto onlymerged;
2342                 }
2343
2344                 from++;
2345         }
2346
2347         /* Skip full, not-fitting skb to avoid expensive operations */
2348         if ((shiftlen == skb->len) &&
2349             (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
2350                 return 0;
2351
2352         if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
2353                 return 0;
2354
2355         while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
2356                 if (to == MAX_SKB_FRAGS)
2357                         return 0;
2358
2359                 fragfrom = &skb_shinfo(skb)->frags[from];
2360                 fragto = &skb_shinfo(tgt)->frags[to];
2361
2362                 if (todo >= skb_frag_size(fragfrom)) {
2363                         *fragto = *fragfrom;
2364                         todo -= skb_frag_size(fragfrom);
2365                         from++;
2366                         to++;
2367
2368                 } else {
2369                         __skb_frag_ref(fragfrom);
2370                         fragto->page = fragfrom->page;
2371                         fragto->page_offset = fragfrom->page_offset;
2372                         skb_frag_size_set(fragto, todo);
2373
2374                         fragfrom->page_offset += todo;
2375                         skb_frag_size_sub(fragfrom, todo);
2376                         todo = 0;
2377
2378                         to++;
2379                         break;
2380                 }
2381         }
2382
2383         /* Ready to "commit" this state change to tgt */
2384         skb_shinfo(tgt)->nr_frags = to;
2385
2386         if (merge >= 0) {
2387                 fragfrom = &skb_shinfo(skb)->frags[0];
2388                 fragto = &skb_shinfo(tgt)->frags[merge];
2389
2390                 skb_frag_size_add(fragto, skb_frag_size(fragfrom));
2391                 __skb_frag_unref(fragfrom);
2392         }
2393
2394         /* Reposition in the original skb */
2395         to = 0;
2396         while (from < skb_shinfo(skb)->nr_frags)
2397                 skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
2398         skb_shinfo(skb)->nr_frags = to;
2399
2400         BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);
2401
2402 onlymerged:
2403         /* Most likely the tgt won't ever need its checksum anymore, skb on
2404          * the other hand might need it if it needs to be resent
2405          */
2406         tgt->ip_summed = CHECKSUM_PARTIAL;
2407         skb->ip_summed = CHECKSUM_PARTIAL;
2408
2409         /* Yak, is it really working this way? Some helper please? */
2410         skb->len -= shiftlen;
2411         skb->data_len -= shiftlen;
2412         skb->truesize -= shiftlen;
2413         tgt->len += shiftlen;
2414         tgt->data_len += shiftlen;
2415         tgt->truesize += shiftlen;
2416
2417         return shiftlen;
2418 }
2419
2420 /**
2421  * skb_prepare_seq_read - Prepare a sequential read of skb data
2422  * @skb: the buffer to read
2423  * @from: lower offset of data to be read
2424  * @to: upper offset of data to be read
2425  * @st: state variable
2426  *
2427  * Initializes the specified state variable. Must be called before
2428  * invoking skb_seq_read() for the first time.
2429  */
2430 void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
2431                           unsigned int to, struct skb_seq_state *st)
2432 {
2433         st->lower_offset = from;
2434         st->upper_offset = to;
2435         st->root_skb = st->cur_skb = skb;
2436         st->frag_idx = st->stepped_offset = 0;
2437         st->frag_data = NULL;
2438 }
2439 EXPORT_SYMBOL(skb_prepare_seq_read);
2440
2441 /**
2442  * skb_seq_read - Sequentially read skb data
2443  * @consumed: number of bytes consumed by the caller so far
2444  * @data: destination pointer for data to be returned
2445  * @st: state variable
2446  *
2447  * Reads a block of skb data at &consumed relative to the
2448  * lower offset specified to skb_prepare_seq_read(). Assigns
2449  * the head of the data block to &data and returns the length
2450  * of the block or 0 if the end of the skb data or the upper
2451  * offset has been reached.
2452  *
2453  * The caller is not required to consume all of the data
2454  * returned, i.e. &consumed is typically set to the number
2455  * of bytes already consumed and the next call to
2456  * skb_seq_read() will return the remaining part of the block.
2457  *
2458  * Note 1: The size of each block of data returned can be arbitrary,
2459  *       this limitation is the cost for zerocopy seqeuental
2460  *       reads of potentially non linear data.
2461  *
2462  * Note 2: Fragment lists within fragments are not implemented
2463  *       at the moment, state->root_skb could be replaced with
2464  *       a stack for this purpose.
2465  */
2466 unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
2467                           struct skb_seq_state *st)
2468 {
2469         unsigned int block_limit, abs_offset = consumed + st->lower_offset;
2470         skb_frag_t *frag;
2471
2472         if (unlikely(abs_offset >= st->upper_offset))
2473                 return 0;
2474
2475 next_skb:
2476         block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;
2477
2478         if (abs_offset < block_limit && !st->frag_data) {
2479                 *data = st->cur_skb->data + (abs_offset - st->stepped_offset);
2480                 return block_limit - abs_offset;
2481         }
2482
2483         if (st->frag_idx == 0 && !st->frag_data)
2484                 st->stepped_offset += skb_headlen(st->cur_skb);
2485
2486         while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
2487                 frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
2488                 block_limit = skb_frag_size(frag) + st->stepped_offset;
2489
2490                 if (abs_offset < block_limit) {
2491                         if (!st->frag_data)
2492                                 st->frag_data = kmap_atomic(skb_frag_page(frag));
2493
2494                         *data = (u8 *) st->frag_data + frag->page_offset +
2495                                 (abs_offset - st->stepped_offset);
2496
2497                         return block_limit - abs_offset;
2498                 }
2499
2500                 if (st->frag_data) {
2501                         kunmap_atomic(st->frag_data);
2502                         st->frag_data = NULL;
2503                 }
2504
2505                 st->frag_idx++;
2506                 st->stepped_offset += skb_frag_size(frag);
2507         }
2508
2509         if (st->frag_data) {
2510                 kunmap_atomic(st->frag_data);
2511                 st->frag_data = NULL;
2512         }
2513
2514         if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
2515                 st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
2516                 st->frag_idx = 0;
2517                 goto next_skb;
2518         } else if (st->cur_skb->next) {
2519                 st->cur_skb = st->cur_skb->next;
2520                 st->frag_idx = 0;
2521                 goto next_skb;
2522         }
2523
2524         return 0;
2525 }
2526 EXPORT_SYMBOL(skb_seq_read);
2527
2528 /**
2529  * skb_abort_seq_read - Abort a sequential read of skb data
2530  * @st: state variable
2531  *
2532  * Must be called if skb_seq_read() was not called until it
2533  * returned 0.
2534  */
2535 void skb_abort_seq_read(struct skb_seq_state *st)
2536 {
2537         if (st->frag_data)
2538                 kunmap_atomic(st->frag_data);
2539 }
2540 EXPORT_SYMBOL(skb_abort_seq_read);
2541
2542 #define TS_SKB_CB(state)        ((struct skb_seq_state *) &((state)->cb))
2543
2544 static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
2545                                           struct ts_config *conf,
2546                                           struct ts_state *state)
2547 {
2548         return skb_seq_read(offset, text, TS_SKB_CB(state));
2549 }
2550
2551 static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
2552 {
2553         skb_abort_seq_read(TS_SKB_CB(state));
2554 }
2555
2556 /**
2557  * skb_find_text - Find a text pattern in skb data
2558  * @skb: the buffer to look in
2559  * @from: search offset
2560  * @to: search limit
2561  * @config: textsearch configuration
2562  * @state: uninitialized textsearch state variable
2563  *
2564  * Finds a pattern in the skb data according to the specified
2565  * textsearch configuration. Use textsearch_next() to retrieve
2566  * subsequent occurrences of the pattern. Returns the offset
2567  * to the first occurrence or UINT_MAX if no match was found.
2568  */
2569 unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
2570                            unsigned int to, struct ts_config *config,
2571                            struct ts_state *state)
2572 {
2573         unsigned int ret;
2574
2575         config->get_next_block = skb_ts_get_next_block;
2576         config->finish = skb_ts_finish;
2577
2578         skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
2579
2580         ret = textsearch_find(config, state);
2581         return (ret <= to - from ? ret : UINT_MAX);
2582 }
2583 EXPORT_SYMBOL(skb_find_text);
2584
2585 /**
2586  * skb_append_datato_frags: - append the user data to a skb
2587  * @sk: sock  structure
2588  * @skb: skb structure to be appened with user data.
2589  * @getfrag: call back function to be used for getting the user data
2590  * @from: pointer to user message iov
2591  * @length: length of the iov message
2592  *
2593  * Description: This procedure append the user data in the fragment part
2594  * of the skb if any page alloc fails user this procedure returns  -ENOMEM
2595  */
2596 int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
2597                         int (*getfrag)(void *from, char *to, int offset,
2598                                         int len, int odd, struct sk_buff *skb),
2599                         void *from, int length)
2600 {
2601         int frg_cnt = 0;
2602         skb_frag_t *frag = NULL;
2603         struct page *page = NULL;
2604         int copy, left;
2605         int offset = 0;
2606         int ret;
2607
2608         do {
2609                 /* Return error if we don't have space for new frag */
2610                 frg_cnt = skb_shinfo(skb)->nr_frags;
2611                 if (frg_cnt >= MAX_SKB_FRAGS)
2612                         return -EFAULT;
2613
2614                 /* allocate a new page for next frag */
2615                 page = alloc_pages(sk->sk_allocation, 0);
2616
2617                 /* If alloc_page fails just return failure and caller will
2618                  * free previous allocated pages by doing kfree_skb()
2619                  */
2620                 if (page == NULL)
2621                         return -ENOMEM;
2622
2623                 /* initialize the next frag */
2624                 skb_fill_page_desc(skb, frg_cnt, page, 0, 0);
2625                 skb->truesize += PAGE_SIZE;
2626                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
2627
2628                 /* get the new initialized frag */
2629                 frg_cnt = skb_shinfo(skb)->nr_frags;
2630                 frag = &skb_shinfo(skb)->frags[frg_cnt - 1];
2631
2632                 /* copy the user data to page */
2633                 left = PAGE_SIZE - frag->page_offset;
2634                 copy = (length > left)? left : length;
2635
2636                 ret = getfrag(from, skb_frag_address(frag) + skb_frag_size(frag),
2637                             offset, copy, 0, skb);
2638                 if (ret < 0)
2639                         return -EFAULT;
2640
2641                 /* copy was successful so update the size parameters */
2642                 skb_frag_size_add(frag, copy);
2643                 skb->len += copy;
2644                 skb->data_len += copy;
2645                 offset += copy;
2646                 length -= copy;
2647
2648         } while (length > 0);
2649
2650         return 0;
2651 }
2652 EXPORT_SYMBOL(skb_append_datato_frags);
2653
2654 /**
2655  *      skb_pull_rcsum - pull skb and update receive checksum
2656  *      @skb: buffer to update
2657  *      @len: length of data pulled
2658  *
2659  *      This function performs an skb_pull on the packet and updates
2660  *      the CHECKSUM_COMPLETE checksum.  It should be used on
2661  *      receive path processing instead of skb_pull unless you know
2662  *      that the checksum difference is zero (e.g., a valid IP header)
2663  *      or you are setting ip_summed to CHECKSUM_NONE.
2664  */
2665 unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
2666 {
2667         BUG_ON(len > skb->len);
2668         skb->len -= len;
2669         BUG_ON(skb->len < skb->data_len);
2670         skb_postpull_rcsum(skb, skb->data, len);
2671         return skb->data += len;
2672 }
2673 EXPORT_SYMBOL_GPL(skb_pull_rcsum);
2674
2675 /**
2676  *      skb_segment - Perform protocol segmentation on skb.
2677  *      @skb: buffer to segment
2678  *      @features: features for the output path (see dev->features)
2679  *
2680  *      This function performs segmentation on the given skb.  It returns
2681  *      a pointer to the first in a list of new skbs for the segments.
2682  *      In case of error it returns ERR_PTR(err).
2683  */
2684 struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
2685 {
2686         struct sk_buff *segs = NULL;
2687         struct sk_buff *tail = NULL;
2688         struct sk_buff *fskb = skb_shinfo(skb)->frag_list;
2689         unsigned int mss = skb_shinfo(skb)->gso_size;
2690         unsigned int doffset = skb->data - skb_mac_header(skb);
2691         unsigned int offset = doffset;
2692         unsigned int headroom;
2693         unsigned int len;
2694         int sg = !!(features & NETIF_F_SG);
2695         int nfrags = skb_shinfo(skb)->nr_frags;
2696         int err = -ENOMEM;
2697         int i = 0;
2698         int pos;
2699
2700         __skb_push(skb, doffset);
2701         headroom = skb_headroom(skb);
2702         pos = skb_headlen(skb);
2703
2704         do {
2705                 struct sk_buff *nskb;
2706                 skb_frag_t *frag;
2707                 int hsize;
2708                 int size;
2709
2710                 len = skb->len - offset;
2711                 if (len > mss)
2712                         len = mss;
2713
2714                 hsize = skb_headlen(skb) - offset;
2715                 if (hsize < 0)
2716                         hsize = 0;
2717                 if (hsize > len || !sg)
2718                         hsize = len;
2719
2720                 if (!hsize && i >= nfrags) {
2721                         BUG_ON(fskb->len != len);
2722
2723                         pos += len;
2724                         nskb = skb_clone(fskb, GFP_ATOMIC);
2725                         fskb = fskb->next;
2726
2727                         if (unlikely(!nskb))
2728                                 goto err;
2729
2730                         hsize = skb_end_offset(nskb);
2731                         if (skb_cow_head(nskb, doffset + headroom)) {
2732                                 kfree_skb(nskb);
2733                                 goto err;
2734                         }
2735
2736                         nskb->truesize += skb_end_offset(nskb) - hsize;
2737                         skb_release_head_state(nskb);
2738                         __skb_push(nskb, doffset);
2739                 } else {
2740                         nskb = alloc_skb(hsize + doffset + headroom,
2741                                          GFP_ATOMIC);
2742
2743                         if (unlikely(!nskb))
2744                                 goto err;
2745
2746                         skb_reserve(nskb, headroom);
2747                         __skb_put(nskb, doffset);
2748                 }
2749
2750                 if (segs)
2751                         tail->next = nskb;
2752                 else
2753                         segs = nskb;
2754                 tail = nskb;
2755
2756                 __copy_skb_header(nskb, skb);
2757                 nskb->mac_len = skb->mac_len;
2758
2759                 /* nskb and skb might have different headroom */
2760                 if (nskb->ip_summed == CHECKSUM_PARTIAL)
2761                         nskb->csum_start += skb_headroom(nskb) - headroom;
2762
2763                 skb_reset_mac_header(nskb);
2764                 skb_set_network_header(nskb, skb->mac_len);
2765                 nskb->transport_header = (nskb->network_header +
2766                                           skb_network_header_len(skb));
2767                 skb_copy_from_linear_data(skb, nskb->data, doffset);
2768
2769                 if (fskb != skb_shinfo(skb)->frag_list)
2770                         continue;
2771
2772                 if (!sg) {
2773                         nskb->ip_summed = CHECKSUM_NONE;
2774                         nskb->csum = skb_copy_and_csum_bits(skb, offset,
2775                                                             skb_put(nskb, len),
2776                                                             len, 0);
2777                         continue;
2778                 }
2779
2780                 frag = skb_shinfo(nskb)->frags;
2781
2782                 skb_copy_from_linear_data_offset(skb, offset,
2783                                                  skb_put(nskb, hsize), hsize);
2784
2785                 while (pos < offset + len && i < nfrags) {
2786                         *frag = skb_shinfo(skb)->frags[i];
2787                         __skb_frag_ref(frag);
2788                         size = skb_frag_size(frag);
2789
2790                         if (pos < offset) {
2791                                 frag->page_offset += offset - pos;
2792                                 skb_frag_size_sub(frag, offset - pos);
2793                         }
2794
2795                         skb_shinfo(nskb)->nr_frags++;
2796
2797                         if (pos + size <= offset + len) {
2798                                 i++;
2799                                 pos += size;
2800                         } else {
2801                                 skb_frag_size_sub(frag, pos + size - (offset + len));
2802                                 goto skip_fraglist;
2803                         }
2804
2805                         frag++;
2806                 }
2807
2808                 if (pos < offset + len) {
2809                         struct sk_buff *fskb2 = fskb;
2810
2811                         BUG_ON(pos + fskb->len != offset + len);
2812
2813                         pos += fskb->len;
2814                         fskb = fskb->next;
2815
2816                         if (fskb2->next) {
2817                                 fskb2 = skb_clone(fskb2, GFP_ATOMIC);
2818                                 if (!fskb2)
2819                                         goto err;
2820                         } else
2821                                 skb_get(fskb2);
2822
2823                         SKB_FRAG_ASSERT(nskb);
2824                         skb_shinfo(nskb)->frag_list = fskb2;
2825                 }
2826
2827 skip_fraglist:
2828                 nskb->data_len = len - hsize;
2829                 nskb->len += nskb->data_len;
2830                 nskb->truesize += nskb->data_len;
2831         } while ((offset += len) < skb->len);
2832
2833         return segs;
2834
2835 err:
2836         while ((skb = segs)) {
2837                 segs = skb->next;
2838                 kfree_skb(skb);
2839         }
2840         return ERR_PTR(err);
2841 }
2842 EXPORT_SYMBOL_GPL(skb_segment);
2843
2844 int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2845 {
2846         struct sk_buff *p = *head;
2847         struct sk_buff *nskb;
2848         struct skb_shared_info *skbinfo = skb_shinfo(skb);
2849         struct skb_shared_info *pinfo = skb_shinfo(p);
2850         unsigned int headroom;
2851         unsigned int len = skb_gro_len(skb);
2852         unsigned int offset = skb_gro_offset(skb);
2853         unsigned int headlen = skb_headlen(skb);
2854         unsigned int delta_truesize;
2855
2856         if (p->len + len >= 65536)
2857                 return -E2BIG;
2858
2859         if (pinfo->frag_list)
2860                 goto merge;
2861         else if (headlen <= offset) {
2862                 skb_frag_t *frag;
2863                 skb_frag_t *frag2;
2864                 int i = skbinfo->nr_frags;
2865                 int nr_frags = pinfo->nr_frags + i;
2866
2867                 offset -= headlen;
2868
2869                 if (nr_frags > MAX_SKB_FRAGS)
2870                         return -E2BIG;
2871
2872                 pinfo->nr_frags = nr_frags;
2873                 skbinfo->nr_frags = 0;
2874
2875                 frag = pinfo->frags + nr_frags;
2876                 frag2 = skbinfo->frags + i;
2877                 do {
2878                         *--frag = *--frag2;
2879                 } while (--i);
2880
2881                 frag->page_offset += offset;
2882                 skb_frag_size_sub(frag, offset);
2883
2884                 /* all fragments truesize : remove (head size + sk_buff) */
2885                 delta_truesize = skb->truesize -
2886                                  SKB_TRUESIZE(skb_end_offset(skb));
2887
2888                 skb->truesize -= skb->data_len;
2889                 skb->len -= skb->data_len;
2890                 skb->data_len = 0;
2891
2892                 NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
2893                 goto done;
2894         } else if (skb->head_frag) {
2895                 int nr_frags = pinfo->nr_frags;
2896                 skb_frag_t *frag = pinfo->frags + nr_frags;
2897                 struct page *page = virt_to_head_page(skb->head);
2898                 unsigned int first_size = headlen - offset;
2899                 unsigned int first_offset;
2900
2901                 if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
2902                         return -E2BIG;
2903
2904                 first_offset = skb->data -
2905                                (unsigned char *)page_address(page) +
2906                                offset;
2907
2908                 pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
2909
2910                 frag->page.p      = page;
2911                 frag->page_offset = first_offset;
2912                 skb_frag_size_set(frag, first_size);
2913
2914                 memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
2915                 /* We dont need to clear skbinfo->nr_frags here */
2916
2917                 delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
2918                 NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
2919                 goto done;
2920         } else if (skb_gro_len(p) != pinfo->gso_size)
2921                 return -E2BIG;
2922
2923         headroom = skb_headroom(p);
2924         nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC);
2925         if (unlikely(!nskb))
2926                 return -ENOMEM;
2927
2928         __copy_skb_header(nskb, p);
2929         nskb->mac_len = p->mac_len;
2930
2931         skb_reserve(nskb, headroom);
2932         __skb_put(nskb, skb_gro_offset(p));
2933
2934         skb_set_mac_header(nskb, skb_mac_header(p) - p->data);
2935         skb_set_network_header(nskb, skb_network_offset(p));
2936         skb_set_transport_header(nskb, skb_transport_offset(p));
2937
2938         __skb_pull(p, skb_gro_offset(p));
2939         memcpy(skb_mac_header(nskb), skb_mac_header(p),
2940                p->data - skb_mac_header(p));
2941
2942         *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p);
2943         skb_shinfo(nskb)->frag_list = p;
2944         skb_shinfo(nskb)->gso_size = pinfo->gso_size;
2945         pinfo->gso_size = 0;
2946         skb_header_release(p);
2947         nskb->prev = p;
2948
2949         nskb->data_len += p->len;
2950         nskb->truesize += p->truesize;
2951         nskb->len += p->len;
2952
2953         *head = nskb;
2954         nskb->next = p->next;
2955         p->next = NULL;
2956
2957         p = nskb;
2958
2959 merge:
2960         delta_truesize = skb->truesize;
2961         if (offset > headlen) {
2962                 unsigned int eat = offset - headlen;
2963
2964                 skbinfo->frags[0].page_offset += eat;
2965                 skb_frag_size_sub(&skbinfo->frags[0], eat);
2966                 skb->data_len -= eat;
2967                 skb->len -= eat;
2968                 offset = headlen;
2969         }
2970
2971         __skb_pull(skb, offset);
2972
2973         p->prev->next = skb;
2974         p->prev = skb;
2975         skb_header_release(skb);
2976
2977 done:
2978         NAPI_GRO_CB(p)->count++;
2979         p->data_len += len;
2980         p->truesize += delta_truesize;
2981         p->len += len;
2982
2983         NAPI_GRO_CB(skb)->same_flow = 1;
2984         return 0;
2985 }
2986 EXPORT_SYMBOL_GPL(skb_gro_receive);
2987
2988 void __init skb_init(void)
2989 {
2990         skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
2991                                               sizeof(struct sk_buff),
2992                                               0,
2993                                               SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2994                                               NULL);
2995         skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
2996                                                 (2*sizeof(struct sk_buff)) +
2997                                                 sizeof(atomic_t),
2998                                                 0,
2999                                                 SLAB_HWCACHE_ALIGN|SLAB_PANIC,
3000                                                 NULL);
3001 }
3002
3003 /**
3004  *      skb_to_sgvec - Fill a scatter-gather list from a socket buffer
3005  *      @skb: Socket buffer containing the buffers to be mapped
3006  *      @sg: The scatter-gather list to map into
3007  *      @offset: The offset into the buffer's contents to start mapping
3008  *      @len: Length of buffer space to be mapped
3009  *
3010  *      Fill the specified scatter-gather list with mappings/pointers into a
3011  *      region of the buffer space attached to a socket buffer.
3012  */
3013 static int
3014 __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
3015 {
3016         int start = skb_headlen(skb);
3017         int i, copy = start - offset;
3018         struct sk_buff *frag_iter;
3019         int elt = 0;
3020
3021         if (copy > 0) {
3022                 if (copy > len)
3023                         copy = len;
3024                 sg_set_buf(sg, skb->data + offset, copy);
3025                 elt++;
3026                 if ((len -= copy) == 0)
3027                         return elt;
3028                 offset += copy;
3029         }
3030
3031         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3032                 int end;
3033
3034                 WARN_ON(start > offset + len);
3035
3036                 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
3037                 if ((copy = end - offset) > 0) {
3038                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3039
3040                         if (copy > len)
3041                                 copy = len;
3042                         sg_set_page(&sg[elt], skb_frag_page(frag), copy,
3043                                         frag->page_offset+offset-start);
3044                         elt++;
3045                         if (!(len -= copy))
3046                                 return elt;
3047                         offset += copy;
3048                 }
3049                 start = end;
3050         }
3051
3052         skb_walk_frags(skb, frag_iter) {
3053                 int end;
3054
3055                 WARN_ON(start > offset + len);
3056
3057                 end = start + frag_iter->len;
3058                 if ((copy = end - offset) > 0) {
3059                         if (copy > len)
3060                                 copy = len;
3061                         elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start,
3062                                               copy);
3063                         if ((len -= copy) == 0)
3064                                 return elt;
3065                         offset += copy;
3066                 }
3067                 start = end;
3068         }
3069         BUG_ON(len);
3070         return elt;
3071 }
3072
3073 int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
3074 {
3075         int nsg = __skb_to_sgvec(skb, sg, offset, len);
3076
3077         sg_mark_end(&sg[nsg - 1]);
3078
3079         return nsg;
3080 }
3081 EXPORT_SYMBOL_GPL(skb_to_sgvec);
3082
3083 /**
3084  *      skb_cow_data - Check that a socket buffer's data buffers are writable
3085  *      @skb: The socket buffer to check.
3086  *      @tailbits: Amount of trailing space to be added
3087  *      @trailer: Returned pointer to the skb where the @tailbits space begins
3088  *
3089  *      Make sure that the data buffers attached to a socket buffer are
3090  *      writable. If they are not, private copies are made of the data buffers
3091  *      and the socket buffer is set to use these instead.
3092  *
3093  *      If @tailbits is given, make sure that there is space to write @tailbits
3094  *      bytes of data beyond current end of socket buffer.  @trailer will be
3095  *      set to point to the skb in which this space begins.
3096  *
3097  *      The number of scatterlist elements required to completely map the
3098  *      COW'd and extended socket buffer will be returned.
3099  */
3100 int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
3101 {
3102         int copyflag;
3103         int elt;
3104         struct sk_buff *skb1, **skb_p;
3105
3106         /* If skb is cloned or its head is paged, reallocate
3107          * head pulling out all the pages (pages are considered not writable
3108          * at the moment even if they are anonymous).
3109          */
3110         if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
3111             __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL)
3112                 return -ENOMEM;
3113
3114         /* Easy case. Most of packets will go this way. */
3115         if (!skb_has_frag_list(skb)) {
3116                 /* A little of trouble, not enough of space for trailer.
3117                  * This should not happen, when stack is tuned to generate
3118                  * good frames. OK, on miss we reallocate and reserve even more
3119                  * space, 128 bytes is fair. */
3120
3121                 if (skb_tailroom(skb) < tailbits &&
3122                     pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
3123                         return -ENOMEM;
3124
3125                 /* Voila! */
3126                 *trailer = skb;
3127                 return 1;
3128         }
3129
3130         /* Misery. We are in troubles, going to mincer fragments... */
3131
3132         elt = 1;
3133         skb_p = &skb_shinfo(skb)->frag_list;
3134         copyflag = 0;
3135
3136         while ((skb1 = *skb_p) != NULL) {
3137                 int ntail = 0;
3138
3139                 /* The fragment is partially pulled by someone,
3140                  * this can happen on input. Copy it and everything
3141                  * after it. */
3142
3143                 if (skb_shared(skb1))
3144                         copyflag = 1;
3145
3146                 /* If the skb is the last, worry about trailer. */
3147
3148                 if (skb1->next == NULL && tailbits) {
3149                         if (skb_shinfo(skb1)->nr_frags ||
3150                             skb_has_frag_list(skb1) ||
3151                             skb_tailroom(skb1) < tailbits)
3152                                 ntail = tailbits + 128;
3153                 }
3154
3155                 if (copyflag ||
3156                     skb_cloned(skb1) ||
3157                     ntail ||
3158                     skb_shinfo(skb1)->nr_frags ||
3159                     skb_has_frag_list(skb1)) {
3160                         struct sk_buff *skb2;
3161
3162                         /* Fuck, we are miserable poor guys... */
3163                         if (ntail == 0)
3164                                 skb2 = skb_copy(skb1, GFP_ATOMIC);
3165                         else
3166                                 skb2 = skb_copy_expand(skb1,
3167                                                        skb_headroom(skb1),
3168                                                        ntail,
3169                                                        GFP_ATOMIC);
3170                         if (unlikely(skb2 == NULL))
3171                                 return -ENOMEM;
3172
3173                         if (skb1->sk)
3174                                 skb_set_owner_w(skb2, skb1->sk);
3175
3176                         /* Looking around. Are we still alive?
3177                          * OK, link new skb, drop old one */
3178
3179                         skb2->next = skb1->next;
3180                         *skb_p = skb2;
3181                         kfree_skb(skb1);
3182                         skb1 = skb2;
3183                 }
3184                 elt++;
3185                 *trailer = skb1;
3186                 skb_p = &skb1->next;
3187         }
3188
3189         return elt;
3190 }
3191 EXPORT_SYMBOL_GPL(skb_cow_data);
3192
3193 static void sock_rmem_free(struct sk_buff *skb)
3194 {
3195         struct sock *sk = skb->sk;
3196
3197         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
3198 }
3199
3200 /*
3201  * Note: We dont mem charge error packets (no sk_forward_alloc changes)
3202  */
3203 int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
3204 {
3205         int len = skb->len;
3206
3207         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
3208             (unsigned int)sk->sk_rcvbuf)
3209                 return -ENOMEM;
3210
3211         skb_orphan(skb);
3212         skb->sk = sk;
3213         skb->destructor = sock_rmem_free;
3214         atomic_add(skb->truesize, &sk->sk_rmem_alloc);
3215
3216         /* before exiting rcu section, make sure dst is refcounted */
3217         skb_dst_force(skb);
3218
3219         skb_queue_tail(&sk->sk_error_queue, skb);
3220         if (!sock_flag(sk, SOCK_DEAD))
3221                 sk->sk_data_ready(sk, len);
3222         return 0;
3223 }
3224 EXPORT_SYMBOL(sock_queue_err_skb);
3225
3226 void skb_tstamp_tx(struct sk_buff *orig_skb,
3227                 struct skb_shared_hwtstamps *hwtstamps)
3228 {
3229         struct sock *sk = orig_skb->sk;
3230         struct sock_exterr_skb *serr;
3231         struct sk_buff *skb;
3232         int err;
3233
3234         if (!sk)
3235                 return;
3236
3237         skb = skb_clone(orig_skb, GFP_ATOMIC);
3238         if (!skb)
3239                 return;
3240
3241         if (hwtstamps) {
3242                 *skb_hwtstamps(skb) =
3243                         *hwtstamps;
3244         } else {
3245                 /*
3246                  * no hardware time stamps available,
3247                  * so keep the shared tx_flags and only
3248                  * store software time stamp
3249                  */
3250                 skb->tstamp = ktime_get_real();
3251         }
3252
3253         serr = SKB_EXT_ERR(skb);
3254         memset(serr, 0, sizeof(*serr));
3255         serr->ee.ee_errno = ENOMSG;
3256         serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
3257
3258         err = sock_queue_err_skb(sk, skb);
3259
3260         if (err)
3261                 kfree_skb(skb);
3262 }
3263 EXPORT_SYMBOL_GPL(skb_tstamp_tx);
3264
3265 void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
3266 {
3267         struct sock *sk = skb->sk;
3268         struct sock_exterr_skb *serr;
3269         int err;
3270
3271         skb->wifi_acked_valid = 1;
3272         skb->wifi_acked = acked;
3273
3274         serr = SKB_EXT_ERR(skb);
3275         memset(serr, 0, sizeof(*serr));
3276         serr->ee.ee_errno = ENOMSG;
3277         serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
3278
3279         err = sock_queue_err_skb(sk, skb);
3280         if (err)
3281                 kfree_skb(skb);
3282 }
3283 EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
3284
3285
3286 /**
3287  * skb_partial_csum_set - set up and verify partial csum values for packet
3288  * @skb: the skb to set
3289  * @start: the number of bytes after skb->data to start checksumming.
3290  * @off: the offset from start to place the checksum.
3291  *
3292  * For untrusted partially-checksummed packets, we need to make sure the values
3293  * for skb->csum_start and skb->csum_offset are valid so we don't oops.
3294  *
3295  * This function checks and sets those values and skb->ip_summed: if this
3296  * returns false you should drop the packet.
3297  */
3298 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
3299 {
3300         if (unlikely(start > skb_headlen(skb)) ||
3301             unlikely((int)start + off > skb_headlen(skb) - 2)) {
3302                 net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n",
3303                                      start, off, skb_headlen(skb));
3304                 return false;
3305         }
3306         skb->ip_summed = CHECKSUM_PARTIAL;
3307         skb->csum_start = skb_headroom(skb) + start;
3308         skb->csum_offset = off;
3309         return true;
3310 }
3311 EXPORT_SYMBOL_GPL(skb_partial_csum_set);
3312
3313 void __skb_warn_lro_forwarding(const struct sk_buff *skb)
3314 {
3315         net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
3316                              skb->dev->name);
3317 }
3318 EXPORT_SYMBOL(__skb_warn_lro_forwarding);