/* net/ipv6/ip6_output.c */
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

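/*
 * Pick the Identification value for an outgoing fragment header (struct
 * frag_hdr: nexthdr, reserved byte, frag_off, 32-bit identification).
 * A single global counter is shared by all flows and protected by a
 * spinlock; on wraparound it is reset to 1, so the value 0 is never used.
 */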
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}

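/*
 * Fill in the payload length of an already-built IPv6 header and run the
 * packet through the NF_INET_LOCAL_OUT netfilter hook.  If the payload
 * does not fit the 16-bit field (a jumbogram), payload_len is set to 0.
 * nf_hook() returns 1 when the packet is accepted, in which case
 * ip6_local_out() continues with dst_output().
 */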
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
		       dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

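/*
 * Final transmit step: hand the packet to the neighbour layer.  A cached
 * hardware header (dst->hh) is used directly when available; otherwise
 * the neighbour's output function performs the resolution.  With neither,
 * the packet cannot be sent and is dropped.
 */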
static int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);

	netif_rx(newskb);
	return 0;
}

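/*
 * Multicast-aware output: if the destination is a multicast group that
 * this host has itself joined (and mc_loop is enabled on the socket), a
 * clone of the packet is looped back locally before the original is
 * passed to the NF_INET_POST_ROUTING hook.
 */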
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb->dst->dev->mtu : dst_mtu(skb->dst);
}

int ip6_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */

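/*
 * ip6_xmit() builds the IPv6 header in front of a transport payload and
 * sends the packet through NF_INET_LOCAL_OUT.  @opt may carry extension
 * headers; @ipfragok permits sending a packet larger than the MTU.  On an
 * MTU overflow a "packet too big" ICMPv6 error is sent back to the local
 * sender.
 */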
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8 proto = fl->proto;
	int seg_len = skb->len;
	int hlimit, tclass;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(ip6_dst_idev(skb->dst),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */

	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	tclass = -1;
	if (np)
		tclass = np->tclass;
	if (tclass < 0)
		tclass = 0;

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
		IP6_INC_STATS(ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

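/*
 * Deliver a copy of the packet to every raw socket that registered for
 * this Router Alert value (via the IPV6_ROUTER_ALERT socket option,
 * which links the socket into ip6_ra_chain).  Returns 1 if at least one
 * socket consumed the packet, 0 otherwise.
 */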
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

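/*
 * Decide what to do with a packet that arrived for an address we proxy
 * (proxy NDP).  Returns 1 when the packet is a neighbour discovery
 * message that should be handed to the local input path, -1 when it must
 * be dropped (link-local destination), and 0 to forward it normally.
 */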
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For a reaction involving a unicast neighbor
			 * discovery message destined to the proxied address,
			 * pass it to the input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);

	if (ipv6_devconf.forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We do not do any processing on RA packets,
	 *	pushing them to user level AS IS without any
	 *	WARRANTY that the application will be able to
	 *	interpret them. The reason is that we cannot
	 *	make anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP, we cannot do anything.
	 *	Defragmentation would also be a mistake; RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (ipv6_devconf.proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb->sp) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
			goto error;
		}
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

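/*
 * Walk the extension header chain to find where a fragment header must
 * be inserted: after the per-fragment ("unfragmentable") headers, i.e.
 * hop-by-hop, routing, and a destination options header carrying a home
 * address option (MIPv6).  Returns the offset and points *nexthdr at the
 * nexthdr byte that will precede the fragment header.
 */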
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

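/*
 * Fragment an IPv6 packet that exceeds the path MTU.  A fast path reuses
 * an existing frag_list (already split into suitably sized chunks) and
 * only prepends headers; otherwise the slow path allocates a fresh skb
 * for each fragment and copies the data.  Per RFC 2460, every fragment
 * except the last must carry a multiple of 8 bytes of payload.
 */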
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct net_device *dev;
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb->dst;
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	dev = rt->u.dst.dev;
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket. (This last
	 * check should be redundant, but it's free.)
	 */
	if (!skb->local_df) {
		skb->dev = skb->dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_ALLOCATED_SPACE(rt->u.dst.dev),
				      GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the unconnected
	 * case is not very simple. Take into account that
	 * we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we've looked up has a neighbour entry that
	 * is in the INCOMPLETE state and the source address from the
	 * flow is marked as OPTIMISTIC, we release the found dst entry
	 * and replace it instead with the dst entry of the nexthop
	 * router.
	 */
	if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 * ip6_dst_lookup - perform route lookup on flow
 * @sk: socket which provides route info
 * @dst: pointer to dst_entry * for result
 * @fl: flow to lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 * ip6_sk_dst_lookup - perform socket cached route lookup on flow
 * @sk: socket which provides the dst cache and route info
 * @dst: pointer to dst_entry * for result
 * @fl: flow to lookup
 *
 * This function performs a route lookup on the given flow with the
 * possibility of using the cached route in the socket if it is valid.
 * It will take the socket dst lock when operating on the dst cache.
 * As a result, this function can only be used in process context.
 *
 * It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* The network device supports UDP large send offload, so build a
	 * single skb containing the complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
					    sizeof(struct frag_hdr);
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(skb, &fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path.
	 */
	kfree_skb(skb);

	return err;
}

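/*
 * Append data to the pending (corked) queue of a socket.  The first call
 * sets up the cork state (options, route, MTU); subsequent calls reuse
 * it.  Data is packed into MTU-sized skbs, using page frags when the
 * device supports scatter/gather, and the queue is finally turned into
 * real packets by ip6_push_pending_frames().
 */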
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above --miyazawa */
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(np->cork.opt);
	np->cork.opt = NULL;
	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

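/*
 * Turn the socket's pending write queue into a single packet: chain the
 * queued skbs onto the first skb's frag_list, prepend the extension
 * headers and the IPv6 header, and hand the result to ip6_local_out().
 * The cork state is released whether or not transmission succeeds.
 */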
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl->fl6_flowlabel |
			 htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	goto out;
}

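/*
 * Drop everything still queued on the socket's write queue and release
 * the cork state; used when an error aborts a corked send.
 */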
void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb->dst)
			IP6_INC_STATS(ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}