ipv6: Implement automatic flow label generation on transmit
[deliverable/linux.git] / net / ipv6 / ip6_output.c
CommitLineData
1da177e4
LT
1/*
2 * IPv6 output functions
1ab1457c 3 * Linux INET6 implementation
1da177e4
LT
4 *
5 * Authors:
1ab1457c 6 * Pedro Roque <roque@di.fc.ul.pt>
1da177e4 7 *
1da177e4
LT
8 * Based on linux/net/ipv4/ip_output.c
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * Changes:
16 * A.N.Kuznetsov : arithmetic in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
20 * etc.
21 *
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
26 * for datagram xmit
27 */
28
1da177e4 29#include <linux/errno.h>
ef76bc23 30#include <linux/kernel.h>
1da177e4
LT
31#include <linux/string.h>
32#include <linux/socket.h>
33#include <linux/net.h>
34#include <linux/netdevice.h>
35#include <linux/if_arp.h>
36#include <linux/in6.h>
37#include <linux/tcp.h>
38#include <linux/route.h>
b59f45d0 39#include <linux/module.h>
5a0e3ad6 40#include <linux/slab.h>
1da177e4
LT
41
42#include <linux/netfilter.h>
43#include <linux/netfilter_ipv6.h>
44
45#include <net/sock.h>
46#include <net/snmp.h>
47
48#include <net/ipv6.h>
49#include <net/ndisc.h>
50#include <net/protocol.h>
51#include <net/ip6_route.h>
52#include <net/addrconf.h>
53#include <net/rawv6.h>
54#include <net/icmp.h>
55#include <net/xfrm.h>
56#include <net/checksum.h>
7bc570c8 57#include <linux/mroute6.h>
1da177e4 58
/* Final step of the IPv6 output path: resolve the next hop to a neighbour
 * entry and hand the skb to the neighbour output function.  For multicast
 * destinations, loop a clone back to local listeners when required and
 * enforce scope rules before transmit.  Returns 0 on success/consumed,
 * negative errno on failure (skb always consumed).
 */
9e508490 59static int ip6_finish_output2(struct sk_buff *skb)
1da177e4 60{
adf30907 61 struct dst_entry *dst = skb_dst(skb);
1da177e4 62 struct net_device *dev = dst->dev;
f6b72b62 63 struct neighbour *neigh;
6fd6ce20
YH
 64 struct in6_addr *nexthop;
 65 int ret;
1da177e4
LT
 66
 67 skb->protocol = htons(ETH_P_IPV6);
 68 skb->dev = dev;
 69
0660e03f 70 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
adf30907 71 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1da177e4 72
/* Loop a copy to local sockets when multicast loopback is enabled and
 * either a multicast-routing socket owns the packet or a local listener
 * has joined the destination group.
 */
7ad6848c 73 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
d1db275d 74 ((mroute6_socket(dev_net(dev), skb) &&
bd91b8bf 75 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
7bc570c8
YH
 76 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 77 &ipv6_hdr(skb)->saddr))) {
1da177e4
LT
 78 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 79
 80 /* Do not check for IFF_ALLMULTI; multicast routing
 81 is not supported in any case.
 82 */
 83 if (newskb)
b2e0b385
JE
 84 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 85 newskb, NULL, newskb->dev,
95603e22 86 dev_loopback_xmit);
1da177e4 87
/* hop_limit 0: deliver only the looped copy, never the wire copy. */
0660e03f 88 if (ipv6_hdr(skb)->hop_limit == 0) {
3bd653c8
DL
 89 IP6_INC_STATS(dev_net(dev), idev,
 90 IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
 91 kfree_skb(skb);
 92 return 0;
 93 }
 94 }
 95
edf391ff
NH
 96 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
 97 skb->len);
dd408515
HFS
 98
/* Node-local (or narrower) scope must never leave a non-loopback device. */
 99 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
 100 IPV6_ADDR_SCOPE_NODELOCAL &&
 101 !(dev->flags & IFF_LOOPBACK)) {
 102 kfree_skb(skb);
 103 return 0;
 104 }
1da177e4
LT
 105 }
 106
/* Look up (or create) the neighbour entry for the route's next hop under
 * the RCU-BH read lock and transmit through it.
 */
6fd6ce20 107 rcu_read_lock_bh();
550bab42 108 nexthop = rt6_nexthop((struct rt6_info *)dst);
6fd6ce20
YH
 109 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 110 if (unlikely(!neigh))
 111 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 112 if (!IS_ERR(neigh)) {
 113 ret = dst_neigh_output(dst, neigh, skb);
 114 rcu_read_unlock_bh();
 115 return ret;
 116 }
 117 rcu_read_unlock_bh();
05e3aa09 118
7f88c6b2
HFS
 119 IP6_INC_STATS(dev_net(dst->dev),
 120 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
9e508490
JE
 121 kfree_skb(skb);
 122 return -EINVAL;
1da177e4
LT
 123}
124
9e508490
JE
/* Decide between direct transmit and fragmentation: fragment when the skb
 * exceeds the path MTU (and is not GSO), when the route demands allfrag,
 * or when conntrack recorded a smaller frag_max_size.
 */
125static int ip6_finish_output(struct sk_buff *skb)
126{
127 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
9037c357
JP
128 dst_allfrag(skb_dst(skb)) ||
129 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
9e508490
JE
130 return ip6_fragment(skb, ip6_finish_output2);
131 else
132 return ip6_finish_output2(skb);
133}
134
/* dst_output entry point for IPv6: discard if IPv6 is administratively
 * disabled on the egress device, otherwise run the POST_ROUTING netfilter
 * hook (skipped for rerouted skbs) before ip6_finish_output().
 */
aad88724 135int ip6_output(struct sock *sk, struct sk_buff *skb)
1da177e4 136{
9e508490 137 struct net_device *dev = skb_dst(skb)->dev;
adf30907 138 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
778d80be 139 if (unlikely(idev->cnf.disable_ipv6)) {
9e508490 140 IP6_INC_STATS(dev_net(dev), idev,
3bd653c8 141 IPSTATS_MIB_OUTDISCARDS);
778d80be
YH
 142 kfree_skb(skb);
 143 return 0;
 144 }
 145
9c6eb28a
JE
 146 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
 147 ip6_finish_output,
 148 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
1da177e4
LT
 149}
150
1da177e4 151/*
b5d43998 152 * xmit an sk_buff (used by TCP, SCTP and DCCP)
1da177e4
LT
153 */
154
/* Transmit an skb for a connected socket (TCP/SCTP/DCCP): push any
 * extension headers from @opt, build the IPv6 header (traffic class,
 * auto-generated flow label, hop limit, addresses from @fl6) and pass the
 * packet to the LOCAL_OUT netfilter hook.  Oversized non-GSO packets get
 * a local EMSGSIZE error instead of fragmentation.
 */
4c9483b2 155int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
b903d324 156 struct ipv6_txoptions *opt, int tclass)
1da177e4 157{
3bd653c8 158 struct net *net = sock_net(sk);
b30bd282 159 struct ipv6_pinfo *np = inet6_sk(sk);
4c9483b2 160 struct in6_addr *first_hop = &fl6->daddr;
adf30907 161 struct dst_entry *dst = skb_dst(skb);
1da177e4 162 struct ipv6hdr *hdr;
4c9483b2 163 u8 proto = fl6->flowi6_proto;
1da177e4 164 int seg_len = skb->len;
e651f03a 165 int hlimit = -1;
1da177e4
LT
 166 u32 mtu;
 167
 168 if (opt) {
c2636b4d 169 unsigned int head_room;
1da177e4
LT
 170
 171 /* First: exthdrs may take lots of space (~8K for now)
 172 MAX_HEADER is not enough.
 173 */
 174 head_room = opt->opt_nflen + opt->opt_flen;
 175 seg_len += head_room;
 176 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 177
 178 if (skb_headroom(skb) < head_room) {
 179 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
a11d206d 180 if (skb2 == NULL) {
adf30907 181 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
a11d206d
YH
 182 IPSTATS_MIB_OUTDISCARDS);
 183 kfree_skb(skb);
1da177e4
LT
 184 return -ENOBUFS;
 185 }
808db80a 186 consume_skb(skb);
a11d206d 187 skb = skb2;
83d7eb29 188 skb_set_owner_w(skb, sk);
1da177e4
LT
 189 }
/* A routing header may rewrite the first hop (first_hop updated here). */
 190 if (opt->opt_flen)
 191 ipv6_push_frag_opts(skb, opt, &proto);
 192 if (opt->opt_nflen)
 193 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 194 }
 195
e2d1bca7
ACM
 196 skb_push(skb, sizeof(struct ipv6hdr));
 197 skb_reset_network_header(skb);
0660e03f 198 hdr = ipv6_hdr(skb);
1da177e4
LT
 199
 200 /*
 201 * Fill in the IPv6 header
 202 */
b903d324 203 if (np)
1da177e4
LT
 204 hlimit = np->hop_limit;
 205 if (hlimit < 0)
6b75d090 206 hlimit = ip6_dst_hoplimit(dst);
1da177e4 207
/* NOTE(review): np is NULL-checked above for hop_limit but dereferenced
 * unconditionally here for np->autoflowlabel — looks like a NULL deref
 * for non-IPv6 sockets; confirm against callers.
 */
cb1ce2ef
TH
 208 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 209 np->autoflowlabel));
41a1f8ea 210
1da177e4
LT
 211 hdr->payload_len = htons(seg_len);
 212 hdr->nexthdr = proto;
 213 hdr->hop_limit = hlimit;
 214
4e3fd7a0
AD
 215 hdr->saddr = fl6->saddr;
 216 hdr->daddr = *first_hop;
1da177e4 217
9c9c9ad5 218 skb->protocol = htons(ETH_P_IPV6);
a2c2064f 219 skb->priority = sk->sk_priority;
4a19ec58 220 skb->mark = sk->sk_mark;
a2c2064f 221
1da177e4 222 mtu = dst_mtu(dst);
60ff7467 223 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
adf30907 224 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
edf391ff 225 IPSTATS_MIB_OUT, skb->len);
b2e0b385
JE
 226 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
 227 dst->dev, dst_output);
1da177e4
LT
 228 }
 229
/* Too big and DF honored: report EMSGSIZE to the socket, don't fragment. */
1da177e4 230 skb->dev = dst->dev;
f4e53e29 231 ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
adf30907 232 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
1da177e4
LT
 233 kfree_skb(skb);
 234 return -EMSGSIZE;
 235}
236
7159039a
YH
237EXPORT_SYMBOL(ip6_xmit);
238
1da177e4
LT
/* Deliver a Router Alert packet to every raw socket registered on the
 * ip6_ra_chain with matching @sel (and matching bound device, if any).
 * Each earlier match gets a clone; the last match consumes @skb itself.
 * Returns 1 if the skb was delivered (consumed), 0 otherwise.
 */
239static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
240{
241 struct ip6_ra_chain *ra;
242 struct sock *last = NULL;
243
244 read_lock(&ip6_ra_lock);
245 for (ra = ip6_ra_chain; ra; ra = ra->next) {
246 struct sock *sk = ra->sk;
0bd1b59b
AM
247 if (sk && ra->sel == sel &&
248 (!sk->sk_bound_dev_if ||
249 sk->sk_bound_dev_if == skb->dev->ifindex)) {
1da177e4
LT
 250 if (last) {
 251 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 252 if (skb2)
 253 rawv6_rcv(last, skb2);
 254 }
 255 last = sk;
 256 }
 257 }
 258
 259 if (last) {
 260 rawv6_rcv(last, skb);
 261 read_unlock(&ip6_ra_lock);
 262 return 1;
 263 }
 264 read_unlock(&ip6_ra_lock);
 265 return 0;
266}
267
e21e0b5f
VN
/* Decide how a proxying router handles a packet destined to a proxied
 * address.  Returns 1 to pass unicast neighbour-discovery ICMPv6 to the
 * local input path, -1 to drop (link-local destination, link failure
 * signalled), 0 to continue normal forwarding.
 */
268static int ip6_forward_proxy_check(struct sk_buff *skb)
269{
0660e03f 270 struct ipv6hdr *hdr = ipv6_hdr(skb);
e21e0b5f 271 u8 nexthdr = hdr->nexthdr;
75f2811c 272 __be16 frag_off;
e21e0b5f
VN
 273 int offset;
 274
/* Skip any extension headers to find the transport protocol. */
 275 if (ipv6_ext_hdr(nexthdr)) {
75f2811c 276 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
e21e0b5f
VN
 277 if (offset < 0)
 278 return 0;
 279 } else
 280 offset = sizeof(struct ipv6hdr);
 281
 282 if (nexthdr == IPPROTO_ICMPV6) {
 283 struct icmp6hdr *icmp6;
 284
d56f90a7
ACM
 285 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 286 offset + 1 - skb->data)))
e21e0b5f
VN
 287 return 0;
 288
d56f90a7 289 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
e21e0b5f
VN
 290
 291 switch (icmp6->icmp6_type) {
 292 case NDISC_ROUTER_SOLICITATION:
 293 case NDISC_ROUTER_ADVERTISEMENT:
 294 case NDISC_NEIGHBOUR_SOLICITATION:
 295 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 296 case NDISC_REDIRECT:
 297 /* For reaction involving unicast neighbor discovery
 298 * message destined to the proxied address, pass it to
 299 * input function.
 300 */
 301 return 1;
 302 default:
 303 break;
 304 }
 305 }
 306
74553b09
VN
 307 /*
 308 * The proxying router can't forward traffic sent to a link-local
 309 * address, so signal the sender and discard the packet. This
 310 * behavior is clarified by the MIPv6 specification.
 311 */
 312 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 313 dst_link_failure(skb);
 314 return -1;
 315 }
 316
e21e0b5f
VN
 317 return 0;
318}
319
1da177e4
LT
/* Netfilter FORWARD-hook continuation: hand the skb to dst_output(). */
320static inline int ip6_forward_finish(struct sk_buff *skb)
321{
322 return dst_output(skb);
323}
324
0954cf9c
HFS
325static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
326{
327 unsigned int mtu;
328 struct inet6_dev *idev;
329
330 if (dst_metric_locked(dst, RTAX_MTU)) {
331 mtu = dst_metric_raw(dst, RTAX_MTU);
332 if (mtu)
333 return mtu;
334 }
335
336 mtu = IPV6_MIN_MTU;
337 rcu_read_lock();
338 idev = __in6_dev_get(dst->dev);
339 if (idev)
340 mtu = idev->cnf.mtu6;
341 rcu_read_unlock();
342
343 return mtu;
344}
345
fe6cc55f
FW
346static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
347{
418a3156 348 if (skb->len <= mtu)
fe6cc55f
FW
349 return false;
350
60ff7467 351 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
fe6cc55f
FW
352 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
353 return true;
354
60ff7467 355 if (skb->ignore_df)
418a3156
FW
356 return false;
357
fe6cc55f
FW
358 if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
359 return false;
360
361 return true;
362}
363
1da177e4
LT
364int ip6_forward(struct sk_buff *skb)
365{
adf30907 366 struct dst_entry *dst = skb_dst(skb);
0660e03f 367 struct ipv6hdr *hdr = ipv6_hdr(skb);
1da177e4 368 struct inet6_skb_parm *opt = IP6CB(skb);
c346dca1 369 struct net *net = dev_net(dst->dev);
14f3ad6f 370 u32 mtu;
1ab1457c 371
53b7997f 372 if (net->ipv6.devconf_all->forwarding == 0)
1da177e4
LT
373 goto error;
374
090f1166
LR
375 if (skb->pkt_type != PACKET_HOST)
376 goto drop;
377
4497b076
BH
378 if (skb_warn_if_lro(skb))
379 goto drop;
380
1da177e4 381 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
15c77d8b
ED
382 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
383 IPSTATS_MIB_INDISCARDS);
1da177e4
LT
384 goto drop;
385 }
386
35fc92a9 387 skb_forward_csum(skb);
1da177e4
LT
388
389 /*
390 * We DO NOT make any processing on
391 * RA packets, pushing them to user level AS IS
392 * without ane WARRANTY that application will be able
393 * to interpret them. The reason is that we
394 * cannot make anything clever here.
395 *
396 * We are not end-node, so that if packet contains
397 * AH/ESP, we cannot make anything.
398 * Defragmentation also would be mistake, RA packets
399 * cannot be fragmented, because there is no warranty
400 * that different fragments will go along one path. --ANK
401 */
ab4eb353
YH
402 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
403 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
1da177e4
LT
404 return 0;
405 }
406
407 /*
408 * check and decrement ttl
409 */
410 if (hdr->hop_limit <= 1) {
411 /* Force OUTPUT device used as source address */
412 skb->dev = dst->dev;
3ffe533c 413 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
15c77d8b
ED
414 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
415 IPSTATS_MIB_INHDRERRORS);
1da177e4
LT
416
417 kfree_skb(skb);
418 return -ETIMEDOUT;
419 }
420
fbea49e1 421 /* XXX: idev->cnf.proxy_ndp? */
53b7997f 422 if (net->ipv6.devconf_all->proxy_ndp &&
8a3edd80 423 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
74553b09
VN
424 int proxied = ip6_forward_proxy_check(skb);
425 if (proxied > 0)
e21e0b5f 426 return ip6_input(skb);
74553b09 427 else if (proxied < 0) {
15c77d8b
ED
428 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
429 IPSTATS_MIB_INDISCARDS);
74553b09
VN
430 goto drop;
431 }
e21e0b5f
VN
432 }
433
1da177e4 434 if (!xfrm6_route_forward(skb)) {
15c77d8b
ED
435 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
436 IPSTATS_MIB_INDISCARDS);
1da177e4
LT
437 goto drop;
438 }
adf30907 439 dst = skb_dst(skb);
1da177e4
LT
440
441 /* IPv6 specs say nothing about it, but it is clear that we cannot
442 send redirects to source routed frames.
1e5dc146 443 We don't send redirects to frames decapsulated from IPsec.
1da177e4 444 */
c45a3dfb 445 if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
1da177e4 446 struct in6_addr *target = NULL;
fbfe95a4 447 struct inet_peer *peer;
1da177e4 448 struct rt6_info *rt;
1da177e4
LT
449
450 /*
451 * incoming and outgoing devices are the same
452 * send a redirect.
453 */
454
455 rt = (struct rt6_info *) dst;
c45a3dfb
DM
456 if (rt->rt6i_flags & RTF_GATEWAY)
457 target = &rt->rt6i_gateway;
1da177e4
LT
458 else
459 target = &hdr->daddr;
460
1d861aa4 461 peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
92d86829 462
1da177e4
LT
463 /* Limit redirects both by destination (here)
464 and by source (inside ndisc_send_redirect)
465 */
fbfe95a4 466 if (inet_peer_xrlim_allow(peer, 1*HZ))
4991969a 467 ndisc_send_redirect(skb, target);
1d861aa4
DM
468 if (peer)
469 inet_putpeer(peer);
5bb1ab09
DS
470 } else {
471 int addrtype = ipv6_addr_type(&hdr->saddr);
472
1da177e4 473 /* This check is security critical. */
f81b2e7d
YH
474 if (addrtype == IPV6_ADDR_ANY ||
475 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
5bb1ab09
DS
476 goto error;
477 if (addrtype & IPV6_ADDR_LINKLOCAL) {
478 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
3ffe533c 479 ICMPV6_NOT_NEIGHBOUR, 0);
5bb1ab09
DS
480 goto error;
481 }
1da177e4
LT
482 }
483
0954cf9c 484 mtu = ip6_dst_mtu_forward(dst);
14f3ad6f
UW
485 if (mtu < IPV6_MIN_MTU)
486 mtu = IPV6_MIN_MTU;
487
fe6cc55f 488 if (ip6_pkt_too_big(skb, mtu)) {
1da177e4
LT
489 /* Again, force OUTPUT device used as source address */
490 skb->dev = dst->dev;
14f3ad6f 491 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
15c77d8b
ED
492 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
493 IPSTATS_MIB_INTOOBIGERRORS);
494 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
495 IPSTATS_MIB_FRAGFAILS);
1da177e4
LT
496 kfree_skb(skb);
497 return -EMSGSIZE;
498 }
499
500 if (skb_cow(skb, dst->dev->hard_header_len)) {
15c77d8b
ED
501 IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
502 IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
503 goto drop;
504 }
505
0660e03f 506 hdr = ipv6_hdr(skb);
1da177e4
LT
507
508 /* Mangling hops number delayed to point after skb COW */
1ab1457c 509
1da177e4
LT
510 hdr->hop_limit--;
511
483a47d2 512 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
2d8dbb04 513 IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
b2e0b385 514 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
6e23ae2a 515 ip6_forward_finish);
1da177e4
LT
516
517error:
483a47d2 518 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
1da177e4
LT
519drop:
520 kfree_skb(skb);
521 return -EINVAL;
522}
523
/* Copy per-packet metadata (type, priority, protocol, dst reference,
 * device, mark, tc index, netfilter and security state) from @from to a
 * freshly built fragment @to.
 */
524static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
525{
526 to->pkt_type = from->pkt_type;
527 to->priority = from->priority;
528 to->protocol = from->protocol;
adf30907
ED
529 skb_dst_drop(to);
530 skb_dst_set(to, dst_clone(skb_dst(from)));
1da177e4 531 to->dev = from->dev;
82e91ffe 532 to->mark = from->mark;
1da177e4
LT
 533
 534#ifdef CONFIG_NET_SCHED
 535 to->tc_index = from->tc_index;
 536#endif
e7ac05f3 537 nf_copy(to, from);
984bc16c 538 skb_copy_secmark(to, from);
1da177e4
LT
 539}
540
73f156a6
ED
541static void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
542{
543 static u32 ip6_idents_hashrnd __read_mostly;
544 u32 hash, id;
545
546 net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
547
548 hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd);
549 id = ip_idents_reserve(hash, 1);
550 fhdr->identification = htonl(id);
551}
552
/* Fragment @skb to fit the path MTU and emit each fragment via @output.
 * Fast path: when the skb carries a well-formed frag_list, convert the
 * list members into fragments in place.  Slow path: allocate and copy a
 * new skb per fragment.  Sends ICMPV6_PKT_TOOBIG instead when the packet
 * must not be fragmented locally.  Returns 0 or the first output error.
 */
ad0081e4 553int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
1da177e4 554{
1da177e4 555 struct sk_buff *frag;
adf30907 556 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
d91675f9 557 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
1da177e4
LT
 558 struct ipv6hdr *tmp_hdr;
 559 struct frag_hdr *fh;
 560 unsigned int mtu, hlen, left, len;
a7ae1992 561 int hroom, troom;
ae08e1f0 562 __be32 frag_id = 0;
1da177e4
LT
 563 int ptr, offset = 0, err=0;
 564 u8 *prevhdr, nexthdr = 0;
adf30907 565 struct net *net = dev_net(skb_dst(skb)->dev);
1da177e4 566
1da177e4
LT
 567 hlen = ip6_find_1stfragopt(skb, &prevhdr);
 568 nexthdr = *prevhdr;
 569
628a5c56 570 mtu = ip6_skb_dst_mtu(skb);
b881ef76
JH
 571
 572 /* We must not fragment if the socket is set to force MTU discovery
14f3ad6f 573 * or if the skb it not generated by a local socket.
b881ef76 574 */
60ff7467 575 if (unlikely(!skb->ignore_df && skb->len > mtu) ||
4cdd3408
PM
 576 (IP6CB(skb)->frag_max_size &&
 577 IP6CB(skb)->frag_max_size > mtu)) {
a34a101e
ED
 578 if (skb->sk && dst_allfrag(skb_dst(skb)))
 579 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 580
adf30907 581 skb->dev = skb_dst(skb)->dev;
3ffe533c 582 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
adf30907 583 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
3bd653c8 584 IPSTATS_MIB_FRAGFAILS);
b881ef76
JH
 585 kfree_skb(skb);
 586 return -EMSGSIZE;
 587 }
 588
/* Honor a smaller socket-requested fragment size (IPV6_MTU option). */
d91675f9
YH
 589 if (np && np->frag_size < mtu) {
 590 if (np->frag_size)
 591 mtu = np->frag_size;
 592 }
 593 mtu -= hlen + sizeof(struct frag_hdr);
1da177e4 594
21dc3301 595 if (skb_has_frag_list(skb)) {
1da177e4 596 int first_len = skb_pagelen(skb);
3d13008e 597 struct sk_buff *frag2;
1da177e4
LT
 598
 599 if (first_len - hlen > mtu ||
 600 ((first_len - hlen) & 7) ||
 601 skb_cloned(skb))
 602 goto slow_path;
 603
4d9092bb 604 skb_walk_frags(skb, frag) {
1da177e4
LT
 605 /* Correct geometry. */
 606 if (frag->len > mtu ||
 607 ((frag->len & 7) && frag->next) ||
 608 skb_headroom(frag) < hlen)
3d13008e 609 goto slow_path_clean;
1da177e4 610
1da177e4
LT
 611 /* Partially cloned skb? */
 612 if (skb_shared(frag))
3d13008e 613 goto slow_path_clean;
2fdba6b0
HX
 614
 615 BUG_ON(frag->sk);
 616 if (skb->sk) {
2fdba6b0
HX
 617 frag->sk = skb->sk;
 618 frag->destructor = sock_wfree;
2fdba6b0 619 }
3d13008e 620 skb->truesize -= frag->truesize;
1da177e4
LT
 621 }
 622
 623 err = 0;
 624 offset = 0;
 625 frag = skb_shinfo(skb)->frag_list;
4d9092bb 626 skb_frag_list_init(skb);
1da177e4
LT
 627 /* BUILD HEADER */
 628
9a217a1c 629 *prevhdr = NEXTHDR_FRAGMENT;
d56f90a7 630 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
1da177e4 631 if (!tmp_hdr) {
adf30907 632 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
3bd653c8 633 IPSTATS_MIB_FRAGFAILS);
1da177e4
LT
 634 return -ENOMEM;
 635 }
 636
/* Insert the fragment header between the unfragmentable part and data. */
1da177e4
LT
 637 __skb_pull(skb, hlen);
 638 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
e2d1bca7
ACM
 639 __skb_push(skb, hlen);
 640 skb_reset_network_header(skb);
d56f90a7 641 memcpy(skb_network_header(skb), tmp_hdr, hlen);
1da177e4 642
87c48fa3 643 ipv6_select_ident(fh, rt);
1da177e4
LT
 644 fh->nexthdr = nexthdr;
 645 fh->reserved = 0;
 646 fh->frag_off = htons(IP6_MF);
 647 frag_id = fh->identification;
 648
 649 first_len = skb_pagelen(skb);
 650 skb->data_len = first_len - skb_headlen(skb);
 651 skb->len = first_len;
0660e03f
ACM
 652 ipv6_hdr(skb)->payload_len = htons(first_len -
 653 sizeof(struct ipv6hdr));
a11d206d 654
d8d1f30b 655 dst_hold(&rt->dst);
1da177e4
LT
 656
 657 for (;;) {
 658 /* Prepare header of the next frame,
 659 * before previous one went down. */
 660 if (frag) {
 661 frag->ip_summed = CHECKSUM_NONE;
badff6d0 662 skb_reset_transport_header(frag);
1da177e4 663 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
e2d1bca7
ACM
 664 __skb_push(frag, hlen);
 665 skb_reset_network_header(frag);
d56f90a7
ACM
 666 memcpy(skb_network_header(frag), tmp_hdr,
 667 hlen);
1da177e4
LT
 668 offset += skb->len - hlen - sizeof(struct frag_hdr);
 669 fh->nexthdr = nexthdr;
 670 fh->reserved = 0;
 671 fh->frag_off = htons(offset);
 672 if (frag->next != NULL)
 673 fh->frag_off |= htons(IP6_MF);
 674 fh->identification = frag_id;
0660e03f
ACM
 675 ipv6_hdr(frag)->payload_len =
 676 htons(frag->len -
 677 sizeof(struct ipv6hdr));
1da177e4
LT
 678 ip6_copy_metadata(frag, skb);
 679 }
1ab1457c 680
1da177e4 681 err = output(skb)
dafee490 682 if(!err)
d8d1f30b 683 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
3bd653c8 684 IPSTATS_MIB_FRAGCREATES);
dafee490 685
1da177e4
LT
 686 if (err || !frag)
 687 break;
 688
 689 skb = frag;
 690 frag = skb->next;
 691 skb->next = NULL;
 692 }
 693
a51482bd 694 kfree(tmp_hdr);
1da177e4
LT
 695
 696 if (err == 0) {
d8d1f30b 697 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
3bd653c8 698 IPSTATS_MIB_FRAGOKS);
94e187c0 699 ip6_rt_put(rt);
1da177e4
LT
 700 return 0;
 701 }
 702
/* On error free the remaining unsent fragments. */
 703 while (frag) {
 704 skb = frag->next;
 705 kfree_skb(frag);
 706 frag = skb;
 707 }
 708
d8d1f30b 709 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
3bd653c8 710 IPSTATS_MIB_FRAGFAILS);
94e187c0 711 ip6_rt_put(rt);
1da177e4 712 return err;
3d13008e
ED
 713
 714slow_path_clean:
 715 skb_walk_frags(skb, frag2) {
 716 if (frag2 == frag)
 717 break;
 718 frag2->sk = NULL;
 719 frag2->destructor = NULL;
 720 skb->truesize += frag2->truesize;
 721 }
1da177e4
LT
 722 }
 723
 724slow_path:
72e843bb
ED
 725 if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
 726 skb_checksum_help(skb))
 727 goto fail;
 728
1da177e4
LT
 729 left = skb->len - hlen; /* Space per frame */
 730 ptr = hlen; /* Where to start from */
 731
 732 /*
 733 * Fragment the datagram.
 734 */
 735
 736 *prevhdr = NEXTHDR_FRAGMENT;
a7ae1992
HX
 737 hroom = LL_RESERVED_SPACE(rt->dst.dev);
 738 troom = rt->dst.dev->needed_tailroom;
1da177e4
LT
 739
 740 /*
 741 * Keep copying data until we run out.
 742 */
 743 while(left > 0) {
 744 len = left;
 745 /* IF: it doesn't fit, use 'mtu' - the data space left */
 746 if (len > mtu)
 747 len = mtu;
25985edc 748 /* IF: we are not sending up to and including the packet end
1da177e4
LT
 749 then align the next start on an eight byte boundary */
 750 if (len < left) {
 751 len &= ~7;
 752 }
 753 /*
 754 * Allocate buffer.
 755 */
 756
a7ae1992
HX
 757 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
 758 hroom + troom, GFP_ATOMIC)) == NULL) {
64ce2073 759 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
adf30907 760 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
a11d206d 761 IPSTATS_MIB_FRAGFAILS);
1da177e4
LT
 762 err = -ENOMEM;
 763 goto fail;
 764 }
 765
 766 /*
 767 * Set up data on packet
 768 */
 769
 770 ip6_copy_metadata(frag, skb);
a7ae1992 771 skb_reserve(frag, hroom);
1da177e4 772 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
c1d2bbe1 773 skb_reset_network_header(frag);
badff6d0 774 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
b0e380b1
ACM
 775 frag->transport_header = (frag->network_header + hlen +
 776 sizeof(struct frag_hdr));
1da177e4
LT
 777
 778 /*
 779 * Charge the memory for the fragment to any owner
 780 * it might possess
 781 */
 782 if (skb->sk)
 783 skb_set_owner_w(frag, skb->sk);
 784
 785 /*
 786 * Copy the packet header into the new buffer.
 787 */
d626f62b 788 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
1da177e4
LT
 789
 790 /*
 791 * Build fragment header.
 792 */
 793 fh->nexthdr = nexthdr;
 794 fh->reserved = 0;
/* NOTE(review): a legitimately generated id of 0 would cause a fresh
 * ident to be selected for every fragment — presumably acceptable since
 * the first iteration always selects one; confirm.
 */
f36d6ab1 795 if (!frag_id) {
87c48fa3 796 ipv6_select_ident(fh, rt);
1da177e4
LT
 797 frag_id = fh->identification;
 798 } else
 799 fh->identification = frag_id;
 800
 801 /*
 802 * Copy a block of the IP datagram.
 803 */
8984e41d 804 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
1da177e4
LT
 805 BUG();
 806 left -= len;
 807
 808 fh->frag_off = htons(offset);
 809 if (left > 0)
 810 fh->frag_off |= htons(IP6_MF);
0660e03f
ACM
 811 ipv6_hdr(frag)->payload_len = htons(frag->len -
 812 sizeof(struct ipv6hdr));
1da177e4
LT
 813
 814 ptr += len;
 815 offset += len;
 816
 817 /*
 818 * Put this fragment into the sending queue.
 819 */
1da177e4
LT
 820 err = output(frag);
 821 if (err)
 822 goto fail;
dafee490 823
adf30907 824 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
3bd653c8 825 IPSTATS_MIB_FRAGCREATES);
1da177e4 826 }
adf30907 827 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
a11d206d 828 IPSTATS_MIB_FRAGOKS);
808db80a 829 consume_skb(skb);
1da177e4
LT
 830 return err;
 831
 832fail:
adf30907 833 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
a11d206d 834 IPSTATS_MIB_FRAGFAILS);
1ab1457c 835 kfree_skb(skb);
1da177e4
LT
 836 return err;
 837}
838
b71d1d42
ED
/* True when the cached route no longer matches the flow address: the key
 * is not an exact /128 match on @fl_addr and the cached last-used address
 * (if any) differs as well.
 */
 839static inline int ip6_rt_check(const struct rt6key *rt_key,
 840 const struct in6_addr *fl_addr,
 841 const struct in6_addr *addr_cache)
cf6b1982 842{
a02cec21
ED
 843 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 844 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
cf6b1982
YH
 845}
846
497c615a
HX
847static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
848 struct dst_entry *dst,
b71d1d42 849 const struct flowi6 *fl6)
1da177e4 850{
497c615a 851 struct ipv6_pinfo *np = inet6_sk(sk);
a963a37d 852 struct rt6_info *rt;
1da177e4 853
497c615a
HX
854 if (!dst)
855 goto out;
856
a963a37d
ED
857 if (dst->ops->family != AF_INET6) {
858 dst_release(dst);
859 return NULL;
860 }
861
862 rt = (struct rt6_info *)dst;
497c615a
HX
863 /* Yes, checking route validity in not connected
864 * case is not very simple. Take into account,
865 * that we do not support routing by source, TOS,
866 * and MSG_DONTROUTE --ANK (980726)
867 *
cf6b1982
YH
868 * 1. ip6_rt_check(): If route was host route,
869 * check that cached destination is current.
497c615a
HX
870 * If it is network route, we still may
871 * check its validity using saved pointer
872 * to the last used address: daddr_cache.
873 * We do not want to save whole address now,
874 * (because main consumer of this service
875 * is tcp, which has not this problem),
876 * so that the last trick works only on connected
877 * sockets.
878 * 2. oif also should be the same.
879 */
4c9483b2 880 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
8e1ef0a9 881#ifdef CONFIG_IPV6_SUBTREES
4c9483b2 882 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
8e1ef0a9 883#endif
4c9483b2 884 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
497c615a
HX
885 dst_release(dst);
886 dst = NULL;
1da177e4
LT
887 }
888
497c615a
HX
889out:
890 return dst;
891}
892
/* Common tail of the dst lookup helpers: perform the route lookup if
 * *dst is NULL, select a source address when the flow has none, and (with
 * optimistic DAD) retry via the default router when the chosen nexthop
 * neighbour is not yet valid.  Returns 0 or a negative errno; on error
 * *dst is released and set to NULL.
 */
 893static int ip6_dst_lookup_tail(struct sock *sk,
4c9483b2 894 struct dst_entry **dst, struct flowi6 *fl6)
497c615a 895{
3b1e0a65 896 struct net *net = sock_net(sk);
69cce1d1
DM
 897#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 898 struct neighbour *n;
97cac082 899 struct rt6_info *rt;
69cce1d1
DM
 900#endif
 901 int err;
497c615a 902
1da177e4 903 if (*dst == NULL)
4c9483b2 904 *dst = ip6_route_output(net, sk, fl6);
1da177e4
LT
 905
 906 if ((err = (*dst)->error))
 907 goto out_err_release;
 908
4c9483b2 909 if (ipv6_addr_any(&fl6->saddr)) {
c3968a85
DW
 910 struct rt6_info *rt = (struct rt6_info *) *dst;
 911 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 912 sk ? inet6_sk(sk)->srcprefs : 0,
 913 &fl6->saddr);
44456d37 914 if (err)
1da177e4 915 goto out_err_release;
1da177e4
LT
 916 }
 917
95c385b4 918#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
e550dfb0
NH
 919 /*
 920 * Here if the dst entry we've looked up
 921 * has a neighbour entry that is in the INCOMPLETE
 922 * state and the src address from the flow is
 923 * marked as OPTIMISTIC, we release the found
 924 * dst entry and replace it instead with the
 925 * dst entry of the nexthop router
 926 */
c56bf6fe 927 rt = (struct rt6_info *) *dst;
707be1ff 928 rcu_read_lock_bh();
550bab42 929 n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt));
707be1ff
YH
 930 err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
 931 rcu_read_unlock_bh();
 932
 933 if (err) {
e550dfb0 934 struct inet6_ifaddr *ifp;
4c9483b2 935 struct flowi6 fl_gw6;
e550dfb0
NH
 936 int redirect;
 937
4c9483b2 938 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
e550dfb0
NH
 939 (*dst)->dev, 1);
 940
 941 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 942 if (ifp)
 943 in6_ifa_put(ifp);
 944
 945 if (redirect) {
 946 /*
 947 * We need to get the dst entry for the
 948 * default router instead
 949 */
 950 dst_release(*dst);
4c9483b2
DM
 951 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
 952 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
 953 *dst = ip6_route_output(net, sk, &fl_gw6);
e550dfb0
NH
 954 if ((err = (*dst)->error))
 955 goto out_err_release;
95c385b4 956 }
e550dfb0 957 }
95c385b4
NH
 958#endif
 959
1da177e4
LT
 960 return 0;
 961
 962out_err_release:
ca46f9c8 963 if (err == -ENETUNREACH)
5ac68e7c 964 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1da177e4
LT
 965 dst_release(*dst);
 966 *dst = NULL;
 967 return err;
 968}
34a0b3cd 969
497c615a
HX
970/**
971 * ip6_dst_lookup - perform route lookup on flow
972 * @sk: socket which provides route info
973 * @dst: pointer to dst_entry * for result
4c9483b2 974 * @fl6: flow to lookup
497c615a
HX
975 *
976 * This function performs a route lookup on the given flow.
977 *
978 * It returns zero on success, or a standard errno code on error.
979 */
/* Fresh route lookup for @fl6 (no cached dst); see kernel-doc above. */
4c9483b2 980int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
497c615a
HX
 981{
 982 *dst = NULL;
4c9483b2 983 return ip6_dst_lookup_tail(sk, dst, fl6);
497c615a 984}
3cf3dc6c
ACM
985EXPORT_SYMBOL_GPL(ip6_dst_lookup);
986
497c615a 987/**
68d0c6d3
DM
988 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
989 * @sk: socket which provides route info
4c9483b2 990 * @fl6: flow to lookup
68d0c6d3 991 * @final_dst: final destination address for ipsec lookup
68d0c6d3
DM
992 *
993 * This function performs a route lookup on the given flow.
994 *
995 * It returns a valid dst pointer on success, or a pointer encoded
996 * error code.
997 */
/* Route lookup followed by an xfrm (IPsec) lookup; @final_dst, when set,
 * overrides the flow's destination before the xfrm pass.  Returns a valid
 * dst or an ERR_PTR-encoded errno.
 */
4c9483b2 998struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
0e0d44ab 999 const struct in6_addr *final_dst)
68d0c6d3
DM
 1000{
 1001 struct dst_entry *dst = NULL;
 1002 int err;
 1003
4c9483b2 1004 err = ip6_dst_lookup_tail(sk, &dst, fl6);
68d0c6d3
DM
 1005 if (err)
 1006 return ERR_PTR(err);
 1007 if (final_dst)
4e3fd7a0 1008 fl6->daddr = *final_dst;
2774c131 1009
4c9483b2 1010 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
68d0c6d3
DM
 1011}
1012EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1013
1014/**
1015 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
497c615a 1016 * @sk: socket which provides the dst cache and route info
4c9483b2 1017 * @fl6: flow to lookup
68d0c6d3 1018 * @final_dst: final destination address for ipsec lookup
497c615a
HX
1019 *
1020 * This function performs a route lookup on the given flow with the
1021 * possibility of using the cached route in the socket if it is valid.
1022 * It will take the socket dst lock when operating on the dst cache.
1023 * As a result, this function can only be used in process context.
1024 *
68d0c6d3
DM
1025 * It returns a valid dst pointer on success, or a pointer encoded
1026 * error code.
497c615a 1027 */
4c9483b2 1028struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
0e0d44ab 1029 const struct in6_addr *final_dst)
497c615a 1030{
68d0c6d3
DM
1031 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1032 int err;
497c615a 1033
4c9483b2 1034 dst = ip6_sk_dst_check(sk, dst, fl6);
68d0c6d3 1035
4c9483b2 1036 err = ip6_dst_lookup_tail(sk, &dst, fl6);
68d0c6d3
DM
1037 if (err)
1038 return ERR_PTR(err);
1039 if (final_dst)
4e3fd7a0 1040 fl6->daddr = *final_dst;
2774c131 1041
4c9483b2 1042 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
497c615a 1043}
68d0c6d3 1044EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
497c615a 1045
/* Append datagram data when the device supports UFO (UDP fragmentation
 * offload): keep one single GSO skb on the write queue carrying the
 * complete datagram and let the device do the segmentation.
 * Returns 0 on success or a negative errno.
 */
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu,unsigned int flags,
			struct rt6_info *rt)

{
	struct sk_buff *skb;
	struct frag_hdr fhdr;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		/* NOTE(review): the extra 20 bytes of headroom here are
		 * historical slack — confirm before changing.
		 */
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb,fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->protocol = htons(ETH_P_IPV6);
		skb->csum = 0;

		__skb_queue_tail(&sk->sk_write_queue, skb);
	} else if (skb_is_gso(skb)) {
		/* The queued skb is already GSO-initialized: just append. */
		goto append;
	}

	skb->ip_summed = CHECKSUM_PARTIAL;
	/* Specify the length of each IPv6 datagram fragment.
	 * It has to be a multiple of 8.
	 */
	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
				     sizeof(struct frag_hdr)) & ~7;
	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	/* Pick the fragment ID now; the device will reuse it for every
	 * fragment it emits from this GSO skb.
	 */
	ipv6_select_ident(&fhdr, rt);
	skb_shinfo(skb)->ip6_frag_id = fhdr.identification;

append:
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}
1da177e4 1103
0178b695
HX
1104static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1105 gfp_t gfp)
1106{
1107 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1108}
1109
1110static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1111 gfp_t gfp)
1112{
1113 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1114}
1115
75a493e6 1116static void ip6_append_data_mtu(unsigned int *mtu,
0c183379
G
1117 int *maxfraglen,
1118 unsigned int fragheaderlen,
1119 struct sk_buff *skb,
75a493e6 1120 struct rt6_info *rt,
e367c2d0 1121 unsigned int orig_mtu)
0c183379
G
1122{
1123 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1124 if (skb == NULL) {
1125 /* first fragment, reserve header_len */
e367c2d0 1126 *mtu = orig_mtu - rt->dst.header_len;
0c183379
G
1127
1128 } else {
1129 /*
1130 * this fragment is not first, the headers
1131 * space is regarded as data space.
1132 */
e367c2d0 1133 *mtu = orig_mtu;
0c183379
G
1134 }
1135 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1136 + fragheaderlen - sizeof(struct frag_hdr);
1137 }
1138}
1139
/**
 * ip6_append_data - queue data on the socket write queue for later transmit
 * @sk: socket to append to
 * @getfrag: callback that copies user data into the skb
 * @from: opaque cookie passed to @getfrag
 * @length: number of payload bytes to append
 * @transhdrlen: transport header length (non-zero only for the first call
 *	of a corked sequence)
 * @hlimit: hop limit to record in the cork
 * @tclass: traffic class to record in the cork
 * @opt: IPv6 tx options to duplicate into the cork (first call only)
 * @fl6: flow describing the destination
 * @rt: route for the flow
 * @flags: MSG_* flags (MSG_MORE, MSG_DONTWAIT, MSG_PROBE)
 * @dontfrag: if set for UDP/RAW, report PMTU instead of fragmenting
 *
 * On the first call (empty write queue) this sets up the cork state
 * (options copy, dst hold, mtu/fragsize); subsequent calls reuse it.
 * Data is split into mtu-sized skbs queued on sk->sk_write_queue and
 * later sent by ip6_push_pending_frames().
 *
 * Returns 0 on success or a negative errno.
 */
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;

	/* MSG_PROBE: do not queue anything. */
	if (flags&MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			/* Partially duplicated options on an -ENOBUFS path
			 * below are freed later by ip6_cork_release().
			 */
			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		/* With PMTU probing use the device mtu, otherwise the
		 * path mtu (the dst itself for XFRM tunnels).
		 */
		if (rt->dst.flags & DST_XFRM_TUNNEL)
			mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		else
			mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	} else {
		/* Write queue not empty: continue with the cork state
		 * recorded by the first call.
		 */
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		unsigned int maxnonfragsize, headersize;

		headersize = sizeof(struct ipv6hdr) +
			     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
			     (dst_allfrag(&rt->dst) ?
			      sizeof(struct frag_hdr) : 0) +
			     rt->rt6i_nfheader_len;

		if (ip6_sk_ignore_df(sk))
			maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
		else
			maxnonfragsize = mtu;

		/* dontfrag active */
		if ((cork->length + length > mtu - headersize) && dontfrag &&
		    (sk->sk_protocol == IPPROTO_UDP ||
		     sk->sk_protocol == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
						   sizeof(struct ipv6hdr));
			goto emsgsize;
		}

		if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
			ipv6_local_error(sk, EMSGSIZE, fl6,
					 mtu - headersize +
					 sizeof(struct ipv6hdr));
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM)
		sock_tx_timestamp(sk, &tx_flags);

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	skb = skb_peek_tail(&sk->sk_write_queue);
	cork->length += length;
	/* UDP with a UFO-capable device: hand off to the GSO path. */
	if (((length > mtu) ||
	     (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		err = ip6_ufo_append_data(sk, getfrag, from, length,
					  hh_len, fragheaderlen,
					  transhdrlen, mtu, flags, rt);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (skb == NULL || skb_prev == NULL)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the overflow from the previous skb
				 * into this one and fix up its checksum.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Device can do scatter/gather: append the data
			 * as page fragments from the per-socket page_frag.
			 */
			int i = skb_shinfo(skb)->nr_frags;
			struct page_frag *pfrag = sk_page_frag(sk);

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
EXPORT_SYMBOL_GPL(ip6_append_data);
1da177e4 1506
bf138862
PE
1507static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1508{
0178b695
HX
1509 if (np->cork.opt) {
1510 kfree(np->cork.opt->dst0opt);
1511 kfree(np->cork.opt->dst1opt);
1512 kfree(np->cork.opt->hopopt);
1513 kfree(np->cork.opt->srcrt);
1514 kfree(np->cork.opt);
1515 np->cork.opt = NULL;
1516 }
1517
bdc712b4
DM
1518 if (inet->cork.base.dst) {
1519 dst_release(inet->cork.base.dst);
1520 inet->cork.base.dst = NULL;
1521 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
bf138862
PE
1522 }
1523 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1524}
1525
/* Coalesce all skbs pending on the socket's write queue into a single
 * datagram (the first skb plus a frag_list), build the IPv6 header from
 * the cork state, account statistics and hand the packet to
 * ip6_local_out().  The cork is released on every exit path.
 * Returns 0 on success or a negative errno.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining queued skbs onto the first one's frag_list,
	 * transferring their byte counts and detaching them from the sock.
	 */
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	/* Routing headers may rewrite the destination; final_dst keeps
	 * the address that goes into the IPv6 header.
	 */
	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Flow label from the flow, or auto-generated from the skb hash
	 * when np->autoflowlabel is enabled.
	 */
	ip6_flow_hdr(hdr, np->cork.tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					np->autoflowlabel));
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	/* Cork state is torn down whether or not the send succeeded. */
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1da177e4
LT
1609
1610void ip6_flush_pending_frames(struct sock *sk)
1611{
1da177e4
LT
1612 struct sk_buff *skb;
1613
1614 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
adf30907
ED
1615 if (skb_dst(skb))
1616 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
e1f52208 1617 IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
1618 kfree_skb(skb);
1619 }
1620
bf138862 1621 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1da177e4 1622}
a495f836 1623EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
This page took 1.010387 seconds and 5 git commands to generate.