ipv6: make fragment identifications less predictable
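The change lives in ipv6_select_ident() in the listing below: when the route has an inet_peer, the Fragment header Identification is taken from that peer's per-destination counter (via inet_getid()); otherwise a global counter that skips zero is used. The standalone sketch here mirrors only that selection logic; struct fake_peer, the plain non-atomic counters and the function name are illustrative stand-ins, not the kernel API.

/*
 * Standalone sketch of the ID selection strategy; struct fake_peer and
 * the non-atomic counters stand in for the kernel's inet_peer and
 * atomic helpers.
 */
#include <stdint.h>
#include <stdio.h>

struct fake_peer {
	uint32_t ip_id_count;		/* per-destination fragment ID counter */
};

static uint32_t global_frag_id;		/* fallback shared by all destinations */

static uint32_t select_frag_id(struct fake_peer *peer)
{
	if (peer)			/* per-destination sequence */
		return peer->ip_id_count++;

	if (++global_frag_id == 0)	/* global fallback, never hand out 0 */
		global_frag_id = 1;
	return global_frag_id;
}

int main(void)
{
	struct fake_peer a = { .ip_id_count = 12345 };

	printf("with peer:    %u\n", select_frag_id(&a));	/* 12345 */
	printf("without peer: %u\n", select_frag_id(NULL));	/* 1 */
	return 0;
}

The point of the per-destination counter is that Identification values observed on one path say little about the values used toward other destinations, which is what makes them harder to predict.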
[deliverable/linux.git] net/ipv6/ip6_output.c
1 /*
2 * IPv6 output functions
3 * Linux INET6 implementation
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * Based on linux/net/ipv4/ip_output.c
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * Changes:
16 * A.N.Kuznetsov : arithmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
20 * etc.
21 *
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
26 * for datagram xmit
27 */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63 int len;
64
65 len = skb->len - sizeof(struct ipv6hdr);
66 if (len > IPV6_MAXPLEN)
67 len = 0;
68 ipv6_hdr(skb)->payload_len = htons(len);
69
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output);
72 }
73
74 int ip6_local_out(struct sk_buff *skb)
75 {
76 int err;
77
78 err = __ip6_local_out(skb);
79 if (likely(err == 1))
80 err = dst_output(skb);
81
82 return err;
83 }
84 EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89 skb_reset_mac_header(newskb);
90 __skb_pull(newskb, skb_network_offset(newskb));
91 newskb->pkt_type = PACKET_LOOPBACK;
92 newskb->ip_summed = CHECKSUM_UNNECESSARY;
93 WARN_ON(!skb_dst(newskb));
94
95 netif_rx_ni(newskb);
96 return 0;
97 }
98
99 static int ip6_finish_output2(struct sk_buff *skb)
100 {
101 struct dst_entry *dst = skb_dst(skb);
102 struct net_device *dev = dst->dev;
103 struct neighbour *neigh;
104
105 skb->protocol = htons(ETH_P_IPV6);
106 skb->dev = dev;
107
108 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
110
111 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112 ((mroute6_socket(dev_net(dev), skb) &&
113 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115 &ipv6_hdr(skb)->saddr))) {
116 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
117
118 /* Do not check for IFF_ALLMULTI; multicast routing
119 is not supported in any case.
120 */
121 if (newskb)
122 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123 newskb, NULL, newskb->dev,
124 ip6_dev_loopback_xmit);
125
126 if (ipv6_hdr(skb)->hop_limit == 0) {
127 IP6_INC_STATS(dev_net(dev), idev,
128 IPSTATS_MIB_OUTDISCARDS);
129 kfree_skb(skb);
130 return 0;
131 }
132 }
133
134 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
135 skb->len);
136 }
137
138 neigh = dst_get_neighbour(dst);
139 if (neigh)
140 return neigh_output(neigh, skb);
141
142 IP6_INC_STATS_BH(dev_net(dst->dev),
143 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
144 kfree_skb(skb);
145 return -EINVAL;
146 }
147
148 static int ip6_finish_output(struct sk_buff *skb)
149 {
150 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
151 dst_allfrag(skb_dst(skb)))
152 return ip6_fragment(skb, ip6_finish_output2);
153 else
154 return ip6_finish_output2(skb);
155 }
156
157 int ip6_output(struct sk_buff *skb)
158 {
159 struct net_device *dev = skb_dst(skb)->dev;
160 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161 if (unlikely(idev->cnf.disable_ipv6)) {
162 IP6_INC_STATS(dev_net(dev), idev,
163 IPSTATS_MIB_OUTDISCARDS);
164 kfree_skb(skb);
165 return 0;
166 }
167
168 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
169 ip6_finish_output,
170 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
171 }
172
173 /*
174 * xmit an sk_buff (used by TCP, SCTP and DCCP)
175 */
176
177 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
178 struct ipv6_txoptions *opt)
179 {
180 struct net *net = sock_net(sk);
181 struct ipv6_pinfo *np = inet6_sk(sk);
182 struct in6_addr *first_hop = &fl6->daddr;
183 struct dst_entry *dst = skb_dst(skb);
184 struct ipv6hdr *hdr;
185 u8 proto = fl6->flowi6_proto;
186 int seg_len = skb->len;
187 int hlimit = -1;
188 int tclass = 0;
189 u32 mtu;
190
191 if (opt) {
192 unsigned int head_room;
193
194 /* First: exthdrs may take lots of space (~8K for now)
195 MAX_HEADER is not enough.
196 */
197 head_room = opt->opt_nflen + opt->opt_flen;
198 seg_len += head_room;
199 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
200
201 if (skb_headroom(skb) < head_room) {
202 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
203 if (skb2 == NULL) {
204 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
205 IPSTATS_MIB_OUTDISCARDS);
206 kfree_skb(skb);
207 return -ENOBUFS;
208 }
209 kfree_skb(skb);
210 skb = skb2;
211 skb_set_owner_w(skb, sk);
212 }
213 if (opt->opt_flen)
214 ipv6_push_frag_opts(skb, opt, &proto);
215 if (opt->opt_nflen)
216 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
217 }
218
219 skb_push(skb, sizeof(struct ipv6hdr));
220 skb_reset_network_header(skb);
221 hdr = ipv6_hdr(skb);
222
223 /*
224 * Fill in the IPv6 header
225 */
226 if (np) {
227 tclass = np->tclass;
228 hlimit = np->hop_limit;
229 }
230 if (hlimit < 0)
231 hlimit = ip6_dst_hoplimit(dst);
232
233 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
234
235 hdr->payload_len = htons(seg_len);
236 hdr->nexthdr = proto;
237 hdr->hop_limit = hlimit;
238
239 ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
240 ipv6_addr_copy(&hdr->daddr, first_hop);
241
242 skb->priority = sk->sk_priority;
243 skb->mark = sk->sk_mark;
244
245 mtu = dst_mtu(dst);
246 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
247 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
248 IPSTATS_MIB_OUT, skb->len);
249 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
250 dst->dev, dst_output);
251 }
252
253 if (net_ratelimit())
254 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
255 skb->dev = dst->dev;
256 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
257 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
258 kfree_skb(skb);
259 return -EMSGSIZE;
260 }
261
262 EXPORT_SYMBOL(ip6_xmit);
263
264 /*
265 * To avoid extra problems, ND packets are sent through this
266 * routine. It's code duplication, but I really want to avoid
267 * extra checks since ipv6_build_header is used by TCP (which
268 * is performance critical for us).
269 */
270
271 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
272 const struct in6_addr *saddr, const struct in6_addr *daddr,
273 int proto, int len)
274 {
275 struct ipv6_pinfo *np = inet6_sk(sk);
276 struct ipv6hdr *hdr;
277
278 skb->protocol = htons(ETH_P_IPV6);
279 skb->dev = dev;
280
281 skb_reset_network_header(skb);
282 skb_put(skb, sizeof(struct ipv6hdr));
283 hdr = ipv6_hdr(skb);
284
285 *(__be32*)hdr = htonl(0x60000000);
286
287 hdr->payload_len = htons(len);
288 hdr->nexthdr = proto;
289 hdr->hop_limit = np->hop_limit;
290
291 ipv6_addr_copy(&hdr->saddr, saddr);
292 ipv6_addr_copy(&hdr->daddr, daddr);
293
294 return 0;
295 }
296
297 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
298 {
299 struct ip6_ra_chain *ra;
300 struct sock *last = NULL;
301
302 read_lock(&ip6_ra_lock);
303 for (ra = ip6_ra_chain; ra; ra = ra->next) {
304 struct sock *sk = ra->sk;
305 if (sk && ra->sel == sel &&
306 (!sk->sk_bound_dev_if ||
307 sk->sk_bound_dev_if == skb->dev->ifindex)) {
308 if (last) {
309 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
310 if (skb2)
311 rawv6_rcv(last, skb2);
312 }
313 last = sk;
314 }
315 }
316
317 if (last) {
318 rawv6_rcv(last, skb);
319 read_unlock(&ip6_ra_lock);
320 return 1;
321 }
322 read_unlock(&ip6_ra_lock);
323 return 0;
324 }
325
326 static int ip6_forward_proxy_check(struct sk_buff *skb)
327 {
328 struct ipv6hdr *hdr = ipv6_hdr(skb);
329 u8 nexthdr = hdr->nexthdr;
330 int offset;
331
332 if (ipv6_ext_hdr(nexthdr)) {
333 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
334 if (offset < 0)
335 return 0;
336 } else
337 offset = sizeof(struct ipv6hdr);
338
339 if (nexthdr == IPPROTO_ICMPV6) {
340 struct icmp6hdr *icmp6;
341
342 if (!pskb_may_pull(skb, (skb_network_header(skb) +
343 offset + 1 - skb->data)))
344 return 0;
345
346 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
347
348 switch (icmp6->icmp6_type) {
349 case NDISC_ROUTER_SOLICITATION:
350 case NDISC_ROUTER_ADVERTISEMENT:
351 case NDISC_NEIGHBOUR_SOLICITATION:
352 case NDISC_NEIGHBOUR_ADVERTISEMENT:
353 case NDISC_REDIRECT:
354 /* Unicast neighbour discovery messages destined
355 * to the proxied address are passed to the
356 * input function.
357 */
358 return 1;
359 default:
360 break;
361 }
362 }
363
364 /*
365 * The proxying router can't forward traffic sent to a link-local
366 * address, so signal the sender and discard the packet. This
367 * behavior is clarified by the MIPv6 specification.
368 */
369 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
370 dst_link_failure(skb);
371 return -1;
372 }
373
374 return 0;
375 }
376
377 static inline int ip6_forward_finish(struct sk_buff *skb)
378 {
379 return dst_output(skb);
380 }
381
382 int ip6_forward(struct sk_buff *skb)
383 {
384 struct dst_entry *dst = skb_dst(skb);
385 struct ipv6hdr *hdr = ipv6_hdr(skb);
386 struct inet6_skb_parm *opt = IP6CB(skb);
387 struct net *net = dev_net(dst->dev);
388 struct neighbour *n;
389 u32 mtu;
390
391 if (net->ipv6.devconf_all->forwarding == 0)
392 goto error;
393
394 if (skb_warn_if_lro(skb))
395 goto drop;
396
397 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
398 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
399 goto drop;
400 }
401
402 if (skb->pkt_type != PACKET_HOST)
403 goto drop;
404
405 skb_forward_csum(skb);
406
407 /*
408 * We DO NOT do any processing on
409 * RA packets; we push them to user level AS IS
410 * without any guarantee that the application will be able
411 * to interpret them. The reason is that we
412 * cannot do anything clever here.
413 *
414 * We are not the end node, so if the packet contains
415 * AH/ESP, we cannot do anything with it.
416 * Defragmentation would also be a mistake: RA packets
417 * cannot be fragmented, because there is no guarantee
418 * that different fragments will go along one path. --ANK
419 */
420 if (opt->ra) {
421 u8 *ptr = skb_network_header(skb) + opt->ra;
422 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
423 return 0;
424 }
425
426 /*
427 * check and decrement hop limit
428 */
429 if (hdr->hop_limit <= 1) {
430 /* Force OUTPUT device used as source address */
431 skb->dev = dst->dev;
432 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
433 IP6_INC_STATS_BH(net,
434 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
435
436 kfree_skb(skb);
437 return -ETIMEDOUT;
438 }
439
440 /* XXX: idev->cnf.proxy_ndp? */
441 if (net->ipv6.devconf_all->proxy_ndp &&
442 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
443 int proxied = ip6_forward_proxy_check(skb);
444 if (proxied > 0)
445 return ip6_input(skb);
446 else if (proxied < 0) {
447 IP6_INC_STATS(net, ip6_dst_idev(dst),
448 IPSTATS_MIB_INDISCARDS);
449 goto drop;
450 }
451 }
452
453 if (!xfrm6_route_forward(skb)) {
454 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
455 goto drop;
456 }
457 dst = skb_dst(skb);
458
459 /* IPv6 specs say nothing about it, but it is clear that we cannot
460 send redirects to source routed frames.
461 We don't send redirects to frames decapsulated from IPsec.
462 */
463 n = dst_get_neighbour(dst);
464 if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
465 struct in6_addr *target = NULL;
466 struct rt6_info *rt;
467
468 /*
469 * incoming and outgoing devices are the same
470 * send a redirect.
471 */
472
473 rt = (struct rt6_info *) dst;
474 if ((rt->rt6i_flags & RTF_GATEWAY))
475 target = (struct in6_addr*)&n->primary_key;
476 else
477 target = &hdr->daddr;
478
479 if (!rt->rt6i_peer)
480 rt6_bind_peer(rt, 1);
481
482 /* Limit redirects both by destination (here)
483 and by source (inside ndisc_send_redirect)
484 */
485 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
486 ndisc_send_redirect(skb, n, target);
487 } else {
488 int addrtype = ipv6_addr_type(&hdr->saddr);
489
490 /* This check is security critical. */
491 if (addrtype == IPV6_ADDR_ANY ||
492 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
493 goto error;
494 if (addrtype & IPV6_ADDR_LINKLOCAL) {
495 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
496 ICMPV6_NOT_NEIGHBOUR, 0);
497 goto error;
498 }
499 }
500
501 mtu = dst_mtu(dst);
502 if (mtu < IPV6_MIN_MTU)
503 mtu = IPV6_MIN_MTU;
504
505 if (skb->len > mtu && !skb_is_gso(skb)) {
506 /* Again, force OUTPUT device used as source address */
507 skb->dev = dst->dev;
508 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
509 IP6_INC_STATS_BH(net,
510 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
511 IP6_INC_STATS_BH(net,
512 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
513 kfree_skb(skb);
514 return -EMSGSIZE;
515 }
516
517 if (skb_cow(skb, dst->dev->hard_header_len)) {
518 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
519 goto drop;
520 }
521
522 hdr = ipv6_hdr(skb);
523
524 /* Mangling hops number delayed to point after skb COW */
525
526 hdr->hop_limit--;
527
528 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
529 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
530 ip6_forward_finish);
531
532 error:
533 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
534 drop:
535 kfree_skb(skb);
536 return -EINVAL;
537 }
538
539 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
540 {
541 to->pkt_type = from->pkt_type;
542 to->priority = from->priority;
543 to->protocol = from->protocol;
544 skb_dst_drop(to);
545 skb_dst_set(to, dst_clone(skb_dst(from)));
546 to->dev = from->dev;
547 to->mark = from->mark;
548
549 #ifdef CONFIG_NET_SCHED
550 to->tc_index = from->tc_index;
551 #endif
552 nf_copy(to, from);
553 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
554 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
555 to->nf_trace = from->nf_trace;
556 #endif
557 skb_copy_secmark(to, from);
558 }
559
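/*
 * Return the length of the unfragmentable part of the packet: the IPv6
 * header plus any extension headers (hop-by-hop, routing, and a
 * destination options header carrying a Home Address option) that must
 * be repeated in every fragment.  On return, *nexthdr points at the
 * Next Header field of the last such header, which the caller rewrites
 * to NEXTHDR_FRAGMENT.
 */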
560 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
561 {
562 u16 offset = sizeof(struct ipv6hdr);
563 struct ipv6_opt_hdr *exthdr =
564 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
565 unsigned int packet_len = skb->tail - skb->network_header;
566 int found_rhdr = 0;
567 *nexthdr = &ipv6_hdr(skb)->nexthdr;
568
569 while (offset + 1 <= packet_len) {
570
571 switch (**nexthdr) {
572
573 case NEXTHDR_HOP:
574 break;
575 case NEXTHDR_ROUTING:
576 found_rhdr = 1;
577 break;
578 case NEXTHDR_DEST:
579 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
580 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
581 break;
582 #endif
583 if (found_rhdr)
584 return offset;
585 break;
586 default:
587 return offset;
588 }
589
590 offset += ipv6_optlen(exthdr);
591 *nexthdr = &exthdr->nexthdr;
592 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
593 offset);
594 }
595
596 return offset;
597 }
598
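/*
 * Fill in the Fragment header Identification.  If the route has (or can
 * bind) an inet_peer, take the next value of that peer's counter via
 * inet_getid(), so each destination gets its own sequence; otherwise
 * fall back to a single global counter that never yields zero.
 */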
599 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
600 {
601 static atomic_t ipv6_fragmentation_id;
602 int old, new;
603
604 if (rt) {
605 struct inet_peer *peer;
606
607 if (!rt->rt6i_peer)
608 rt6_bind_peer(rt, 1);
609 peer = rt->rt6i_peer;
610 if (peer) {
611 fhdr->identification = htonl(inet_getid(peer, 0));
612 return;
613 }
614 }
615 do {
616 old = atomic_read(&ipv6_fragmentation_id);
617 new = old + 1;
618 if (!new)
619 new = 1;
620 } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
621 fhdr->identification = htonl(new);
622 }
623
624 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
625 {
626 struct sk_buff *frag;
627 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
628 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
629 struct ipv6hdr *tmp_hdr;
630 struct frag_hdr *fh;
631 unsigned int mtu, hlen, left, len;
632 __be32 frag_id = 0;
633 int ptr, offset = 0, err=0;
634 u8 *prevhdr, nexthdr = 0;
635 struct net *net = dev_net(skb_dst(skb)->dev);
636
637 hlen = ip6_find_1stfragopt(skb, &prevhdr);
638 nexthdr = *prevhdr;
639
640 mtu = ip6_skb_dst_mtu(skb);
641
642 /* We must not fragment if the socket is set to force MTU discovery
643 * or if the skb is not generated by a local socket.
644 */
645 if (!skb->local_df && skb->len > mtu) {
646 skb->dev = skb_dst(skb)->dev;
647 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
648 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
649 IPSTATS_MIB_FRAGFAILS);
650 kfree_skb(skb);
651 return -EMSGSIZE;
652 }
653
654 if (np && np->frag_size < mtu) {
655 if (np->frag_size)
656 mtu = np->frag_size;
657 }
658 mtu -= hlen + sizeof(struct frag_hdr);
659
660 if (skb_has_frag_list(skb)) {
661 int first_len = skb_pagelen(skb);
662 struct sk_buff *frag2;
663
664 if (first_len - hlen > mtu ||
665 ((first_len - hlen) & 7) ||
666 skb_cloned(skb))
667 goto slow_path;
668
669 skb_walk_frags(skb, frag) {
670 /* Correct geometry. */
671 if (frag->len > mtu ||
672 ((frag->len & 7) && frag->next) ||
673 skb_headroom(frag) < hlen)
674 goto slow_path_clean;
675
676 /* Partially cloned skb? */
677 if (skb_shared(frag))
678 goto slow_path_clean;
679
680 BUG_ON(frag->sk);
681 if (skb->sk) {
682 frag->sk = skb->sk;
683 frag->destructor = sock_wfree;
684 }
685 skb->truesize -= frag->truesize;
686 }
687
688 err = 0;
689 offset = 0;
690 frag = skb_shinfo(skb)->frag_list;
691 skb_frag_list_init(skb);
692 /* BUILD HEADER */
693
694 *prevhdr = NEXTHDR_FRAGMENT;
695 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
696 if (!tmp_hdr) {
697 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
698 IPSTATS_MIB_FRAGFAILS);
699 return -ENOMEM;
700 }
701
702 __skb_pull(skb, hlen);
703 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
704 __skb_push(skb, hlen);
705 skb_reset_network_header(skb);
706 memcpy(skb_network_header(skb), tmp_hdr, hlen);
707
708 ipv6_select_ident(fh, rt);
709 fh->nexthdr = nexthdr;
710 fh->reserved = 0;
711 fh->frag_off = htons(IP6_MF);
712 frag_id = fh->identification;
713
714 first_len = skb_pagelen(skb);
715 skb->data_len = first_len - skb_headlen(skb);
716 skb->len = first_len;
717 ipv6_hdr(skb)->payload_len = htons(first_len -
718 sizeof(struct ipv6hdr));
719
720 dst_hold(&rt->dst);
721
722 for (;;) {
723 /* Prepare header of the next frame,
724 * before previous one went down. */
725 if (frag) {
726 frag->ip_summed = CHECKSUM_NONE;
727 skb_reset_transport_header(frag);
728 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
729 __skb_push(frag, hlen);
730 skb_reset_network_header(frag);
731 memcpy(skb_network_header(frag), tmp_hdr,
732 hlen);
733 offset += skb->len - hlen - sizeof(struct frag_hdr);
734 fh->nexthdr = nexthdr;
735 fh->reserved = 0;
736 fh->frag_off = htons(offset);
737 if (frag->next != NULL)
738 fh->frag_off |= htons(IP6_MF);
739 fh->identification = frag_id;
740 ipv6_hdr(frag)->payload_len =
741 htons(frag->len -
742 sizeof(struct ipv6hdr));
743 ip6_copy_metadata(frag, skb);
744 }
745
746 err = output(skb);
747 if (!err)
748 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
749 IPSTATS_MIB_FRAGCREATES);
750
751 if (err || !frag)
752 break;
753
754 skb = frag;
755 frag = skb->next;
756 skb->next = NULL;
757 }
758
759 kfree(tmp_hdr);
760
761 if (err == 0) {
762 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
763 IPSTATS_MIB_FRAGOKS);
764 dst_release(&rt->dst);
765 return 0;
766 }
767
768 while (frag) {
769 skb = frag->next;
770 kfree_skb(frag);
771 frag = skb;
772 }
773
774 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
775 IPSTATS_MIB_FRAGFAILS);
776 dst_release(&rt->dst);
777 return err;
778
779 slow_path_clean:
780 skb_walk_frags(skb, frag2) {
781 if (frag2 == frag)
782 break;
783 frag2->sk = NULL;
784 frag2->destructor = NULL;
785 skb->truesize += frag2->truesize;
786 }
787 }
788
789 slow_path:
790 left = skb->len - hlen; /* Space per frame */
791 ptr = hlen; /* Where to start from */
792
793 /*
794 * Fragment the datagram.
795 */
796
797 *prevhdr = NEXTHDR_FRAGMENT;
798
799 /*
800 * Keep copying data until we run out.
801 */
802 while (left > 0) {
803 len = left;
804 /* IF: it doesn't fit, use 'mtu' - the data space left */
805 if (len > mtu)
806 len = mtu;
807 /* IF: we are not sending up to and including the packet end
808 then align the next start on an eight byte boundary */
809 if (len < left) {
810 len &= ~7;
811 }
812 /*
813 * Allocate buffer.
814 */
815
816 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
817 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
818 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
819 IPSTATS_MIB_FRAGFAILS);
820 err = -ENOMEM;
821 goto fail;
822 }
823
824 /*
825 * Set up data on packet
826 */
827
828 ip6_copy_metadata(frag, skb);
829 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
830 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
831 skb_reset_network_header(frag);
832 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
833 frag->transport_header = (frag->network_header + hlen +
834 sizeof(struct frag_hdr));
835
836 /*
837 * Charge the memory for the fragment to any owner
838 * it might possess
839 */
840 if (skb->sk)
841 skb_set_owner_w(frag, skb->sk);
842
843 /*
844 * Copy the packet header into the new buffer.
845 */
846 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
847
848 /*
849 * Build fragment header.
850 */
851 fh->nexthdr = nexthdr;
852 fh->reserved = 0;
853 if (!frag_id) {
854 ipv6_select_ident(fh, rt);
855 frag_id = fh->identification;
856 } else
857 fh->identification = frag_id;
858
859 /*
860 * Copy a block of the IP datagram.
861 */
862 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
863 BUG();
864 left -= len;
865
866 fh->frag_off = htons(offset);
867 if (left > 0)
868 fh->frag_off |= htons(IP6_MF);
869 ipv6_hdr(frag)->payload_len = htons(frag->len -
870 sizeof(struct ipv6hdr));
871
872 ptr += len;
873 offset += len;
874
875 /*
876 * Put this fragment into the sending queue.
877 */
878 err = output(frag);
879 if (err)
880 goto fail;
881
882 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
883 IPSTATS_MIB_FRAGCREATES);
884 }
885 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
886 IPSTATS_MIB_FRAGOKS);
887 kfree_skb(skb);
888 return err;
889
890 fail:
891 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
892 IPSTATS_MIB_FRAGFAILS);
893 kfree_skb(skb);
894 return err;
895 }
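/*
 * Worked example for the slow path above, with illustrative numbers not
 * taken from the source: for a path MTU of 1500 and hlen = 40 (a bare
 * IPv6 header, no extension headers), mtu is reduced to
 * 1500 - 40 - sizeof(struct frag_hdr) = 1452, so each non-final
 * fragment carries 1452 & ~7 = 1448 bytes of payload and fh->frag_off
 * advances in multiples of 8 octets, as the Fragment header requires.
 */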
896
897 static inline int ip6_rt_check(const struct rt6key *rt_key,
898 const struct in6_addr *fl_addr,
899 const struct in6_addr *addr_cache)
900 {
901 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
902 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
903 }
904
905 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
906 struct dst_entry *dst,
907 const struct flowi6 *fl6)
908 {
909 struct ipv6_pinfo *np = inet6_sk(sk);
910 struct rt6_info *rt = (struct rt6_info *)dst;
911
912 if (!dst)
913 goto out;
914
915 /* Yes, checking route validity in the unconnected
916 * case is not very simple. Take into account
917 * that we do not support routing by source, TOS,
918 * and MSG_DONTROUTE --ANK (980726)
919 *
920 * 1. ip6_rt_check(): If route was host route,
921 * check that cached destination is current.
922 * If it is network route, we still may
923 * check its validity using saved pointer
924 * to the last used address: daddr_cache.
925 * We do not want to save whole address now,
926 * (because main consumer of this service
927 * is tcp, which does not have this problem),
928 * so that the last trick works only on connected
929 * sockets.
930 * 2. oif also should be the same.
931 */
932 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
933 #ifdef CONFIG_IPV6_SUBTREES
934 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
935 #endif
936 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
937 dst_release(dst);
938 dst = NULL;
939 }
940
941 out:
942 return dst;
943 }
944
945 static int ip6_dst_lookup_tail(struct sock *sk,
946 struct dst_entry **dst, struct flowi6 *fl6)
947 {
948 struct net *net = sock_net(sk);
949 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
950 struct neighbour *n;
951 #endif
952 int err;
953
954 if (*dst == NULL)
955 *dst = ip6_route_output(net, sk, fl6);
956
957 if ((err = (*dst)->error))
958 goto out_err_release;
959
960 if (ipv6_addr_any(&fl6->saddr)) {
961 struct rt6_info *rt = (struct rt6_info *) *dst;
962 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
963 sk ? inet6_sk(sk)->srcprefs : 0,
964 &fl6->saddr);
965 if (err)
966 goto out_err_release;
967 }
968
969 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
970 /*
971 * Here if the dst entry we've looked up
972 * has a neighbour entry that is in the INCOMPLETE
973 * state and the src address from the flow is
974 * marked as OPTIMISTIC, we release the found
975 * dst entry and replace it instead with the
976 * dst entry of the nexthop router
977 */
978 n = dst_get_neighbour(*dst);
979 if (n && !(n->nud_state & NUD_VALID)) {
980 struct inet6_ifaddr *ifp;
981 struct flowi6 fl_gw6;
982 int redirect;
983
984 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
985 (*dst)->dev, 1);
986
987 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
988 if (ifp)
989 in6_ifa_put(ifp);
990
991 if (redirect) {
992 /*
993 * We need to get the dst entry for the
994 * default router instead
995 */
996 dst_release(*dst);
997 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
998 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
999 *dst = ip6_route_output(net, sk, &fl_gw6);
1000 if ((err = (*dst)->error))
1001 goto out_err_release;
1002 }
1003 }
1004 #endif
1005
1006 return 0;
1007
1008 out_err_release:
1009 if (err == -ENETUNREACH)
1010 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1011 dst_release(*dst);
1012 *dst = NULL;
1013 return err;
1014 }
1015
1016 /**
1017 * ip6_dst_lookup - perform route lookup on flow
1018 * @sk: socket which provides route info
1019 * @dst: pointer to dst_entry * for result
1020 * @fl6: flow to lookup
1021 *
1022 * This function performs a route lookup on the given flow.
1023 *
1024 * It returns zero on success, or a standard errno code on error.
1025 */
1026 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1027 {
1028 *dst = NULL;
1029 return ip6_dst_lookup_tail(sk, dst, fl6);
1030 }
1031 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1032
1033 /**
1034 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1035 * @sk: socket which provides route info
1036 * @fl6: flow to lookup
1037 * @final_dst: final destination address for ipsec lookup
1038 * @can_sleep: we are in a sleepable context
1039 *
1040 * This function performs a route lookup on the given flow.
1041 *
1042 * It returns a valid dst pointer on success, or a pointer encoded
1043 * error code.
1044 */
1045 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1046 const struct in6_addr *final_dst,
1047 bool can_sleep)
1048 {
1049 struct dst_entry *dst = NULL;
1050 int err;
1051
1052 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1053 if (err)
1054 return ERR_PTR(err);
1055 if (final_dst)
1056 ipv6_addr_copy(&fl6->daddr, final_dst);
1057 if (can_sleep)
1058 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1059
1060 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1061 }
1062 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1063
1064 /**
1065 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1066 * @sk: socket which provides the dst cache and route info
1067 * @fl6: flow to lookup
1068 * @final_dst: final destination address for ipsec lookup
1069 * @can_sleep: we are in a sleepable context
1070 *
1071 * This function performs a route lookup on the given flow with the
1072 * possibility of using the cached route in the socket if it is valid.
1073 * It will take the socket dst lock when operating on the dst cache.
1074 * As a result, this function can only be used in process context.
1075 *
1076 * It returns a valid dst pointer on success, or a pointer encoded
1077 * error code.
1078 */
1079 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1080 const struct in6_addr *final_dst,
1081 bool can_sleep)
1082 {
1083 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1084 int err;
1085
1086 dst = ip6_sk_dst_check(sk, dst, fl6);
1087
1088 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1089 if (err)
1090 return ERR_PTR(err);
1091 if (final_dst)
1092 ipv6_addr_copy(&fl6->daddr, final_dst);
1093 if (can_sleep)
1094 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1095
1096 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1097 }
1098 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1099
1100 static inline int ip6_ufo_append_data(struct sock *sk,
1101 int getfrag(void *from, char *to, int offset, int len,
1102 int odd, struct sk_buff *skb),
1103 void *from, int length, int hh_len, int fragheaderlen,
1104 int transhdrlen, int mtu, unsigned int flags,
1105 struct rt6_info *rt)
1106
1107 {
1108 struct sk_buff *skb;
1109 int err;
1110
1111 /* There is support for UDP large send offload by the network
1112 * device, so create a single skb containing the complete
1113 * UDP datagram.
1114 */
1115 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1116 skb = sock_alloc_send_skb(sk,
1117 hh_len + fragheaderlen + transhdrlen + 20,
1118 (flags & MSG_DONTWAIT), &err);
1119 if (skb == NULL)
1120 return -ENOMEM;
1121
1122 /* reserve space for Hardware header */
1123 skb_reserve(skb, hh_len);
1124
1125 /* create space for UDP/IP header */
1126 skb_put(skb, fragheaderlen + transhdrlen);
1127
1128 /* initialize network header pointer */
1129 skb_reset_network_header(skb);
1130
1131 /* initialize protocol header pointer */
1132 skb->transport_header = skb->network_header + fragheaderlen;
1133
1134 skb->ip_summed = CHECKSUM_PARTIAL;
1135 skb->csum = 0;
1136 }
1137
1138 err = skb_append_datato_frags(sk, skb, getfrag, from,
1139 (length - transhdrlen));
1140 if (!err) {
1141 struct frag_hdr fhdr;
1142
1143 /* Specify the length of each IPv6 datagram fragment.
1144 * It has to be a multiple of 8.
1145 */
1146 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1147 sizeof(struct frag_hdr)) & ~7;
1148 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1149 ipv6_select_ident(&fhdr, rt);
1150 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1151 __skb_queue_tail(&sk->sk_write_queue, skb);
1152
1153 return 0;
1154 }
1155 /* There is not enough support to do UDP LSO,
1156 * so follow the normal path.
1157 */
1158 kfree_skb(skb);
1159
1160 return err;
1161 }
1162
1163 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1164 gfp_t gfp)
1165 {
1166 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1167 }
1168
1169 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1170 gfp_t gfp)
1171 {
1172 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1173 }
1174
1175 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1176 int offset, int len, int odd, struct sk_buff *skb),
1177 void *from, int length, int transhdrlen,
1178 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1179 struct rt6_info *rt, unsigned int flags, int dontfrag)
1180 {
1181 struct inet_sock *inet = inet_sk(sk);
1182 struct ipv6_pinfo *np = inet6_sk(sk);
1183 struct inet_cork *cork;
1184 struct sk_buff *skb;
1185 unsigned int maxfraglen, fragheaderlen;
1186 int exthdrlen;
1187 int hh_len;
1188 int mtu;
1189 int copy;
1190 int err;
1191 int offset = 0;
1192 int csummode = CHECKSUM_NONE;
1193 __u8 tx_flags = 0;
1194
1195 if (flags&MSG_PROBE)
1196 return 0;
1197 cork = &inet->cork.base;
1198 if (skb_queue_empty(&sk->sk_write_queue)) {
1199 /*
1200 * setup for corking
1201 */
1202 if (opt) {
1203 if (WARN_ON(np->cork.opt))
1204 return -EINVAL;
1205
1206 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1207 if (unlikely(np->cork.opt == NULL))
1208 return -ENOBUFS;
1209
1210 np->cork.opt->tot_len = opt->tot_len;
1211 np->cork.opt->opt_flen = opt->opt_flen;
1212 np->cork.opt->opt_nflen = opt->opt_nflen;
1213
1214 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1215 sk->sk_allocation);
1216 if (opt->dst0opt && !np->cork.opt->dst0opt)
1217 return -ENOBUFS;
1218
1219 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1220 sk->sk_allocation);
1221 if (opt->dst1opt && !np->cork.opt->dst1opt)
1222 return -ENOBUFS;
1223
1224 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1225 sk->sk_allocation);
1226 if (opt->hopopt && !np->cork.opt->hopopt)
1227 return -ENOBUFS;
1228
1229 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1230 sk->sk_allocation);
1231 if (opt->srcrt && !np->cork.opt->srcrt)
1232 return -ENOBUFS;
1233
1234 /* need source address above miyazawa*/
1235 }
1236 dst_hold(&rt->dst);
1237 cork->dst = &rt->dst;
1238 inet->cork.fl.u.ip6 = *fl6;
1239 np->cork.hop_limit = hlimit;
1240 np->cork.tclass = tclass;
1241 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1242 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1243 if (np->frag_size < mtu) {
1244 if (np->frag_size)
1245 mtu = np->frag_size;
1246 }
1247 cork->fragsize = mtu;
1248 if (dst_allfrag(rt->dst.path))
1249 cork->flags |= IPCORK_ALLFRAG;
1250 cork->length = 0;
1251 sk->sk_sndmsg_page = NULL;
1252 sk->sk_sndmsg_off = 0;
1253 exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
1254 rt->rt6i_nfheader_len;
1255 length += exthdrlen;
1256 transhdrlen += exthdrlen;
1257 } else {
1258 rt = (struct rt6_info *)cork->dst;
1259 fl6 = &inet->cork.fl.u.ip6;
1260 opt = np->cork.opt;
1261 transhdrlen = 0;
1262 exthdrlen = 0;
1263 mtu = cork->fragsize;
1264 }
1265
1266 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1267
1268 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1269 (opt ? opt->opt_nflen : 0);
1270 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1271
1272 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1273 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1274 ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1275 return -EMSGSIZE;
1276 }
1277 }
1278
1279 /* For UDP, check if TX timestamp is enabled */
1280 if (sk->sk_type == SOCK_DGRAM) {
1281 err = sock_tx_timestamp(sk, &tx_flags);
1282 if (err)
1283 goto error;
1284 }
1285
1286 /*
1287 * Let's try using as much space as possible.
1288 * Use MTU if total length of the message fits into the MTU.
1289 * Otherwise, we need to reserve fragment header and
1290 * fragment alignment (= 8-15 octets, in total).
1291 *
1292 * Note that we may need to "move" the data from the tail of
1293 * the buffer to the new fragment when we split
1294 * the message.
1295 *
1296 * FIXME: It may be fragmented into multiple chunks
1297 * at once if non-fragmentable extension headers
1298 * are too large.
1299 * --yoshfuji
1300 */
1301
1302 cork->length += length;
1303 if (length > mtu) {
1304 int proto = sk->sk_protocol;
1305 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1306 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1307 return -EMSGSIZE;
1308 }
1309
1310 if (proto == IPPROTO_UDP &&
1311 (rt->dst.dev->features & NETIF_F_UFO)) {
1312
1313 err = ip6_ufo_append_data(sk, getfrag, from, length,
1314 hh_len, fragheaderlen,
1315 transhdrlen, mtu, flags, rt);
1316 if (err)
1317 goto error;
1318 return 0;
1319 }
1320 }
1321
1322 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1323 goto alloc_new_skb;
1324
1325 while (length > 0) {
1326 /* Check if the remaining data fits into current packet. */
1327 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1328 if (copy < length)
1329 copy = maxfraglen - skb->len;
1330
1331 if (copy <= 0) {
1332 char *data;
1333 unsigned int datalen;
1334 unsigned int fraglen;
1335 unsigned int fraggap;
1336 unsigned int alloclen;
1337 struct sk_buff *skb_prev;
1338 alloc_new_skb:
1339 skb_prev = skb;
1340
1341 /* There's no room in the current skb */
1342 if (skb_prev)
1343 fraggap = skb_prev->len - maxfraglen;
1344 else
1345 fraggap = 0;
1346
1347 /*
1348 * If remaining data exceeds the mtu,
1349 * we know we need more fragment(s).
1350 */
1351 datalen = length + fraggap;
1352 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1353 datalen = maxfraglen - fragheaderlen;
1354
1355 fraglen = datalen + fragheaderlen;
1356 if ((flags & MSG_MORE) &&
1357 !(rt->dst.dev->features&NETIF_F_SG))
1358 alloclen = mtu;
1359 else
1360 alloclen = datalen + fragheaderlen;
1361
1362 /*
1363 * The last fragment gets additional space at tail.
1364 * Note: we overallocate on fragments with MSG_MORE
1365 * because we have no idea if we're the last one.
1366 */
1367 if (datalen == length + fraggap)
1368 alloclen += rt->dst.trailer_len;
1369
1370 /*
1371 * We just reserve space for fragment header.
1372 * Note: this may be overallocation if the message
1373 * (without MSG_MORE) fits into the MTU.
1374 */
1375 alloclen += sizeof(struct frag_hdr);
1376
1377 if (transhdrlen) {
1378 skb = sock_alloc_send_skb(sk,
1379 alloclen + hh_len,
1380 (flags & MSG_DONTWAIT), &err);
1381 } else {
1382 skb = NULL;
1383 if (atomic_read(&sk->sk_wmem_alloc) <=
1384 2 * sk->sk_sndbuf)
1385 skb = sock_wmalloc(sk,
1386 alloclen + hh_len, 1,
1387 sk->sk_allocation);
1388 if (unlikely(skb == NULL))
1389 err = -ENOBUFS;
1390 else {
1391 /* Only the initial fragment
1392 * is time stamped.
1393 */
1394 tx_flags = 0;
1395 }
1396 }
1397 if (skb == NULL)
1398 goto error;
1399 /*
1400 * Fill in the control structures
1401 */
1402 skb->ip_summed = csummode;
1403 skb->csum = 0;
1404 /* reserve for fragmentation */
1405 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1406
1407 if (sk->sk_type == SOCK_DGRAM)
1408 skb_shinfo(skb)->tx_flags = tx_flags;
1409
1410 /*
1411 * Find where to start putting bytes
1412 */
1413 data = skb_put(skb, fraglen);
1414 skb_set_network_header(skb, exthdrlen);
1415 data += fragheaderlen;
1416 skb->transport_header = (skb->network_header +
1417 fragheaderlen);
1418 if (fraggap) {
1419 skb->csum = skb_copy_and_csum_bits(
1420 skb_prev, maxfraglen,
1421 data + transhdrlen, fraggap, 0);
1422 skb_prev->csum = csum_sub(skb_prev->csum,
1423 skb->csum);
1424 data += fraggap;
1425 pskb_trim_unique(skb_prev, maxfraglen);
1426 }
1427 copy = datalen - transhdrlen - fraggap;
1428 if (copy < 0) {
1429 err = -EINVAL;
1430 kfree_skb(skb);
1431 goto error;
1432 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1433 err = -EFAULT;
1434 kfree_skb(skb);
1435 goto error;
1436 }
1437
1438 offset += copy;
1439 length -= datalen - fraggap;
1440 transhdrlen = 0;
1441 exthdrlen = 0;
1442 csummode = CHECKSUM_NONE;
1443
1444 /*
1445 * Put the packet on the pending queue
1446 */
1447 __skb_queue_tail(&sk->sk_write_queue, skb);
1448 continue;
1449 }
1450
1451 if (copy > length)
1452 copy = length;
1453
1454 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1455 unsigned int off;
1456
1457 off = skb->len;
1458 if (getfrag(from, skb_put(skb, copy),
1459 offset, copy, off, skb) < 0) {
1460 __skb_trim(skb, off);
1461 err = -EFAULT;
1462 goto error;
1463 }
1464 } else {
1465 int i = skb_shinfo(skb)->nr_frags;
1466 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1467 struct page *page = sk->sk_sndmsg_page;
1468 int off = sk->sk_sndmsg_off;
1469 unsigned int left;
1470
1471 if (page && (left = PAGE_SIZE - off) > 0) {
1472 if (copy >= left)
1473 copy = left;
1474 if (page != frag->page) {
1475 if (i == MAX_SKB_FRAGS) {
1476 err = -EMSGSIZE;
1477 goto error;
1478 }
1479 get_page(page);
1480 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1481 frag = &skb_shinfo(skb)->frags[i];
1482 }
1483 } else if (i < MAX_SKB_FRAGS) {
1484 if (copy > PAGE_SIZE)
1485 copy = PAGE_SIZE;
1486 page = alloc_pages(sk->sk_allocation, 0);
1487 if (page == NULL) {
1488 err = -ENOMEM;
1489 goto error;
1490 }
1491 sk->sk_sndmsg_page = page;
1492 sk->sk_sndmsg_off = 0;
1493
1494 skb_fill_page_desc(skb, i, page, 0, 0);
1495 frag = &skb_shinfo(skb)->frags[i];
1496 } else {
1497 err = -EMSGSIZE;
1498 goto error;
1499 }
1500 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1501 err = -EFAULT;
1502 goto error;
1503 }
1504 sk->sk_sndmsg_off += copy;
1505 frag->size += copy;
1506 skb->len += copy;
1507 skb->data_len += copy;
1508 skb->truesize += copy;
1509 atomic_add(copy, &sk->sk_wmem_alloc);
1510 }
1511 offset += copy;
1512 length -= copy;
1513 }
1514 return 0;
1515 error:
1516 cork->length -= length;
1517 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1518 return err;
1519 }
1520
1521 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1522 {
1523 if (np->cork.opt) {
1524 kfree(np->cork.opt->dst0opt);
1525 kfree(np->cork.opt->dst1opt);
1526 kfree(np->cork.opt->hopopt);
1527 kfree(np->cork.opt->srcrt);
1528 kfree(np->cork.opt);
1529 np->cork.opt = NULL;
1530 }
1531
1532 if (inet->cork.base.dst) {
1533 dst_release(inet->cork.base.dst);
1534 inet->cork.base.dst = NULL;
1535 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1536 }
1537 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1538 }
1539
1540 int ip6_push_pending_frames(struct sock *sk)
1541 {
1542 struct sk_buff *skb, *tmp_skb;
1543 struct sk_buff **tail_skb;
1544 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1545 struct inet_sock *inet = inet_sk(sk);
1546 struct ipv6_pinfo *np = inet6_sk(sk);
1547 struct net *net = sock_net(sk);
1548 struct ipv6hdr *hdr;
1549 struct ipv6_txoptions *opt = np->cork.opt;
1550 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1551 struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1552 unsigned char proto = fl6->flowi6_proto;
1553 int err = 0;
1554
1555 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1556 goto out;
1557 tail_skb = &(skb_shinfo(skb)->frag_list);
1558
1559 /* move skb->data to ip header from ext header */
1560 if (skb->data < skb_network_header(skb))
1561 __skb_pull(skb, skb_network_offset(skb));
1562 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1563 __skb_pull(tmp_skb, skb_network_header_len(skb));
1564 *tail_skb = tmp_skb;
1565 tail_skb = &(tmp_skb->next);
1566 skb->len += tmp_skb->len;
1567 skb->data_len += tmp_skb->len;
1568 skb->truesize += tmp_skb->truesize;
1569 tmp_skb->destructor = NULL;
1570 tmp_skb->sk = NULL;
1571 }
1572
1573 /* Allow local fragmentation. */
1574 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1575 skb->local_df = 1;
1576
1577 ipv6_addr_copy(final_dst, &fl6->daddr);
1578 __skb_pull(skb, skb_network_header_len(skb));
1579 if (opt && opt->opt_flen)
1580 ipv6_push_frag_opts(skb, opt, &proto);
1581 if (opt && opt->opt_nflen)
1582 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1583
1584 skb_push(skb, sizeof(struct ipv6hdr));
1585 skb_reset_network_header(skb);
1586 hdr = ipv6_hdr(skb);
1587
1588 *(__be32*)hdr = fl6->flowlabel |
1589 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1590
1591 hdr->hop_limit = np->cork.hop_limit;
1592 hdr->nexthdr = proto;
1593 ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
1594 ipv6_addr_copy(&hdr->daddr, final_dst);
1595
1596 skb->priority = sk->sk_priority;
1597 skb->mark = sk->sk_mark;
1598
1599 skb_dst_set(skb, dst_clone(&rt->dst));
1600 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1601 if (proto == IPPROTO_ICMPV6) {
1602 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1603
1604 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1605 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1606 }
1607
1608 err = ip6_local_out(skb);
1609 if (err) {
1610 if (err > 0)
1611 err = net_xmit_errno(err);
1612 if (err)
1613 goto error;
1614 }
1615
1616 out:
1617 ip6_cork_release(inet, np);
1618 return err;
1619 error:
1620 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1621 goto out;
1622 }
1623
1624 void ip6_flush_pending_frames(struct sock *sk)
1625 {
1626 struct sk_buff *skb;
1627
1628 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1629 if (skb_dst(skb))
1630 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1631 IPSTATS_MIB_OUTDISCARDS);
1632 kfree_skb(skb);
1633 }
1634
1635 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1636 }