net/ipv4/ip_output.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The Internet Protocol (IP) output module.
7 *
8 * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Donald Becker, <becker@super.org>
13 * Alan Cox, <Alan.Cox@linux.org>
14 * Richard Underwood
15 * Stefan Becker, <stefanb@yello.ping.de>
16 * Jorge Cwik, <jorge@laser.satlink.net>
17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18 * Hirokazu Takahashi, <taka@valinux.co.jp>
19 *
20 * See ip_input.c for original log
21 *
22 * Fixes:
23 * Alan Cox : Missing nonblock feature in ip_build_xmit.
24 * Mike Kilburn : htons() missing in ip_build_xmit.
25 * Bradford Johnson: Fix faulty handling of some frames when
26 * no route is found.
27 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
28 * (in case the packet is not accepted by
29 * output firewall rules)
30 * Mike McLagan : Routing by source
31 * Alexey Kuznetsov: use new route cache
32 * Andi Kleen: Fix broken PMTU recovery and remove
33 * some redundant tests.
34 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
35 * Andi Kleen : Replace ip_reply with ip_send_reply.
36 * Andi Kleen : Split fast and slow ip_build_xmit path
37 * for decreased register pressure on x86
38 * and more readability.
39 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
40 * silently drop skb instead of failing with -EPERM.
41 * Detlev Wengorz : Copy protocol for fragments.
42 * Hirokazu Takahashi: HW checksumming for outgoing UDP
43 * datagrams.
44 * Hirokazu Takahashi: sendfile() on UDP works now.
45 */
46
47 #include <asm/uaccess.h>
48 #include <asm/system.h>
49 #include <linux/module.h>
50 #include <linux/types.h>
51 #include <linux/kernel.h>
52 #include <linux/sched.h>
53 #include <linux/mm.h>
54 #include <linux/string.h>
55 #include <linux/errno.h>
56 #include <linux/config.h>
57
58 #include <linux/socket.h>
59 #include <linux/sockios.h>
60 #include <linux/in.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/etherdevice.h>
64 #include <linux/proc_fs.h>
65 #include <linux/stat.h>
66 #include <linux/init.h>
67
68 #include <net/snmp.h>
69 #include <net/ip.h>
70 #include <net/protocol.h>
71 #include <net/route.h>
72 #include <net/tcp.h>
73 #include <net/udp.h>
74 #include <linux/skbuff.h>
75 #include <net/sock.h>
76 #include <net/arp.h>
77 #include <net/icmp.h>
78 #include <net/raw.h>
79 #include <net/checksum.h>
80 #include <net/inetpeer.h>
81 #include <net/checksum.h>
82 #include <linux/igmp.h>
83 #include <linux/netfilter_ipv4.h>
84 #include <linux/netfilter_bridge.h>
85 #include <linux/mroute.h>
86 #include <linux/netlink.h>
87
88 /*
89 * Shall we try to damage output packets if routing dev changes?
90 */
91
92 int sysctl_ip_dynaddr;
93 int sysctl_ip_default_ttl = IPDEFTTL;
94
95 /* Generate a checksum for an outgoing IP datagram. */
96 __inline__ void ip_send_check(struct iphdr *iph)
97 {
98 iph->check = 0;
99 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
100 }
101
102 /* dev_loopback_xmit for use with netfilter. */
103 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
104 {
105 newskb->mac.raw = newskb->data;
106 __skb_pull(newskb, newskb->nh.raw - newskb->data);
107 newskb->pkt_type = PACKET_LOOPBACK;
108 newskb->ip_summed = CHECKSUM_UNNECESSARY;
109 BUG_TRAP(newskb->dst);
110 netif_rx(newskb);
111 return 0;
112 }
113
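/* Choose the TTL for a unicast packet: use the socket's IP_TTL setting if
 * one was given, otherwise fall back to the route's hop-limit metric. */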
114 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
115 {
116 int ttl = inet->uc_ttl;
117
118 if (ttl < 0)
119 ttl = dst_metric(dst, RTAX_HOPLIMIT);
120 return ttl;
121 }
122
123 /*
124 * Add an ip header to a skbuff and send it out.
125 *
126 */
127 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
128 u32 saddr, u32 daddr, struct ip_options *opt)
129 {
130 struct inet_sock *inet = inet_sk(sk);
131 struct rtable *rt = (struct rtable *)skb->dst;
132 struct iphdr *iph;
133
134 /* Build the IP header. */
135 if (opt)
136 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
137 else
138 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
139
140 iph->version = 4;
141 iph->ihl = 5;
142 iph->tos = inet->tos;
143 if (ip_dont_fragment(sk, &rt->u.dst))
144 iph->frag_off = htons(IP_DF);
145 else
146 iph->frag_off = 0;
147 iph->ttl = ip_select_ttl(inet, &rt->u.dst);
148 iph->daddr = rt->rt_dst;
149 iph->saddr = rt->rt_src;
150 iph->protocol = sk->sk_protocol;
151 iph->tot_len = htons(skb->len);
152 ip_select_ident(iph, &rt->u.dst, sk);
153 skb->nh.iph = iph;
154
155 if (opt && opt->optlen) {
156 iph->ihl += opt->optlen>>2;
157 ip_options_build(skb, opt, daddr, rt, 0);
158 }
159 ip_send_check(iph);
160
161 skb->priority = sk->sk_priority;
162
163 /* Send it out. */
164 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
165 dst_output);
166 }
167
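/* Last step of output: make sure the skb has enough headroom for the
 * link-layer header, then hand it to the cached hard header path
 * (hh->hh_output) or to the neighbour's output function. */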
168 static inline int ip_finish_output2(struct sk_buff *skb)
169 {
170 struct dst_entry *dst = skb->dst;
171 struct hh_cache *hh = dst->hh;
172 struct net_device *dev = dst->dev;
173 int hh_len = LL_RESERVED_SPACE(dev);
174
175 /* Be paranoid, rather than too clever. */
176 if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
177 struct sk_buff *skb2;
178
179 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
180 if (skb2 == NULL) {
181 kfree_skb(skb);
182 return -ENOMEM;
183 }
184 if (skb->sk)
185 skb_set_owner_w(skb2, skb->sk);
186 kfree_skb(skb);
187 skb = skb2;
188 }
189
190 if (hh) {
191 int hh_alen;
192
193 read_lock_bh(&hh->hh_lock);
194 hh_alen = HH_DATA_ALIGN(hh->hh_len);
195 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
196 read_unlock_bh(&hh->hh_lock);
197 skb_push(skb, hh->hh_len);
198 return hh->hh_output(skb);
199 } else if (dst->neighbour)
200 return dst->neighbour->output(skb);
201
202 if (net_ratelimit())
203 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
204 kfree_skb(skb);
205 return -EINVAL;
206 }
207
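/* Set the output device and protocol, then run the POST_ROUTING netfilter
 * hook; ip_finish_output2() does the link-layer work. */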
208 int ip_finish_output(struct sk_buff *skb)
209 {
210 struct net_device *dev = skb->dst->dev;
211
212 skb->dev = dev;
213 skb->protocol = htons(ETH_P_IP);
214
215 return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
216 ip_finish_output2);
217 }
218
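/* Output path for multicast/broadcast packets: loop a copy back to local
 * listeners when required, clone a copy for broadcast delivery, and
 * fragment if the packet exceeds the path MTU. */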
219 int ip_mc_output(struct sk_buff *skb)
220 {
221 struct sock *sk = skb->sk;
222 struct rtable *rt = (struct rtable*)skb->dst;
223 struct net_device *dev = rt->u.dst.dev;
224
225 /*
226 * If the indicated interface is up and running, send the packet.
227 */
228 IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
229
230 skb->dev = dev;
231 skb->protocol = htons(ETH_P_IP);
232
233 /*
234 * Multicasts are looped back for other local users
235 */
236
237 if (rt->rt_flags&RTCF_MULTICAST) {
238 if ((!sk || inet_sk(sk)->mc_loop)
239 #ifdef CONFIG_IP_MROUTE
240 /* Small optimization: do not loop back non-local frames
241 that have returned after forwarding; they will be dropped
242 by ip_mr_input in any case.
243 Note that local frames are looped back to be delivered
244 to local recipients.
245
246 This check is duplicated in ip_mr_input at the moment.
247 */
248 && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
249 #endif
250 ) {
251 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
252 if (newskb)
253 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
254 newskb->dev,
255 ip_dev_loopback_xmit);
256 }
257
258 /* Multicasts with ttl 0 must not go beyond the host */
259
260 if (skb->nh.iph->ttl == 0) {
261 kfree_skb(skb);
262 return 0;
263 }
264 }
265
266 if (rt->rt_flags&RTCF_BROADCAST) {
267 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
268 if (newskb)
269 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
270 newskb->dev, ip_dev_loopback_xmit);
271 }
272
273 if (skb->len > dst_mtu(&rt->u.dst))
274 return ip_fragment(skb, ip_finish_output);
275 else
276 return ip_finish_output(skb);
277 }
278
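/* Standard output path for unicast packets: fragment when the packet is
 * larger than the path MTU and TSO is not being used. */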
279 int ip_output(struct sk_buff *skb)
280 {
281 IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
282
283 if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
284 return ip_fragment(skb, ip_finish_output);
285 else
286 return ip_finish_output(skb);
287 }
288
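/* Transmit a packet on a connected socket: route it (unless the caller,
 * e.g. SCTP, routed it already), build the IP header and push the result
 * through the LOCAL_OUT netfilter hook. Returns -EHOSTUNREACH when no
 * route can be found. */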
289 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
290 {
291 struct sock *sk = skb->sk;
292 struct inet_sock *inet = inet_sk(sk);
293 struct ip_options *opt = inet->opt;
294 struct rtable *rt;
295 struct iphdr *iph;
296
297 /* Skip all of this if the packet is already routed,
298 * e.g. by something like SCTP.
299 */
300 rt = (struct rtable *) skb->dst;
301 if (rt != NULL)
302 goto packet_routed;
303
304 /* Make sure we can route this packet. */
305 rt = (struct rtable *)__sk_dst_check(sk, 0);
306 if (rt == NULL) {
307 u32 daddr;
308
309 /* Use correct destination address if we have options. */
310 daddr = inet->daddr;
311 if(opt && opt->srr)
312 daddr = opt->faddr;
313
314 {
315 struct flowi fl = { .oif = sk->sk_bound_dev_if,
316 .nl_u = { .ip4_u =
317 { .daddr = daddr,
318 .saddr = inet->saddr,
319 .tos = RT_CONN_FLAGS(sk) } },
320 .proto = sk->sk_protocol,
321 .uli_u = { .ports =
322 { .sport = inet->sport,
323 .dport = inet->dport } } };
324
325 /* If this fails, the transport layer's retransmit mechanism will
326 * keep trying until a route appears or the connection times
327 * itself out.
328 */
329 if (ip_route_output_flow(&rt, &fl, sk, 0))
330 goto no_route;
331 }
332 __sk_dst_set(sk, &rt->u.dst);
333 tcp_v4_setup_caps(sk, &rt->u.dst);
334 }
335 skb->dst = dst_clone(&rt->u.dst);
336
337 packet_routed:
338 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
339 goto no_route;
340
341 /* OK, we know where to send it, allocate and build IP header. */
342 iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
343 *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
344 iph->tot_len = htons(skb->len);
345 if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
346 iph->frag_off = htons(IP_DF);
347 else
348 iph->frag_off = 0;
349 iph->ttl = ip_select_ttl(inet, &rt->u.dst);
350 iph->protocol = sk->sk_protocol;
351 iph->saddr = rt->rt_src;
352 iph->daddr = rt->rt_dst;
353 skb->nh.iph = iph;
354 /* The transport layer sets skb->h.foo itself. */
355
356 if (opt && opt->optlen) {
357 iph->ihl += opt->optlen >> 2;
358 ip_options_build(skb, opt, inet->daddr, rt, 0);
359 }
360
361 ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
362
363 /* Add an IP checksum. */
364 ip_send_check(iph);
365
366 skb->priority = sk->sk_priority;
367
368 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
369 dst_output);
370
371 no_route:
372 IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
373 kfree_skb(skb);
374 return -EHOSTUNREACH;
375 }
376
377
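/* Copy per-packet metadata (packet type, priority, dst, device, netfilter
 * and traffic-control state) from the original skb to a fragment. */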
378 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
379 {
380 to->pkt_type = from->pkt_type;
381 to->priority = from->priority;
382 to->protocol = from->protocol;
383 dst_release(to->dst);
384 to->dst = dst_clone(from->dst);
385 to->dev = from->dev;
386
387 /* Copy the flags to each fragment. */
388 IPCB(to)->flags = IPCB(from)->flags;
389
390 #ifdef CONFIG_NET_SCHED
391 to->tc_index = from->tc_index;
392 #endif
393 #ifdef CONFIG_NETFILTER
394 to->nfmark = from->nfmark;
395 to->nfcache = from->nfcache;
396 /* Connection association is same as pre-frag packet */
397 nf_conntrack_put(to->nfct);
398 to->nfct = from->nfct;
399 nf_conntrack_get(to->nfct);
400 to->nfctinfo = from->nfctinfo;
401 #ifdef CONFIG_BRIDGE_NETFILTER
402 nf_bridge_put(to->nf_bridge);
403 to->nf_bridge = from->nf_bridge;
404 nf_bridge_get(to->nf_bridge);
405 #endif
406 #endif
407 }
408
409 /*
410 * This IP datagram is too large to be sent in one piece. Break it up into
411 * smaller pieces (each of a size equal to the IP header plus
412 * a block of the data of the original IP data part) that will still fit in a
413 * single device frame, and queue each such frame for sending.
414 */
415
416 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
417 {
418 struct iphdr *iph;
419 int raw = 0;
420 int ptr;
421 struct net_device *dev;
422 struct sk_buff *skb2;
423 unsigned int mtu, hlen, left, len, ll_rs;
424 int offset;
425 int not_last_frag;
426 struct rtable *rt = (struct rtable*)skb->dst;
427 int err = 0;
428
429 dev = rt->u.dst.dev;
430
431 /*
432 * Point into the IP datagram header.
433 */
434
435 iph = skb->nh.iph;
436
437 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
438 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
439 htonl(dst_mtu(&rt->u.dst)));
440 kfree_skb(skb);
441 return -EMSGSIZE;
442 }
443
444 /*
445 * Setup starting values.
446 */
447
448 hlen = iph->ihl * 4;
449 mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */
450
451 /* When frag_list is given, use it. First, check its validity:
452 * some transformers could create a wrong frag_list or break an existing
453 * one; that is not prohibited. In this case, fall back to copying.
454 *
455 * LATER: this step can be merged into the real generation of fragments;
456 * we can switch to copying when we see the first bad fragment.
457 */
458 if (skb_shinfo(skb)->frag_list) {
459 struct sk_buff *frag;
460 int first_len = skb_pagelen(skb);
461
462 if (first_len - hlen > mtu ||
463 ((first_len - hlen) & 7) ||
464 (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
465 skb_cloned(skb))
466 goto slow_path;
467
468 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
469 /* Correct geometry. */
470 if (frag->len > mtu ||
471 ((frag->len & 7) && frag->next) ||
472 skb_headroom(frag) < hlen)
473 goto slow_path;
474
475 /* Partially cloned skb? */
476 if (skb_shared(frag))
477 goto slow_path;
478
479 BUG_ON(frag->sk);
480 if (skb->sk) {
481 sock_hold(skb->sk);
482 frag->sk = skb->sk;
483 frag->destructor = sock_wfree;
484 skb->truesize -= frag->truesize;
485 }
486 }
487
488 /* Everything is OK. Generate! */
489
490 err = 0;
491 offset = 0;
492 frag = skb_shinfo(skb)->frag_list;
493 skb_shinfo(skb)->frag_list = NULL;
494 skb->data_len = first_len - skb_headlen(skb);
495 skb->len = first_len;
496 iph->tot_len = htons(first_len);
497 iph->frag_off = htons(IP_MF);
498 ip_send_check(iph);
499
500 for (;;) {
501 /* Prepare the header of the next frame
502 * before the previous one goes down. */
503 if (frag) {
504 frag->ip_summed = CHECKSUM_NONE;
505 frag->h.raw = frag->data;
506 frag->nh.raw = __skb_push(frag, hlen);
507 memcpy(frag->nh.raw, iph, hlen);
508 iph = frag->nh.iph;
509 iph->tot_len = htons(frag->len);
510 ip_copy_metadata(frag, skb);
511 if (offset == 0)
512 ip_options_fragment(frag);
513 offset += skb->len - hlen;
514 iph->frag_off = htons(offset>>3);
515 if (frag->next != NULL)
516 iph->frag_off |= htons(IP_MF);
517 /* Ready, complete checksum */
518 ip_send_check(iph);
519 }
520
521 err = output(skb);
522
523 if (err || !frag)
524 break;
525
526 skb = frag;
527 frag = skb->next;
528 skb->next = NULL;
529 }
530
531 if (err == 0) {
532 IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
533 return 0;
534 }
535
536 while (frag) {
537 skb = frag->next;
538 kfree_skb(frag);
539 frag = skb;
540 }
541 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
542 return err;
543 }
544
545 slow_path:
546 left = skb->len - hlen; /* Space per frame */
547 ptr = raw + hlen; /* Where to start from */
548
549 #ifdef CONFIG_BRIDGE_NETFILTER
550 /* For bridged IP traffic encapsulated inside e.g. a VLAN header,
551 * we need to make room for the encapsulating header */
552 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
553 mtu -= nf_bridge_pad(skb);
554 #else
555 ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
556 #endif
557 /*
558 * Fragment the datagram.
559 */
560
561 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
562 not_last_frag = iph->frag_off & htons(IP_MF);
563
564 /*
565 * Keep copying data until we run out.
566 */
567
568 while(left > 0) {
569 len = left;
570 /* IF: it doesn't fit, use 'mtu' - the data space left */
571 if (len > mtu)
572 len = mtu;
573 /* IF: we are not sending up to and including the packet end
574 then align the next start on an eight byte boundary */
575 if (len < left) {
576 len &= ~7;
577 }
578 /*
579 * Allocate buffer.
580 */
581
582 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
583 NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
584 err = -ENOMEM;
585 goto fail;
586 }
587
588 /*
589 * Set up data on packet
590 */
591
592 ip_copy_metadata(skb2, skb);
593 skb_reserve(skb2, ll_rs);
594 skb_put(skb2, len + hlen);
595 skb2->nh.raw = skb2->data;
596 skb2->h.raw = skb2->data + hlen;
597
598 /*
599 * Charge the memory for the fragment to any owner
600 * it might possess
601 */
602
603 if (skb->sk)
604 skb_set_owner_w(skb2, skb->sk);
605
606 /*
607 * Copy the packet header into the new buffer.
608 */
609
610 memcpy(skb2->nh.raw, skb->data, hlen);
611
612 /*
613 * Copy a block of the IP datagram.
614 */
615 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
616 BUG();
617 left -= len;
618
619 /*
620 * Fill in the new header fields.
621 */
622 iph = skb2->nh.iph;
623 iph->frag_off = htons((offset >> 3));
624
625 /* ANK: dirty, but effective trick. Upgrade options only if
626 * the segment to be fragmented was THE FIRST (otherwise,
627 * options are already fixed) and make it ONCE
628 * on the initial skb, so that all the following fragments
629 * will inherit fixed options.
630 */
631 if (offset == 0)
632 ip_options_fragment(skb);
633
634 /*
635 * Added AC: If we are fragmenting a fragment that's not the
636 * last fragment, then keep the MF bit set on each fragment.
637 */
638 if (left > 0 || not_last_frag)
639 iph->frag_off |= htons(IP_MF);
640 ptr += len;
641 offset += len;
642
643 /*
644 * Put this fragment into the sending queue.
645 */
646
647 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
648
649 iph->tot_len = htons(len + hlen);
650
651 ip_send_check(iph);
652
653 err = output(skb2);
654 if (err)
655 goto fail;
656 }
657 kfree_skb(skb);
658 IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
659 return err;
660
661 fail:
662 kfree_skb(skb);
663 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
664 return err;
665 }
666
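/* getfrag() callback used by ip_append_data() for user iovecs: copy the
 * data into the skb and compute the checksum on the fly unless the
 * hardware will checksum the packet (CHECKSUM_HW). */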
667 int
668 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
669 {
670 struct iovec *iov = from;
671
672 if (skb->ip_summed == CHECKSUM_HW) {
673 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
674 return -EFAULT;
675 } else {
676 unsigned int csum = 0;
677 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
678 return -EFAULT;
679 skb->csum = csum_block_add(skb->csum, csum, odd);
680 }
681 return 0;
682 }
683
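/* Checksum 'copy' bytes of a page starting at 'offset', mapping the page
 * temporarily with kmap(). */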
684 static inline unsigned int
685 csum_page(struct page *page, int offset, int copy)
686 {
687 char *kaddr;
688 unsigned int csum;
689 kaddr = kmap(page);
690 csum = csum_partial(kaddr + offset, copy, 0);
691 kunmap(page);
692 return csum;
693 }
694
695 /*
696 * ip_append_data() and ip_append_page() can make one large IP datagram
697 * from many pieces of data. Each piece will be held on the socket
698 * until ip_push_pending_frames() is called. Each piece can be a page
699 * or non-page data.
700 *
701 * Not only UDP but also other transport protocols - e.g. raw sockets -
702 * can potentially use this interface.
703 *
704 * LATER: length must be adjusted by pad at tail, when it is required.
705 */
706 int ip_append_data(struct sock *sk,
707 int getfrag(void *from, char *to, int offset, int len,
708 int odd, struct sk_buff *skb),
709 void *from, int length, int transhdrlen,
710 struct ipcm_cookie *ipc, struct rtable *rt,
711 unsigned int flags)
712 {
713 struct inet_sock *inet = inet_sk(sk);
714 struct sk_buff *skb;
715
716 struct ip_options *opt = NULL;
717 int hh_len;
718 int exthdrlen;
719 int mtu;
720 int copy;
721 int err;
722 int offset = 0;
723 unsigned int maxfraglen, fragheaderlen;
724 int csummode = CHECKSUM_NONE;
725
726 if (flags&MSG_PROBE)
727 return 0;
728
729 if (skb_queue_empty(&sk->sk_write_queue)) {
730 /*
731 * setup for corking.
732 */
733 opt = ipc->opt;
734 if (opt) {
735 if (inet->cork.opt == NULL) {
736 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
737 if (unlikely(inet->cork.opt == NULL))
738 return -ENOBUFS;
739 }
740 memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
741 inet->cork.flags |= IPCORK_OPT;
742 inet->cork.addr = ipc->addr;
743 }
744 dst_hold(&rt->u.dst);
745 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
746 inet->cork.rt = rt;
747 inet->cork.length = 0;
748 sk->sk_sndmsg_page = NULL;
749 sk->sk_sndmsg_off = 0;
750 if ((exthdrlen = rt->u.dst.header_len) != 0) {
751 length += exthdrlen;
752 transhdrlen += exthdrlen;
753 }
754 } else {
755 rt = inet->cork.rt;
756 if (inet->cork.flags & IPCORK_OPT)
757 opt = inet->cork.opt;
758
759 transhdrlen = 0;
760 exthdrlen = 0;
761 mtu = inet->cork.fragsize;
762 }
763 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
764
765 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
766 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
767
768 if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
769 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
770 return -EMSGSIZE;
771 }
772
773 /*
774 * transhdrlen > 0 means that this is the first fragment and we wish
775 * it not to be fragmented later.
776 */
777 if (transhdrlen &&
778 length + fragheaderlen <= mtu &&
779 rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
780 !exthdrlen)
781 csummode = CHECKSUM_HW;
782
783 inet->cork.length += length;
784
785 /* So, what's going on in the loop below?
786 *
787 * We use the calculated fragment length to generate a chained skb;
788 * each of its segments is an IP fragment ready to be sent to the network
789 * after the appropriate IP header is added.
790 */
791
792 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
793 goto alloc_new_skb;
794
795 while (length > 0) {
796 /* Check if the remaining data fits into current packet. */
797 copy = mtu - skb->len;
798 if (copy < length)
799 copy = maxfraglen - skb->len;
800 if (copy <= 0) {
801 char *data;
802 unsigned int datalen;
803 unsigned int fraglen;
804 unsigned int fraggap;
805 unsigned int alloclen;
806 struct sk_buff *skb_prev;
807 alloc_new_skb:
808 skb_prev = skb;
809 if (skb_prev)
810 fraggap = skb_prev->len - maxfraglen;
811 else
812 fraggap = 0;
813
814 /*
815 * If remaining data exceeds the mtu,
816 * we know we need more fragment(s).
817 */
818 datalen = length + fraggap;
819 if (datalen > mtu - fragheaderlen)
820 datalen = maxfraglen - fragheaderlen;
821 fraglen = datalen + fragheaderlen;
822
823 if ((flags & MSG_MORE) &&
824 !(rt->u.dst.dev->features&NETIF_F_SG))
825 alloclen = mtu;
826 else
827 alloclen = datalen + fragheaderlen;
828
829 /* The last fragment gets additional space at tail.
830 * Note, with MSG_MORE we overallocate on fragments,
831 * because we have no idea what fragment will be
832 * the last.
833 */
834 if (datalen == length)
835 alloclen += rt->u.dst.trailer_len;
836
837 if (transhdrlen) {
838 skb = sock_alloc_send_skb(sk,
839 alloclen + hh_len + 15,
840 (flags & MSG_DONTWAIT), &err);
841 } else {
842 skb = NULL;
843 if (atomic_read(&sk->sk_wmem_alloc) <=
844 2 * sk->sk_sndbuf)
845 skb = sock_wmalloc(sk,
846 alloclen + hh_len + 15, 1,
847 sk->sk_allocation);
848 if (unlikely(skb == NULL))
849 err = -ENOBUFS;
850 }
851 if (skb == NULL)
852 goto error;
853
854 /*
855 * Fill in the control structures
856 */
857 skb->ip_summed = csummode;
858 skb->csum = 0;
859 skb_reserve(skb, hh_len);
860
861 /*
862 * Find where to start putting bytes.
863 */
864 data = skb_put(skb, fraglen);
865 skb->nh.raw = data + exthdrlen;
866 data += fragheaderlen;
867 skb->h.raw = data + exthdrlen;
868
869 if (fraggap) {
870 skb->csum = skb_copy_and_csum_bits(
871 skb_prev, maxfraglen,
872 data + transhdrlen, fraggap, 0);
873 skb_prev->csum = csum_sub(skb_prev->csum,
874 skb->csum);
875 data += fraggap;
876 skb_trim(skb_prev, maxfraglen);
877 }
878
879 copy = datalen - transhdrlen - fraggap;
880 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
881 err = -EFAULT;
882 kfree_skb(skb);
883 goto error;
884 }
885
886 offset += copy;
887 length -= datalen - fraggap;
888 transhdrlen = 0;
889 exthdrlen = 0;
890 csummode = CHECKSUM_NONE;
891
892 /*
893 * Put the packet on the pending queue.
894 */
895 __skb_queue_tail(&sk->sk_write_queue, skb);
896 continue;
897 }
898
899 if (copy > length)
900 copy = length;
901
902 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
903 unsigned int off;
904
905 off = skb->len;
906 if (getfrag(from, skb_put(skb, copy),
907 offset, copy, off, skb) < 0) {
908 __skb_trim(skb, off);
909 err = -EFAULT;
910 goto error;
911 }
912 } else {
913 int i = skb_shinfo(skb)->nr_frags;
914 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
915 struct page *page = sk->sk_sndmsg_page;
916 int off = sk->sk_sndmsg_off;
917 unsigned int left;
918
919 if (page && (left = PAGE_SIZE - off) > 0) {
920 if (copy >= left)
921 copy = left;
922 if (page != frag->page) {
923 if (i == MAX_SKB_FRAGS) {
924 err = -EMSGSIZE;
925 goto error;
926 }
927 get_page(page);
928 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
929 frag = &skb_shinfo(skb)->frags[i];
930 }
931 } else if (i < MAX_SKB_FRAGS) {
932 if (copy > PAGE_SIZE)
933 copy = PAGE_SIZE;
934 page = alloc_pages(sk->sk_allocation, 0);
935 if (page == NULL) {
936 err = -ENOMEM;
937 goto error;
938 }
939 sk->sk_sndmsg_page = page;
940 sk->sk_sndmsg_off = 0;
941
942 skb_fill_page_desc(skb, i, page, 0, 0);
943 frag = &skb_shinfo(skb)->frags[i];
944 skb->truesize += PAGE_SIZE;
945 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
946 } else {
947 err = -EMSGSIZE;
948 goto error;
949 }
950 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
951 err = -EFAULT;
952 goto error;
953 }
954 sk->sk_sndmsg_off += copy;
955 frag->size += copy;
956 skb->len += copy;
957 skb->data_len += copy;
958 }
959 offset += copy;
960 length -= copy;
961 }
962
963 return 0;
964
965 error:
966 inet->cork.length -= length;
967 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
968 return err;
969 }
970
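/* Scatter-gather companion of ip_append_data(): append a page fragment to
 * the socket's pending queue without copying. Requires a device with
 * NETIF_F_SG and an already started (corked) datagram. */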
971 ssize_t ip_append_page(struct sock *sk, struct page *page,
972 int offset, size_t size, int flags)
973 {
974 struct inet_sock *inet = inet_sk(sk);
975 struct sk_buff *skb;
976 struct rtable *rt;
977 struct ip_options *opt = NULL;
978 int hh_len;
979 int mtu;
980 int len;
981 int err;
982 unsigned int maxfraglen, fragheaderlen, fraggap;
983
984 if (inet->hdrincl)
985 return -EPERM;
986
987 if (flags&MSG_PROBE)
988 return 0;
989
990 if (skb_queue_empty(&sk->sk_write_queue))
991 return -EINVAL;
992
993 rt = inet->cork.rt;
994 if (inet->cork.flags & IPCORK_OPT)
995 opt = inet->cork.opt;
996
997 if (!(rt->u.dst.dev->features&NETIF_F_SG))
998 return -EOPNOTSUPP;
999
1000 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1001 mtu = inet->cork.fragsize;
1002
1003 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1004 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1005
1006 if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1007 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1008 return -EMSGSIZE;
1009 }
1010
1011 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1012 return -EINVAL;
1013
1014 inet->cork.length += size;
1015
1016 while (size > 0) {
1017 int i;
1018
1019 /* Check if the remaining data fits into current packet. */
1020 len = mtu - skb->len;
1021 if (len < size)
1022 len = maxfraglen - skb->len;
1023 if (len <= 0) {
1024 struct sk_buff *skb_prev;
1025 char *data;
1026 struct iphdr *iph;
1027 int alloclen;
1028
1029 skb_prev = skb;
1030 if (skb_prev)
1031 fraggap = skb_prev->len - maxfraglen;
1032 else
1033 fraggap = 0;
1034
1035 alloclen = fragheaderlen + hh_len + fraggap + 15;
1036 skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1037 if (unlikely(!skb)) {
1038 err = -ENOBUFS;
1039 goto error;
1040 }
1041
1042 /*
1043 * Fill in the control structures
1044 */
1045 skb->ip_summed = CHECKSUM_NONE;
1046 skb->csum = 0;
1047 skb_reserve(skb, hh_len);
1048
1049 /*
1050 * Find where to start putting bytes.
1051 */
1052 data = skb_put(skb, fragheaderlen + fraggap);
1053 skb->nh.iph = iph = (struct iphdr *)data;
1054 data += fragheaderlen;
1055 skb->h.raw = data;
1056
1057 if (fraggap) {
1058 skb->csum = skb_copy_and_csum_bits(
1059 skb_prev, maxfraglen,
1060 data, fraggap, 0);
1061 skb_prev->csum = csum_sub(skb_prev->csum,
1062 skb->csum);
1063 skb_trim(skb_prev, maxfraglen);
1064 }
1065
1066 /*
1067 * Put the packet on the pending queue.
1068 */
1069 __skb_queue_tail(&sk->sk_write_queue, skb);
1070 continue;
1071 }
1072
1073 i = skb_shinfo(skb)->nr_frags;
1074 if (len > size)
1075 len = size;
1076 if (skb_can_coalesce(skb, i, page, offset)) {
1077 skb_shinfo(skb)->frags[i-1].size += len;
1078 } else if (i < MAX_SKB_FRAGS) {
1079 get_page(page);
1080 skb_fill_page_desc(skb, i, page, offset, len);
1081 } else {
1082 err = -EMSGSIZE;
1083 goto error;
1084 }
1085
1086 if (skb->ip_summed == CHECKSUM_NONE) {
1087 unsigned int csum;
1088 csum = csum_page(page, offset, len);
1089 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1090 }
1091
1092 skb->len += len;
1093 skb->data_len += len;
1094 offset += len;
1095 size -= len;
1096 }
1097 return 0;
1098
1099 error:
1100 inet->cork.length -= size;
1101 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1102 return err;
1103 }
1104
1105 /*
1106 * Combine all pending IP fragments on the socket into one IP datagram
1107 * and push them out.
1108 */
1109 int ip_push_pending_frames(struct sock *sk)
1110 {
1111 struct sk_buff *skb, *tmp_skb;
1112 struct sk_buff **tail_skb;
1113 struct inet_sock *inet = inet_sk(sk);
1114 struct ip_options *opt = NULL;
1115 struct rtable *rt = inet->cork.rt;
1116 struct iphdr *iph;
1117 int df = 0;
1118 __u8 ttl;
1119 int err = 0;
1120
1121 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1122 goto out;
1123 tail_skb = &(skb_shinfo(skb)->frag_list);
1124
1125 /* move skb->data to ip header from ext header */
1126 if (skb->data < skb->nh.raw)
1127 __skb_pull(skb, skb->nh.raw - skb->data);
1128 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1129 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1130 *tail_skb = tmp_skb;
1131 tail_skb = &(tmp_skb->next);
1132 skb->len += tmp_skb->len;
1133 skb->data_len += tmp_skb->len;
1134 skb->truesize += tmp_skb->truesize;
1135 __sock_put(tmp_skb->sk);
1136 tmp_skb->destructor = NULL;
1137 tmp_skb->sk = NULL;
1138 }
1139
1140 /* Unless the user demanded real PMTU discovery (IP_PMTUDISC_DO), we allow
1141 * the frame generated here to be fragmented. No matter how transforms
1142 * change the size of the packet, it will go out.
1143 */
1144 if (inet->pmtudisc != IP_PMTUDISC_DO)
1145 skb->local_df = 1;
1146
1147 /* DF bit is set when we want to see DF on outgoing frames.
1148 * If local_df is set too, we still allow this frame to be fragmented
1149 * locally. */
1150 if (inet->pmtudisc == IP_PMTUDISC_DO ||
1151 (skb->len <= dst_mtu(&rt->u.dst) &&
1152 ip_dont_fragment(sk, &rt->u.dst)))
1153 df = htons(IP_DF);
1154
1155 if (inet->cork.flags & IPCORK_OPT)
1156 opt = inet->cork.opt;
1157
1158 if (rt->rt_type == RTN_MULTICAST)
1159 ttl = inet->mc_ttl;
1160 else
1161 ttl = ip_select_ttl(inet, &rt->u.dst);
1162
1163 iph = (struct iphdr *)skb->data;
1164 iph->version = 4;
1165 iph->ihl = 5;
1166 if (opt) {
1167 iph->ihl += opt->optlen>>2;
1168 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1169 }
1170 iph->tos = inet->tos;
1171 iph->tot_len = htons(skb->len);
1172 iph->frag_off = df;
1173 if (!df) {
1174 __ip_select_ident(iph, &rt->u.dst, 0);
1175 } else {
1176 iph->id = htons(inet->id++);
1177 }
1178 iph->ttl = ttl;
1179 iph->protocol = sk->sk_protocol;
1180 iph->saddr = rt->rt_src;
1181 iph->daddr = rt->rt_dst;
1182 ip_send_check(iph);
1183
1184 skb->priority = sk->sk_priority;
1185 skb->dst = dst_clone(&rt->u.dst);
1186
1187 /* Netfilter gets the whole, unfragmented skb. */
1188 err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
1189 skb->dst->dev, dst_output);
1190 if (err) {
1191 if (err > 0)
1192 err = inet->recverr ? net_xmit_errno(err) : 0;
1193 if (err)
1194 goto error;
1195 }
1196
1197 out:
1198 inet->cork.flags &= ~IPCORK_OPT;
1199 if (inet->cork.opt) {
1200 kfree(inet->cork.opt);
1201 inet->cork.opt = NULL;
1202 }
1203 if (inet->cork.rt) {
1204 ip_rt_put(inet->cork.rt);
1205 inet->cork.rt = NULL;
1206 }
1207 return err;
1208
1209 error:
1210 IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1211 goto out;
1212 }
1213
1214 /*
1215 * Throw away all pending data on the socket.
1216 */
1217 void ip_flush_pending_frames(struct sock *sk)
1218 {
1219 struct inet_sock *inet = inet_sk(sk);
1220 struct sk_buff *skb;
1221
1222 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1223 kfree_skb(skb);
1224
1225 inet->cork.flags &= ~IPCORK_OPT;
1226 if (inet->cork.opt) {
1227 kfree(inet->cork.opt);
1228 inet->cork.opt = NULL;
1229 }
1230 if (inet->cork.rt) {
1231 ip_rt_put(inet->cork.rt);
1232 inet->cork.rt = NULL;
1233 }
1234 }
1235
1236
1237 /*
1238 * Fetch data from kernel space and fill in checksum if needed.
1239 */
1240 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1241 int len, int odd, struct sk_buff *skb)
1242 {
1243 unsigned int csum;
1244
1245 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1246 skb->csum = csum_block_add(skb->csum, csum, odd);
1247 return 0;
1248 }
1249
1250 /*
1251 * Generic function to send a packet as reply to another packet.
1252 * Used to send TCP resets so far. ICMP should use this function too.
1253 *
1254 * Should run single threaded per socket because it uses the sock
1255 * structure to pass arguments.
1256 *
1257 * LATER: switch from ip_build_xmit to ip_append_*
1258 */
1259 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1260 unsigned int len)
1261 {
1262 struct inet_sock *inet = inet_sk(sk);
1263 struct {
1264 struct ip_options opt;
1265 char data[40];
1266 } replyopts;
1267 struct ipcm_cookie ipc;
1268 u32 daddr;
1269 struct rtable *rt = (struct rtable*)skb->dst;
1270
1271 if (ip_options_echo(&replyopts.opt, skb))
1272 return;
1273
1274 daddr = ipc.addr = rt->rt_src;
1275 ipc.opt = NULL;
1276
1277 if (replyopts.opt.optlen) {
1278 ipc.opt = &replyopts.opt;
1279
1280 if (ipc.opt->srr)
1281 daddr = replyopts.opt.faddr;
1282 }
1283
1284 {
1285 struct flowi fl = { .nl_u = { .ip4_u =
1286 { .daddr = daddr,
1287 .saddr = rt->rt_spec_dst,
1288 .tos = RT_TOS(skb->nh.iph->tos) } },
1289 /* Not quite clean, but right. */
1290 .uli_u = { .ports =
1291 { .sport = skb->h.th->dest,
1292 .dport = skb->h.th->source } },
1293 .proto = sk->sk_protocol };
1294 if (ip_route_output_key(&rt, &fl))
1295 return;
1296 }
1297
1298 /* And let IP do all the hard work.
1299
1300 This chunk is not reentrant, hence the spinlock.
1301 Note that it relies on the fact that this function is called
1302 with BH disabled locally and that sk cannot already be spinlocked.
1303 */
1304 bh_lock_sock(sk);
1305 inet->tos = skb->nh.iph->tos;
1306 sk->sk_priority = skb->priority;
1307 sk->sk_protocol = skb->nh.iph->protocol;
1308 ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1309 &ipc, rt, MSG_DONTWAIT);
1310 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1311 if (arg->csumoffset >= 0)
1312 *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
1313 skb->ip_summed = CHECKSUM_NONE;
1314 ip_push_pending_frames(sk);
1315 }
1316
1317 bh_unlock_sock(sk);
1318
1319 ip_rt_put(rt);
1320 }
1321
1322 void __init ip_init(void)
1323 {
1324 ip_rt_init();
1325 inet_initpeers();
1326
1327 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1328 igmp_mc_proc_init();
1329 #endif
1330 }
1331
1332 EXPORT_SYMBOL(ip_finish_output);
1333 EXPORT_SYMBOL(ip_fragment);
1334 EXPORT_SYMBOL(ip_generic_getfrag);
1335 EXPORT_SYMBOL(ip_queue_xmit);
1336 EXPORT_SYMBOL(ip_send_check);
1337
1338 #ifdef CONFIG_SYSCTL
1339 EXPORT_SYMBOL(sysctl_ip_default_ttl);
1340 #endif