gre: Move MTU setting out of ipgre_tunnel_bind_dev
net/ipv4/ip_gre.c
/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

#ifdef CONFIG_IPV6
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   with infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is the best
   solution, but it requires maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: the t->recursion lock breaks dead loops. It looks
   like the dev->tbusy flag, but I preferred a new variable, because
   the semantics are different. One day, when hard_start_xmit
   becomes multithreaded, we will have to use skb->encapsulation.
   (A minimal sketch of the guard follows this comment.)



   2. Networking dead loops would not kill routers, but would really
   kill the network. The IP hop limit plays the role of "t->recursion"
   in this case, if we copy it from the packet being encapsulated to the
   upper header. It is a very good solution, but it introduces two problems:

   - Routing protocols using packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea turned out to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all routers (at least, in my neighbourhood)
     return only 8 bytes of payload. That is the end of it.

   Hence, if we want OSPF to work, or traceroute to say something
   reasonable, we should search for another solution.

   One of them is to parse the packet, trying to detect inner encapsulation
   made by our node. That is difficult or even impossible, especially
   taking fragmentation into account. To be short, it is no solution at all.

   Current solution: the solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed pmtu are pruned) and the tunnel mtu
   quickly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when the encapsulated packets have DF set.
   But it is not our problem! Nobody could accuse us, we made
   all that we could. Even if it is your gated that injected the
   fatal route to the network, even if it were you who configured the
   fatal static route: you are innocent. :-)



   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not obvious how to make them modular.
   sit is an integral part of IPv6, while ipip and gre are naturally modular.
   We could extract the common parts (hash table, ioctl etc.)
   into a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */
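
/*
 * A minimal sketch of the recursion guard described in (1) above, as it
 * appears in ipgre_tunnel_xmit() later in this file (the names are the
 * driver's own):
 *
 *	if (tunnel->recursion++) {
 *		stats->collisions++;	// local dead loop detected
 *		goto tx_error;		// drop instead of transmitting again
 *	}
 *	...transmit...
 *	tunnel->recursion--;
 *
 * Every exit path decrements the counter; otherwise the device would
 * wrongly appear busy forever after the first broken loop.
 */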

static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

static int ipgre_fb_tunnel_init(struct net_device *dev);

#define HASH_SIZE 16

static int ipgre_net_id;
struct ipgre_net {
	struct ip_tunnel *tunnels[4][HASH_SIZE];

	struct net_device *fb_tunnel_dev;
};

/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched to a configured keyless tunnel,
   will match the fallback tunnel.
 */

#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
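
/*
 * Example: HASH() folds the value onto 16 buckets by XORing adjacent
 * nibbles. For the raw value 0x12345678:
 * 0x12345678 ^ (0x12345678 >> 4) = 0x12345678 ^ 0x01234567 = 0x1317131f,
 * and & 0xF selects bucket 15.
 */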

#define tunnels_r_l	tunnels[3]
#define tunnels_r	tunnels[2]
#define tunnels_l	tunnels[1]
#define tunnels_wc	tunnels[0]

static DEFINE_RWLOCK(ipgre_lock);

/* Given src, dst and key, find the appropriate tunnel for input. */

static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
		__be32 remote, __be32 local, __be32 key)
{
	unsigned h0 = HASH(remote);
	unsigned h1 = HASH(key);
	struct ip_tunnel *t;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
		if (remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	for (t = ign->tunnels_l[h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr ||
		    (local == t->parms.iph.daddr &&
		     ipv4_is_multicast(local))) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	for (t = ign->tunnels_wc[h1]; t; t = t->next) {
		if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
			return t;
	}

	if (ign->fb_tunnel_dev->flags&IFF_UP)
		return netdev_priv(ign->fb_tunnel_dev);
	return NULL;
}

static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel_parm *parms)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	unsigned h = HASH(key);
	int prio = 0;

	if (local)
		prio |= 1;
	if (remote && !ipv4_is_multicast(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}

	return &ign->tunnels[prio][h];
}

static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}

static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel **tp = ipgre_bucket(ign, t);

	t->next = *tp;
	write_lock_bh(&ipgre_lock);
	*tp = t;
	write_unlock_bh(&ipgre_lock);
}

static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel **tp;

	for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
		if (t == *tp) {
			write_lock_bh(&ipgre_lock);
			*tp = t->next;
			write_unlock_bh(&ipgre_lock);
			break;
		}
	}
}

static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	struct ip_tunnel *t, **tp, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (key == t->parms.i_key)
				return t;
		}
	}
	if (!create)
		return NULL;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		sprintf(name, "gre%%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	if (strchr(name, '%')) {
		if (dev_alloc_name(dev, name) < 0)
			goto failed_free;
	}

	dev->init = ipgre_tunnel_init;
	nt = netdev_priv(dev);
	nt->parms = *parms;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}

static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}


static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put the GRE key in the third word
   of the GRE header. That makes it impossible to maintain even soft
   state for keyed GRE tunnels with enabled checksum. Tell them
   "thank you".

   Well, I wonder, rfc1812 was written by a Cisco employee;
   why the hell do these idiots break the standards established
   by themselves???
 */

	struct iphdr *iph = (struct iphdr*)skb->data;
	__be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	read_lock(&ipgre_lock);
	t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
				(flags&GRE_KEY) ?
				*(((__be32*)p) + (grehlen>>2) - 1) : 0);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	read_unlock(&ipgre_lock);
	return;
}

static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
{
	if (INET_ECN_is_ce(iph->tos)) {
		if (skb->protocol == htons(ETH_P_IP)) {
			IP_ECN_set_ce(ip_hdr(skb));
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			IP6_ECN_set_ce(ipv6_hdr(skb));
		}
	}
}

static inline u8
ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
{
	u8 inner = 0;
	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
	return INET_ECN_encapsulate(tos, inner);
}
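
/*
 * Worked example (assuming the standard INET_ECN_encapsulate() helper
 * from <net/inet_ecn.h>): an outer tos of 0x00 carrying an inner
 * ECT(0) packet (tos 0x02) yields an outer TOS of 0x02, i.e. the DSCP
 * comes from the tunnel configuration while the ECN bits follow the
 * inner header.
 */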

static int ipgre_rcv(struct sk_buff *skb)
{
	struct iphdr *iph;
	u8 *h;
	__be16 flags;
	__sum16 csum = 0;
	__be32 key = 0;
	u32 seqno = 0;
	struct ip_tunnel *tunnel;
	int offset = 4;

	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32*)(h + offset));
			offset += 4;
		}
	}

	read_lock(&ipgre_lock);
	if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
					iph->saddr, iph->daddr, key)) != NULL) {
		struct net_device_stats *stats = &tunnel->dev->stats;

		secpath_reset(skb);

		skb->protocol = *(__be16*)(h + 2);
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
		 */
		if (flags == 0 &&
		    skb->protocol == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_reset_network_header(skb);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (skb->rtable->fl.iif == 0)
				goto drop;
			stats->multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			stats->rx_crc_errors++;
			stats->rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				stats->rx_fifo_errors++;
				stats->rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}
		stats->rx_packets++;
		stats->rx_bytes += skb->len;
		skb->dev = tunnel->dev;
		dst_release(skb->dst);
		skb->dst = NULL;
		nf_reset(skb);
		ipgre_ecn_decapsulate(iph, skb);
		netif_rx(skb);
		read_unlock(&ipgre_lock);
		return 0;
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	read_unlock(&ipgre_lock);
drop_nolock:
	kfree_skb(skb);
	return 0;
}
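
/*
 * For reference, the header layout that ipgre_rcv() above walks
 * (offsets in bytes from the start of the GRE header, per RFC 1701/2890;
 * each optional field is present only if its flag bit is set):
 *
 *	0	flags/version (2)	2	protocol (2)
 *	4	checksum + reserved (4, GRE_CSUM)
 *	next	key (4, GRE_KEY)
 *	next	sequence number (4, GRE_SEQ)
 *
 * which is why 'offset' starts at 4 and grows by 4 for each flag seen.
 */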

static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net_device_stats *stats = &tunnel->dev->stats;
	struct iphdr *old_iph = ip_hdr(skb);
	struct iphdr *tiph;
	u8 tos;
	__be16 df;
	struct rtable *rt;			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int gre_hlen;
	__be32 dst;
	int mtu;

	if (tunnel->recursion++) {
		stats->collisions++;
		goto tx_error;
	}

	if (dev->header_ops) {
		gre_hlen = 0;
		tiph = (struct iphdr*)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb->dst == NULL) {
			stats->tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb->rtable;
			if ((dst = rt->rt_gateway) == 0)
				goto tx_error_icmp;
		}
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			struct in6_addr *addr6;
			int addr_type;
			struct neighbour *neigh = skb->dst->neighbour;

			if (neigh == NULL)
				goto tx_error;

			addr6 = (struct in6_addr*)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				goto tx_error_icmp;

			dst = addr6->s6_addr32[3];
		}
#endif
		else
			goto tx_error;
	}

	tos = tiph->tos;
	if (tos&1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		tos &= ~1;
	}

	{
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = dst,
						.saddr = tiph->saddr,
						.tos = RT_TOS(tos) } },
				    .proto = IPPROTO_GRE };
		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
			stats->tx_carrier_errors++;
			goto tx_error;
		}
	}
	tdev = rt->u.dst.dev;

	if (tdev == dev) {
		ip_rt_put(rt);
		stats->collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;

	if (skb->dst)
		skb->dst->ops->update_pmtu(skb->dst, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#ifdef CONFIG_IPV6
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info*)skb->dst;

		if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				skb->dst->metrics[RTAX_MTU-1] = mtu;
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	if (tunnel->err_count > 0) {
		if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;

	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			ip_rt_put(rt);
			stats->tx_dropped++;
			dev_kfree_skb(skb);
			tunnel->recursion--;
			return 0;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb->transport_header = skb->network_header;
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/*
	 *	Push down and install the new outer IP header.
	 */

	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = sizeof(struct iphdr) >> 2;
	iph->frag_off = df;
	iph->protocol = IPPROTO_GRE;
	iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr = rt->rt_dst;
	iph->saddr = rt->rt_src;

	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
#endif
		else
			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
	}

	((__be16*)(iph+1))[0] = tunnel->parms.o_flags;
	((__be16*)(iph+1))[1] = skb->protocol;

	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);

	IPTUNNEL_XMIT();
	tunnel->recursion--;
	return 0;

tx_error_icmp:
	dst_link_failure(skb);

tx_error:
	stats->tx_errors++;
	dev_kfree_skb(skb);
	tunnel->recursion--;
	return 0;
}

static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.saddr = iph->saddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
			tdev = rt->u.dst.dev;
			ip_rt_put(rt);
		}
		dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len - addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
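
/*
 * Note that ipgre_tunnel_bind_dev() above only computes the MTU; callers
 * apply it themselves, e.g. in ipgre_tunnel_locate() and in the
 * SIOCCHGTUNNEL path of ipgre_tunnel_ioctl() below:
 *
 *	dev->mtu = ipgre_tunnel_bind_dev(dev);
 */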

static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				ipgre_tunnel_unlink(ign, t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}

static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
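
/*
 * Example of the bound above: with GRE_KEY and GRE_CSUM enabled,
 * tunnel->hlen is sizeof(struct iphdr) + 12 = 32 bytes, so the largest
 * accepted new_mtu is 0xFFF8 - dev->hard_header_len - 32.
 */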

/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could have made something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to the mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, the mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...

 */

static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16*)(iph+1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0] = t->parms.o_flags;
	p[1] = htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);

	if (daddr) {
		memcpy(&iph->daddr, daddr, 4);
		return t->hlen;
	}
	if (iph->daddr && !ipv4_is_multicast(iph->daddr))
		return t->hlen;

	return -t->hlen;
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);
	return 4;
}

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi fl = { .oif = t->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = t->parms.iph.daddr,
						.saddr = t->parms.iph.saddr,
						.tos = RT_TOS(t->parms.iph.tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (ip_route_output_key(dev_net(dev), &rt, &fl))
			return -EADDRNOTAVAIL;
		dev = rt->u.dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
		if (in_dev) {
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
			in_dev_put(in_dev);
		}
	}
	return 0;
}

#endif

static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->uninit		= ipgre_tunnel_uninit;
	dev->destructor		= free_netdev;
	dev->hard_start_xmit	= ipgre_tunnel_xmit;
	dev->do_ioctl		= ipgre_tunnel_ioctl;
	dev->change_mtu		= ipgre_tunnel_change_mtu;

	dev->type		= ARPHRD_IPGRE;
	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}

static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
			dev->open = ipgre_open;
			dev->stop = ipgre_close;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	return 0;
}

static int ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
	ign->tunnels_wc[0]	= tunnel;
	return 0;
}


static struct net_protocol ipgre_protocol = {
	.handler	= ipgre_rcv,
	.err_handler	= ipgre_err,
	.netns_ok	= 1,
};

static void ipgre_destroy_tunnels(struct ipgre_net *ign)
{
	int prio;

	for (prio = 0; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;
			while ((t = ign->tunnels[prio][h]) != NULL)
				unregister_netdevice(t->dev);
		}
	}
}

static int ipgre_init_net(struct net *net)
{
	int err;
	struct ipgre_net *ign;

	err = -ENOMEM;
	ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
	if (ign == NULL)
		goto err_alloc;

	err = net_assign_generic(net, ipgre_net_id, ign);
	if (err < 0)
		goto err_assign;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					  ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}

	ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
	dev_net_set(ign->fb_tunnel_dev, net);

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	return 0;

err_reg_dev:
	free_netdev(ign->fb_tunnel_dev);
err_alloc_dev:
	/* nothing */
err_assign:
	kfree(ign);
err_alloc:
	return err;
}

static void ipgre_exit_net(struct net *net)
{
	struct ipgre_net *ign;

	ign = net_generic(net, ipgre_net_id);
	rtnl_lock();
	ipgre_destroy_tunnels(ign);
	rtnl_unlock();
	kfree(ign);
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
};

/*
 *	And now the modules code and kernel interface.
 */

static int __init ipgre_init(void)
{
	int err;

	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
		printk(KERN_INFO "ipgre init: can't add protocol\n");
		return -EAGAIN;
	}

	err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
	if (err < 0)
		inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);

	return err;
}

static void __exit ipgre_fini(void)
{
	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");

	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");