1 /*
2 * Linux NET3: GRE over IP protocol decoder.
3 *
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 */
12
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
15 #include <linux/capability.h>
16 #include <linux/module.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/slab.h>
20 #include <asm/uaccess.h>
21 #include <linux/skbuff.h>
22 #include <linux/netdevice.h>
23 #include <linux/in.h>
24 #include <linux/tcp.h>
25 #include <linux/udp.h>
26 #include <linux/if_arp.h>
27 #include <linux/mroute.h>
28 #include <linux/init.h>
29 #include <linux/in6.h>
30 #include <linux/inetdevice.h>
31 #include <linux/igmp.h>
32 #include <linux/netfilter_ipv4.h>
33 #include <linux/etherdevice.h>
34 #include <linux/if_ether.h>
35
36 #include <net/sock.h>
37 #include <net/ip.h>
38 #include <net/icmp.h>
39 #include <net/protocol.h>
40 #include <net/ipip.h>
41 #include <net/arp.h>
42 #include <net/checksum.h>
43 #include <net/dsfield.h>
44 #include <net/inet_ecn.h>
45 #include <net/xfrm.h>
46 #include <net/net_namespace.h>
47 #include <net/netns/generic.h>
48 #include <net/rtnetlink.h>
49 #include <net/gre.h>
50
51 #if IS_ENABLED(CONFIG_IPV6)
52 #include <net/ipv6.h>
53 #include <net/ip6_fib.h>
54 #include <net/ip6_route.h>
55 #endif
56
57 /*
58 Problems & solutions
59 --------------------
60
61   1. The most important issue is detecting local dead loops.
62   They would cause a complete host lockup in transmit, which
63   would be "resolved" by a stack overflow or, if queueing is enabled,
64   by infinite looping in net_bh.
65
66   We cannot track such dead loops during route installation;
67   it is an infeasible task. The most general solution would be
68   to keep an skb->encapsulation counter (a sort of local ttl)
69   and silently drop the packet when it expires. It is a good
70   solution, but it requires maintaining a new variable in ALL
71   skbs, even if no tunneling is used.
72
73   Current solution: xmit_recursion breaks dead loops. This is a percpu
74   counter, since when we enter the first ndo_start_xmit(), cpu migration
75   is forbidden. We force an exit if this counter reaches RECURSION_LIMIT.
76
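   The guard looks roughly like this (a sketch of the idea only; the
   real counter and limit live in net/core/dev.c, not in this file):

	if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
		goto drop;
	__this_cpu_inc(xmit_recursion);
	rc = dev_hard_start_xmit(skb, dev, txq);
	__this_cpu_dec(xmit_recursion);
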
77   2. Networking dead loops would not kill routers, but they would really
78   kill the network. The IP hop limit plays the role of "t->recursion" here,
79   if we copy it from the packet being encapsulated to the upper header.
80   It is a very good solution, but it introduces two problems:
81
82   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
83   do not work over tunnels.
84   - traceroute does not work. I planned to relay ICMP from the tunnel,
85   so that this problem would be solved and the traceroute output
86   would be even more informative. This idea appeared to be wrong:
87   only Linux complies with rfc1812 now (yes, guys, Linux is the only
88   true router now :-)); all routers (at least, in my neighbourhood)
89   return only 8 bytes of payload. That is the end of it.
90
91   Hence, if we want OSPF to work or traceroute to say something reasonable,
92   we should search for another solution.
93
94   One of them is to parse the packet, trying to detect an inner encapsulation
95   made by our node. It is difficult or even impossible, especially
96   taking fragmentation into account. In short, ttl is not a solution at all.
97
98   Current solution: The solution was UNEXPECTEDLY SIMPLE.
99   We force the DF flag on tunnels with a preconfigured hop limit;
100   that is ALL. :-) Well, it does not remove the problem completely,
101   but the exponential growth of network traffic is changed to linear
102   (branches that exceed the pmtu are pruned) and the tunnel mtu
103   rapidly degrades to a value <68, where the looping stops.
104   Yes, it is not good if there exists a router in the loop
105   that does not force DF, even when the packets being encapsulated have DF set.
106   But it is not our problem! Nobody could accuse us; we did
107   all that we could. Even if it was your gated that injected the
108   fatal route into the network, even if it was you who configured the
109   fatal static route: you are innocent. :-)
110
111
112
113   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
114   practically identical code. It would be good to glue them
115   together, but it is not obvious how to make them modular.
116   sit is an integral part of IPv6, while ipip and gre are naturally
117   modular. We could extract the common parts (hash table, ioctl etc.)
118   into a separate module (ip_tunnel.c).
119
120 Alexey Kuznetsov.
121 */
122
123 static bool log_ecn_error = true;
124 module_param(log_ecn_error, bool, 0644);
125 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
126
127 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
128 static int ipgre_tunnel_init(struct net_device *dev);
129 static void ipgre_tunnel_setup(struct net_device *dev);
130 static int ipgre_tunnel_bind_dev(struct net_device *dev);
131
132 /* Fallback tunnel: no source, no destination, no key, no options */
133
134 #define HASH_SIZE 16
135
136 static int ipgre_net_id __read_mostly;
137 struct ipgre_net {
138 struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
139
140 struct net_device *fb_tunnel_dev;
141 };
142
143 /* Tunnel hash table */
144
145 /*
146 4 hash tables:
147
148 3: (remote,local)
149 2: (remote,*)
150 1: (*,local)
151 0: (*,*)
152
153   We require an exact key match, i.e. if a key is present in the packet
154   it will match only a tunnel with the same key; if it is not present,
155   it will match only a keyless tunnel.
156
157   All keyless packets, if not matched by a configured keyless tunnel,
158   will match the fallback tunnel.
159 */
160
161 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
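/* A worked illustration (assuming a little-endian host, where the low
 * nibbles of the u32 come from the first octet of the address): for
 * 10.66.66.66 the first octet is 0x0a, so HASH() = (0xa ^ 0x0) & 0xF = 0xa.
 * Only 4 bits of entropy, which is fine for the 16-entry tables here.
 */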
162
163 #define tunnels_r_l tunnels[3]
164 #define tunnels_r tunnels[2]
165 #define tunnels_l tunnels[1]
166 #define tunnels_wc tunnels[0]
167 /*
168 * Locking : hash tables are protected by RCU and RTNL
169 */
170
171 #define for_each_ip_tunnel_rcu(start) \
172 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
173
174 /* Often-modified stats are per-CPU; others are shared (netdev->stats) */
175 struct pcpu_tstats {
176 u64 rx_packets;
177 u64 rx_bytes;
178 u64 tx_packets;
179 u64 tx_bytes;
180 struct u64_stats_sync syncp;
181 };
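/* On 32-bit hosts the 64-bit counters above cannot be read atomically;
 * the syncp seqcount lets ipgre_get_stats64() below retry a torn read.
 * On 64-bit builds the fetch_begin/fetch_retry helpers compile away.
 */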
182
183 static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
184 struct rtnl_link_stats64 *tot)
185 {
186 int i;
187
188 for_each_possible_cpu(i) {
189 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
190 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
191 unsigned int start;
192
193 do {
194 start = u64_stats_fetch_begin_bh(&tstats->syncp);
195 rx_packets = tstats->rx_packets;
196 tx_packets = tstats->tx_packets;
197 rx_bytes = tstats->rx_bytes;
198 tx_bytes = tstats->tx_bytes;
199 } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
200
201 tot->rx_packets += rx_packets;
202 tot->tx_packets += tx_packets;
203 tot->rx_bytes += rx_bytes;
204 tot->tx_bytes += tx_bytes;
205 }
206
207 tot->multicast = dev->stats.multicast;
208 tot->rx_crc_errors = dev->stats.rx_crc_errors;
209 tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
210 tot->rx_length_errors = dev->stats.rx_length_errors;
211 tot->rx_frame_errors = dev->stats.rx_frame_errors;
212 tot->rx_errors = dev->stats.rx_errors;
213
214 tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
215 tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
216 tot->tx_dropped = dev->stats.tx_dropped;
217 tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
218 tot->tx_errors = dev->stats.tx_errors;
219
220 return tot;
221 }
222
223 /* Does the key in the tunnel parameters match the packet? */
224 static bool ipgre_key_match(const struct ip_tunnel_parm *p,
225 __be16 flags, __be32 key)
226 {
227 if (p->i_flags & GRE_KEY) {
228 if (flags & GRE_KEY)
229 return key == p->i_key;
230 else
231 return false; /* key expected, none present */
232 } else
233 return !(flags & GRE_KEY);
234 }
235
236 /* Given src, dst and key, find the appropriate input tunnel. */
237
238 static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
239 __be32 remote, __be32 local,
240 __be16 flags, __be32 key,
241 __be16 gre_proto)
242 {
243 struct net *net = dev_net(dev);
244 int link = dev->ifindex;
245 unsigned int h0 = HASH(remote);
246 unsigned int h1 = HASH(key);
247 struct ip_tunnel *t, *cand = NULL;
248 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
249 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
250 ARPHRD_ETHER : ARPHRD_IPGRE;
251 int score, cand_score = 4;
252
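	/* Scan from the most specific table to the least specific one.
	 * An exact match on link and device type (score 0) wins
	 * immediately; otherwise remember the closest partial match,
	 * where bit 0 marks a link mismatch and bit 1 a device-type
	 * mismatch (cand_score 4 means "no candidate yet").
	 */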
253 for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
254 if (local != t->parms.iph.saddr ||
255 remote != t->parms.iph.daddr ||
256 !(t->dev->flags & IFF_UP))
257 continue;
258
259 if (!ipgre_key_match(&t->parms, flags, key))
260 continue;
261
262 if (t->dev->type != ARPHRD_IPGRE &&
263 t->dev->type != dev_type)
264 continue;
265
266 score = 0;
267 if (t->parms.link != link)
268 score |= 1;
269 if (t->dev->type != dev_type)
270 score |= 2;
271 if (score == 0)
272 return t;
273
274 if (score < cand_score) {
275 cand = t;
276 cand_score = score;
277 }
278 }
279
280 for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
281 if (remote != t->parms.iph.daddr ||
282 !(t->dev->flags & IFF_UP))
283 continue;
284
285 if (!ipgre_key_match(&t->parms, flags, key))
286 continue;
287
288 if (t->dev->type != ARPHRD_IPGRE &&
289 t->dev->type != dev_type)
290 continue;
291
292 score = 0;
293 if (t->parms.link != link)
294 score |= 1;
295 if (t->dev->type != dev_type)
296 score |= 2;
297 if (score == 0)
298 return t;
299
300 if (score < cand_score) {
301 cand = t;
302 cand_score = score;
303 }
304 }
305
306 for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
307 if ((local != t->parms.iph.saddr &&
308 (local != t->parms.iph.daddr ||
309 !ipv4_is_multicast(local))) ||
310 !(t->dev->flags & IFF_UP))
311 continue;
312
313 if (!ipgre_key_match(&t->parms, flags, key))
314 continue;
315
316 if (t->dev->type != ARPHRD_IPGRE &&
317 t->dev->type != dev_type)
318 continue;
319
320 score = 0;
321 if (t->parms.link != link)
322 score |= 1;
323 if (t->dev->type != dev_type)
324 score |= 2;
325 if (score == 0)
326 return t;
327
328 if (score < cand_score) {
329 cand = t;
330 cand_score = score;
331 }
332 }
333
334 for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
335 if (t->parms.i_key != key ||
336 !(t->dev->flags & IFF_UP))
337 continue;
338
339 if (t->dev->type != ARPHRD_IPGRE &&
340 t->dev->type != dev_type)
341 continue;
342
343 score = 0;
344 if (t->parms.link != link)
345 score |= 1;
346 if (t->dev->type != dev_type)
347 score |= 2;
348 if (score == 0)
349 return t;
350
351 if (score < cand_score) {
352 cand = t;
353 cand_score = score;
354 }
355 }
356
357 if (cand != NULL)
358 return cand;
359
360 dev = ign->fb_tunnel_dev;
361 if (dev->flags & IFF_UP)
362 return netdev_priv(dev);
363
364 return NULL;
365 }
366
367 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
368 struct ip_tunnel_parm *parms)
369 {
370 __be32 remote = parms->iph.daddr;
371 __be32 local = parms->iph.saddr;
372 __be32 key = parms->i_key;
373 unsigned int h = HASH(key);
374 int prio = 0;
375
376 if (local)
377 prio |= 1;
378 if (remote && !ipv4_is_multicast(remote)) {
379 prio |= 2;
380 h ^= HASH(remote);
381 }
382
383 return &ign->tunnels[prio][h];
384 }
385
386 static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
387 struct ip_tunnel *t)
388 {
389 return __ipgre_bucket(ign, &t->parms);
390 }
391
392 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
393 {
394 struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
395
396 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
397 rcu_assign_pointer(*tp, t);
398 }
399
400 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
401 {
402 struct ip_tunnel __rcu **tp;
403 struct ip_tunnel *iter;
404
405 for (tp = ipgre_bucket(ign, t);
406 (iter = rtnl_dereference(*tp)) != NULL;
407 tp = &iter->next) {
408 if (t == iter) {
409 rcu_assign_pointer(*tp, t->next);
410 break;
411 }
412 }
413 }
414
415 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
416 struct ip_tunnel_parm *parms,
417 int type)
418 {
419 __be32 remote = parms->iph.daddr;
420 __be32 local = parms->iph.saddr;
421 __be32 key = parms->i_key;
422 int link = parms->link;
423 struct ip_tunnel *t;
424 struct ip_tunnel __rcu **tp;
425 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
426
427 for (tp = __ipgre_bucket(ign, parms);
428 (t = rtnl_dereference(*tp)) != NULL;
429 tp = &t->next)
430 if (local == t->parms.iph.saddr &&
431 remote == t->parms.iph.daddr &&
432 key == t->parms.i_key &&
433 link == t->parms.link &&
434 type == t->dev->type)
435 break;
436
437 return t;
438 }
439
440 static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
441 struct ip_tunnel_parm *parms, int create)
442 {
443 struct ip_tunnel *t, *nt;
444 struct net_device *dev;
445 char name[IFNAMSIZ];
446 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
447
448 t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
449 if (t || !create)
450 return t;
451
452 if (parms->name[0])
453 strlcpy(name, parms->name, IFNAMSIZ);
454 else
455 strcpy(name, "gre%d");
456
457 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
458 if (!dev)
459 return NULL;
460
461 dev_net_set(dev, net);
462
463 nt = netdev_priv(dev);
464 nt->parms = *parms;
465 dev->rtnl_link_ops = &ipgre_link_ops;
466
467 dev->mtu = ipgre_tunnel_bind_dev(dev);
468
469 if (register_netdevice(dev) < 0)
470 goto failed_free;
471
472 /* Can use a lockless transmit, unless we generate output sequences */
473 if (!(nt->parms.o_flags & GRE_SEQ))
474 dev->features |= NETIF_F_LLTX;
475
476 dev_hold(dev);
477 ipgre_tunnel_link(ign, nt);
478 return nt;
479
480 failed_free:
481 free_netdev(dev);
482 return NULL;
483 }
484
485 static void ipgre_tunnel_uninit(struct net_device *dev)
486 {
487 struct net *net = dev_net(dev);
488 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
489
490 ipgre_tunnel_unlink(ign, netdev_priv(dev));
491 dev_put(dev);
492 }
493
494
495 static void ipgre_err(struct sk_buff *skb, u32 info)
496 {
497
498 /* All the routers (except for Linux) return only
499    8 bytes of packet payload. It means that precise relaying of
500    ICMP in the real Internet is absolutely infeasible.
501
502    Moreover, Cisco "wise men" put the GRE key in the third word
503    of the GRE header. That makes it impossible to maintain even soft state
504    for keyed GRE tunnels with checksums enabled. Tell them "thank you".
505
506    Well, I wonder: rfc1812 was written by a Cisco employee,
507    so why the hell do these idiots break standards established
508    by themselves???
509  */
510
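	/* GRE header layout (RFC 2784/2890), which the offset arithmetic
	 * below relies on: 2 bytes of flags/version, 2 bytes of protocol,
	 * then an optional checksum+reserved word (4 bytes) and an
	 * optional key (4 bytes), each present only if the matching
	 * flag bit is set.
	 */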
511 const struct iphdr *iph = (const struct iphdr *)skb->data;
512 __be16 *p = (__be16 *)(skb->data+(iph->ihl<<2));
513 int grehlen = (iph->ihl<<2) + 4;
514 const int type = icmp_hdr(skb)->type;
515 const int code = icmp_hdr(skb)->code;
516 struct ip_tunnel *t;
517 __be16 flags;
518 __be32 key = 0;
519
520 flags = p[0];
521 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
522 if (flags&(GRE_VERSION|GRE_ROUTING))
523 return;
524 if (flags&GRE_KEY) {
525 grehlen += 4;
526 if (flags&GRE_CSUM)
527 grehlen += 4;
528 }
529 }
530
531 /* If only 8 bytes were returned, a keyed message will be dropped here */
532 if (skb_headlen(skb) < grehlen)
533 return;
534
535 if (flags & GRE_KEY)
536 key = *(((__be32 *)p) + (grehlen / 4) - 1);
537
538 switch (type) {
539 default:
540 case ICMP_PARAMETERPROB:
541 return;
542
543 case ICMP_DEST_UNREACH:
544 switch (code) {
545 case ICMP_SR_FAILED:
546 case ICMP_PORT_UNREACH:
547 /* Impossible event. */
548 return;
549 default:
550 /* All others are translated to HOST_UNREACH.
551 RFC 2003 contains "deep thoughts" about NET_UNREACH;
552 I believe they are just ether pollution. --ANK
553 */
554 break;
555 }
556 break;
557 case ICMP_TIME_EXCEEDED:
558 if (code != ICMP_EXC_TTL)
559 return;
560 break;
561
562 case ICMP_REDIRECT:
563 break;
564 }
565
566 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
567 flags, key, p[1]);
568
569 if (t == NULL)
570 return;
571
572 if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
573 ipv4_update_pmtu(skb, dev_net(skb->dev), info,
574 t->parms.link, 0, IPPROTO_GRE, 0);
575 return;
576 }
577 if (type == ICMP_REDIRECT) {
578 ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
579 IPPROTO_GRE, 0);
580 return;
581 }
582 if (t->parms.iph.daddr == 0 ||
583 ipv4_is_multicast(t->parms.iph.daddr))
584 return;
585
586 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
587 return;
588
589 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
590 t->err_count++;
591 else
592 t->err_count = 1;
593 t->err_time = jiffies;
594 }
595
596 static inline u8
597 ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
598 {
599 u8 inner = 0;
600 if (skb->protocol == htons(ETH_P_IP))
601 inner = old_iph->tos;
602 else if (skb->protocol == htons(ETH_P_IPV6))
603 inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
604 return INET_ECN_encapsulate(tos, inner);
605 }
606
607 static int ipgre_rcv(struct sk_buff *skb)
608 {
609 const struct iphdr *iph;
610 u8 *h;
611 __be16 flags;
612 __sum16 csum = 0;
613 __be32 key = 0;
614 u32 seqno = 0;
615 struct ip_tunnel *tunnel;
616 int offset = 4;
617 __be16 gre_proto;
618 int err;
619
620 if (!pskb_may_pull(skb, 16))
621 goto drop;
622
623 iph = ip_hdr(skb);
624 h = skb->data;
625 flags = *(__be16 *)h;
626
627 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
628 /* - Version must be 0.
629 - We do not support routing headers.
630 */
631 if (flags&(GRE_VERSION|GRE_ROUTING))
632 goto drop;
633
634 if (flags&GRE_CSUM) {
635 switch (skb->ip_summed) {
636 case CHECKSUM_COMPLETE:
637 csum = csum_fold(skb->csum);
638 if (!csum)
639 break;
640 /* fall through */
641 case CHECKSUM_NONE:
642 skb->csum = 0;
643 csum = __skb_checksum_complete(skb);
644 skb->ip_summed = CHECKSUM_COMPLETE;
645 }
646 offset += 4;
647 }
648 if (flags&GRE_KEY) {
649 key = *(__be32 *)(h + offset);
650 offset += 4;
651 }
652 if (flags&GRE_SEQ) {
653 seqno = ntohl(*(__be32 *)(h + offset));
654 offset += 4;
655 }
656 }
657
658 gre_proto = *(__be16 *)(h + 2);
659
660 tunnel = ipgre_tunnel_lookup(skb->dev,
661 iph->saddr, iph->daddr, flags, key,
662 gre_proto);
663 if (tunnel) {
664 struct pcpu_tstats *tstats;
665
666 secpath_reset(skb);
667
668 skb->protocol = gre_proto;
669 /* WCCP version 1 and 2 protocol decoding.
670 * - Change protocol to IP
671 * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
672 */
673 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
674 skb->protocol = htons(ETH_P_IP);
675 if ((*(h + offset) & 0xF0) != 0x40)
676 offset += 4;
677 }
678
679 skb->mac_header = skb->network_header;
680 __pskb_pull(skb, offset);
681 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
682 skb->pkt_type = PACKET_HOST;
683 #ifdef CONFIG_NET_IPGRE_BROADCAST
684 if (ipv4_is_multicast(iph->daddr)) {
685 /* Looped back packet, drop it! */
686 if (rt_is_output_route(skb_rtable(skb)))
687 goto drop;
688 tunnel->dev->stats.multicast++;
689 skb->pkt_type = PACKET_BROADCAST;
690 }
691 #endif
692
693 if (((flags&GRE_CSUM) && csum) ||
694 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
695 tunnel->dev->stats.rx_crc_errors++;
696 tunnel->dev->stats.rx_errors++;
697 goto drop;
698 }
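		/* If the tunnel expects sequence numbers, drop packets
		 * that arrive without one or that step backwards; the
		 * (s32) cast makes the comparison safe across u32
		 * wraparound.
		 */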
699 if (tunnel->parms.i_flags&GRE_SEQ) {
700 if (!(flags&GRE_SEQ) ||
701 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
702 tunnel->dev->stats.rx_fifo_errors++;
703 tunnel->dev->stats.rx_errors++;
704 goto drop;
705 }
706 tunnel->i_seqno = seqno + 1;
707 }
708
709 /* Warning: All skb pointers will be invalidated! */
710 if (tunnel->dev->type == ARPHRD_ETHER) {
711 if (!pskb_may_pull(skb, ETH_HLEN)) {
712 tunnel->dev->stats.rx_length_errors++;
713 tunnel->dev->stats.rx_errors++;
714 goto drop;
715 }
716
717 iph = ip_hdr(skb);
718 skb->protocol = eth_type_trans(skb, tunnel->dev);
719 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
720 }
721
722 __skb_tunnel_rx(skb, tunnel->dev);
723
724 skb_reset_network_header(skb);
725 err = IP_ECN_decapsulate(iph, skb);
726 if (unlikely(err)) {
727 if (log_ecn_error)
728 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
729 &iph->saddr, iph->tos);
730 if (err > 1) {
731 ++tunnel->dev->stats.rx_frame_errors;
732 ++tunnel->dev->stats.rx_errors;
733 goto drop;
734 }
735 }
736
737 tstats = this_cpu_ptr(tunnel->dev->tstats);
738 u64_stats_update_begin(&tstats->syncp);
739 tstats->rx_packets++;
740 tstats->rx_bytes += skb->len;
741 u64_stats_update_end(&tstats->syncp);
742
743 gro_cells_receive(&tunnel->gro_cells, skb);
744 return 0;
745 }
746 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
747
748 drop:
749 kfree_skb(skb);
750 return 0;
751 }
752
753 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
754 {
755 struct ip_tunnel *tunnel = netdev_priv(dev);
756 struct pcpu_tstats *tstats;
757 const struct iphdr *old_iph = ip_hdr(skb);
758 const struct iphdr *tiph;
759 struct flowi4 fl4;
760 u8 tos;
761 __be16 df;
762 struct rtable *rt; /* Route to the other host */
763 struct net_device *tdev; /* Device to other host */
764 struct iphdr *iph; /* Our new IP header */
765 unsigned int max_headroom; /* The extra header space needed */
766 int gre_hlen;
767 __be32 dst;
768 int mtu;
769
770 if (skb->ip_summed == CHECKSUM_PARTIAL &&
771 skb_checksum_help(skb))
772 goto tx_error;
773
774 if (dev->type == ARPHRD_ETHER)
775 IPCB(skb)->flags = 0;
776
777 if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
778 gre_hlen = 0;
779 tiph = (const struct iphdr *)skb->data;
780 } else {
781 gre_hlen = tunnel->hlen;
782 tiph = &tunnel->parms.iph;
783 }
784
785 if ((dst = tiph->daddr) == 0) {
786 /* NBMA tunnel */
787
788 if (skb_dst(skb) == NULL) {
789 dev->stats.tx_fifo_errors++;
790 goto tx_error;
791 }
792
793 if (skb->protocol == htons(ETH_P_IP)) {
794 rt = skb_rtable(skb);
795 dst = rt_nexthop(rt, old_iph->daddr);
796 }
797 #if IS_ENABLED(CONFIG_IPV6)
798 else if (skb->protocol == htons(ETH_P_IPV6)) {
799 const struct in6_addr *addr6;
800 struct neighbour *neigh;
801 bool do_tx_error_icmp;
802 int addr_type;
803
804 neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
805 if (neigh == NULL)
806 goto tx_error;
807
808 addr6 = (const struct in6_addr *)&neigh->primary_key;
809 addr_type = ipv6_addr_type(addr6);
810
811 if (addr_type == IPV6_ADDR_ANY) {
812 addr6 = &ipv6_hdr(skb)->daddr;
813 addr_type = ipv6_addr_type(addr6);
814 }
815
816 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
817 do_tx_error_icmp = true;
818 else {
819 do_tx_error_icmp = false;
820 dst = addr6->s6_addr32[3];
821 }
822 neigh_release(neigh);
823 if (do_tx_error_icmp)
824 goto tx_error_icmp;
825 }
826 #endif
827 else
828 goto tx_error;
829 }
830
831 tos = tiph->tos;
832 if (tos == 1) {
833 tos = 0;
834 if (skb->protocol == htons(ETH_P_IP))
835 tos = old_iph->tos;
836 else if (skb->protocol == htons(ETH_P_IPV6))
837 tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
838 }
839
840 rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
841 tunnel->parms.o_key, RT_TOS(tos),
842 tunnel->parms.link);
843 if (IS_ERR(rt)) {
844 dev->stats.tx_carrier_errors++;
845 goto tx_error;
846 }
847 tdev = rt->dst.dev;
848
849 if (tdev == dev) {
850 ip_rt_put(rt);
851 dev->stats.collisions++;
852 goto tx_error;
853 }
854
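	/* With DF forced we must fit the outer route's path MTU minus our
	 * own encapsulation overhead; without DF the outer packet may be
	 * fragmented, so the inner destination's MTU is good enough.
	 */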
855 df = tiph->frag_off;
856 if (df)
857 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
858 else
859 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
860
861 if (skb_dst(skb))
862 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
863
864 if (skb->protocol == htons(ETH_P_IP)) {
865 df |= (old_iph->frag_off&htons(IP_DF));
866
867 if ((old_iph->frag_off&htons(IP_DF)) &&
868 mtu < ntohs(old_iph->tot_len)) {
869 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
870 ip_rt_put(rt);
871 goto tx_error;
872 }
873 }
874 #if IS_ENABLED(CONFIG_IPV6)
875 else if (skb->protocol == htons(ETH_P_IPV6)) {
876 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
877
878 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
879 if ((tunnel->parms.iph.daddr &&
880 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
881 rt6->rt6i_dst.plen == 128) {
882 rt6->rt6i_flags |= RTF_MODIFIED;
883 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
884 }
885 }
886
887 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
888 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
889 ip_rt_put(rt);
890 goto tx_error;
891 }
892 }
893 #endif
894
895 if (tunnel->err_count > 0) {
896 if (time_before(jiffies,
897 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
898 tunnel->err_count--;
899
900 dst_link_failure(skb);
901 } else
902 tunnel->err_count = 0;
903 }
904
905 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
906
907 if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
908 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
909 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
910 if (max_headroom > dev->needed_headroom)
911 dev->needed_headroom = max_headroom;
912 if (!new_skb) {
913 ip_rt_put(rt);
914 dev->stats.tx_dropped++;
915 dev_kfree_skb(skb);
916 return NETDEV_TX_OK;
917 }
918 if (skb->sk)
919 skb_set_owner_w(new_skb, skb->sk);
920 dev_kfree_skb(skb);
921 skb = new_skb;
922 old_iph = ip_hdr(skb);
923 }
924
925 skb_reset_transport_header(skb);
926 skb_push(skb, gre_hlen);
927 skb_reset_network_header(skb);
928 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
929 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
930 IPSKB_REROUTED);
931 skb_dst_drop(skb);
932 skb_dst_set(skb, &rt->dst);
933
934 /*
935 * Push down and install the outer IP header.
936 */
937
938 iph = ip_hdr(skb);
939 iph->version = 4;
940 iph->ihl = sizeof(struct iphdr) >> 2;
941 iph->frag_off = df;
942 iph->protocol = IPPROTO_GRE;
943 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
944 iph->daddr = fl4.daddr;
945 iph->saddr = fl4.saddr;
946
947 if ((iph->ttl = tiph->ttl) == 0) {
948 if (skb->protocol == htons(ETH_P_IP))
949 iph->ttl = old_iph->ttl;
950 #if IS_ENABLED(CONFIG_IPV6)
951 else if (skb->protocol == htons(ETH_P_IPV6))
952 iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
953 #endif
954 else
955 iph->ttl = ip4_dst_hoplimit(&rt->dst);
956 }
957
958 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
959 ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
960 htons(ETH_P_TEB) : skb->protocol;
961
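	/* The optional fields sit at the tail of the GRE header in the
	 * order checksum, key, sequence; ptr starts at the last word and
	 * walks backwards, so the sequence number is written first and
	 * the checksum last, once everything it covers is in place.
	 */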
962 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
963 __be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);
964
965 if (tunnel->parms.o_flags&GRE_SEQ) {
966 ++tunnel->o_seqno;
967 *ptr = htonl(tunnel->o_seqno);
968 ptr--;
969 }
970 if (tunnel->parms.o_flags&GRE_KEY) {
971 *ptr = tunnel->parms.o_key;
972 ptr--;
973 }
974 if (tunnel->parms.o_flags&GRE_CSUM) {
975 *ptr = 0;
976 *(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
977 }
978 }
979
980 nf_reset(skb);
981 tstats = this_cpu_ptr(dev->tstats);
982 __IPTUNNEL_XMIT(tstats, &dev->stats);
983 return NETDEV_TX_OK;
984
985 #if IS_ENABLED(CONFIG_IPV6)
986 tx_error_icmp:
987 dst_link_failure(skb);
988 #endif
989 tx_error:
990 dev->stats.tx_errors++;
991 dev_kfree_skb(skb);
992 return NETDEV_TX_OK;
993 }
994
995 static int ipgre_tunnel_bind_dev(struct net_device *dev)
996 {
997 struct net_device *tdev = NULL;
998 struct ip_tunnel *tunnel;
999 const struct iphdr *iph;
1000 int hlen = LL_MAX_HEADER;
1001 int mtu = ETH_DATA_LEN;
1002 int addend = sizeof(struct iphdr) + 4;
1003
1004 tunnel = netdev_priv(dev);
1005 iph = &tunnel->parms.iph;
1006
1007 /* Guess the output device to choose a reasonable mtu and needed_headroom */
1008
1009 if (iph->daddr) {
1010 struct flowi4 fl4;
1011 struct rtable *rt;
1012
1013 rt = ip_route_output_gre(dev_net(dev), &fl4,
1014 iph->daddr, iph->saddr,
1015 tunnel->parms.o_key,
1016 RT_TOS(iph->tos),
1017 tunnel->parms.link);
1018 if (!IS_ERR(rt)) {
1019 tdev = rt->dst.dev;
1020 ip_rt_put(rt);
1021 }
1022
1023 if (dev->type != ARPHRD_ETHER)
1024 dev->flags |= IFF_POINTOPOINT;
1025 }
1026
1027 if (!tdev && tunnel->parms.link)
1028 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
1029
1030 if (tdev) {
1031 hlen = tdev->hard_header_len + tdev->needed_headroom;
1032 mtu = tdev->mtu;
1033 }
1034 dev->iflink = tunnel->parms.link;
1035
1036 /* Precalculate GRE options length */
1037 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
1038 if (tunnel->parms.o_flags&GRE_CSUM)
1039 addend += 4;
1040 if (tunnel->parms.o_flags&GRE_KEY)
1041 addend += 4;
1042 if (tunnel->parms.o_flags&GRE_SEQ)
1043 addend += 4;
1044 }
1045 dev->needed_headroom = addend + hlen;
1046 mtu -= dev->hard_header_len + addend;
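	/* A worked example (an illustration, not extra logic): a plain
	 * gre device (hard_header_len 0) with key and checksum enabled
	 * over a 1500-byte Ethernet path ends up with
	 * mtu = 1500 - (20 + 4 + 4 + 4) = 1468.
	 */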
1047
1048 if (mtu < 68)
1049 mtu = 68;
1050
1051 tunnel->hlen = addend;
1052
1053 return mtu;
1054 }
1055
1056 static int
1057 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1058 {
1059 int err = 0;
1060 struct ip_tunnel_parm p;
1061 struct ip_tunnel *t;
1062 struct net *net = dev_net(dev);
1063 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1064
1065 switch (cmd) {
1066 case SIOCGETTUNNEL:
1067 t = NULL;
1068 if (dev == ign->fb_tunnel_dev) {
1069 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
1070 err = -EFAULT;
1071 break;
1072 }
1073 t = ipgre_tunnel_locate(net, &p, 0);
1074 }
1075 if (t == NULL)
1076 t = netdev_priv(dev);
1077 memcpy(&p, &t->parms, sizeof(p));
1078 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
1079 err = -EFAULT;
1080 break;
1081
1082 case SIOCADDTUNNEL:
1083 case SIOCCHGTUNNEL:
1084 err = -EPERM;
1085 if (!capable(CAP_NET_ADMIN))
1086 goto done;
1087
1088 err = -EFAULT;
1089 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1090 goto done;
1091
1092 err = -EINVAL;
1093 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1094 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1095 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1096 goto done;
1097 if (p.iph.ttl)
1098 p.iph.frag_off |= htons(IP_DF);
1099
1100 if (!(p.i_flags&GRE_KEY))
1101 p.i_key = 0;
1102 if (!(p.o_flags&GRE_KEY))
1103 p.o_key = 0;
1104
1105 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1106
1107 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1108 if (t != NULL) {
1109 if (t->dev != dev) {
1110 err = -EEXIST;
1111 break;
1112 }
1113 } else {
1114 unsigned int nflags = 0;
1115
1116 t = netdev_priv(dev);
1117
1118 if (ipv4_is_multicast(p.iph.daddr))
1119 nflags = IFF_BROADCAST;
1120 else if (p.iph.daddr)
1121 nflags = IFF_POINTOPOINT;
1122
1123 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1124 err = -EINVAL;
1125 break;
1126 }
1127 ipgre_tunnel_unlink(ign, t);
1128 synchronize_net();
1129 t->parms.iph.saddr = p.iph.saddr;
1130 t->parms.iph.daddr = p.iph.daddr;
1131 t->parms.i_key = p.i_key;
1132 t->parms.o_key = p.o_key;
1133 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1134 memcpy(dev->broadcast, &p.iph.daddr, 4);
1135 ipgre_tunnel_link(ign, t);
1136 netdev_state_change(dev);
1137 }
1138 }
1139
1140 if (t) {
1141 err = 0;
1142 if (cmd == SIOCCHGTUNNEL) {
1143 t->parms.iph.ttl = p.iph.ttl;
1144 t->parms.iph.tos = p.iph.tos;
1145 t->parms.iph.frag_off = p.iph.frag_off;
1146 if (t->parms.link != p.link) {
1147 t->parms.link = p.link;
1148 dev->mtu = ipgre_tunnel_bind_dev(dev);
1149 netdev_state_change(dev);
1150 }
1151 }
1152 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1153 err = -EFAULT;
1154 } else
1155 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1156 break;
1157
1158 case SIOCDELTUNNEL:
1159 err = -EPERM;
1160 if (!capable(CAP_NET_ADMIN))
1161 goto done;
1162
1163 if (dev == ign->fb_tunnel_dev) {
1164 err = -EFAULT;
1165 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1166 goto done;
1167 err = -ENOENT;
1168 if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1169 goto done;
1170 err = -EPERM;
1171 if (t == netdev_priv(ign->fb_tunnel_dev))
1172 goto done;
1173 dev = t->dev;
1174 }
1175 unregister_netdevice(dev);
1176 err = 0;
1177 break;
1178
1179 default:
1180 err = -EINVAL;
1181 }
1182
1183 done:
1184 return err;
1185 }
1186
1187 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1188 {
1189 struct ip_tunnel *tunnel = netdev_priv(dev);
1190 if (new_mtu < 68 ||
1191 new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1192 return -EINVAL;
1193 dev->mtu = new_mtu;
1194 return 0;
1195 }
1196
1197 /* Nice toy. Unfortunately, useless in real life :-)
1198    It allows one to construct a virtual multiprotocol broadcast "LAN"
1199    over the Internet, provided multicast routing is tuned.
1200
1201
1202    I have no idea whether this bicycle was invented before me,
1203    so I had to set ARPHRD_IPGRE to a random value.
1204    I have the impression that Cisco could have made something similar,
1205    but this feature is apparently missing in IOS<=11.2(8).
1206
1207 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1208 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1209
1210 ping -t 255 224.66.66.66
1211
1212 If nobody answers, mbone does not work.
1213
1214 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1215 ip addr add 10.66.66.<somewhat>/24 dev Universe
1216 ifconfig Universe up
1217 ifconfig Universe add fe80::<Your_real_addr>/10
1218 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1219 ftp 10.66.66.66
1220 ...
1221 ftp fec0:6666:6666::193.233.7.65
1222 ...
1223
1224 */
1225
1226 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1227 unsigned short type,
1228 const void *daddr, const void *saddr, unsigned int len)
1229 {
1230 struct ip_tunnel *t = netdev_priv(dev);
1231 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1232 __be16 *p = (__be16 *)(iph+1);
1233
1234 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1235 p[0] = t->parms.o_flags;
1236 p[1] = htons(type);
1237
1238 /*
1239 * Set the source hardware address.
1240 */
1241
1242 if (saddr)
1243 memcpy(&iph->saddr, saddr, 4);
1244 if (daddr)
1245 memcpy(&iph->daddr, daddr, 4);
1246 if (iph->daddr)
1247 return t->hlen;
1248
1249 return -t->hlen;
1250 }
1251
1252 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1253 {
1254 const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1255 memcpy(haddr, &iph->saddr, 4);
1256 return 4;
1257 }
1258
1259 static const struct header_ops ipgre_header_ops = {
1260 .create = ipgre_header,
1261 .parse = ipgre_header_parse,
1262 };
1263
1264 #ifdef CONFIG_NET_IPGRE_BROADCAST
1265 static int ipgre_open(struct net_device *dev)
1266 {
1267 struct ip_tunnel *t = netdev_priv(dev);
1268
1269 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1270 struct flowi4 fl4;
1271 struct rtable *rt;
1272
1273 rt = ip_route_output_gre(dev_net(dev), &fl4,
1274 t->parms.iph.daddr,
1275 t->parms.iph.saddr,
1276 t->parms.o_key,
1277 RT_TOS(t->parms.iph.tos),
1278 t->parms.link);
1279 if (IS_ERR(rt))
1280 return -EADDRNOTAVAIL;
1281 dev = rt->dst.dev;
1282 ip_rt_put(rt);
1283 if (__in_dev_get_rtnl(dev) == NULL)
1284 return -EADDRNOTAVAIL;
1285 t->mlink = dev->ifindex;
1286 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1287 }
1288 return 0;
1289 }
1290
1291 static int ipgre_close(struct net_device *dev)
1292 {
1293 struct ip_tunnel *t = netdev_priv(dev);
1294
1295 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1296 struct in_device *in_dev;
1297 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1298 if (in_dev)
1299 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1300 }
1301 return 0;
1302 }
1303
1304 #endif
1305
1306 static const struct net_device_ops ipgre_netdev_ops = {
1307 .ndo_init = ipgre_tunnel_init,
1308 .ndo_uninit = ipgre_tunnel_uninit,
1309 #ifdef CONFIG_NET_IPGRE_BROADCAST
1310 .ndo_open = ipgre_open,
1311 .ndo_stop = ipgre_close,
1312 #endif
1313 .ndo_start_xmit = ipgre_tunnel_xmit,
1314 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1315 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1316 .ndo_get_stats64 = ipgre_get_stats64,
1317 };
1318
1319 static void ipgre_dev_free(struct net_device *dev)
1320 {
1321 struct ip_tunnel *tunnel = netdev_priv(dev);
1322
1323 gro_cells_destroy(&tunnel->gro_cells);
1324 free_percpu(dev->tstats);
1325 free_netdev(dev);
1326 }
1327
1328 #define GRE_FEATURES (NETIF_F_SG | \
1329 NETIF_F_FRAGLIST | \
1330 NETIF_F_HIGHDMA | \
1331 NETIF_F_HW_CSUM)
1332
1333 static void ipgre_tunnel_setup(struct net_device *dev)
1334 {
1335 dev->netdev_ops = &ipgre_netdev_ops;
1336 dev->destructor = ipgre_dev_free;
1337
1338 dev->type = ARPHRD_IPGRE;
1339 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1340 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
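	/* sizeof(struct iphdr) + 4 == the outer IP header (20 bytes) plus
	 * the minimal, option-free GRE header (2 bytes of flags + 2 bytes
	 * of protocol), hence the classic 1476-byte GRE MTU.
	 */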
1341 dev->flags = IFF_NOARP;
1342 dev->iflink = 0;
1343 dev->addr_len = 4;
1344 dev->features |= NETIF_F_NETNS_LOCAL;
1345 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
1346
1347 dev->features |= GRE_FEATURES;
1348 dev->hw_features |= GRE_FEATURES;
1349 }
1350
1351 static int ipgre_tunnel_init(struct net_device *dev)
1352 {
1353 struct ip_tunnel *tunnel;
1354 struct iphdr *iph;
1355 int err;
1356
1357 tunnel = netdev_priv(dev);
1358 iph = &tunnel->parms.iph;
1359
1360 tunnel->dev = dev;
1361 strcpy(tunnel->parms.name, dev->name);
1362
1363 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1364 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1365
1366 if (iph->daddr) {
1367 #ifdef CONFIG_NET_IPGRE_BROADCAST
1368 if (ipv4_is_multicast(iph->daddr)) {
1369 if (!iph->saddr)
1370 return -EINVAL;
1371 dev->flags = IFF_BROADCAST;
1372 dev->header_ops = &ipgre_header_ops;
1373 }
1374 #endif
1375 } else
1376 dev->header_ops = &ipgre_header_ops;
1377
1378 dev->tstats = alloc_percpu(struct pcpu_tstats);
1379 if (!dev->tstats)
1380 return -ENOMEM;
1381
1382 err = gro_cells_init(&tunnel->gro_cells, dev);
1383 if (err) {
1384 free_percpu(dev->tstats);
1385 return err;
1386 }
1387
1388 return 0;
1389 }
1390
1391 static void ipgre_fb_tunnel_init(struct net_device *dev)
1392 {
1393 struct ip_tunnel *tunnel = netdev_priv(dev);
1394 struct iphdr *iph = &tunnel->parms.iph;
1395
1396 tunnel->dev = dev;
1397 strcpy(tunnel->parms.name, dev->name);
1398
1399 iph->version = 4;
1400 iph->protocol = IPPROTO_GRE;
1401 iph->ihl = 5;
1402 tunnel->hlen = sizeof(struct iphdr) + 4;
1403
1404 dev_hold(dev);
1405 }
1406
1407
1408 static const struct gre_protocol ipgre_protocol = {
1409 .handler = ipgre_rcv,
1410 .err_handler = ipgre_err,
1411 };
1412
1413 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1414 {
1415 int prio;
1416
1417 for (prio = 0; prio < 4; prio++) {
1418 int h;
1419 for (h = 0; h < HASH_SIZE; h++) {
1420 struct ip_tunnel *t;
1421
1422 t = rtnl_dereference(ign->tunnels[prio][h]);
1423
1424 while (t != NULL) {
1425 unregister_netdevice_queue(t->dev, head);
1426 t = rtnl_dereference(t->next);
1427 }
1428 }
1429 }
1430 }
1431
1432 static int __net_init ipgre_init_net(struct net *net)
1433 {
1434 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1435 int err;
1436
1437 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1438 ipgre_tunnel_setup);
1439 if (!ign->fb_tunnel_dev) {
1440 err = -ENOMEM;
1441 goto err_alloc_dev;
1442 }
1443 dev_net_set(ign->fb_tunnel_dev, net);
1444
1445 ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1446 ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1447
1448 if ((err = register_netdev(ign->fb_tunnel_dev)))
1449 goto err_reg_dev;
1450
1451 rcu_assign_pointer(ign->tunnels_wc[0],
1452 netdev_priv(ign->fb_tunnel_dev));
1453 return 0;
1454
1455 err_reg_dev:
1456 ipgre_dev_free(ign->fb_tunnel_dev);
1457 err_alloc_dev:
1458 return err;
1459 }
1460
1461 static void __net_exit ipgre_exit_net(struct net *net)
1462 {
1463 struct ipgre_net *ign;
1464 LIST_HEAD(list);
1465
1466 ign = net_generic(net, ipgre_net_id);
1467 rtnl_lock();
1468 ipgre_destroy_tunnels(ign, &list);
1469 unregister_netdevice_many(&list);
1470 rtnl_unlock();
1471 }
1472
1473 static struct pernet_operations ipgre_net_ops = {
1474 .init = ipgre_init_net,
1475 .exit = ipgre_exit_net,
1476 .id = &ipgre_net_id,
1477 .size = sizeof(struct ipgre_net),
1478 };
1479
1480 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1481 {
1482 __be16 flags;
1483
1484 if (!data)
1485 return 0;
1486
1487 flags = 0;
1488 if (data[IFLA_GRE_IFLAGS])
1489 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1490 if (data[IFLA_GRE_OFLAGS])
1491 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1492 if (flags & (GRE_VERSION|GRE_ROUTING))
1493 return -EINVAL;
1494
1495 return 0;
1496 }
1497
1498 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1499 {
1500 __be32 daddr;
1501
1502 if (tb[IFLA_ADDRESS]) {
1503 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1504 return -EINVAL;
1505 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1506 return -EADDRNOTAVAIL;
1507 }
1508
1509 if (!data)
1510 goto out;
1511
1512 if (data[IFLA_GRE_REMOTE]) {
1513 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1514 if (!daddr)
1515 return -EINVAL;
1516 }
1517
1518 out:
1519 return ipgre_tunnel_validate(tb, data);
1520 }
1521
1522 static void ipgre_netlink_parms(struct nlattr *data[],
1523 struct ip_tunnel_parm *parms)
1524 {
1525 memset(parms, 0, sizeof(*parms));
1526
1527 parms->iph.protocol = IPPROTO_GRE;
1528
1529 if (!data)
1530 return;
1531
1532 if (data[IFLA_GRE_LINK])
1533 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1534
1535 if (data[IFLA_GRE_IFLAGS])
1536 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1537
1538 if (data[IFLA_GRE_OFLAGS])
1539 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1540
1541 if (data[IFLA_GRE_IKEY])
1542 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1543
1544 if (data[IFLA_GRE_OKEY])
1545 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1546
1547 if (data[IFLA_GRE_LOCAL])
1548 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1549
1550 if (data[IFLA_GRE_REMOTE])
1551 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1552
1553 if (data[IFLA_GRE_TTL])
1554 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1555
1556 if (data[IFLA_GRE_TOS])
1557 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1558
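	/* Note that path MTU discovery defaults to on: unless userspace
	 * explicitly sends IFLA_GRE_PMTUDISC == 0, DF is set on the
	 * outer header.
	 */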
1559 if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1560 parms->iph.frag_off = htons(IP_DF);
1561 }
1562
1563 static int ipgre_tap_init(struct net_device *dev)
1564 {
1565 struct ip_tunnel *tunnel;
1566
1567 tunnel = netdev_priv(dev);
1568
1569 tunnel->dev = dev;
1570 strcpy(tunnel->parms.name, dev->name);
1571
1572 ipgre_tunnel_bind_dev(dev);
1573
1574 dev->tstats = alloc_percpu(struct pcpu_tstats);
1575 if (!dev->tstats)
1576 return -ENOMEM;
1577
1578 return 0;
1579 }
1580
1581 static const struct net_device_ops ipgre_tap_netdev_ops = {
1582 .ndo_init = ipgre_tap_init,
1583 .ndo_uninit = ipgre_tunnel_uninit,
1584 .ndo_start_xmit = ipgre_tunnel_xmit,
1585 .ndo_set_mac_address = eth_mac_addr,
1586 .ndo_validate_addr = eth_validate_addr,
1587 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1588 .ndo_get_stats64 = ipgre_get_stats64,
1589 };
1590
1591 static void ipgre_tap_setup(struct net_device *dev)
1592 {
1593
1594 ether_setup(dev);
1595
1596 dev->netdev_ops = &ipgre_tap_netdev_ops;
1597 dev->destructor = ipgre_dev_free;
1598
1599 dev->iflink = 0;
1600 dev->features |= NETIF_F_NETNS_LOCAL;
1601 }
1602
1603 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1604 struct nlattr *data[])
1605 {
1606 struct ip_tunnel *nt;
1607 struct net *net = dev_net(dev);
1608 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1609 int mtu;
1610 int err;
1611
1612 nt = netdev_priv(dev);
1613 ipgre_netlink_parms(data, &nt->parms);
1614
1615 if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1616 return -EEXIST;
1617
1618 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1619 eth_hw_addr_random(dev);
1620
1621 mtu = ipgre_tunnel_bind_dev(dev);
1622 if (!tb[IFLA_MTU])
1623 dev->mtu = mtu;
1624
1625 /* Can use a lockless transmit, unless we generate output sequences */
1626 if (!(nt->parms.o_flags & GRE_SEQ))
1627 dev->features |= NETIF_F_LLTX;
1628
1629 err = register_netdevice(dev);
1630 if (err)
1631 goto out;
1632
1633 dev_hold(dev);
1634 ipgre_tunnel_link(ign, nt);
1635
1636 out:
1637 return err;
1638 }
1639
1640 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1641 struct nlattr *data[])
1642 {
1643 struct ip_tunnel *t, *nt;
1644 struct net *net = dev_net(dev);
1645 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1646 struct ip_tunnel_parm p;
1647 int mtu;
1648
1649 if (dev == ign->fb_tunnel_dev)
1650 return -EINVAL;
1651
1652 nt = netdev_priv(dev);
1653 ipgre_netlink_parms(data, &p);
1654
1655 t = ipgre_tunnel_locate(net, &p, 0);
1656
1657 if (t) {
1658 if (t->dev != dev)
1659 return -EEXIST;
1660 } else {
1661 t = nt;
1662
1663 if (dev->type != ARPHRD_ETHER) {
1664 unsigned int nflags = 0;
1665
1666 if (ipv4_is_multicast(p.iph.daddr))
1667 nflags = IFF_BROADCAST;
1668 else if (p.iph.daddr)
1669 nflags = IFF_POINTOPOINT;
1670
1671 if ((dev->flags ^ nflags) &
1672 (IFF_POINTOPOINT | IFF_BROADCAST))
1673 return -EINVAL;
1674 }
1675
1676 ipgre_tunnel_unlink(ign, t);
1677 t->parms.iph.saddr = p.iph.saddr;
1678 t->parms.iph.daddr = p.iph.daddr;
1679 t->parms.i_key = p.i_key;
1680 if (dev->type != ARPHRD_ETHER) {
1681 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1682 memcpy(dev->broadcast, &p.iph.daddr, 4);
1683 }
1684 ipgre_tunnel_link(ign, t);
1685 netdev_state_change(dev);
1686 }
1687
1688 t->parms.o_key = p.o_key;
1689 t->parms.iph.ttl = p.iph.ttl;
1690 t->parms.iph.tos = p.iph.tos;
1691 t->parms.iph.frag_off = p.iph.frag_off;
1692
1693 if (t->parms.link != p.link) {
1694 t->parms.link = p.link;
1695 mtu = ipgre_tunnel_bind_dev(dev);
1696 if (!tb[IFLA_MTU])
1697 dev->mtu = mtu;
1698 netdev_state_change(dev);
1699 }
1700
1701 return 0;
1702 }
1703
1704 static size_t ipgre_get_size(const struct net_device *dev)
1705 {
1706 return
1707 /* IFLA_GRE_LINK */
1708 nla_total_size(4) +
1709 /* IFLA_GRE_IFLAGS */
1710 nla_total_size(2) +
1711 /* IFLA_GRE_OFLAGS */
1712 nla_total_size(2) +
1713 /* IFLA_GRE_IKEY */
1714 nla_total_size(4) +
1715 /* IFLA_GRE_OKEY */
1716 nla_total_size(4) +
1717 /* IFLA_GRE_LOCAL */
1718 nla_total_size(4) +
1719 /* IFLA_GRE_REMOTE */
1720 nla_total_size(4) +
1721 /* IFLA_GRE_TTL */
1722 nla_total_size(1) +
1723 /* IFLA_GRE_TOS */
1724 nla_total_size(1) +
1725 /* IFLA_GRE_PMTUDISC */
1726 nla_total_size(1) +
1727 0;
1728 }
1729
1730 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1731 {
1732 struct ip_tunnel *t = netdev_priv(dev);
1733 struct ip_tunnel_parm *p = &t->parms;
1734
1735 if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1736 nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
1737 nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
1738 nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1739 nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1740 nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
1741 nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
1742 nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
1743 nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
1744 nla_put_u8(skb, IFLA_GRE_PMTUDISC,
1745 !!(p->iph.frag_off & htons(IP_DF))))
1746 goto nla_put_failure;
1747 return 0;
1748
1749 nla_put_failure:
1750 return -EMSGSIZE;
1751 }
1752
1753 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1754 [IFLA_GRE_LINK] = { .type = NLA_U32 },
1755 [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
1756 [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
1757 [IFLA_GRE_IKEY] = { .type = NLA_U32 },
1758 [IFLA_GRE_OKEY] = { .type = NLA_U32 },
1759 [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1760 [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1761 [IFLA_GRE_TTL] = { .type = NLA_U8 },
1762 [IFLA_GRE_TOS] = { .type = NLA_U8 },
1763 [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
1764 };
1765
1766 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1767 .kind = "gre",
1768 .maxtype = IFLA_GRE_MAX,
1769 .policy = ipgre_policy,
1770 .priv_size = sizeof(struct ip_tunnel),
1771 .setup = ipgre_tunnel_setup,
1772 .validate = ipgre_tunnel_validate,
1773 .newlink = ipgre_newlink,
1774 .changelink = ipgre_changelink,
1775 .get_size = ipgre_get_size,
1776 .fill_info = ipgre_fill_info,
1777 };
1778
1779 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1780 .kind = "gretap",
1781 .maxtype = IFLA_GRE_MAX,
1782 .policy = ipgre_policy,
1783 .priv_size = sizeof(struct ip_tunnel),
1784 .setup = ipgre_tap_setup,
1785 .validate = ipgre_tap_validate,
1786 .newlink = ipgre_newlink,
1787 .changelink = ipgre_changelink,
1788 .get_size = ipgre_get_size,
1789 .fill_info = ipgre_fill_info,
1790 };
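/* For reference, these two link types map onto iproute2 roughly as
 * follows (addresses are placeholders, and exact option spelling may
 * vary between iproute2 versions):
 *
 *	ip link add gre1 type gre remote 203.0.113.1 local 203.0.113.2 ttl 64
 *	ip link add tap1 type gretap remote 203.0.113.1 local 203.0.113.2
 */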
1791
1792 /*
1793 * And now the module code and kernel interface.
1794 */
1795
1796 static int __init ipgre_init(void)
1797 {
1798 int err;
1799
1800 pr_info("GRE over IPv4 tunneling driver\n");
1801
1802 err = register_pernet_device(&ipgre_net_ops);
1803 if (err < 0)
1804 return err;
1805
1806 err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1807 if (err < 0) {
1808 pr_info("%s: can't add protocol\n", __func__);
1809 goto add_proto_failed;
1810 }
1811
1812 err = rtnl_link_register(&ipgre_link_ops);
1813 if (err < 0)
1814 goto rtnl_link_failed;
1815
1816 err = rtnl_link_register(&ipgre_tap_ops);
1817 if (err < 0)
1818 goto tap_ops_failed;
1819
1820 out:
1821 return err;
1822
1823 tap_ops_failed:
1824 rtnl_link_unregister(&ipgre_link_ops);
1825 rtnl_link_failed:
1826 gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1827 add_proto_failed:
1828 unregister_pernet_device(&ipgre_net_ops);
1829 goto out;
1830 }
1831
1832 static void __exit ipgre_fini(void)
1833 {
1834 rtnl_link_unregister(&ipgre_tap_ops);
1835 rtnl_link_unregister(&ipgre_link_ops);
1836 if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1837 pr_info("%s: can't remove protocol\n", __func__);
1838 unregister_pernet_device(&ipgre_net_ops);
1839 }
1840
1841 module_init(ipgre_init);
1842 module_exit(ipgre_fini);
1843 MODULE_LICENSE("GPL");
1844 MODULE_ALIAS_RTNL_LINK("gre");
1845 MODULE_ALIAS_RTNL_LINK("gretap");
1846 MODULE_ALIAS_NETDEV("gre0");