/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15 #include <linux/capability.h>
16 #include <linux/module.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/slab.h>
20 #include <asm/uaccess.h>
21 #include <linux/skbuff.h>
22 #include <linux/netdevice.h>
24 #include <linux/tcp.h>
25 #include <linux/udp.h>
26 #include <linux/if_arp.h>
27 #include <linux/if_vlan.h>
28 #include <linux/init.h>
29 #include <linux/in6.h>
30 #include <linux/inetdevice.h>
31 #include <linux/igmp.h>
32 #include <linux/netfilter_ipv4.h>
33 #include <linux/etherdevice.h>
34 #include <linux/if_ether.h>
39 #include <net/protocol.h>
40 #include <net/ip_tunnels.h>
42 #include <net/checksum.h>
43 #include <net/dsfield.h>
44 #include <net/inet_ecn.h>
46 #include <net/net_namespace.h>
47 #include <net/netns/generic.h>
48 #include <net/rtnetlink.h>
50 #include <net/dst_metadata.h>
52 #if IS_ENABLED(CONFIG_IPV6)
54 #include <net/ip6_fib.h>
55 #include <net/ip6_route.h>
62 1. The most important issue is detecting local dead loops.
63 They would cause complete host lockup in transmit, which
64 would be "resolved" by stack overflow or, if queueing is enabled,
65 with infinite looping in net_bh.
67 We cannot track such dead loops during route installation,
68 it is infeasible task. The most general solutions would be
69 to keep skb->encapsulation counter (sort of local ttl),
70 and silently drop packet when it expires. It is a good
71 solution, but it supposes maintaining new variable in ALL
72 skb, even if no tunneling is used.
74 Current solution: xmit_recursion breaks dead loops. This is a percpu
75 counter, since when we enter the first ndo_xmit(), cpu migration is
76 forbidden. We force an exit if this counter reaches RECURSION_LIMIT
78 2. Networking dead loops would not kill routers, but would really
79 kill network. IP hop limit plays role of "t->recursion" in this case,
80 if we copy it from packet being encapsulated to upper header.
81 It is very good solution, but it introduces two problems:
83 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
84 do not work over tunnels.
85 - traceroute does not work. I planned to relay ICMP from tunnel,
86 so that this problem would be solved and traceroute output
   would be even more informative. This idea appeared to be wrong:
88 only Linux complies to rfc1812 now (yes, guys, Linux is the only
89 true router now :-)), all routers (at least, in neighbourhood of mine)
90 return only 8 bytes of payload. It is the end.
92 Hence, if we want that OSPF worked or traceroute said something reasonable,
93 we should search for another solution.
95 One of them is to parse packet trying to detect inner encapsulation
96 made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, ttl is not a solution at all.
99 Current solution: The solution was UNEXPECTEDLY SIMPLE.
100 We force DF flag on tunnels with preconfigured hop limit,
101 that is ALL. :-) Well, it does not remove the problem completely,
102 but exponential growth of network traffic is changed to linear
103 (branches, that exceed pmtu are pruned) and tunnel mtu
104 rapidly degrades to value <68, where looping stops.
105 Yes, it is not good if there exists a router in the loop,
106 which does not force DF, even when encapsulating packets have DF set.
107 But it is not our problem! Nobody could accuse us, we made
108 all that we could make. Even if it is your gated who injected
109 fatal route to network, even if it were you who configured
110 fatal static route: you are innocent. :-)
/* When true, packets received with a corrupted ECN field are logged.
 * Runtime-tunable via /sys/module parameter (mode 0644). */
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);

/* Per-netns ids for the plain ipgre and the ethernet-over-GRE (gretap)
 * tunnel hash tables (see ip_tunnel_init_net()). */
static int ipgre_net_id __read_mostly;
static int gre_tap_net_id __read_mostly;
/* ICMP error handler for an already-parsed GRE packet: find the tunnel the
 * offending inner header belongs to and record the error on it (rate-limited
 * by IPTUNNEL_ERR_TIMEO). */
static void ipgre_err(struct sk_buff *skb, u32 info,
		      const struct tnl_ptk_info *tpi)
{

	/* All the routers (except for Linux) return only
	   8 bytes of packet payload. It means, that precise relaying of
	   ICMP in the real Internet is absolutely infeasible.

	   Moreover, Cisco "wise men" put GRE key to the third word
	   in GRE header. It makes impossible maintaining even soft
	   state for keyed GRE tunnels with enabled checksum. Tell
	   them "thank you".

	   Well, I wonder, rfc1812 was written by Cisco employee,
	   what the hell these idiots break standards established
	   by themselves???
	   */
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn;
	const struct iphdr *iph;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	unsigned int data_len = 0;
	struct ip_tunnel *t;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;

	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
		break;

	case ICMP_REDIRECT:
		break;
	}

	/* ETH_P_TEB means an ethernet (gretap) tunnel; look in its table. */
	if (tpi->proto == htons(ETH_P_TEB))
		itn = net_generic(net, gre_tap_net_id);
	else
		itn = net_generic(net, ipgre_net_id);

	/* Inner IP header follows the ICMP header in the error payload. */
	iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
			     iph->daddr, iph->saddr, tpi->key);

	if (!t)
		return;

#if IS_ENABLED(CONFIG_IPV6)
	if (tpi->proto == htons(ETH_P_IPV6) &&
	    !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
					type, data_len))
		return;
#endif

	/* NAT-ed or multicast endpoints cannot be matched reliably. */
	if (t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		return;

	/* With inherited TTL a TIME_EXCEEDED is expected, not an error. */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		return;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
}
/* Entry point for ICMP errors on GRE: parse the GRE header from the error
 * payload, handle PMTU/redirect updates here, delegate the rest to
 * ipgre_err(). */
static void gre_err(struct sk_buff *skb, u32 info)
{
	/* All the routers (except for Linux) return only
	 * 8 bytes of packet payload. It means, that precise relaying of
	 * ICMP in the real Internet is absolutely infeasible.
	 *
	 * Moreover, Cisco "wise men" put GRE key to the third word
	 * in GRE header. It makes impossible maintaining even soft
	 * state for keyed
	 * GRE tunnels with enabled checksum. Tell them "thank you".
	 *
	 * Well, I wonder, rfc1812 was written by Cisco employee,
	 * what the hell these idiots break standards established
	 * by themselves???
	 */

	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct tnl_ptk_info tpi;
	bool csum_err = false;

	if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP)) < 0) {
		if (!csum_err)		/* ignore csum errors. */
			return;
	}

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 skb->dev->ifindex, 0, IPPROTO_GRE, 0);
		return;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
			      IPPROTO_GRE, 0);
		return;
	}

	ipgre_err(skb, info, &tpi);
}
/* Widen a 32-bit GRE key into a 64-bit tunnel id, preserving the on-wire
 * byte position of the key regardless of host endianness. */
static __be64 key_to_tunnel_id(__be32 key)
{
#ifdef __BIG_ENDIAN
	return (__force __be64)((__force u32)key);
#else
	return (__force __be64)((__force u64)key << 32);
#endif
}
/* Returns the least-significant 32 bits of a __be64. */
static __be32 tunnel_id_to_key(__be64 x)
{
#ifdef __BIG_ENDIAN
	return (__force __be32)x;
#else
	return (__force __be32)((__force u64)x >> 32);
#endif
}
/* Try to deliver a parsed GRE packet to a tunnel from @itn.
 * Returns PACKET_RCVD if consumed, PACKET_NEXT if no tunnel matched
 * (caller may retry another table), PACKET_REJECT on metadata-dst failure. */
static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
		       struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
	struct metadata_dst *tun_dst = NULL;
	const struct iphdr *iph;
	struct ip_tunnel *tunnel;

	iph = ip_hdr(skb);
	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
				  iph->saddr, iph->daddr, tpi->key);

	if (tunnel) {
		if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
					   raw_proto, false) < 0)
			goto drop;

		/* ARPHRD_NONE devices carry no inner MAC header. */
		if (tunnel->dev->type != ARPHRD_NONE)
			skb_pop_mac_header(skb);
		else
			skb_reset_mac_header(skb);
		if (tunnel->collect_md) {
			__be16 flags;
			__be64 tun_id;

			/* Externally-controlled (flow-based) tunnel: attach
			 * the received key/csum info as metadata dst. */
			flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
			tun_id = key_to_tunnel_id(tpi->key);
			tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
			if (!tun_dst)
				return PACKET_REJECT;
		}

		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
		return PACKET_RCVD;
	}
	return PACKET_NEXT;

drop:
	kfree_skb(skb);
	return PACKET_RCVD;
}
/* Dispatch a parsed GRE packet to the gretap table for ETH_P_TEB, else to
 * the plain ipgre table; fall back to the ipgre table for TEB traffic so
 * collect-metadata ipgre devices can receive it too. */
static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
		     int hdr_len)
{
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn;
	int res;

	if (tpi->proto == htons(ETH_P_TEB))
		itn = net_generic(net, gre_tap_net_id);
	else
		itn = net_generic(net, ipgre_net_id);

	res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
	if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
		/* ipgre tunnels in collect metadata mode should receive
		 * also ETH_P_TEB traffic.
		 */
		itn = net_generic(net, ipgre_net_id);
		res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
	}
	return res;
}
/* Protocol handler for incoming GRE-in-IPv4 packets: parse the header and
 * hand off to ipgre_rcv(); unmatched packets get ICMP port unreachable. */
static int gre_rcv(struct sk_buff *skb)
{
	struct tnl_ptk_info tpi;
	bool csum_err = false;
	int hdr_len;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
		/* Looped back packet, drop it! */
		if (rt_is_output_route(skb_rtable(skb)))
			goto drop;
	}
#endif

	hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP));
	if (hdr_len < 0)
		goto drop;

	if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
		return 0;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
drop:
	kfree_skb(skb);
	return 0;
}
/* Common transmit tail for configured (non-metadata) tunnels: bump the
 * sequence number if enabled, push the GRE header and hand the packet to
 * the generic IP tunnel transmit path. */
static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
		       const struct iphdr *tnl_params,
		       __be16 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	if (tunnel->parms.o_flags & TUNNEL_SEQ)
		tunnel->o_seqno++;

	/* Push GRE header. */
	gre_build_header(skb, tunnel->tun_hlen,
			 tunnel->parms.o_flags, proto, tunnel->parms.o_key,
			 htonl(tunnel->o_seqno));

	skb_set_inner_protocol(skb, proto);

	ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}
/* Prepare skb GSO/checksum state for GRE encapsulation; @csum selects the
 * checksummed GRE GSO type. */
static int gre_handle_offloads(struct sk_buff *skb, bool csum)
{
	return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}
/* Build a flow from the tunnel key and resolve an output route for it.
 * On success *fl is filled in (notably fl->saddr) for the caller. */
static struct rtable *gre_get_rt(struct sk_buff *skb,
				 struct net_device *dev,
				 struct flowi4 *fl,
				 const struct ip_tunnel_key *key)
{
	struct net *net = dev_net(dev);

	memset(fl, 0, sizeof(*fl));
	fl->daddr = key->u.ipv4.dst;
	fl->saddr = key->u.ipv4.src;
	fl->flowi4_tos = RT_TOS(key->tos);
	fl->flowi4_mark = skb->mark;
	fl->flowi4_proto = IPPROTO_GRE;

	return ip_route_output_key(net, fl);
}
/* Flow-based (collect-metadata) transmit path: route, build and send one
 * packet entirely from the per-skb tunnel metadata instead of device
 * configuration. Consumes the skb; errors are counted as tx_dropped. */
static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
			__be16 proto)
{
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	struct rtable *rt = NULL;
	struct flowi4 fl;
	int min_headroom;
	int tunnel_hlen;
	__be16 df, flags;
	bool use_cache;
	int err;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto err_free_skb;

	key = &tun_info->key;
	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr);
	if (!rt) {
		rt = gre_get_rt(skb, dev, &fl, key);
		if (IS_ERR(rt))
			goto err_free_skb;
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl.saddr);
	}

	tunnel_hlen = gre_calc_hlen(key->tun_flags);

	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
			+ tunnel_hlen + sizeof(struct iphdr);
	if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
		int head_delta = SKB_DATA_ALIGN(min_headroom -
						skb_headroom(skb) +
						16);
		err = pskb_expand_head(skb, max_t(int, head_delta, 0),
				       0, GFP_ATOMIC);
		if (unlikely(err))
			goto err_free_rt;
	}

	/* Push Tunnel header. */
	if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
		goto err_free_rt;

	flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
	gre_build_header(skb, tunnel_hlen, flags, proto,
			 tunnel_id_to_key(tun_info->key.tun_id), 0);

	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;

	iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
		      key->tos, key->ttl, df, false);

	return;

err_free_rt:
	ip_rt_put(rt);
err_free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
}
/* ndo_fill_metadata_dst: resolve the route the packet would take and store
 * the selected source address back into the tunnel metadata. */
static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	struct rtable *rt;
	struct flowi4 fl4;

	if (ip_tunnel_info_af(info) != AF_INET)
		return -EINVAL;

	rt = gre_get_rt(skb, dev, &fl4, &info->key);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	ip_rt_put(rt);
	info->key.u.ipv4.src = fl4.saddr;
	return 0;
}
/* ndo_start_xmit for layer-3 GRE devices. Metadata tunnels go through the
 * flow-based path; otherwise the outer IP parameters come either from a
 * header prebuilt by ipgre_header() or from the device configuration. */
static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
			      struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *tnl_params;

	if (tunnel->collect_md) {
		gre_fb_xmit(skb, dev, skb->protocol);
		return NETDEV_TX_OK;
	}

	if (dev->header_ops) {
		/* Need space for new headers */
		if (skb_cow_head(skb, dev->needed_headroom -
				      (tunnel->hlen + sizeof(struct iphdr))))
			goto free_skb;

		/* Outer IP header was prebuilt by ipgre_header(). */
		tnl_params = (const struct iphdr *)skb->data;

		/* Pull skb since ip_tunnel_xmit() needs skb->data pointing
		 * to gre header.
		 */
		skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
		skb_reset_mac_header(skb);
	} else {
		if (skb_cow_head(skb, dev->needed_headroom))
			goto free_skb;

		tnl_params = &tunnel->parms.iph;
	}

	if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
		goto free_skb;

	__gre_xmit(skb, dev, tnl_params, skb->protocol);
	return NETDEV_TX_OK;

free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}
/* ndo_start_xmit for gretap (ethernet-over-GRE) devices; inner protocol is
 * always ETH_P_TEB. */
static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
				struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	if (tunnel->collect_md) {
		gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
		return NETDEV_TX_OK;
	}

	if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
		goto free_skb;

	if (skb_cow_head(skb, dev->needed_headroom))
		goto free_skb;

	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
	return NETDEV_TX_OK;

free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}
/* Legacy SIOC{ADD,CHG,DEL,GET}TUNNEL ioctl handler: validate the userspace
 * parameters, convert between on-wire GRE flags and internal TUNNEL_* flags
 * around the generic ip_tunnel_ioctl(), and copy the result back. */
static int ipgre_tunnel_ioctl(struct net_device *dev,
			      struct ifreq *ifr, int cmd)
{
	int err;
	struct ip_tunnel_parm p;

	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
		return -EFAULT;
	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
		/* Only IPv4/GRE without options, DF-only frag_off and no
		 * routing/version bits are accepted. */
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			return -EINVAL;
	}

	p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
	p.o_flags = gre_flags_to_tnl_flags(p.o_flags);

	err = ip_tunnel_ioctl(dev, &p, cmd);
	if (err)
		return err;

	p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags);
	p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags);

	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
		return -EFAULT;
	return 0;
}
584 /* Nice toy. Unfortunately, useless in real life :-)
585 It allows to construct virtual multiprotocol broadcast "LAN"
586 over the Internet, provided multicast routing is tuned.
589 I have no idea was this bicycle invented before me,
590 so that I had to set ARPHRD_IPGRE to a random value.
591 I have an impression, that Cisco could make something similar,
592 but this feature is apparently missing in IOS<=11.2(8).
594 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
595 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
597 ping -t 255 224.66.66.66
599 If nobody answers, mbone does not work.
601 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
602 ip addr add 10.66.66.<somewhat>/24 dev Universe
604 ifconfig Universe add fe80::<Your_real_addr>/10
605 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
608 ftp fec0:6666:6666::193.233.7.65
/* header_ops->create: prebuild the outer IP + GRE header on the skb.
 * Returns the pushed length, or its negative when the destination is still
 * unknown (hard_header contract). */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph;
	struct gre_base_hdr *greh;

	iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph));
	greh = (struct gre_base_hdr *)(iph+1);
	greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
	greh->protocol = htons(type);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));

	/* Set the source hardware address. */
	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen + sizeof(*iph);

	return -(t->hlen + sizeof(*iph));
}
/* header_ops->parse: the "hardware address" of a GRE device is the 4-byte
 * outer IPv4 source address. */
static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);
	return 4;
}
/* Link-layer header ops used by NOARP/broadcast GRE devices. */
static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};
#ifdef CONFIG_NET_IPGRE_BROADCAST
/* ndo_open: for a multicast destination, resolve the underlying device and
 * join the multicast group on it. */
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(t->net, &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (!__in_dev_get_rtnl(dev))
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}
/* ndo_stop: leave the multicast group joined in ipgre_open(), if any. */
static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(t->net, t->mlink);
		if (in_dev)
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
	}
	return 0;
}
#endif
/* Device operations for layer-3 "gre" tunnel devices. */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ip_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
};
/* Baseline hardware features advertised by every GRE device. */
#define GRE_FEATURES	(NETIF_F_SG |		\
			 NETIF_F_FRAGLIST |	\
			 NETIF_F_HIGHDMA |	\
			 NETIF_F_HW_CSUM)
/* rtnl setup callback for "gre" devices. */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->type		= ARPHRD_IPGRE;
	ip_tunnel_setup(dev, ipgre_net_id);
}
/* Shared init for gre and gretap devices: compute header lengths from the
 * output flags and set MTU, headroom and offload feature bits accordingly. */
static void __gre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	int t_hlen;

	tunnel = netdev_priv(dev);
	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
	tunnel->parms.iph.protocol = IPPROTO_GRE;

	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;

	t_hlen = tunnel->hlen + sizeof(struct iphdr);

	/* The extra 4 bytes leave room for an optional outer VLAN tag. */
	dev->needed_headroom	= LL_MAX_HEADER + t_hlen + 4;
	dev->mtu		= ETH_DATA_LEN - t_hlen - 4;

	dev->features		|= GRE_FEATURES;
	dev->hw_features	|= GRE_FEATURES;

	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
		/* TCP offload with GRE SEQ is not supported, nor
		 * can we support 2 levels of outer headers requiring
		 * an update.
		 */
		if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
		    (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
			dev->features    |= NETIF_F_GSO_SOFTWARE;
			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
		}

		/* Can use a lockless transmit, unless we generate
		 * output sequences
		 */
		dev->features |= NETIF_F_LLTX;
	}
}
/* ndo_init for layer-3 gre devices: derive dev/broadcast addresses from the
 * configured endpoints and pick header_ops for point-to-point or broadcast
 * operation. */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	__gre_tunnel_init(dev);

	memcpy(dev->dev_addr, &iph->saddr, 4);
	memcpy(dev->broadcast, &iph->daddr, 4);

	dev->flags		= IFF_NOARP;
	netif_keep_dst(dev);
	dev->addr_len		= 4;

	if (iph->daddr && !tunnel->collect_md) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else if (!tunnel->collect_md) {
		dev->header_ops = &ipgre_header_ops;
	}

	return ip_tunnel_init(dev);
}
/* GRE protocol hooks registered with the shared GRE demux. */
static const struct gre_protocol ipgre_protocol = {
	.handler     = gre_rcv,
	.err_handler = gre_err,
};
/* Per-netns init for the plain ipgre table (no default device name). */
static int __net_init ipgre_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}
/* Per-netns teardown for the plain ipgre table. */
static void __net_exit ipgre_exit_net(struct net *net)
{
	struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);
	ip_tunnel_delete_net(itn, &ipgre_link_ops);
}
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ip_tunnel_net),
};
/* Netlink validation for "gre" links: reject routing/version GRE flag bits
 * and the combination of collect-metadata with an FOU/GUE encap type. */
static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be16 flags;

	if (!data)
		return 0;

	flags = 0;
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (flags & (GRE_VERSION|GRE_ROUTING))
		return -EINVAL;

	if (data[IFLA_GRE_COLLECT_METADATA] &&
	    data[IFLA_GRE_ENCAP_TYPE] &&
	    nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
		return -EINVAL;

	return 0;
}
/* Netlink validation for "gretap" links: additionally check the ethernet
 * address and require a non-zero remote, then defer to the gre validator. */
static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data);
}
/* Translate IFLA_GRE_* netlink attributes into ip_tunnel_parm and per-device
 * state (collect_md, ignore_df). Returns 0 or -EINVAL on conflicting
 * DF/ignore_df settings. */
static int ipgre_netlink_parms(struct net_device *dev,
				struct nlattr *data[],
				struct nlattr *tb[],
				struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *t = netdev_priv(dev);

	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return 0;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	/* PMTU discovery (the default) forces DF, which conflicts with
	 * ignore_df. */
	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
		if (t->ignore_df)
			return -EINVAL;
		parms->iph.frag_off = htons(IP_DF);
	}

	if (data[IFLA_GRE_COLLECT_METADATA]) {
		t->collect_md = true;
		if (dev->type == ARPHRD_IPGRE)
			dev->type = ARPHRD_NONE;
	}

	if (data[IFLA_GRE_IGNORE_DF]) {
		if (nla_get_u8(data[IFLA_GRE_IGNORE_DF])
		    && (parms->iph.frag_off & htons(IP_DF)))
			return -EINVAL;
		t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
	}

	return 0;
}
/* This function returns true when ENCAP attributes are present in the nl msg */
static bool ipgre_netlink_encap_parms(struct nlattr *data[],
				      struct ip_tunnel_encap *ipencap)
{
	bool ret = false;

	memset(ipencap, 0, sizeof(*ipencap));

	if (!data)
		return ret;

	if (data[IFLA_GRE_ENCAP_TYPE]) {
		ret = true;
		ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
	}

	if (data[IFLA_GRE_ENCAP_FLAGS]) {
		ret = true;
		ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
	}

	if (data[IFLA_GRE_ENCAP_SPORT]) {
		ret = true;
		ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
	}

	if (data[IFLA_GRE_ENCAP_DPORT]) {
		ret = true;
		ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
	}

	return ret;
}
/* ndo_init for gretap devices. */
static int gre_tap_init(struct net_device *dev)
{
	__gre_tunnel_init(dev);
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;

	return ip_tunnel_init(dev);
}
/* Device operations for ethernet-over-GRE (gretap) devices. */
static const struct net_device_ops gre_tap_netdev_ops = {
	.ndo_init		= gre_tap_init,
	.ndo_uninit		= ip_tunnel_uninit,
	.ndo_start_xmit		= gre_tap_xmit,
	.ndo_set_mac_address 	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
};
/* rtnl setup callback for "gretap" devices: an ethernet device on top of
 * the GRE tunnel infrastructure. */
static void ipgre_tap_setup(struct net_device *dev)
{
	ether_setup(dev);
	dev->netdev_ops		= &gre_tap_netdev_ops;
	dev->priv_flags		&= ~IFF_TX_SKB_SHARING;
	dev->priv_flags		|= IFF_LIVE_ADDR_CHANGE;
	ip_tunnel_setup(dev, gre_tap_net_id);
}
/* rtnl newlink: apply optional encap attributes, parse tunnel parameters
 * and register the device. */
static int ipgre_newlink(struct net *src_net, struct net_device *dev,
			 struct nlattr *tb[], struct nlattr *data[])
{
	struct ip_tunnel_parm p;
	struct ip_tunnel_encap ipencap;
	int err;

	if (ipgre_netlink_encap_parms(data, &ipencap)) {
		struct ip_tunnel *t = netdev_priv(dev);
		err = ip_tunnel_encap_setup(t, &ipencap);

		if (err < 0)
			return err;
	}

	err = ipgre_netlink_parms(dev, data, tb, &p);
	if (err < 0)
		return err;
	return ip_tunnel_newlink(dev, tb, &p);
}
/* rtnl changelink: same parsing as newlink, but reconfigures an existing
 * device. */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel_parm p;
	struct ip_tunnel_encap ipencap;
	int err;

	if (ipgre_netlink_encap_parms(data, &ipencap)) {
		struct ip_tunnel *t = netdev_priv(dev);
		err = ip_tunnel_encap_setup(t, &ipencap);

		if (err < 0)
			return err;
	}

	err = ipgre_netlink_parms(dev, data, tb, &p);
	if (err < 0)
		return err;
	return ip_tunnel_changelink(dev, tb, &p);
}
/* Upper bound on the netlink attribute space needed by ipgre_fill_info(). */
static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		/* IFLA_GRE_ENCAP_TYPE */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_FLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_SPORT */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_DPORT */
		nla_total_size(2) +
		/* IFLA_GRE_COLLECT_METADATA */
		nla_total_size(0) +
		/* IFLA_GRE_IGNORE_DF */
		nla_total_size(1) +
		0;
}
/* Dump the tunnel configuration as IFLA_GRE_* netlink attributes. */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
	    nla_put_be16(skb, IFLA_GRE_IFLAGS,
			 gre_tnl_flags_to_gre_flags(p->i_flags)) ||
	    nla_put_be16(skb, IFLA_GRE_OFLAGS,
			 gre_tnl_flags_to_gre_flags(p->o_flags)) ||
	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
	    nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
	    nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
		       !!(p->iph.frag_off & htons(IP_DF))))
		goto nla_put_failure;

	if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
			t->encap.type) ||
	    nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
			 t->encap.sport) ||
	    nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
			 t->encap.dport) ||
	    nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
			t->encap.flags))
		goto nla_put_failure;

	if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
		goto nla_put_failure;

	if (t->collect_md) {
		if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
			goto nla_put_failure;
	}

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
/* Netlink attribute policy shared by the gre and gretap link types. */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
	[IFLA_GRE_ENCAP_TYPE]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_FLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_SPORT]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_DPORT]	= { .type = NLA_U16 },
	[IFLA_GRE_COLLECT_METADATA]	= { .type = NLA_FLAG },
	[IFLA_GRE_IGNORE_DF]	= { .type = NLA_U8 },
};
/* rtnl link type "gre" (layer-3 tunnel devices). */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};
/* rtnl link type "gretap" (ethernet-over-GRE devices). */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};
/* Create a flow-based (collect-metadata) gretap device, as used by
 * openvswitch. Returns the device or an ERR_PTR. */
struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
					u8 name_assign_type)
{
	struct nlattr *tb[IFLA_MAX + 1];
	struct net_device *dev;
	struct ip_tunnel *t;
	int err;

	memset(&tb, 0, sizeof(tb));

	dev = rtnl_create_link(net, name, name_assign_type,
			       &ipgre_tap_ops, tb);
	if (IS_ERR(dev))
		return dev;

	/* Configure flow based GRE device. */
	t = netdev_priv(dev);
	t->collect_md = true;

	err = ipgre_newlink(net, dev, tb, NULL);
	if (err < 0)
		goto out;

	/* openvswitch users expect packet sizes to be unrestricted,
	 * so set the largest MTU we can.
	 */
	err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
	if (err)
		goto out;

	return dev;
out:
	free_netdev(dev);
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
/* Per-netns init for the gretap table; registers a default "gretap0". */
static int __net_init ipgre_tap_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
}
/* Per-netns teardown for the gretap table. */
static void __net_exit ipgre_tap_exit_net(struct net *net)
{
	struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);
	ip_tunnel_delete_net(itn, &ipgre_tap_ops);
}
static struct pernet_operations ipgre_tap_net_ops = {
	.init = ipgre_tap_init_net,
	.exit = ipgre_tap_exit_net,
	.id   = &gre_tap_net_id,
	.size = sizeof(struct ip_tunnel_net),
};
1207 static int __init
ipgre_init(void)
1211 pr_info("GRE over IPv4 tunneling driver\n");
1213 err
= register_pernet_device(&ipgre_net_ops
);
1217 err
= register_pernet_device(&ipgre_tap_net_ops
);
1219 goto pnet_tap_faied
;
1221 err
= gre_add_protocol(&ipgre_protocol
, GREPROTO_CISCO
);
1223 pr_info("%s: can't add protocol\n", __func__
);
1224 goto add_proto_failed
;
1227 err
= rtnl_link_register(&ipgre_link_ops
);
1229 goto rtnl_link_failed
;
1231 err
= rtnl_link_register(&ipgre_tap_ops
);
1233 goto tap_ops_failed
;
1238 rtnl_link_unregister(&ipgre_link_ops
);
1240 gre_del_protocol(&ipgre_protocol
, GREPROTO_CISCO
);
1242 unregister_pernet_device(&ipgre_tap_net_ops
);
1244 unregister_pernet_device(&ipgre_net_ops
);
/* Module exit: tear everything down in reverse registration order. */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
	unregister_pernet_device(&ipgre_tap_net_ops);
	unregister_pernet_device(&ipgre_net_ops);
}
/* Module registration and the aliases that allow auto-loading via the
 * "gre"/"gretap" link kinds and default device names. */
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");
MODULE_ALIAS_NETDEV("gretap0");