2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
52 #include <linux/rtnetlink.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
58 #include <asm/uaccess.h>
61 #include <linux/sysctl.h>
/* Set to 3 to get tracing. */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif
75 static struct rt6_info
* ip6_rt_copy(struct rt6_info
*ort
);
76 static struct dst_entry
*ip6_dst_check(struct dst_entry
*dst
, u32 cookie
);
77 static unsigned int ip6_default_advmss(const struct dst_entry
*dst
);
78 static unsigned int ip6_default_mtu(const struct dst_entry
*dst
);
79 static struct dst_entry
*ip6_negative_advice(struct dst_entry
*);
80 static void ip6_dst_destroy(struct dst_entry
*);
81 static void ip6_dst_ifdown(struct dst_entry
*,
82 struct net_device
*dev
, int how
);
83 static int ip6_dst_gc(struct dst_ops
*ops
);
85 static int ip6_pkt_discard(struct sk_buff
*skb
);
86 static int ip6_pkt_discard_out(struct sk_buff
*skb
);
87 static void ip6_link_failure(struct sk_buff
*skb
);
88 static void ip6_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
);
#ifdef CONFIG_IPV6_ROUTE_INFO
/* RFC 4191 Route Information option handling (see rt6_route_rcv below). */
static struct rt6_info *rt6_add_route_info(struct net *net,
					   struct in6_addr *prefix, int prefixlen,
					   struct in6_addr *gwaddr, int ifindex,
					   unsigned pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   struct in6_addr *prefix, int prefixlen,
					   struct in6_addr *gwaddr, int ifindex);
#endif
100 static u32
*ipv6_cow_metrics(struct dst_entry
*dst
, unsigned long old
)
102 struct rt6_info
*rt
= (struct rt6_info
*) dst
;
103 struct inet_peer
*peer
;
107 rt6_bind_peer(rt
, 1);
109 peer
= rt
->rt6i_peer
;
111 u32
*old_p
= __DST_METRICS_PTR(old
);
112 unsigned long prev
, new;
115 if (inet_metrics_new(peer
))
116 memcpy(p
, old_p
, sizeof(u32
) * RTAX_MAX
);
118 new = (unsigned long) p
;
119 prev
= cmpxchg(&dst
->_metrics
, old
, new);
122 p
= __DST_METRICS_PTR(prev
);
123 if (prev
& DST_METRICS_READ_ONLY
)
130 static struct dst_ops ip6_dst_ops_template
= {
132 .protocol
= cpu_to_be16(ETH_P_IPV6
),
135 .check
= ip6_dst_check
,
136 .default_advmss
= ip6_default_advmss
,
137 .default_mtu
= ip6_default_mtu
,
138 .cow_metrics
= ipv6_cow_metrics
,
139 .destroy
= ip6_dst_destroy
,
140 .ifdown
= ip6_dst_ifdown
,
141 .negative_advice
= ip6_negative_advice
,
142 .link_failure
= ip6_link_failure
,
143 .update_pmtu
= ip6_rt_update_pmtu
,
144 .local_out
= __ip6_local_out
,
147 static unsigned int ip6_blackhole_default_mtu(const struct dst_entry
*dst
)
152 static void ip6_rt_blackhole_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
156 static struct dst_ops ip6_dst_blackhole_ops
= {
158 .protocol
= cpu_to_be16(ETH_P_IPV6
),
159 .destroy
= ip6_dst_destroy
,
160 .check
= ip6_dst_check
,
161 .default_mtu
= ip6_blackhole_default_mtu
,
162 .default_advmss
= ip6_default_advmss
,
163 .update_pmtu
= ip6_rt_blackhole_update_pmtu
,
166 static const u32 ip6_template_metrics
[RTAX_MAX
] = {
167 [RTAX_HOPLIMIT
- 1] = 255,
170 static struct rt6_info ip6_null_entry_template
= {
172 .__refcnt
= ATOMIC_INIT(1),
175 .error
= -ENETUNREACH
,
176 .input
= ip6_pkt_discard
,
177 .output
= ip6_pkt_discard_out
,
179 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
180 .rt6i_protocol
= RTPROT_KERNEL
,
181 .rt6i_metric
= ~(u32
) 0,
182 .rt6i_ref
= ATOMIC_INIT(1),
185 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);
190 static struct rt6_info ip6_prohibit_entry_template
= {
192 .__refcnt
= ATOMIC_INIT(1),
196 .input
= ip6_pkt_prohibit
,
197 .output
= ip6_pkt_prohibit_out
,
199 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
200 .rt6i_protocol
= RTPROT_KERNEL
,
201 .rt6i_metric
= ~(u32
) 0,
202 .rt6i_ref
= ATOMIC_INIT(1),
205 static struct rt6_info ip6_blk_hole_entry_template
= {
207 .__refcnt
= ATOMIC_INIT(1),
211 .input
= dst_discard
,
212 .output
= dst_discard
,
214 .rt6i_flags
= (RTF_REJECT
| RTF_NONEXTHOP
),
215 .rt6i_protocol
= RTPROT_KERNEL
,
216 .rt6i_metric
= ~(u32
) 0,
217 .rt6i_ref
= ATOMIC_INIT(1),
/* allocate dst with ip6_dst_ops */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	return (struct rt6_info *)dst_alloc(ops, 0);
}
228 static void ip6_dst_destroy(struct dst_entry
*dst
)
230 struct rt6_info
*rt
= (struct rt6_info
*)dst
;
231 struct inet6_dev
*idev
= rt
->rt6i_idev
;
232 struct inet_peer
*peer
= rt
->rt6i_peer
;
235 rt
->rt6i_idev
= NULL
;
239 rt
->rt6i_peer
= NULL
;
244 static atomic_t __rt6_peer_genid
= ATOMIC_INIT(0);
246 static u32
rt6_peer_genid(void)
248 return atomic_read(&__rt6_peer_genid
);
251 void rt6_bind_peer(struct rt6_info
*rt
, int create
)
253 struct inet_peer
*peer
;
255 peer
= inet_getpeer_v6(&rt
->rt6i_dst
.addr
, create
);
256 if (peer
&& cmpxchg(&rt
->rt6i_peer
, NULL
, peer
) != NULL
)
259 rt
->rt6i_peer_genid
= rt6_peer_genid();
262 static void ip6_dst_ifdown(struct dst_entry
*dst
, struct net_device
*dev
,
265 struct rt6_info
*rt
= (struct rt6_info
*)dst
;
266 struct inet6_dev
*idev
= rt
->rt6i_idev
;
267 struct net_device
*loopback_dev
=
268 dev_net(dev
)->loopback_dev
;
270 if (dev
!= loopback_dev
&& idev
!= NULL
&& idev
->dev
== dev
) {
271 struct inet6_dev
*loopback_idev
=
272 in6_dev_get(loopback_dev
);
273 if (loopback_idev
!= NULL
) {
274 rt
->rt6i_idev
= loopback_idev
;
280 static __inline__
int rt6_check_expired(const struct rt6_info
*rt
)
282 return (rt
->rt6i_flags
& RTF_EXPIRES
) &&
283 time_after(jiffies
, rt
->rt6i_expires
);
286 static inline int rt6_need_strict(struct in6_addr
*daddr
)
288 return ipv6_addr_type(daddr
) &
289 (IPV6_ADDR_MULTICAST
| IPV6_ADDR_LINKLOCAL
| IPV6_ADDR_LOOPBACK
);
293 * Route lookup. Any table->tb6_lock is implied.
296 static inline struct rt6_info
*rt6_device_match(struct net
*net
,
298 struct in6_addr
*saddr
,
302 struct rt6_info
*local
= NULL
;
303 struct rt6_info
*sprt
;
305 if (!oif
&& ipv6_addr_any(saddr
))
308 for (sprt
= rt
; sprt
; sprt
= sprt
->dst
.rt6_next
) {
309 struct net_device
*dev
= sprt
->rt6i_dev
;
312 if (dev
->ifindex
== oif
)
314 if (dev
->flags
& IFF_LOOPBACK
) {
315 if (sprt
->rt6i_idev
== NULL
||
316 sprt
->rt6i_idev
->dev
->ifindex
!= oif
) {
317 if (flags
& RT6_LOOKUP_F_IFACE
&& oif
)
319 if (local
&& (!oif
||
320 local
->rt6i_idev
->dev
->ifindex
== oif
))
326 if (ipv6_chk_addr(net
, saddr
, dev
,
327 flags
& RT6_LOOKUP_F_IFACE
))
336 if (flags
& RT6_LOOKUP_F_IFACE
)
337 return net
->ipv6
.ip6_null_entry
;
343 #ifdef CONFIG_IPV6_ROUTER_PREF
344 static void rt6_probe(struct rt6_info
*rt
)
346 struct neighbour
*neigh
= rt
? rt
->rt6i_nexthop
: NULL
;
348 * Okay, this does not seem to be appropriate
349 * for now, however, we need to check if it
350 * is really so; aka Router Reachability Probing.
352 * Router Reachability Probe MUST be rate-limited
353 * to no more than one per minute.
355 if (!neigh
|| (neigh
->nud_state
& NUD_VALID
))
357 read_lock_bh(&neigh
->lock
);
358 if (!(neigh
->nud_state
& NUD_VALID
) &&
359 time_after(jiffies
, neigh
->updated
+ rt
->rt6i_idev
->cnf
.rtr_probe_interval
)) {
360 struct in6_addr mcaddr
;
361 struct in6_addr
*target
;
363 neigh
->updated
= jiffies
;
364 read_unlock_bh(&neigh
->lock
);
366 target
= (struct in6_addr
*)&neigh
->primary_key
;
367 addrconf_addr_solict_mult(target
, &mcaddr
);
368 ndisc_send_ns(rt
->rt6i_dev
, NULL
, target
, &mcaddr
, NULL
);
370 read_unlock_bh(&neigh
->lock
);
373 static inline void rt6_probe(struct rt6_info
*rt
)
379 * Default Router Selection (RFC 2461 6.3.6)
381 static inline int rt6_check_dev(struct rt6_info
*rt
, int oif
)
383 struct net_device
*dev
= rt
->rt6i_dev
;
384 if (!oif
|| dev
->ifindex
== oif
)
386 if ((dev
->flags
& IFF_LOOPBACK
) &&
387 rt
->rt6i_idev
&& rt
->rt6i_idev
->dev
->ifindex
== oif
)
392 static inline int rt6_check_neigh(struct rt6_info
*rt
)
394 struct neighbour
*neigh
= rt
->rt6i_nexthop
;
396 if (rt
->rt6i_flags
& RTF_NONEXTHOP
||
397 !(rt
->rt6i_flags
& RTF_GATEWAY
))
400 read_lock_bh(&neigh
->lock
);
401 if (neigh
->nud_state
& NUD_VALID
)
403 #ifdef CONFIG_IPV6_ROUTER_PREF
404 else if (neigh
->nud_state
& NUD_FAILED
)
409 read_unlock_bh(&neigh
->lock
);
415 static int rt6_score_route(struct rt6_info
*rt
, int oif
,
420 m
= rt6_check_dev(rt
, oif
);
421 if (!m
&& (strict
& RT6_LOOKUP_F_IFACE
))
423 #ifdef CONFIG_IPV6_ROUTER_PREF
424 m
|= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt
->rt6i_flags
)) << 2;
426 n
= rt6_check_neigh(rt
);
427 if (!n
&& (strict
& RT6_LOOKUP_F_REACHABLE
))
432 static struct rt6_info
*find_match(struct rt6_info
*rt
, int oif
, int strict
,
433 int *mpri
, struct rt6_info
*match
)
437 if (rt6_check_expired(rt
))
440 m
= rt6_score_route(rt
, oif
, strict
);
445 if (strict
& RT6_LOOKUP_F_REACHABLE
)
449 } else if (strict
& RT6_LOOKUP_F_REACHABLE
) {
457 static struct rt6_info
*find_rr_leaf(struct fib6_node
*fn
,
458 struct rt6_info
*rr_head
,
459 u32 metric
, int oif
, int strict
)
461 struct rt6_info
*rt
, *match
;
465 for (rt
= rr_head
; rt
&& rt
->rt6i_metric
== metric
;
466 rt
= rt
->dst
.rt6_next
)
467 match
= find_match(rt
, oif
, strict
, &mpri
, match
);
468 for (rt
= fn
->leaf
; rt
&& rt
!= rr_head
&& rt
->rt6i_metric
== metric
;
469 rt
= rt
->dst
.rt6_next
)
470 match
= find_match(rt
, oif
, strict
, &mpri
, match
);
475 static struct rt6_info
*rt6_select(struct fib6_node
*fn
, int oif
, int strict
)
477 struct rt6_info
*match
, *rt0
;
480 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
481 __func__
, fn
->leaf
, oif
);
485 fn
->rr_ptr
= rt0
= fn
->leaf
;
487 match
= find_rr_leaf(fn
, rt0
, rt0
->rt6i_metric
, oif
, strict
);
490 (strict
& RT6_LOOKUP_F_REACHABLE
)) {
491 struct rt6_info
*next
= rt0
->dst
.rt6_next
;
493 /* no entries matched; do round-robin */
494 if (!next
|| next
->rt6i_metric
!= rt0
->rt6i_metric
)
501 RT6_TRACE("%s() => %p\n",
504 net
= dev_net(rt0
->rt6i_dev
);
505 return match
? match
: net
->ipv6
.ip6_null_entry
;
508 #ifdef CONFIG_IPV6_ROUTE_INFO
509 int rt6_route_rcv(struct net_device
*dev
, u8
*opt
, int len
,
510 struct in6_addr
*gwaddr
)
512 struct net
*net
= dev_net(dev
);
513 struct route_info
*rinfo
= (struct route_info
*) opt
;
514 struct in6_addr prefix_buf
, *prefix
;
516 unsigned long lifetime
;
519 if (len
< sizeof(struct route_info
)) {
523 /* Sanity check for prefix_len and length */
524 if (rinfo
->length
> 3) {
526 } else if (rinfo
->prefix_len
> 128) {
528 } else if (rinfo
->prefix_len
> 64) {
529 if (rinfo
->length
< 2) {
532 } else if (rinfo
->prefix_len
> 0) {
533 if (rinfo
->length
< 1) {
538 pref
= rinfo
->route_pref
;
539 if (pref
== ICMPV6_ROUTER_PREF_INVALID
)
542 lifetime
= addrconf_timeout_fixup(ntohl(rinfo
->lifetime
), HZ
);
544 if (rinfo
->length
== 3)
545 prefix
= (struct in6_addr
*)rinfo
->prefix
;
547 /* this function is safe */
548 ipv6_addr_prefix(&prefix_buf
,
549 (struct in6_addr
*)rinfo
->prefix
,
551 prefix
= &prefix_buf
;
554 rt
= rt6_get_route_info(net
, prefix
, rinfo
->prefix_len
, gwaddr
,
557 if (rt
&& !lifetime
) {
563 rt
= rt6_add_route_info(net
, prefix
, rinfo
->prefix_len
, gwaddr
, dev
->ifindex
,
566 rt
->rt6i_flags
= RTF_ROUTEINFO
|
567 (rt
->rt6i_flags
& ~RTF_PREF_MASK
) | RTF_PREF(pref
);
570 if (!addrconf_finite_timeout(lifetime
)) {
571 rt
->rt6i_flags
&= ~RTF_EXPIRES
;
573 rt
->rt6i_expires
= jiffies
+ HZ
* lifetime
;
574 rt
->rt6i_flags
|= RTF_EXPIRES
;
576 dst_release(&rt
->dst
);
/*
 * When the lookup landed on the null entry, climb the fib6 tree (descending
 * into source subtrees) until a node carrying route info is found, then jump
 * back to the caller's "restart" label; bail out at the tree root.
 */
#define BACKTRACK(__net, saddr)			\
do { \
	if (rt == __net->ipv6.ip6_null_entry) {	\
		struct fib6_node *pn; \
		while (1) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while(0)
600 static struct rt6_info
*ip6_pol_route_lookup(struct net
*net
,
601 struct fib6_table
*table
,
602 struct flowi6
*fl6
, int flags
)
604 struct fib6_node
*fn
;
607 read_lock_bh(&table
->tb6_lock
);
608 fn
= fib6_lookup(&table
->tb6_root
, &fl6
->daddr
, &fl6
->saddr
);
611 rt
= rt6_device_match(net
, rt
, &fl6
->saddr
, fl6
->flowi6_oif
, flags
);
612 BACKTRACK(net
, &fl6
->saddr
);
614 dst_use(&rt
->dst
, jiffies
);
615 read_unlock_bh(&table
->tb6_lock
);
620 struct rt6_info
*rt6_lookup(struct net
*net
, const struct in6_addr
*daddr
,
621 const struct in6_addr
*saddr
, int oif
, int strict
)
623 struct flowi6 fl6
= {
627 struct dst_entry
*dst
;
628 int flags
= strict
? RT6_LOOKUP_F_IFACE
: 0;
631 memcpy(&fl6
.saddr
, saddr
, sizeof(*saddr
));
632 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
635 dst
= fib6_rule_lookup(net
, &fl6
, flags
, ip6_pol_route_lookup
);
637 return (struct rt6_info
*) dst
;
644 EXPORT_SYMBOL(rt6_lookup
);
646 /* ip6_ins_rt is called with FREE table->tb6_lock.
647 It takes new route entry, the addition fails by any reason the
648 route is freed. In any case, if caller does not hold it, it may
652 static int __ip6_ins_rt(struct rt6_info
*rt
, struct nl_info
*info
)
655 struct fib6_table
*table
;
657 table
= rt
->rt6i_table
;
658 write_lock_bh(&table
->tb6_lock
);
659 err
= fib6_add(&table
->tb6_root
, rt
, info
);
660 write_unlock_bh(&table
->tb6_lock
);
665 int ip6_ins_rt(struct rt6_info
*rt
)
667 struct nl_info info
= {
668 .nl_net
= dev_net(rt
->rt6i_dev
),
670 return __ip6_ins_rt(rt
, &info
);
673 static struct rt6_info
*rt6_alloc_cow(struct rt6_info
*ort
, struct in6_addr
*daddr
,
674 struct in6_addr
*saddr
)
682 rt
= ip6_rt_copy(ort
);
685 struct neighbour
*neigh
;
686 int attempts
= !in_softirq();
688 if (!(rt
->rt6i_flags
&RTF_GATEWAY
)) {
689 if (rt
->rt6i_dst
.plen
!= 128 &&
690 ipv6_addr_equal(&rt
->rt6i_dst
.addr
, daddr
))
691 rt
->rt6i_flags
|= RTF_ANYCAST
;
692 ipv6_addr_copy(&rt
->rt6i_gateway
, daddr
);
695 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, daddr
);
696 rt
->rt6i_dst
.plen
= 128;
697 rt
->rt6i_flags
|= RTF_CACHE
;
698 rt
->dst
.flags
|= DST_HOST
;
700 #ifdef CONFIG_IPV6_SUBTREES
701 if (rt
->rt6i_src
.plen
&& saddr
) {
702 ipv6_addr_copy(&rt
->rt6i_src
.addr
, saddr
);
703 rt
->rt6i_src
.plen
= 128;
708 neigh
= ndisc_get_neigh(rt
->rt6i_dev
, &rt
->rt6i_gateway
);
710 struct net
*net
= dev_net(rt
->rt6i_dev
);
711 int saved_rt_min_interval
=
712 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
713 int saved_rt_elasticity
=
714 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
716 if (attempts
-- > 0) {
717 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
= 1;
718 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
= 0;
720 ip6_dst_gc(&net
->ipv6
.ip6_dst_ops
);
722 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
=
724 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
=
725 saved_rt_min_interval
;
731 "ipv6: Neighbour table overflow.\n");
735 rt
->rt6i_nexthop
= neigh
;
742 static struct rt6_info
*rt6_alloc_clone(struct rt6_info
*ort
, struct in6_addr
*daddr
)
744 struct rt6_info
*rt
= ip6_rt_copy(ort
);
746 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, daddr
);
747 rt
->rt6i_dst
.plen
= 128;
748 rt
->rt6i_flags
|= RTF_CACHE
;
749 rt
->dst
.flags
|= DST_HOST
;
750 rt
->rt6i_nexthop
= neigh_clone(ort
->rt6i_nexthop
);
755 static struct rt6_info
*ip6_pol_route(struct net
*net
, struct fib6_table
*table
, int oif
,
756 struct flowi6
*fl6
, int flags
)
758 struct fib6_node
*fn
;
759 struct rt6_info
*rt
, *nrt
;
763 int reachable
= net
->ipv6
.devconf_all
->forwarding
? 0 : RT6_LOOKUP_F_REACHABLE
;
765 strict
|= flags
& RT6_LOOKUP_F_IFACE
;
768 read_lock_bh(&table
->tb6_lock
);
771 fn
= fib6_lookup(&table
->tb6_root
, &fl6
->daddr
, &fl6
->saddr
);
774 rt
= rt6_select(fn
, oif
, strict
| reachable
);
776 BACKTRACK(net
, &fl6
->saddr
);
777 if (rt
== net
->ipv6
.ip6_null_entry
||
778 rt
->rt6i_flags
& RTF_CACHE
)
782 read_unlock_bh(&table
->tb6_lock
);
784 if (!rt
->rt6i_nexthop
&& !(rt
->rt6i_flags
& RTF_NONEXTHOP
))
785 nrt
= rt6_alloc_cow(rt
, &fl6
->daddr
, &fl6
->saddr
);
786 else if (!(rt
->dst
.flags
& DST_HOST
))
787 nrt
= rt6_alloc_clone(rt
, &fl6
->daddr
);
791 dst_release(&rt
->dst
);
792 rt
= nrt
? : net
->ipv6
.ip6_null_entry
;
796 err
= ip6_ins_rt(nrt
);
805 * Race condition! In the gap, when table->tb6_lock was
806 * released someone could insert this route. Relookup.
808 dst_release(&rt
->dst
);
817 read_unlock_bh(&table
->tb6_lock
);
819 rt
->dst
.lastuse
= jiffies
;
825 static struct rt6_info
*ip6_pol_route_input(struct net
*net
, struct fib6_table
*table
,
826 struct flowi6
*fl6
, int flags
)
828 return ip6_pol_route(net
, table
, fl6
->flowi6_iif
, fl6
, flags
);
831 void ip6_route_input(struct sk_buff
*skb
)
833 struct ipv6hdr
*iph
= ipv6_hdr(skb
);
834 struct net
*net
= dev_net(skb
->dev
);
835 int flags
= RT6_LOOKUP_F_HAS_SADDR
;
836 struct flowi6 fl6
= {
837 .flowi6_iif
= skb
->dev
->ifindex
,
840 .flowlabel
= (* (__be32
*) iph
)&IPV6_FLOWINFO_MASK
,
841 .flowi6_mark
= skb
->mark
,
842 .flowi6_proto
= iph
->nexthdr
,
845 if (rt6_need_strict(&iph
->daddr
) && skb
->dev
->type
!= ARPHRD_PIMREG
)
846 flags
|= RT6_LOOKUP_F_IFACE
;
848 skb_dst_set(skb
, fib6_rule_lookup(net
, &fl6
, flags
, ip6_pol_route_input
));
851 static struct rt6_info
*ip6_pol_route_output(struct net
*net
, struct fib6_table
*table
,
852 struct flowi6
*fl6
, int flags
)
854 return ip6_pol_route(net
, table
, fl6
->flowi6_oif
, fl6
, flags
);
857 struct dst_entry
* ip6_route_output(struct net
*net
, const struct sock
*sk
,
862 if ((sk
&& sk
->sk_bound_dev_if
) || rt6_need_strict(&fl6
->daddr
))
863 flags
|= RT6_LOOKUP_F_IFACE
;
865 if (!ipv6_addr_any(&fl6
->saddr
))
866 flags
|= RT6_LOOKUP_F_HAS_SADDR
;
868 flags
|= rt6_srcprefs2flags(inet6_sk(sk
)->srcprefs
);
870 return fib6_rule_lookup(net
, fl6
, flags
, ip6_pol_route_output
);
873 EXPORT_SYMBOL(ip6_route_output
);
875 struct dst_entry
*ip6_blackhole_route(struct net
*net
, struct dst_entry
*dst_orig
)
877 struct rt6_info
*rt
= dst_alloc(&ip6_dst_blackhole_ops
, 1);
878 struct rt6_info
*ort
= (struct rt6_info
*) dst_orig
;
879 struct dst_entry
*new = NULL
;
885 new->input
= dst_discard
;
886 new->output
= dst_discard
;
888 dst_copy_metrics(new, &ort
->dst
);
889 new->dev
= ort
->dst
.dev
;
892 rt
->rt6i_idev
= ort
->rt6i_idev
;
894 in6_dev_hold(rt
->rt6i_idev
);
895 rt
->rt6i_expires
= 0;
897 ipv6_addr_copy(&rt
->rt6i_gateway
, &ort
->rt6i_gateway
);
898 rt
->rt6i_flags
= ort
->rt6i_flags
& ~RTF_EXPIRES
;
901 memcpy(&rt
->rt6i_dst
, &ort
->rt6i_dst
, sizeof(struct rt6key
));
902 #ifdef CONFIG_IPV6_SUBTREES
903 memcpy(&rt
->rt6i_src
, &ort
->rt6i_src
, sizeof(struct rt6key
));
909 dst_release(dst_orig
);
910 return new ? new : ERR_PTR(-ENOMEM
);
914 * Destination cache support functions
917 static struct dst_entry
*ip6_dst_check(struct dst_entry
*dst
, u32 cookie
)
921 rt
= (struct rt6_info
*) dst
;
923 if (rt
->rt6i_node
&& (rt
->rt6i_node
->fn_sernum
== cookie
)) {
924 if (rt
->rt6i_peer_genid
!= rt6_peer_genid()) {
926 rt6_bind_peer(rt
, 0);
927 rt
->rt6i_peer_genid
= rt6_peer_genid();
934 static struct dst_entry
*ip6_negative_advice(struct dst_entry
*dst
)
936 struct rt6_info
*rt
= (struct rt6_info
*) dst
;
939 if (rt
->rt6i_flags
& RTF_CACHE
) {
940 if (rt6_check_expired(rt
)) {
952 static void ip6_link_failure(struct sk_buff
*skb
)
956 icmpv6_send(skb
, ICMPV6_DEST_UNREACH
, ICMPV6_ADDR_UNREACH
, 0);
958 rt
= (struct rt6_info
*) skb_dst(skb
);
960 if (rt
->rt6i_flags
&RTF_CACHE
) {
961 dst_set_expires(&rt
->dst
, 0);
962 rt
->rt6i_flags
|= RTF_EXPIRES
;
963 } else if (rt
->rt6i_node
&& (rt
->rt6i_flags
& RTF_DEFAULT
))
964 rt
->rt6i_node
->fn_sernum
= -1;
968 static void ip6_rt_update_pmtu(struct dst_entry
*dst
, u32 mtu
)
970 struct rt6_info
*rt6
= (struct rt6_info
*)dst
;
972 if (mtu
< dst_mtu(dst
) && rt6
->rt6i_dst
.plen
== 128) {
973 rt6
->rt6i_flags
|= RTF_MODIFIED
;
974 if (mtu
< IPV6_MIN_MTU
) {
975 u32 features
= dst_metric(dst
, RTAX_FEATURES
);
977 features
|= RTAX_FEATURE_ALLFRAG
;
978 dst_metric_set(dst
, RTAX_FEATURES
, features
);
980 dst_metric_set(dst
, RTAX_MTU
, mtu
);
984 static unsigned int ip6_default_advmss(const struct dst_entry
*dst
)
986 struct net_device
*dev
= dst
->dev
;
987 unsigned int mtu
= dst_mtu(dst
);
988 struct net
*net
= dev_net(dev
);
990 mtu
-= sizeof(struct ipv6hdr
) + sizeof(struct tcphdr
);
992 if (mtu
< net
->ipv6
.sysctl
.ip6_rt_min_advmss
)
993 mtu
= net
->ipv6
.sysctl
.ip6_rt_min_advmss
;
996 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
997 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
998 * IPV6_MAXPLEN is also valid and means: "any MSS,
999 * rely only on pmtu discovery"
1001 if (mtu
> IPV6_MAXPLEN
- sizeof(struct tcphdr
))
1006 static unsigned int ip6_default_mtu(const struct dst_entry
*dst
)
1008 unsigned int mtu
= IPV6_MIN_MTU
;
1009 struct inet6_dev
*idev
;
1012 idev
= __in6_dev_get(dst
->dev
);
1014 mtu
= idev
->cnf
.mtu6
;
/* List of ndisc/icmp dsts that are garbage collected outside the fib. */
static struct dst_entry *icmp6_dst_gc_list;
static DEFINE_SPINLOCK(icmp6_dst_lock);
1023 struct dst_entry
*icmp6_dst_alloc(struct net_device
*dev
,
1024 struct neighbour
*neigh
,
1025 const struct in6_addr
*addr
)
1027 struct rt6_info
*rt
;
1028 struct inet6_dev
*idev
= in6_dev_get(dev
);
1029 struct net
*net
= dev_net(dev
);
1031 if (unlikely(idev
== NULL
))
1034 rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
1035 if (unlikely(rt
== NULL
)) {
1044 neigh
= ndisc_get_neigh(dev
, addr
);
1050 rt
->rt6i_idev
= idev
;
1051 rt
->rt6i_nexthop
= neigh
;
1052 atomic_set(&rt
->dst
.__refcnt
, 1);
1053 dst_metric_set(&rt
->dst
, RTAX_HOPLIMIT
, 255);
1054 rt
->dst
.output
= ip6_output
;
1056 #if 0 /* there's no chance to use these for ndisc */
1057 rt
->dst
.flags
= ipv6_addr_type(addr
) & IPV6_ADDR_UNICAST
1060 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, addr
);
1061 rt
->rt6i_dst
.plen
= 128;
1064 spin_lock_bh(&icmp6_dst_lock
);
1065 rt
->dst
.next
= icmp6_dst_gc_list
;
1066 icmp6_dst_gc_list
= &rt
->dst
;
1067 spin_unlock_bh(&icmp6_dst_lock
);
1069 fib6_force_start_gc(net
);
1075 int icmp6_dst_gc(void)
1077 struct dst_entry
*dst
, **pprev
;
1080 spin_lock_bh(&icmp6_dst_lock
);
1081 pprev
= &icmp6_dst_gc_list
;
1083 while ((dst
= *pprev
) != NULL
) {
1084 if (!atomic_read(&dst
->__refcnt
)) {
1093 spin_unlock_bh(&icmp6_dst_lock
);
1098 static void icmp6_clean_all(int (*func
)(struct rt6_info
*rt
, void *arg
),
1101 struct dst_entry
*dst
, **pprev
;
1103 spin_lock_bh(&icmp6_dst_lock
);
1104 pprev
= &icmp6_dst_gc_list
;
1105 while ((dst
= *pprev
) != NULL
) {
1106 struct rt6_info
*rt
= (struct rt6_info
*) dst
;
1107 if (func(rt
, arg
)) {
1114 spin_unlock_bh(&icmp6_dst_lock
);
1117 static int ip6_dst_gc(struct dst_ops
*ops
)
1119 unsigned long now
= jiffies
;
1120 struct net
*net
= container_of(ops
, struct net
, ipv6
.ip6_dst_ops
);
1121 int rt_min_interval
= net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
1122 int rt_max_size
= net
->ipv6
.sysctl
.ip6_rt_max_size
;
1123 int rt_elasticity
= net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
1124 int rt_gc_timeout
= net
->ipv6
.sysctl
.ip6_rt_gc_timeout
;
1125 unsigned long rt_last_gc
= net
->ipv6
.ip6_rt_last_gc
;
1128 entries
= dst_entries_get_fast(ops
);
1129 if (time_after(rt_last_gc
+ rt_min_interval
, now
) &&
1130 entries
<= rt_max_size
)
1133 net
->ipv6
.ip6_rt_gc_expire
++;
1134 fib6_run_gc(net
->ipv6
.ip6_rt_gc_expire
, net
);
1135 net
->ipv6
.ip6_rt_last_gc
= now
;
1136 entries
= dst_entries_get_slow(ops
);
1137 if (entries
< ops
->gc_thresh
)
1138 net
->ipv6
.ip6_rt_gc_expire
= rt_gc_timeout
>>1;
1140 net
->ipv6
.ip6_rt_gc_expire
-= net
->ipv6
.ip6_rt_gc_expire
>>rt_elasticity
;
1141 return entries
> rt_max_size
;
1144 /* Clean host part of a prefix. Not necessary in radix tree,
1145 but results in cleaner routing tables.
1147 Remove it only when all the things will work!
1150 int ip6_dst_hoplimit(struct dst_entry
*dst
)
1152 int hoplimit
= dst_metric_raw(dst
, RTAX_HOPLIMIT
);
1153 if (hoplimit
== 0) {
1154 struct net_device
*dev
= dst
->dev
;
1155 struct inet6_dev
*idev
;
1158 idev
= __in6_dev_get(dev
);
1160 hoplimit
= idev
->cnf
.hop_limit
;
1162 hoplimit
= dev_net(dev
)->ipv6
.devconf_all
->hop_limit
;
1167 EXPORT_SYMBOL(ip6_dst_hoplimit
);
1173 int ip6_route_add(struct fib6_config
*cfg
)
1176 struct net
*net
= cfg
->fc_nlinfo
.nl_net
;
1177 struct rt6_info
*rt
= NULL
;
1178 struct net_device
*dev
= NULL
;
1179 struct inet6_dev
*idev
= NULL
;
1180 struct fib6_table
*table
;
1183 if (cfg
->fc_dst_len
> 128 || cfg
->fc_src_len
> 128)
1185 #ifndef CONFIG_IPV6_SUBTREES
1186 if (cfg
->fc_src_len
)
1189 if (cfg
->fc_ifindex
) {
1191 dev
= dev_get_by_index(net
, cfg
->fc_ifindex
);
1194 idev
= in6_dev_get(dev
);
1199 if (cfg
->fc_metric
== 0)
1200 cfg
->fc_metric
= IP6_RT_PRIO_USER
;
1202 table
= fib6_new_table(net
, cfg
->fc_table
);
1203 if (table
== NULL
) {
1208 rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
1215 rt
->dst
.obsolete
= -1;
1216 rt
->rt6i_expires
= (cfg
->fc_flags
& RTF_EXPIRES
) ?
1217 jiffies
+ clock_t_to_jiffies(cfg
->fc_expires
) :
1220 if (cfg
->fc_protocol
== RTPROT_UNSPEC
)
1221 cfg
->fc_protocol
= RTPROT_BOOT
;
1222 rt
->rt6i_protocol
= cfg
->fc_protocol
;
1224 addr_type
= ipv6_addr_type(&cfg
->fc_dst
);
1226 if (addr_type
& IPV6_ADDR_MULTICAST
)
1227 rt
->dst
.input
= ip6_mc_input
;
1228 else if (cfg
->fc_flags
& RTF_LOCAL
)
1229 rt
->dst
.input
= ip6_input
;
1231 rt
->dst
.input
= ip6_forward
;
1233 rt
->dst
.output
= ip6_output
;
1235 ipv6_addr_prefix(&rt
->rt6i_dst
.addr
, &cfg
->fc_dst
, cfg
->fc_dst_len
);
1236 rt
->rt6i_dst
.plen
= cfg
->fc_dst_len
;
1237 if (rt
->rt6i_dst
.plen
== 128)
1238 rt
->dst
.flags
= DST_HOST
;
1240 #ifdef CONFIG_IPV6_SUBTREES
1241 ipv6_addr_prefix(&rt
->rt6i_src
.addr
, &cfg
->fc_src
, cfg
->fc_src_len
);
1242 rt
->rt6i_src
.plen
= cfg
->fc_src_len
;
1245 rt
->rt6i_metric
= cfg
->fc_metric
;
1247 /* We cannot add true routes via loopback here,
1248 they would result in kernel looping; promote them to reject routes
1250 if ((cfg
->fc_flags
& RTF_REJECT
) ||
1251 (dev
&& (dev
->flags
&IFF_LOOPBACK
) && !(addr_type
&IPV6_ADDR_LOOPBACK
)
1252 && !(cfg
->fc_flags
&RTF_LOCAL
))) {
1253 /* hold loopback dev/idev if we haven't done so. */
1254 if (dev
!= net
->loopback_dev
) {
1259 dev
= net
->loopback_dev
;
1261 idev
= in6_dev_get(dev
);
1267 rt
->dst
.output
= ip6_pkt_discard_out
;
1268 rt
->dst
.input
= ip6_pkt_discard
;
1269 rt
->dst
.error
= -ENETUNREACH
;
1270 rt
->rt6i_flags
= RTF_REJECT
|RTF_NONEXTHOP
;
1274 if (cfg
->fc_flags
& RTF_GATEWAY
) {
1275 struct in6_addr
*gw_addr
;
1278 gw_addr
= &cfg
->fc_gateway
;
1279 ipv6_addr_copy(&rt
->rt6i_gateway
, gw_addr
);
1280 gwa_type
= ipv6_addr_type(gw_addr
);
1282 if (gwa_type
!= (IPV6_ADDR_LINKLOCAL
|IPV6_ADDR_UNICAST
)) {
1283 struct rt6_info
*grt
;
1285 /* IPv6 strictly inhibits using not link-local
1286 addresses as nexthop address.
1287 Otherwise, router will not able to send redirects.
1288 It is very good, but in some (rare!) circumstances
1289 (SIT, PtP, NBMA NOARP links) it is handy to allow
1290 some exceptions. --ANK
1293 if (!(gwa_type
&IPV6_ADDR_UNICAST
))
1296 grt
= rt6_lookup(net
, gw_addr
, NULL
, cfg
->fc_ifindex
, 1);
1298 err
= -EHOSTUNREACH
;
1302 if (dev
!= grt
->rt6i_dev
) {
1303 dst_release(&grt
->dst
);
1307 dev
= grt
->rt6i_dev
;
1308 idev
= grt
->rt6i_idev
;
1310 in6_dev_hold(grt
->rt6i_idev
);
1312 if (!(grt
->rt6i_flags
&RTF_GATEWAY
))
1314 dst_release(&grt
->dst
);
1320 if (dev
== NULL
|| (dev
->flags
&IFF_LOOPBACK
))
1328 if (cfg
->fc_flags
& (RTF_GATEWAY
| RTF_NONEXTHOP
)) {
1329 rt
->rt6i_nexthop
= __neigh_lookup_errno(&nd_tbl
, &rt
->rt6i_gateway
, dev
);
1330 if (IS_ERR(rt
->rt6i_nexthop
)) {
1331 err
= PTR_ERR(rt
->rt6i_nexthop
);
1332 rt
->rt6i_nexthop
= NULL
;
1337 rt
->rt6i_flags
= cfg
->fc_flags
;
1344 nla_for_each_attr(nla
, cfg
->fc_mx
, cfg
->fc_mx_len
, remaining
) {
1345 int type
= nla_type(nla
);
1348 if (type
> RTAX_MAX
) {
1353 dst_metric_set(&rt
->dst
, type
, nla_get_u32(nla
));
1359 rt
->rt6i_idev
= idev
;
1360 rt
->rt6i_table
= table
;
1362 cfg
->fc_nlinfo
.nl_net
= dev_net(dev
);
1364 return __ip6_ins_rt(rt
, &cfg
->fc_nlinfo
);
1376 static int __ip6_del_rt(struct rt6_info
*rt
, struct nl_info
*info
)
1379 struct fib6_table
*table
;
1380 struct net
*net
= dev_net(rt
->rt6i_dev
);
1382 if (rt
== net
->ipv6
.ip6_null_entry
)
1385 table
= rt
->rt6i_table
;
1386 write_lock_bh(&table
->tb6_lock
);
1388 err
= fib6_del(rt
, info
);
1389 dst_release(&rt
->dst
);
1391 write_unlock_bh(&table
->tb6_lock
);
1396 int ip6_del_rt(struct rt6_info
*rt
)
1398 struct nl_info info
= {
1399 .nl_net
= dev_net(rt
->rt6i_dev
),
1401 return __ip6_del_rt(rt
, &info
);
1404 static int ip6_route_del(struct fib6_config
*cfg
)
1406 struct fib6_table
*table
;
1407 struct fib6_node
*fn
;
1408 struct rt6_info
*rt
;
1411 table
= fib6_get_table(cfg
->fc_nlinfo
.nl_net
, cfg
->fc_table
);
1415 read_lock_bh(&table
->tb6_lock
);
1417 fn
= fib6_locate(&table
->tb6_root
,
1418 &cfg
->fc_dst
, cfg
->fc_dst_len
,
1419 &cfg
->fc_src
, cfg
->fc_src_len
);
1422 for (rt
= fn
->leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
1423 if (cfg
->fc_ifindex
&&
1424 (rt
->rt6i_dev
== NULL
||
1425 rt
->rt6i_dev
->ifindex
!= cfg
->fc_ifindex
))
1427 if (cfg
->fc_flags
& RTF_GATEWAY
&&
1428 !ipv6_addr_equal(&cfg
->fc_gateway
, &rt
->rt6i_gateway
))
1430 if (cfg
->fc_metric
&& cfg
->fc_metric
!= rt
->rt6i_metric
)
1433 read_unlock_bh(&table
->tb6_lock
);
1435 return __ip6_del_rt(rt
, &cfg
->fc_nlinfo
);
1438 read_unlock_bh(&table
->tb6_lock
);
1446 struct ip6rd_flowi
{
1448 struct in6_addr gateway
;
1451 static struct rt6_info
*__ip6_route_redirect(struct net
*net
,
1452 struct fib6_table
*table
,
1456 struct ip6rd_flowi
*rdfl
= (struct ip6rd_flowi
*)fl6
;
1457 struct rt6_info
*rt
;
1458 struct fib6_node
*fn
;
1461 * Get the "current" route for this destination and
1462 * check if the redirect has come from approriate router.
1464 * RFC 2461 specifies that redirects should only be
1465 * accepted if they come from the nexthop to the target.
1466 * Due to the way the routes are chosen, this notion
1467 * is a bit fuzzy and one might need to check all possible
1471 read_lock_bh(&table
->tb6_lock
);
1472 fn
= fib6_lookup(&table
->tb6_root
, &fl6
->daddr
, &fl6
->saddr
);
1474 for (rt
= fn
->leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
1476 * Current route is on-link; redirect is always invalid.
1478 * Seems, previous statement is not true. It could
1479 * be node, which looks for us as on-link (f.e. proxy ndisc)
1480 * But then router serving it might decide, that we should
1481 * know truth 8)8) --ANK (980726).
1483 if (rt6_check_expired(rt
))
1485 if (!(rt
->rt6i_flags
& RTF_GATEWAY
))
1487 if (fl6
->flowi6_oif
!= rt
->rt6i_dev
->ifindex
)
1489 if (!ipv6_addr_equal(&rdfl
->gateway
, &rt
->rt6i_gateway
))
1495 rt
= net
->ipv6
.ip6_null_entry
;
1496 BACKTRACK(net
, &fl6
->saddr
);
1500 read_unlock_bh(&table
->tb6_lock
);
1505 static struct rt6_info
*ip6_route_redirect(struct in6_addr
*dest
,
1506 struct in6_addr
*src
,
1507 struct in6_addr
*gateway
,
1508 struct net_device
*dev
)
1510 int flags
= RT6_LOOKUP_F_HAS_SADDR
;
1511 struct net
*net
= dev_net(dev
);
1512 struct ip6rd_flowi rdfl
= {
1514 .flowi6_oif
= dev
->ifindex
,
1520 ipv6_addr_copy(&rdfl
.gateway
, gateway
);
1522 if (rt6_need_strict(dest
))
1523 flags
|= RT6_LOOKUP_F_IFACE
;
1525 return (struct rt6_info
*)fib6_rule_lookup(net
, &rdfl
.fl6
,
1526 flags
, __ip6_route_redirect
);
1529 void rt6_redirect(struct in6_addr
*dest
, struct in6_addr
*src
,
1530 struct in6_addr
*saddr
,
1531 struct neighbour
*neigh
, u8
*lladdr
, int on_link
)
1533 struct rt6_info
*rt
, *nrt
= NULL
;
1534 struct netevent_redirect netevent
;
1535 struct net
*net
= dev_net(neigh
->dev
);
1537 rt
= ip6_route_redirect(dest
, src
, saddr
, neigh
->dev
);
1539 if (rt
== net
->ipv6
.ip6_null_entry
) {
1540 if (net_ratelimit())
1541 printk(KERN_DEBUG
"rt6_redirect: source isn't a valid nexthop "
1542 "for redirect target\n");
1547 * We have finally decided to accept it.
1550 neigh_update(neigh
, lladdr
, NUD_STALE
,
1551 NEIGH_UPDATE_F_WEAK_OVERRIDE
|
1552 NEIGH_UPDATE_F_OVERRIDE
|
1553 (on_link
? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER
|
1554 NEIGH_UPDATE_F_ISROUTER
))
1558 * Redirect received -> path was valid.
1559 * Look, redirects are sent only in response to data packets,
1560 * so that this nexthop apparently is reachable. --ANK
1562 dst_confirm(&rt
->dst
);
1564 /* Duplicate redirect: silently ignore. */
1565 if (neigh
== rt
->dst
.neighbour
)
1568 nrt
= ip6_rt_copy(rt
);
1572 nrt
->rt6i_flags
= RTF_GATEWAY
|RTF_UP
|RTF_DYNAMIC
|RTF_CACHE
;
1574 nrt
->rt6i_flags
&= ~RTF_GATEWAY
;
1576 ipv6_addr_copy(&nrt
->rt6i_dst
.addr
, dest
);
1577 nrt
->rt6i_dst
.plen
= 128;
1578 nrt
->dst
.flags
|= DST_HOST
;
1580 ipv6_addr_copy(&nrt
->rt6i_gateway
, (struct in6_addr
*)neigh
->primary_key
);
1581 nrt
->rt6i_nexthop
= neigh_clone(neigh
);
1583 if (ip6_ins_rt(nrt
))
1586 netevent
.old
= &rt
->dst
;
1587 netevent
.new = &nrt
->dst
;
1588 call_netevent_notifiers(NETEVENT_REDIRECT
, &netevent
);
1590 if (rt
->rt6i_flags
&RTF_CACHE
) {
1596 dst_release(&rt
->dst
);
1600 * Handle ICMP "packet too big" messages
1601 * i.e. Path MTU discovery
1604 static void rt6_do_pmtu_disc(struct in6_addr
*daddr
, struct in6_addr
*saddr
,
1605 struct net
*net
, u32 pmtu
, int ifindex
)
1607 struct rt6_info
*rt
, *nrt
;
1610 rt
= rt6_lookup(net
, daddr
, saddr
, ifindex
, 0);
1614 if (rt6_check_expired(rt
)) {
1619 if (pmtu
>= dst_mtu(&rt
->dst
))
1622 if (pmtu
< IPV6_MIN_MTU
) {
1624 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1625 * MTU (1280) and a fragment header should always be included
1626 * after a node receiving Too Big message reporting PMTU is
1627 * less than the IPv6 Minimum Link MTU.
1629 pmtu
= IPV6_MIN_MTU
;
1633 /* New mtu received -> path was valid.
1634 They are sent only in response to data packets,
1635 so that this nexthop apparently is reachable. --ANK
1637 dst_confirm(&rt
->dst
);
1639 /* Host route. If it is static, it would be better
1640 not to override it, but add new one, so that
1641 when cache entry will expire old pmtu
1642 would return automatically.
1644 if (rt
->rt6i_flags
& RTF_CACHE
) {
1645 dst_metric_set(&rt
->dst
, RTAX_MTU
, pmtu
);
1647 u32 features
= dst_metric(&rt
->dst
, RTAX_FEATURES
);
1648 features
|= RTAX_FEATURE_ALLFRAG
;
1649 dst_metric_set(&rt
->dst
, RTAX_FEATURES
, features
);
1651 dst_set_expires(&rt
->dst
, net
->ipv6
.sysctl
.ip6_rt_mtu_expires
);
1652 rt
->rt6i_flags
|= RTF_MODIFIED
|RTF_EXPIRES
;
1657 Two cases are possible:
1658 1. It is connected route. Action: COW
1659 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1661 if (!rt
->rt6i_nexthop
&& !(rt
->rt6i_flags
& RTF_NONEXTHOP
))
1662 nrt
= rt6_alloc_cow(rt
, daddr
, saddr
);
1664 nrt
= rt6_alloc_clone(rt
, daddr
);
1667 dst_metric_set(&nrt
->dst
, RTAX_MTU
, pmtu
);
1669 u32 features
= dst_metric(&nrt
->dst
, RTAX_FEATURES
);
1670 features
|= RTAX_FEATURE_ALLFRAG
;
1671 dst_metric_set(&nrt
->dst
, RTAX_FEATURES
, features
);
1674 /* According to RFC 1981, detecting PMTU increase shouldn't be
1675 * happened within 5 mins, the recommended timer is 10 mins.
1676 * Here this route expiration time is set to ip6_rt_mtu_expires
1677 * which is 10 mins. After 10 mins the decreased pmtu is expired
1678 * and detecting PMTU increase will be automatically happened.
1680 dst_set_expires(&nrt
->dst
, net
->ipv6
.sysctl
.ip6_rt_mtu_expires
);
1681 nrt
->rt6i_flags
|= RTF_DYNAMIC
|RTF_EXPIRES
;
1686 dst_release(&rt
->dst
);
1689 void rt6_pmtu_discovery(struct in6_addr
*daddr
, struct in6_addr
*saddr
,
1690 struct net_device
*dev
, u32 pmtu
)
1692 struct net
*net
= dev_net(dev
);
1695 * RFC 1981 states that a node "MUST reduce the size of the packets it
1696 * is sending along the path" that caused the Packet Too Big message.
1697 * Since it's not possible in the general case to determine which
1698 * interface was used to send the original packet, we update the MTU
1699 * on the interface that will be used to send future packets. We also
1700 * update the MTU on the interface that received the Packet Too Big in
1701 * case the original packet was forced out that interface with
1702 * SO_BINDTODEVICE or similar. This is the next best thing to the
1703 * correct behaviour, which would be to update the MTU on all
1706 rt6_do_pmtu_disc(daddr
, saddr
, net
, pmtu
, 0);
1707 rt6_do_pmtu_disc(daddr
, saddr
, net
, pmtu
, dev
->ifindex
);
1711 * Misc support functions
1714 static struct rt6_info
* ip6_rt_copy(struct rt6_info
*ort
)
1716 struct net
*net
= dev_net(ort
->rt6i_dev
);
1717 struct rt6_info
*rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
1720 rt
->dst
.input
= ort
->dst
.input
;
1721 rt
->dst
.output
= ort
->dst
.output
;
1723 dst_copy_metrics(&rt
->dst
, &ort
->dst
);
1724 rt
->dst
.error
= ort
->dst
.error
;
1725 rt
->dst
.dev
= ort
->dst
.dev
;
1727 dev_hold(rt
->dst
.dev
);
1728 rt
->rt6i_idev
= ort
->rt6i_idev
;
1730 in6_dev_hold(rt
->rt6i_idev
);
1731 rt
->dst
.lastuse
= jiffies
;
1732 rt
->rt6i_expires
= 0;
1734 ipv6_addr_copy(&rt
->rt6i_gateway
, &ort
->rt6i_gateway
);
1735 rt
->rt6i_flags
= ort
->rt6i_flags
& ~RTF_EXPIRES
;
1736 rt
->rt6i_metric
= 0;
1738 memcpy(&rt
->rt6i_dst
, &ort
->rt6i_dst
, sizeof(struct rt6key
));
1739 #ifdef CONFIG_IPV6_SUBTREES
1740 memcpy(&rt
->rt6i_src
, &ort
->rt6i_src
, sizeof(struct rt6key
));
1742 rt
->rt6i_table
= ort
->rt6i_table
;
1747 #ifdef CONFIG_IPV6_ROUTE_INFO
1748 static struct rt6_info
*rt6_get_route_info(struct net
*net
,
1749 struct in6_addr
*prefix
, int prefixlen
,
1750 struct in6_addr
*gwaddr
, int ifindex
)
1752 struct fib6_node
*fn
;
1753 struct rt6_info
*rt
= NULL
;
1754 struct fib6_table
*table
;
1756 table
= fib6_get_table(net
, RT6_TABLE_INFO
);
1760 write_lock_bh(&table
->tb6_lock
);
1761 fn
= fib6_locate(&table
->tb6_root
, prefix
,prefixlen
, NULL
, 0);
1765 for (rt
= fn
->leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
1766 if (rt
->rt6i_dev
->ifindex
!= ifindex
)
1768 if ((rt
->rt6i_flags
& (RTF_ROUTEINFO
|RTF_GATEWAY
)) != (RTF_ROUTEINFO
|RTF_GATEWAY
))
1770 if (!ipv6_addr_equal(&rt
->rt6i_gateway
, gwaddr
))
1776 write_unlock_bh(&table
->tb6_lock
);
1780 static struct rt6_info
*rt6_add_route_info(struct net
*net
,
1781 struct in6_addr
*prefix
, int prefixlen
,
1782 struct in6_addr
*gwaddr
, int ifindex
,
1785 struct fib6_config cfg
= {
1786 .fc_table
= RT6_TABLE_INFO
,
1787 .fc_metric
= IP6_RT_PRIO_USER
,
1788 .fc_ifindex
= ifindex
,
1789 .fc_dst_len
= prefixlen
,
1790 .fc_flags
= RTF_GATEWAY
| RTF_ADDRCONF
| RTF_ROUTEINFO
|
1791 RTF_UP
| RTF_PREF(pref
),
1793 .fc_nlinfo
.nlh
= NULL
,
1794 .fc_nlinfo
.nl_net
= net
,
1797 ipv6_addr_copy(&cfg
.fc_dst
, prefix
);
1798 ipv6_addr_copy(&cfg
.fc_gateway
, gwaddr
);
1800 /* We should treat it as a default route if prefix length is 0. */
1802 cfg
.fc_flags
|= RTF_DEFAULT
;
1804 ip6_route_add(&cfg
);
1806 return rt6_get_route_info(net
, prefix
, prefixlen
, gwaddr
, ifindex
);
1810 struct rt6_info
*rt6_get_dflt_router(struct in6_addr
*addr
, struct net_device
*dev
)
1812 struct rt6_info
*rt
;
1813 struct fib6_table
*table
;
1815 table
= fib6_get_table(dev_net(dev
), RT6_TABLE_DFLT
);
1819 write_lock_bh(&table
->tb6_lock
);
1820 for (rt
= table
->tb6_root
.leaf
; rt
; rt
=rt
->dst
.rt6_next
) {
1821 if (dev
== rt
->rt6i_dev
&&
1822 ((rt
->rt6i_flags
& (RTF_ADDRCONF
| RTF_DEFAULT
)) == (RTF_ADDRCONF
| RTF_DEFAULT
)) &&
1823 ipv6_addr_equal(&rt
->rt6i_gateway
, addr
))
1828 write_unlock_bh(&table
->tb6_lock
);
1832 struct rt6_info
*rt6_add_dflt_router(struct in6_addr
*gwaddr
,
1833 struct net_device
*dev
,
1836 struct fib6_config cfg
= {
1837 .fc_table
= RT6_TABLE_DFLT
,
1838 .fc_metric
= IP6_RT_PRIO_USER
,
1839 .fc_ifindex
= dev
->ifindex
,
1840 .fc_flags
= RTF_GATEWAY
| RTF_ADDRCONF
| RTF_DEFAULT
|
1841 RTF_UP
| RTF_EXPIRES
| RTF_PREF(pref
),
1843 .fc_nlinfo
.nlh
= NULL
,
1844 .fc_nlinfo
.nl_net
= dev_net(dev
),
1847 ipv6_addr_copy(&cfg
.fc_gateway
, gwaddr
);
1849 ip6_route_add(&cfg
);
1851 return rt6_get_dflt_router(gwaddr
, dev
);
1854 void rt6_purge_dflt_routers(struct net
*net
)
1856 struct rt6_info
*rt
;
1857 struct fib6_table
*table
;
1859 /* NOTE: Keep consistent with rt6_get_dflt_router */
1860 table
= fib6_get_table(net
, RT6_TABLE_DFLT
);
1865 read_lock_bh(&table
->tb6_lock
);
1866 for (rt
= table
->tb6_root
.leaf
; rt
; rt
= rt
->dst
.rt6_next
) {
1867 if (rt
->rt6i_flags
& (RTF_DEFAULT
| RTF_ADDRCONF
)) {
1869 read_unlock_bh(&table
->tb6_lock
);
1874 read_unlock_bh(&table
->tb6_lock
);
1877 static void rtmsg_to_fib6_config(struct net
*net
,
1878 struct in6_rtmsg
*rtmsg
,
1879 struct fib6_config
*cfg
)
1881 memset(cfg
, 0, sizeof(*cfg
));
1883 cfg
->fc_table
= RT6_TABLE_MAIN
;
1884 cfg
->fc_ifindex
= rtmsg
->rtmsg_ifindex
;
1885 cfg
->fc_metric
= rtmsg
->rtmsg_metric
;
1886 cfg
->fc_expires
= rtmsg
->rtmsg_info
;
1887 cfg
->fc_dst_len
= rtmsg
->rtmsg_dst_len
;
1888 cfg
->fc_src_len
= rtmsg
->rtmsg_src_len
;
1889 cfg
->fc_flags
= rtmsg
->rtmsg_flags
;
1891 cfg
->fc_nlinfo
.nl_net
= net
;
1893 ipv6_addr_copy(&cfg
->fc_dst
, &rtmsg
->rtmsg_dst
);
1894 ipv6_addr_copy(&cfg
->fc_src
, &rtmsg
->rtmsg_src
);
1895 ipv6_addr_copy(&cfg
->fc_gateway
, &rtmsg
->rtmsg_gateway
);
1898 int ipv6_route_ioctl(struct net
*net
, unsigned int cmd
, void __user
*arg
)
1900 struct fib6_config cfg
;
1901 struct in6_rtmsg rtmsg
;
1905 case SIOCADDRT
: /* Add a route */
1906 case SIOCDELRT
: /* Delete a route */
1907 if (!capable(CAP_NET_ADMIN
))
1909 err
= copy_from_user(&rtmsg
, arg
,
1910 sizeof(struct in6_rtmsg
));
1914 rtmsg_to_fib6_config(net
, &rtmsg
, &cfg
);
1919 err
= ip6_route_add(&cfg
);
1922 err
= ip6_route_del(&cfg
);
1936 * Drop the packet on the floor
1939 static int ip6_pkt_drop(struct sk_buff
*skb
, u8 code
, int ipstats_mib_noroutes
)
1942 struct dst_entry
*dst
= skb_dst(skb
);
1943 switch (ipstats_mib_noroutes
) {
1944 case IPSTATS_MIB_INNOROUTES
:
1945 type
= ipv6_addr_type(&ipv6_hdr(skb
)->daddr
);
1946 if (type
== IPV6_ADDR_ANY
) {
1947 IP6_INC_STATS(dev_net(dst
->dev
), ip6_dst_idev(dst
),
1948 IPSTATS_MIB_INADDRERRORS
);
1952 case IPSTATS_MIB_OUTNOROUTES
:
1953 IP6_INC_STATS(dev_net(dst
->dev
), ip6_dst_idev(dst
),
1954 ipstats_mib_noroutes
);
1957 icmpv6_send(skb
, ICMPV6_DEST_UNREACH
, code
, 0);
1962 static int ip6_pkt_discard(struct sk_buff
*skb
)
1964 return ip6_pkt_drop(skb
, ICMPV6_NOROUTE
, IPSTATS_MIB_INNOROUTES
);
1967 static int ip6_pkt_discard_out(struct sk_buff
*skb
)
1969 skb
->dev
= skb_dst(skb
)->dev
;
1970 return ip6_pkt_drop(skb
, ICMPV6_NOROUTE
, IPSTATS_MIB_OUTNOROUTES
);
1973 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1975 static int ip6_pkt_prohibit(struct sk_buff
*skb
)
1977 return ip6_pkt_drop(skb
, ICMPV6_ADM_PROHIBITED
, IPSTATS_MIB_INNOROUTES
);
1980 static int ip6_pkt_prohibit_out(struct sk_buff
*skb
)
1982 skb
->dev
= skb_dst(skb
)->dev
;
1983 return ip6_pkt_drop(skb
, ICMPV6_ADM_PROHIBITED
, IPSTATS_MIB_OUTNOROUTES
);
1989 * Allocate a dst for local (unicast / anycast) address.
1992 struct rt6_info
*addrconf_dst_alloc(struct inet6_dev
*idev
,
1993 const struct in6_addr
*addr
,
1996 struct net
*net
= dev_net(idev
->dev
);
1997 struct rt6_info
*rt
= ip6_dst_alloc(&net
->ipv6
.ip6_dst_ops
);
1998 struct neighbour
*neigh
;
2001 if (net_ratelimit())
2002 pr_warning("IPv6: Maximum number of routes reached,"
2003 " consider increasing route/max_size.\n");
2004 return ERR_PTR(-ENOMEM
);
2007 dev_hold(net
->loopback_dev
);
2010 rt
->dst
.flags
= DST_HOST
;
2011 rt
->dst
.input
= ip6_input
;
2012 rt
->dst
.output
= ip6_output
;
2013 rt
->rt6i_dev
= net
->loopback_dev
;
2014 rt
->rt6i_idev
= idev
;
2015 rt
->dst
.obsolete
= -1;
2017 rt
->rt6i_flags
= RTF_UP
| RTF_NONEXTHOP
;
2019 rt
->rt6i_flags
|= RTF_ANYCAST
;
2021 rt
->rt6i_flags
|= RTF_LOCAL
;
2022 neigh
= ndisc_get_neigh(rt
->rt6i_dev
, &rt
->rt6i_gateway
);
2023 if (IS_ERR(neigh
)) {
2026 return ERR_CAST(neigh
);
2028 rt
->rt6i_nexthop
= neigh
;
2030 ipv6_addr_copy(&rt
->rt6i_dst
.addr
, addr
);
2031 rt
->rt6i_dst
.plen
= 128;
2032 rt
->rt6i_table
= fib6_get_table(net
, RT6_TABLE_LOCAL
);
2034 atomic_set(&rt
->dst
.__refcnt
, 1);
/*
 * Argument bundle for fib6_ifdown(). NOTE(review): the net member line
 * was dropped by the listing extraction and is restored here
 * (fib6_ifdown dereferences adn->net below).
 */
struct arg_dev_net {
	struct net_device *dev;
	struct net *net;
};
2044 static int fib6_ifdown(struct rt6_info
*rt
, void *arg
)
2046 const struct arg_dev_net
*adn
= arg
;
2047 const struct net_device
*dev
= adn
->dev
;
2049 if ((rt
->rt6i_dev
== dev
|| dev
== NULL
) &&
2050 rt
!= adn
->net
->ipv6
.ip6_null_entry
) {
2051 RT6_TRACE("deleted by ifdown %p\n", rt
);
2057 void rt6_ifdown(struct net
*net
, struct net_device
*dev
)
2059 struct arg_dev_net adn
= {
2064 fib6_clean_all(net
, fib6_ifdown
, 0, &adn
);
2065 icmp6_clean_all(fib6_ifdown
, &adn
);
/*
 * Argument bundle for rt6_mtu_change_route(). NOTE(review): the mtu
 * member line was dropped by the listing extraction and is restored
 * here (rt6_mtu_change_route reads arg->mtu below).
 */
struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned mtu;
};
2074 static int rt6_mtu_change_route(struct rt6_info
*rt
, void *p_arg
)
2076 struct rt6_mtu_change_arg
*arg
= (struct rt6_mtu_change_arg
*) p_arg
;
2077 struct inet6_dev
*idev
;
2079 /* In IPv6 pmtu discovery is not optional,
2080 so that RTAX_MTU lock cannot disable it.
2081 We still use this lock to block changes
2082 caused by addrconf/ndisc.
2085 idev
= __in6_dev_get(arg
->dev
);
2089 /* For administrative MTU increase, there is no way to discover
2090 IPv6 PMTU increase, so PMTU increase should be updated here.
2091 Since RFC 1981 doesn't include administrative MTU increase
2092 update PMTU increase is a MUST. (i.e. jumbo frame)
2095 If new MTU is less than route PMTU, this new MTU will be the
2096 lowest MTU in the path, update the route PMTU to reflect PMTU
2097 decreases; if new MTU is greater than route PMTU, and the
2098 old MTU is the lowest MTU in the path, update the route PMTU
2099 to reflect the increase. In this case if the other nodes' MTU
2100 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2103 if (rt
->rt6i_dev
== arg
->dev
&&
2104 !dst_metric_locked(&rt
->dst
, RTAX_MTU
) &&
2105 (dst_mtu(&rt
->dst
) >= arg
->mtu
||
2106 (dst_mtu(&rt
->dst
) < arg
->mtu
&&
2107 dst_mtu(&rt
->dst
) == idev
->cnf
.mtu6
))) {
2108 dst_metric_set(&rt
->dst
, RTAX_MTU
, arg
->mtu
);
2113 void rt6_mtu_change(struct net_device
*dev
, unsigned mtu
)
2115 struct rt6_mtu_change_arg arg
= {
2120 fib6_clean_all(dev_net(dev
), rt6_mtu_change_route
, 0, &arg
);
2123 static const struct nla_policy rtm_ipv6_policy
[RTA_MAX
+1] = {
2124 [RTA_GATEWAY
] = { .len
= sizeof(struct in6_addr
) },
2125 [RTA_OIF
] = { .type
= NLA_U32
},
2126 [RTA_IIF
] = { .type
= NLA_U32
},
2127 [RTA_PRIORITY
] = { .type
= NLA_U32
},
2128 [RTA_METRICS
] = { .type
= NLA_NESTED
},
2131 static int rtm_to_fib6_config(struct sk_buff
*skb
, struct nlmsghdr
*nlh
,
2132 struct fib6_config
*cfg
)
2135 struct nlattr
*tb
[RTA_MAX
+1];
2138 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv6_policy
);
2143 rtm
= nlmsg_data(nlh
);
2144 memset(cfg
, 0, sizeof(*cfg
));
2146 cfg
->fc_table
= rtm
->rtm_table
;
2147 cfg
->fc_dst_len
= rtm
->rtm_dst_len
;
2148 cfg
->fc_src_len
= rtm
->rtm_src_len
;
2149 cfg
->fc_flags
= RTF_UP
;
2150 cfg
->fc_protocol
= rtm
->rtm_protocol
;
2152 if (rtm
->rtm_type
== RTN_UNREACHABLE
)
2153 cfg
->fc_flags
|= RTF_REJECT
;
2155 if (rtm
->rtm_type
== RTN_LOCAL
)
2156 cfg
->fc_flags
|= RTF_LOCAL
;
2158 cfg
->fc_nlinfo
.pid
= NETLINK_CB(skb
).pid
;
2159 cfg
->fc_nlinfo
.nlh
= nlh
;
2160 cfg
->fc_nlinfo
.nl_net
= sock_net(skb
->sk
);
2162 if (tb
[RTA_GATEWAY
]) {
2163 nla_memcpy(&cfg
->fc_gateway
, tb
[RTA_GATEWAY
], 16);
2164 cfg
->fc_flags
|= RTF_GATEWAY
;
2168 int plen
= (rtm
->rtm_dst_len
+ 7) >> 3;
2170 if (nla_len(tb
[RTA_DST
]) < plen
)
2173 nla_memcpy(&cfg
->fc_dst
, tb
[RTA_DST
], plen
);
2177 int plen
= (rtm
->rtm_src_len
+ 7) >> 3;
2179 if (nla_len(tb
[RTA_SRC
]) < plen
)
2182 nla_memcpy(&cfg
->fc_src
, tb
[RTA_SRC
], plen
);
2186 cfg
->fc_ifindex
= nla_get_u32(tb
[RTA_OIF
]);
2188 if (tb
[RTA_PRIORITY
])
2189 cfg
->fc_metric
= nla_get_u32(tb
[RTA_PRIORITY
]);
2191 if (tb
[RTA_METRICS
]) {
2192 cfg
->fc_mx
= nla_data(tb
[RTA_METRICS
]);
2193 cfg
->fc_mx_len
= nla_len(tb
[RTA_METRICS
]);
2197 cfg
->fc_table
= nla_get_u32(tb
[RTA_TABLE
]);
2204 static int inet6_rtm_delroute(struct sk_buff
*skb
, struct nlmsghdr
* nlh
, void *arg
)
2206 struct fib6_config cfg
;
2209 err
= rtm_to_fib6_config(skb
, nlh
, &cfg
);
2213 return ip6_route_del(&cfg
);
2216 static int inet6_rtm_newroute(struct sk_buff
*skb
, struct nlmsghdr
* nlh
, void *arg
)
2218 struct fib6_config cfg
;
2221 err
= rtm_to_fib6_config(skb
, nlh
, &cfg
);
2225 return ip6_route_add(&cfg
);
2228 static inline size_t rt6_nlmsg_size(void)
2230 return NLMSG_ALIGN(sizeof(struct rtmsg
))
2231 + nla_total_size(16) /* RTA_SRC */
2232 + nla_total_size(16) /* RTA_DST */
2233 + nla_total_size(16) /* RTA_GATEWAY */
2234 + nla_total_size(16) /* RTA_PREFSRC */
2235 + nla_total_size(4) /* RTA_TABLE */
2236 + nla_total_size(4) /* RTA_IIF */
2237 + nla_total_size(4) /* RTA_OIF */
2238 + nla_total_size(4) /* RTA_PRIORITY */
2239 + RTAX_MAX
* nla_total_size(4) /* RTA_METRICS */
2240 + nla_total_size(sizeof(struct rta_cacheinfo
));
2243 static int rt6_fill_node(struct net
*net
,
2244 struct sk_buff
*skb
, struct rt6_info
*rt
,
2245 struct in6_addr
*dst
, struct in6_addr
*src
,
2246 int iif
, int type
, u32 pid
, u32 seq
,
2247 int prefix
, int nowait
, unsigned int flags
)
2250 struct nlmsghdr
*nlh
;
2254 if (prefix
) { /* user wants prefix routes only */
2255 if (!(rt
->rt6i_flags
& RTF_PREFIX_RT
)) {
2256 /* success since this is not a prefix route */
2261 nlh
= nlmsg_put(skb
, pid
, seq
, type
, sizeof(*rtm
), flags
);
2265 rtm
= nlmsg_data(nlh
);
2266 rtm
->rtm_family
= AF_INET6
;
2267 rtm
->rtm_dst_len
= rt
->rt6i_dst
.plen
;
2268 rtm
->rtm_src_len
= rt
->rt6i_src
.plen
;
2271 table
= rt
->rt6i_table
->tb6_id
;
2273 table
= RT6_TABLE_UNSPEC
;
2274 rtm
->rtm_table
= table
;
2275 NLA_PUT_U32(skb
, RTA_TABLE
, table
);
2276 if (rt
->rt6i_flags
&RTF_REJECT
)
2277 rtm
->rtm_type
= RTN_UNREACHABLE
;
2278 else if (rt
->rt6i_flags
&RTF_LOCAL
)
2279 rtm
->rtm_type
= RTN_LOCAL
;
2280 else if (rt
->rt6i_dev
&& (rt
->rt6i_dev
->flags
&IFF_LOOPBACK
))
2281 rtm
->rtm_type
= RTN_LOCAL
;
2283 rtm
->rtm_type
= RTN_UNICAST
;
2285 rtm
->rtm_scope
= RT_SCOPE_UNIVERSE
;
2286 rtm
->rtm_protocol
= rt
->rt6i_protocol
;
2287 if (rt
->rt6i_flags
&RTF_DYNAMIC
)
2288 rtm
->rtm_protocol
= RTPROT_REDIRECT
;
2289 else if (rt
->rt6i_flags
& RTF_ADDRCONF
)
2290 rtm
->rtm_protocol
= RTPROT_KERNEL
;
2291 else if (rt
->rt6i_flags
&RTF_DEFAULT
)
2292 rtm
->rtm_protocol
= RTPROT_RA
;
2294 if (rt
->rt6i_flags
&RTF_CACHE
)
2295 rtm
->rtm_flags
|= RTM_F_CLONED
;
2298 NLA_PUT(skb
, RTA_DST
, 16, dst
);
2299 rtm
->rtm_dst_len
= 128;
2300 } else if (rtm
->rtm_dst_len
)
2301 NLA_PUT(skb
, RTA_DST
, 16, &rt
->rt6i_dst
.addr
);
2302 #ifdef CONFIG_IPV6_SUBTREES
2304 NLA_PUT(skb
, RTA_SRC
, 16, src
);
2305 rtm
->rtm_src_len
= 128;
2306 } else if (rtm
->rtm_src_len
)
2307 NLA_PUT(skb
, RTA_SRC
, 16, &rt
->rt6i_src
.addr
);
2310 #ifdef CONFIG_IPV6_MROUTE
2311 if (ipv6_addr_is_multicast(&rt
->rt6i_dst
.addr
)) {
2312 int err
= ip6mr_get_route(net
, skb
, rtm
, nowait
);
2317 goto nla_put_failure
;
2319 if (err
== -EMSGSIZE
)
2320 goto nla_put_failure
;
2325 NLA_PUT_U32(skb
, RTA_IIF
, iif
);
2327 struct inet6_dev
*idev
= ip6_dst_idev(&rt
->dst
);
2328 struct in6_addr saddr_buf
;
2329 if (ipv6_dev_get_saddr(net
, idev
? idev
->dev
: NULL
,
2330 dst
, 0, &saddr_buf
) == 0)
2331 NLA_PUT(skb
, RTA_PREFSRC
, 16, &saddr_buf
);
2334 if (rtnetlink_put_metrics(skb
, dst_metrics_ptr(&rt
->dst
)) < 0)
2335 goto nla_put_failure
;
2337 if (rt
->dst
.neighbour
)
2338 NLA_PUT(skb
, RTA_GATEWAY
, 16, &rt
->dst
.neighbour
->primary_key
);
2341 NLA_PUT_U32(skb
, RTA_OIF
, rt
->rt6i_dev
->ifindex
);
2343 NLA_PUT_U32(skb
, RTA_PRIORITY
, rt
->rt6i_metric
);
2345 if (!(rt
->rt6i_flags
& RTF_EXPIRES
))
2347 else if (rt
->rt6i_expires
- jiffies
< INT_MAX
)
2348 expires
= rt
->rt6i_expires
- jiffies
;
2352 if (rtnl_put_cacheinfo(skb
, &rt
->dst
, 0, 0, 0,
2353 expires
, rt
->dst
.error
) < 0)
2354 goto nla_put_failure
;
2356 return nlmsg_end(skb
, nlh
);
2359 nlmsg_cancel(skb
, nlh
);
2363 int rt6_dump_route(struct rt6_info
*rt
, void *p_arg
)
2365 struct rt6_rtnl_dump_arg
*arg
= (struct rt6_rtnl_dump_arg
*) p_arg
;
2368 if (nlmsg_len(arg
->cb
->nlh
) >= sizeof(struct rtmsg
)) {
2369 struct rtmsg
*rtm
= nlmsg_data(arg
->cb
->nlh
);
2370 prefix
= (rtm
->rtm_flags
& RTM_F_PREFIX
) != 0;
2374 return rt6_fill_node(arg
->net
,
2375 arg
->skb
, rt
, NULL
, NULL
, 0, RTM_NEWROUTE
,
2376 NETLINK_CB(arg
->cb
->skb
).pid
, arg
->cb
->nlh
->nlmsg_seq
,
2377 prefix
, 0, NLM_F_MULTI
);
2380 static int inet6_rtm_getroute(struct sk_buff
*in_skb
, struct nlmsghdr
* nlh
, void *arg
)
2382 struct net
*net
= sock_net(in_skb
->sk
);
2383 struct nlattr
*tb
[RTA_MAX
+1];
2384 struct rt6_info
*rt
;
2385 struct sk_buff
*skb
;
2390 err
= nlmsg_parse(nlh
, sizeof(*rtm
), tb
, RTA_MAX
, rtm_ipv6_policy
);
2395 memset(&fl6
, 0, sizeof(fl6
));
2398 if (nla_len(tb
[RTA_SRC
]) < sizeof(struct in6_addr
))
2401 ipv6_addr_copy(&fl6
.saddr
, nla_data(tb
[RTA_SRC
]));
2405 if (nla_len(tb
[RTA_DST
]) < sizeof(struct in6_addr
))
2408 ipv6_addr_copy(&fl6
.daddr
, nla_data(tb
[RTA_DST
]));
2412 iif
= nla_get_u32(tb
[RTA_IIF
]);
2415 fl6
.flowi6_oif
= nla_get_u32(tb
[RTA_OIF
]);
2418 struct net_device
*dev
;
2419 dev
= __dev_get_by_index(net
, iif
);
2426 skb
= alloc_skb(NLMSG_GOODSIZE
, GFP_KERNEL
);
2432 /* Reserve room for dummy headers, this skb can pass
2433 through good chunk of routing engine.
2435 skb_reset_mac_header(skb
);
2436 skb_reserve(skb
, MAX_HEADER
+ sizeof(struct ipv6hdr
));
2438 rt
= (struct rt6_info
*) ip6_route_output(net
, NULL
, &fl6
);
2439 skb_dst_set(skb
, &rt
->dst
);
2441 err
= rt6_fill_node(net
, skb
, rt
, &fl6
.daddr
, &fl6
.saddr
, iif
,
2442 RTM_NEWROUTE
, NETLINK_CB(in_skb
).pid
,
2443 nlh
->nlmsg_seq
, 0, 0, 0);
2449 err
= rtnl_unicast(skb
, net
, NETLINK_CB(in_skb
).pid
);
2454 void inet6_rt_notify(int event
, struct rt6_info
*rt
, struct nl_info
*info
)
2456 struct sk_buff
*skb
;
2457 struct net
*net
= info
->nl_net
;
2462 seq
= info
->nlh
!= NULL
? info
->nlh
->nlmsg_seq
: 0;
2464 skb
= nlmsg_new(rt6_nlmsg_size(), gfp_any());
2468 err
= rt6_fill_node(net
, skb
, rt
, NULL
, NULL
, 0,
2469 event
, info
->pid
, seq
, 0, 0, 0);
2471 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2472 WARN_ON(err
== -EMSGSIZE
);
2476 rtnl_notify(skb
, net
, info
->pid
, RTNLGRP_IPV6_ROUTE
,
2477 info
->nlh
, gfp_any());
2481 rtnl_set_sk_err(net
, RTNLGRP_IPV6_ROUTE
, err
);
2484 static int ip6_route_dev_notify(struct notifier_block
*this,
2485 unsigned long event
, void *data
)
2487 struct net_device
*dev
= (struct net_device
*)data
;
2488 struct net
*net
= dev_net(dev
);
2490 if (event
== NETDEV_REGISTER
&& (dev
->flags
& IFF_LOOPBACK
)) {
2491 net
->ipv6
.ip6_null_entry
->dst
.dev
= dev
;
2492 net
->ipv6
.ip6_null_entry
->rt6i_idev
= in6_dev_get(dev
);
2493 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2494 net
->ipv6
.ip6_prohibit_entry
->dst
.dev
= dev
;
2495 net
->ipv6
.ip6_prohibit_entry
->rt6i_idev
= in6_dev_get(dev
);
2496 net
->ipv6
.ip6_blk_hole_entry
->dst
.dev
= dev
;
2497 net
->ipv6
.ip6_blk_hole_entry
->rt6i_idev
= in6_dev_get(dev
);
2508 #ifdef CONFIG_PROC_FS
2519 static int rt6_info_route(struct rt6_info
*rt
, void *p_arg
)
2521 struct seq_file
*m
= p_arg
;
2523 seq_printf(m
, "%pi6 %02x ", &rt
->rt6i_dst
.addr
, rt
->rt6i_dst
.plen
);
2525 #ifdef CONFIG_IPV6_SUBTREES
2526 seq_printf(m
, "%pi6 %02x ", &rt
->rt6i_src
.addr
, rt
->rt6i_src
.plen
);
2528 seq_puts(m
, "00000000000000000000000000000000 00 ");
2531 if (rt
->rt6i_nexthop
) {
2532 seq_printf(m
, "%pi6", rt
->rt6i_nexthop
->primary_key
);
2534 seq_puts(m
, "00000000000000000000000000000000");
2536 seq_printf(m
, " %08x %08x %08x %08x %8s\n",
2537 rt
->rt6i_metric
, atomic_read(&rt
->dst
.__refcnt
),
2538 rt
->dst
.__use
, rt
->rt6i_flags
,
2539 rt
->rt6i_dev
? rt
->rt6i_dev
->name
: "");
2543 static int ipv6_route_show(struct seq_file
*m
, void *v
)
2545 struct net
*net
= (struct net
*)m
->private;
2546 fib6_clean_all(net
, rt6_info_route
, 0, m
);
2550 static int ipv6_route_open(struct inode
*inode
, struct file
*file
)
2552 return single_open_net(inode
, file
, ipv6_route_show
);
2555 static const struct file_operations ipv6_route_proc_fops
= {
2556 .owner
= THIS_MODULE
,
2557 .open
= ipv6_route_open
,
2559 .llseek
= seq_lseek
,
2560 .release
= single_release_net
,
2563 static int rt6_stats_seq_show(struct seq_file
*seq
, void *v
)
2565 struct net
*net
= (struct net
*)seq
->private;
2566 seq_printf(seq
, "%04x %04x %04x %04x %04x %04x %04x\n",
2567 net
->ipv6
.rt6_stats
->fib_nodes
,
2568 net
->ipv6
.rt6_stats
->fib_route_nodes
,
2569 net
->ipv6
.rt6_stats
->fib_rt_alloc
,
2570 net
->ipv6
.rt6_stats
->fib_rt_entries
,
2571 net
->ipv6
.rt6_stats
->fib_rt_cache
,
2572 dst_entries_get_slow(&net
->ipv6
.ip6_dst_ops
),
2573 net
->ipv6
.rt6_stats
->fib_discarded_routes
);
2578 static int rt6_stats_seq_open(struct inode
*inode
, struct file
*file
)
2580 return single_open_net(inode
, file
, rt6_stats_seq_show
);
2583 static const struct file_operations rt6_stats_seq_fops
= {
2584 .owner
= THIS_MODULE
,
2585 .open
= rt6_stats_seq_open
,
2587 .llseek
= seq_lseek
,
2588 .release
= single_release_net
,
2590 #endif /* CONFIG_PROC_FS */
2592 #ifdef CONFIG_SYSCTL
2595 int ipv6_sysctl_rtcache_flush(ctl_table
*ctl
, int write
,
2596 void __user
*buffer
, size_t *lenp
, loff_t
*ppos
)
2603 net
= (struct net
*)ctl
->extra1
;
2604 delay
= net
->ipv6
.sysctl
.flush_delay
;
2605 proc_dointvec(ctl
, write
, buffer
, lenp
, ppos
);
2606 fib6_run_gc(delay
<= 0 ? ~0UL : (unsigned long)delay
, net
);
2610 ctl_table ipv6_route_table_template
[] = {
2612 .procname
= "flush",
2613 .data
= &init_net
.ipv6
.sysctl
.flush_delay
,
2614 .maxlen
= sizeof(int),
2616 .proc_handler
= ipv6_sysctl_rtcache_flush
2619 .procname
= "gc_thresh",
2620 .data
= &ip6_dst_ops_template
.gc_thresh
,
2621 .maxlen
= sizeof(int),
2623 .proc_handler
= proc_dointvec
,
2626 .procname
= "max_size",
2627 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_max_size
,
2628 .maxlen
= sizeof(int),
2630 .proc_handler
= proc_dointvec
,
2633 .procname
= "gc_min_interval",
2634 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_min_interval
,
2635 .maxlen
= sizeof(int),
2637 .proc_handler
= proc_dointvec_jiffies
,
2640 .procname
= "gc_timeout",
2641 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_timeout
,
2642 .maxlen
= sizeof(int),
2644 .proc_handler
= proc_dointvec_jiffies
,
2647 .procname
= "gc_interval",
2648 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_interval
,
2649 .maxlen
= sizeof(int),
2651 .proc_handler
= proc_dointvec_jiffies
,
2654 .procname
= "gc_elasticity",
2655 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_elasticity
,
2656 .maxlen
= sizeof(int),
2658 .proc_handler
= proc_dointvec
,
2661 .procname
= "mtu_expires",
2662 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_mtu_expires
,
2663 .maxlen
= sizeof(int),
2665 .proc_handler
= proc_dointvec_jiffies
,
2668 .procname
= "min_adv_mss",
2669 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_min_advmss
,
2670 .maxlen
= sizeof(int),
2672 .proc_handler
= proc_dointvec
,
2675 .procname
= "gc_min_interval_ms",
2676 .data
= &init_net
.ipv6
.sysctl
.ip6_rt_gc_min_interval
,
2677 .maxlen
= sizeof(int),
2679 .proc_handler
= proc_dointvec_ms_jiffies
,
2684 struct ctl_table
* __net_init
ipv6_route_sysctl_init(struct net
*net
)
2686 struct ctl_table
*table
;
2688 table
= kmemdup(ipv6_route_table_template
,
2689 sizeof(ipv6_route_table_template
),
2693 table
[0].data
= &net
->ipv6
.sysctl
.flush_delay
;
2694 table
[0].extra1
= net
;
2695 table
[1].data
= &net
->ipv6
.ip6_dst_ops
.gc_thresh
;
2696 table
[2].data
= &net
->ipv6
.sysctl
.ip6_rt_max_size
;
2697 table
[3].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
2698 table
[4].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_timeout
;
2699 table
[5].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_interval
;
2700 table
[6].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
;
2701 table
[7].data
= &net
->ipv6
.sysctl
.ip6_rt_mtu_expires
;
2702 table
[8].data
= &net
->ipv6
.sysctl
.ip6_rt_min_advmss
;
2703 table
[9].data
= &net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
;
2710 static int __net_init
ip6_route_net_init(struct net
*net
)
2714 memcpy(&net
->ipv6
.ip6_dst_ops
, &ip6_dst_ops_template
,
2715 sizeof(net
->ipv6
.ip6_dst_ops
));
2717 if (dst_entries_init(&net
->ipv6
.ip6_dst_ops
) < 0)
2718 goto out_ip6_dst_ops
;
2720 net
->ipv6
.ip6_null_entry
= kmemdup(&ip6_null_entry_template
,
2721 sizeof(*net
->ipv6
.ip6_null_entry
),
2723 if (!net
->ipv6
.ip6_null_entry
)
2724 goto out_ip6_dst_entries
;
2725 net
->ipv6
.ip6_null_entry
->dst
.path
=
2726 (struct dst_entry
*)net
->ipv6
.ip6_null_entry
;
2727 net
->ipv6
.ip6_null_entry
->dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
2728 dst_init_metrics(&net
->ipv6
.ip6_null_entry
->dst
,
2729 ip6_template_metrics
, true);
2731 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2732 net
->ipv6
.ip6_prohibit_entry
= kmemdup(&ip6_prohibit_entry_template
,
2733 sizeof(*net
->ipv6
.ip6_prohibit_entry
),
2735 if (!net
->ipv6
.ip6_prohibit_entry
)
2736 goto out_ip6_null_entry
;
2737 net
->ipv6
.ip6_prohibit_entry
->dst
.path
=
2738 (struct dst_entry
*)net
->ipv6
.ip6_prohibit_entry
;
2739 net
->ipv6
.ip6_prohibit_entry
->dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
2740 dst_init_metrics(&net
->ipv6
.ip6_prohibit_entry
->dst
,
2741 ip6_template_metrics
, true);
2743 net
->ipv6
.ip6_blk_hole_entry
= kmemdup(&ip6_blk_hole_entry_template
,
2744 sizeof(*net
->ipv6
.ip6_blk_hole_entry
),
2746 if (!net
->ipv6
.ip6_blk_hole_entry
)
2747 goto out_ip6_prohibit_entry
;
2748 net
->ipv6
.ip6_blk_hole_entry
->dst
.path
=
2749 (struct dst_entry
*)net
->ipv6
.ip6_blk_hole_entry
;
2750 net
->ipv6
.ip6_blk_hole_entry
->dst
.ops
= &net
->ipv6
.ip6_dst_ops
;
2751 dst_init_metrics(&net
->ipv6
.ip6_blk_hole_entry
->dst
,
2752 ip6_template_metrics
, true);
2755 net
->ipv6
.sysctl
.flush_delay
= 0;
2756 net
->ipv6
.sysctl
.ip6_rt_max_size
= 4096;
2757 net
->ipv6
.sysctl
.ip6_rt_gc_min_interval
= HZ
/ 2;
2758 net
->ipv6
.sysctl
.ip6_rt_gc_timeout
= 60*HZ
;
2759 net
->ipv6
.sysctl
.ip6_rt_gc_interval
= 30*HZ
;
2760 net
->ipv6
.sysctl
.ip6_rt_gc_elasticity
= 9;
2761 net
->ipv6
.sysctl
.ip6_rt_mtu_expires
= 10*60*HZ
;
2762 net
->ipv6
.sysctl
.ip6_rt_min_advmss
= IPV6_MIN_MTU
- 20 - 40;
2764 #ifdef CONFIG_PROC_FS
2765 proc_net_fops_create(net
, "ipv6_route", 0, &ipv6_route_proc_fops
);
2766 proc_net_fops_create(net
, "rt6_stats", S_IRUGO
, &rt6_stats_seq_fops
);
2768 net
->ipv6
.ip6_rt_gc_expire
= 30*HZ
;
2774 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2775 out_ip6_prohibit_entry
:
2776 kfree(net
->ipv6
.ip6_prohibit_entry
);
2778 kfree(net
->ipv6
.ip6_null_entry
);
2780 out_ip6_dst_entries
:
2781 dst_entries_destroy(&net
->ipv6
.ip6_dst_ops
);
2786 static void __net_exit
ip6_route_net_exit(struct net
*net
)
2788 #ifdef CONFIG_PROC_FS
2789 proc_net_remove(net
, "ipv6_route");
2790 proc_net_remove(net
, "rt6_stats");
2792 kfree(net
->ipv6
.ip6_null_entry
);
2793 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2794 kfree(net
->ipv6
.ip6_prohibit_entry
);
2795 kfree(net
->ipv6
.ip6_blk_hole_entry
);
2797 dst_entries_destroy(&net
->ipv6
.ip6_dst_ops
);
2800 static struct pernet_operations ip6_route_net_ops
= {
2801 .init
= ip6_route_net_init
,
2802 .exit
= ip6_route_net_exit
,
2805 static struct notifier_block ip6_route_dev_notifier
= {
2806 .notifier_call
= ip6_route_dev_notify
,
2810 int __init
ip6_route_init(void)
2815 ip6_dst_ops_template
.kmem_cachep
=
2816 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info
), 0,
2817 SLAB_HWCACHE_ALIGN
, NULL
);
2818 if (!ip6_dst_ops_template
.kmem_cachep
)
2821 ret
= dst_entries_init(&ip6_dst_blackhole_ops
);
2823 goto out_kmem_cache
;
2825 ret
= register_pernet_subsys(&ip6_route_net_ops
);
2827 goto out_dst_entries
;
2829 ip6_dst_blackhole_ops
.kmem_cachep
= ip6_dst_ops_template
.kmem_cachep
;
2831 /* Registering of the loopback is done before this portion of code,
2832 * the loopback reference in rt6_info will not be taken, do it
2833 * manually for init_net */
2834 init_net
.ipv6
.ip6_null_entry
->dst
.dev
= init_net
.loopback_dev
;
2835 init_net
.ipv6
.ip6_null_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2836 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2837 init_net
.ipv6
.ip6_prohibit_entry
->dst
.dev
= init_net
.loopback_dev
;
2838 init_net
.ipv6
.ip6_prohibit_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2839 init_net
.ipv6
.ip6_blk_hole_entry
->dst
.dev
= init_net
.loopback_dev
;
2840 init_net
.ipv6
.ip6_blk_hole_entry
->rt6i_idev
= in6_dev_get(init_net
.loopback_dev
);
2844 goto out_register_subsys
;
2850 ret
= fib6_rules_init();
2855 if (__rtnl_register(PF_INET6
, RTM_NEWROUTE
, inet6_rtm_newroute
, NULL
) ||
2856 __rtnl_register(PF_INET6
, RTM_DELROUTE
, inet6_rtm_delroute
, NULL
) ||
2857 __rtnl_register(PF_INET6
, RTM_GETROUTE
, inet6_rtm_getroute
, NULL
))
2858 goto fib6_rules_init
;
2860 ret
= register_netdevice_notifier(&ip6_route_dev_notifier
);
2862 goto fib6_rules_init
;
2868 fib6_rules_cleanup();
2873 out_register_subsys
:
2874 unregister_pernet_subsys(&ip6_route_net_ops
);
2876 dst_entries_destroy(&ip6_dst_blackhole_ops
);
2878 kmem_cache_destroy(ip6_dst_ops_template
.kmem_cachep
);
2882 void ip6_route_cleanup(void)
2884 unregister_netdevice_notifier(&ip6_route_dev_notifier
);
2885 fib6_rules_cleanup();
2888 unregister_pernet_subsys(&ip6_route_net_ops
);
2889 dst_entries_destroy(&ip6_dst_blackhole_ops
);
2890 kmem_cache_destroy(ip6_dst_ops_template
.kmem_cachep
);