2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * IPv4 Forwarding Information Base: semantics.
8 * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
10 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
18 #include <asm/uaccess.h>
19 #include <asm/system.h>
20 #include <linux/bitops.h>
21 #include <linux/types.h>
22 #include <linux/kernel.h>
23 #include <linux/jiffies.h>
25 #include <linux/string.h>
26 #include <linux/socket.h>
27 #include <linux/sockios.h>
28 #include <linux/errno.h>
30 #include <linux/inet.h>
31 #include <linux/inetdevice.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/netlink.h>
37 #include <linux/init.h>
41 #include <net/protocol.h>
42 #include <net/route.h>
45 #include <net/ip_fib.h>
46 #include <net/ip_mp_alg.h>
48 #include "fib_lookup.h"
50 #define FSprintk(a...)
52 static DEFINE_SPINLOCK(fib_info_lock
);
53 static struct hlist_head
*fib_info_hash
;
54 static struct hlist_head
*fib_info_laddrhash
;
55 static unsigned int fib_hash_size
;
56 static unsigned int fib_info_cnt
;
58 #define DEVINDEX_HASHBITS 8
59 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
60 static struct hlist_head fib_info_devhash
[DEVINDEX_HASHSIZE
];
/*
 * Nexthop iteration helpers.
 *
 * for_nexthops(fi) and change_nexthops(fi) open a brace scope, declare the
 * loop variables "nhsel" (nexthop index) and "nh" (current nexthop), and
 * start a for loop whose body follows the macro.  The scope they open must
 * be closed with endfor_nexthops(fi).  for_nexthops() yields a const
 * pointer, change_nexthops() a writable one.
 */
#ifdef CONFIG_IP_ROUTE_MULTIPATH

static DEFINE_SPINLOCK(fib_multipath_lock);

#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)

#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/*
 * Without multipath support only the first nexthop is visited; the
 * single-pass loop keeps the calling syntax identical.  Hopefully gcc
 * optimizes the dummy loop away.
 */

#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
for (nhsel=0; nhsel < 1; nhsel++)

#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
for (nhsel=0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }
91 } fib_props
[RTA_MAX
+ 1] = {
94 .scope
= RT_SCOPE_NOWHERE
,
98 .scope
= RT_SCOPE_UNIVERSE
,
102 .scope
= RT_SCOPE_HOST
,
106 .scope
= RT_SCOPE_LINK
,
107 }, /* RTN_BROADCAST */
110 .scope
= RT_SCOPE_LINK
,
114 .scope
= RT_SCOPE_UNIVERSE
,
115 }, /* RTN_MULTICAST */
118 .scope
= RT_SCOPE_UNIVERSE
,
119 }, /* RTN_BLACKHOLE */
121 .error
= -EHOSTUNREACH
,
122 .scope
= RT_SCOPE_UNIVERSE
,
123 }, /* RTN_UNREACHABLE */
126 .scope
= RT_SCOPE_UNIVERSE
,
127 }, /* RTN_PROHIBIT */
130 .scope
= RT_SCOPE_UNIVERSE
,
134 .scope
= RT_SCOPE_NOWHERE
,
138 .scope
= RT_SCOPE_NOWHERE
,
139 }, /* RTN_XRESOLVE */
143 /* Release a nexthop info record */
145 void free_fib_info(struct fib_info
*fi
)
147 if (fi
->fib_dead
== 0) {
148 printk("Freeing alive fib_info %p\n", fi
);
151 change_nexthops(fi
) {
155 } endfor_nexthops(fi
);
160 void fib_release_info(struct fib_info
*fi
)
162 spin_lock_bh(&fib_info_lock
);
163 if (fi
&& --fi
->fib_treeref
== 0) {
164 hlist_del(&fi
->fib_hash
);
166 hlist_del(&fi
->fib_lhash
);
167 change_nexthops(fi
) {
170 hlist_del(&nh
->nh_hash
);
171 } endfor_nexthops(fi
)
175 spin_unlock_bh(&fib_info_lock
);
178 static __inline__
int nh_comp(const struct fib_info
*fi
, const struct fib_info
*ofi
)
180 const struct fib_nh
*onh
= ofi
->fib_nh
;
183 if (nh
->nh_oif
!= onh
->nh_oif
||
184 nh
->nh_gw
!= onh
->nh_gw
||
185 nh
->nh_scope
!= onh
->nh_scope
||
186 #ifdef CONFIG_IP_ROUTE_MULTIPATH
187 nh
->nh_weight
!= onh
->nh_weight
||
189 #ifdef CONFIG_NET_CLS_ROUTE
190 nh
->nh_tclassid
!= onh
->nh_tclassid
||
192 ((nh
->nh_flags
^onh
->nh_flags
)&~RTNH_F_DEAD
))
195 } endfor_nexthops(fi
);
199 static inline unsigned int fib_info_hashfn(const struct fib_info
*fi
)
201 unsigned int mask
= (fib_hash_size
- 1);
202 unsigned int val
= fi
->fib_nhs
;
204 val
^= fi
->fib_protocol
;
205 val
^= fi
->fib_prefsrc
;
206 val
^= fi
->fib_priority
;
208 return (val
^ (val
>> 7) ^ (val
>> 12)) & mask
;
211 static struct fib_info
*fib_find_info(const struct fib_info
*nfi
)
213 struct hlist_head
*head
;
214 struct hlist_node
*node
;
218 hash
= fib_info_hashfn(nfi
);
219 head
= &fib_info_hash
[hash
];
221 hlist_for_each_entry(fi
, node
, head
, fib_hash
) {
222 if (fi
->fib_nhs
!= nfi
->fib_nhs
)
224 if (nfi
->fib_protocol
== fi
->fib_protocol
&&
225 nfi
->fib_prefsrc
== fi
->fib_prefsrc
&&
226 nfi
->fib_priority
== fi
->fib_priority
&&
227 memcmp(nfi
->fib_metrics
, fi
->fib_metrics
,
228 sizeof(fi
->fib_metrics
)) == 0 &&
229 ((nfi
->fib_flags
^fi
->fib_flags
)&~RTNH_F_DEAD
) == 0 &&
230 (nfi
->fib_nhs
== 0 || nh_comp(fi
, nfi
) == 0))
237 static inline unsigned int fib_devindex_hashfn(unsigned int val
)
239 unsigned int mask
= DEVINDEX_HASHSIZE
- 1;
242 (val
>> DEVINDEX_HASHBITS
) ^
243 (val
>> (DEVINDEX_HASHBITS
* 2))) & mask
;
/* Check that the gateway is already configured.
   Used only by the redirect accept routine.
 */
250 int ip_fib_check_default(u32 gw
, struct net_device
*dev
)
252 struct hlist_head
*head
;
253 struct hlist_node
*node
;
257 spin_lock(&fib_info_lock
);
259 hash
= fib_devindex_hashfn(dev
->ifindex
);
260 head
= &fib_info_devhash
[hash
];
261 hlist_for_each_entry(nh
, node
, head
, nh_hash
) {
262 if (nh
->nh_dev
== dev
&&
264 !(nh
->nh_flags
&RTNH_F_DEAD
)) {
265 spin_unlock(&fib_info_lock
);
270 spin_unlock(&fib_info_lock
);
275 void rtmsg_fib(int event
, u32 key
, struct fib_alias
*fa
,
277 struct nlmsghdr
*n
, struct netlink_skb_parms
*req
)
280 u32 pid
= req
? req
->pid
: n
->nlmsg_pid
;
281 int size
= NLMSG_SPACE(sizeof(struct rtmsg
)+256);
283 skb
= alloc_skb(size
, GFP_KERNEL
);
287 if (fib_dump_info(skb
, pid
, n
->nlmsg_seq
, event
, tb_id
,
288 fa
->fa_type
, fa
->fa_scope
, &key
, z
,
290 fa
->fa_info
, 0) < 0) {
294 NETLINK_CB(skb
).dst_group
= RTNLGRP_IPV4_ROUTE
;
295 if (n
->nlmsg_flags
&NLM_F_ECHO
)
296 atomic_inc(&skb
->users
);
297 netlink_broadcast(rtnl
, skb
, pid
, RTNLGRP_IPV4_ROUTE
, GFP_KERNEL
);
298 if (n
->nlmsg_flags
&NLM_F_ECHO
)
299 netlink_unicast(rtnl
, skb
, pid
, MSG_DONTWAIT
);
302 /* Return the first fib alias matching TOS with
303 * priority less than or equal to PRIO.
305 struct fib_alias
*fib_find_alias(struct list_head
*fah
, u8 tos
, u32 prio
)
308 struct fib_alias
*fa
;
309 list_for_each_entry(fa
, fah
, fa_list
) {
310 if (fa
->fa_tos
> tos
)
312 if (fa
->fa_info
->fib_priority
>= prio
||
320 int fib_detect_death(struct fib_info
*fi
, int order
,
321 struct fib_info
**last_resort
, int *last_idx
, int *dflt
)
324 int state
= NUD_NONE
;
326 n
= neigh_lookup(&arp_tbl
, &fi
->fib_nh
[0].nh_gw
, fi
->fib_dev
);
328 state
= n
->nud_state
;
331 if (state
==NUD_REACHABLE
)
333 if ((state
&NUD_VALID
) && order
!= *dflt
)
335 if ((state
&NUD_VALID
) ||
336 (*last_idx
<0 && order
> *dflt
)) {
343 #ifdef CONFIG_IP_ROUTE_MULTIPATH
345 static u32
fib_get_attr32(struct rtattr
*attr
, int attrlen
, int type
)
347 while (RTA_OK(attr
,attrlen
)) {
348 if (attr
->rta_type
== type
)
349 return *(u32
*)RTA_DATA(attr
);
350 attr
= RTA_NEXT(attr
, attrlen
);
356 fib_count_nexthops(struct rtattr
*rta
)
359 struct rtnexthop
*nhp
= RTA_DATA(rta
);
360 int nhlen
= RTA_PAYLOAD(rta
);
362 while (nhlen
>= (int)sizeof(struct rtnexthop
)) {
363 if ((nhlen
-= nhp
->rtnh_len
) < 0)
366 nhp
= RTNH_NEXT(nhp
);
372 fib_get_nhs(struct fib_info
*fi
, const struct rtattr
*rta
, const struct rtmsg
*r
)
374 struct rtnexthop
*nhp
= RTA_DATA(rta
);
375 int nhlen
= RTA_PAYLOAD(rta
);
377 change_nexthops(fi
) {
378 int attrlen
= nhlen
- sizeof(struct rtnexthop
);
379 if (attrlen
< 0 || (nhlen
-= nhp
->rtnh_len
) < 0)
381 nh
->nh_flags
= (r
->rtm_flags
&~0xFF) | nhp
->rtnh_flags
;
382 nh
->nh_oif
= nhp
->rtnh_ifindex
;
383 nh
->nh_weight
= nhp
->rtnh_hops
+ 1;
385 nh
->nh_gw
= fib_get_attr32(RTNH_DATA(nhp
), attrlen
, RTA_GATEWAY
);
386 #ifdef CONFIG_NET_CLS_ROUTE
387 nh
->nh_tclassid
= fib_get_attr32(RTNH_DATA(nhp
), attrlen
, RTA_FLOW
);
390 nhp
= RTNH_NEXT(nhp
);
391 } endfor_nexthops(fi
);
397 int fib_nh_match(struct rtmsg
*r
, struct nlmsghdr
*nlh
, struct kern_rta
*rta
,
400 #ifdef CONFIG_IP_ROUTE_MULTIPATH
401 struct rtnexthop
*nhp
;
405 if (rta
->rta_priority
&&
406 *rta
->rta_priority
!= fi
->fib_priority
)
409 if (rta
->rta_oif
|| rta
->rta_gw
) {
410 if ((!rta
->rta_oif
|| *rta
->rta_oif
== fi
->fib_nh
->nh_oif
) &&
411 (!rta
->rta_gw
|| memcmp(rta
->rta_gw
, &fi
->fib_nh
->nh_gw
, 4) == 0))
416 #ifdef CONFIG_IP_ROUTE_MULTIPATH
417 if (rta
->rta_mp
== NULL
)
419 nhp
= RTA_DATA(rta
->rta_mp
);
420 nhlen
= RTA_PAYLOAD(rta
->rta_mp
);
423 int attrlen
= nhlen
- sizeof(struct rtnexthop
);
426 if (attrlen
< 0 || (nhlen
-= nhp
->rtnh_len
) < 0)
428 if (nhp
->rtnh_ifindex
&& nhp
->rtnh_ifindex
!= nh
->nh_oif
)
431 gw
= fib_get_attr32(RTNH_DATA(nhp
), attrlen
, RTA_GATEWAY
);
432 if (gw
&& gw
!= nh
->nh_gw
)
434 #ifdef CONFIG_NET_CLS_ROUTE
435 gw
= fib_get_attr32(RTNH_DATA(nhp
), attrlen
, RTA_FLOW
);
436 if (gw
&& gw
!= nh
->nh_tclassid
)
440 nhp
= RTNH_NEXT(nhp
);
441 } endfor_nexthops(fi
);
/*
   Semantics of nexthop is very messy for historical reasons.
   We have to take into account that:
   a) gateway can actually be a local interface address,
      so that a gatewayed route is direct.
   b) gateway must be an on-link address, possibly
      described not by an ifaddr, but also by a direct route.
   c) If both gateway and interface are specified, they should not
      contradict.
   d) If we use tunnel routes, gateway could be not on-link.

   Attempt to reconcile all of these (alas, self-contradictory) conditions
   results in pretty ugly and hairy code with obscure logic.

   I chose to generalize it instead, so that the size
   of code does not increase practically, but it becomes
   much more general.
   Every prefix is assigned a "scope" value: "host" is local address,
   "link" is direct route,
   [ ... "site" ... "interior" ... ]
   and "universe" is true gateway route with global meaning.

   Every prefix refers to a set of "nexthop"s (gw, oif),
   where gw must have narrower scope. This recursion stops
   when gw has LOCAL scope or if "nexthop" is declared ONLINK,
   which means that gw is forced to be on link.

   Code is still hairy, but now it is apparently logically
   consistent and very flexible. E.g., as a by-product it allows
   independent exterior and interior routing processes
   to co-exist in peace.

   Normally it looks as follows.

   {universe prefix}  -> (gw, oif) [scope link]
                          |
                          |-> {link prefix} -> (gw, oif) [scope local]
                                                |
                                                |-> {local prefix} (terminal node)
 */
491 static int fib_check_nh(const struct rtmsg
*r
, struct fib_info
*fi
, struct fib_nh
*nh
)
496 struct fib_result res
;
498 #ifdef CONFIG_IP_ROUTE_PERVASIVE
499 if (nh
->nh_flags
&RTNH_F_PERVASIVE
)
502 if (nh
->nh_flags
&RTNH_F_ONLINK
) {
503 struct net_device
*dev
;
505 if (r
->rtm_scope
>= RT_SCOPE_LINK
)
507 if (inet_addr_type(nh
->nh_gw
) != RTN_UNICAST
)
509 if ((dev
= __dev_get_by_index(nh
->nh_oif
)) == NULL
)
511 if (!(dev
->flags
&IFF_UP
))
515 nh
->nh_scope
= RT_SCOPE_LINK
;
519 struct flowi fl
= { .nl_u
= { .ip4_u
=
520 { .daddr
= nh
->nh_gw
,
521 .scope
= r
->rtm_scope
+ 1 } },
524 /* It is not necessary, but requires a bit of thinking */
525 if (fl
.fl4_scope
< RT_SCOPE_LINK
)
526 fl
.fl4_scope
= RT_SCOPE_LINK
;
527 if ((err
= fib_lookup(&fl
, &res
)) != 0)
531 if (res
.type
!= RTN_UNICAST
&& res
.type
!= RTN_LOCAL
)
533 nh
->nh_scope
= res
.scope
;
534 nh
->nh_oif
= FIB_RES_OIF(res
);
535 if ((nh
->nh_dev
= FIB_RES_DEV(res
)) == NULL
)
537 dev_hold(nh
->nh_dev
);
539 if (!(nh
->nh_dev
->flags
& IFF_UP
))
546 struct in_device
*in_dev
;
548 if (nh
->nh_flags
&(RTNH_F_PERVASIVE
|RTNH_F_ONLINK
))
551 in_dev
= inetdev_by_index(nh
->nh_oif
);
554 if (!(in_dev
->dev
->flags
&IFF_UP
)) {
558 nh
->nh_dev
= in_dev
->dev
;
559 dev_hold(nh
->nh_dev
);
560 nh
->nh_scope
= RT_SCOPE_HOST
;
566 static inline unsigned int fib_laddr_hashfn(u32 val
)
568 unsigned int mask
= (fib_hash_size
- 1);
570 return (val
^ (val
>> 7) ^ (val
>> 14)) & mask
;
573 static struct hlist_head
*fib_hash_alloc(int bytes
)
575 if (bytes
<= PAGE_SIZE
)
576 return kmalloc(bytes
, GFP_KERNEL
);
578 return (struct hlist_head
*)
579 __get_free_pages(GFP_KERNEL
, get_order(bytes
));
582 static void fib_hash_free(struct hlist_head
*hash
, int bytes
)
587 if (bytes
<= PAGE_SIZE
)
590 free_pages((unsigned long) hash
, get_order(bytes
));
593 static void fib_hash_move(struct hlist_head
*new_info_hash
,
594 struct hlist_head
*new_laddrhash
,
595 unsigned int new_size
)
597 struct hlist_head
*old_info_hash
, *old_laddrhash
;
598 unsigned int old_size
= fib_hash_size
;
599 unsigned int i
, bytes
;
601 spin_lock_bh(&fib_info_lock
);
602 old_info_hash
= fib_info_hash
;
603 old_laddrhash
= fib_info_laddrhash
;
604 fib_hash_size
= new_size
;
606 for (i
= 0; i
< old_size
; i
++) {
607 struct hlist_head
*head
= &fib_info_hash
[i
];
608 struct hlist_node
*node
, *n
;
611 hlist_for_each_entry_safe(fi
, node
, n
, head
, fib_hash
) {
612 struct hlist_head
*dest
;
613 unsigned int new_hash
;
615 hlist_del(&fi
->fib_hash
);
617 new_hash
= fib_info_hashfn(fi
);
618 dest
= &new_info_hash
[new_hash
];
619 hlist_add_head(&fi
->fib_hash
, dest
);
622 fib_info_hash
= new_info_hash
;
624 for (i
= 0; i
< old_size
; i
++) {
625 struct hlist_head
*lhead
= &fib_info_laddrhash
[i
];
626 struct hlist_node
*node
, *n
;
629 hlist_for_each_entry_safe(fi
, node
, n
, lhead
, fib_lhash
) {
630 struct hlist_head
*ldest
;
631 unsigned int new_hash
;
633 hlist_del(&fi
->fib_lhash
);
635 new_hash
= fib_laddr_hashfn(fi
->fib_prefsrc
);
636 ldest
= &new_laddrhash
[new_hash
];
637 hlist_add_head(&fi
->fib_lhash
, ldest
);
640 fib_info_laddrhash
= new_laddrhash
;
642 spin_unlock_bh(&fib_info_lock
);
644 bytes
= old_size
* sizeof(struct hlist_head
*);
645 fib_hash_free(old_info_hash
, bytes
);
646 fib_hash_free(old_laddrhash
, bytes
);
650 fib_create_info(const struct rtmsg
*r
, struct kern_rta
*rta
,
651 const struct nlmsghdr
*nlh
, int *errp
)
654 struct fib_info
*fi
= NULL
;
655 struct fib_info
*ofi
;
656 #ifdef CONFIG_IP_ROUTE_MULTIPATH
661 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
662 u32 mp_alg
= IP_MP_ALG_NONE
;
665 /* Fast check to catch the most weird cases */
666 if (fib_props
[r
->rtm_type
].scope
> r
->rtm_scope
)
669 #ifdef CONFIG_IP_ROUTE_MULTIPATH
671 nhs
= fib_count_nexthops(rta
->rta_mp
);
676 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
677 if (rta
->rta_mp_alg
) {
678 mp_alg
= *rta
->rta_mp_alg
;
680 if (mp_alg
< IP_MP_ALG_NONE
||
681 mp_alg
> IP_MP_ALG_MAX
)
687 if (fib_info_cnt
>= fib_hash_size
) {
688 unsigned int new_size
= fib_hash_size
<< 1;
689 struct hlist_head
*new_info_hash
;
690 struct hlist_head
*new_laddrhash
;
695 bytes
= new_size
* sizeof(struct hlist_head
*);
696 new_info_hash
= fib_hash_alloc(bytes
);
697 new_laddrhash
= fib_hash_alloc(bytes
);
698 if (!new_info_hash
|| !new_laddrhash
) {
699 fib_hash_free(new_info_hash
, bytes
);
700 fib_hash_free(new_laddrhash
, bytes
);
702 memset(new_info_hash
, 0, bytes
);
703 memset(new_laddrhash
, 0, bytes
);
705 fib_hash_move(new_info_hash
, new_laddrhash
, new_size
);
712 fi
= kzalloc(sizeof(*fi
)+nhs
*sizeof(struct fib_nh
), GFP_KERNEL
);
717 fi
->fib_protocol
= r
->rtm_protocol
;
720 change_nexthops(fi
) {
722 } endfor_nexthops(fi
)
724 fi
->fib_flags
= r
->rtm_flags
;
725 if (rta
->rta_priority
)
726 fi
->fib_priority
= *rta
->rta_priority
;
728 int attrlen
= RTA_PAYLOAD(rta
->rta_mx
);
729 struct rtattr
*attr
= RTA_DATA(rta
->rta_mx
);
731 while (RTA_OK(attr
, attrlen
)) {
732 unsigned flavor
= attr
->rta_type
;
734 if (flavor
> RTAX_MAX
)
736 fi
->fib_metrics
[flavor
-1] = *(unsigned*)RTA_DATA(attr
);
738 attr
= RTA_NEXT(attr
, attrlen
);
741 if (rta
->rta_prefsrc
)
742 memcpy(&fi
->fib_prefsrc
, rta
->rta_prefsrc
, 4);
745 #ifdef CONFIG_IP_ROUTE_MULTIPATH
746 if ((err
= fib_get_nhs(fi
, rta
->rta_mp
, r
)) != 0)
748 if (rta
->rta_oif
&& fi
->fib_nh
->nh_oif
!= *rta
->rta_oif
)
750 if (rta
->rta_gw
&& memcmp(&fi
->fib_nh
->nh_gw
, rta
->rta_gw
, 4))
752 #ifdef CONFIG_NET_CLS_ROUTE
753 if (rta
->rta_flow
&& memcmp(&fi
->fib_nh
->nh_tclassid
, rta
->rta_flow
, 4))
760 struct fib_nh
*nh
= fi
->fib_nh
;
762 nh
->nh_oif
= *rta
->rta_oif
;
764 memcpy(&nh
->nh_gw
, rta
->rta_gw
, 4);
765 #ifdef CONFIG_NET_CLS_ROUTE
767 memcpy(&nh
->nh_tclassid
, rta
->rta_flow
, 4);
769 nh
->nh_flags
= r
->rtm_flags
;
770 #ifdef CONFIG_IP_ROUTE_MULTIPATH
775 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
776 fi
->fib_mp_alg
= mp_alg
;
779 if (fib_props
[r
->rtm_type
].error
) {
780 if (rta
->rta_gw
|| rta
->rta_oif
|| rta
->rta_mp
)
785 if (r
->rtm_scope
> RT_SCOPE_HOST
)
788 if (r
->rtm_scope
== RT_SCOPE_HOST
) {
789 struct fib_nh
*nh
= fi
->fib_nh
;
791 /* Local address is added. */
792 if (nhs
!= 1 || nh
->nh_gw
)
794 nh
->nh_scope
= RT_SCOPE_NOWHERE
;
795 nh
->nh_dev
= dev_get_by_index(fi
->fib_nh
->nh_oif
);
797 if (nh
->nh_dev
== NULL
)
800 change_nexthops(fi
) {
801 if ((err
= fib_check_nh(r
, fi
, nh
)) != 0)
803 } endfor_nexthops(fi
)
806 if (fi
->fib_prefsrc
) {
807 if (r
->rtm_type
!= RTN_LOCAL
|| rta
->rta_dst
== NULL
||
808 memcmp(&fi
->fib_prefsrc
, rta
->rta_dst
, 4))
809 if (inet_addr_type(fi
->fib_prefsrc
) != RTN_LOCAL
)
814 if ((ofi
= fib_find_info(fi
)) != NULL
) {
822 atomic_inc(&fi
->fib_clntref
);
823 spin_lock_bh(&fib_info_lock
);
824 hlist_add_head(&fi
->fib_hash
,
825 &fib_info_hash
[fib_info_hashfn(fi
)]);
826 if (fi
->fib_prefsrc
) {
827 struct hlist_head
*head
;
829 head
= &fib_info_laddrhash
[fib_laddr_hashfn(fi
->fib_prefsrc
)];
830 hlist_add_head(&fi
->fib_lhash
, head
);
832 change_nexthops(fi
) {
833 struct hlist_head
*head
;
838 hash
= fib_devindex_hashfn(nh
->nh_dev
->ifindex
);
839 head
= &fib_info_devhash
[hash
];
840 hlist_add_head(&nh
->nh_hash
, head
);
841 } endfor_nexthops(fi
)
842 spin_unlock_bh(&fib_info_lock
);
857 /* Note! fib_semantic_match intentionally uses RCU list functions. */
858 int fib_semantic_match(struct list_head
*head
, const struct flowi
*flp
,
859 struct fib_result
*res
, __u32 zone
, __u32 mask
,
862 struct fib_alias
*fa
;
865 list_for_each_entry_rcu(fa
, head
, fa_list
) {
869 fa
->fa_tos
!= flp
->fl4_tos
)
872 if (fa
->fa_scope
< flp
->fl4_scope
)
875 fa
->fa_state
|= FA_S_ACCESSED
;
877 err
= fib_props
[fa
->fa_type
].error
;
879 struct fib_info
*fi
= fa
->fa_info
;
881 if (fi
->fib_flags
& RTNH_F_DEAD
)
884 switch (fa
->fa_type
) {
891 if (nh
->nh_flags
&RTNH_F_DEAD
)
893 if (!flp
->oif
|| flp
->oif
== nh
->nh_oif
)
896 #ifdef CONFIG_IP_ROUTE_MULTIPATH
897 if (nhsel
< fi
->fib_nhs
) {
910 printk(KERN_DEBUG
"impossible 102\n");
919 res
->prefixlen
= prefixlen
;
920 res
->nh_sel
= nh_sel
;
921 res
->type
= fa
->fa_type
;
922 res
->scope
= fa
->fa_scope
;
923 res
->fi
= fa
->fa_info
;
924 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
926 res
->network
= zone
&
927 (0xFFFFFFFF >> (32 - prefixlen
));
929 atomic_inc(&res
->fi
->fib_clntref
);
933 /* Find appropriate source address to this destination */
935 u32
__fib_res_prefsrc(struct fib_result
*res
)
937 return inet_select_addr(FIB_RES_DEV(*res
), FIB_RES_GW(*res
), res
->scope
);
941 fib_dump_info(struct sk_buff
*skb
, u32 pid
, u32 seq
, int event
,
942 u32 tb_id
, u8 type
, u8 scope
, void *dst
, int dst_len
, u8 tos
,
943 struct fib_info
*fi
, unsigned int flags
)
946 struct nlmsghdr
*nlh
;
947 unsigned char *b
= skb
->tail
;
949 nlh
= NLMSG_NEW(skb
, pid
, seq
, event
, sizeof(*rtm
), flags
);
950 rtm
= NLMSG_DATA(nlh
);
951 rtm
->rtm_family
= AF_INET
;
952 rtm
->rtm_dst_len
= dst_len
;
953 rtm
->rtm_src_len
= 0;
955 rtm
->rtm_table
= tb_id
;
956 RTA_PUT_U32(skb
, RTA_TABLE
, tb_id
);
957 rtm
->rtm_type
= type
;
958 rtm
->rtm_flags
= fi
->fib_flags
;
959 rtm
->rtm_scope
= scope
;
960 if (rtm
->rtm_dst_len
)
961 RTA_PUT(skb
, RTA_DST
, 4, dst
);
962 rtm
->rtm_protocol
= fi
->fib_protocol
;
963 if (fi
->fib_priority
)
964 RTA_PUT(skb
, RTA_PRIORITY
, 4, &fi
->fib_priority
);
965 if (rtnetlink_put_metrics(skb
, fi
->fib_metrics
) < 0)
968 RTA_PUT(skb
, RTA_PREFSRC
, 4, &fi
->fib_prefsrc
);
969 if (fi
->fib_nhs
== 1) {
970 if (fi
->fib_nh
->nh_gw
)
971 RTA_PUT(skb
, RTA_GATEWAY
, 4, &fi
->fib_nh
->nh_gw
);
972 if (fi
->fib_nh
->nh_oif
)
973 RTA_PUT(skb
, RTA_OIF
, sizeof(int), &fi
->fib_nh
->nh_oif
);
974 #ifdef CONFIG_NET_CLS_ROUTE
975 if (fi
->fib_nh
[0].nh_tclassid
)
976 RTA_PUT(skb
, RTA_FLOW
, 4, &fi
->fib_nh
[0].nh_tclassid
);
979 #ifdef CONFIG_IP_ROUTE_MULTIPATH
980 if (fi
->fib_nhs
> 1) {
981 struct rtnexthop
*nhp
;
982 struct rtattr
*mp_head
;
983 if (skb_tailroom(skb
) <= RTA_SPACE(0))
985 mp_head
= (struct rtattr
*)skb_put(skb
, RTA_SPACE(0));
988 if (skb_tailroom(skb
) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp
)) + 4))
990 nhp
= (struct rtnexthop
*)skb_put(skb
, RTA_ALIGN(sizeof(*nhp
)));
991 nhp
->rtnh_flags
= nh
->nh_flags
& 0xFF;
992 nhp
->rtnh_hops
= nh
->nh_weight
-1;
993 nhp
->rtnh_ifindex
= nh
->nh_oif
;
995 RTA_PUT(skb
, RTA_GATEWAY
, 4, &nh
->nh_gw
);
996 #ifdef CONFIG_NET_CLS_ROUTE
998 RTA_PUT(skb
, RTA_FLOW
, 4, &nh
->nh_tclassid
);
1000 nhp
->rtnh_len
= skb
->tail
- (unsigned char*)nhp
;
1001 } endfor_nexthops(fi
);
1002 mp_head
->rta_type
= RTA_MULTIPATH
;
1003 mp_head
->rta_len
= skb
->tail
- (u8
*)mp_head
;
1006 nlh
->nlmsg_len
= skb
->tail
- b
;
1011 skb_trim(skb
, b
- skb
->data
);
1015 #ifndef CONFIG_IP_NOSIOCRT
1018 fib_convert_rtentry(int cmd
, struct nlmsghdr
*nl
, struct rtmsg
*rtm
,
1019 struct kern_rta
*rta
, struct rtentry
*r
)
1024 memset(rtm
, 0, sizeof(*rtm
));
1025 memset(rta
, 0, sizeof(*rta
));
1027 if (r
->rt_dst
.sa_family
!= AF_INET
)
1028 return -EAFNOSUPPORT
;
1030 /* Check mask for validity:
1031 a) it must be contiguous.
1032 b) destination must have all host bits clear.
1033 c) if application forgot to set correct family (AF_INET),
1034 reject request unless it is absolutely clear i.e.
1035 both family and mask are zero.
1038 ptr
= &((struct sockaddr_in
*)&r
->rt_dst
)->sin_addr
.s_addr
;
1039 if (!(r
->rt_flags
&RTF_HOST
)) {
1040 u32 mask
= ((struct sockaddr_in
*)&r
->rt_genmask
)->sin_addr
.s_addr
;
1041 if (r
->rt_genmask
.sa_family
!= AF_INET
) {
1042 if (mask
|| r
->rt_genmask
.sa_family
)
1043 return -EAFNOSUPPORT
;
1045 if (bad_mask(mask
, *ptr
))
1047 plen
= inet_mask_len(mask
);
1050 nl
->nlmsg_flags
= NLM_F_REQUEST
;
1053 nl
->nlmsg_len
= NLMSG_LENGTH(sizeof(*rtm
));
1054 if (cmd
== SIOCDELRT
) {
1055 nl
->nlmsg_type
= RTM_DELROUTE
;
1056 nl
->nlmsg_flags
= 0;
1058 nl
->nlmsg_type
= RTM_NEWROUTE
;
1059 nl
->nlmsg_flags
= NLM_F_REQUEST
|NLM_F_CREATE
;
1060 rtm
->rtm_protocol
= RTPROT_BOOT
;
1063 rtm
->rtm_dst_len
= plen
;
1067 *(u32
*)&r
->rt_pad3
= r
->rt_metric
- 1;
1068 rta
->rta_priority
= (u32
*)&r
->rt_pad3
;
1070 if (r
->rt_flags
&RTF_REJECT
) {
1071 rtm
->rtm_scope
= RT_SCOPE_HOST
;
1072 rtm
->rtm_type
= RTN_UNREACHABLE
;
1075 rtm
->rtm_scope
= RT_SCOPE_NOWHERE
;
1076 rtm
->rtm_type
= RTN_UNICAST
;
1080 struct net_device
*dev
;
1081 char devname
[IFNAMSIZ
];
1083 if (copy_from_user(devname
, r
->rt_dev
, IFNAMSIZ
-1))
1085 devname
[IFNAMSIZ
-1] = 0;
1086 colon
= strchr(devname
, ':');
1089 dev
= __dev_get_by_name(devname
);
1092 rta
->rta_oif
= &dev
->ifindex
;
1094 struct in_ifaddr
*ifa
;
1095 struct in_device
*in_dev
= __in_dev_get_rtnl(dev
);
1099 for (ifa
= in_dev
->ifa_list
; ifa
; ifa
= ifa
->ifa_next
)
1100 if (strcmp(ifa
->ifa_label
, devname
) == 0)
1104 rta
->rta_prefsrc
= &ifa
->ifa_local
;
1108 ptr
= &((struct sockaddr_in
*)&r
->rt_gateway
)->sin_addr
.s_addr
;
1109 if (r
->rt_gateway
.sa_family
== AF_INET
&& *ptr
) {
1111 if (r
->rt_flags
&RTF_GATEWAY
&& inet_addr_type(*ptr
) == RTN_UNICAST
)
1112 rtm
->rtm_scope
= RT_SCOPE_UNIVERSE
;
1115 if (cmd
== SIOCDELRT
)
1118 if (r
->rt_flags
&RTF_GATEWAY
&& rta
->rta_gw
== NULL
)
1121 if (rtm
->rtm_scope
== RT_SCOPE_NOWHERE
)
1122 rtm
->rtm_scope
= RT_SCOPE_LINK
;
1124 if (r
->rt_flags
&(RTF_MTU
|RTF_WINDOW
|RTF_IRTT
)) {
1126 struct rtattr
*mx
= kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL
);
1130 mx
->rta_type
= RTA_METRICS
;
1131 mx
->rta_len
= RTA_LENGTH(0);
1132 if (r
->rt_flags
&RTF_MTU
) {
1133 rec
= (void*)((char*)mx
+ RTA_ALIGN(mx
->rta_len
));
1134 rec
->rta_type
= RTAX_ADVMSS
;
1135 rec
->rta_len
= RTA_LENGTH(4);
1136 mx
->rta_len
+= RTA_LENGTH(4);
1137 *(u32
*)RTA_DATA(rec
) = r
->rt_mtu
- 40;
1139 if (r
->rt_flags
&RTF_WINDOW
) {
1140 rec
= (void*)((char*)mx
+ RTA_ALIGN(mx
->rta_len
));
1141 rec
->rta_type
= RTAX_WINDOW
;
1142 rec
->rta_len
= RTA_LENGTH(4);
1143 mx
->rta_len
+= RTA_LENGTH(4);
1144 *(u32
*)RTA_DATA(rec
) = r
->rt_window
;
1146 if (r
->rt_flags
&RTF_IRTT
) {
1147 rec
= (void*)((char*)mx
+ RTA_ALIGN(mx
->rta_len
));
1148 rec
->rta_type
= RTAX_RTT
;
1149 rec
->rta_len
= RTA_LENGTH(4);
1150 mx
->rta_len
+= RTA_LENGTH(4);
1151 *(u32
*)RTA_DATA(rec
) = r
->rt_irtt
<<3;
1161 - local address disappeared -> we must delete all the entries
1163 - device went down -> we must shutdown all nexthops going via it.
1166 int fib_sync_down(u32 local
, struct net_device
*dev
, int force
)
1169 int scope
= RT_SCOPE_NOWHERE
;
1174 if (local
&& fib_info_laddrhash
) {
1175 unsigned int hash
= fib_laddr_hashfn(local
);
1176 struct hlist_head
*head
= &fib_info_laddrhash
[hash
];
1177 struct hlist_node
*node
;
1178 struct fib_info
*fi
;
1180 hlist_for_each_entry(fi
, node
, head
, fib_lhash
) {
1181 if (fi
->fib_prefsrc
== local
) {
1182 fi
->fib_flags
|= RTNH_F_DEAD
;
1189 struct fib_info
*prev_fi
= NULL
;
1190 unsigned int hash
= fib_devindex_hashfn(dev
->ifindex
);
1191 struct hlist_head
*head
= &fib_info_devhash
[hash
];
1192 struct hlist_node
*node
;
1195 hlist_for_each_entry(nh
, node
, head
, nh_hash
) {
1196 struct fib_info
*fi
= nh
->nh_parent
;
1199 BUG_ON(!fi
->fib_nhs
);
1200 if (nh
->nh_dev
!= dev
|| fi
== prev_fi
)
1204 change_nexthops(fi
) {
1205 if (nh
->nh_flags
&RTNH_F_DEAD
)
1207 else if (nh
->nh_dev
== dev
&&
1208 nh
->nh_scope
!= scope
) {
1209 nh
->nh_flags
|= RTNH_F_DEAD
;
1210 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1211 spin_lock_bh(&fib_multipath_lock
);
1212 fi
->fib_power
-= nh
->nh_power
;
1214 spin_unlock_bh(&fib_multipath_lock
);
1218 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1219 if (force
> 1 && nh
->nh_dev
== dev
) {
1224 } endfor_nexthops(fi
)
1225 if (dead
== fi
->fib_nhs
) {
1226 fi
->fib_flags
|= RTNH_F_DEAD
;
1235 #ifdef CONFIG_IP_ROUTE_MULTIPATH
/*
   Dead device goes up. We wake up dead nexthops.
   It makes sense only for multipath routes.
 */
1242 int fib_sync_up(struct net_device
*dev
)
1244 struct fib_info
*prev_fi
;
1246 struct hlist_head
*head
;
1247 struct hlist_node
*node
;
1251 if (!(dev
->flags
&IFF_UP
))
1255 hash
= fib_devindex_hashfn(dev
->ifindex
);
1256 head
= &fib_info_devhash
[hash
];
1259 hlist_for_each_entry(nh
, node
, head
, nh_hash
) {
1260 struct fib_info
*fi
= nh
->nh_parent
;
1263 BUG_ON(!fi
->fib_nhs
);
1264 if (nh
->nh_dev
!= dev
|| fi
== prev_fi
)
1269 change_nexthops(fi
) {
1270 if (!(nh
->nh_flags
&RTNH_F_DEAD
)) {
1274 if (nh
->nh_dev
== NULL
|| !(nh
->nh_dev
->flags
&IFF_UP
))
1276 if (nh
->nh_dev
!= dev
|| !__in_dev_get_rtnl(dev
))
1279 spin_lock_bh(&fib_multipath_lock
);
1281 nh
->nh_flags
&= ~RTNH_F_DEAD
;
1282 spin_unlock_bh(&fib_multipath_lock
);
1283 } endfor_nexthops(fi
)
1286 fi
->fib_flags
&= ~RTNH_F_DEAD
;
1295 The algorithm is suboptimal, but it provides really
1296 fair weighted route distribution.
1299 void fib_select_multipath(const struct flowi
*flp
, struct fib_result
*res
)
1301 struct fib_info
*fi
= res
->fi
;
1304 spin_lock_bh(&fib_multipath_lock
);
1305 if (fi
->fib_power
<= 0) {
1307 change_nexthops(fi
) {
1308 if (!(nh
->nh_flags
&RTNH_F_DEAD
)) {
1309 power
+= nh
->nh_weight
;
1310 nh
->nh_power
= nh
->nh_weight
;
1312 } endfor_nexthops(fi
);
1313 fi
->fib_power
= power
;
1315 spin_unlock_bh(&fib_multipath_lock
);
1316 /* Race condition: route has just become dead. */
1323 /* w should be random number [0..fi->fib_power-1],
1324 it is pretty bad approximation.
1327 w
= jiffies
% fi
->fib_power
;
1329 change_nexthops(fi
) {
1330 if (!(nh
->nh_flags
&RTNH_F_DEAD
) && nh
->nh_power
) {
1331 if ((w
-= nh
->nh_power
) <= 0) {
1334 res
->nh_sel
= nhsel
;
1335 spin_unlock_bh(&fib_multipath_lock
);
1339 } endfor_nexthops(fi
);
1341 /* Race condition: route has just become dead. */
1343 spin_unlock_bh(&fib_multipath_lock
);