/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window clamping.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#include <linux/sysctl.h>
#define RT_FL_TOS(oldflp) \
	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT	(300*HZ)
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly	= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;
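/*
 * Note: the tunables above are in jiffies (hence the HZ arithmetic) and, as
 * a sketch of the usual arrangement (assuming the net.ipv4.route.* sysctl
 * table registered elsewhere in this file), correspond to gc_timeout,
 * gc_interval, redirect_number, redirect_load, redirect_silence, error_cost,
 * error_burst, gc_elasticity, mtu_expires, min_pmtu and min_adv_mss.
 */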
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_default_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);

	if (!p)
		return NULL;

	if (old) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);
		if (prev != old) {
			kfree(p);
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			struct rtable *rt = (struct rtable *) dst;

			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.default_mtu =		ipv4_default_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
};
#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
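/*
 * Lookup sketch (assuming the rt_tos2priority() helper in <net/route.h>,
 * which indexes this table as ip_tos2prio[IPTOS_TOS(tos) >> 1]): a TOS of
 * 0x10 (low delay) gives index 8, i.e. TC_PRIO_INTERACTIVE, while the low
 * (ECN) bit selects the ECN_OR_COST() variant of the same class.
 */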
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_ATOMIC);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
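/*
 * Folding the per-netns generation id into the hash key means a cache flush
 * never has to walk the table synchronously: rt_cache_invalidate() just bumps
 * rt_genid, so old entries stop matching lookups (see rt_is_expired()) and
 * are reaped lazily by the garbage collector and rt_check_expire().
 */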
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}
static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}
static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;

	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->dst.dev ? r->dst.dev->name : "*",
			   (__force u32)r->rt_dst,
			   (__force u32)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->dst.__refcnt),
			   r->dst.__use, 0, (__force u32)r->rt_src,
			   dst_metric_advmss(&r->dst) + 40,
			   dst_metric(&r->dst, RTAX_WINDOW),
			   (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
				 dst_metric(&r->dst, RTAX_RTTVAR)),
			   r->fl.fl4_tos,
			   r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
			   r->dst.hh ? (r->dst.hh->hh_output ==
					dev_queue_xmit) : 0,
			   r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			    sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,
		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
				   &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->dst.expires;
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	if (rth->dst.expires &&
	    time_after_eq(jiffies, rth->dst.expires))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}
/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
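/*
 * In other words, a route scores higher the more recently it was used (the
 * negated age in the low 30 bits), with bit 30 marking output/unicast routes
 * and bit 31 marking "valuable" ones; rt_intern_hash() uses this to pick the
 * lowest-scoring entry as the eviction candidate on an overlong chain.
 */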
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}
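/*
 * Caching turns itself off for a namespace once the emergency hash rebuild
 * counter passes the rt_cache_rebuild_count sysctl; see the "rebuilds is
 * over limit" warning in rt_intern_hash() below.
 */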
static inline bool compare_hash_inputs(const struct flowi *fl1,
				       const struct flowi *fl2)
{
	return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
		((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
		(fl1->iif ^ fl2->iif)) == 0);
}

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
		((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}
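/*
 * Both comparisons above are branchless on purpose: the XOR of each field
 * pair is zero iff the fields are equal, and OR-ing the results gives a
 * single "all fields equal" test without short-circuit branches in the
 * cache lookup fast path.
 */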
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;
	struct rtable *list;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_dereference_raw(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimation of rt_chain_length_max:
 *   rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */
#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
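/*
 * Worked example of the fixed-point encoding: with FRACT_BITS = 3, ONE is 8,
 * so a measured average chain length of 2.5 entries is carried as 20
 * (2.5 * 8); the final ">> FRACT_BITS" in rt_check_expire() converts the
 * AVG + 4*SD bound back to whole entries.
 */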
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif).
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(&aux->fl, &rth->fl))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving a recent rt_genid.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previously cache-invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the newly generated ones.

   Current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expire stays large enough to keep enough warm entries,
   and when load increases it reduces to limit cache size.
 */
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
				    !rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - expire is reduced to zero. Otherwise, expire is halved.
		   - table is not full.
		   - we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		     We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				dst_entries_get_fast(&ipv4_dst_ops), goal, i);
#endif

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
#endif
out:	return 0;
}
/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}
static int rt_intern_hash(unsigned hash, struct rtable *rt,
			  struct rtable **rp, struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it.
		 *
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = arp_bind_neighbour(&rt->dst);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					       "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return err;
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (rp)
				*rp = rth;
			else
				skb_dst_set(skb, &rth->dst);
			return 0;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
				       ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = arp_bind_neighbour(&rt->dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2
	if (rt->dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
		       hash, &rt->rt_dst);
		for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
			printk(" . %pI4", &trt->rt_dst);
		printk("\n");
	}
#endif
	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (rp)
		*rp = rt;
	else
		skb_dst_set(skb, &rt->dst);
	return 0;
}
void rt_bind_peer(struct rtable *rt, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(rt->rt_dst, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
}
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chances to
 * select an ID being unique in a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If peer is attached to destination, it is never detached,
		   so that we need not to grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct rtable *rth;
	struct rtable __rcu **rthp;
	__be32 skeys[2] = { saddr, 0 };
	int ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!rt_caching(net))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rt_is_input_route(rth) ||
				    rt_is_expired(rth) ||
				    !net_eq(dev_net(rth->dst.dev), net)) {
					rthp = &rth->dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->rt_gateway != old_gw ||
				    rth->dst.dev != dev)
					break;

				dst_hold(&rth->dst);

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				rt->dst.__use		= 1;
				atomic_set(&rt->dst.__refcnt, 1);
				rt->dst.child		= NULL;
				if (rt->dst.dev)
					dev_hold(rt->dst.dev);
				rt->dst.obsolete	= -1;
				rt->dst.lastuse		= jiffies;
				rt->dst.path		= &rt->dst;
				rt->dst.neighbour	= NULL;
#ifdef CONFIG_XFRM
				rt->dst.xfrm		= NULL;
#endif
				rt->rt_genid		= rt_genid(net);
				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);
				if (rt->fi)
					atomic_inc(&rt->fi->fib_clntref);

				if (arp_bind_neighbour(&rt->dst) ||
				    !(rt->dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->dst.neighbour)
						neigh_event_send(rt->dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->dst;
				netevent.new = &rt->dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
					ip_rt_put(rt);
				goto do_next;
			}
		do_next:
			;
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   (rt->dst.expires &&
			    time_after_eq(jiffies, rt->dst.expires))) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif,
						rt_genid(dev_net(dst->dev)));
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
				&rt->rt_dst, rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
		rt->dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything,
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
		rt->dst.rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->dst.rate_last +
			(ip_rt_redirect_load << rt->dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->dst.rate_last = jiffies;
		++rt->dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    rt->dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
				&rt->rt_src, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
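/*
 * Backoff sketch with the defaults above: the k-th redirect to a stubborn
 * host may be sent once jiffies passes rate_last + (ip_rt_redirect_load << k)
 * (HZ/50, doubling each time); after ip_rt_redirect_number (9) unanswered
 * redirects we go quiet until ip_rt_redirect_silence (~20s) has elapsed.
 */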
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	unsigned long now;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	now = jiffies;
	rt->dst.rate_tokens += now - rt->dst.rate_last;
	if (rt->dst.rate_tokens > ip_rt_error_burst)
		rt->dst.rate_tokens = ip_rt_error_burst;
	rt->dst.rate_last = now;
	if (rt->dst.rate_tokens >= ip_rt_error_cost) {
		rt->dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
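/*
 * Classic RFC 1191 plateau search: given a bogus or missing next-hop MTU in
 * an ICMP "fragmentation needed", fall back to the next plateau strictly
 * below the failed packet size, e.g. old_mtu = 1500 guesses 1492 (Ethernet
 * over PPPoE), old_mtu = 1006 guesses 576.
 */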
unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	int i, k;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	int ikeys[2] = { dev->ifindex, 0 };
	__be32 skeys[2] = { iph->saddr, 0, };
	__be32 daddr = iph->daddr;
	unsigned short est_mtu = 0;

	for (k = 0; k < 2; k++) {
		for (i = 0; i < 2; i++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rcu_read_lock();
			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
			     rth = rcu_dereference(rth->dst.rt_next)) {
				unsigned short mtu = new_mtu;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->rt_dst != daddr ||
				    rth->rt_src != iph->saddr ||
				    rth->fl.oif != ikeys[k] ||
				    rt_is_input_route(rth) ||
				    dst_metric_locked(&rth->dst, RTAX_MTU) ||
				    !net_eq(dev_net(rth->dst.dev), net) ||
				    rt_is_expired(rth))
					continue;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= dst_mtu(&rth->dst) &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= dst_mtu(&rth->dst)) {
					if (mtu < dst_mtu(&rth->dst)) {
						dst_confirm(&rth->dst);
						if (mtu < ip_rt_min_pmtu) {
							u32 lock = dst_metric(&rth->dst,
									      RTAX_LOCK);
							mtu = ip_rt_min_pmtu;
							lock |= (1 << RTAX_MTU);
							dst_metric_set(&rth->dst, RTAX_LOCK,
								       lock);
						}
						dst_metric_set(&rth->dst, RTAX_MTU, mtu);
						dst_set_expires(&rth->dst,
								ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
			rcu_read_unlock();
		}
	}
	return est_mtu ? : new_mtu;
}
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst_mtu(dst) > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			u32 lock = dst_metric(dst, RTAX_LOCK);
			mtu = ip_rt_min_pmtu;
			dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
		}
		dst_metric_set(dst, RTAX_MTU, mtu);
		dst_set_expires(dst, ip_rt_mtu_expires);
		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
	}
}
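/*
 * A learned PMTU below ip_rt_min_pmtu (512 + 20 + 20 = 552 bytes by default)
 * is clamped and the MTU metric locked, so a forged ICMP cannot shrink the
 * path MTU arbitrarily; the learned value ages out after ip_rt_mtu_expires
 * (10 minutes) and the route falls back to the device MTU.
 */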
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	if (rt_is_expired((struct rtable *)dst))
		return NULL;
	return dst;
}
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	dst_destroy_metrics_generic(dst);
	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}
static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt_is_output_route(rt))
		src = rt->rt_src;
	else {
		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
			src = FIB_RES_PREFSRC(res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}
static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		const struct rtable *rt = (const struct rtable *) dst;

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
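/*
 * The 576-byte fallback above is the classic minimum IPv4 datagram size
 * hosts must accept (RFC 1122): when the MTU metric is locked and the
 * destination is behind a gateway, we conservatively assume no more than
 * 576 bytes end-to-end.
 */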
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt->fi = fi;
		atomic_inc(&fi->fib_clntref);
		dst_init_metrics(dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag, 0);
		if (err < 0)
			goto e_err;
	}
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->dst.output = ip_rt_bug;
	rth->dst.obsolete = -1;

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags	= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->dst.dev	= init_net.loopback_dev;
	dev_hold(rth->dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	if (our) {
		rth->dst.input	= ip_local_deliver;
		rth->rt_flags	|= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag, skb->mark);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags	= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif	=
	rth->fl.iif	= in_dev->dev->ifindex;
	rth->dst.dev	= (out_dev)->dev;
	dev_hold(rth->dst.dev);
	rth->fl.oif	= 0;
	rth->rt_spec_dst = spec_dst;

	rth->dst.obsolete = -1;
	rth->dst.input	= ip_forward;
	rth->dst.output	= ip_output;
	rth->rt_genid	= rt_genid(dev_net(rth->dst.dev));

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi *fl,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif,
		       rt_genid(dev_net(rth->dst.dev)));
	return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
}
/*
 *	NOTE. We drop all the packets that have a local source
 *	address, because every properly looped back packet
 *	must have the correct destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% guarantee.
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi fl = { .fl4_dst	= daddr,
			    .fl4_src	= saddr,
			    .fl4_tos	= tos,
			    .fl4_scope	= RT_SCOPE_UNIVERSE,
			    .mark	= skb->mark,
			    .iif	= dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net	*net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	err = fib_lookup(net, &fl, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag, skb->mark);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag, skb->mark);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->dst.output	= ip_rt_bug;
	rth->dst.obsolete = -1;
	rth->rt_genid = rt_genid(net);

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags	= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->dst.dev	= net->loopback_dev;
	dev_hold(rth->dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst = spec_dst;
	rth->dst.input	= ip_local_deliver;
	rth->rt_flags	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input	= ip_error;
		rth->dst.error	= -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
	err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable *rth;
	unsigned hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
		     (rth->fl.iif ^ iif) |
		     rth->fl.oif |
		     (rth->fl.fl4_tos ^ tos)) == 0 &&
		    rth->fl.mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result a host on a multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
/* called with rcu_read_lock() */
static int __mkroute_output(struct rtable **result,
			    struct fib_result *res,
			    const struct flowi *fl,
			    const struct flowi *oldflp,
			    struct net_device *dev_out,
			    unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);

	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
		return -EINVAL;

	if (ipv4_is_lbcast(fl->fl4_dst))
		res->type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		res->fi = NULL;
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If the multicast route does not exist use
		 * the default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (res->fi && res->prefixlen < 4)
			res->fi = NULL;
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		return -ENOBUFS;

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags	= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
		rth->dst.flags |= DST_NOXFRM;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;

	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
	rth->fl.mark	= oldflp->mark;
	rth->rt_dst	= fl->fl4_dst;
	rth->rt_src	= fl->fl4_src;
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the routing
	   cache entry */
	rth->dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst = fl->fl4_src;

	rth->dst.output = ip_output;
	rth->dst.obsolete = -1;
	rth->rt_genid = rt_genid(dev_net(dev_out));

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;
	*result = rth;
	return 0;
}
/* called with rcu_read_lock() */
static int ip_mkroute_output(struct rtable **rp,
			     struct fib_result *res,
			     const struct flowi *fl,
			     const struct flowi *oldflp,
			     struct net_device *dev_out,
			     unsigned int flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned int hash;

	if (err == 0) {
		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
			       rt_genid(dev_net(dev_out)));
		err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
	}

	return err;
}
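
/*
 * Note: besides inserting the new entry into the bucket chosen above,
 * rt_intern_hash() may instead hand back an equivalent cached route
 * through *rp if one raced in first, freeing our candidate.
 */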
/*
 * Major route resolver routine.
 * called with rcu_read_lock();
 */

static int ip_route_output_slow(struct net *net, struct rtable **rp,
				const struct flowi *oldflp)
{
	u32 tos	= RT_FL_TOS(oldflp);
	struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
			    .fl4_src = oldflp->fl4_src,
			    .fl4_tos = tos & IPTOS_RT_MASK,
			    .fl4_scope = ((tos & RTO_ONLINK) ?
					  RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
			    .mark = oldflp->mark,
			    .iif = net->loopback_dev->ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned int flags = 0;
	struct net_device *dev_out = NULL;
	int err;

	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (ipv4_is_multicast(oldflp->fl4_src) ||
		    ipv4_is_lbcast(oldflp->fl4_src) ||
		    ipv4_is_zeronet(oldflp->fl4_src))
			goto out;
		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface, if
		      saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with the saddr
		      of another iface. --ANK
		 */

		if (oldflp->oif == 0 &&
		    (ipv4_is_multicast(oldflp->fl4_dst) ||
		     ipv4_is_lbcast(oldflp->fl4_dst))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source addr (the
			   routing cache cannot know that ttl is zero, so
			   the packet will not leave this host and the route
			   is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}
		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, oldflp->fl4_src, false))
				goto out;
		}
	}


	if (oldflp->oif) {
		dev_out = dev_get_by_index_rcu(net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			err = -ENETUNREACH;
			goto out;
		}
		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
		    ipv4_is_lbcast(oldflp->fl4_dst)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (ipv4_is_multicast(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl.oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}
	if (fib_lookup(net, &fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, the routing tables are wrong. Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, the routing
			   tables are looked up with only one purpose:
			   to catch if the destination is gatewayed, rather
			   than direct. Moreover, if MSG_DONTROUTE is set,
			   we send the packet, ignoring both the routing
			   tables and the ifaddr state. --ANK


			   We could do this even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		err = -ENETUNREACH;
		goto out;
	}
	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src) {
			if (res.fi->fib_prefsrc)
				fl.fl4_src = res.fi->fib_prefsrc;
			else
				fl.fl4_src = fl.fl4_dst;
		}
		dev_out = net->loopback_dev;
		fl.oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(net, &fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	dev_out = FIB_RES_DEV(res);
	fl.oif = dev_out->ifindex;


make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);

out:	return err;
}
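
/*
 * Resolution order above: validate any caller-supplied source address,
 * honour an explicit output interface, synthesize a loopback route when
 * no destination is given, and only then consult the FIB; multipath and
 * default-route selection run after a successful lookup.
 */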
int __ip_route_output_key(struct net *net, struct rtable **rp,
			  const struct flowi *flp)
{
	unsigned int hash;
	int res;
	struct rtable *rth;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rt_is_output_route(rth) &&
		    rth->fl.oif == flp->oif &&
		    rth->fl.mark == flp->mark &&
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
		      (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	rcu_read_lock();
	res = ip_route_output_slow(net, rp, flp);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family		= AF_INET,
	.protocol	= cpu_to_be16(ETH_P_IP),
	.destroy	= ipv4_dst_destroy,
	.check		= ipv4_blackhole_dst_check,
	.update_pmtu	= ipv4_rt_blackhole_update_pmtu,
};

static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(&(*rp)->dst);
	*rp = rt;
	return rt ? 0 : -ENOMEM;
}
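
/*
 * A blackhole dst stands in for the real route when __xfrm_lookup()
 * returns -EREMOTE (e.g. a non-blocking lookup hitting unresolved IPsec
 * state): it copies the original route's addressing but discards every
 * packet, so the caller keeps a valid dst while negotiation completes.
 */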
int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
			 struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
				    flags ? XFRM_LOOKUP_WAIT : 0);
		if (err == -EREMOTE)
			err = ipv4_dst_blackhole(net, rp, flp);

		return err;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);

int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}
EXPORT_SYMBOL(ip_route_output_key);
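
/*
 * Minimal usage sketch (daddr/saddr stand for the caller's addresses
 * and are not defined here); the flowi fields below are the ones the
 * cache lookup above actually keys on:
 *
 *	struct rtable *rt;
 *	struct flowi fl = { .fl4_dst = daddr,
 *			    .fl4_src = saddr,
 *			    .oif = sk->sk_bound_dev_if,
 *			    .mark = sk->sk_mark };
 *	if (ip_route_output_key(sock_net(sk), &rt, &fl))
 *		return -EHOSTUNREACH;
 */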
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	= AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->fl.mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);

	error = rt->dst.error;
	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
	if (rt->peer) {
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
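
/*
 * The NLA_PUT*() macros used above jump to nla_put_failure when the skb
 * runs out of tailroom; nlmsg_cancel() then trims the partially built
 * message so the caller can retry with a larger buffer.
 */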
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi fl = {
			.fl4_dst = dst,
			.fl4_src = src,
			.fl4_tos = rtm->rtm_tos,
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.mark = mark,
		};
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
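
/*
 * This handler serves RTM_GETROUTE requests (e.g. "ip route get"):
 * with RTA_IIF present it simulates input routing on a dummy skb,
 * otherwise it performs an ordinary output lookup, then echoes the
 * resolved route back to the requester through rt_fill_info().
 */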
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
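
/*
 * cb->args[0] and cb->args[1] persist the (hash bucket, chain index)
 * position between dump callbacks, so a multi-part dump resumes where
 * the previous skb filled up.
 */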
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
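
/*
 * The value written is forwarded as the delay argument of
 * rt_cache_flush(); e.g. from userspace:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * flushes the routing cache immediately.
 */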
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
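
/*
 * These knobs appear under /proc/sys/net/ipv4/route/ (e.g.
 * .../route/gc_thresh); the *_jiffies handlers translate between the
 * second-granularity values seen by userspace and kernel jiffies.
 */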
static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};
static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
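
/*
 * The route cache hash can be sized explicitly from the boot command
 * line, e.g. "rhash_entries=65536"; when unset, ip_rt_init() below lets
 * alloc_large_system_hash() pick a size based on available memory.
 */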
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct),
				    __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* All the timers started at system startup tend
	   to synchronize. Perturb this a bit.
	 */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);

	return rc;
}
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif