net/ipv4/route.c  (deliverable/linux.git, commit 980030d4e4ae9161b3a1a9edd971e57e341e0338)
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
39 *
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111
112 #define RT_FL_TOS(oldflp) \
113 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115 #define IP_MAX_MTU 0xFFF0
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly = 9;
124 static int ip_rt_redirect_load __read_mostly = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly = HZ;
127 static int ip_rt_error_burst __read_mostly = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly = 8;
129 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly = 256;
132 static int rt_chain_length_max __read_mostly = 20;
133
134 static struct delayed_work expires_work;
135 static unsigned long expires_ljiffies;
136
137 /*
138 * Interface to generic destination cache.
139 */
140
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
143 static unsigned int ipv4_default_mtu(const struct dst_entry *dst);
144 static void ipv4_dst_destroy(struct dst_entry *dst);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void ipv4_link_failure(struct sk_buff *skb);
147 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149
150 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
151 int how)
152 {
153 }
154
155 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
156 {
157 u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
158
159 if (p) {
160 u32 *old_p = __DST_METRICS_PTR(old);
161 unsigned long prev, new;
162
163 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
164
165 new = (unsigned long) p;
166 prev = cmpxchg(&dst->_metrics, old, new);
167
168 if (prev != old) {
169 kfree(p);
170 p = __DST_METRICS_PTR(prev);
171 if (prev & DST_METRICS_READ_ONLY)
172 p = NULL;
173 } else {
174 struct rtable *rt = (struct rtable *) dst;
175
176 if (rt->fi) {
177 fib_info_put(rt->fi);
178 rt->fi = NULL;
179 }
180 }
181 }
182 return p;
183 }
184
185 static struct dst_ops ipv4_dst_ops = {
186 .family = AF_INET,
187 .protocol = cpu_to_be16(ETH_P_IP),
188 .gc = rt_garbage_collect,
189 .check = ipv4_dst_check,
190 .default_advmss = ipv4_default_advmss,
191 .default_mtu = ipv4_default_mtu,
192 .cow_metrics = ipv4_cow_metrics,
193 .destroy = ipv4_dst_destroy,
194 .ifdown = ipv4_dst_ifdown,
195 .negative_advice = ipv4_negative_advice,
196 .link_failure = ipv4_link_failure,
197 .update_pmtu = ip_rt_update_pmtu,
198 .local_out = __ip_local_out,
199 };
200
201 #define ECN_OR_COST(class) TC_PRIO_##class
202
203 const __u8 ip_tos2prio[16] = {
204 TC_PRIO_BESTEFFORT,
205 ECN_OR_COST(FILLER),
206 TC_PRIO_BESTEFFORT,
207 ECN_OR_COST(BESTEFFORT),
208 TC_PRIO_BULK,
209 ECN_OR_COST(BULK),
210 TC_PRIO_BULK,
211 ECN_OR_COST(BULK),
212 TC_PRIO_INTERACTIVE,
213 ECN_OR_COST(INTERACTIVE),
214 TC_PRIO_INTERACTIVE,
215 ECN_OR_COST(INTERACTIVE),
216 TC_PRIO_INTERACTIVE_BULK,
217 ECN_OR_COST(INTERACTIVE_BULK),
218 TC_PRIO_INTERACTIVE_BULK,
219 ECN_OR_COST(INTERACTIVE_BULK)
220 };
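
/*
 * Editor's illustrative sketch, not part of the original file: the
 * ip_tos2prio[] table above is indexed with the RFC 1349 TOS bits of the
 * IP header, i.e. (tos & IPTOS_TOS_MASK) >> 1, which always yields one of
 * the 16 entries (this mirrors rt_tos2priority() in <net/route.h>).
 */
static inline __u8 example_tos2priority(__u8 tos)
{
	/* IPTOS_TOS_MASK is 0x1E, so the index is confined to 0..15 */
	return ip_tos2prio[(tos & IPTOS_TOS_MASK) >> 1];
}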
221
222
223 /*
224 * Route cache.
225 */
226
227 /* The locking scheme is rather straightforward:
228 *
229 * 1) Read-Copy Update protects the buckets of the central route hash.
230 * 2) Only writers remove entries, and they hold the lock
231 * as they look at rtable reference counts.
232 * 3) Only readers acquire references to rtable entries,
233 * they do so with atomic increments and with the
234 * lock held.
235 */
236
237 struct rt_hash_bucket {
238 struct rtable __rcu *chain;
239 };
240
241 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
242 defined(CONFIG_PROVE_LOCKING)
243 /*
244  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
245  * The size of this table is a power of two and depends on the number of CPUs.
246 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
247 */
248 #ifdef CONFIG_LOCKDEP
249 # define RT_HASH_LOCK_SZ 256
250 #else
251 # if NR_CPUS >= 32
252 # define RT_HASH_LOCK_SZ 4096
253 # elif NR_CPUS >= 16
254 # define RT_HASH_LOCK_SZ 2048
255 # elif NR_CPUS >= 8
256 # define RT_HASH_LOCK_SZ 1024
257 # elif NR_CPUS >= 4
258 # define RT_HASH_LOCK_SZ 512
259 # else
260 # define RT_HASH_LOCK_SZ 256
261 # endif
262 #endif
263
264 static spinlock_t *rt_hash_locks;
265 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
266
267 static __init void rt_hash_lock_init(void)
268 {
269 int i;
270
271 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
272 GFP_KERNEL);
273 if (!rt_hash_locks)
274 panic("IP: failed to allocate rt_hash_locks\n");
275
276 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
277 spin_lock_init(&rt_hash_locks[i]);
278 }
279 #else
280 # define rt_hash_lock_addr(slot) NULL
281
282 static inline void rt_hash_lock_init(void)
283 {
284 }
285 #endif
286
287 static struct rt_hash_bucket *rt_hash_table __read_mostly;
288 static unsigned rt_hash_mask __read_mostly;
289 static unsigned int rt_hash_log __read_mostly;
290
291 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
292 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
293
294 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
295 int genid)
296 {
297 return jhash_3words((__force u32)daddr, (__force u32)saddr,
298 idx, genid)
299 & rt_hash_mask;
300 }
301
302 static inline int rt_genid(struct net *net)
303 {
304 return atomic_read(&net->ipv4.rt_genid);
305 }
306
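
/*
 * Editor's illustrative sketch, not part of the original file: the
 * lockless read side described by the locking comment above.  A lookup
 * hashes the flow key with rt_hash(), walks the bucket chain under
 * rcu_read_lock_bh(), skips entries from an older generation, and takes
 * a reference on a hit with dst_use().  The real cached lookups live in
 * the input/output lookup paths later in this file; this helper only
 * shows the shape of the traversal.
 */
static struct rtable *example_cache_lookup(struct net *net, __be32 daddr,
					   __be32 saddr, int iif)
{
	unsigned int hash = rt_hash(daddr, saddr, iif, rt_genid(net));
	struct rtable *rth;

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->fl.fl4_dst == daddr &&
		    rth->fl.fl4_src == saddr &&
		    rth->fl.iif == iif &&
		    rth->rt_genid == rt_genid(net) &&
		    net_eq(dev_net(rth->dst.dev), net)) {
			dst_use(&rth->dst, jiffies);
			rcu_read_unlock_bh();
			return rth;	/* hit: reference held by dst_use() */
		}
	}
	rcu_read_unlock_bh();
	return NULL;			/* miss: caller takes the slow path */
}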
307 #ifdef CONFIG_PROC_FS
308 struct rt_cache_iter_state {
309 struct seq_net_private p;
310 int bucket;
311 int genid;
312 };
313
314 static struct rtable *rt_cache_get_first(struct seq_file *seq)
315 {
316 struct rt_cache_iter_state *st = seq->private;
317 struct rtable *r = NULL;
318
319 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
320 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
321 continue;
322 rcu_read_lock_bh();
323 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
324 while (r) {
325 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
326 r->rt_genid == st->genid)
327 return r;
328 r = rcu_dereference_bh(r->dst.rt_next);
329 }
330 rcu_read_unlock_bh();
331 }
332 return r;
333 }
334
335 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
336 struct rtable *r)
337 {
338 struct rt_cache_iter_state *st = seq->private;
339
340 r = rcu_dereference_bh(r->dst.rt_next);
341 while (!r) {
342 rcu_read_unlock_bh();
343 do {
344 if (--st->bucket < 0)
345 return NULL;
346 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
347 rcu_read_lock_bh();
348 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
349 }
350 return r;
351 }
352
353 static struct rtable *rt_cache_get_next(struct seq_file *seq,
354 struct rtable *r)
355 {
356 struct rt_cache_iter_state *st = seq->private;
357 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
358 if (dev_net(r->dst.dev) != seq_file_net(seq))
359 continue;
360 if (r->rt_genid == st->genid)
361 break;
362 }
363 return r;
364 }
365
366 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
367 {
368 struct rtable *r = rt_cache_get_first(seq);
369
370 if (r)
371 while (pos && (r = rt_cache_get_next(seq, r)))
372 --pos;
373 return pos ? NULL : r;
374 }
375
376 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
377 {
378 struct rt_cache_iter_state *st = seq->private;
379 if (*pos)
380 return rt_cache_get_idx(seq, *pos - 1);
381 st->genid = rt_genid(seq_file_net(seq));
382 return SEQ_START_TOKEN;
383 }
384
385 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
386 {
387 struct rtable *r;
388
389 if (v == SEQ_START_TOKEN)
390 r = rt_cache_get_first(seq);
391 else
392 r = rt_cache_get_next(seq, v);
393 ++*pos;
394 return r;
395 }
396
397 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
398 {
399 if (v && v != SEQ_START_TOKEN)
400 rcu_read_unlock_bh();
401 }
402
403 static int rt_cache_seq_show(struct seq_file *seq, void *v)
404 {
405 if (v == SEQ_START_TOKEN)
406 seq_printf(seq, "%-127s\n",
407 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
408 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
409 "HHUptod\tSpecDst");
410 else {
411 struct rtable *r = v;
412 int len;
413
414 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
415 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
416 r->dst.dev ? r->dst.dev->name : "*",
417 (__force u32)r->rt_dst,
418 (__force u32)r->rt_gateway,
419 r->rt_flags, atomic_read(&r->dst.__refcnt),
420 r->dst.__use, 0, (__force u32)r->rt_src,
421 dst_metric_advmss(&r->dst) + 40,
422 dst_metric(&r->dst, RTAX_WINDOW),
423 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
424 dst_metric(&r->dst, RTAX_RTTVAR)),
425 r->fl.fl4_tos,
426 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
427 r->dst.hh ? (r->dst.hh->hh_output ==
428 dev_queue_xmit) : 0,
429 r->rt_spec_dst, &len);
430
431 seq_printf(seq, "%*s\n", 127 - len, "");
432 }
433 return 0;
434 }
435
436 static const struct seq_operations rt_cache_seq_ops = {
437 .start = rt_cache_seq_start,
438 .next = rt_cache_seq_next,
439 .stop = rt_cache_seq_stop,
440 .show = rt_cache_seq_show,
441 };
442
443 static int rt_cache_seq_open(struct inode *inode, struct file *file)
444 {
445 return seq_open_net(inode, file, &rt_cache_seq_ops,
446 sizeof(struct rt_cache_iter_state));
447 }
448
449 static const struct file_operations rt_cache_seq_fops = {
450 .owner = THIS_MODULE,
451 .open = rt_cache_seq_open,
452 .read = seq_read,
453 .llseek = seq_lseek,
454 .release = seq_release_net,
455 };
456
457
458 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
459 {
460 int cpu;
461
462 if (*pos == 0)
463 return SEQ_START_TOKEN;
464
465 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
466 if (!cpu_possible(cpu))
467 continue;
468 *pos = cpu+1;
469 return &per_cpu(rt_cache_stat, cpu);
470 }
471 return NULL;
472 }
473
474 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
475 {
476 int cpu;
477
478 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
479 if (!cpu_possible(cpu))
480 continue;
481 *pos = cpu+1;
482 return &per_cpu(rt_cache_stat, cpu);
483 }
484 return NULL;
485
486 }
487
488 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
489 {
490
491 }
492
493 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
494 {
495 struct rt_cache_stat *st = v;
496
497 if (v == SEQ_START_TOKEN) {
498 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
499 return 0;
500 }
501
502 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
503 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
504 dst_entries_get_slow(&ipv4_dst_ops),
505 st->in_hit,
506 st->in_slow_tot,
507 st->in_slow_mc,
508 st->in_no_route,
509 st->in_brd,
510 st->in_martian_dst,
511 st->in_martian_src,
512
513 st->out_hit,
514 st->out_slow_tot,
515 st->out_slow_mc,
516
517 st->gc_total,
518 st->gc_ignored,
519 st->gc_goal_miss,
520 st->gc_dst_overflow,
521 st->in_hlist_search,
522 st->out_hlist_search
523 );
524 return 0;
525 }
526
527 static const struct seq_operations rt_cpu_seq_ops = {
528 .start = rt_cpu_seq_start,
529 .next = rt_cpu_seq_next,
530 .stop = rt_cpu_seq_stop,
531 .show = rt_cpu_seq_show,
532 };
533
534
535 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
536 {
537 return seq_open(file, &rt_cpu_seq_ops);
538 }
539
540 static const struct file_operations rt_cpu_seq_fops = {
541 .owner = THIS_MODULE,
542 .open = rt_cpu_seq_open,
543 .read = seq_read,
544 .llseek = seq_lseek,
545 .release = seq_release,
546 };
547
548 #ifdef CONFIG_IP_ROUTE_CLASSID
549 static int rt_acct_proc_show(struct seq_file *m, void *v)
550 {
551 struct ip_rt_acct *dst, *src;
552 unsigned int i, j;
553
554 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
555 if (!dst)
556 return -ENOMEM;
557
558 for_each_possible_cpu(i) {
559 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
560 for (j = 0; j < 256; j++) {
561 dst[j].o_bytes += src[j].o_bytes;
562 dst[j].o_packets += src[j].o_packets;
563 dst[j].i_bytes += src[j].i_bytes;
564 dst[j].i_packets += src[j].i_packets;
565 }
566 }
567
568 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
569 kfree(dst);
570 return 0;
571 }
572
573 static int rt_acct_proc_open(struct inode *inode, struct file *file)
574 {
575 return single_open(file, rt_acct_proc_show, NULL);
576 }
577
578 static const struct file_operations rt_acct_proc_fops = {
579 .owner = THIS_MODULE,
580 .open = rt_acct_proc_open,
581 .read = seq_read,
582 .llseek = seq_lseek,
583 .release = single_release,
584 };
585 #endif
586
587 static int __net_init ip_rt_do_proc_init(struct net *net)
588 {
589 struct proc_dir_entry *pde;
590
591 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
592 &rt_cache_seq_fops);
593 if (!pde)
594 goto err1;
595
596 pde = proc_create("rt_cache", S_IRUGO,
597 net->proc_net_stat, &rt_cpu_seq_fops);
598 if (!pde)
599 goto err2;
600
601 #ifdef CONFIG_IP_ROUTE_CLASSID
602 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
603 if (!pde)
604 goto err3;
605 #endif
606 return 0;
607
608 #ifdef CONFIG_IP_ROUTE_CLASSID
609 err3:
610 remove_proc_entry("rt_cache", net->proc_net_stat);
611 #endif
612 err2:
613 remove_proc_entry("rt_cache", net->proc_net);
614 err1:
615 return -ENOMEM;
616 }
617
618 static void __net_exit ip_rt_do_proc_exit(struct net *net)
619 {
620 remove_proc_entry("rt_cache", net->proc_net_stat);
621 remove_proc_entry("rt_cache", net->proc_net);
622 #ifdef CONFIG_IP_ROUTE_CLASSID
623 remove_proc_entry("rt_acct", net->proc_net);
624 #endif
625 }
626
627 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
628 .init = ip_rt_do_proc_init,
629 .exit = ip_rt_do_proc_exit,
630 };
631
632 static int __init ip_rt_proc_init(void)
633 {
634 return register_pernet_subsys(&ip_rt_proc_ops);
635 }
636
637 #else
638 static inline int ip_rt_proc_init(void)
639 {
640 return 0;
641 }
642 #endif /* CONFIG_PROC_FS */
643
644 static inline void rt_free(struct rtable *rt)
645 {
646 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
647 }
648
649 static inline void rt_drop(struct rtable *rt)
650 {
651 ip_rt_put(rt);
652 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
653 }
654
655 static inline int rt_fast_clean(struct rtable *rth)
656 {
657 	/* Kill broadcast/multicast entries very aggressively, if they
658 	   collide in the hash table with more useful entries */
659 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
660 rt_is_input_route(rth) && rth->dst.rt_next;
661 }
662
663 static inline int rt_valuable(struct rtable *rth)
664 {
665 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
666 rth->dst.expires;
667 }
668
669 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
670 {
671 unsigned long age;
672 int ret = 0;
673
674 if (atomic_read(&rth->dst.__refcnt))
675 goto out;
676
677 ret = 1;
678 if (rth->dst.expires &&
679 time_after_eq(jiffies, rth->dst.expires))
680 goto out;
681
682 age = jiffies - rth->dst.lastuse;
683 ret = 0;
684 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
685 (age <= tmo2 && rt_valuable(rth)))
686 goto out;
687 ret = 1;
688 out: return ret;
689 }
690
691 /* Bits of score are:
692 * 31: very valuable
693 * 30: not quite useless
694 * 29..0: usage counter
695 */
696 static inline u32 rt_score(struct rtable *rt)
697 {
698 u32 score = jiffies - rt->dst.lastuse;
699
700 score = ~score & ~(3<<30);
701
702 if (rt_valuable(rt))
703 score |= (1<<31);
704
705 if (rt_is_output_route(rt) ||
706 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
707 score |= (1<<30);
708
709 return score;
710 }
711
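
/*
 * Editor's worked example, not part of the original file: an unreferenced
 * broadcast input route last used 5000 jiffies ago scores
 *	~5000 & ~(3 << 30)				= 0x3fffec77
 * while a route used 1 jiffy ago that is both rt_valuable() and an output
 * route scores
 *	(~1 & ~(3 << 30)) | (1 << 31) | (1 << 30)	= 0xfffffffe
 * so rt_intern_hash() below, which evicts the minimum-score entry of an
 * overlong bucket, reclaims the idle broadcast route first.
 */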
712 static inline bool rt_caching(const struct net *net)
713 {
714 return net->ipv4.current_rt_cache_rebuild_count <=
715 net->ipv4.sysctl_rt_cache_rebuild_count;
716 }
717
718 static inline bool compare_hash_inputs(const struct flowi *fl1,
719 const struct flowi *fl2)
720 {
721 return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
722 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
723 (fl1->iif ^ fl2->iif)) == 0);
724 }
725
726 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
727 {
728 return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
729 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
730 (fl1->mark ^ fl2->mark) |
731 (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
732 (fl1->oif ^ fl2->oif) |
733 (fl1->iif ^ fl2->iif)) == 0;
734 }
735
736 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
737 {
738 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
739 }
740
741 static inline int rt_is_expired(struct rtable *rth)
742 {
743 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
744 }
745
746 /*
747  * Perform a full scan of the hash table and free all entries.
748  * Can be called by a softirq or a process.
749  * In the latter case, we want to be rescheduled if necessary.
750 */
751 static void rt_do_flush(struct net *net, int process_context)
752 {
753 unsigned int i;
754 struct rtable *rth, *next;
755
756 for (i = 0; i <= rt_hash_mask; i++) {
757 struct rtable __rcu **pprev;
758 struct rtable *list;
759
760 if (process_context && need_resched())
761 cond_resched();
762 rth = rcu_dereference_raw(rt_hash_table[i].chain);
763 if (!rth)
764 continue;
765
766 spin_lock_bh(rt_hash_lock_addr(i));
767
768 list = NULL;
769 pprev = &rt_hash_table[i].chain;
770 rth = rcu_dereference_protected(*pprev,
771 lockdep_is_held(rt_hash_lock_addr(i)));
772
773 while (rth) {
774 next = rcu_dereference_protected(rth->dst.rt_next,
775 lockdep_is_held(rt_hash_lock_addr(i)));
776
777 if (!net ||
778 net_eq(dev_net(rth->dst.dev), net)) {
779 rcu_assign_pointer(*pprev, next);
780 rcu_assign_pointer(rth->dst.rt_next, list);
781 list = rth;
782 } else {
783 pprev = &rth->dst.rt_next;
784 }
785 rth = next;
786 }
787
788 spin_unlock_bh(rt_hash_lock_addr(i));
789
790 for (; list; list = next) {
791 next = rcu_dereference_protected(list->dst.rt_next, 1);
792 rt_free(list);
793 }
794 }
795 }
796
797 /*
798 * While freeing expired entries, we compute average chain length
799 * and standard deviation, using fixed-point arithmetic.
800  * This is to have an estimate of rt_chain_length_max:
801  * rt_chain_length_max = max(elasticity, AVG + 4*SD)
802  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
803 */
804
805 #define FRACT_BITS 3
806 #define ONE (1UL << FRACT_BITS)
807
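/*
 * Editor's worked example, not part of the original file: has_noalias()
 * below contributes ONE (1 << FRACT_BITS == 8) per distinct entry, so the
 * per-bucket "length" accumulated in rt_check_expire() is already scaled
 * by 8.  With an average of 2.5 distinct entries per chain (avg == 20)
 * and a standard deviation of one entry (sd == 8):
 *	rt_chain_length_max = max(ip_rt_gc_elasticity, (20 + 4*8) >> 3)
 *			    = max(8, 6) = 8
 * i.e. the elasticity floor still wins until chains grow noticeably longer.
 */
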
808 /*
809 * Given a hash chain and an item in this hash chain,
810 * find if a previous entry has the same hash_inputs
811  * (but differs on tos, mark or oif).
812 * Returns 0 if an alias is found.
813 * Returns ONE if rth has no alias before itself.
814 */
815 static int has_noalias(const struct rtable *head, const struct rtable *rth)
816 {
817 const struct rtable *aux = head;
818
819 while (aux != rth) {
820 if (compare_hash_inputs(&aux->fl, &rth->fl))
821 return 0;
822 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
823 }
824 return ONE;
825 }
826
827 static void rt_check_expire(void)
828 {
829 static unsigned int rover;
830 unsigned int i = rover, goal;
831 struct rtable *rth;
832 struct rtable __rcu **rthp;
833 unsigned long samples = 0;
834 unsigned long sum = 0, sum2 = 0;
835 unsigned long delta;
836 u64 mult;
837
838 delta = jiffies - expires_ljiffies;
839 expires_ljiffies = jiffies;
840 mult = ((u64)delta) << rt_hash_log;
841 if (ip_rt_gc_timeout > 1)
842 do_div(mult, ip_rt_gc_timeout);
843 goal = (unsigned int)mult;
844 if (goal > rt_hash_mask)
845 goal = rt_hash_mask + 1;
846 for (; goal > 0; goal--) {
847 unsigned long tmo = ip_rt_gc_timeout;
848 unsigned long length;
849
850 i = (i + 1) & rt_hash_mask;
851 rthp = &rt_hash_table[i].chain;
852
853 if (need_resched())
854 cond_resched();
855
856 samples++;
857
858 if (rcu_dereference_raw(*rthp) == NULL)
859 continue;
860 length = 0;
861 spin_lock_bh(rt_hash_lock_addr(i));
862 while ((rth = rcu_dereference_protected(*rthp,
863 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
864 prefetch(rth->dst.rt_next);
865 if (rt_is_expired(rth)) {
866 *rthp = rth->dst.rt_next;
867 rt_free(rth);
868 continue;
869 }
870 if (rth->dst.expires) {
871 /* Entry is expired even if it is in use */
872 if (time_before_eq(jiffies, rth->dst.expires)) {
873 nofree:
874 tmo >>= 1;
875 rthp = &rth->dst.rt_next;
876 /*
877 * We only count entries on
878 * a chain with equal hash inputs once
879 * so that entries for different QOS
880 * levels, and other non-hash input
881 * attributes don't unfairly skew
882 * the length computation
883 */
884 length += has_noalias(rt_hash_table[i].chain, rth);
885 continue;
886 }
887 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
888 goto nofree;
889
890 /* Cleanup aged off entries. */
891 *rthp = rth->dst.rt_next;
892 rt_free(rth);
893 }
894 spin_unlock_bh(rt_hash_lock_addr(i));
895 sum += length;
896 sum2 += length*length;
897 }
898 if (samples) {
899 unsigned long avg = sum / samples;
900 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
901 rt_chain_length_max = max_t(unsigned long,
902 ip_rt_gc_elasticity,
903 (avg + 4*sd) >> FRACT_BITS);
904 }
905 rover = i;
906 }
907
908 /*
909  * rt_worker_func() is run in process context;
910  * we call rt_check_expire() to scan part of the hash table.
911 */
912 static void rt_worker_func(struct work_struct *work)
913 {
914 rt_check_expire();
915 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
916 }
917
918 /*
919  * Perturbation of rt_genid by a small quantity [1..256].
920  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
921  * many times (2^24) without reusing a recent rt_genid.
922  * Jenkins hash is strong enough that little changes of rt_genid are OK.
923 */
924 static void rt_cache_invalidate(struct net *net)
925 {
926 unsigned char shuffle;
927
928 get_random_bytes(&shuffle, sizeof(shuffle));
929 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
930 }
931
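
/*
 * Editor's illustrative sketch, not part of the original file: once
 * rt_cache_invalidate() has bumped net->ipv4.rt_genid, every cached entry
 * still carrying the old generation fails the comparison below (the same
 * test rt_is_expired() performs), so lookups skip it and rt_check_expire()
 * or rt_do_flush() reap it lazily.
 */
static inline bool example_entry_is_stale(const struct rtable *rth,
					  struct net *net)
{
	return rth->rt_genid != rt_genid(net);
}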
932 /*
933 * delay < 0 : invalidate cache (fast : entries will be deleted later)
934 * delay >= 0 : invalidate & flush cache (can be long)
935 */
936 void rt_cache_flush(struct net *net, int delay)
937 {
938 rt_cache_invalidate(net);
939 if (delay >= 0)
940 rt_do_flush(net, !in_softirq());
941 }
942
943 /* Flush previously invalidated entries from the cache */
944 void rt_cache_flush_batch(struct net *net)
945 {
946 rt_do_flush(net, !in_softirq());
947 }
948
949 static void rt_emergency_hash_rebuild(struct net *net)
950 {
951 if (net_ratelimit())
952 printk(KERN_WARNING "Route hash chain too long!\n");
953 rt_cache_invalidate(net);
954 }
955
956 /*
957 Short description of GC goals.
958
959    We want to build an algorithm which keeps the routing cache
960    at some equilibrium point, where the number of aged-off entries
961    is kept approximately equal to the number of newly generated ones.
962 
963    The current expiration strength is the variable "expire".
964    We try to adjust it dynamically, so that when networking
965    is idle, expire is large enough to keep plenty of warm entries,
966    and when load increases, it shrinks to limit the cache size.
967 */
968
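/*
 * Editor's worked example, not part of the original file: assuming a hash
 * table of 2^17 buckets (rt_hash_log == 17) and the default elasticity of
 * 8, the "comfortable" cache size is 8 << 17 == 1048576 entries.  If the
 * slow counter reports 1200000 entries, then
 *	goal = 1200000 - (8 << 17) = 151424
 * and rt_garbage_collect() below keeps halving "expire" until roughly that
 * many entries have been aged out, or the time/softirq limits stop it.
 */
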
969 static int rt_garbage_collect(struct dst_ops *ops)
970 {
971 static unsigned long expire = RT_GC_TIMEOUT;
972 static unsigned long last_gc;
973 static int rover;
974 static int equilibrium;
975 struct rtable *rth;
976 struct rtable __rcu **rthp;
977 unsigned long now = jiffies;
978 int goal;
979 int entries = dst_entries_get_fast(&ipv4_dst_ops);
980
981 /*
982 * Garbage collection is pretty expensive,
983 * do not make it too frequently.
984 */
985
986 RT_CACHE_STAT_INC(gc_total);
987
988 if (now - last_gc < ip_rt_gc_min_interval &&
989 entries < ip_rt_max_size) {
990 RT_CACHE_STAT_INC(gc_ignored);
991 goto out;
992 }
993
994 entries = dst_entries_get_slow(&ipv4_dst_ops);
995 	/* Calculate the number of entries which we want to expire now. */
996 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
997 if (goal <= 0) {
998 if (equilibrium < ipv4_dst_ops.gc_thresh)
999 equilibrium = ipv4_dst_ops.gc_thresh;
1000 goal = entries - equilibrium;
1001 if (goal > 0) {
1002 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1003 goal = entries - equilibrium;
1004 }
1005 } else {
1006 		/* We are in a dangerous area. Try to reduce the cache really
1007 * aggressively.
1008 */
1009 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1010 equilibrium = entries - goal;
1011 }
1012
1013 if (now - last_gc >= ip_rt_gc_min_interval)
1014 last_gc = now;
1015
1016 if (goal <= 0) {
1017 equilibrium += goal;
1018 goto work_done;
1019 }
1020
1021 do {
1022 int i, k;
1023
1024 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1025 unsigned long tmo = expire;
1026
1027 k = (k + 1) & rt_hash_mask;
1028 rthp = &rt_hash_table[k].chain;
1029 spin_lock_bh(rt_hash_lock_addr(k));
1030 while ((rth = rcu_dereference_protected(*rthp,
1031 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1032 if (!rt_is_expired(rth) &&
1033 !rt_may_expire(rth, tmo, expire)) {
1034 tmo >>= 1;
1035 rthp = &rth->dst.rt_next;
1036 continue;
1037 }
1038 *rthp = rth->dst.rt_next;
1039 rt_free(rth);
1040 goal--;
1041 }
1042 spin_unlock_bh(rt_hash_lock_addr(k));
1043 if (goal <= 0)
1044 break;
1045 }
1046 rover = k;
1047
1048 if (goal <= 0)
1049 goto work_done;
1050
1051 		/* Goal is not achieved. We stop the process if:
1052 
1053 		   - expire has been reduced to zero; otherwise, expire is halved.
1054 		   - the table is not full.
1055 		   - we are called from interrupt.
1056 		   - the jiffies check is just a fallback/debug loop breaker.
1057 		     We will not spin here for a long time in any case.
1058 */
1059
1060 RT_CACHE_STAT_INC(gc_goal_miss);
1061
1062 if (expire == 0)
1063 break;
1064
1065 expire >>= 1;
1066 #if RT_CACHE_DEBUG >= 2
1067 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1068 dst_entries_get_fast(&ipv4_dst_ops), goal, i);
1069 #endif
1070
1071 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1072 goto out;
1073 } while (!in_softirq() && time_before_eq(jiffies, now));
1074
1075 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1076 goto out;
1077 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1078 goto out;
1079 if (net_ratelimit())
1080 printk(KERN_WARNING "dst cache overflow\n");
1081 RT_CACHE_STAT_INC(gc_dst_overflow);
1082 return 1;
1083
1084 work_done:
1085 expire += ip_rt_gc_min_interval;
1086 if (expire > ip_rt_gc_timeout ||
1087 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1088 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1089 expire = ip_rt_gc_timeout;
1090 #if RT_CACHE_DEBUG >= 2
1091 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1092 dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
1093 #endif
1094 out: return 0;
1095 }
1096
1097 /*
1098  * Returns the number of entries in a hash chain that have different hash_inputs
1099 */
1100 static int slow_chain_length(const struct rtable *head)
1101 {
1102 int length = 0;
1103 const struct rtable *rth = head;
1104
1105 while (rth) {
1106 length += has_noalias(head, rth);
1107 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1108 }
1109 return length >> FRACT_BITS;
1110 }
1111
1112 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1113 struct rtable **rp, struct sk_buff *skb, int ifindex)
1114 {
1115 struct rtable *rth, *cand;
1116 struct rtable __rcu **rthp, **candp;
1117 unsigned long now;
1118 u32 min_score;
1119 int chain_length;
1120 int attempts = !in_softirq();
1121
1122 restart:
1123 chain_length = 0;
1124 min_score = ~(u32)0;
1125 cand = NULL;
1126 candp = NULL;
1127 now = jiffies;
1128
1129 if (!rt_caching(dev_net(rt->dst.dev))) {
1130 /*
1131 * If we're not caching, just tell the caller we
1132 * were successful and don't touch the route. The
1133 		 * caller holds the sole reference to the cache entry, and
1134 * it will be released when the caller is done with it.
1135 * If we drop it here, the callers have no way to resolve routes
1136 * when we're not caching. Instead, just point *rp at rt, so
1137 		 * the caller gets a single use out of the route.
1138 * Note that we do rt_free on this new route entry, so that
1139 * once its refcount hits zero, we are still able to reap it
1140 * (Thanks Alexey)
1141 * Note: To avoid expensive rcu stuff for this uncached dst,
1142 * we set DST_NOCACHE so that dst_release() can free dst without
1143 		 * waiting for a grace period.
1144 */
1145
1146 rt->dst.flags |= DST_NOCACHE;
1147 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1148 int err = arp_bind_neighbour(&rt->dst);
1149 if (err) {
1150 if (net_ratelimit())
1151 printk(KERN_WARNING
1152 "Neighbour table failure & not caching routes.\n");
1153 ip_rt_put(rt);
1154 return err;
1155 }
1156 }
1157
1158 goto skip_hashing;
1159 }
1160
1161 rthp = &rt_hash_table[hash].chain;
1162
1163 spin_lock_bh(rt_hash_lock_addr(hash));
1164 while ((rth = rcu_dereference_protected(*rthp,
1165 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1166 if (rt_is_expired(rth)) {
1167 *rthp = rth->dst.rt_next;
1168 rt_free(rth);
1169 continue;
1170 }
1171 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1172 /* Put it first */
1173 *rthp = rth->dst.rt_next;
1174 /*
1175 * Since lookup is lockfree, the deletion
1176 * must be visible to another weakly ordered CPU before
1177 * the insertion at the start of the hash chain.
1178 */
1179 rcu_assign_pointer(rth->dst.rt_next,
1180 rt_hash_table[hash].chain);
1181 /*
1182 * Since lookup is lockfree, the update writes
1183 * must be ordered for consistency on SMP.
1184 */
1185 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1186
1187 dst_use(&rth->dst, now);
1188 spin_unlock_bh(rt_hash_lock_addr(hash));
1189
1190 rt_drop(rt);
1191 if (rp)
1192 *rp = rth;
1193 else
1194 skb_dst_set(skb, &rth->dst);
1195 return 0;
1196 }
1197
1198 if (!atomic_read(&rth->dst.__refcnt)) {
1199 u32 score = rt_score(rth);
1200
1201 if (score <= min_score) {
1202 cand = rth;
1203 candp = rthp;
1204 min_score = score;
1205 }
1206 }
1207
1208 chain_length++;
1209
1210 rthp = &rth->dst.rt_next;
1211 }
1212
1213 if (cand) {
1214 		/* ip_rt_gc_elasticity used to be the average chain length;
1215 		 * when exceeded, gc becomes really aggressive.
1216 *
1217 * The second limit is less certain. At the moment it allows
1218 * only 2 entries per bucket. We will see.
1219 */
1220 if (chain_length > ip_rt_gc_elasticity) {
1221 *candp = cand->dst.rt_next;
1222 rt_free(cand);
1223 }
1224 } else {
1225 if (chain_length > rt_chain_length_max &&
1226 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1227 struct net *net = dev_net(rt->dst.dev);
1228 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1229 if (!rt_caching(net)) {
1230 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1231 rt->dst.dev->name, num);
1232 }
1233 rt_emergency_hash_rebuild(net);
1234 spin_unlock_bh(rt_hash_lock_addr(hash));
1235
1236 hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1237 ifindex, rt_genid(net));
1238 goto restart;
1239 }
1240 }
1241
1242 	/* Try to bind the route to ARP only if it is an output
1243 	   route or on the unicast forwarding path.
1244 */
1245 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1246 int err = arp_bind_neighbour(&rt->dst);
1247 if (err) {
1248 spin_unlock_bh(rt_hash_lock_addr(hash));
1249
1250 if (err != -ENOBUFS) {
1251 rt_drop(rt);
1252 return err;
1253 }
1254
1255 /* Neighbour tables are full and nothing
1256 			   can be released. Try to shrink the route cache;
1257 			   it most likely holds some neighbour records.
1258 */
1259 if (attempts-- > 0) {
1260 int saved_elasticity = ip_rt_gc_elasticity;
1261 int saved_int = ip_rt_gc_min_interval;
1262 ip_rt_gc_elasticity = 1;
1263 ip_rt_gc_min_interval = 0;
1264 rt_garbage_collect(&ipv4_dst_ops);
1265 ip_rt_gc_min_interval = saved_int;
1266 ip_rt_gc_elasticity = saved_elasticity;
1267 goto restart;
1268 }
1269
1270 if (net_ratelimit())
1271 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1272 rt_drop(rt);
1273 return -ENOBUFS;
1274 }
1275 }
1276
1277 rt->dst.rt_next = rt_hash_table[hash].chain;
1278
1279 #if RT_CACHE_DEBUG >= 2
1280 if (rt->dst.rt_next) {
1281 struct rtable *trt;
1282 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1283 hash, &rt->rt_dst);
1284 for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1285 printk(" . %pI4", &trt->rt_dst);
1286 printk("\n");
1287 }
1288 #endif
1289 /*
1290 * Since lookup is lockfree, we must make sure
1291 	 * previous writes to rt are committed to memory
1292 	 * before making rt visible to other CPUs.
1293 */
1294 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1295
1296 spin_unlock_bh(rt_hash_lock_addr(hash));
1297
1298 skip_hashing:
1299 if (rp)
1300 *rp = rt;
1301 else
1302 skb_dst_set(skb, &rt->dst);
1303 return 0;
1304 }
1305
1306 void rt_bind_peer(struct rtable *rt, int create)
1307 {
1308 struct inet_peer *peer;
1309
1310 peer = inet_getpeer_v4(rt->rt_dst, create);
1311
1312 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1313 inet_putpeer(peer);
1314 }
1315
1316 /*
1317 * Peer allocation may fail only in serious out-of-memory conditions. However
1318  * we can still generate some output.
1319  * Random ID selection looks a bit dangerous because we have no chance of
1320  * selecting an ID that is unique over a reasonable period of time.
1321  * But a broken packet identifier may be better than no packet at all.
1322 */
1323 static void ip_select_fb_ident(struct iphdr *iph)
1324 {
1325 static DEFINE_SPINLOCK(ip_fb_id_lock);
1326 static u32 ip_fallback_id;
1327 u32 salt;
1328
1329 spin_lock_bh(&ip_fb_id_lock);
1330 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1331 iph->id = htons(salt & 0xFFFF);
1332 ip_fallback_id = salt;
1333 spin_unlock_bh(&ip_fb_id_lock);
1334 }
1335
1336 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1337 {
1338 struct rtable *rt = (struct rtable *) dst;
1339
1340 if (rt) {
1341 if (rt->peer == NULL)
1342 rt_bind_peer(rt, 1);
1343
1344 /* If peer is attached to destination, it is never detached,
1345 		   so we need not grab a lock to dereference it.
1346 */
1347 if (rt->peer) {
1348 iph->id = htons(inet_getid(rt->peer, more));
1349 return;
1350 }
1351 } else
1352 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1353 __builtin_return_address(0));
1354
1355 ip_select_fb_ident(iph);
1356 }
1357 EXPORT_SYMBOL(__ip_select_ident);
1358
1359 static void rt_del(unsigned hash, struct rtable *rt)
1360 {
1361 struct rtable __rcu **rthp;
1362 struct rtable *aux;
1363
1364 rthp = &rt_hash_table[hash].chain;
1365 spin_lock_bh(rt_hash_lock_addr(hash));
1366 ip_rt_put(rt);
1367 while ((aux = rcu_dereference_protected(*rthp,
1368 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1369 if (aux == rt || rt_is_expired(aux)) {
1370 *rthp = aux->dst.rt_next;
1371 rt_free(aux);
1372 continue;
1373 }
1374 rthp = &aux->dst.rt_next;
1375 }
1376 spin_unlock_bh(rt_hash_lock_addr(hash));
1377 }
1378
1379 /* called in rcu_read_lock() section */
1380 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1381 __be32 saddr, struct net_device *dev)
1382 {
1383 int i, k;
1384 struct in_device *in_dev = __in_dev_get_rcu(dev);
1385 struct rtable *rth;
1386 struct rtable __rcu **rthp;
1387 __be32 skeys[2] = { saddr, 0 };
1388 int ikeys[2] = { dev->ifindex, 0 };
1389 struct netevent_redirect netevent;
1390 struct net *net;
1391
1392 if (!in_dev)
1393 return;
1394
1395 net = dev_net(dev);
1396 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1397 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1398 ipv4_is_zeronet(new_gw))
1399 goto reject_redirect;
1400
1401 if (!rt_caching(net))
1402 goto reject_redirect;
1403
1404 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1405 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1406 goto reject_redirect;
1407 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1408 goto reject_redirect;
1409 } else {
1410 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1411 goto reject_redirect;
1412 }
1413
1414 for (i = 0; i < 2; i++) {
1415 for (k = 0; k < 2; k++) {
1416 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1417 rt_genid(net));
1418
1419 rthp = &rt_hash_table[hash].chain;
1420
1421 while ((rth = rcu_dereference(*rthp)) != NULL) {
1422 struct rtable *rt;
1423
1424 if (rth->fl.fl4_dst != daddr ||
1425 rth->fl.fl4_src != skeys[i] ||
1426 rth->fl.oif != ikeys[k] ||
1427 rt_is_input_route(rth) ||
1428 rt_is_expired(rth) ||
1429 !net_eq(dev_net(rth->dst.dev), net)) {
1430 rthp = &rth->dst.rt_next;
1431 continue;
1432 }
1433
1434 if (rth->rt_dst != daddr ||
1435 rth->rt_src != saddr ||
1436 rth->dst.error ||
1437 rth->rt_gateway != old_gw ||
1438 rth->dst.dev != dev)
1439 break;
1440
1441 dst_hold(&rth->dst);
1442
1443 rt = dst_alloc(&ipv4_dst_ops);
1444 if (rt == NULL) {
1445 ip_rt_put(rth);
1446 return;
1447 }
1448
1449 /* Copy all the information. */
1450 *rt = *rth;
1451 rt->dst.__use = 1;
1452 atomic_set(&rt->dst.__refcnt, 1);
1453 rt->dst.child = NULL;
1454 if (rt->dst.dev)
1455 dev_hold(rt->dst.dev);
1456 rt->dst.obsolete = -1;
1457 rt->dst.lastuse = jiffies;
1458 rt->dst.path = &rt->dst;
1459 rt->dst.neighbour = NULL;
1460 rt->dst.hh = NULL;
1461 #ifdef CONFIG_XFRM
1462 rt->dst.xfrm = NULL;
1463 #endif
1464 rt->rt_genid = rt_genid(net);
1465 rt->rt_flags |= RTCF_REDIRECTED;
1466
1467 /* Gateway is different ... */
1468 rt->rt_gateway = new_gw;
1469
1470 /* Redirect received -> path was valid */
1471 dst_confirm(&rth->dst);
1472
1473 if (rt->peer)
1474 atomic_inc(&rt->peer->refcnt);
1475 if (rt->fi)
1476 atomic_inc(&rt->fi->fib_clntref);
1477
1478 if (arp_bind_neighbour(&rt->dst) ||
1479 !(rt->dst.neighbour->nud_state &
1480 NUD_VALID)) {
1481 if (rt->dst.neighbour)
1482 neigh_event_send(rt->dst.neighbour, NULL);
1483 ip_rt_put(rth);
1484 rt_drop(rt);
1485 goto do_next;
1486 }
1487
1488 netevent.old = &rth->dst;
1489 netevent.new = &rt->dst;
1490 call_netevent_notifiers(NETEVENT_REDIRECT,
1491 &netevent);
1492
1493 rt_del(hash, rth);
1494 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1495 ip_rt_put(rt);
1496 goto do_next;
1497 }
1498 do_next:
1499 ;
1500 }
1501 }
1502 return;
1503
1504 reject_redirect:
1505 #ifdef CONFIG_IP_ROUTE_VERBOSE
1506 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1507 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1508 " Advised path = %pI4 -> %pI4\n",
1509 &old_gw, dev->name, &new_gw,
1510 &saddr, &daddr);
1511 #endif
1512 ;
1513 }
1514
1515 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1516 {
1517 struct rtable *rt = (struct rtable *)dst;
1518 struct dst_entry *ret = dst;
1519
1520 if (rt) {
1521 if (dst->obsolete > 0) {
1522 ip_rt_put(rt);
1523 ret = NULL;
1524 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1525 (rt->dst.expires &&
1526 time_after_eq(jiffies, rt->dst.expires))) {
1527 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1528 rt->fl.oif,
1529 rt_genid(dev_net(dst->dev)));
1530 #if RT_CACHE_DEBUG >= 1
1531 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1532 &rt->rt_dst, rt->fl.fl4_tos);
1533 #endif
1534 rt_del(hash, rt);
1535 ret = NULL;
1536 }
1537 }
1538 return ret;
1539 }
1540
1541 /*
1542 * Algorithm:
1543 * 1. The first ip_rt_redirect_number redirects are sent
1544 * with exponential backoff, then we stop sending them at all,
1545 * assuming that the host ignores our redirects.
1546 * 2. If we did not see packets requiring redirects
1547 * during ip_rt_redirect_silence, we assume that the host
1548  *	forgot the redirected route and start sending redirects again.
1549 *
1550 * This algorithm is much cheaper and more intelligent than dumb load limiting
1551 * in icmp.c.
1552 *
1553 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1554 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1555 */
1556
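/*
 * Editor's worked example, not part of the original file: with the default
 * ip_rt_redirect_load of HZ/50 and HZ == 1000, after the k-th redirect has
 * been sent (k == dst.rate_tokens) the next one is only sent once
 *	jiffies > rate_last + (HZ/50 << k)
 * i.e. it is delayed by 40 ms, 80 ms, ... up to ~5.1 s, and once 9
 * (ip_rt_redirect_number) have been sent we stay silent until
 * ip_rt_redirect_silence ((HZ/50) << 10, about 20.5 s) of quiet resets
 * rate_tokens in ip_rt_send_redirect() below.
 */
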
1557 void ip_rt_send_redirect(struct sk_buff *skb)
1558 {
1559 struct rtable *rt = skb_rtable(skb);
1560 struct in_device *in_dev;
1561 int log_martians;
1562
1563 rcu_read_lock();
1564 in_dev = __in_dev_get_rcu(rt->dst.dev);
1565 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1566 rcu_read_unlock();
1567 return;
1568 }
1569 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1570 rcu_read_unlock();
1571
1572 /* No redirected packets during ip_rt_redirect_silence;
1573 * reset the algorithm.
1574 */
1575 if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
1576 rt->dst.rate_tokens = 0;
1577
1578 	/* Too many ignored redirects; do not send anything and
1579 	 * set dst.rate_last to the last seen redirected packet.
1580 */
1581 if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
1582 rt->dst.rate_last = jiffies;
1583 return;
1584 }
1585
1586 /* Check for load limit; set rate_last to the latest sent
1587 * redirect.
1588 */
1589 if (rt->dst.rate_tokens == 0 ||
1590 time_after(jiffies,
1591 (rt->dst.rate_last +
1592 (ip_rt_redirect_load << rt->dst.rate_tokens)))) {
1593 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1594 rt->dst.rate_last = jiffies;
1595 ++rt->dst.rate_tokens;
1596 #ifdef CONFIG_IP_ROUTE_VERBOSE
1597 if (log_martians &&
1598 rt->dst.rate_tokens == ip_rt_redirect_number &&
1599 net_ratelimit())
1600 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1601 &rt->rt_src, rt->rt_iif,
1602 &rt->rt_dst, &rt->rt_gateway);
1603 #endif
1604 }
1605 }
1606
1607 static int ip_error(struct sk_buff *skb)
1608 {
1609 struct rtable *rt = skb_rtable(skb);
1610 unsigned long now;
1611 int code;
1612
1613 switch (rt->dst.error) {
1614 case EINVAL:
1615 default:
1616 goto out;
1617 case EHOSTUNREACH:
1618 code = ICMP_HOST_UNREACH;
1619 break;
1620 case ENETUNREACH:
1621 code = ICMP_NET_UNREACH;
1622 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1623 IPSTATS_MIB_INNOROUTES);
1624 break;
1625 case EACCES:
1626 code = ICMP_PKT_FILTERED;
1627 break;
1628 }
1629
1630 now = jiffies;
1631 rt->dst.rate_tokens += now - rt->dst.rate_last;
1632 if (rt->dst.rate_tokens > ip_rt_error_burst)
1633 rt->dst.rate_tokens = ip_rt_error_burst;
1634 rt->dst.rate_last = now;
1635 if (rt->dst.rate_tokens >= ip_rt_error_cost) {
1636 rt->dst.rate_tokens -= ip_rt_error_cost;
1637 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1638 }
1639
1640 out: kfree_skb(skb);
1641 return 0;
1642 }
1643
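/*
 * Editor's worked example, not part of the original file: ip_error() above
 * is a token bucket counted in jiffies.  Tokens accrue one per elapsed
 * jiffy, are capped at ip_rt_error_burst (5*HZ), and each ICMP_DEST_UNREACH
 * costs ip_rt_error_cost (HZ), so a destination may trigger a burst of up
 * to five errors and is then limited to roughly one per second.
 */
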
1644 /*
1645 * The last two values are not from the RFC but
1646 * are needed for AMPRnet AX.25 paths.
1647 */
1648
1649 static const unsigned short mtu_plateau[] =
1650 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1651
1652 static inline unsigned short guess_mtu(unsigned short old_mtu)
1653 {
1654 int i;
1655
1656 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1657 if (old_mtu > mtu_plateau[i])
1658 return mtu_plateau[i];
1659 return 68;
1660 }
1661
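/*
 * Editor's worked example, not part of the original file: when a Frag
 * Needed ICMP arrives with a next-hop MTU of zero (pre-RFC1191 routers),
 * guess_mtu() steps down the plateau table above, e.g.
 *	guess_mtu(1500) == 1492,  guess_mtu(600) == 576,
 * bottoming out at the 68-byte IPv4 minimum.
 */
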
1662 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1663 unsigned short new_mtu,
1664 struct net_device *dev)
1665 {
1666 int i, k;
1667 unsigned short old_mtu = ntohs(iph->tot_len);
1668 struct rtable *rth;
1669 int ikeys[2] = { dev->ifindex, 0 };
1670 __be32 skeys[2] = { iph->saddr, 0, };
1671 __be32 daddr = iph->daddr;
1672 unsigned short est_mtu = 0;
1673
1674 for (k = 0; k < 2; k++) {
1675 for (i = 0; i < 2; i++) {
1676 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1677 rt_genid(net));
1678
1679 rcu_read_lock();
1680 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1681 rth = rcu_dereference(rth->dst.rt_next)) {
1682 unsigned short mtu = new_mtu;
1683
1684 if (rth->fl.fl4_dst != daddr ||
1685 rth->fl.fl4_src != skeys[i] ||
1686 rth->rt_dst != daddr ||
1687 rth->rt_src != iph->saddr ||
1688 rth->fl.oif != ikeys[k] ||
1689 rt_is_input_route(rth) ||
1690 dst_metric_locked(&rth->dst, RTAX_MTU) ||
1691 !net_eq(dev_net(rth->dst.dev), net) ||
1692 rt_is_expired(rth))
1693 continue;
1694
1695 if (new_mtu < 68 || new_mtu >= old_mtu) {
1696
1697 /* BSD 4.2 compatibility hack :-( */
1698 if (mtu == 0 &&
1699 old_mtu >= dst_mtu(&rth->dst) &&
1700 old_mtu >= 68 + (iph->ihl << 2))
1701 old_mtu -= iph->ihl << 2;
1702
1703 mtu = guess_mtu(old_mtu);
1704 }
1705 if (mtu <= dst_mtu(&rth->dst)) {
1706 if (mtu < dst_mtu(&rth->dst)) {
1707 dst_confirm(&rth->dst);
1708 if (mtu < ip_rt_min_pmtu) {
1709 u32 lock = dst_metric(&rth->dst,
1710 RTAX_LOCK);
1711 mtu = ip_rt_min_pmtu;
1712 lock |= (1 << RTAX_MTU);
1713 dst_metric_set(&rth->dst, RTAX_LOCK,
1714 lock);
1715 }
1716 dst_metric_set(&rth->dst, RTAX_MTU, mtu);
1717 dst_set_expires(&rth->dst,
1718 ip_rt_mtu_expires);
1719 }
1720 est_mtu = mtu;
1721 }
1722 }
1723 rcu_read_unlock();
1724 }
1725 }
1726 return est_mtu ? : new_mtu;
1727 }
1728
1729 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1730 {
1731 if (dst_mtu(dst) > mtu && mtu >= 68 &&
1732 !(dst_metric_locked(dst, RTAX_MTU))) {
1733 if (mtu < ip_rt_min_pmtu) {
1734 u32 lock = dst_metric(dst, RTAX_LOCK);
1735 mtu = ip_rt_min_pmtu;
1736 dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
1737 }
1738 dst_metric_set(dst, RTAX_MTU, mtu);
1739 dst_set_expires(dst, ip_rt_mtu_expires);
1740 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1741 }
1742 }
1743
1744 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1745 {
1746 if (rt_is_expired((struct rtable *)dst))
1747 return NULL;
1748 return dst;
1749 }
1750
1751 static void ipv4_dst_destroy(struct dst_entry *dst)
1752 {
1753 struct rtable *rt = (struct rtable *) dst;
1754 struct inet_peer *peer = rt->peer;
1755
1756 dst_destroy_metrics_generic(dst);
1757 if (rt->fi) {
1758 fib_info_put(rt->fi);
1759 rt->fi = NULL;
1760 }
1761 if (peer) {
1762 rt->peer = NULL;
1763 inet_putpeer(peer);
1764 }
1765 }
1766
1767
1768 static void ipv4_link_failure(struct sk_buff *skb)
1769 {
1770 struct rtable *rt;
1771
1772 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1773
1774 rt = skb_rtable(skb);
1775 if (rt)
1776 dst_set_expires(&rt->dst, 0);
1777 }
1778
1779 static int ip_rt_bug(struct sk_buff *skb)
1780 {
1781 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1782 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1783 skb->dev ? skb->dev->name : "?");
1784 kfree_skb(skb);
1785 return 0;
1786 }
1787
1788 /*
1789 	We do not cache the source address of the outgoing interface,
1790 	because it is used only by the IP RR, TS and SRR options,
1791 	so it is out of the fast path.
1792 
1793 	BTW remember: "addr" is allowed to be unaligned
1794 in IP options!
1795 */
1796
1797 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1798 {
1799 __be32 src;
1800 struct fib_result res;
1801
1802 if (rt_is_output_route(rt))
1803 src = rt->rt_src;
1804 else {
1805 rcu_read_lock();
1806 if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
1807 src = FIB_RES_PREFSRC(res);
1808 else
1809 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1810 RT_SCOPE_UNIVERSE);
1811 rcu_read_unlock();
1812 }
1813 memcpy(addr, &src, 4);
1814 }
1815
1816 #ifdef CONFIG_IP_ROUTE_CLASSID
1817 static void set_class_tag(struct rtable *rt, u32 tag)
1818 {
1819 if (!(rt->dst.tclassid & 0xFFFF))
1820 rt->dst.tclassid |= tag & 0xFFFF;
1821 if (!(rt->dst.tclassid & 0xFFFF0000))
1822 rt->dst.tclassid |= tag & 0xFFFF0000;
1823 }
1824 #endif
1825
1826 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1827 {
1828 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1829
1830 if (advmss == 0) {
1831 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1832 ip_rt_min_advmss);
1833 if (advmss > 65535 - 40)
1834 advmss = 65535 - 40;
1835 }
1836 return advmss;
1837 }
1838
1839 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1840 {
1841 unsigned int mtu = dst->dev->mtu;
1842
1843 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1844 const struct rtable *rt = (const struct rtable *) dst;
1845
1846 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1847 mtu = 576;
1848 }
1849
1850 if (mtu > IP_MAX_MTU)
1851 mtu = IP_MAX_MTU;
1852
1853 return mtu;
1854 }
1855
1856 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1857 {
1858 struct dst_entry *dst = &rt->dst;
1859 struct fib_info *fi = res->fi;
1860
1861 if (fi) {
1862 if (FIB_RES_GW(*res) &&
1863 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1864 rt->rt_gateway = FIB_RES_GW(*res);
1865 rt->fi = fi;
1866 atomic_inc(&fi->fib_clntref);
1867 dst_init_metrics(dst, fi->fib_metrics, true);
1868 #ifdef CONFIG_IP_ROUTE_CLASSID
1869 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1870 #endif
1871 }
1872
1873 if (dst_mtu(dst) > IP_MAX_MTU)
1874 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1875 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1876 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1877
1878 #ifdef CONFIG_IP_ROUTE_CLASSID
1879 #ifdef CONFIG_IP_MULTIPLE_TABLES
1880 set_class_tag(rt, fib_rules_tclass(res));
1881 #endif
1882 set_class_tag(rt, itag);
1883 #endif
1884 rt->rt_type = res->type;
1885 }
1886
1887 /* called in rcu_read_lock() section */
1888 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1889 u8 tos, struct net_device *dev, int our)
1890 {
1891 unsigned int hash;
1892 struct rtable *rth;
1893 __be32 spec_dst;
1894 struct in_device *in_dev = __in_dev_get_rcu(dev);
1895 u32 itag = 0;
1896 int err;
1897
1898 /* Primary sanity checks. */
1899
1900 if (in_dev == NULL)
1901 return -EINVAL;
1902
1903 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1904 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1905 goto e_inval;
1906
1907 if (ipv4_is_zeronet(saddr)) {
1908 if (!ipv4_is_local_multicast(daddr))
1909 goto e_inval;
1910 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1911 } else {
1912 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1913 &itag, 0);
1914 if (err < 0)
1915 goto e_err;
1916 }
1917 rth = dst_alloc(&ipv4_dst_ops);
1918 if (!rth)
1919 goto e_nobufs;
1920
1921 rth->dst.output = ip_rt_bug;
1922 rth->dst.obsolete = -1;
1923
1924 atomic_set(&rth->dst.__refcnt, 1);
1925 rth->dst.flags= DST_HOST;
1926 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1927 rth->dst.flags |= DST_NOPOLICY;
1928 rth->fl.fl4_dst = daddr;
1929 rth->rt_dst = daddr;
1930 rth->fl.fl4_tos = tos;
1931 rth->fl.mark = skb->mark;
1932 rth->fl.fl4_src = saddr;
1933 rth->rt_src = saddr;
1934 #ifdef CONFIG_IP_ROUTE_CLASSID
1935 rth->dst.tclassid = itag;
1936 #endif
1937 rth->rt_iif =
1938 rth->fl.iif = dev->ifindex;
1939 rth->dst.dev = init_net.loopback_dev;
1940 dev_hold(rth->dst.dev);
1941 rth->fl.oif = 0;
1942 rth->rt_gateway = daddr;
1943 rth->rt_spec_dst= spec_dst;
1944 rth->rt_genid = rt_genid(dev_net(dev));
1945 rth->rt_flags = RTCF_MULTICAST;
1946 rth->rt_type = RTN_MULTICAST;
1947 if (our) {
1948 rth->dst.input= ip_local_deliver;
1949 rth->rt_flags |= RTCF_LOCAL;
1950 }
1951
1952 #ifdef CONFIG_IP_MROUTE
1953 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1954 rth->dst.input = ip_mr_input;
1955 #endif
1956 RT_CACHE_STAT_INC(in_slow_mc);
1957
1958 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1959 return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1960
1961 e_nobufs:
1962 return -ENOBUFS;
1963 e_inval:
1964 return -EINVAL;
1965 e_err:
1966 return err;
1967 }
1968
1969
1970 static void ip_handle_martian_source(struct net_device *dev,
1971 struct in_device *in_dev,
1972 struct sk_buff *skb,
1973 __be32 daddr,
1974 __be32 saddr)
1975 {
1976 RT_CACHE_STAT_INC(in_martian_src);
1977 #ifdef CONFIG_IP_ROUTE_VERBOSE
1978 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1979 /*
1980 		 *	RFC 1812 recommendation: if the source is martian,
1981 		 *	the only hint is the MAC header.
1982 */
1983 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1984 &daddr, &saddr, dev->name);
1985 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1986 int i;
1987 const unsigned char *p = skb_mac_header(skb);
1988 printk(KERN_WARNING "ll header: ");
1989 for (i = 0; i < dev->hard_header_len; i++, p++) {
1990 printk("%02x", *p);
1991 if (i < (dev->hard_header_len - 1))
1992 printk(":");
1993 }
1994 printk("\n");
1995 }
1996 }
1997 #endif
1998 }
1999
2000 /* called in rcu_read_lock() section */
2001 static int __mkroute_input(struct sk_buff *skb,
2002 struct fib_result *res,
2003 struct in_device *in_dev,
2004 __be32 daddr, __be32 saddr, u32 tos,
2005 struct rtable **result)
2006 {
2007 struct rtable *rth;
2008 int err;
2009 struct in_device *out_dev;
2010 unsigned int flags = 0;
2011 __be32 spec_dst;
2012 u32 itag;
2013
2014 /* get a working reference to the output device */
2015 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2016 if (out_dev == NULL) {
2017 if (net_ratelimit())
2018 printk(KERN_CRIT "Bug in ip_route_input_slow(). "
2019 "Please report.\n");
2020 return -EINVAL;
2021 }
2022
2023
2024 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
2025 in_dev->dev, &spec_dst, &itag, skb->mark);
2026 if (err < 0) {
2027 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2028 saddr);
2029
2030 goto cleanup;
2031 }
2032
2033 if (err)
2034 flags |= RTCF_DIRECTSRC;
2035
2036 if (out_dev == in_dev && err &&
2037 (IN_DEV_SHARED_MEDIA(out_dev) ||
2038 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2039 flags |= RTCF_DOREDIRECT;
2040
2041 if (skb->protocol != htons(ETH_P_IP)) {
2042 /* Not IP (i.e. ARP). Do not create a route if it is
2043 * invalid for proxy ARP. DNAT routes are always valid.
2044 *
2045 * The proxy ARP feature has been extended to allow ARP
2046 * replies back on the same interface, to support
2047 * Private VLAN switch technologies. See arp.c.
2048 */
2049 if (out_dev == in_dev &&
2050 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2051 err = -EINVAL;
2052 goto cleanup;
2053 }
2054 }
2055
2056
2057 rth = dst_alloc(&ipv4_dst_ops);
2058 if (!rth) {
2059 err = -ENOBUFS;
2060 goto cleanup;
2061 }
2062
2063 atomic_set(&rth->dst.__refcnt, 1);
2064 rth->dst.flags= DST_HOST;
2065 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2066 rth->dst.flags |= DST_NOPOLICY;
2067 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2068 rth->dst.flags |= DST_NOXFRM;
2069 rth->fl.fl4_dst = daddr;
2070 rth->rt_dst = daddr;
2071 rth->fl.fl4_tos = tos;
2072 rth->fl.mark = skb->mark;
2073 rth->fl.fl4_src = saddr;
2074 rth->rt_src = saddr;
2075 rth->rt_gateway = daddr;
2076 rth->rt_iif =
2077 rth->fl.iif = in_dev->dev->ifindex;
2078 rth->dst.dev = (out_dev)->dev;
2079 dev_hold(rth->dst.dev);
2080 rth->fl.oif = 0;
2081 rth->rt_spec_dst= spec_dst;
2082
2083 rth->dst.obsolete = -1;
2084 rth->dst.input = ip_forward;
2085 rth->dst.output = ip_output;
2086 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2087
2088 rt_set_nexthop(rth, res, itag);
2089
2090 rth->rt_flags = flags;
2091
2092 *result = rth;
2093 err = 0;
2094 cleanup:
2095 return err;
2096 }
2097
2098 static int ip_mkroute_input(struct sk_buff *skb,
2099 struct fib_result *res,
2100 const struct flowi *fl,
2101 struct in_device *in_dev,
2102 __be32 daddr, __be32 saddr, u32 tos)
2103 {
2104 struct rtable* rth = NULL;
2105 int err;
2106 unsigned hash;
2107
2108 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2109 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2110 fib_select_multipath(fl, res);
2111 #endif
2112
2113 /* create a routing cache entry */
2114 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2115 if (err)
2116 return err;
2117
2118 /* put it into the cache */
2119 hash = rt_hash(daddr, saddr, fl->iif,
2120 rt_genid(dev_net(rth->dst.dev)));
2121 return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2122 }
2123
2124 /*
2125 * NOTE. We drop all packets that have a local source
2126 * address, because every properly looped-back packet
2127 * must already have the correct destination attached by the output routine.
2128 *
2129 * This approach solves two big problems:
2130 * 1. Non-simplex devices are handled properly.
2131 * 2. IP spoofing attempts are filtered out with a 100% guarantee.
2132 * Called with rcu_read_lock().
2133 */
2134
2135 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2136 u8 tos, struct net_device *dev)
2137 {
2138 struct fib_result res;
2139 struct in_device *in_dev = __in_dev_get_rcu(dev);
2140 struct flowi fl = { .fl4_dst = daddr,
2141 .fl4_src = saddr,
2142 .fl4_tos = tos,
2143 .fl4_scope = RT_SCOPE_UNIVERSE,
2144 .mark = skb->mark,
2145 .iif = dev->ifindex };
2146 unsigned flags = 0;
2147 u32 itag = 0;
2148 struct rtable * rth;
2149 unsigned hash;
2150 __be32 spec_dst;
2151 int err = -EINVAL;
2152 struct net * net = dev_net(dev);
2153
2154 /* IP on this device is disabled. */
2155
2156 if (!in_dev)
2157 goto out;
2158
2159 /* Check for the weirdest martians, which cannot be detected
2160 by fib_lookup.
2161 */
2162
2163 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2164 ipv4_is_loopback(saddr))
2165 goto martian_source;
2166
2167 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2168 goto brd_input;
2169
2170 /* Accept zero addresses only for limited broadcast;
2171 * I do not even know whether to fix this or not. Waiting for complaints :-)
2172 */
2173 if (ipv4_is_zeronet(saddr))
2174 goto martian_source;
2175
2176 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2177 goto martian_destination;
2178
2179 /*
2180 * Now we are ready to route packet.
2181 */
2182 err = fib_lookup(net, &fl, &res);
2183 if (err != 0) {
2184 if (!IN_DEV_FORWARD(in_dev))
2185 goto e_hostunreach;
2186 goto no_route;
2187 }
2188
2189 RT_CACHE_STAT_INC(in_slow_tot);
2190
2191 if (res.type == RTN_BROADCAST)
2192 goto brd_input;
2193
2194 if (res.type == RTN_LOCAL) {
2195 err = fib_validate_source(saddr, daddr, tos,
2196 net->loopback_dev->ifindex,
2197 dev, &spec_dst, &itag, skb->mark);
2198 if (err < 0)
2199 goto martian_source_keep_err;
2200 if (err)
2201 flags |= RTCF_DIRECTSRC;
2202 spec_dst = daddr;
2203 goto local_input;
2204 }
2205
2206 if (!IN_DEV_FORWARD(in_dev))
2207 goto e_hostunreach;
2208 if (res.type != RTN_UNICAST)
2209 goto martian_destination;
2210
2211 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2212 out: return err;
2213
2214 brd_input:
2215 if (skb->protocol != htons(ETH_P_IP))
2216 goto e_inval;
2217
2218 if (ipv4_is_zeronet(saddr))
2219 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2220 else {
2221 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2222 &itag, skb->mark);
2223 if (err < 0)
2224 goto martian_source_keep_err;
2225 if (err)
2226 flags |= RTCF_DIRECTSRC;
2227 }
2228 flags |= RTCF_BROADCAST;
2229 res.type = RTN_BROADCAST;
2230 RT_CACHE_STAT_INC(in_brd);
2231
2232 local_input:
2233 rth = dst_alloc(&ipv4_dst_ops);
2234 if (!rth)
2235 goto e_nobufs;
2236
2237 rth->dst.output= ip_rt_bug;
2238 rth->dst.obsolete = -1;
2239 rth->rt_genid = rt_genid(net);
2240
2241 atomic_set(&rth->dst.__refcnt, 1);
2242 rth->dst.flags= DST_HOST;
2243 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2244 rth->dst.flags |= DST_NOPOLICY;
2245 rth->fl.fl4_dst = daddr;
2246 rth->rt_dst = daddr;
2247 rth->fl.fl4_tos = tos;
2248 rth->fl.mark = skb->mark;
2249 rth->fl.fl4_src = saddr;
2250 rth->rt_src = saddr;
2251 #ifdef CONFIG_IP_ROUTE_CLASSID
2252 rth->dst.tclassid = itag;
2253 #endif
2254 rth->rt_iif =
2255 rth->fl.iif = dev->ifindex;
2256 rth->dst.dev = net->loopback_dev;
2257 dev_hold(rth->dst.dev);
2258 rth->rt_gateway = daddr;
2259 rth->rt_spec_dst= spec_dst;
2260 rth->dst.input= ip_local_deliver;
2261 rth->rt_flags = flags|RTCF_LOCAL;
2262 if (res.type == RTN_UNREACHABLE) {
2263 rth->dst.input= ip_error;
2264 rth->dst.error= -err;
2265 rth->rt_flags &= ~RTCF_LOCAL;
2266 }
2267 rth->rt_type = res.type;
2268 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2269 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2270 goto out;
2271
2272 no_route:
2273 RT_CACHE_STAT_INC(in_no_route);
2274 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2275 res.type = RTN_UNREACHABLE;
2276 if (err == -ESRCH)
2277 err = -ENETUNREACH;
2278 goto local_input;
2279
2280 /*
2281 * Do not cache martian addresses: they should be logged (RFC1812)
2282 */
2283 martian_destination:
2284 RT_CACHE_STAT_INC(in_martian_dst);
2285 #ifdef CONFIG_IP_ROUTE_VERBOSE
2286 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2287 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2288 &daddr, &saddr, dev->name);
2289 #endif
2290
2291 e_hostunreach:
2292 err = -EHOSTUNREACH;
2293 goto out;
2294
2295 e_inval:
2296 err = -EINVAL;
2297 goto out;
2298
2299 e_nobufs:
2300 err = -ENOBUFS;
2301 goto out;
2302
2303 martian_source:
2304 err = -EINVAL;
2305 martian_source_keep_err:
2306 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2307 goto out;
2308 }
2309
2310 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2311 u8 tos, struct net_device *dev, bool noref)
2312 {
2313 struct rtable * rth;
2314 unsigned hash;
2315 int iif = dev->ifindex;
2316 struct net *net;
2317 int res;
2318
2319 net = dev_net(dev);
2320
2321 rcu_read_lock();
2322
2323 if (!rt_caching(net))
2324 goto skip_cache;
2325
2326 tos &= IPTOS_RT_MASK;
2327 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2328
2329 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2330 rth = rcu_dereference(rth->dst.rt_next)) {
2331 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2332 ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2333 (rth->fl.iif ^ iif) |
2334 rth->fl.oif |
2335 (rth->fl.fl4_tos ^ tos)) == 0 &&
2336 rth->fl.mark == skb->mark &&
2337 net_eq(dev_net(rth->dst.dev), net) &&
2338 !rt_is_expired(rth)) {
2339 if (noref) {
2340 dst_use_noref(&rth->dst, jiffies);
2341 skb_dst_set_noref(skb, &rth->dst);
2342 } else {
2343 dst_use(&rth->dst, jiffies);
2344 skb_dst_set(skb, &rth->dst);
2345 }
2346 RT_CACHE_STAT_INC(in_hit);
2347 rcu_read_unlock();
2348 return 0;
2349 }
2350 RT_CACHE_STAT_INC(in_hlist_search);
2351 }
2352
2353 skip_cache:
2354 /* Multicast recognition logic is moved from the route cache to here.
2355 The problem was that too many Ethernet cards have broken/missing
2356 hardware multicast filters :-( As a result, a host on a multicast
2357 network acquires a lot of useless route cache entries, e.g. from
2358 SDR messages from all over the world. Now we try to get rid of them.
2359 Really, provided the software IP multicast filter is organized
2360 reasonably (at least, hashed), it does not cause a slowdown
2361 compared with route cache reject entries.
2362 Note that multicast routers are not affected, because a
2363 route cache entry is created eventually.
2364 */
2365 if (ipv4_is_multicast(daddr)) {
2366 struct in_device *in_dev = __in_dev_get_rcu(dev);
2367
2368 if (in_dev) {
2369 int our = ip_check_mc(in_dev, daddr, saddr,
2370 ip_hdr(skb)->protocol);
2371 if (our
2372 #ifdef CONFIG_IP_MROUTE
2373 ||
2374 (!ipv4_is_local_multicast(daddr) &&
2375 IN_DEV_MFORWARD(in_dev))
2376 #endif
2377 ) {
2378 int res = ip_route_input_mc(skb, daddr, saddr,
2379 tos, dev, our);
2380 rcu_read_unlock();
2381 return res;
2382 }
2383 }
2384 rcu_read_unlock();
2385 return -EINVAL;
2386 }
2387 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2388 rcu_read_unlock();
2389 return res;
2390 }
2391 EXPORT_SYMBOL(ip_route_input_common);
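/*
 * Editorial example (not part of the original file): a minimal sketch of
 * how a receive-path caller would resolve the input route through the
 * exported helper above.  example_input_route() is a hypothetical function
 * added purely for illustration.
 */
static inline int example_input_route(struct sk_buff *skb,
				      struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);

	/* noref == true: the skb borrows the cached dst under RCU. */
	return ip_route_input_common(skb, iph->daddr, iph->saddr,
				     iph->tos, dev, true);
}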
2392
2393 /* called with rcu_read_lock() */
2394 static int __mkroute_output(struct rtable **result,
2395 struct fib_result *res,
2396 const struct flowi *fl,
2397 const struct flowi *oldflp,
2398 struct net_device *dev_out,
2399 unsigned flags)
2400 {
2401 struct rtable *rth;
2402 struct in_device *in_dev;
2403 u32 tos = RT_FL_TOS(oldflp);
2404
2405 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
2406 return -EINVAL;
2407
2408 if (ipv4_is_lbcast(fl->fl4_dst))
2409 res->type = RTN_BROADCAST;
2410 else if (ipv4_is_multicast(fl->fl4_dst))
2411 res->type = RTN_MULTICAST;
2412 else if (ipv4_is_zeronet(fl->fl4_dst))
2413 return -EINVAL;
2414
2415 if (dev_out->flags & IFF_LOOPBACK)
2416 flags |= RTCF_LOCAL;
2417
2418 in_dev = __in_dev_get_rcu(dev_out);
2419 if (!in_dev)
2420 return -EINVAL;
2421
2422 if (res->type == RTN_BROADCAST) {
2423 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2424 res->fi = NULL;
2425 } else if (res->type == RTN_MULTICAST) {
2426 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2427 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2428 oldflp->proto))
2429 flags &= ~RTCF_LOCAL;
2430 /* If a multicast route does not exist, use the
2431 * default one, but do not gateway in this case.
2432 * Yes, it is a hack.
2433 */
2434 if (res->fi && res->prefixlen < 4)
2435 res->fi = NULL;
2436 }
2437
2438
2439 rth = dst_alloc(&ipv4_dst_ops);
2440 if (!rth)
2441 return -ENOBUFS;
2442
2443 atomic_set(&rth->dst.__refcnt, 1);
2444 rth->dst.flags= DST_HOST;
2445 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2446 rth->dst.flags |= DST_NOXFRM;
2447 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2448 rth->dst.flags |= DST_NOPOLICY;
2449
2450 rth->fl.fl4_dst = oldflp->fl4_dst;
2451 rth->fl.fl4_tos = tos;
2452 rth->fl.fl4_src = oldflp->fl4_src;
2453 rth->fl.oif = oldflp->oif;
2454 rth->fl.mark = oldflp->mark;
2455 rth->rt_dst = fl->fl4_dst;
2456 rth->rt_src = fl->fl4_src;
2457 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2458 /* get references to the devices that are to be held by the routing
2459 cache entry */
2460 rth->dst.dev = dev_out;
2461 dev_hold(dev_out);
2462 rth->rt_gateway = fl->fl4_dst;
2463 rth->rt_spec_dst= fl->fl4_src;
2464
2465 rth->dst.output = ip_output;
2466 rth->dst.obsolete = -1;
2467 rth->rt_genid = rt_genid(dev_net(dev_out));
2468
2469 RT_CACHE_STAT_INC(out_slow_tot);
2470
2471 if (flags & RTCF_LOCAL) {
2472 rth->dst.input = ip_local_deliver;
2473 rth->rt_spec_dst = fl->fl4_dst;
2474 }
2475 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2476 rth->rt_spec_dst = fl->fl4_src;
2477 if (flags & RTCF_LOCAL &&
2478 !(dev_out->flags & IFF_LOOPBACK)) {
2479 rth->dst.output = ip_mc_output;
2480 RT_CACHE_STAT_INC(out_slow_mc);
2481 }
2482 #ifdef CONFIG_IP_MROUTE
2483 if (res->type == RTN_MULTICAST) {
2484 if (IN_DEV_MFORWARD(in_dev) &&
2485 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2486 rth->dst.input = ip_mr_input;
2487 rth->dst.output = ip_mc_output;
2488 }
2489 }
2490 #endif
2491 }
2492
2493 rt_set_nexthop(rth, res, 0);
2494
2495 rth->rt_flags = flags;
2496 *result = rth;
2497 return 0;
2498 }
2499
2500 /* called with rcu_read_lock() */
2501 static int ip_mkroute_output(struct rtable **rp,
2502 struct fib_result *res,
2503 const struct flowi *fl,
2504 const struct flowi *oldflp,
2505 struct net_device *dev_out,
2506 unsigned flags)
2507 {
2508 struct rtable *rth = NULL;
2509 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2510 unsigned hash;
2511 if (err == 0) {
2512 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2513 rt_genid(dev_net(dev_out)));
2514 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2515 }
2516
2517 return err;
2518 }
2519
2520 /*
2521 * Major route resolver routine.
2522 * called with rcu_read_lock();
2523 */
2524
2525 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2526 const struct flowi *oldflp)
2527 {
2528 u32 tos = RT_FL_TOS(oldflp);
2529 struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
2530 .fl4_src = oldflp->fl4_src,
2531 .fl4_tos = tos & IPTOS_RT_MASK,
2532 .fl4_scope = ((tos & RTO_ONLINK) ?
2533 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
2534 .mark = oldflp->mark,
2535 .iif = net->loopback_dev->ifindex,
2536 .oif = oldflp->oif };
2537 struct fib_result res;
2538 unsigned int flags = 0;
2539 struct net_device *dev_out = NULL;
2540 int err;
2541
2542
2543 res.fi = NULL;
2544 #ifdef CONFIG_IP_MULTIPLE_TABLES
2545 res.r = NULL;
2546 #endif
2547
2548 if (oldflp->fl4_src) {
2549 err = -EINVAL;
2550 if (ipv4_is_multicast(oldflp->fl4_src) ||
2551 ipv4_is_lbcast(oldflp->fl4_src) ||
2552 ipv4_is_zeronet(oldflp->fl4_src))
2553 goto out;
2554
2555 /* I removed the check for oif == dev_out->oif here.
2556 It was wrong for two reasons:
2557 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2558 is assigned to multiple interfaces.
2559 2. Moreover, we are allowed to send packets with the saddr
2560 of another iface. --ANK
2561 */
2562
2563 if (oldflp->oif == 0 &&
2564 (ipv4_is_multicast(oldflp->fl4_dst) ||
2565 ipv4_is_lbcast(oldflp->fl4_dst))) {
2566 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2567 dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
2568 if (dev_out == NULL)
2569 goto out;
2570
2571 /* Special hack: the user can direct multicasts
2572 and limited broadcast via the necessary interface
2573 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2574 This hack is not just for fun; it allows
2575 vic, vat and friends to work.
2576 They bind a socket to loopback, set the ttl to zero
2577 and expect that it will work.
2578 From the viewpoint of the routing cache they are broken,
2579 because we are not allowed to build a multicast path
2580 with a loopback source addr (look, the routing cache
2581 cannot know that the ttl is zero, so that the packet
2582 will not leave this host and the route is valid).
2583 Luckily, this hack is a good workaround.
2584 */
2585
2586 fl.oif = dev_out->ifindex;
2587 goto make_route;
2588 }
2589
2590 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2591 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2592 if (!__ip_dev_find(net, oldflp->fl4_src, false))
2593 goto out;
2594 }
2595 }
2596
2597
2598 if (oldflp->oif) {
2599 dev_out = dev_get_by_index_rcu(net, oldflp->oif);
2600 err = -ENODEV;
2601 if (dev_out == NULL)
2602 goto out;
2603
2604 /* RACE: Check return value of inet_select_addr instead. */
2605 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2606 err = -ENETUNREACH;
2607 goto out;
2608 }
2609 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2610 ipv4_is_lbcast(oldflp->fl4_dst)) {
2611 if (!fl.fl4_src)
2612 fl.fl4_src = inet_select_addr(dev_out, 0,
2613 RT_SCOPE_LINK);
2614 goto make_route;
2615 }
2616 if (!fl.fl4_src) {
2617 if (ipv4_is_multicast(oldflp->fl4_dst))
2618 fl.fl4_src = inet_select_addr(dev_out, 0,
2619 fl.fl4_scope);
2620 else if (!oldflp->fl4_dst)
2621 fl.fl4_src = inet_select_addr(dev_out, 0,
2622 RT_SCOPE_HOST);
2623 }
2624 }
2625
2626 if (!fl.fl4_dst) {
2627 fl.fl4_dst = fl.fl4_src;
2628 if (!fl.fl4_dst)
2629 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2630 dev_out = net->loopback_dev;
2631 fl.oif = net->loopback_dev->ifindex;
2632 res.type = RTN_LOCAL;
2633 flags |= RTCF_LOCAL;
2634 goto make_route;
2635 }
2636
2637 if (fib_lookup(net, &fl, &res)) {
2638 res.fi = NULL;
2639 if (oldflp->oif) {
2640 /* Apparently, the routing tables are wrong. Assume
2641 that the destination is on-link.
2642
2643 WHY? DW.
2644 Because we are allowed to send to an iface
2645 even if it has NO routes and NO assigned
2646 addresses. When oif is specified, the routing
2647 tables are looked up with only one purpose:
2648 to catch whether the destination is gatewayed, rather than
2649 direct. Moreover, if MSG_DONTROUTE is set,
2650 we send the packet, ignoring both routing tables
2651 and ifaddr state. --ANK
2652
2653
2654 We could do this even if oif is unknown,
2655 as IPv6 likely does, but we do not.
2656 */
2657
2658 if (fl.fl4_src == 0)
2659 fl.fl4_src = inet_select_addr(dev_out, 0,
2660 RT_SCOPE_LINK);
2661 res.type = RTN_UNICAST;
2662 goto make_route;
2663 }
2664 err = -ENETUNREACH;
2665 goto out;
2666 }
2667
2668 if (res.type == RTN_LOCAL) {
2669 if (!fl.fl4_src) {
2670 if (res.fi->fib_prefsrc)
2671 fl.fl4_src = res.fi->fib_prefsrc;
2672 else
2673 fl.fl4_src = fl.fl4_dst;
2674 }
2675 dev_out = net->loopback_dev;
2676 fl.oif = dev_out->ifindex;
2677 res.fi = NULL;
2678 flags |= RTCF_LOCAL;
2679 goto make_route;
2680 }
2681
2682 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2683 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2684 fib_select_multipath(&fl, &res);
2685 else
2686 #endif
2687 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2688 fib_select_default(net, &fl, &res);
2689
2690 if (!fl.fl4_src)
2691 fl.fl4_src = FIB_RES_PREFSRC(res);
2692
2693 dev_out = FIB_RES_DEV(res);
2694 fl.oif = dev_out->ifindex;
2695
2696
2697 make_route:
2698 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2699
2700 out: return err;
2701 }
2702
2703 int __ip_route_output_key(struct net *net, struct rtable **rp,
2704 const struct flowi *flp)
2705 {
2706 unsigned int hash;
2707 int res;
2708 struct rtable *rth;
2709
2710 if (!rt_caching(net))
2711 goto slow_output;
2712
2713 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2714
2715 rcu_read_lock_bh();
2716 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2717 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2718 if (rth->fl.fl4_dst == flp->fl4_dst &&
2719 rth->fl.fl4_src == flp->fl4_src &&
2720 rt_is_output_route(rth) &&
2721 rth->fl.oif == flp->oif &&
2722 rth->fl.mark == flp->mark &&
2723 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2724 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2725 net_eq(dev_net(rth->dst.dev), net) &&
2726 !rt_is_expired(rth)) {
2727 dst_use(&rth->dst, jiffies);
2728 RT_CACHE_STAT_INC(out_hit);
2729 rcu_read_unlock_bh();
2730 *rp = rth;
2731 return 0;
2732 }
2733 RT_CACHE_STAT_INC(out_hlist_search);
2734 }
2735 rcu_read_unlock_bh();
2736
2737 slow_output:
2738 rcu_read_lock();
2739 res = ip_route_output_slow(net, rp, flp);
2740 rcu_read_unlock();
2741 return res;
2742 }
2743 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2744
2745 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2746 {
2747 return NULL;
2748 }
2749
2750 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2751 {
2752 }
2753
2754 static struct dst_ops ipv4_dst_blackhole_ops = {
2755 .family = AF_INET,
2756 .protocol = cpu_to_be16(ETH_P_IP),
2757 .destroy = ipv4_dst_destroy,
2758 .check = ipv4_blackhole_dst_check,
2759 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2760 };
2761
2762
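/*
 * Editorial note: ipv4_dst_blackhole() below clones an existing route into
 * a dst whose input/output handlers are dst_discard, so packets routed
 * through it are silently dropped; ip_route_output_flow() falls back to it
 * when __xfrm_lookup() returns -EREMOTE.
 */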
2763 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2764 {
2765 struct rtable *ort = *rp;
2766 struct rtable *rt = (struct rtable *)
2767 dst_alloc(&ipv4_dst_blackhole_ops);
2768
2769 if (rt) {
2770 struct dst_entry *new = &rt->dst;
2771
2772 atomic_set(&new->__refcnt, 1);
2773 new->__use = 1;
2774 new->input = dst_discard;
2775 new->output = dst_discard;
2776 dst_copy_metrics(new, &ort->dst);
2777
2778 new->dev = ort->dst.dev;
2779 if (new->dev)
2780 dev_hold(new->dev);
2781
2782 rt->fl = ort->fl;
2783
2784 rt->rt_genid = rt_genid(net);
2785 rt->rt_flags = ort->rt_flags;
2786 rt->rt_type = ort->rt_type;
2787 rt->rt_dst = ort->rt_dst;
2788 rt->rt_src = ort->rt_src;
2789 rt->rt_iif = ort->rt_iif;
2790 rt->rt_gateway = ort->rt_gateway;
2791 rt->rt_spec_dst = ort->rt_spec_dst;
2792 rt->peer = ort->peer;
2793 if (rt->peer)
2794 atomic_inc(&rt->peer->refcnt);
2795 rt->fi = ort->fi;
2796 if (rt->fi)
2797 atomic_inc(&rt->fi->fib_clntref);
2798
2799 dst_free(new);
2800 }
2801
2802 dst_release(&(*rp)->dst);
2803 *rp = rt;
2804 return rt ? 0 : -ENOMEM;
2805 }
2806
2807 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2808 struct sock *sk, int flags)
2809 {
2810 int err;
2811
2812 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2813 return err;
2814
2815 if (flp->proto) {
2816 if (!flp->fl4_src)
2817 flp->fl4_src = (*rp)->rt_src;
2818 if (!flp->fl4_dst)
2819 flp->fl4_dst = (*rp)->rt_dst;
2820 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2821 flags ? XFRM_LOOKUP_WAIT : 0);
2822 if (err == -EREMOTE)
2823 err = ipv4_dst_blackhole(net, rp, flp);
2824
2825 return err;
2826 }
2827
2828 return 0;
2829 }
2830 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2831
2832 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2833 {
2834 return ip_route_output_flow(net, rp, flp, NULL, 0);
2835 }
2836 EXPORT_SYMBOL(ip_route_output_key);
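/*
 * Editorial example (not part of the original file): a minimal sketch of an
 * output-route lookup with a hand-built flow key, mirroring what
 * inet_rtm_getroute() below does.  example_output_route() is a hypothetical
 * helper added purely for illustration.
 */
static inline int example_output_route(struct net *net, __be32 daddr,
				       __be32 saddr, int oif,
				       struct rtable **rtp)
{
	struct flowi fl = {
		.fl4_dst = daddr,
		.fl4_src = saddr,
		.fl4_tos = 0,
		.oif = oif,
	};

	/* On success *rtp holds a referenced route; release it with ip_rt_put(). */
	return ip_route_output_key(net, rtp, &fl);
}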
2837
2838 static int rt_fill_info(struct net *net,
2839 struct sk_buff *skb, u32 pid, u32 seq, int event,
2840 int nowait, unsigned int flags)
2841 {
2842 struct rtable *rt = skb_rtable(skb);
2843 struct rtmsg *r;
2844 struct nlmsghdr *nlh;
2845 long expires;
2846 u32 id = 0, ts = 0, tsage = 0, error;
2847
2848 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2849 if (nlh == NULL)
2850 return -EMSGSIZE;
2851
2852 r = nlmsg_data(nlh);
2853 r->rtm_family = AF_INET;
2854 r->rtm_dst_len = 32;
2855 r->rtm_src_len = 0;
2856 r->rtm_tos = rt->fl.fl4_tos;
2857 r->rtm_table = RT_TABLE_MAIN;
2858 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2859 r->rtm_type = rt->rt_type;
2860 r->rtm_scope = RT_SCOPE_UNIVERSE;
2861 r->rtm_protocol = RTPROT_UNSPEC;
2862 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2863 if (rt->rt_flags & RTCF_NOTIFY)
2864 r->rtm_flags |= RTM_F_NOTIFY;
2865
2866 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2867
2868 if (rt->fl.fl4_src) {
2869 r->rtm_src_len = 32;
2870 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2871 }
2872 if (rt->dst.dev)
2873 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2874 #ifdef CONFIG_IP_ROUTE_CLASSID
2875 if (rt->dst.tclassid)
2876 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2877 #endif
2878 if (rt_is_input_route(rt))
2879 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2880 else if (rt->rt_src != rt->fl.fl4_src)
2881 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2882
2883 if (rt->rt_dst != rt->rt_gateway)
2884 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2885
2886 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2887 goto nla_put_failure;
2888
2889 if (rt->fl.mark)
2890 NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
2891
2892 error = rt->dst.error;
2893 expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
2894 if (rt->peer) {
2895 inet_peer_refcheck(rt->peer);
2896 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2897 if (rt->peer->tcp_ts_stamp) {
2898 ts = rt->peer->tcp_ts;
2899 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2900 }
2901 }
2902
2903 if (rt_is_input_route(rt)) {
2904 #ifdef CONFIG_IP_MROUTE
2905 __be32 dst = rt->rt_dst;
2906
2907 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2908 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2909 int err = ipmr_get_route(net, skb, r, nowait);
2910 if (err <= 0) {
2911 if (!nowait) {
2912 if (err == 0)
2913 return 0;
2914 goto nla_put_failure;
2915 } else {
2916 if (err == -EMSGSIZE)
2917 goto nla_put_failure;
2918 error = err;
2919 }
2920 }
2921 } else
2922 #endif
2923 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2924 }
2925
2926 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2927 expires, error) < 0)
2928 goto nla_put_failure;
2929
2930 return nlmsg_end(skb, nlh);
2931
2932 nla_put_failure:
2933 nlmsg_cancel(skb, nlh);
2934 return -EMSGSIZE;
2935 }
2936
2937 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2938 {
2939 struct net *net = sock_net(in_skb->sk);
2940 struct rtmsg *rtm;
2941 struct nlattr *tb[RTA_MAX+1];
2942 struct rtable *rt = NULL;
2943 __be32 dst = 0;
2944 __be32 src = 0;
2945 u32 iif;
2946 int err;
2947 int mark;
2948 struct sk_buff *skb;
2949
2950 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2951 if (err < 0)
2952 goto errout;
2953
2954 rtm = nlmsg_data(nlh);
2955
2956 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2957 if (skb == NULL) {
2958 err = -ENOBUFS;
2959 goto errout;
2960 }
2961
2962 /* Reserve room for dummy headers; this skb can pass
2963 through a good chunk of the routing engine.
2964 */
2965 skb_reset_mac_header(skb);
2966 skb_reset_network_header(skb);
2967
2968 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2969 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2970 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2971
2972 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2973 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2974 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2975 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2976
2977 if (iif) {
2978 struct net_device *dev;
2979
2980 dev = __dev_get_by_index(net, iif);
2981 if (dev == NULL) {
2982 err = -ENODEV;
2983 goto errout_free;
2984 }
2985
2986 skb->protocol = htons(ETH_P_IP);
2987 skb->dev = dev;
2988 skb->mark = mark;
2989 local_bh_disable();
2990 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2991 local_bh_enable();
2992
2993 rt = skb_rtable(skb);
2994 if (err == 0 && rt->dst.error)
2995 err = -rt->dst.error;
2996 } else {
2997 struct flowi fl = {
2998 .fl4_dst = dst,
2999 .fl4_src = src,
3000 .fl4_tos = rtm->rtm_tos,
3001 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3002 .mark = mark,
3003 };
3004 err = ip_route_output_key(net, &rt, &fl);
3005 }
3006
3007 if (err)
3008 goto errout_free;
3009
3010 skb_dst_set(skb, &rt->dst);
3011 if (rtm->rtm_flags & RTM_F_NOTIFY)
3012 rt->rt_flags |= RTCF_NOTIFY;
3013
3014 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3015 RTM_NEWROUTE, 0, 0);
3016 if (err <= 0)
3017 goto errout_free;
3018
3019 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3020 errout:
3021 return err;
3022
3023 errout_free:
3024 kfree_skb(skb);
3025 goto errout;
3026 }
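/*
 * Editorial note: inet_rtm_getroute() above services RTM_GETROUTE requests
 * (registered in ip_rt_init() below); it is what answers, for example,
 * "ip route get <addr>" from iproute2.
 */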
3027
3028 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3029 {
3030 struct rtable *rt;
3031 int h, s_h;
3032 int idx, s_idx;
3033 struct net *net;
3034
3035 net = sock_net(skb->sk);
3036
3037 s_h = cb->args[0];
3038 if (s_h < 0)
3039 s_h = 0;
3040 s_idx = idx = cb->args[1];
3041 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3042 if (!rt_hash_table[h].chain)
3043 continue;
3044 rcu_read_lock_bh();
3045 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3046 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3047 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3048 continue;
3049 if (rt_is_expired(rt))
3050 continue;
3051 skb_dst_set_noref(skb, &rt->dst);
3052 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3053 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3054 1, NLM_F_MULTI) <= 0) {
3055 skb_dst_drop(skb);
3056 rcu_read_unlock_bh();
3057 goto done;
3058 }
3059 skb_dst_drop(skb);
3060 }
3061 rcu_read_unlock_bh();
3062 }
3063
3064 done:
3065 cb->args[0] = h;
3066 cb->args[1] = idx;
3067 return skb->len;
3068 }
3069
3070 void ip_rt_multicast_event(struct in_device *in_dev)
3071 {
3072 rt_cache_flush(dev_net(in_dev->dev), 0);
3073 }
3074
3075 #ifdef CONFIG_SYSCTL
3076 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3077 void __user *buffer,
3078 size_t *lenp, loff_t *ppos)
3079 {
3080 if (write) {
3081 int flush_delay;
3082 ctl_table ctl;
3083 struct net *net;
3084
3085 memcpy(&ctl, __ctl, sizeof(ctl));
3086 ctl.data = &flush_delay;
3087 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3088
3089 net = (struct net *)__ctl->extra1;
3090 rt_cache_flush(net, flush_delay);
3091 return 0;
3092 }
3093
3094 return -EINVAL;
3095 }
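/*
 * Editorial note: this handler backs the write-only "flush" entry
 * registered below under net/ipv4/route, so writing a delay value to
 * /proc/sys/net/ipv4/route/flush flushes the routing cache.
 */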
3096
3097 static ctl_table ipv4_route_table[] = {
3098 {
3099 .procname = "gc_thresh",
3100 .data = &ipv4_dst_ops.gc_thresh,
3101 .maxlen = sizeof(int),
3102 .mode = 0644,
3103 .proc_handler = proc_dointvec,
3104 },
3105 {
3106 .procname = "max_size",
3107 .data = &ip_rt_max_size,
3108 .maxlen = sizeof(int),
3109 .mode = 0644,
3110 .proc_handler = proc_dointvec,
3111 },
3112 {
3113 /* Deprecated. Use gc_min_interval_ms */
3114
3115 .procname = "gc_min_interval",
3116 .data = &ip_rt_gc_min_interval,
3117 .maxlen = sizeof(int),
3118 .mode = 0644,
3119 .proc_handler = proc_dointvec_jiffies,
3120 },
3121 {
3122 .procname = "gc_min_interval_ms",
3123 .data = &ip_rt_gc_min_interval,
3124 .maxlen = sizeof(int),
3125 .mode = 0644,
3126 .proc_handler = proc_dointvec_ms_jiffies,
3127 },
3128 {
3129 .procname = "gc_timeout",
3130 .data = &ip_rt_gc_timeout,
3131 .maxlen = sizeof(int),
3132 .mode = 0644,
3133 .proc_handler = proc_dointvec_jiffies,
3134 },
3135 {
3136 .procname = "gc_interval",
3137 .data = &ip_rt_gc_interval,
3138 .maxlen = sizeof(int),
3139 .mode = 0644,
3140 .proc_handler = proc_dointvec_jiffies,
3141 },
3142 {
3143 .procname = "redirect_load",
3144 .data = &ip_rt_redirect_load,
3145 .maxlen = sizeof(int),
3146 .mode = 0644,
3147 .proc_handler = proc_dointvec,
3148 },
3149 {
3150 .procname = "redirect_number",
3151 .data = &ip_rt_redirect_number,
3152 .maxlen = sizeof(int),
3153 .mode = 0644,
3154 .proc_handler = proc_dointvec,
3155 },
3156 {
3157 .procname = "redirect_silence",
3158 .data = &ip_rt_redirect_silence,
3159 .maxlen = sizeof(int),
3160 .mode = 0644,
3161 .proc_handler = proc_dointvec,
3162 },
3163 {
3164 .procname = "error_cost",
3165 .data = &ip_rt_error_cost,
3166 .maxlen = sizeof(int),
3167 .mode = 0644,
3168 .proc_handler = proc_dointvec,
3169 },
3170 {
3171 .procname = "error_burst",
3172 .data = &ip_rt_error_burst,
3173 .maxlen = sizeof(int),
3174 .mode = 0644,
3175 .proc_handler = proc_dointvec,
3176 },
3177 {
3178 .procname = "gc_elasticity",
3179 .data = &ip_rt_gc_elasticity,
3180 .maxlen = sizeof(int),
3181 .mode = 0644,
3182 .proc_handler = proc_dointvec,
3183 },
3184 {
3185 .procname = "mtu_expires",
3186 .data = &ip_rt_mtu_expires,
3187 .maxlen = sizeof(int),
3188 .mode = 0644,
3189 .proc_handler = proc_dointvec_jiffies,
3190 },
3191 {
3192 .procname = "min_pmtu",
3193 .data = &ip_rt_min_pmtu,
3194 .maxlen = sizeof(int),
3195 .mode = 0644,
3196 .proc_handler = proc_dointvec,
3197 },
3198 {
3199 .procname = "min_adv_mss",
3200 .data = &ip_rt_min_advmss,
3201 .maxlen = sizeof(int),
3202 .mode = 0644,
3203 .proc_handler = proc_dointvec,
3204 },
3205 { }
3206 };
3207
3208 static struct ctl_table empty[1];
3209
3210 static struct ctl_table ipv4_skeleton[] =
3211 {
3212 { .procname = "route",
3213 .mode = 0555, .child = ipv4_route_table},
3214 { .procname = "neigh",
3215 .mode = 0555, .child = empty},
3216 { }
3217 };
3218
3219 static __net_initdata struct ctl_path ipv4_path[] = {
3220 { .procname = "net", },
3221 { .procname = "ipv4", },
3222 { },
3223 };
3224
3225 static struct ctl_table ipv4_route_flush_table[] = {
3226 {
3227 .procname = "flush",
3228 .maxlen = sizeof(int),
3229 .mode = 0200,
3230 .proc_handler = ipv4_sysctl_rtcache_flush,
3231 },
3232 { },
3233 };
3234
3235 static __net_initdata struct ctl_path ipv4_route_path[] = {
3236 { .procname = "net", },
3237 { .procname = "ipv4", },
3238 { .procname = "route", },
3239 { },
3240 };
3241
3242 static __net_init int sysctl_route_net_init(struct net *net)
3243 {
3244 struct ctl_table *tbl;
3245
3246 tbl = ipv4_route_flush_table;
3247 if (!net_eq(net, &init_net)) {
3248 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3249 if (tbl == NULL)
3250 goto err_dup;
3251 }
3252 tbl[0].extra1 = net;
3253
3254 net->ipv4.route_hdr =
3255 register_net_sysctl_table(net, ipv4_route_path, tbl);
3256 if (net->ipv4.route_hdr == NULL)
3257 goto err_reg;
3258 return 0;
3259
3260 err_reg:
3261 if (tbl != ipv4_route_flush_table)
3262 kfree(tbl);
3263 err_dup:
3264 return -ENOMEM;
3265 }
3266
3267 static __net_exit void sysctl_route_net_exit(struct net *net)
3268 {
3269 struct ctl_table *tbl;
3270
3271 tbl = net->ipv4.route_hdr->ctl_table_arg;
3272 unregister_net_sysctl_table(net->ipv4.route_hdr);
3273 BUG_ON(tbl == ipv4_route_flush_table);
3274 kfree(tbl);
3275 }
3276
3277 static __net_initdata struct pernet_operations sysctl_route_ops = {
3278 .init = sysctl_route_net_init,
3279 .exit = sysctl_route_net_exit,
3280 };
3281 #endif
3282
3283 static __net_init int rt_genid_init(struct net *net)
3284 {
3285 get_random_bytes(&net->ipv4.rt_genid,
3286 sizeof(net->ipv4.rt_genid));
3287 return 0;
3288 }
3289
3290 static __net_initdata struct pernet_operations rt_genid_ops = {
3291 .init = rt_genid_init,
3292 };
3293
3294
3295 #ifdef CONFIG_IP_ROUTE_CLASSID
3296 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3297 #endif /* CONFIG_IP_ROUTE_CLASSID */
3298
3299 static __initdata unsigned long rhash_entries;
3300 static int __init set_rhash_entries(char *str)
3301 {
3302 if (!str)
3303 return 0;
3304 rhash_entries = simple_strtoul(str, &str, 0);
3305 return 1;
3306 }
3307 __setup("rhash_entries=", set_rhash_entries);
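/*
 * Editorial note: passing "rhash_entries=<n>" on the kernel command line
 * overrides the automatic sizing of the route cache hash table done by
 * alloc_large_system_hash() in ip_rt_init() below.
 */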
3308
3309 int __init ip_rt_init(void)
3310 {
3311 int rc = 0;
3312
3313 #ifdef CONFIG_IP_ROUTE_CLASSID
3314 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3315 if (!ip_rt_acct)
3316 panic("IP: failed to allocate ip_rt_acct\n");
3317 #endif
3318
3319 ipv4_dst_ops.kmem_cachep =
3320 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3321 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3322
3323 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3324
3325 if (dst_entries_init(&ipv4_dst_ops) < 0)
3326 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3327
3328 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3329 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3330
3331 rt_hash_table = (struct rt_hash_bucket *)
3332 alloc_large_system_hash("IP route cache",
3333 sizeof(struct rt_hash_bucket),
3334 rhash_entries,
3335 (totalram_pages >= 128 * 1024) ?
3336 15 : 17,
3337 0,
3338 &rt_hash_log,
3339 &rt_hash_mask,
3340 rhash_entries ? 0 : 512 * 1024);
3341 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3342 rt_hash_lock_init();
3343
3344 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3345 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3346
3347 devinet_init();
3348 ip_fib_init();
3349
3350 /* All the timers started at system startup tend
3351 to synchronize. Perturb them a bit.
3352 */
3353 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3354 expires_ljiffies = jiffies;
3355 schedule_delayed_work(&expires_work,
3356 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3357
3358 if (ip_rt_proc_init())
3359 printk(KERN_ERR "Unable to create route proc files\n");
3360 #ifdef CONFIG_XFRM
3361 xfrm_init();
3362 xfrm4_init(ip_rt_max_size);
3363 #endif
3364 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3365
3366 #ifdef CONFIG_SYSCTL
3367 register_pernet_subsys(&sysctl_route_ops);
3368 #endif
3369 register_pernet_subsys(&rt_genid_ops);
3370 return rc;
3371 }
3372
3373 #ifdef CONFIG_SYSCTL
3374 /*
3375 * We really need to sanitize the damn ipv4 init order, then all
3376 * this nonsense will go away.
3377 */
3378 void __init ip_static_sysctl_init(void)
3379 {
3380 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3381 }
3382 #endif