1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
39 *
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <linux/prefetch.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 #include <net/secure_seq.h>
113
114 #define RT_FL_TOS(oldflp4) \
115 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
116
117 #define IP_MAX_MTU 0xFFF0
118
119 #define RT_GC_TIMEOUT (300*HZ)
120
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
124 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
125 static int ip_rt_redirect_number __read_mostly = 9;
126 static int ip_rt_redirect_load __read_mostly = HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly = HZ;
129 static int ip_rt_error_burst __read_mostly = 5 * HZ;
130 static int ip_rt_gc_elasticity __read_mostly = 8;
131 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
132 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
133 static int ip_rt_min_advmss __read_mostly = 256;
134 static int rt_chain_length_max __read_mostly = 20;
135 static int redirect_genid;
136
137 static struct delayed_work expires_work;
138 static unsigned long expires_ljiffies;
139
140 /*
141 * Interface to generic destination cache.
142 */
143
144 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
145 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
146 static unsigned int ipv4_mtu(const struct dst_entry *dst);
147 static void ipv4_dst_destroy(struct dst_entry *dst);
148 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
149 static void ipv4_link_failure(struct sk_buff *skb);
150 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
151 static int rt_garbage_collect(struct dst_ops *ops);
152
153 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
154 int how)
155 {
156 }
157
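/*
 * Copy-on-write of dst metrics: when a writable metrics array is needed we
 * bind an inet_peer for the destination and switch dst->_metrics to the
 * peer's array with cmpxchg(), so a racing CPU that got there first wins.
 * Once the peer owns the metrics, the route's fib_info reference is dropped.
 */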
158 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
159 {
160 struct rtable *rt = (struct rtable *) dst;
161 struct inet_peer *peer;
162 u32 *p = NULL;
163
164 if (!rt->peer)
165 rt_bind_peer(rt, rt->rt_dst, 1);
166
167 peer = rt->peer;
168 if (peer) {
169 u32 *old_p = __DST_METRICS_PTR(old);
170 unsigned long prev, new;
171
172 p = peer->metrics;
173 if (inet_metrics_new(peer))
174 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
175
176 new = (unsigned long) p;
177 prev = cmpxchg(&dst->_metrics, old, new);
178
179 if (prev != old) {
180 p = __DST_METRICS_PTR(prev);
181 if (prev & DST_METRICS_READ_ONLY)
182 p = NULL;
183 } else {
184 if (rt->fi) {
185 fib_info_put(rt->fi);
186 rt->fi = NULL;
187 }
188 }
189 }
190 return p;
191 }
192
193 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
194
195 static struct dst_ops ipv4_dst_ops = {
196 .family = AF_INET,
197 .protocol = cpu_to_be16(ETH_P_IP),
198 .gc = rt_garbage_collect,
199 .check = ipv4_dst_check,
200 .default_advmss = ipv4_default_advmss,
201 .mtu = ipv4_mtu,
202 .cow_metrics = ipv4_cow_metrics,
203 .destroy = ipv4_dst_destroy,
204 .ifdown = ipv4_dst_ifdown,
205 .negative_advice = ipv4_negative_advice,
206 .link_failure = ipv4_link_failure,
207 .update_pmtu = ip_rt_update_pmtu,
208 .local_out = __ip_local_out,
209 .neigh_lookup = ipv4_neigh_lookup,
210 };
211
212 #define ECN_OR_COST(class) TC_PRIO_##class
213
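/*
 * ip_tos2prio maps the IPv4 TOS field to a packet scheduler priority band.
 * With ECN_OR_COST() defined as above, the "ECN or low cost" variants fall
 * into the same TC_PRIO_* band as their base class.
 */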
214 const __u8 ip_tos2prio[16] = {
215 TC_PRIO_BESTEFFORT,
216 ECN_OR_COST(BESTEFFORT),
217 TC_PRIO_BESTEFFORT,
218 ECN_OR_COST(BESTEFFORT),
219 TC_PRIO_BULK,
220 ECN_OR_COST(BULK),
221 TC_PRIO_BULK,
222 ECN_OR_COST(BULK),
223 TC_PRIO_INTERACTIVE,
224 ECN_OR_COST(INTERACTIVE),
225 TC_PRIO_INTERACTIVE,
226 ECN_OR_COST(INTERACTIVE),
227 TC_PRIO_INTERACTIVE_BULK,
228 ECN_OR_COST(INTERACTIVE_BULK),
229 TC_PRIO_INTERACTIVE_BULK,
230 ECN_OR_COST(INTERACTIVE_BULK)
231 };
232
233
234 /*
235 * Route cache.
236 */
237
238 /* The locking scheme is rather straightforward:
239 *
240 * 1) Read-Copy Update protects the buckets of the central route hash.
241 * 2) Only writers remove entries, and they hold the lock
242 * as they look at rtable reference counts.
243 * 3) Only readers acquire references to rtable entries,
244 * they do so with atomic increments and with the
245 * lock held.
246 */
247
248 struct rt_hash_bucket {
249 struct rtable __rcu *chain;
250 };
251
252 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
253 defined(CONFIG_PROVE_LOCKING)
254 /*
255 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
256 * The size of this table is a power of two and depends on the number of CPUS.
257 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
258 */
259 #ifdef CONFIG_LOCKDEP
260 # define RT_HASH_LOCK_SZ 256
261 #else
262 # if NR_CPUS >= 32
263 # define RT_HASH_LOCK_SZ 4096
264 # elif NR_CPUS >= 16
265 # define RT_HASH_LOCK_SZ 2048
266 # elif NR_CPUS >= 8
267 # define RT_HASH_LOCK_SZ 1024
268 # elif NR_CPUS >= 4
269 # define RT_HASH_LOCK_SZ 512
270 # else
271 # define RT_HASH_LOCK_SZ 256
272 # endif
273 #endif
274
275 static spinlock_t *rt_hash_locks;
276 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
277
278 static __init void rt_hash_lock_init(void)
279 {
280 int i;
281
282 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
283 GFP_KERNEL);
284 if (!rt_hash_locks)
285 panic("IP: failed to allocate rt_hash_locks\n");
286
287 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
288 spin_lock_init(&rt_hash_locks[i]);
289 }
290 #else
291 # define rt_hash_lock_addr(slot) NULL
292
293 static inline void rt_hash_lock_init(void)
294 {
295 }
296 #endif
297
298 static struct rt_hash_bucket *rt_hash_table __read_mostly;
299 static unsigned rt_hash_mask __read_mostly;
300 static unsigned int rt_hash_log __read_mostly;
301
302 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
303 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
304
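/*
 * Route cache hash function: mixes destination, source, interface index and
 * the per-namespace generation id with jhash, then masks down to the table
 * size.  Because the generation id is mixed in, bumping it on invalidation
 * steers new lookups away from stale entries, which are reaped lazily.
 */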
305 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
306 int genid)
307 {
308 return jhash_3words((__force u32)daddr, (__force u32)saddr,
309 idx, genid)
310 & rt_hash_mask;
311 }
312
313 static inline int rt_genid(struct net *net)
314 {
315 return atomic_read(&net->ipv4.rt_genid);
316 }
317
318 #ifdef CONFIG_PROC_FS
319 struct rt_cache_iter_state {
320 struct seq_net_private p;
321 int bucket;
322 int genid;
323 };
324
325 static struct rtable *rt_cache_get_first(struct seq_file *seq)
326 {
327 struct rt_cache_iter_state *st = seq->private;
328 struct rtable *r = NULL;
329
330 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
331 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
332 continue;
333 rcu_read_lock_bh();
334 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
335 while (r) {
336 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
337 r->rt_genid == st->genid)
338 return r;
339 r = rcu_dereference_bh(r->dst.rt_next);
340 }
341 rcu_read_unlock_bh();
342 }
343 return r;
344 }
345
346 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
347 struct rtable *r)
348 {
349 struct rt_cache_iter_state *st = seq->private;
350
351 r = rcu_dereference_bh(r->dst.rt_next);
352 while (!r) {
353 rcu_read_unlock_bh();
354 do {
355 if (--st->bucket < 0)
356 return NULL;
357 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
358 rcu_read_lock_bh();
359 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
360 }
361 return r;
362 }
363
364 static struct rtable *rt_cache_get_next(struct seq_file *seq,
365 struct rtable *r)
366 {
367 struct rt_cache_iter_state *st = seq->private;
368 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
369 if (dev_net(r->dst.dev) != seq_file_net(seq))
370 continue;
371 if (r->rt_genid == st->genid)
372 break;
373 }
374 return r;
375 }
376
377 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
378 {
379 struct rtable *r = rt_cache_get_first(seq);
380
381 if (r)
382 while (pos && (r = rt_cache_get_next(seq, r)))
383 --pos;
384 return pos ? NULL : r;
385 }
386
387 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
388 {
389 struct rt_cache_iter_state *st = seq->private;
390 if (*pos)
391 return rt_cache_get_idx(seq, *pos - 1);
392 st->genid = rt_genid(seq_file_net(seq));
393 return SEQ_START_TOKEN;
394 }
395
396 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
397 {
398 struct rtable *r;
399
400 if (v == SEQ_START_TOKEN)
401 r = rt_cache_get_first(seq);
402 else
403 r = rt_cache_get_next(seq, v);
404 ++*pos;
405 return r;
406 }
407
408 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
409 {
410 if (v && v != SEQ_START_TOKEN)
411 rcu_read_unlock_bh();
412 }
413
414 static int rt_cache_seq_show(struct seq_file *seq, void *v)
415 {
416 if (v == SEQ_START_TOKEN)
417 seq_printf(seq, "%-127s\n",
418 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
419 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
420 "HHUptod\tSpecDst");
421 else {
422 struct rtable *r = v;
423 struct neighbour *n;
424 int len, HHUptod;
425
426 rcu_read_lock();
427 n = dst_get_neighbour_noref(&r->dst);
428 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
429 rcu_read_unlock();
430
431 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
432 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
433 r->dst.dev ? r->dst.dev->name : "*",
434 (__force u32)r->rt_dst,
435 (__force u32)r->rt_gateway,
436 r->rt_flags, atomic_read(&r->dst.__refcnt),
437 r->dst.__use, 0, (__force u32)r->rt_src,
438 dst_metric_advmss(&r->dst) + 40,
439 dst_metric(&r->dst, RTAX_WINDOW),
440 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
441 dst_metric(&r->dst, RTAX_RTTVAR)),
442 r->rt_key_tos,
443 -1,
444 HHUptod,
445 r->rt_spec_dst, &len);
446
447 seq_printf(seq, "%*s\n", 127 - len, "");
448 }
449 return 0;
450 }
451
452 static const struct seq_operations rt_cache_seq_ops = {
453 .start = rt_cache_seq_start,
454 .next = rt_cache_seq_next,
455 .stop = rt_cache_seq_stop,
456 .show = rt_cache_seq_show,
457 };
458
459 static int rt_cache_seq_open(struct inode *inode, struct file *file)
460 {
461 return seq_open_net(inode, file, &rt_cache_seq_ops,
462 sizeof(struct rt_cache_iter_state));
463 }
464
465 static const struct file_operations rt_cache_seq_fops = {
466 .owner = THIS_MODULE,
467 .open = rt_cache_seq_open,
468 .read = seq_read,
469 .llseek = seq_lseek,
470 .release = seq_release_net,
471 };
472
473
474 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
475 {
476 int cpu;
477
478 if (*pos == 0)
479 return SEQ_START_TOKEN;
480
481 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
482 if (!cpu_possible(cpu))
483 continue;
484 *pos = cpu+1;
485 return &per_cpu(rt_cache_stat, cpu);
486 }
487 return NULL;
488 }
489
490 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
491 {
492 int cpu;
493
494 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
495 if (!cpu_possible(cpu))
496 continue;
497 *pos = cpu+1;
498 return &per_cpu(rt_cache_stat, cpu);
499 }
500 return NULL;
501
502 }
503
504 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
505 {
506
507 }
508
509 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
510 {
511 struct rt_cache_stat *st = v;
512
513 if (v == SEQ_START_TOKEN) {
514 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
515 return 0;
516 }
517
518 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
519 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
520 dst_entries_get_slow(&ipv4_dst_ops),
521 st->in_hit,
522 st->in_slow_tot,
523 st->in_slow_mc,
524 st->in_no_route,
525 st->in_brd,
526 st->in_martian_dst,
527 st->in_martian_src,
528
529 st->out_hit,
530 st->out_slow_tot,
531 st->out_slow_mc,
532
533 st->gc_total,
534 st->gc_ignored,
535 st->gc_goal_miss,
536 st->gc_dst_overflow,
537 st->in_hlist_search,
538 st->out_hlist_search
539 );
540 return 0;
541 }
542
543 static const struct seq_operations rt_cpu_seq_ops = {
544 .start = rt_cpu_seq_start,
545 .next = rt_cpu_seq_next,
546 .stop = rt_cpu_seq_stop,
547 .show = rt_cpu_seq_show,
548 };
549
550
551 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
552 {
553 return seq_open(file, &rt_cpu_seq_ops);
554 }
555
556 static const struct file_operations rt_cpu_seq_fops = {
557 .owner = THIS_MODULE,
558 .open = rt_cpu_seq_open,
559 .read = seq_read,
560 .llseek = seq_lseek,
561 .release = seq_release,
562 };
563
564 #ifdef CONFIG_IP_ROUTE_CLASSID
565 static int rt_acct_proc_show(struct seq_file *m, void *v)
566 {
567 struct ip_rt_acct *dst, *src;
568 unsigned int i, j;
569
570 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
571 if (!dst)
572 return -ENOMEM;
573
574 for_each_possible_cpu(i) {
575 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
576 for (j = 0; j < 256; j++) {
577 dst[j].o_bytes += src[j].o_bytes;
578 dst[j].o_packets += src[j].o_packets;
579 dst[j].i_bytes += src[j].i_bytes;
580 dst[j].i_packets += src[j].i_packets;
581 }
582 }
583
584 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
585 kfree(dst);
586 return 0;
587 }
588
589 static int rt_acct_proc_open(struct inode *inode, struct file *file)
590 {
591 return single_open(file, rt_acct_proc_show, NULL);
592 }
593
594 static const struct file_operations rt_acct_proc_fops = {
595 .owner = THIS_MODULE,
596 .open = rt_acct_proc_open,
597 .read = seq_read,
598 .llseek = seq_lseek,
599 .release = single_release,
600 };
601 #endif
602
603 static int __net_init ip_rt_do_proc_init(struct net *net)
604 {
605 struct proc_dir_entry *pde;
606
607 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
608 &rt_cache_seq_fops);
609 if (!pde)
610 goto err1;
611
612 pde = proc_create("rt_cache", S_IRUGO,
613 net->proc_net_stat, &rt_cpu_seq_fops);
614 if (!pde)
615 goto err2;
616
617 #ifdef CONFIG_IP_ROUTE_CLASSID
618 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
619 if (!pde)
620 goto err3;
621 #endif
622 return 0;
623
624 #ifdef CONFIG_IP_ROUTE_CLASSID
625 err3:
626 remove_proc_entry("rt_cache", net->proc_net_stat);
627 #endif
628 err2:
629 remove_proc_entry("rt_cache", net->proc_net);
630 err1:
631 return -ENOMEM;
632 }
633
634 static void __net_exit ip_rt_do_proc_exit(struct net *net)
635 {
636 remove_proc_entry("rt_cache", net->proc_net_stat);
637 remove_proc_entry("rt_cache", net->proc_net);
638 #ifdef CONFIG_IP_ROUTE_CLASSID
639 remove_proc_entry("rt_acct", net->proc_net);
640 #endif
641 }
642
643 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
644 .init = ip_rt_do_proc_init,
645 .exit = ip_rt_do_proc_exit,
646 };
647
648 static int __init ip_rt_proc_init(void)
649 {
650 return register_pernet_subsys(&ip_rt_proc_ops);
651 }
652
653 #else
654 static inline int ip_rt_proc_init(void)
655 {
656 return 0;
657 }
658 #endif /* CONFIG_PROC_FS */
659
660 static inline void rt_free(struct rtable *rt)
661 {
662 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
663 }
664
665 static inline void rt_drop(struct rtable *rt)
666 {
667 ip_rt_put(rt);
668 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
669 }
670
671 static inline int rt_fast_clean(struct rtable *rth)
672 {
673 /* Kill broadcast/multicast entries very aggressively, if they
674 collide in the hash table with more useful entries */
675 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
676 rt_is_input_route(rth) && rth->dst.rt_next;
677 }
678
679 static inline int rt_valuable(struct rtable *rth)
680 {
681 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
682 (rth->peer && rth->peer->pmtu_expires);
683 }
684
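/*
 * May this unreferenced cache entry be evicted?  Ordinary entries get the
 * tmo1 grace period, "valuable" ones (redirected, notify, learned PMTU) get
 * tmo2, and colliding broadcast/multicast input routes (rt_fast_clean) are
 * not protected by tmo1 at all.
 */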
685 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
686 {
687 unsigned long age;
688 int ret = 0;
689
690 if (atomic_read(&rth->dst.__refcnt))
691 goto out;
692
693 age = jiffies - rth->dst.lastuse;
694 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
695 (age <= tmo2 && rt_valuable(rth)))
696 goto out;
697 ret = 1;
698 out: return ret;
699 }
700
701 /* Bits of score are:
702 * 31: very valuable
703 * 30: not quite useless
704 * 29..0: usage counter
705 */
706 static inline u32 rt_score(struct rtable *rt)
707 {
708 u32 score = jiffies - rt->dst.lastuse;
709
710 score = ~score & ~(3<<30);
711
712 if (rt_valuable(rt))
713 score |= (1<<31);
714
715 if (rt_is_output_route(rt) ||
716 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
717 score |= (1<<30);
718
719 return score;
720 }
721
722 static inline bool rt_caching(const struct net *net)
723 {
724 return net->ipv4.current_rt_cache_rebuild_count <=
725 net->ipv4.sysctl_rt_cache_rebuild_count;
726 }
727
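/*
 * Key comparison helpers: ORing together the XORs of the individual fields
 * yields zero only when every field matches, avoiding a chain of branches.
 * compare_hash_inputs() checks just the fields that feed the hash, while
 * compare_keys() compares the full lookup key.
 */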
728 static inline bool compare_hash_inputs(const struct rtable *rt1,
729 const struct rtable *rt2)
730 {
731 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
732 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
733 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
734 }
735
736 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
737 {
738 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
739 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
740 (rt1->rt_mark ^ rt2->rt_mark) |
741 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
742 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
743 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
744 }
745
746 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
747 {
748 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
749 }
750
751 static inline int rt_is_expired(struct rtable *rth)
752 {
753 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
754 }
755
756 /*
757 * Perform a full scan of the hash table and free all entries.
758 * Can be called by a softirq or a process.
759 * In the latter case, we want to reschedule if necessary.
760 */
761 static void rt_do_flush(struct net *net, int process_context)
762 {
763 unsigned int i;
764 struct rtable *rth, *next;
765
766 for (i = 0; i <= rt_hash_mask; i++) {
767 struct rtable __rcu **pprev;
768 struct rtable *list;
769
770 if (process_context && need_resched())
771 cond_resched();
772 rth = rcu_access_pointer(rt_hash_table[i].chain);
773 if (!rth)
774 continue;
775
776 spin_lock_bh(rt_hash_lock_addr(i));
777
778 list = NULL;
779 pprev = &rt_hash_table[i].chain;
780 rth = rcu_dereference_protected(*pprev,
781 lockdep_is_held(rt_hash_lock_addr(i)));
782
783 while (rth) {
784 next = rcu_dereference_protected(rth->dst.rt_next,
785 lockdep_is_held(rt_hash_lock_addr(i)));
786
787 if (!net ||
788 net_eq(dev_net(rth->dst.dev), net)) {
789 rcu_assign_pointer(*pprev, next);
790 rcu_assign_pointer(rth->dst.rt_next, list);
791 list = rth;
792 } else {
793 pprev = &rth->dst.rt_next;
794 }
795 rth = next;
796 }
797
798 spin_unlock_bh(rt_hash_lock_addr(i));
799
800 for (; list; list = next) {
801 next = rcu_dereference_protected(list->dst.rt_next, 1);
802 rt_free(list);
803 }
804 }
805 }
806
807 /*
808 * While freeing expired entries, we compute average chain length
809 * and standard deviation, using fixed-point arithmetic.
810 * This gives an estimate of rt_chain_length_max:
811 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
812 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
813 */
814
815 #define FRACT_BITS 3
816 #define ONE (1UL << FRACT_BITS)
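/*
 * Example of the fixed-point representation: with FRACT_BITS = 3, ONE is 8,
 * so a chain of 5 counted entries accumulates a length of 40 and an average
 * of 2.5 entries per bucket is stored as 20.  The (avg + 4*sd) >> FRACT_BITS
 * in rt_check_expire() converts back to whole entries before taking the
 * maximum with ip_rt_gc_elasticity.
 */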
817
818 /*
819 * Given a hash chain and an item in this hash chain,
820 * find if a previous entry has the same hash_inputs
821 * (but differs on tos, mark or oif)
822 * Returns 0 if an alias is found.
823 * Returns ONE if rth has no alias before itself.
824 */
825 static int has_noalias(const struct rtable *head, const struct rtable *rth)
826 {
827 const struct rtable *aux = head;
828
829 while (aux != rth) {
830 if (compare_hash_inputs(aux, rth))
831 return 0;
832 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
833 }
834 return ONE;
835 }
836
837 static void rt_check_expire(void)
838 {
839 static unsigned int rover;
840 unsigned int i = rover, goal;
841 struct rtable *rth;
842 struct rtable __rcu **rthp;
843 unsigned long samples = 0;
844 unsigned long sum = 0, sum2 = 0;
845 unsigned long delta;
846 u64 mult;
847
848 delta = jiffies - expires_ljiffies;
849 expires_ljiffies = jiffies;
850 mult = ((u64)delta) << rt_hash_log;
851 if (ip_rt_gc_timeout > 1)
852 do_div(mult, ip_rt_gc_timeout);
853 goal = (unsigned int)mult;
854 if (goal > rt_hash_mask)
855 goal = rt_hash_mask + 1;
856 for (; goal > 0; goal--) {
857 unsigned long tmo = ip_rt_gc_timeout;
858 unsigned long length;
859
860 i = (i + 1) & rt_hash_mask;
861 rthp = &rt_hash_table[i].chain;
862
863 if (need_resched())
864 cond_resched();
865
866 samples++;
867
868 if (rcu_dereference_raw(*rthp) == NULL)
869 continue;
870 length = 0;
871 spin_lock_bh(rt_hash_lock_addr(i));
872 while ((rth = rcu_dereference_protected(*rthp,
873 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
874 prefetch(rth->dst.rt_next);
875 if (rt_is_expired(rth)) {
876 *rthp = rth->dst.rt_next;
877 rt_free(rth);
878 continue;
879 }
880 if (rth->dst.expires) {
881 /* Entry is expired even if it is in use */
882 if (time_before_eq(jiffies, rth->dst.expires)) {
883 nofree:
884 tmo >>= 1;
885 rthp = &rth->dst.rt_next;
886 /*
887 * We only count entries on
888 * a chain with equal hash inputs once
889 * so that entries for different QOS
890 * levels, and other non-hash input
891 * attributes don't unfairly skew
892 * the length computation
893 */
894 length += has_noalias(rt_hash_table[i].chain, rth);
895 continue;
896 }
897 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
898 goto nofree;
899
900 /* Cleanup aged off entries. */
901 *rthp = rth->dst.rt_next;
902 rt_free(rth);
903 }
904 spin_unlock_bh(rt_hash_lock_addr(i));
905 sum += length;
906 sum2 += length*length;
907 }
908 if (samples) {
909 unsigned long avg = sum / samples;
910 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
911 rt_chain_length_max = max_t(unsigned long,
912 ip_rt_gc_elasticity,
913 (avg + 4*sd) >> FRACT_BITS);
914 }
915 rover = i;
916 }
917
918 /*
919 * rt_worker_func() is run in process context.
920 * We call rt_check_expire() to scan part of the hash table.
921 */
922 static void rt_worker_func(struct work_struct *work)
923 {
924 rt_check_expire();
925 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
926 }
927
928 /*
929 * Perturbation of rt_genid by a small quantity [1..256]
930 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
931 * many times (2^24) without reusing a recent rt_genid.
932 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
933 */
934 static void rt_cache_invalidate(struct net *net)
935 {
936 unsigned char shuffle;
937
938 get_random_bytes(&shuffle, sizeof(shuffle));
939 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
940 redirect_genid++;
941 }
942
943 /*
944 * delay < 0 : invalidate cache (fast : entries will be deleted later)
945 * delay >= 0 : invalidate & flush cache (can be long)
946 */
947 void rt_cache_flush(struct net *net, int delay)
948 {
949 rt_cache_invalidate(net);
950 if (delay >= 0)
951 rt_do_flush(net, !in_softirq());
952 }
953
954 /* Flush previous cache invalidated entries from the cache */
955 void rt_cache_flush_batch(struct net *net)
956 {
957 rt_do_flush(net, !in_softirq());
958 }
959
960 static void rt_emergency_hash_rebuild(struct net *net)
961 {
962 if (net_ratelimit())
963 printk(KERN_WARNING "Route hash chain too long!\n");
964 rt_cache_invalidate(net);
965 }
966
967 /*
968 Short description of GC goals.
969
970 We want an algorithm that keeps the routing cache at an equilibrium
971 point, where the number of aged-off entries stays approximately equal
972 to the number of newly generated ones.
973
974 The current expiration strength is the variable "expire".
975 We try to adjust it dynamically, so that when the network is idle
976 "expire" is large enough to keep plenty of warm entries, and when
977 load increases it is reduced to limit the cache size.
978 */
979
980 static int rt_garbage_collect(struct dst_ops *ops)
981 {
982 static unsigned long expire = RT_GC_TIMEOUT;
983 static unsigned long last_gc;
984 static int rover;
985 static int equilibrium;
986 struct rtable *rth;
987 struct rtable __rcu **rthp;
988 unsigned long now = jiffies;
989 int goal;
990 int entries = dst_entries_get_fast(&ipv4_dst_ops);
991
992 /*
993 * Garbage collection is pretty expensive,
994 * do not make it too frequently.
995 */
996
997 RT_CACHE_STAT_INC(gc_total);
998
999 if (now - last_gc < ip_rt_gc_min_interval &&
1000 entries < ip_rt_max_size) {
1001 RT_CACHE_STAT_INC(gc_ignored);
1002 goto out;
1003 }
1004
1005 entries = dst_entries_get_slow(&ipv4_dst_ops);
1006 /* Calculate number of entries, which we want to expire now. */
1007 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1008 if (goal <= 0) {
1009 if (equilibrium < ipv4_dst_ops.gc_thresh)
1010 equilibrium = ipv4_dst_ops.gc_thresh;
1011 goal = entries - equilibrium;
1012 if (goal > 0) {
1013 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1014 goal = entries - equilibrium;
1015 }
1016 } else {
1017 /* We are in a dangerous area. Try to reduce the cache really
1018 * aggressively.
1019 */
1020 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1021 equilibrium = entries - goal;
1022 }
1023
1024 if (now - last_gc >= ip_rt_gc_min_interval)
1025 last_gc = now;
1026
1027 if (goal <= 0) {
1028 equilibrium += goal;
1029 goto work_done;
1030 }
1031
1032 do {
1033 int i, k;
1034
1035 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1036 unsigned long tmo = expire;
1037
1038 k = (k + 1) & rt_hash_mask;
1039 rthp = &rt_hash_table[k].chain;
1040 spin_lock_bh(rt_hash_lock_addr(k));
1041 while ((rth = rcu_dereference_protected(*rthp,
1042 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1043 if (!rt_is_expired(rth) &&
1044 !rt_may_expire(rth, tmo, expire)) {
1045 tmo >>= 1;
1046 rthp = &rth->dst.rt_next;
1047 continue;
1048 }
1049 *rthp = rth->dst.rt_next;
1050 rt_free(rth);
1051 goal--;
1052 }
1053 spin_unlock_bh(rt_hash_lock_addr(k));
1054 if (goal <= 0)
1055 break;
1056 }
1057 rover = k;
1058
1059 if (goal <= 0)
1060 goto work_done;
1061
1062 /* Goal is not achieved. We stop the process if:
1063
1064 - expire has been reduced to zero (otherwise expire is halved);
1065 - the table is not full;
1066 - we are called from interrupt context;
1067 - the jiffies check is just a fallback/debug loop breaker.
1068 We will not spin here for a long time in any case.
1069 */
1070
1071 RT_CACHE_STAT_INC(gc_goal_miss);
1072
1073 if (expire == 0)
1074 break;
1075
1076 expire >>= 1;
1077
1078 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1079 goto out;
1080 } while (!in_softirq() && time_before_eq(jiffies, now));
1081
1082 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1083 goto out;
1084 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1085 goto out;
1086 if (net_ratelimit())
1087 printk(KERN_WARNING "dst cache overflow\n");
1088 RT_CACHE_STAT_INC(gc_dst_overflow);
1089 return 1;
1090
1091 work_done:
1092 expire += ip_rt_gc_min_interval;
1093 if (expire > ip_rt_gc_timeout ||
1094 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1095 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1096 expire = ip_rt_gc_timeout;
1097 out: return 0;
1098 }
1099
1100 /*
1101 * Returns number of entries in a hash chain that have different hash_inputs
1102 */
1103 static int slow_chain_length(const struct rtable *head)
1104 {
1105 int length = 0;
1106 const struct rtable *rth = head;
1107
1108 while (rth) {
1109 length += has_noalias(head, rth);
1110 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1111 }
1112 return length >> FRACT_BITS;
1113 }
1114
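/*
 * Neighbour lookup for IPv4 dsts.  On loopback and point-to-point devices
 * every destination shares a single neighbour entry keyed by 0.0.0.0;
 * otherwise we look up (and if necessary create) an arp_tbl entry for the
 * destination address.
 */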
1115 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1116 {
1117 static const __be32 inaddr_any = 0;
1118 struct net_device *dev = dst->dev;
1119 const __be32 *pkey = daddr;
1120 struct neighbour *n;
1121
1122 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1123 pkey = &inaddr_any;
1124
1125 n = __ipv4_neigh_lookup(&arp_tbl, dev, *(__force u32 *)pkey);
1126 if (n)
1127 return n;
1128 return neigh_create(&arp_tbl, pkey, dev);
1129 }
1130
1131 static int rt_bind_neighbour(struct rtable *rt)
1132 {
1133 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1134 if (IS_ERR(n))
1135 return PTR_ERR(n);
1136 dst_set_neighbour(&rt->dst, n);
1137
1138 return 0;
1139 }
1140
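/*
 * Insert a freshly built route into hash bucket 'hash'.  If an equivalent
 * entry is already chained there it is moved to the front and returned
 * instead; otherwise, once the chain exceeds ip_rt_gc_elasticity, the least
 * valuable unreferenced entry is evicted, and an overly long chain triggers
 * an emergency rebuild of the cache.
 */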
1141 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1142 struct sk_buff *skb, int ifindex)
1143 {
1144 struct rtable *rth, *cand;
1145 struct rtable __rcu **rthp, **candp;
1146 unsigned long now;
1147 u32 min_score;
1148 int chain_length;
1149 int attempts = !in_softirq();
1150
1151 restart:
1152 chain_length = 0;
1153 min_score = ~(u32)0;
1154 cand = NULL;
1155 candp = NULL;
1156 now = jiffies;
1157
1158 if (!rt_caching(dev_net(rt->dst.dev))) {
1159 /*
1160 * If we're not caching, just tell the caller we
1161 * were successful and don't touch the route. The
1162 * caller holds the sole reference to the cache entry, and
1163 * it will be released when the caller is done with it.
1164 * If we drop it here, the callers have no way to resolve routes
1165 * when we're not caching. Instead, just point *rp at rt, so
1166 * the caller gets a single use out of the route
1167 * Note that we do rt_free on this new route entry, so that
1168 * once its refcount hits zero, we are still able to reap it
1169 * (Thanks Alexey)
1170 * Note: To avoid expensive rcu stuff for this uncached dst,
1171 * we set DST_NOCACHE so that dst_release() can free dst without
1172 * waiting a grace period.
1173 */
1174
1175 rt->dst.flags |= DST_NOCACHE;
1176 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1177 int err = rt_bind_neighbour(rt);
1178 if (err) {
1179 if (net_ratelimit())
1180 printk(KERN_WARNING
1181 "Neighbour table failure & not caching routes.\n");
1182 ip_rt_put(rt);
1183 return ERR_PTR(err);
1184 }
1185 }
1186
1187 goto skip_hashing;
1188 }
1189
1190 rthp = &rt_hash_table[hash].chain;
1191
1192 spin_lock_bh(rt_hash_lock_addr(hash));
1193 while ((rth = rcu_dereference_protected(*rthp,
1194 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1195 if (rt_is_expired(rth)) {
1196 *rthp = rth->dst.rt_next;
1197 rt_free(rth);
1198 continue;
1199 }
1200 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1201 /* Put it first */
1202 *rthp = rth->dst.rt_next;
1203 /*
1204 * Since lookup is lockfree, the deletion
1205 * must be visible to another weakly ordered CPU before
1206 * the insertion at the start of the hash chain.
1207 */
1208 rcu_assign_pointer(rth->dst.rt_next,
1209 rt_hash_table[hash].chain);
1210 /*
1211 * Since lookup is lockfree, the update writes
1212 * must be ordered for consistency on SMP.
1213 */
1214 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1215
1216 dst_use(&rth->dst, now);
1217 spin_unlock_bh(rt_hash_lock_addr(hash));
1218
1219 rt_drop(rt);
1220 if (skb)
1221 skb_dst_set(skb, &rth->dst);
1222 return rth;
1223 }
1224
1225 if (!atomic_read(&rth->dst.__refcnt)) {
1226 u32 score = rt_score(rth);
1227
1228 if (score <= min_score) {
1229 cand = rth;
1230 candp = rthp;
1231 min_score = score;
1232 }
1233 }
1234
1235 chain_length++;
1236
1237 rthp = &rth->dst.rt_next;
1238 }
1239
1240 if (cand) {
1241 /* ip_rt_gc_elasticity used to be the average chain length;
1242 * when it is exceeded, gc becomes really aggressive.
1243 *
1244 * The second limit is less certain. At the moment it allows
1245 * only 2 entries per bucket. We will see.
1246 */
1247 if (chain_length > ip_rt_gc_elasticity) {
1248 *candp = cand->dst.rt_next;
1249 rt_free(cand);
1250 }
1251 } else {
1252 if (chain_length > rt_chain_length_max &&
1253 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1254 struct net *net = dev_net(rt->dst.dev);
1255 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1256 if (!rt_caching(net)) {
1257 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1258 rt->dst.dev->name, num);
1259 }
1260 rt_emergency_hash_rebuild(net);
1261 spin_unlock_bh(rt_hash_lock_addr(hash));
1262
1263 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1264 ifindex, rt_genid(net));
1265 goto restart;
1266 }
1267 }
1268
1269 /* Try to bind the route to arp only if it is an output
1270 route or on the unicast forwarding path.
1271 */
1272 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1273 int err = rt_bind_neighbour(rt);
1274 if (err) {
1275 spin_unlock_bh(rt_hash_lock_addr(hash));
1276
1277 if (err != -ENOBUFS) {
1278 rt_drop(rt);
1279 return ERR_PTR(err);
1280 }
1281
1282 /* Neighbour tables are full and nothing
1283 can be released. Try to shrink the route cache,
1284 as it most likely holds some neighbour records.
1285 */
1286 if (attempts-- > 0) {
1287 int saved_elasticity = ip_rt_gc_elasticity;
1288 int saved_int = ip_rt_gc_min_interval;
1289 ip_rt_gc_elasticity = 1;
1290 ip_rt_gc_min_interval = 0;
1291 rt_garbage_collect(&ipv4_dst_ops);
1292 ip_rt_gc_min_interval = saved_int;
1293 ip_rt_gc_elasticity = saved_elasticity;
1294 goto restart;
1295 }
1296
1297 if (net_ratelimit())
1298 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1299 rt_drop(rt);
1300 return ERR_PTR(-ENOBUFS);
1301 }
1302 }
1303
1304 rt->dst.rt_next = rt_hash_table[hash].chain;
1305
1306 /*
1307 * Since lookup is lockfree, we must make sure
1308 * previous writes to rt are committed to memory
1309 * before making rt visible to other CPUS.
1310 */
1311 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1312
1313 spin_unlock_bh(rt_hash_lock_addr(hash));
1314
1315 skip_hashing:
1316 if (skb)
1317 skb_dst_set(skb, &rt->dst);
1318 return rt;
1319 }
1320
1321 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1322
1323 static u32 rt_peer_genid(void)
1324 {
1325 return atomic_read(&__rt_peer_genid);
1326 }
1327
1328 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1329 {
1330 struct inet_peer *peer;
1331
1332 peer = inet_getpeer_v4(daddr, create);
1333
1334 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1335 inet_putpeer(peer);
1336 else
1337 rt->rt_peer_genid = rt_peer_genid();
1338 }
1339
1340 /*
1341 * Peer allocation may fail only in serious out-of-memory conditions. However
1342 * we can still generate some output.
1343 * Random ID selection looks a bit dangerous because we have no chance of
1344 * selecting an ID that is unique within a reasonable period of time.
1345 * But a broken packet identifier may be better than no packet at all.
1346 */
1347 static void ip_select_fb_ident(struct iphdr *iph)
1348 {
1349 static DEFINE_SPINLOCK(ip_fb_id_lock);
1350 static u32 ip_fallback_id;
1351 u32 salt;
1352
1353 spin_lock_bh(&ip_fb_id_lock);
1354 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1355 iph->id = htons(salt & 0xFFFF);
1356 ip_fallback_id = salt;
1357 spin_unlock_bh(&ip_fb_id_lock);
1358 }
1359
1360 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1361 {
1362 struct rtable *rt = (struct rtable *) dst;
1363
1364 if (rt && !(rt->dst.flags & DST_NOPEER)) {
1365 if (rt->peer == NULL)
1366 rt_bind_peer(rt, rt->rt_dst, 1);
1367
1368 /* Once a peer is attached to a destination, it is never detached,
1369 so we do not need to grab a lock to dereference it.
1370 */
1371 if (rt->peer) {
1372 iph->id = htons(inet_getid(rt->peer, more));
1373 return;
1374 }
1375 } else if (!rt)
1376 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1377 __builtin_return_address(0));
1378
1379 ip_select_fb_ident(iph);
1380 }
1381 EXPORT_SYMBOL(__ip_select_ident);
1382
1383 static void rt_del(unsigned hash, struct rtable *rt)
1384 {
1385 struct rtable __rcu **rthp;
1386 struct rtable *aux;
1387
1388 rthp = &rt_hash_table[hash].chain;
1389 spin_lock_bh(rt_hash_lock_addr(hash));
1390 ip_rt_put(rt);
1391 while ((aux = rcu_dereference_protected(*rthp,
1392 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1393 if (aux == rt || rt_is_expired(aux)) {
1394 *rthp = aux->dst.rt_next;
1395 rt_free(aux);
1396 continue;
1397 }
1398 rthp = &aux->dst.rt_next;
1399 }
1400 spin_unlock_bh(rt_hash_lock_addr(hash));
1401 }
1402
1403 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1404 {
1405 struct rtable *rt = (struct rtable *) dst;
1406 __be32 orig_gw = rt->rt_gateway;
1407 struct neighbour *n, *old_n;
1408
1409 dst_confirm(&rt->dst);
1410
1411 rt->rt_gateway = peer->redirect_learned.a4;
1412
1413 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1414 if (IS_ERR(n)) {
1415 rt->rt_gateway = orig_gw;
1416 return;
1417 }
1418 old_n = xchg(&rt->dst._neighbour, n);
1419 if (old_n)
1420 neigh_release(old_n);
1421 if (!(n->nud_state & NUD_VALID)) {
1422 neigh_event_send(n, NULL);
1423 } else {
1424 rt->rt_flags |= RTCF_REDIRECTED;
1425 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1426 }
1427 }
1428
1429 /* called in rcu_read_lock() section */
1430 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1431 __be32 saddr, struct net_device *dev)
1432 {
1433 int s, i;
1434 struct in_device *in_dev = __in_dev_get_rcu(dev);
1435 __be32 skeys[2] = { saddr, 0 };
1436 int ikeys[2] = { dev->ifindex, 0 };
1437 struct inet_peer *peer;
1438 struct net *net;
1439
1440 if (!in_dev)
1441 return;
1442
1443 net = dev_net(dev);
1444 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1445 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1446 ipv4_is_zeronet(new_gw))
1447 goto reject_redirect;
1448
1449 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1450 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1451 goto reject_redirect;
1452 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1453 goto reject_redirect;
1454 } else {
1455 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1456 goto reject_redirect;
1457 }
1458
1459 for (s = 0; s < 2; s++) {
1460 for (i = 0; i < 2; i++) {
1461 unsigned int hash;
1462 struct rtable __rcu **rthp;
1463 struct rtable *rt;
1464
1465 hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1466
1467 rthp = &rt_hash_table[hash].chain;
1468
1469 while ((rt = rcu_dereference(*rthp)) != NULL) {
1470 rthp = &rt->dst.rt_next;
1471
1472 if (rt->rt_key_dst != daddr ||
1473 rt->rt_key_src != skeys[s] ||
1474 rt->rt_oif != ikeys[i] ||
1475 rt_is_input_route(rt) ||
1476 rt_is_expired(rt) ||
1477 !net_eq(dev_net(rt->dst.dev), net) ||
1478 rt->dst.error ||
1479 rt->dst.dev != dev ||
1480 rt->rt_gateway != old_gw)
1481 continue;
1482
1483 if (!rt->peer)
1484 rt_bind_peer(rt, rt->rt_dst, 1);
1485
1486 peer = rt->peer;
1487 if (peer) {
1488 if (peer->redirect_learned.a4 != new_gw ||
1489 peer->redirect_genid != redirect_genid) {
1490 peer->redirect_learned.a4 = new_gw;
1491 peer->redirect_genid = redirect_genid;
1492 atomic_inc(&__rt_peer_genid);
1493 }
1494 check_peer_redir(&rt->dst, peer);
1495 }
1496 }
1497 }
1498 }
1499 return;
1500
1501 reject_redirect:
1502 #ifdef CONFIG_IP_ROUTE_VERBOSE
1503 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1504 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1505 " Advised path = %pI4 -> %pI4\n",
1506 &old_gw, dev->name, &new_gw,
1507 &saddr, &daddr);
1508 #endif
1509 ;
1510 }
1511
1512 static bool peer_pmtu_expired(struct inet_peer *peer)
1513 {
1514 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1515
1516 return orig &&
1517 time_after_eq(jiffies, orig) &&
1518 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1519 }
1520
1521 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1522 {
1523 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1524
1525 return orig &&
1526 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1527 }
1528
1529 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1530 {
1531 struct rtable *rt = (struct rtable *)dst;
1532 struct dst_entry *ret = dst;
1533
1534 if (rt) {
1535 if (dst->obsolete > 0) {
1536 ip_rt_put(rt);
1537 ret = NULL;
1538 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1539 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1540 rt->rt_oif,
1541 rt_genid(dev_net(dst->dev)));
1542 rt_del(hash, rt);
1543 ret = NULL;
1544 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1545 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1546 }
1547 }
1548 return ret;
1549 }
1550
1551 /*
1552 * Algorithm:
1553 * 1. The first ip_rt_redirect_number redirects are sent
1554 * with exponential backoff, then we stop sending them at all,
1555 * assuming that the host ignores our redirects.
1556 * 2. If we did not see packets requiring redirects
1557 * during ip_rt_redirect_silence, we assume that the host
1558 * forgot the redirected route and start to send redirects again.
1559 *
1560 * This algorithm is much cheaper and more intelligent than dumb load limiting
1561 * in icmp.c.
1562 *
1563 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1564 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1565 */
1566
1567 void ip_rt_send_redirect(struct sk_buff *skb)
1568 {
1569 struct rtable *rt = skb_rtable(skb);
1570 struct in_device *in_dev;
1571 struct inet_peer *peer;
1572 int log_martians;
1573
1574 rcu_read_lock();
1575 in_dev = __in_dev_get_rcu(rt->dst.dev);
1576 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1577 rcu_read_unlock();
1578 return;
1579 }
1580 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1581 rcu_read_unlock();
1582
1583 if (!rt->peer)
1584 rt_bind_peer(rt, rt->rt_dst, 1);
1585 peer = rt->peer;
1586 if (!peer) {
1587 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1588 return;
1589 }
1590
1591 /* No redirected packets during ip_rt_redirect_silence;
1592 * reset the algorithm.
1593 */
1594 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1595 peer->rate_tokens = 0;
1596
1597 /* Too many ignored redirects; do not send anything.
1598 * Set dst.rate_last to the last seen redirected packet.
1599 */
1600 if (peer->rate_tokens >= ip_rt_redirect_number) {
1601 peer->rate_last = jiffies;
1602 return;
1603 }
1604
1605 /* Check for load limit; set rate_last to the latest sent
1606 * redirect.
1607 */
1608 if (peer->rate_tokens == 0 ||
1609 time_after(jiffies,
1610 (peer->rate_last +
1611 (ip_rt_redirect_load << peer->rate_tokens)))) {
1612 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1613 peer->rate_last = jiffies;
1614 ++peer->rate_tokens;
1615 #ifdef CONFIG_IP_ROUTE_VERBOSE
1616 if (log_martians &&
1617 peer->rate_tokens == ip_rt_redirect_number &&
1618 net_ratelimit())
1619 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1620 &ip_hdr(skb)->saddr, rt->rt_iif,
1621 &rt->rt_dst, &rt->rt_gateway);
1622 #endif
1623 }
1624 }
1625
1626 static int ip_error(struct sk_buff *skb)
1627 {
1628 struct rtable *rt = skb_rtable(skb);
1629 struct inet_peer *peer;
1630 unsigned long now;
1631 bool send;
1632 int code;
1633
1634 switch (rt->dst.error) {
1635 case EINVAL:
1636 default:
1637 goto out;
1638 case EHOSTUNREACH:
1639 code = ICMP_HOST_UNREACH;
1640 break;
1641 case ENETUNREACH:
1642 code = ICMP_NET_UNREACH;
1643 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1644 IPSTATS_MIB_INNOROUTES);
1645 break;
1646 case EACCES:
1647 code = ICMP_PKT_FILTERED;
1648 break;
1649 }
1650
1651 if (!rt->peer)
1652 rt_bind_peer(rt, rt->rt_dst, 1);
1653 peer = rt->peer;
1654
1655 send = true;
1656 if (peer) {
1657 now = jiffies;
1658 peer->rate_tokens += now - peer->rate_last;
1659 if (peer->rate_tokens > ip_rt_error_burst)
1660 peer->rate_tokens = ip_rt_error_burst;
1661 peer->rate_last = now;
1662 if (peer->rate_tokens >= ip_rt_error_cost)
1663 peer->rate_tokens -= ip_rt_error_cost;
1664 else
1665 send = false;
1666 }
1667 if (send)
1668 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1669
1670 out: kfree_skb(skb);
1671 return 0;
1672 }
1673
1674 /*
1675 * The last two values are not from the RFC but
1676 * are needed for AMPRnet AX.25 paths.
1677 */
1678
1679 static const unsigned short mtu_plateau[] =
1680 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1681
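/*
 * Pick the next-lower PMTU plateau for a host that did not report an MTU:
 * e.g. an old_mtu of 1500 yields 1492, and anything at or below the smallest
 * plateau falls back to 68, the minimum IPv4 MTU.
 */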
1682 static inline unsigned short guess_mtu(unsigned short old_mtu)
1683 {
1684 int i;
1685
1686 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1687 if (old_mtu > mtu_plateau[i])
1688 return mtu_plateau[i];
1689 return 68;
1690 }
1691
1692 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1693 unsigned short new_mtu,
1694 struct net_device *dev)
1695 {
1696 unsigned short old_mtu = ntohs(iph->tot_len);
1697 unsigned short est_mtu = 0;
1698 struct inet_peer *peer;
1699
1700 peer = inet_getpeer_v4(iph->daddr, 1);
1701 if (peer) {
1702 unsigned short mtu = new_mtu;
1703
1704 if (new_mtu < 68 || new_mtu >= old_mtu) {
1705 /* BSD 4.2 derived systems incorrectly adjust
1706 * tot_len by the IP header length, and report
1707 * a zero MTU in the ICMP message.
1708 */
1709 if (mtu == 0 &&
1710 old_mtu >= 68 + (iph->ihl << 2))
1711 old_mtu -= iph->ihl << 2;
1712 mtu = guess_mtu(old_mtu);
1713 }
1714
1715 if (mtu < ip_rt_min_pmtu)
1716 mtu = ip_rt_min_pmtu;
1717 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1718 unsigned long pmtu_expires;
1719
1720 pmtu_expires = jiffies + ip_rt_mtu_expires;
1721 if (!pmtu_expires)
1722 pmtu_expires = 1UL;
1723
1724 est_mtu = mtu;
1725 peer->pmtu_learned = mtu;
1726 peer->pmtu_expires = pmtu_expires;
1727 atomic_inc(&__rt_peer_genid);
1728 }
1729
1730 inet_putpeer(peer);
1731 }
1732 return est_mtu ? : new_mtu;
1733 }
1734
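/*
 * Apply the peer's learned path MTU to this dst while it is still valid;
 * save the original MTU the first time we shrink it, and restore it once
 * the learned value expires.
 */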
1735 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1736 {
1737 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1738
1739 if (!expires)
1740 return;
1741 if (time_before(jiffies, expires)) {
1742 u32 orig_dst_mtu = dst_mtu(dst);
1743 if (peer->pmtu_learned < orig_dst_mtu) {
1744 if (!peer->pmtu_orig)
1745 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1746 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1747 }
1748 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1749 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1750 }
1751
1752 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1753 {
1754 struct rtable *rt = (struct rtable *) dst;
1755 struct inet_peer *peer;
1756
1757 dst_confirm(dst);
1758
1759 if (!rt->peer)
1760 rt_bind_peer(rt, rt->rt_dst, 1);
1761 peer = rt->peer;
1762 if (peer) {
1763 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1764
1765 if (mtu < ip_rt_min_pmtu)
1766 mtu = ip_rt_min_pmtu;
1767 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1768
1769 pmtu_expires = jiffies + ip_rt_mtu_expires;
1770 if (!pmtu_expires)
1771 pmtu_expires = 1UL;
1772
1773 peer->pmtu_learned = mtu;
1774 peer->pmtu_expires = pmtu_expires;
1775
1776 atomic_inc(&__rt_peer_genid);
1777 rt->rt_peer_genid = rt_peer_genid();
1778 }
1779 check_peer_pmtu(dst, peer);
1780 }
1781 }
1782
1783
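/*
 * Re-sync the route with its inet_peer when the global peer generation
 * counter has moved on: pick up any newly learned PMTU and any redirect
 * learned for this destination.
 */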
1784 static void ipv4_validate_peer(struct rtable *rt)
1785 {
1786 if (rt->rt_peer_genid != rt_peer_genid()) {
1787 struct inet_peer *peer;
1788
1789 if (!rt->peer)
1790 rt_bind_peer(rt, rt->rt_dst, 0);
1791
1792 peer = rt->peer;
1793 if (peer) {
1794 check_peer_pmtu(&rt->dst, peer);
1795
1796 if (peer->redirect_genid != redirect_genid)
1797 peer->redirect_learned.a4 = 0;
1798 if (peer->redirect_learned.a4 &&
1799 peer->redirect_learned.a4 != rt->rt_gateway)
1800 check_peer_redir(&rt->dst, peer);
1801 }
1802
1803 rt->rt_peer_genid = rt_peer_genid();
1804 }
1805 }
1806
1807 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1808 {
1809 struct rtable *rt = (struct rtable *) dst;
1810
1811 if (rt_is_expired(rt))
1812 return NULL;
1813 ipv4_validate_peer(rt);
1814 return dst;
1815 }
1816
1817 static void ipv4_dst_destroy(struct dst_entry *dst)
1818 {
1819 struct rtable *rt = (struct rtable *) dst;
1820 struct inet_peer *peer = rt->peer;
1821
1822 if (rt->fi) {
1823 fib_info_put(rt->fi);
1824 rt->fi = NULL;
1825 }
1826 if (peer) {
1827 rt->peer = NULL;
1828 inet_putpeer(peer);
1829 }
1830 }
1831
1832
1833 static void ipv4_link_failure(struct sk_buff *skb)
1834 {
1835 struct rtable *rt;
1836
1837 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1838
1839 rt = skb_rtable(skb);
1840 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1841 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1842 }
1843
1844 static int ip_rt_bug(struct sk_buff *skb)
1845 {
1846 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1847 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1848 skb->dev ? skb->dev->name : "?");
1849 kfree_skb(skb);
1850 WARN_ON(1);
1851 return 0;
1852 }
1853
1854 /*
1855 We do not cache the source address of the outgoing interface,
1856 because it is used only by the IP RR, TS and SRR options,
1857 so it is out of the fast path.
1858
1859 BTW remember: "addr" is allowed to be unaligned
1860 in IP options!
1861 */
1862
1863 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1864 {
1865 __be32 src;
1866
1867 if (rt_is_output_route(rt))
1868 src = ip_hdr(skb)->saddr;
1869 else {
1870 struct fib_result res;
1871 struct flowi4 fl4;
1872 struct iphdr *iph;
1873
1874 iph = ip_hdr(skb);
1875
1876 memset(&fl4, 0, sizeof(fl4));
1877 fl4.daddr = iph->daddr;
1878 fl4.saddr = iph->saddr;
1879 fl4.flowi4_tos = RT_TOS(iph->tos);
1880 fl4.flowi4_oif = rt->dst.dev->ifindex;
1881 fl4.flowi4_iif = skb->dev->ifindex;
1882 fl4.flowi4_mark = skb->mark;
1883
1884 rcu_read_lock();
1885 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1886 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1887 else
1888 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1889 RT_SCOPE_UNIVERSE);
1890 rcu_read_unlock();
1891 }
1892 memcpy(addr, &src, 4);
1893 }
1894
1895 #ifdef CONFIG_IP_ROUTE_CLASSID
1896 static void set_class_tag(struct rtable *rt, u32 tag)
1897 {
1898 if (!(rt->dst.tclassid & 0xFFFF))
1899 rt->dst.tclassid |= tag & 0xFFFF;
1900 if (!(rt->dst.tclassid & 0xFFFF0000))
1901 rt->dst.tclassid |= tag & 0xFFFF0000;
1902 }
1903 #endif
1904
1905 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1906 {
1907 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1908
1909 if (advmss == 0) {
1910 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1911 ip_rt_min_advmss);
1912 if (advmss > 65535 - 40)
1913 advmss = 65535 - 40;
1914 }
1915 return advmss;
1916 }
1917
1918 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1919 {
1920 const struct rtable *rt = (const struct rtable *) dst;
1921 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1922
1923 if (mtu && rt_is_output_route(rt))
1924 return mtu;
1925
1926 mtu = dst->dev->mtu;
1927
1928 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1929
1930 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1931 mtu = 576;
1932 }
1933
1934 if (mtu > IP_MAX_MTU)
1935 mtu = IP_MAX_MTU;
1936
1937 return mtu;
1938 }
1939
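/*
 * Initialise the route's metrics: prefer the per-destination inet_peer copy
 * (created on demand when the flow asked for pre-COWed metrics) so that
 * learned PMTU and redirect state survive the cache entry; otherwise point
 * the dst at the read-only fib_info metrics.
 */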
1940 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1941 struct fib_info *fi)
1942 {
1943 struct inet_peer *peer;
1944 int create = 0;
1945
1946 /* If a peer entry exists for this destination, we must hook
1947 * it up in order to get at cached metrics.
1948 */
1949 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1950 create = 1;
1951
1952 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1953 if (peer) {
1954 rt->rt_peer_genid = rt_peer_genid();
1955 if (inet_metrics_new(peer))
1956 memcpy(peer->metrics, fi->fib_metrics,
1957 sizeof(u32) * RTAX_MAX);
1958 dst_init_metrics(&rt->dst, peer->metrics, false);
1959
1960 check_peer_pmtu(&rt->dst, peer);
1961 if (peer->redirect_genid != redirect_genid)
1962 peer->redirect_learned.a4 = 0;
1963 if (peer->redirect_learned.a4 &&
1964 peer->redirect_learned.a4 != rt->rt_gateway) {
1965 rt->rt_gateway = peer->redirect_learned.a4;
1966 rt->rt_flags |= RTCF_REDIRECTED;
1967 }
1968 } else {
1969 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1970 rt->fi = fi;
1971 atomic_inc(&fi->fib_clntref);
1972 }
1973 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1974 }
1975 }
1976
1977 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1978 const struct fib_result *res,
1979 struct fib_info *fi, u16 type, u32 itag)
1980 {
1981 struct dst_entry *dst = &rt->dst;
1982
1983 if (fi) {
1984 if (FIB_RES_GW(*res) &&
1985 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1986 rt->rt_gateway = FIB_RES_GW(*res);
1987 rt_init_metrics(rt, fl4, fi);
1988 #ifdef CONFIG_IP_ROUTE_CLASSID
1989 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1990 #endif
1991 }
1992
1993 if (dst_mtu(dst) > IP_MAX_MTU)
1994 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1995 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1996 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1997
1998 #ifdef CONFIG_IP_ROUTE_CLASSID
1999 #ifdef CONFIG_IP_MULTIPLE_TABLES
2000 set_class_tag(rt, fib_rules_tclass(res));
2001 #endif
2002 set_class_tag(rt, itag);
2003 #endif
2004 }
2005
2006 static struct rtable *rt_dst_alloc(struct net_device *dev,
2007 bool nopolicy, bool noxfrm)
2008 {
2009 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2010 DST_HOST |
2011 (nopolicy ? DST_NOPOLICY : 0) |
2012 (noxfrm ? DST_NOXFRM : 0));
2013 }
2014
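/*
 * Build and cache a route for a multicast packet received on @dev,
 * after sanity-checking the source address.
 */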
2015 /* called in rcu_read_lock() section */
2016 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2017 u8 tos, struct net_device *dev, int our)
2018 {
2019 unsigned int hash;
2020 struct rtable *rth;
2021 __be32 spec_dst;
2022 struct in_device *in_dev = __in_dev_get_rcu(dev);
2023 u32 itag = 0;
2024 int err;
2025
2026 /* Primary sanity checks. */
2027
2028 if (in_dev == NULL)
2029 return -EINVAL;
2030
2031 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2032 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2033 goto e_inval;
2034
2035 if (ipv4_is_zeronet(saddr)) {
2036 if (!ipv4_is_local_multicast(daddr))
2037 goto e_inval;
2038 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2039 } else {
2040 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2041 &itag);
2042 if (err < 0)
2043 goto e_err;
2044 }
2045 rth = rt_dst_alloc(init_net.loopback_dev,
2046 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2047 if (!rth)
2048 goto e_nobufs;
2049
2050 #ifdef CONFIG_IP_ROUTE_CLASSID
2051 rth->dst.tclassid = itag;
2052 #endif
2053 rth->dst.output = ip_rt_bug;
2054
2055 rth->rt_key_dst = daddr;
2056 rth->rt_key_src = saddr;
2057 rth->rt_genid = rt_genid(dev_net(dev));
2058 rth->rt_flags = RTCF_MULTICAST;
2059 rth->rt_type = RTN_MULTICAST;
2060 rth->rt_key_tos = tos;
2061 rth->rt_dst = daddr;
2062 rth->rt_src = saddr;
2063 rth->rt_route_iif = dev->ifindex;
2064 rth->rt_iif = dev->ifindex;
2065 rth->rt_oif = 0;
2066 rth->rt_mark = skb->mark;
2067 rth->rt_gateway = daddr;
2068 rth->rt_spec_dst= spec_dst;
2069 rth->rt_peer_genid = 0;
2070 rth->peer = NULL;
2071 rth->fi = NULL;
2072 if (our) {
2073 rth->dst.input= ip_local_deliver;
2074 rth->rt_flags |= RTCF_LOCAL;
2075 }
2076
2077 #ifdef CONFIG_IP_MROUTE
2078 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2079 rth->dst.input = ip_mr_input;
2080 #endif
2081 RT_CACHE_STAT_INC(in_slow_mc);
2082
2083 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2084 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2085 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2086
2087 e_nobufs:
2088 return -ENOBUFS;
2089 e_inval:
2090 return -EINVAL;
2091 e_err:
2092 return err;
2093 }
2094
2095
2096 static void ip_handle_martian_source(struct net_device *dev,
2097 struct in_device *in_dev,
2098 struct sk_buff *skb,
2099 __be32 daddr,
2100 __be32 saddr)
2101 {
2102 RT_CACHE_STAT_INC(in_martian_src);
2103 #ifdef CONFIG_IP_ROUTE_VERBOSE
2104 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2105 /*
2106 		 *	RFC 1812 recommendation: if the source is martian,
2107 		 *	the only hint is the MAC header.
2108 */
2109 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2110 &daddr, &saddr, dev->name);
2111 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2112 int i;
2113 const unsigned char *p = skb_mac_header(skb);
2114 printk(KERN_WARNING "ll header: ");
2115 for (i = 0; i < dev->hard_header_len; i++, p++) {
2116 printk("%02x", *p);
2117 if (i < (dev->hard_header_len - 1))
2118 printk(":");
2119 }
2120 printk("\n");
2121 }
2122 }
2123 #endif
2124 }
2125
2126 /* called in rcu_read_lock() section */
2127 static int __mkroute_input(struct sk_buff *skb,
2128 const struct fib_result *res,
2129 struct in_device *in_dev,
2130 __be32 daddr, __be32 saddr, u32 tos,
2131 struct rtable **result)
2132 {
2133 struct rtable *rth;
2134 int err;
2135 struct in_device *out_dev;
2136 unsigned int flags = 0;
2137 __be32 spec_dst;
2138 u32 itag;
2139
2140 /* get a working reference to the output device */
2141 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2142 if (out_dev == NULL) {
2143 if (net_ratelimit())
2144 			printk(KERN_CRIT "Bug in ip_route_input"
2145 			       "_slow(). Please report.\n");
2146 return -EINVAL;
2147 }
2148
2149
2150 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2151 in_dev->dev, &spec_dst, &itag);
2152 if (err < 0) {
2153 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2154 saddr);
2155
2156 goto cleanup;
2157 }
2158
2159 if (err)
2160 flags |= RTCF_DIRECTSRC;
2161
2162 if (out_dev == in_dev && err &&
2163 (IN_DEV_SHARED_MEDIA(out_dev) ||
2164 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2165 flags |= RTCF_DOREDIRECT;
2166
2167 if (skb->protocol != htons(ETH_P_IP)) {
2168 		/* Not IP (i.e. ARP). Do not create a route if it is
2169 		 * invalid for proxy ARP. DNAT routes are always valid.
2170 		 *
2171 		 * The proxy ARP feature has been extended to allow ARP
2172 		 * replies back out the same interface, to support
2173 		 * private VLAN switch technologies. See arp.c.
2174 */
2175 if (out_dev == in_dev &&
2176 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2177 err = -EINVAL;
2178 goto cleanup;
2179 }
2180 }
2181
2182 rth = rt_dst_alloc(out_dev->dev,
2183 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2184 IN_DEV_CONF_GET(out_dev, NOXFRM));
2185 if (!rth) {
2186 err = -ENOBUFS;
2187 goto cleanup;
2188 }
2189
2190 rth->rt_key_dst = daddr;
2191 rth->rt_key_src = saddr;
2192 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2193 rth->rt_flags = flags;
2194 rth->rt_type = res->type;
2195 rth->rt_key_tos = tos;
2196 rth->rt_dst = daddr;
2197 rth->rt_src = saddr;
2198 rth->rt_route_iif = in_dev->dev->ifindex;
2199 rth->rt_iif = in_dev->dev->ifindex;
2200 rth->rt_oif = 0;
2201 rth->rt_mark = skb->mark;
2202 rth->rt_gateway = daddr;
2203 rth->rt_spec_dst= spec_dst;
2204 rth->rt_peer_genid = 0;
2205 rth->peer = NULL;
2206 rth->fi = NULL;
2207
2208 rth->dst.input = ip_forward;
2209 rth->dst.output = ip_output;
2210
2211 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2212
2213 *result = rth;
2214 err = 0;
2215 cleanup:
2216 return err;
2217 }
2218
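/*
 * Wrapper around __mkroute_input(): picks a multipath nexthop when
 * configured, builds the route and inserts it into the route cache.
 */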
2219 static int ip_mkroute_input(struct sk_buff *skb,
2220 struct fib_result *res,
2221 const struct flowi4 *fl4,
2222 struct in_device *in_dev,
2223 __be32 daddr, __be32 saddr, u32 tos)
2224 {
2225 struct rtable* rth = NULL;
2226 int err;
2227 unsigned hash;
2228
2229 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2230 if (res->fi && res->fi->fib_nhs > 1)
2231 fib_select_multipath(res);
2232 #endif
2233
2234 /* create a routing cache entry */
2235 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2236 if (err)
2237 return err;
2238
2239 /* put it into the cache */
2240 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2241 rt_genid(dev_net(rth->dst.dev)));
2242 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2243 if (IS_ERR(rth))
2244 return PTR_ERR(rth);
2245 return 0;
2246 }
2247
2248 /*
2249  *	NOTE. We drop all packets that have a local source
2250  *	address, because every properly looped-back packet must
2251  *	already have the correct destination attached by the output routine.
2252  *
2253  *	This approach solves two big problems:
2254  *	1. Non-simplex devices are handled properly.
2255  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2256  *	Called with rcu_read_lock().
2257 */
2258
2259 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2260 u8 tos, struct net_device *dev)
2261 {
2262 struct fib_result res;
2263 struct in_device *in_dev = __in_dev_get_rcu(dev);
2264 struct flowi4 fl4;
2265 unsigned flags = 0;
2266 u32 itag = 0;
2267 struct rtable * rth;
2268 unsigned hash;
2269 __be32 spec_dst;
2270 int err = -EINVAL;
2271 struct net * net = dev_net(dev);
2272
2273 /* IP on this device is disabled. */
2274
2275 if (!in_dev)
2276 goto out;
2277
2278 	/* Check for the weirdest martians, which cannot be detected
2279 	   by fib_lookup.
2280 */
2281
2282 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2283 ipv4_is_loopback(saddr))
2284 goto martian_source;
2285
2286 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2287 goto brd_input;
2288
2289 	/* Accept zero addresses only for limited broadcast;
2290 	 * I am not even sure whether this should be fixed. Waiting for complaints :-)
2291 */
2292 if (ipv4_is_zeronet(saddr))
2293 goto martian_source;
2294
2295 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2296 goto martian_destination;
2297
2298 /*
2299 * Now we are ready to route packet.
2300 */
2301 fl4.flowi4_oif = 0;
2302 fl4.flowi4_iif = dev->ifindex;
2303 fl4.flowi4_mark = skb->mark;
2304 fl4.flowi4_tos = tos;
2305 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2306 fl4.daddr = daddr;
2307 fl4.saddr = saddr;
2308 err = fib_lookup(net, &fl4, &res);
2309 if (err != 0) {
2310 if (!IN_DEV_FORWARD(in_dev))
2311 goto e_hostunreach;
2312 goto no_route;
2313 }
2314
2315 RT_CACHE_STAT_INC(in_slow_tot);
2316
2317 if (res.type == RTN_BROADCAST)
2318 goto brd_input;
2319
2320 if (res.type == RTN_LOCAL) {
2321 err = fib_validate_source(skb, saddr, daddr, tos,
2322 net->loopback_dev->ifindex,
2323 dev, &spec_dst, &itag);
2324 if (err < 0)
2325 goto martian_source_keep_err;
2326 if (err)
2327 flags |= RTCF_DIRECTSRC;
2328 spec_dst = daddr;
2329 goto local_input;
2330 }
2331
2332 if (!IN_DEV_FORWARD(in_dev))
2333 goto e_hostunreach;
2334 if (res.type != RTN_UNICAST)
2335 goto martian_destination;
2336
2337 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2338 out: return err;
2339
2340 brd_input:
2341 if (skb->protocol != htons(ETH_P_IP))
2342 goto e_inval;
2343
2344 if (ipv4_is_zeronet(saddr))
2345 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2346 else {
2347 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2348 &itag);
2349 if (err < 0)
2350 goto martian_source_keep_err;
2351 if (err)
2352 flags |= RTCF_DIRECTSRC;
2353 }
2354 flags |= RTCF_BROADCAST;
2355 res.type = RTN_BROADCAST;
2356 RT_CACHE_STAT_INC(in_brd);
2357
2358 local_input:
2359 rth = rt_dst_alloc(net->loopback_dev,
2360 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2361 if (!rth)
2362 goto e_nobufs;
2363
2364 rth->dst.input= ip_local_deliver;
2365 rth->dst.output= ip_rt_bug;
2366 #ifdef CONFIG_IP_ROUTE_CLASSID
2367 rth->dst.tclassid = itag;
2368 #endif
2369
2370 rth->rt_key_dst = daddr;
2371 rth->rt_key_src = saddr;
2372 rth->rt_genid = rt_genid(net);
2373 rth->rt_flags = flags|RTCF_LOCAL;
2374 rth->rt_type = res.type;
2375 rth->rt_key_tos = tos;
2376 rth->rt_dst = daddr;
2377 rth->rt_src = saddr;
2381 rth->rt_route_iif = dev->ifindex;
2382 rth->rt_iif = dev->ifindex;
2383 rth->rt_oif = 0;
2384 rth->rt_mark = skb->mark;
2385 rth->rt_gateway = daddr;
2386 rth->rt_spec_dst= spec_dst;
2387 rth->rt_peer_genid = 0;
2388 rth->peer = NULL;
2389 rth->fi = NULL;
2390 if (res.type == RTN_UNREACHABLE) {
2391 rth->dst.input= ip_error;
2392 rth->dst.error= -err;
2393 rth->rt_flags &= ~RTCF_LOCAL;
2394 }
2395 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2396 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2397 err = 0;
2398 if (IS_ERR(rth))
2399 err = PTR_ERR(rth);
2400 goto out;
2401
2402 no_route:
2403 RT_CACHE_STAT_INC(in_no_route);
2404 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2405 res.type = RTN_UNREACHABLE;
2406 if (err == -ESRCH)
2407 err = -ENETUNREACH;
2408 goto local_input;
2409
2410 /*
2411 * Do not cache martian addresses: they should be logged (RFC1812)
2412 */
2413 martian_destination:
2414 RT_CACHE_STAT_INC(in_martian_dst);
2415 #ifdef CONFIG_IP_ROUTE_VERBOSE
2416 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2417 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2418 &daddr, &saddr, dev->name);
2419 #endif
2420
2421 e_hostunreach:
2422 err = -EHOSTUNREACH;
2423 goto out;
2424
2425 e_inval:
2426 err = -EINVAL;
2427 goto out;
2428
2429 e_nobufs:
2430 err = -ENOBUFS;
2431 goto out;
2432
2433 martian_source:
2434 err = -EINVAL;
2435 martian_source_keep_err:
2436 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2437 goto out;
2438 }
2439
2440 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2441 u8 tos, struct net_device *dev, bool noref)
2442 {
2443 struct rtable * rth;
2444 unsigned hash;
2445 int iif = dev->ifindex;
2446 struct net *net;
2447 int res;
2448
2449 net = dev_net(dev);
2450
2451 rcu_read_lock();
2452
2453 if (!rt_caching(net))
2454 goto skip_cache;
2455
2456 tos &= IPTOS_RT_MASK;
2457 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2458
2459 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2460 rth = rcu_dereference(rth->dst.rt_next)) {
2461 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2462 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2463 (rth->rt_route_iif ^ iif) |
2464 (rth->rt_key_tos ^ tos)) == 0 &&
2465 rth->rt_mark == skb->mark &&
2466 net_eq(dev_net(rth->dst.dev), net) &&
2467 !rt_is_expired(rth)) {
2468 ipv4_validate_peer(rth);
2469 if (noref) {
2470 dst_use_noref(&rth->dst, jiffies);
2471 skb_dst_set_noref(skb, &rth->dst);
2472 } else {
2473 dst_use(&rth->dst, jiffies);
2474 skb_dst_set(skb, &rth->dst);
2475 }
2476 RT_CACHE_STAT_INC(in_hit);
2477 rcu_read_unlock();
2478 return 0;
2479 }
2480 RT_CACHE_STAT_INC(in_hlist_search);
2481 }
2482
2483 skip_cache:
2484 	/* Multicast recognition logic has been moved from the route cache to here.
2485 	   The problem was that too many Ethernet cards have broken/missing
2486 	   hardware multicast filters :-( As a result, a host on a multicast
2487 	   network acquires a lot of useless route cache entries, e.g. from
2488 	   SDR messages from all over the world. Now we try to get rid of them.
2489 	   Really, provided the software IP multicast filter is organized
2490 	   reasonably (at least, hashed), it does not result in a slowdown
2491 	   compared with route cache reject entries.
2492 	   Note that multicast routers are not affected, because a
2493 	   route cache entry is created eventually.
2494 */
2495 if (ipv4_is_multicast(daddr)) {
2496 struct in_device *in_dev = __in_dev_get_rcu(dev);
2497
2498 if (in_dev) {
2499 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2500 ip_hdr(skb)->protocol);
2501 if (our
2502 #ifdef CONFIG_IP_MROUTE
2503 ||
2504 (!ipv4_is_local_multicast(daddr) &&
2505 IN_DEV_MFORWARD(in_dev))
2506 #endif
2507 ) {
2508 int res = ip_route_input_mc(skb, daddr, saddr,
2509 tos, dev, our);
2510 rcu_read_unlock();
2511 return res;
2512 }
2513 }
2514 rcu_read_unlock();
2515 return -EINVAL;
2516 }
2517 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2518 rcu_read_unlock();
2519 return res;
2520 }
2521 EXPORT_SYMBOL(ip_route_input_common);
2522
2523 /* called with rcu_read_lock() */
2524 static struct rtable *__mkroute_output(const struct fib_result *res,
2525 const struct flowi4 *fl4,
2526 __be32 orig_daddr, __be32 orig_saddr,
2527 int orig_oif, __u8 orig_rtos,
2528 struct net_device *dev_out,
2529 unsigned int flags)
2530 {
2531 struct fib_info *fi = res->fi;
2532 struct in_device *in_dev;
2533 u16 type = res->type;
2534 struct rtable *rth;
2535
2536 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2537 return ERR_PTR(-EINVAL);
2538
2539 if (ipv4_is_lbcast(fl4->daddr))
2540 type = RTN_BROADCAST;
2541 else if (ipv4_is_multicast(fl4->daddr))
2542 type = RTN_MULTICAST;
2543 else if (ipv4_is_zeronet(fl4->daddr))
2544 return ERR_PTR(-EINVAL);
2545
2546 if (dev_out->flags & IFF_LOOPBACK)
2547 flags |= RTCF_LOCAL;
2548
2549 in_dev = __in_dev_get_rcu(dev_out);
2550 if (!in_dev)
2551 return ERR_PTR(-EINVAL);
2552
2553 if (type == RTN_BROADCAST) {
2554 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2555 fi = NULL;
2556 } else if (type == RTN_MULTICAST) {
2557 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2558 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2559 fl4->flowi4_proto))
2560 flags &= ~RTCF_LOCAL;
2561 		/* If a multicast route does not exist, use the
2562 		 * default one, but do not gateway in this case.
2563 		 * Yes, it is a hack.
2564 */
2565 if (fi && res->prefixlen < 4)
2566 fi = NULL;
2567 }
2568
2569 rth = rt_dst_alloc(dev_out,
2570 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2571 IN_DEV_CONF_GET(in_dev, NOXFRM));
2572 if (!rth)
2573 return ERR_PTR(-ENOBUFS);
2574
2575 rth->dst.output = ip_output;
2576
2577 rth->rt_key_dst = orig_daddr;
2578 rth->rt_key_src = orig_saddr;
2579 rth->rt_genid = rt_genid(dev_net(dev_out));
2580 rth->rt_flags = flags;
2581 rth->rt_type = type;
2582 rth->rt_key_tos = orig_rtos;
2583 rth->rt_dst = fl4->daddr;
2584 rth->rt_src = fl4->saddr;
2585 rth->rt_route_iif = 0;
2586 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2587 rth->rt_oif = orig_oif;
2588 rth->rt_mark = fl4->flowi4_mark;
2589 rth->rt_gateway = fl4->daddr;
2590 rth->rt_spec_dst= fl4->saddr;
2591 rth->rt_peer_genid = 0;
2592 rth->peer = NULL;
2593 rth->fi = NULL;
2594
2595 RT_CACHE_STAT_INC(out_slow_tot);
2596
2597 if (flags & RTCF_LOCAL) {
2598 rth->dst.input = ip_local_deliver;
2599 rth->rt_spec_dst = fl4->daddr;
2600 }
2601 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2602 rth->rt_spec_dst = fl4->saddr;
2603 if (flags & RTCF_LOCAL &&
2604 !(dev_out->flags & IFF_LOOPBACK)) {
2605 rth->dst.output = ip_mc_output;
2606 RT_CACHE_STAT_INC(out_slow_mc);
2607 }
2608 #ifdef CONFIG_IP_MROUTE
2609 if (type == RTN_MULTICAST) {
2610 if (IN_DEV_MFORWARD(in_dev) &&
2611 !ipv4_is_local_multicast(fl4->daddr)) {
2612 rth->dst.input = ip_mr_input;
2613 rth->dst.output = ip_mc_output;
2614 }
2615 }
2616 #endif
2617 }
2618
2619 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2620
2621 return rth;
2622 }
2623
2624 /*
2625 * Major route resolver routine.
2626 * called with rcu_read_lock();
2627 */
2628
2629 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2630 {
2631 struct net_device *dev_out = NULL;
2632 __u8 tos = RT_FL_TOS(fl4);
2633 unsigned int flags = 0;
2634 struct fib_result res;
2635 struct rtable *rth;
2636 __be32 orig_daddr;
2637 __be32 orig_saddr;
2638 int orig_oif;
2639
2640 res.fi = NULL;
2641 #ifdef CONFIG_IP_MULTIPLE_TABLES
2642 res.r = NULL;
2643 #endif
2644
2645 orig_daddr = fl4->daddr;
2646 orig_saddr = fl4->saddr;
2647 orig_oif = fl4->flowi4_oif;
2648
2649 fl4->flowi4_iif = net->loopback_dev->ifindex;
2650 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2651 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2652 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2653
2654 rcu_read_lock();
2655 if (fl4->saddr) {
2656 rth = ERR_PTR(-EINVAL);
2657 if (ipv4_is_multicast(fl4->saddr) ||
2658 ipv4_is_lbcast(fl4->saddr) ||
2659 ipv4_is_zeronet(fl4->saddr))
2660 goto out;
2661
2662 		/* I removed the check for oif == dev_out->oif here.
2663 		   It was wrong for two reasons:
2664 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2665 		      is assigned to multiple interfaces.
2666 		   2. Moreover, we are allowed to send packets with the saddr
2667 		      of another iface. --ANK
2668 */
2669
2670 if (fl4->flowi4_oif == 0 &&
2671 (ipv4_is_multicast(fl4->daddr) ||
2672 ipv4_is_lbcast(fl4->daddr))) {
2673 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2674 dev_out = __ip_dev_find(net, fl4->saddr, false);
2675 if (dev_out == NULL)
2676 goto out;
2677
2678 			/* Special hack: the user can direct multicasts
2679 			   and limited broadcast via the necessary interface
2680 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2681 			   This hack is not just for fun; it allows
2682 			   vic, vat and friends to work.
2683 			   They bind a socket to loopback, set the ttl to zero
2684 			   and expect that it will work.
2685 			   From the viewpoint of the routing cache they are broken,
2686 			   because we are not allowed to build a multicast path
2687 			   with a loopback source addr (the routing cache
2688 			   cannot know that the ttl is zero, so the packet
2689 			   will never leave this host and the route is valid).
2690 			   Luckily, this hack is a good workaround.
2691 */
2692
2693 fl4->flowi4_oif = dev_out->ifindex;
2694 goto make_route;
2695 }
2696
2697 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2698 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2699 if (!__ip_dev_find(net, fl4->saddr, false))
2700 goto out;
2701 }
2702 }
2703
2704
2705 if (fl4->flowi4_oif) {
2706 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2707 rth = ERR_PTR(-ENODEV);
2708 if (dev_out == NULL)
2709 goto out;
2710
2711 /* RACE: Check return value of inet_select_addr instead. */
2712 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2713 rth = ERR_PTR(-ENETUNREACH);
2714 goto out;
2715 }
2716 if (ipv4_is_local_multicast(fl4->daddr) ||
2717 ipv4_is_lbcast(fl4->daddr)) {
2718 if (!fl4->saddr)
2719 fl4->saddr = inet_select_addr(dev_out, 0,
2720 RT_SCOPE_LINK);
2721 goto make_route;
2722 }
2723 if (fl4->saddr) {
2724 if (ipv4_is_multicast(fl4->daddr))
2725 fl4->saddr = inet_select_addr(dev_out, 0,
2726 fl4->flowi4_scope);
2727 else if (!fl4->daddr)
2728 fl4->saddr = inet_select_addr(dev_out, 0,
2729 RT_SCOPE_HOST);
2730 }
2731 }
2732
2733 if (!fl4->daddr) {
2734 fl4->daddr = fl4->saddr;
2735 if (!fl4->daddr)
2736 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2737 dev_out = net->loopback_dev;
2738 fl4->flowi4_oif = net->loopback_dev->ifindex;
2739 res.type = RTN_LOCAL;
2740 flags |= RTCF_LOCAL;
2741 goto make_route;
2742 }
2743
2744 if (fib_lookup(net, fl4, &res)) {
2745 res.fi = NULL;
2746 if (fl4->flowi4_oif) {
2747 			/* Apparently, the routing tables are wrong. Assume
2748 			   that the destination is on-link.
2749 
2750 			   WHY? DW.
2751 			   Because we are allowed to send to an iface
2752 			   even if it has NO routes and NO assigned
2753 			   addresses. When oif is specified, the routing
2754 			   tables are looked up with only one purpose:
2755 			   to catch whether the destination is gatewayed rather than
2756 			   direct. Moreover, if MSG_DONTROUTE is set,
2757 			   we send the packet, ignoring both routing tables
2758 			   and ifaddr state. --ANK
2759 
2760 
2761 			   We could do this even if oif is unknown,
2762 			   as IPv6 likely does, but we do not.
2763 */
2764
2765 if (fl4->saddr == 0)
2766 fl4->saddr = inet_select_addr(dev_out, 0,
2767 RT_SCOPE_LINK);
2768 res.type = RTN_UNICAST;
2769 goto make_route;
2770 }
2771 rth = ERR_PTR(-ENETUNREACH);
2772 goto out;
2773 }
2774
2775 if (res.type == RTN_LOCAL) {
2776 if (!fl4->saddr) {
2777 if (res.fi->fib_prefsrc)
2778 fl4->saddr = res.fi->fib_prefsrc;
2779 else
2780 fl4->saddr = fl4->daddr;
2781 }
2782 dev_out = net->loopback_dev;
2783 fl4->flowi4_oif = dev_out->ifindex;
2784 res.fi = NULL;
2785 flags |= RTCF_LOCAL;
2786 goto make_route;
2787 }
2788
2789 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2790 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2791 fib_select_multipath(&res);
2792 else
2793 #endif
2794 if (!res.prefixlen &&
2795 res.table->tb_num_default > 1 &&
2796 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2797 fib_select_default(&res);
2798
2799 if (!fl4->saddr)
2800 fl4->saddr = FIB_RES_PREFSRC(net, res);
2801
2802 dev_out = FIB_RES_DEV(res);
2803 fl4->flowi4_oif = dev_out->ifindex;
2804
2805
2806 make_route:
2807 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2808 tos, dev_out, flags);
2809 if (!IS_ERR(rth)) {
2810 unsigned int hash;
2811
2812 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2813 rt_genid(dev_net(dev_out)));
2814 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2815 }
2816
2817 out:
2818 rcu_read_unlock();
2819 return rth;
2820 }
2821
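/*
 * Output route lookup: probe the route cache first and fall back to
 * ip_route_output_slow() on a miss (or when caching is disabled).
 */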
2822 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2823 {
2824 struct rtable *rth;
2825 unsigned int hash;
2826
2827 if (!rt_caching(net))
2828 goto slow_output;
2829
2830 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2831
2832 rcu_read_lock_bh();
2833 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2834 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2835 if (rth->rt_key_dst == flp4->daddr &&
2836 rth->rt_key_src == flp4->saddr &&
2837 rt_is_output_route(rth) &&
2838 rth->rt_oif == flp4->flowi4_oif &&
2839 rth->rt_mark == flp4->flowi4_mark &&
2840 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2841 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2842 net_eq(dev_net(rth->dst.dev), net) &&
2843 !rt_is_expired(rth)) {
2844 ipv4_validate_peer(rth);
2845 dst_use(&rth->dst, jiffies);
2846 RT_CACHE_STAT_INC(out_hit);
2847 rcu_read_unlock_bh();
2848 if (!flp4->saddr)
2849 flp4->saddr = rth->rt_src;
2850 if (!flp4->daddr)
2851 flp4->daddr = rth->rt_dst;
2852 return rth;
2853 }
2854 RT_CACHE_STAT_INC(out_hlist_search);
2855 }
2856 rcu_read_unlock_bh();
2857
2858 slow_output:
2859 return ip_route_output_slow(net, flp4);
2860 }
2861 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2862
2863 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2864 {
2865 return NULL;
2866 }
2867
2868 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2869 {
2870 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2871
2872 return mtu ? : dst->dev->mtu;
2873 }
2874
2875 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2876 {
2877 }
2878
2879 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2880 unsigned long old)
2881 {
2882 return NULL;
2883 }
2884
2885 static struct dst_ops ipv4_dst_blackhole_ops = {
2886 .family = AF_INET,
2887 .protocol = cpu_to_be16(ETH_P_IP),
2888 .destroy = ipv4_dst_destroy,
2889 .check = ipv4_blackhole_dst_check,
2890 .mtu = ipv4_blackhole_mtu,
2891 .default_advmss = ipv4_default_advmss,
2892 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2893 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2894 .neigh_lookup = ipv4_neigh_lookup,
2895 };
2896
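/*
 * Clone an output route into a "blackhole" dst: the flow keys and
 * metrics of the original are preserved, but both the input and output
 * handlers simply discard packets.
 */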
2897 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2898 {
2899 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2900 struct rtable *ort = (struct rtable *) dst_orig;
2901
2902 if (rt) {
2903 struct dst_entry *new = &rt->dst;
2904
2905 new->__use = 1;
2906 new->input = dst_discard;
2907 new->output = dst_discard;
2908 dst_copy_metrics(new, &ort->dst);
2909
2910 new->dev = ort->dst.dev;
2911 if (new->dev)
2912 dev_hold(new->dev);
2913
2914 rt->rt_key_dst = ort->rt_key_dst;
2915 rt->rt_key_src = ort->rt_key_src;
2916 rt->rt_key_tos = ort->rt_key_tos;
2917 rt->rt_route_iif = ort->rt_route_iif;
2918 rt->rt_iif = ort->rt_iif;
2919 rt->rt_oif = ort->rt_oif;
2920 rt->rt_mark = ort->rt_mark;
2921
2922 rt->rt_genid = rt_genid(net);
2923 rt->rt_flags = ort->rt_flags;
2924 rt->rt_type = ort->rt_type;
2925 rt->rt_dst = ort->rt_dst;
2926 rt->rt_src = ort->rt_src;
2927 rt->rt_gateway = ort->rt_gateway;
2928 rt->rt_spec_dst = ort->rt_spec_dst;
2929 rt->peer = ort->peer;
2930 if (rt->peer)
2931 atomic_inc(&rt->peer->refcnt);
2932 rt->fi = ort->fi;
2933 if (rt->fi)
2934 atomic_inc(&rt->fi->fib_clntref);
2935
2936 dst_free(new);
2937 }
2938
2939 dst_release(dst_orig);
2940
2941 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2942 }
2943
2944 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2945 struct sock *sk)
2946 {
2947 struct rtable *rt = __ip_route_output_key(net, flp4);
2948
2949 if (IS_ERR(rt))
2950 return rt;
2951
2952 if (flp4->flowi4_proto)
2953 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2954 flowi4_to_flowi(flp4),
2955 sk, 0);
2956
2957 return rt;
2958 }
2959 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2960
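/*
 * Fill a netlink routing message (rtmsg plus attributes) describing the
 * rtable attached to @skb.
 */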
2961 static int rt_fill_info(struct net *net,
2962 struct sk_buff *skb, u32 pid, u32 seq, int event,
2963 int nowait, unsigned int flags)
2964 {
2965 struct rtable *rt = skb_rtable(skb);
2966 struct rtmsg *r;
2967 struct nlmsghdr *nlh;
2968 unsigned long expires = 0;
2969 const struct inet_peer *peer = rt->peer;
2970 u32 id = 0, ts = 0, tsage = 0, error;
2971
2972 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2973 if (nlh == NULL)
2974 return -EMSGSIZE;
2975
2976 r = nlmsg_data(nlh);
2977 r->rtm_family = AF_INET;
2978 r->rtm_dst_len = 32;
2979 r->rtm_src_len = 0;
2980 r->rtm_tos = rt->rt_key_tos;
2981 r->rtm_table = RT_TABLE_MAIN;
2982 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2983 r->rtm_type = rt->rt_type;
2984 r->rtm_scope = RT_SCOPE_UNIVERSE;
2985 r->rtm_protocol = RTPROT_UNSPEC;
2986 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2987 if (rt->rt_flags & RTCF_NOTIFY)
2988 r->rtm_flags |= RTM_F_NOTIFY;
2989
2990 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2991
2992 if (rt->rt_key_src) {
2993 r->rtm_src_len = 32;
2994 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2995 }
2996 if (rt->dst.dev)
2997 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2998 #ifdef CONFIG_IP_ROUTE_CLASSID
2999 if (rt->dst.tclassid)
3000 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
3001 #endif
3002 if (rt_is_input_route(rt))
3003 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
3004 else if (rt->rt_src != rt->rt_key_src)
3005 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
3006
3007 if (rt->rt_dst != rt->rt_gateway)
3008 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3009
3010 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3011 goto nla_put_failure;
3012
3013 if (rt->rt_mark)
3014 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3015
3016 error = rt->dst.error;
3017 if (peer) {
3018 inet_peer_refcheck(rt->peer);
3019 id = atomic_read(&peer->ip_id_count) & 0xffff;
3020 if (peer->tcp_ts_stamp) {
3021 ts = peer->tcp_ts;
3022 tsage = get_seconds() - peer->tcp_ts_stamp;
3023 }
3024 expires = ACCESS_ONCE(peer->pmtu_expires);
3025 if (expires) {
3026 if (time_before(jiffies, expires))
3027 expires -= jiffies;
3028 else
3029 expires = 0;
3030 }
3031 }
3032
3033 if (rt_is_input_route(rt)) {
3034 #ifdef CONFIG_IP_MROUTE
3035 __be32 dst = rt->rt_dst;
3036
3037 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3038 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3039 int err = ipmr_get_route(net, skb,
3040 rt->rt_src, rt->rt_dst,
3041 r, nowait);
3042 if (err <= 0) {
3043 if (!nowait) {
3044 if (err == 0)
3045 return 0;
3046 goto nla_put_failure;
3047 } else {
3048 if (err == -EMSGSIZE)
3049 goto nla_put_failure;
3050 error = err;
3051 }
3052 }
3053 } else
3054 #endif
3055 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3056 }
3057
3058 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3059 expires, error) < 0)
3060 goto nla_put_failure;
3061
3062 return nlmsg_end(skb, nlh);
3063
3064 nla_put_failure:
3065 nlmsg_cancel(skb, nlh);
3066 return -EMSGSIZE;
3067 }
3068
3069 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
3070 {
3071 struct net *net = sock_net(in_skb->sk);
3072 struct rtmsg *rtm;
3073 struct nlattr *tb[RTA_MAX+1];
3074 struct rtable *rt = NULL;
3075 __be32 dst = 0;
3076 __be32 src = 0;
3077 u32 iif;
3078 int err;
3079 int mark;
3080 struct sk_buff *skb;
3081
3082 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3083 if (err < 0)
3084 goto errout;
3085
3086 rtm = nlmsg_data(nlh);
3087
3088 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3089 if (skb == NULL) {
3090 err = -ENOBUFS;
3091 goto errout;
3092 }
3093
3094 	/* Reserve room for dummy headers; this skb can pass
3095 	   through a good chunk of the routing engine.
3096 */
3097 skb_reset_mac_header(skb);
3098 skb_reset_network_header(skb);
3099
3100 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3101 ip_hdr(skb)->protocol = IPPROTO_ICMP;
3102 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3103
3104 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3105 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3106 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3107 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3108
3109 if (iif) {
3110 struct net_device *dev;
3111
3112 dev = __dev_get_by_index(net, iif);
3113 if (dev == NULL) {
3114 err = -ENODEV;
3115 goto errout_free;
3116 }
3117
3118 skb->protocol = htons(ETH_P_IP);
3119 skb->dev = dev;
3120 skb->mark = mark;
3121 local_bh_disable();
3122 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3123 local_bh_enable();
3124
3125 rt = skb_rtable(skb);
3126 if (err == 0 && rt->dst.error)
3127 err = -rt->dst.error;
3128 } else {
3129 struct flowi4 fl4 = {
3130 .daddr = dst,
3131 .saddr = src,
3132 .flowi4_tos = rtm->rtm_tos,
3133 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3134 .flowi4_mark = mark,
3135 };
3136 rt = ip_route_output_key(net, &fl4);
3137
3138 err = 0;
3139 if (IS_ERR(rt))
3140 err = PTR_ERR(rt);
3141 }
3142
3143 if (err)
3144 goto errout_free;
3145
3146 skb_dst_set(skb, &rt->dst);
3147 if (rtm->rtm_flags & RTM_F_NOTIFY)
3148 rt->rt_flags |= RTCF_NOTIFY;
3149
3150 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3151 RTM_NEWROUTE, 0, 0);
3152 if (err <= 0)
3153 goto errout_free;
3154
3155 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3156 errout:
3157 return err;
3158
3159 errout_free:
3160 kfree_skb(skb);
3161 goto errout;
3162 }
3163
3164 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3165 {
3166 struct rtable *rt;
3167 int h, s_h;
3168 int idx, s_idx;
3169 struct net *net;
3170
3171 net = sock_net(skb->sk);
3172
3173 s_h = cb->args[0];
3174 if (s_h < 0)
3175 s_h = 0;
3176 s_idx = idx = cb->args[1];
3177 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3178 if (!rt_hash_table[h].chain)
3179 continue;
3180 rcu_read_lock_bh();
3181 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3182 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3183 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3184 continue;
3185 if (rt_is_expired(rt))
3186 continue;
3187 skb_dst_set_noref(skb, &rt->dst);
3188 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3189 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3190 1, NLM_F_MULTI) <= 0) {
3191 skb_dst_drop(skb);
3192 rcu_read_unlock_bh();
3193 goto done;
3194 }
3195 skb_dst_drop(skb);
3196 }
3197 rcu_read_unlock_bh();
3198 }
3199
3200 done:
3201 cb->args[0] = h;
3202 cb->args[1] = idx;
3203 return skb->len;
3204 }
3205
3206 void ip_rt_multicast_event(struct in_device *in_dev)
3207 {
3208 rt_cache_flush(dev_net(in_dev->dev), 0);
3209 }
3210
3211 #ifdef CONFIG_SYSCTL
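/*
 * Sysctl handler for net.ipv4.route.flush: any value written through it
 * is treated as a flush delay and handed to rt_cache_flush() for the
 * owning namespace.
 */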
3212 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3213 void __user *buffer,
3214 size_t *lenp, loff_t *ppos)
3215 {
3216 if (write) {
3217 int flush_delay;
3218 ctl_table ctl;
3219 struct net *net;
3220
3221 memcpy(&ctl, __ctl, sizeof(ctl));
3222 ctl.data = &flush_delay;
3223 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3224
3225 net = (struct net *)__ctl->extra1;
3226 rt_cache_flush(net, flush_delay);
3227 return 0;
3228 }
3229
3230 return -EINVAL;
3231 }
3232
3233 static ctl_table ipv4_route_table[] = {
3234 {
3235 .procname = "gc_thresh",
3236 .data = &ipv4_dst_ops.gc_thresh,
3237 .maxlen = sizeof(int),
3238 .mode = 0644,
3239 .proc_handler = proc_dointvec,
3240 },
3241 {
3242 .procname = "max_size",
3243 .data = &ip_rt_max_size,
3244 .maxlen = sizeof(int),
3245 .mode = 0644,
3246 .proc_handler = proc_dointvec,
3247 },
3248 {
3249 /* Deprecated. Use gc_min_interval_ms */
3250
3251 .procname = "gc_min_interval",
3252 .data = &ip_rt_gc_min_interval,
3253 .maxlen = sizeof(int),
3254 .mode = 0644,
3255 .proc_handler = proc_dointvec_jiffies,
3256 },
3257 {
3258 .procname = "gc_min_interval_ms",
3259 .data = &ip_rt_gc_min_interval,
3260 .maxlen = sizeof(int),
3261 .mode = 0644,
3262 .proc_handler = proc_dointvec_ms_jiffies,
3263 },
3264 {
3265 .procname = "gc_timeout",
3266 .data = &ip_rt_gc_timeout,
3267 .maxlen = sizeof(int),
3268 .mode = 0644,
3269 .proc_handler = proc_dointvec_jiffies,
3270 },
3271 {
3272 .procname = "gc_interval",
3273 .data = &ip_rt_gc_interval,
3274 .maxlen = sizeof(int),
3275 .mode = 0644,
3276 .proc_handler = proc_dointvec_jiffies,
3277 },
3278 {
3279 .procname = "redirect_load",
3280 .data = &ip_rt_redirect_load,
3281 .maxlen = sizeof(int),
3282 .mode = 0644,
3283 .proc_handler = proc_dointvec,
3284 },
3285 {
3286 .procname = "redirect_number",
3287 .data = &ip_rt_redirect_number,
3288 .maxlen = sizeof(int),
3289 .mode = 0644,
3290 .proc_handler = proc_dointvec,
3291 },
3292 {
3293 .procname = "redirect_silence",
3294 .data = &ip_rt_redirect_silence,
3295 .maxlen = sizeof(int),
3296 .mode = 0644,
3297 .proc_handler = proc_dointvec,
3298 },
3299 {
3300 .procname = "error_cost",
3301 .data = &ip_rt_error_cost,
3302 .maxlen = sizeof(int),
3303 .mode = 0644,
3304 .proc_handler = proc_dointvec,
3305 },
3306 {
3307 .procname = "error_burst",
3308 .data = &ip_rt_error_burst,
3309 .maxlen = sizeof(int),
3310 .mode = 0644,
3311 .proc_handler = proc_dointvec,
3312 },
3313 {
3314 .procname = "gc_elasticity",
3315 .data = &ip_rt_gc_elasticity,
3316 .maxlen = sizeof(int),
3317 .mode = 0644,
3318 .proc_handler = proc_dointvec,
3319 },
3320 {
3321 .procname = "mtu_expires",
3322 .data = &ip_rt_mtu_expires,
3323 .maxlen = sizeof(int),
3324 .mode = 0644,
3325 .proc_handler = proc_dointvec_jiffies,
3326 },
3327 {
3328 .procname = "min_pmtu",
3329 .data = &ip_rt_min_pmtu,
3330 .maxlen = sizeof(int),
3331 .mode = 0644,
3332 .proc_handler = proc_dointvec,
3333 },
3334 {
3335 .procname = "min_adv_mss",
3336 .data = &ip_rt_min_advmss,
3337 .maxlen = sizeof(int),
3338 .mode = 0644,
3339 .proc_handler = proc_dointvec,
3340 },
3341 { }
3342 };
3343
3344 static struct ctl_table empty[1];
3345
3346 static struct ctl_table ipv4_skeleton[] =
3347 {
3348 { .procname = "route",
3349 .mode = 0555, .child = ipv4_route_table},
3350 { .procname = "neigh",
3351 .mode = 0555, .child = empty},
3352 { }
3353 };
3354
3355 static __net_initdata struct ctl_path ipv4_path[] = {
3356 { .procname = "net", },
3357 { .procname = "ipv4", },
3358 { },
3359 };
3360
3361 static struct ctl_table ipv4_route_flush_table[] = {
3362 {
3363 .procname = "flush",
3364 .maxlen = sizeof(int),
3365 .mode = 0200,
3366 .proc_handler = ipv4_sysctl_rtcache_flush,
3367 },
3368 { },
3369 };
3370
3371 static __net_initdata struct ctl_path ipv4_route_path[] = {
3372 { .procname = "net", },
3373 { .procname = "ipv4", },
3374 { .procname = "route", },
3375 { },
3376 };
3377
3378 static __net_init int sysctl_route_net_init(struct net *net)
3379 {
3380 struct ctl_table *tbl;
3381
3382 tbl = ipv4_route_flush_table;
3383 if (!net_eq(net, &init_net)) {
3384 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3385 if (tbl == NULL)
3386 goto err_dup;
3387 }
3388 tbl[0].extra1 = net;
3389
3390 net->ipv4.route_hdr =
3391 register_net_sysctl_table(net, ipv4_route_path, tbl);
3392 if (net->ipv4.route_hdr == NULL)
3393 goto err_reg;
3394 return 0;
3395
3396 err_reg:
3397 if (tbl != ipv4_route_flush_table)
3398 kfree(tbl);
3399 err_dup:
3400 return -ENOMEM;
3401 }
3402
3403 static __net_exit void sysctl_route_net_exit(struct net *net)
3404 {
3405 struct ctl_table *tbl;
3406
3407 tbl = net->ipv4.route_hdr->ctl_table_arg;
3408 unregister_net_sysctl_table(net->ipv4.route_hdr);
3409 BUG_ON(tbl == ipv4_route_flush_table);
3410 kfree(tbl);
3411 }
3412
3413 static __net_initdata struct pernet_operations sysctl_route_ops = {
3414 .init = sysctl_route_net_init,
3415 .exit = sysctl_route_net_exit,
3416 };
3417 #endif
3418
3419 static __net_init int rt_genid_init(struct net *net)
3420 {
3421 get_random_bytes(&net->ipv4.rt_genid,
3422 sizeof(net->ipv4.rt_genid));
3423 get_random_bytes(&net->ipv4.dev_addr_genid,
3424 sizeof(net->ipv4.dev_addr_genid));
3425 return 0;
3426 }
3427
3428 static __net_initdata struct pernet_operations rt_genid_ops = {
3429 .init = rt_genid_init,
3430 };
3431
3432
3433 #ifdef CONFIG_IP_ROUTE_CLASSID
3434 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3435 #endif /* CONFIG_IP_ROUTE_CLASSID */
3436
3437 static __initdata unsigned long rhash_entries;
3438 static int __init set_rhash_entries(char *str)
3439 {
3440 if (!str)
3441 return 0;
3442 rhash_entries = simple_strtoul(str, &str, 0);
3443 return 1;
3444 }
3445 __setup("rhash_entries=", set_rhash_entries);
3446
3447 int __init ip_rt_init(void)
3448 {
3449 int rc = 0;
3450
3451 #ifdef CONFIG_IP_ROUTE_CLASSID
3452 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3453 if (!ip_rt_acct)
3454 panic("IP: failed to allocate ip_rt_acct\n");
3455 #endif
3456
3457 ipv4_dst_ops.kmem_cachep =
3458 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3459 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3460
3461 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3462
3463 if (dst_entries_init(&ipv4_dst_ops) < 0)
3464 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3465
3466 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3467 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3468
3469 rt_hash_table = (struct rt_hash_bucket *)
3470 alloc_large_system_hash("IP route cache",
3471 sizeof(struct rt_hash_bucket),
3472 rhash_entries,
3473 (totalram_pages >= 128 * 1024) ?
3474 15 : 17,
3475 0,
3476 &rt_hash_log,
3477 &rt_hash_mask,
3478 rhash_entries ? 0 : 512 * 1024);
3479 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3480 rt_hash_lock_init();
3481
3482 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3483 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3484
3485 devinet_init();
3486 ip_fib_init();
3487
3488 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3489 expires_ljiffies = jiffies;
3490 schedule_delayed_work(&expires_work,
3491 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3492
3493 if (ip_rt_proc_init())
3494 printk(KERN_ERR "Unable to create route proc files\n");
3495 #ifdef CONFIG_XFRM
3496 xfrm_init();
3497 xfrm4_init(ip_rt_max_size);
3498 #endif
3499 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3500
3501 #ifdef CONFIG_SYSCTL
3502 register_pernet_subsys(&sysctl_route_ops);
3503 #endif
3504 register_pernet_subsys(&rt_genid_ops);
3505 return rc;
3506 }
3507
3508 #ifdef CONFIG_SYSCTL
3509 /*
3510 * We really need to sanitize the damn ipv4 init order, then all
3511 * this nonsense will go away.
3512 */
3513 void __init ip_static_sysctl_init(void)
3514 {
3515 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3516 }
3517 #endif