[IPV4] ROUTE: Collect proc-related functions together
[deliverable/linux.git] / net / ipv4 / route.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
41 *
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
58 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
60 *
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
65 */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112
113 #define RT_FL_TOS(oldflp) \
114 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU 0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_min_delay = 2 * HZ;
121 static int ip_rt_max_delay = 10 * HZ;
122 static int ip_rt_max_size;
123 static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
124 static int ip_rt_gc_interval = 60 * HZ;
125 static int ip_rt_gc_min_interval = HZ / 2;
126 static int ip_rt_redirect_number = 9;
127 static int ip_rt_redirect_load = HZ / 50;
128 static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
129 static int ip_rt_error_cost = HZ;
130 static int ip_rt_error_burst = 5 * HZ;
131 static int ip_rt_gc_elasticity = 8;
132 static int ip_rt_mtu_expires = 10 * 60 * HZ;
133 static int ip_rt_min_pmtu = 512 + 20 + 20;
134 static int ip_rt_min_advmss = 256;
135 static int ip_rt_secret_interval = 10 * 60 * HZ;
136 static int ip_rt_flush_expected;
137 static unsigned long rt_deadline;
138
139 #define RTprint(a...) printk(KERN_DEBUG a)
140
141 static struct timer_list rt_flush_timer;
142 static void rt_worker_func(struct work_struct *work);
143 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
144 static struct timer_list rt_secret_timer;
145
146 /*
147 * Interface to generic destination cache.
148 */
149
150 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
151 static void ipv4_dst_destroy(struct dst_entry *dst);
152 static void ipv4_dst_ifdown(struct dst_entry *dst,
153 struct net_device *dev, int how);
154 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
155 static void ipv4_link_failure(struct sk_buff *skb);
156 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
157 static int rt_garbage_collect(void);
158
159
160 static struct dst_ops ipv4_dst_ops = {
161 .family = AF_INET,
162 .protocol = __constant_htons(ETH_P_IP),
163 .gc = rt_garbage_collect,
164 .check = ipv4_dst_check,
165 .destroy = ipv4_dst_destroy,
166 .ifdown = ipv4_dst_ifdown,
167 .negative_advice = ipv4_negative_advice,
168 .link_failure = ipv4_link_failure,
169 .update_pmtu = ip_rt_update_pmtu,
170 .local_out = ip_local_out,
171 .entry_size = sizeof(struct rtable),
172 };
173
174 #define ECN_OR_COST(class) TC_PRIO_##class
175
176 const __u8 ip_tos2prio[16] = {
177 TC_PRIO_BESTEFFORT,
178 ECN_OR_COST(FILLER),
179 TC_PRIO_BESTEFFORT,
180 ECN_OR_COST(BESTEFFORT),
181 TC_PRIO_BULK,
182 ECN_OR_COST(BULK),
183 TC_PRIO_BULK,
184 ECN_OR_COST(BULK),
185 TC_PRIO_INTERACTIVE,
186 ECN_OR_COST(INTERACTIVE),
187 TC_PRIO_INTERACTIVE,
188 ECN_OR_COST(INTERACTIVE),
189 TC_PRIO_INTERACTIVE_BULK,
190 ECN_OR_COST(INTERACTIVE_BULK),
191 TC_PRIO_INTERACTIVE_BULK,
192 ECN_OR_COST(INTERACTIVE_BULK)
193 };
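/*
 * Editorial sketch (not part of the original file): how ip_tos2prio is
 * typically consumed.  The four TOS bits of the IPv4 TOS byte index the
 * sixteen entries above; the rt_tos2priority() helper in
 * include/net/route.h does essentially this lookup (the helper name is
 * quoted from memory, so treat it as an assumption).
 */
#if 0
static inline char example_tos2priority(u8 tos)
{
	/* IPTOS_TOS() masks off the precedence bits; ">> 1" drops the
	 * lowest bit, leaving a 0..15 index into ip_tos2prio[]. */
	return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}
#endif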
194
195
196 /*
197 * Route cache.
198 */
199
200 /* The locking scheme is rather straightforward:
201 *
202 * 1) Read-Copy Update protects the buckets of the central route hash.
203 * 2) Only writers remove entries, and they hold the lock
204 * as they look at rtable reference counts.
205 * 3) Only readers acquire references to rtable entries,
206 * they do so with atomic increments and with the
207 * lock held.
208 */
209
210 struct rt_hash_bucket {
211 struct rtable *chain;
212 };
213 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
214 defined(CONFIG_PROVE_LOCKING)
215 /*
216 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
217 * The size of this table is a power of two and depends on the number of CPUs.
218 * (with lockdep, spinlock_t is quite big, so keep the size down there)
219 */
220 #ifdef CONFIG_LOCKDEP
221 # define RT_HASH_LOCK_SZ 256
222 #else
223 # if NR_CPUS >= 32
224 # define RT_HASH_LOCK_SZ 4096
225 # elif NR_CPUS >= 16
226 # define RT_HASH_LOCK_SZ 2048
227 # elif NR_CPUS >= 8
228 # define RT_HASH_LOCK_SZ 1024
229 # elif NR_CPUS >= 4
230 # define RT_HASH_LOCK_SZ 512
231 # else
232 # define RT_HASH_LOCK_SZ 256
233 # endif
234 #endif
235
236 static spinlock_t *rt_hash_locks;
237 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
238 # define rt_hash_lock_init() { \
239 int i; \
240 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
241 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
242 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
243 spin_lock_init(&rt_hash_locks[i]); \
244 }
245 #else
246 # define rt_hash_lock_addr(slot) NULL
247 # define rt_hash_lock_init()
248 #endif
249
250 static struct rt_hash_bucket *rt_hash_table;
251 static unsigned rt_hash_mask;
252 static unsigned int rt_hash_log;
253 static unsigned int rt_hash_rnd;
254
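/*
 * Editorial sketch (not part of the original file): the access pattern
 * implied by the locking scheme described above.  Readers walk a chain
 * under rcu_read_lock_bh() and never take the bucket lock; only writers
 * take rt_hash_lock_addr(slot), exactly as rt_cache_get_first() and
 * rt_do_flush() below do.
 */
#if 0
static void example_reader(unsigned int slot)
{
	struct rtable *r;

	rcu_read_lock_bh();
	for (r = rcu_dereference(rt_hash_table[slot].chain); r;
	     r = rcu_dereference(r->u.dst.rt_next))
		;	/* inspect r; take a reference with dst_hold() if needed */
	rcu_read_unlock_bh();
}

static void example_writer(unsigned int slot)
{
	spin_lock_bh(rt_hash_lock_addr(slot));
	/* unlink or relink rt_hash_table[slot].chain entries here,
	 * publishing new links with rcu_assign_pointer() and freeing
	 * removed ones through rt_free() (call_rcu_bh). */
	spin_unlock_bh(rt_hash_lock_addr(slot));
}
#endif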
255 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
256 #define RT_CACHE_STAT_INC(field) \
257 (__raw_get_cpu_var(rt_cache_stat).field++)
258
259 static int rt_intern_hash(unsigned hash, struct rtable *rth,
260 struct rtable **res);
261
262 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
263 {
264 return (jhash_2words(daddr, saddr, rt_hash_rnd)
265 & rt_hash_mask);
266 }
267
268 #define rt_hash(daddr, saddr, idx) \
269 rt_hash_code((__force u32)(__be32)(daddr),\
270 (__force u32)(__be32)(saddr) ^ ((idx) << 5))
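/*
 * Editorial note (not part of the original file): rt_hash() folds the
 * (daddr, saddr, interface index) triple into a bucket index.  For
 * example, the input-path lookup later in this file does
 *
 *	hash = rt_hash(daddr, saddr, dev->ifindex);
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); ...)
 *
 * rt_hash_rnd is re-randomised periodically (rt_secret_rebuild() and
 * rt_run_flush()) so the bucket distribution cannot be predicted by
 * remote hosts.
 */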
271
272 #ifdef CONFIG_PROC_FS
273 struct rt_cache_iter_state {
274 int bucket;
275 };
276
277 static struct rtable *rt_cache_get_first(struct seq_file *seq)
278 {
279 struct rtable *r = NULL;
280 struct rt_cache_iter_state *st = seq->private;
281
282 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
283 rcu_read_lock_bh();
284 r = rt_hash_table[st->bucket].chain;
285 if (r)
286 break;
287 rcu_read_unlock_bh();
288 }
289 return rcu_dereference(r);
290 }
291
292 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
293 {
294 struct rt_cache_iter_state *st = seq->private;
295
296 r = r->u.dst.rt_next;
297 while (!r) {
298 rcu_read_unlock_bh();
299 if (--st->bucket < 0)
300 break;
301 rcu_read_lock_bh();
302 r = rt_hash_table[st->bucket].chain;
303 }
304 return rcu_dereference(r);
305 }
306
307 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
308 {
309 struct rtable *r = rt_cache_get_first(seq);
310
311 if (r)
312 while (pos && (r = rt_cache_get_next(seq, r)))
313 --pos;
314 return pos ? NULL : r;
315 }
316
317 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
318 {
319 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
320 }
321
322 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
323 {
324 struct rtable *r = NULL;
325
326 if (v == SEQ_START_TOKEN)
327 r = rt_cache_get_first(seq);
328 else
329 r = rt_cache_get_next(seq, v);
330 ++*pos;
331 return r;
332 }
333
334 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
335 {
336 if (v && v != SEQ_START_TOKEN)
337 rcu_read_unlock_bh();
338 }
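/*
 * Editorial note (not part of the original file): the BH-disabled RCU
 * read-side section is opened in rt_cache_get_first() and is dropped
 * either when rt_cache_get_next() runs past the last bucket or, for a
 * read that stops early, here in rt_cache_seq_stop().  That is why
 * ->stop only unlocks for real entries and not for SEQ_START_TOKEN.
 */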
339
340 static int rt_cache_seq_show(struct seq_file *seq, void *v)
341 {
342 if (v == SEQ_START_TOKEN)
343 seq_printf(seq, "%-127s\n",
344 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
345 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
346 "HHUptod\tSpecDst");
347 else {
348 struct rtable *r = v;
349 char temp[256];
350
351 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
352 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
353 r->u.dst.dev ? r->u.dst.dev->name : "*",
354 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
355 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
356 r->u.dst.__use, 0, (unsigned long)r->rt_src,
357 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
358 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
359 dst_metric(&r->u.dst, RTAX_WINDOW),
360 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
361 dst_metric(&r->u.dst, RTAX_RTTVAR)),
362 r->fl.fl4_tos,
363 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
364 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
365 dev_queue_xmit) : 0,
366 r->rt_spec_dst);
367 seq_printf(seq, "%-127s\n", temp);
368 }
369 return 0;
370 }
371
372 static const struct seq_operations rt_cache_seq_ops = {
373 .start = rt_cache_seq_start,
374 .next = rt_cache_seq_next,
375 .stop = rt_cache_seq_stop,
376 .show = rt_cache_seq_show,
377 };
378
379 static int rt_cache_seq_open(struct inode *inode, struct file *file)
380 {
381 return seq_open_private(file, &rt_cache_seq_ops,
382 sizeof(struct rt_cache_iter_state));
383 }
384
385 static const struct file_operations rt_cache_seq_fops = {
386 .owner = THIS_MODULE,
387 .open = rt_cache_seq_open,
388 .read = seq_read,
389 .llseek = seq_lseek,
390 .release = seq_release_private,
391 };
392
393
394 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
395 {
396 int cpu;
397
398 if (*pos == 0)
399 return SEQ_START_TOKEN;
400
401 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
402 if (!cpu_possible(cpu))
403 continue;
404 *pos = cpu+1;
405 return &per_cpu(rt_cache_stat, cpu);
406 }
407 return NULL;
408 }
409
410 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
411 {
412 int cpu;
413
414 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
415 if (!cpu_possible(cpu))
416 continue;
417 *pos = cpu+1;
418 return &per_cpu(rt_cache_stat, cpu);
419 }
420 return NULL;
421
422 }
423
424 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
425 {
426
427 }
428
429 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
430 {
431 struct rt_cache_stat *st = v;
432
433 if (v == SEQ_START_TOKEN) {
434 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
435 return 0;
436 }
437
438 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
439 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
440 atomic_read(&ipv4_dst_ops.entries),
441 st->in_hit,
442 st->in_slow_tot,
443 st->in_slow_mc,
444 st->in_no_route,
445 st->in_brd,
446 st->in_martian_dst,
447 st->in_martian_src,
448
449 st->out_hit,
450 st->out_slow_tot,
451 st->out_slow_mc,
452
453 st->gc_total,
454 st->gc_ignored,
455 st->gc_goal_miss,
456 st->gc_dst_overflow,
457 st->in_hlist_search,
458 st->out_hlist_search
459 );
460 return 0;
461 }
462
463 static const struct seq_operations rt_cpu_seq_ops = {
464 .start = rt_cpu_seq_start,
465 .next = rt_cpu_seq_next,
466 .stop = rt_cpu_seq_stop,
467 .show = rt_cpu_seq_show,
468 };
469
470
471 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
472 {
473 return seq_open(file, &rt_cpu_seq_ops);
474 }
475
476 static const struct file_operations rt_cpu_seq_fops = {
477 .owner = THIS_MODULE,
478 .open = rt_cpu_seq_open,
479 .read = seq_read,
480 .llseek = seq_lseek,
481 .release = seq_release,
482 };
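/*
 * Editorial sketch (not part of the original file): one way the two
 * file_operations above could be wired into procfs.  The helper names
 * (proc_net_fops_create(), create_proc_entry()) and the paths
 * (/proc/net/rt_cache and /proc/net/stat/rt_cache) are assumptions
 * based on how similar seq_file interfaces are usually registered,
 * not a quote of this file's init code; error unwinding is omitted.
 */
#if 0
static int __init example_rt_proc_init(void)
{
	struct proc_dir_entry *pde;

	if (!proc_net_fops_create(&init_net, "rt_cache",
				  S_IRUGO, &rt_cache_seq_fops))
		return -ENOMEM;

	pde = create_proc_entry("rt_cache", S_IRUGO, init_net.proc_net_stat);
	if (!pde)
		return -ENOMEM;
	pde->proc_fops = &rt_cpu_seq_fops;
	return 0;
}
#endif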
483
484 #ifdef CONFIG_NET_CLS_ROUTE
485 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
486 int length, int *eof, void *data)
487 {
488 unsigned int i;
489
490 if ((offset & 3) || (length & 3))
491 return -EIO;
492
493 if (offset >= sizeof(struct ip_rt_acct) * 256) {
494 *eof = 1;
495 return 0;
496 }
497
498 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
499 length = sizeof(struct ip_rt_acct) * 256 - offset;
500 *eof = 1;
501 }
502
503 offset /= sizeof(u32);
504
505 if (length > 0) {
506 u32 *dst = (u32 *) buffer;
507
508 *start = buffer;
509 memset(dst, 0, length);
510
511 for_each_possible_cpu(i) {
512 unsigned int j;
513 u32 *src;
514
515 src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
516 for (j = 0; j < length/4; j++)
517 dst[j] += src[j];
518 }
519 }
520 return length;
521 }
522 #endif
523 #endif /* CONFIG_PROC_FS */
524
525 static __inline__ void rt_free(struct rtable *rt)
526 {
527 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
528 }
529
530 static __inline__ void rt_drop(struct rtable *rt)
531 {
532 ip_rt_put(rt);
533 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
534 }
535
536 static __inline__ int rt_fast_clean(struct rtable *rth)
537 {
538 /* Kill broadcast/multicast entries very aggressively if they
539 collide in the hash table with more useful entries */
540 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
541 rth->fl.iif && rth->u.dst.rt_next;
542 }
543
544 static __inline__ int rt_valuable(struct rtable *rth)
545 {
546 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
547 rth->u.dst.expires;
548 }
549
550 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
551 {
552 unsigned long age;
553 int ret = 0;
554
555 if (atomic_read(&rth->u.dst.__refcnt))
556 goto out;
557
558 ret = 1;
559 if (rth->u.dst.expires &&
560 time_after_eq(jiffies, rth->u.dst.expires))
561 goto out;
562
563 age = jiffies - rth->u.dst.lastuse;
564 ret = 0;
565 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
566 (age <= tmo2 && rt_valuable(rth)))
567 goto out;
568 ret = 1;
569 out: return ret;
570 }
571
572 /* Bits of score are:
573 * 31: very valuable
574 * 30: not quite useless
575 * 29..0: usage counter
576 */
577 static inline u32 rt_score(struct rtable *rt)
578 {
579 u32 score = jiffies - rt->u.dst.lastuse;
580
581 score = ~score & ~(3<<30);
582
583 if (rt_valuable(rt))
584 score |= (1<<31);
585
586 if (!rt->fl.iif ||
587 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
588 score |= (1<<30);
589
590 return score;
591 }
592
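/*
 * Editorial note (not part of the original file): compare_keys() below
 * avoids a chain of conditionals by XOR-ing each pair of fields and
 * OR-ing the results; the two flows match iff every XOR is zero, i.e.
 * iff the accumulated OR is zero.  The 16-bit load of ->tos also picks
 * up the byte that follows it in the flowi (the scope field in this
 * layout, an assumption about struct ordering), so both are compared
 * in one operation.
 */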
593 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
594 {
595 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
596 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
597 (fl1->mark ^ fl2->mark) |
598 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
599 *(u16 *)&fl2->nl_u.ip4_u.tos) |
600 (fl1->oif ^ fl2->oif) |
601 (fl1->iif ^ fl2->iif)) == 0;
602 }
603
604 /*
605 * Perform a full scan of the hash table and free all entries.
606 * Can be called by a softirq or a process.
607 * In the latter case, we want to reschedule if necessary.
608 */
609 static void rt_do_flush(int process_context)
610 {
611 unsigned int i;
612 struct rtable *rth, *next;
613
614 for (i = 0; i <= rt_hash_mask; i++) {
615 if (process_context && need_resched())
616 cond_resched();
617 rth = rt_hash_table[i].chain;
618 if (!rth)
619 continue;
620
621 spin_lock_bh(rt_hash_lock_addr(i));
622 rth = rt_hash_table[i].chain;
623 rt_hash_table[i].chain = NULL;
624 spin_unlock_bh(rt_hash_lock_addr(i));
625
626 for (; rth; rth = next) {
627 next = rth->u.dst.rt_next;
628 rt_free(rth);
629 }
630 }
631 }
632
633 static void rt_check_expire(void)
634 {
635 static unsigned int rover;
636 unsigned int i = rover, goal;
637 struct rtable *rth, **rthp;
638 u64 mult;
639
640 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
641 if (ip_rt_gc_timeout > 1)
642 do_div(mult, ip_rt_gc_timeout);
643 goal = (unsigned int)mult;
644 if (goal > rt_hash_mask)
645 goal = rt_hash_mask + 1;
646 for (; goal > 0; goal--) {
647 unsigned long tmo = ip_rt_gc_timeout;
648
649 i = (i + 1) & rt_hash_mask;
650 rthp = &rt_hash_table[i].chain;
651
652 if (need_resched())
653 cond_resched();
654
655 if (*rthp == NULL)
656 continue;
657 spin_lock_bh(rt_hash_lock_addr(i));
658 while ((rth = *rthp) != NULL) {
659 if (rth->u.dst.expires) {
660 /* Entry is expired even if it is in use */
661 if (time_before_eq(jiffies, rth->u.dst.expires)) {
662 tmo >>= 1;
663 rthp = &rth->u.dst.rt_next;
664 continue;
665 }
666 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
667 tmo >>= 1;
668 rthp = &rth->u.dst.rt_next;
669 continue;
670 }
671
672 /* Cleanup aged off entries. */
673 *rthp = rth->u.dst.rt_next;
674 rt_free(rth);
675 }
676 spin_unlock_bh(rt_hash_lock_addr(i));
677 }
678 rover = i;
679 }
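/*
 * Editorial note (not part of the original file): a worked example of
 * the "goal" computed at the top of rt_check_expire().  With the
 * defaults ip_rt_gc_interval = 60*HZ and ip_rt_gc_timeout = 300*HZ,
 * goal = (60*HZ << rt_hash_log) / (300*HZ), i.e. one fifth of the
 * buckets per run, so five timer periods sweep the whole hash table
 * once per gc timeout.
 */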
680
681 /*
682 * rt_worker_func() is run in process context.
683 * If a whole flush was scheduled, it is done.
684 * Otherwise, we call rt_check_expire() to scan part of the hash table.
685 */
686 static void rt_worker_func(struct work_struct *work)
687 {
688 if (ip_rt_flush_expected) {
689 ip_rt_flush_expected = 0;
690 rt_do_flush(1);
691 } else
692 rt_check_expire();
693 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
694 }
695
696 /* This can run from both BH and non-BH contexts, the latter
697 * in the case of a forced flush event.
698 */
699 static void rt_run_flush(unsigned long process_context)
700 {
701 rt_deadline = 0;
702
703 get_random_bytes(&rt_hash_rnd, 4);
704
705 rt_do_flush(process_context);
706 }
707
708 static DEFINE_SPINLOCK(rt_flush_lock);
709
710 void rt_cache_flush(int delay)
711 {
712 unsigned long now = jiffies;
713 int user_mode = !in_softirq();
714
715 if (delay < 0)
716 delay = ip_rt_min_delay;
717
718 spin_lock_bh(&rt_flush_lock);
719
720 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
721 long tmo = (long)(rt_deadline - now);
722
723 /* If the flush timer is already running
724 and the flush request is not immediate (delay > 0):
725
726 if the deadline has not been reached, prolong the timer to "delay",
727 otherwise fire it at the deadline time.
728 */
729
730 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
731 tmo = 0;
732
733 if (delay > tmo)
734 delay = tmo;
735 }
736
737 if (delay <= 0) {
738 spin_unlock_bh(&rt_flush_lock);
739 rt_run_flush(user_mode);
740 return;
741 }
742
743 if (rt_deadline == 0)
744 rt_deadline = now + ip_rt_max_delay;
745
746 mod_timer(&rt_flush_timer, now+delay);
747 spin_unlock_bh(&rt_flush_lock);
748 }
749
750 /*
751 * We change rt_hash_rnd and ask the next rt_worker_func() invocation
752 * to perform a flush in process context.
753 */
754 static void rt_secret_rebuild(unsigned long dummy)
755 {
756 get_random_bytes(&rt_hash_rnd, 4);
757 ip_rt_flush_expected = 1;
758 cancel_delayed_work(&expires_work);
759 schedule_delayed_work(&expires_work, HZ/10);
760 mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
761 }
762
763 /*
764 Short description of GC goals.
765
766 We want to build an algorithm which keeps the routing cache
767 at some equilibrium point, where the number of aged-off entries
768 is kept approximately equal to the number of newly generated ones.
769
770 The current expiration strength is the variable "expire".
771 We try to adjust it dynamically, so that when the network
772 is idle, expire is large enough to keep enough warm entries,
773 and when load increases it shrinks to limit the cache size.
774 */
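/*
 * Editorial note (not part of the original file): a worked example of
 * the goal computed in rt_garbage_collect() below.  goal is
 * entries - (ip_rt_gc_elasticity << rt_hash_log); with, say,
 * rt_hash_log = 16 (65536 buckets) and the default elasticity of 8,
 * the "elastic" limit is 8 << 16 = 524288 cached routes, and only the
 * excess above it is scheduled for aggressive expiry; below it the
 * milder equilibrium/gc_thresh path is taken.
 */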
775
776 static int rt_garbage_collect(void)
777 {
778 static unsigned long expire = RT_GC_TIMEOUT;
779 static unsigned long last_gc;
780 static int rover;
781 static int equilibrium;
782 struct rtable *rth, **rthp;
783 unsigned long now = jiffies;
784 int goal;
785
786 /*
787 * Garbage collection is pretty expensive,
788 * so do not run it too frequently.
789 */
790
791 RT_CACHE_STAT_INC(gc_total);
792
793 if (now - last_gc < ip_rt_gc_min_interval &&
794 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
795 RT_CACHE_STAT_INC(gc_ignored);
796 goto out;
797 }
798
799 /* Calculate the number of entries we want to expire now. */
800 goal = atomic_read(&ipv4_dst_ops.entries) -
801 (ip_rt_gc_elasticity << rt_hash_log);
802 if (goal <= 0) {
803 if (equilibrium < ipv4_dst_ops.gc_thresh)
804 equilibrium = ipv4_dst_ops.gc_thresh;
805 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
806 if (goal > 0) {
807 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
808 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
809 }
810 } else {
811 /* We are in a dangerous area. Try to reduce the cache really
812 * aggressively.
813 */
814 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
815 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
816 }
817
818 if (now - last_gc >= ip_rt_gc_min_interval)
819 last_gc = now;
820
821 if (goal <= 0) {
822 equilibrium += goal;
823 goto work_done;
824 }
825
826 do {
827 int i, k;
828
829 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
830 unsigned long tmo = expire;
831
832 k = (k + 1) & rt_hash_mask;
833 rthp = &rt_hash_table[k].chain;
834 spin_lock_bh(rt_hash_lock_addr(k));
835 while ((rth = *rthp) != NULL) {
836 if (!rt_may_expire(rth, tmo, expire)) {
837 tmo >>= 1;
838 rthp = &rth->u.dst.rt_next;
839 continue;
840 }
841 *rthp = rth->u.dst.rt_next;
842 rt_free(rth);
843 goal--;
844 }
845 spin_unlock_bh(rt_hash_lock_addr(k));
846 if (goal <= 0)
847 break;
848 }
849 rover = k;
850
851 if (goal <= 0)
852 goto work_done;
853
854 /* The goal was not achieved. We stop the process if:
855
856 - expire has been reduced to zero (otherwise expire is halved),
857 - the table is not full,
858 - we are called from interrupt context.
859 The jiffies check is just a fallback/debug loop breaker;
860 we will not spin here for a long time in any case.
861 */
862
863 RT_CACHE_STAT_INC(gc_goal_miss);
864
865 if (expire == 0)
866 break;
867
868 expire >>= 1;
869 #if RT_CACHE_DEBUG >= 2
870 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
871 atomic_read(&ipv4_dst_ops.entries), goal, i);
872 #endif
873
874 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
875 goto out;
876 } while (!in_softirq() && time_before_eq(jiffies, now));
877
878 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
879 goto out;
880 if (net_ratelimit())
881 printk(KERN_WARNING "dst cache overflow\n");
882 RT_CACHE_STAT_INC(gc_dst_overflow);
883 return 1;
884
885 work_done:
886 expire += ip_rt_gc_min_interval;
887 if (expire > ip_rt_gc_timeout ||
888 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
889 expire = ip_rt_gc_timeout;
890 #if RT_CACHE_DEBUG >= 2
891 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
892 atomic_read(&ipv4_dst_ops.entries), goal, rover);
893 #endif
894 out: return 0;
895 }
896
897 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
898 {
899 struct rtable *rth, **rthp;
900 unsigned long now;
901 struct rtable *cand, **candp;
902 u32 min_score;
903 int chain_length;
904 int attempts = !in_softirq();
905
906 restart:
907 chain_length = 0;
908 min_score = ~(u32)0;
909 cand = NULL;
910 candp = NULL;
911 now = jiffies;
912
913 rthp = &rt_hash_table[hash].chain;
914
915 spin_lock_bh(rt_hash_lock_addr(hash));
916 while ((rth = *rthp) != NULL) {
917 if (compare_keys(&rth->fl, &rt->fl)) {
918 /* Put it first */
919 *rthp = rth->u.dst.rt_next;
920 /*
921 * Since lookup is lockfree, the deletion
922 * must be visible to another weakly ordered CPU before
923 * the insertion at the start of the hash chain.
924 */
925 rcu_assign_pointer(rth->u.dst.rt_next,
926 rt_hash_table[hash].chain);
927 /*
928 * Since lookup is lockfree, the update writes
929 * must be ordered for consistency on SMP.
930 */
931 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
932
933 dst_use(&rth->u.dst, now);
934 spin_unlock_bh(rt_hash_lock_addr(hash));
935
936 rt_drop(rt);
937 *rp = rth;
938 return 0;
939 }
940
941 if (!atomic_read(&rth->u.dst.__refcnt)) {
942 u32 score = rt_score(rth);
943
944 if (score <= min_score) {
945 cand = rth;
946 candp = rthp;
947 min_score = score;
948 }
949 }
950
951 chain_length++;
952
953 rthp = &rth->u.dst.rt_next;
954 }
955
956 if (cand) {
957 /* ip_rt_gc_elasticity used to be the average length of a chain;
958 * when exceeded, gc becomes really aggressive.
959 *
960 * The second limit is less certain. At the moment it allows
961 * only 2 entries per bucket. We will see.
962 */
963 if (chain_length > ip_rt_gc_elasticity) {
964 *candp = cand->u.dst.rt_next;
965 rt_free(cand);
966 }
967 }
968
969 /* Try to bind the route to an ARP neighbour only if it is an output
970 route or on the unicast forwarding path.
971 */
972 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
973 int err = arp_bind_neighbour(&rt->u.dst);
974 if (err) {
975 spin_unlock_bh(rt_hash_lock_addr(hash));
976
977 if (err != -ENOBUFS) {
978 rt_drop(rt);
979 return err;
980 }
981
982 /* The neighbour tables are full and nothing
983 can be released. Try to shrink the route cache;
984 it most likely holds some neighbour records.
985 */
986 if (attempts-- > 0) {
987 int saved_elasticity = ip_rt_gc_elasticity;
988 int saved_int = ip_rt_gc_min_interval;
989 ip_rt_gc_elasticity = 1;
990 ip_rt_gc_min_interval = 0;
991 rt_garbage_collect();
992 ip_rt_gc_min_interval = saved_int;
993 ip_rt_gc_elasticity = saved_elasticity;
994 goto restart;
995 }
996
997 if (net_ratelimit())
998 printk(KERN_WARNING "Neighbour table overflow.\n");
999 rt_drop(rt);
1000 return -ENOBUFS;
1001 }
1002 }
1003
1004 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1005 #if RT_CACHE_DEBUG >= 2
1006 if (rt->u.dst.rt_next) {
1007 struct rtable *trt;
1008 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1009 NIPQUAD(rt->rt_dst));
1010 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1011 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1012 printk("\n");
1013 }
1014 #endif
1015 rt_hash_table[hash].chain = rt;
1016 spin_unlock_bh(rt_hash_lock_addr(hash));
1017 *rp = rt;
1018 return 0;
1019 }
1020
1021 void rt_bind_peer(struct rtable *rt, int create)
1022 {
1023 static DEFINE_SPINLOCK(rt_peer_lock);
1024 struct inet_peer *peer;
1025
1026 peer = inet_getpeer(rt->rt_dst, create);
1027
1028 spin_lock_bh(&rt_peer_lock);
1029 if (rt->peer == NULL) {
1030 rt->peer = peer;
1031 peer = NULL;
1032 }
1033 spin_unlock_bh(&rt_peer_lock);
1034 if (peer)
1035 inet_putpeer(peer);
1036 }
1037
1038 /*
1039 * Peer allocation may fail only in serious out-of-memory conditions. However,
1040 * we can still generate some output.
1041 * Random ID selection looks a bit dangerous because we have no chance of
1042 * selecting an ID that is unique within a reasonable period of time.
1043 * But a broken packet identifier may be better than no packet at all.
1044 */
1045 static void ip_select_fb_ident(struct iphdr *iph)
1046 {
1047 static DEFINE_SPINLOCK(ip_fb_id_lock);
1048 static u32 ip_fallback_id;
1049 u32 salt;
1050
1051 spin_lock_bh(&ip_fb_id_lock);
1052 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1053 iph->id = htons(salt & 0xFFFF);
1054 ip_fallback_id = salt;
1055 spin_unlock_bh(&ip_fb_id_lock);
1056 }
1057
1058 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1059 {
1060 struct rtable *rt = (struct rtable *) dst;
1061
1062 if (rt) {
1063 if (rt->peer == NULL)
1064 rt_bind_peer(rt, 1);
1065
1066 /* If a peer is attached to the destination, it is never detached,
1067 so we do not need to grab a lock to dereference it.
1068 */
1069 if (rt->peer) {
1070 iph->id = htons(inet_getid(rt->peer, more));
1071 return;
1072 }
1073 } else
1074 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1075 __builtin_return_address(0));
1076
1077 ip_select_fb_ident(iph);
1078 }
1079
1080 static void rt_del(unsigned hash, struct rtable *rt)
1081 {
1082 struct rtable **rthp;
1083
1084 spin_lock_bh(rt_hash_lock_addr(hash));
1085 ip_rt_put(rt);
1086 for (rthp = &rt_hash_table[hash].chain; *rthp;
1087 rthp = &(*rthp)->u.dst.rt_next)
1088 if (*rthp == rt) {
1089 *rthp = rt->u.dst.rt_next;
1090 rt_free(rt);
1091 break;
1092 }
1093 spin_unlock_bh(rt_hash_lock_addr(hash));
1094 }
1095
1096 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1097 __be32 saddr, struct net_device *dev)
1098 {
1099 int i, k;
1100 struct in_device *in_dev = in_dev_get(dev);
1101 struct rtable *rth, **rthp;
1102 __be32 skeys[2] = { saddr, 0 };
1103 int ikeys[2] = { dev->ifindex, 0 };
1104 struct netevent_redirect netevent;
1105
1106 if (!in_dev)
1107 return;
1108
1109 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1110 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1111 goto reject_redirect;
1112
1113 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1114 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1115 goto reject_redirect;
1116 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1117 goto reject_redirect;
1118 } else {
1119 if (inet_addr_type(new_gw) != RTN_UNICAST)
1120 goto reject_redirect;
1121 }
1122
1123 for (i = 0; i < 2; i++) {
1124 for (k = 0; k < 2; k++) {
1125 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1126
1127 rthp=&rt_hash_table[hash].chain;
1128
1129 rcu_read_lock();
1130 while ((rth = rcu_dereference(*rthp)) != NULL) {
1131 struct rtable *rt;
1132
1133 if (rth->fl.fl4_dst != daddr ||
1134 rth->fl.fl4_src != skeys[i] ||
1135 rth->fl.oif != ikeys[k] ||
1136 rth->fl.iif != 0) {
1137 rthp = &rth->u.dst.rt_next;
1138 continue;
1139 }
1140
1141 if (rth->rt_dst != daddr ||
1142 rth->rt_src != saddr ||
1143 rth->u.dst.error ||
1144 rth->rt_gateway != old_gw ||
1145 rth->u.dst.dev != dev)
1146 break;
1147
1148 dst_hold(&rth->u.dst);
1149 rcu_read_unlock();
1150
1151 rt = dst_alloc(&ipv4_dst_ops);
1152 if (rt == NULL) {
1153 ip_rt_put(rth);
1154 in_dev_put(in_dev);
1155 return;
1156 }
1157
1158 /* Copy all the information. */
1159 *rt = *rth;
1160 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1161 rt->u.dst.__use = 1;
1162 atomic_set(&rt->u.dst.__refcnt, 1);
1163 rt->u.dst.child = NULL;
1164 if (rt->u.dst.dev)
1165 dev_hold(rt->u.dst.dev);
1166 if (rt->idev)
1167 in_dev_hold(rt->idev);
1168 rt->u.dst.obsolete = 0;
1169 rt->u.dst.lastuse = jiffies;
1170 rt->u.dst.path = &rt->u.dst;
1171 rt->u.dst.neighbour = NULL;
1172 rt->u.dst.hh = NULL;
1173 rt->u.dst.xfrm = NULL;
1174
1175 rt->rt_flags |= RTCF_REDIRECTED;
1176
1177 /* Gateway is different ... */
1178 rt->rt_gateway = new_gw;
1179
1180 /* Redirect received -> path was valid */
1181 dst_confirm(&rth->u.dst);
1182
1183 if (rt->peer)
1184 atomic_inc(&rt->peer->refcnt);
1185
1186 if (arp_bind_neighbour(&rt->u.dst) ||
1187 !(rt->u.dst.neighbour->nud_state &
1188 NUD_VALID)) {
1189 if (rt->u.dst.neighbour)
1190 neigh_event_send(rt->u.dst.neighbour, NULL);
1191 ip_rt_put(rth);
1192 rt_drop(rt);
1193 goto do_next;
1194 }
1195
1196 netevent.old = &rth->u.dst;
1197 netevent.new = &rt->u.dst;
1198 call_netevent_notifiers(NETEVENT_REDIRECT,
1199 &netevent);
1200
1201 rt_del(hash, rth);
1202 if (!rt_intern_hash(hash, rt, &rt))
1203 ip_rt_put(rt);
1204 goto do_next;
1205 }
1206 rcu_read_unlock();
1207 do_next:
1208 ;
1209 }
1210 }
1211 in_dev_put(in_dev);
1212 return;
1213
1214 reject_redirect:
1215 #ifdef CONFIG_IP_ROUTE_VERBOSE
1216 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1217 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1218 "%u.%u.%u.%u ignored.\n"
1219 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1220 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1221 NIPQUAD(saddr), NIPQUAD(daddr));
1222 #endif
1223 in_dev_put(in_dev);
1224 }
1225
1226 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1227 {
1228 struct rtable *rt = (struct rtable*)dst;
1229 struct dst_entry *ret = dst;
1230
1231 if (rt) {
1232 if (dst->obsolete) {
1233 ip_rt_put(rt);
1234 ret = NULL;
1235 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1236 rt->u.dst.expires) {
1237 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1238 rt->fl.oif);
1239 #if RT_CACHE_DEBUG >= 1
1240 printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1241 "%u.%u.%u.%u/%02x dropped\n",
1242 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1243 #endif
1244 rt_del(hash, rt);
1245 ret = NULL;
1246 }
1247 }
1248 return ret;
1249 }
1250
1251 /*
1252 * Algorithm:
1253 * 1. The first ip_rt_redirect_number redirects are sent
1254 * with exponential backoff, then we stop sending them at all,
1255 * assuming that the host ignores our redirects.
1256 * 2. If we did not see packets requiring redirects
1257 * during ip_rt_redirect_silence, we assume that the host
1258 * forgot the redirected route and we start sending redirects again.
1259 *
1260 * This algorithm is much cheaper and more intelligent than dumb load limiting
1261 * in icmp.c.
1262 *
1263 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1264 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1265 */
1266
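/*
 * Editorial note (not part of the original file): with the defaults
 * defined above (ip_rt_redirect_number = 9, ip_rt_redirect_load =
 * HZ/50), the gap enforced between successive redirects grows as
 * (HZ/50) << rate_tokens: 20ms, 40ms, 80ms, ... up to about 5.1s,
 * after which sending stops entirely until ip_rt_redirect_silence =
 * (HZ/50) << 10 (about 20s) of quiet resets the token counter.
 */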
1267 void ip_rt_send_redirect(struct sk_buff *skb)
1268 {
1269 struct rtable *rt = (struct rtable*)skb->dst;
1270 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1271
1272 if (!in_dev)
1273 return;
1274
1275 if (!IN_DEV_TX_REDIRECTS(in_dev))
1276 goto out;
1277
1278 /* No redirected packets during ip_rt_redirect_silence;
1279 * reset the algorithm.
1280 */
1281 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1282 rt->u.dst.rate_tokens = 0;
1283
1284 /* Too many ignored redirects; do not send anything and
1285 * set u.dst.rate_last to the last seen redirected packet.
1286 */
1287 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1288 rt->u.dst.rate_last = jiffies;
1289 goto out;
1290 }
1291
1292 /* Check for load limit; set rate_last to the latest sent
1293 * redirect.
1294 */
1295 if (rt->u.dst.rate_tokens == 0 ||
1296 time_after(jiffies,
1297 (rt->u.dst.rate_last +
1298 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1299 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1300 rt->u.dst.rate_last = jiffies;
1301 ++rt->u.dst.rate_tokens;
1302 #ifdef CONFIG_IP_ROUTE_VERBOSE
1303 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1304 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1305 net_ratelimit())
1306 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1307 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1308 NIPQUAD(rt->rt_src), rt->rt_iif,
1309 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1310 #endif
1311 }
1312 out:
1313 in_dev_put(in_dev);
1314 }
1315
1316 static int ip_error(struct sk_buff *skb)
1317 {
1318 struct rtable *rt = (struct rtable*)skb->dst;
1319 unsigned long now;
1320 int code;
1321
1322 switch (rt->u.dst.error) {
1323 case EINVAL:
1324 default:
1325 goto out;
1326 case EHOSTUNREACH:
1327 code = ICMP_HOST_UNREACH;
1328 break;
1329 case ENETUNREACH:
1330 code = ICMP_NET_UNREACH;
1331 IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1332 break;
1333 case EACCES:
1334 code = ICMP_PKT_FILTERED;
1335 break;
1336 }
1337
1338 now = jiffies;
1339 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1340 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1341 rt->u.dst.rate_tokens = ip_rt_error_burst;
1342 rt->u.dst.rate_last = now;
1343 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1344 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1345 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1346 }
1347
1348 out: kfree_skb(skb);
1349 return 0;
1350 }
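/*
 * Editorial note (not part of the original file): the rate limiting in
 * ip_error() above is a token bucket measured in jiffies.  Tokens
 * accrue at one per jiffy up to ip_rt_error_burst (5*HZ) and each ICMP
 * error costs ip_rt_error_cost (HZ), so at most about five errors can
 * be sent back to back and the sustained rate is roughly one per
 * second per destination cache entry.
 */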
1351
1352 /*
1353 * The last two values are not from the RFC but
1354 * are needed for AMPRnet AX.25 paths.
1355 */
1356
1357 static const unsigned short mtu_plateau[] =
1358 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1359
1360 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1361 {
1362 int i;
1363
1364 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1365 if (old_mtu > mtu_plateau[i])
1366 return mtu_plateau[i];
1367 return 68;
1368 }
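/*
 * Editorial note (not part of the original file): guess_mtu() walks
 * the RFC 1191 style plateau table and returns the largest plateau
 * strictly below the old MTU.  For example, old_mtu = 1500 yields
 * 1492, 1492 yields 576, and anything at or below 128 falls through
 * to the 68-byte IPv4 minimum.
 */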
1369
1370 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1371 {
1372 int i;
1373 unsigned short old_mtu = ntohs(iph->tot_len);
1374 struct rtable *rth;
1375 __be32 skeys[2] = { iph->saddr, 0, };
1376 __be32 daddr = iph->daddr;
1377 unsigned short est_mtu = 0;
1378
1379 if (ipv4_config.no_pmtu_disc)
1380 return 0;
1381
1382 for (i = 0; i < 2; i++) {
1383 unsigned hash = rt_hash(daddr, skeys[i], 0);
1384
1385 rcu_read_lock();
1386 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1387 rth = rcu_dereference(rth->u.dst.rt_next)) {
1388 if (rth->fl.fl4_dst == daddr &&
1389 rth->fl.fl4_src == skeys[i] &&
1390 rth->rt_dst == daddr &&
1391 rth->rt_src == iph->saddr &&
1392 rth->fl.iif == 0 &&
1393 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1394 unsigned short mtu = new_mtu;
1395
1396 if (new_mtu < 68 || new_mtu >= old_mtu) {
1397
1398 /* BSD 4.2 compatibility hack :-( */
1399 if (mtu == 0 &&
1400 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1401 old_mtu >= 68 + (iph->ihl << 2))
1402 old_mtu -= iph->ihl << 2;
1403
1404 mtu = guess_mtu(old_mtu);
1405 }
1406 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1407 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1408 dst_confirm(&rth->u.dst);
1409 if (mtu < ip_rt_min_pmtu) {
1410 mtu = ip_rt_min_pmtu;
1411 rth->u.dst.metrics[RTAX_LOCK-1] |=
1412 (1 << RTAX_MTU);
1413 }
1414 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1415 dst_set_expires(&rth->u.dst,
1416 ip_rt_mtu_expires);
1417 }
1418 est_mtu = mtu;
1419 }
1420 }
1421 }
1422 rcu_read_unlock();
1423 }
1424 return est_mtu ? : new_mtu;
1425 }
1426
1427 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1428 {
1429 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1430 !(dst_metric_locked(dst, RTAX_MTU))) {
1431 if (mtu < ip_rt_min_pmtu) {
1432 mtu = ip_rt_min_pmtu;
1433 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1434 }
1435 dst->metrics[RTAX_MTU-1] = mtu;
1436 dst_set_expires(dst, ip_rt_mtu_expires);
1437 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1438 }
1439 }
1440
1441 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1442 {
1443 return NULL;
1444 }
1445
1446 static void ipv4_dst_destroy(struct dst_entry *dst)
1447 {
1448 struct rtable *rt = (struct rtable *) dst;
1449 struct inet_peer *peer = rt->peer;
1450 struct in_device *idev = rt->idev;
1451
1452 if (peer) {
1453 rt->peer = NULL;
1454 inet_putpeer(peer);
1455 }
1456
1457 if (idev) {
1458 rt->idev = NULL;
1459 in_dev_put(idev);
1460 }
1461 }
1462
1463 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1464 int how)
1465 {
1466 struct rtable *rt = (struct rtable *) dst;
1467 struct in_device *idev = rt->idev;
1468 if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
1469 struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
1470 if (loopback_idev) {
1471 rt->idev = loopback_idev;
1472 in_dev_put(idev);
1473 }
1474 }
1475 }
1476
1477 static void ipv4_link_failure(struct sk_buff *skb)
1478 {
1479 struct rtable *rt;
1480
1481 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1482
1483 rt = (struct rtable *) skb->dst;
1484 if (rt)
1485 dst_set_expires(&rt->u.dst, 0);
1486 }
1487
1488 static int ip_rt_bug(struct sk_buff *skb)
1489 {
1490 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1491 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1492 skb->dev ? skb->dev->name : "?");
1493 kfree_skb(skb);
1494 return 0;
1495 }
1496
1497 /*
1498 We do not cache the source address of the outgoing interface,
1499 because it is used only by the IP RR, TS and SRR options,
1500 so it is out of the fast path.
1501
1502 BTW remember: "addr" is allowed to be unaligned
1503 in IP options!
1504 */
1505
1506 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1507 {
1508 __be32 src;
1509 struct fib_result res;
1510
1511 if (rt->fl.iif == 0)
1512 src = rt->rt_src;
1513 else if (fib_lookup(&rt->fl, &res) == 0) {
1514 src = FIB_RES_PREFSRC(res);
1515 fib_res_put(&res);
1516 } else
1517 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1518 RT_SCOPE_UNIVERSE);
1519 memcpy(addr, &src, 4);
1520 }
1521
1522 #ifdef CONFIG_NET_CLS_ROUTE
1523 static void set_class_tag(struct rtable *rt, u32 tag)
1524 {
1525 if (!(rt->u.dst.tclassid & 0xFFFF))
1526 rt->u.dst.tclassid |= tag & 0xFFFF;
1527 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1528 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1529 }
1530 #endif
1531
1532 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1533 {
1534 struct fib_info *fi = res->fi;
1535
1536 if (fi) {
1537 if (FIB_RES_GW(*res) &&
1538 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1539 rt->rt_gateway = FIB_RES_GW(*res);
1540 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1541 sizeof(rt->u.dst.metrics));
1542 if (fi->fib_mtu == 0) {
1543 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1544 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1545 rt->rt_gateway != rt->rt_dst &&
1546 rt->u.dst.dev->mtu > 576)
1547 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1548 }
1549 #ifdef CONFIG_NET_CLS_ROUTE
1550 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1551 #endif
1552 } else
1553 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1554
1555 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1556 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1557 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1558 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1559 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1560 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1561 ip_rt_min_advmss);
1562 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1563 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1564
1565 #ifdef CONFIG_NET_CLS_ROUTE
1566 #ifdef CONFIG_IP_MULTIPLE_TABLES
1567 set_class_tag(rt, fib_rules_tclass(res));
1568 #endif
1569 set_class_tag(rt, itag);
1570 #endif
1571 rt->rt_type = res->type;
1572 }
1573
1574 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1575 u8 tos, struct net_device *dev, int our)
1576 {
1577 unsigned hash;
1578 struct rtable *rth;
1579 __be32 spec_dst;
1580 struct in_device *in_dev = in_dev_get(dev);
1581 u32 itag = 0;
1582
1583 /* Primary sanity checks. */
1584
1585 if (in_dev == NULL)
1586 return -EINVAL;
1587
1588 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1589 skb->protocol != htons(ETH_P_IP))
1590 goto e_inval;
1591
1592 if (ZERONET(saddr)) {
1593 if (!LOCAL_MCAST(daddr))
1594 goto e_inval;
1595 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1596 } else if (fib_validate_source(saddr, 0, tos, 0,
1597 dev, &spec_dst, &itag) < 0)
1598 goto e_inval;
1599
1600 rth = dst_alloc(&ipv4_dst_ops);
1601 if (!rth)
1602 goto e_nobufs;
1603
1604 rth->u.dst.output= ip_rt_bug;
1605
1606 atomic_set(&rth->u.dst.__refcnt, 1);
1607 rth->u.dst.flags= DST_HOST;
1608 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1609 rth->u.dst.flags |= DST_NOPOLICY;
1610 rth->fl.fl4_dst = daddr;
1611 rth->rt_dst = daddr;
1612 rth->fl.fl4_tos = tos;
1613 rth->fl.mark = skb->mark;
1614 rth->fl.fl4_src = saddr;
1615 rth->rt_src = saddr;
1616 #ifdef CONFIG_NET_CLS_ROUTE
1617 rth->u.dst.tclassid = itag;
1618 #endif
1619 rth->rt_iif =
1620 rth->fl.iif = dev->ifindex;
1621 rth->u.dst.dev = init_net.loopback_dev;
1622 dev_hold(rth->u.dst.dev);
1623 rth->idev = in_dev_get(rth->u.dst.dev);
1624 rth->fl.oif = 0;
1625 rth->rt_gateway = daddr;
1626 rth->rt_spec_dst= spec_dst;
1627 rth->rt_type = RTN_MULTICAST;
1628 rth->rt_flags = RTCF_MULTICAST;
1629 if (our) {
1630 rth->u.dst.input= ip_local_deliver;
1631 rth->rt_flags |= RTCF_LOCAL;
1632 }
1633
1634 #ifdef CONFIG_IP_MROUTE
1635 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1636 rth->u.dst.input = ip_mr_input;
1637 #endif
1638 RT_CACHE_STAT_INC(in_slow_mc);
1639
1640 in_dev_put(in_dev);
1641 hash = rt_hash(daddr, saddr, dev->ifindex);
1642 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1643
1644 e_nobufs:
1645 in_dev_put(in_dev);
1646 return -ENOBUFS;
1647
1648 e_inval:
1649 in_dev_put(in_dev);
1650 return -EINVAL;
1651 }
1652
1653
1654 static void ip_handle_martian_source(struct net_device *dev,
1655 struct in_device *in_dev,
1656 struct sk_buff *skb,
1657 __be32 daddr,
1658 __be32 saddr)
1659 {
1660 RT_CACHE_STAT_INC(in_martian_src);
1661 #ifdef CONFIG_IP_ROUTE_VERBOSE
1662 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1663 /*
1664 * RFC1812 recommendation: if the source is martian,
1665 * the only hint is the MAC header.
1666 */
1667 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1668 "%u.%u.%u.%u, on dev %s\n",
1669 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1670 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1671 int i;
1672 const unsigned char *p = skb_mac_header(skb);
1673 printk(KERN_WARNING "ll header: ");
1674 for (i = 0; i < dev->hard_header_len; i++, p++) {
1675 printk("%02x", *p);
1676 if (i < (dev->hard_header_len - 1))
1677 printk(":");
1678 }
1679 printk("\n");
1680 }
1681 }
1682 #endif
1683 }
1684
1685 static inline int __mkroute_input(struct sk_buff *skb,
1686 struct fib_result* res,
1687 struct in_device *in_dev,
1688 __be32 daddr, __be32 saddr, u32 tos,
1689 struct rtable **result)
1690 {
1691
1692 struct rtable *rth;
1693 int err;
1694 struct in_device *out_dev;
1695 unsigned flags = 0;
1696 __be32 spec_dst;
1697 u32 itag;
1698
1699 /* get a working reference to the output device */
1700 out_dev = in_dev_get(FIB_RES_DEV(*res));
1701 if (out_dev == NULL) {
1702 if (net_ratelimit())
1703 printk(KERN_CRIT "Bug in ip_route_input" \
1704 "_slow(). Please, report\n");
1705 return -EINVAL;
1706 }
1707
1708
1709 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1710 in_dev->dev, &spec_dst, &itag);
1711 if (err < 0) {
1712 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1713 saddr);
1714
1715 err = -EINVAL;
1716 goto cleanup;
1717 }
1718
1719 if (err)
1720 flags |= RTCF_DIRECTSRC;
1721
1722 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1723 (IN_DEV_SHARED_MEDIA(out_dev) ||
1724 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1725 flags |= RTCF_DOREDIRECT;
1726
1727 if (skb->protocol != htons(ETH_P_IP)) {
1728 /* Not IP (i.e. ARP). Do not create a route if it is
1729 * invalid for proxy ARP. DNAT routes are always valid.
1730 */
1731 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1732 err = -EINVAL;
1733 goto cleanup;
1734 }
1735 }
1736
1737
1738 rth = dst_alloc(&ipv4_dst_ops);
1739 if (!rth) {
1740 err = -ENOBUFS;
1741 goto cleanup;
1742 }
1743
1744 atomic_set(&rth->u.dst.__refcnt, 1);
1745 rth->u.dst.flags= DST_HOST;
1746 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1747 rth->u.dst.flags |= DST_NOPOLICY;
1748 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1749 rth->u.dst.flags |= DST_NOXFRM;
1750 rth->fl.fl4_dst = daddr;
1751 rth->rt_dst = daddr;
1752 rth->fl.fl4_tos = tos;
1753 rth->fl.mark = skb->mark;
1754 rth->fl.fl4_src = saddr;
1755 rth->rt_src = saddr;
1756 rth->rt_gateway = daddr;
1757 rth->rt_iif =
1758 rth->fl.iif = in_dev->dev->ifindex;
1759 rth->u.dst.dev = (out_dev)->dev;
1760 dev_hold(rth->u.dst.dev);
1761 rth->idev = in_dev_get(rth->u.dst.dev);
1762 rth->fl.oif = 0;
1763 rth->rt_spec_dst= spec_dst;
1764
1765 rth->u.dst.input = ip_forward;
1766 rth->u.dst.output = ip_output;
1767
1768 rt_set_nexthop(rth, res, itag);
1769
1770 rth->rt_flags = flags;
1771
1772 *result = rth;
1773 err = 0;
1774 cleanup:
1775 /* release the working reference to the output device */
1776 in_dev_put(out_dev);
1777 return err;
1778 }
1779
1780 static inline int ip_mkroute_input(struct sk_buff *skb,
1781 struct fib_result* res,
1782 const struct flowi *fl,
1783 struct in_device *in_dev,
1784 __be32 daddr, __be32 saddr, u32 tos)
1785 {
1786 struct rtable* rth = NULL;
1787 int err;
1788 unsigned hash;
1789
1790 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1791 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1792 fib_select_multipath(fl, res);
1793 #endif
1794
1795 /* create a routing cache entry */
1796 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1797 if (err)
1798 return err;
1799
1800 /* put it into the cache */
1801 hash = rt_hash(daddr, saddr, fl->iif);
1802 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1803 }
1804
1805 /*
1806 * NOTE. We drop all packets that have local source
1807 * addresses, because every properly looped-back packet
1808 * must already have the correct destination attached by the output routine.
1809 *
1810 * This approach solves two big problems:
1811 * 1. Non-simplex devices are handled properly.
1812 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1813 */
1814
1815 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1816 u8 tos, struct net_device *dev)
1817 {
1818 struct fib_result res;
1819 struct in_device *in_dev = in_dev_get(dev);
1820 struct flowi fl = { .nl_u = { .ip4_u =
1821 { .daddr = daddr,
1822 .saddr = saddr,
1823 .tos = tos,
1824 .scope = RT_SCOPE_UNIVERSE,
1825 } },
1826 .mark = skb->mark,
1827 .iif = dev->ifindex };
1828 unsigned flags = 0;
1829 u32 itag = 0;
1830 struct rtable * rth;
1831 unsigned hash;
1832 __be32 spec_dst;
1833 int err = -EINVAL;
1834 int free_res = 0;
1835
1836 /* IP on this device is disabled. */
1837
1838 if (!in_dev)
1839 goto out;
1840
1841 /* Check for the weirdest martians, which cannot be detected
1842 by fib_lookup.
1843 */
1844
1845 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1846 goto martian_source;
1847
1848 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1849 goto brd_input;
1850
1851 /* Accept zero addresses only to the limited broadcast address;
1852 * I do not even know whether to fix this or not. Waiting for complaints :-)
1853 */
1854 if (ZERONET(saddr))
1855 goto martian_source;
1856
1857 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1858 goto martian_destination;
1859
1860 /*
1861 * Now we are ready to route the packet.
1862 */
1863 if ((err = fib_lookup(&fl, &res)) != 0) {
1864 if (!IN_DEV_FORWARD(in_dev))
1865 goto e_hostunreach;
1866 goto no_route;
1867 }
1868 free_res = 1;
1869
1870 RT_CACHE_STAT_INC(in_slow_tot);
1871
1872 if (res.type == RTN_BROADCAST)
1873 goto brd_input;
1874
1875 if (res.type == RTN_LOCAL) {
1876 int result;
1877 result = fib_validate_source(saddr, daddr, tos,
1878 init_net.loopback_dev->ifindex,
1879 dev, &spec_dst, &itag);
1880 if (result < 0)
1881 goto martian_source;
1882 if (result)
1883 flags |= RTCF_DIRECTSRC;
1884 spec_dst = daddr;
1885 goto local_input;
1886 }
1887
1888 if (!IN_DEV_FORWARD(in_dev))
1889 goto e_hostunreach;
1890 if (res.type != RTN_UNICAST)
1891 goto martian_destination;
1892
1893 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1894 done:
1895 in_dev_put(in_dev);
1896 if (free_res)
1897 fib_res_put(&res);
1898 out: return err;
1899
1900 brd_input:
1901 if (skb->protocol != htons(ETH_P_IP))
1902 goto e_inval;
1903
1904 if (ZERONET(saddr))
1905 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1906 else {
1907 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1908 &itag);
1909 if (err < 0)
1910 goto martian_source;
1911 if (err)
1912 flags |= RTCF_DIRECTSRC;
1913 }
1914 flags |= RTCF_BROADCAST;
1915 res.type = RTN_BROADCAST;
1916 RT_CACHE_STAT_INC(in_brd);
1917
1918 local_input:
1919 rth = dst_alloc(&ipv4_dst_ops);
1920 if (!rth)
1921 goto e_nobufs;
1922
1923 rth->u.dst.output= ip_rt_bug;
1924
1925 atomic_set(&rth->u.dst.__refcnt, 1);
1926 rth->u.dst.flags= DST_HOST;
1927 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1928 rth->u.dst.flags |= DST_NOPOLICY;
1929 rth->fl.fl4_dst = daddr;
1930 rth->rt_dst = daddr;
1931 rth->fl.fl4_tos = tos;
1932 rth->fl.mark = skb->mark;
1933 rth->fl.fl4_src = saddr;
1934 rth->rt_src = saddr;
1935 #ifdef CONFIG_NET_CLS_ROUTE
1936 rth->u.dst.tclassid = itag;
1937 #endif
1938 rth->rt_iif =
1939 rth->fl.iif = dev->ifindex;
1940 rth->u.dst.dev = init_net.loopback_dev;
1941 dev_hold(rth->u.dst.dev);
1942 rth->idev = in_dev_get(rth->u.dst.dev);
1943 rth->rt_gateway = daddr;
1944 rth->rt_spec_dst= spec_dst;
1945 rth->u.dst.input= ip_local_deliver;
1946 rth->rt_flags = flags|RTCF_LOCAL;
1947 if (res.type == RTN_UNREACHABLE) {
1948 rth->u.dst.input= ip_error;
1949 rth->u.dst.error= -err;
1950 rth->rt_flags &= ~RTCF_LOCAL;
1951 }
1952 rth->rt_type = res.type;
1953 hash = rt_hash(daddr, saddr, fl.iif);
1954 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1955 goto done;
1956
1957 no_route:
1958 RT_CACHE_STAT_INC(in_no_route);
1959 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1960 res.type = RTN_UNREACHABLE;
1961 if (err == -ESRCH)
1962 err = -ENETUNREACH;
1963 goto local_input;
1964
1965 /*
1966 * Do not cache martian addresses: they should be logged (RFC1812)
1967 */
1968 martian_destination:
1969 RT_CACHE_STAT_INC(in_martian_dst);
1970 #ifdef CONFIG_IP_ROUTE_VERBOSE
1971 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1972 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1973 "%u.%u.%u.%u, dev %s\n",
1974 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1975 #endif
1976
1977 e_hostunreach:
1978 err = -EHOSTUNREACH;
1979 goto done;
1980
1981 e_inval:
1982 err = -EINVAL;
1983 goto done;
1984
1985 e_nobufs:
1986 err = -ENOBUFS;
1987 goto done;
1988
1989 martian_source:
1990 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1991 goto e_inval;
1992 }
1993
1994 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1995 u8 tos, struct net_device *dev)
1996 {
1997 struct rtable * rth;
1998 unsigned hash;
1999 int iif = dev->ifindex;
2000
2001 tos &= IPTOS_RT_MASK;
2002 hash = rt_hash(daddr, saddr, iif);
2003
2004 rcu_read_lock();
2005 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2006 rth = rcu_dereference(rth->u.dst.rt_next)) {
2007 if (rth->fl.fl4_dst == daddr &&
2008 rth->fl.fl4_src == saddr &&
2009 rth->fl.iif == iif &&
2010 rth->fl.oif == 0 &&
2011 rth->fl.mark == skb->mark &&
2012 rth->fl.fl4_tos == tos) {
2013 dst_use(&rth->u.dst, jiffies);
2014 RT_CACHE_STAT_INC(in_hit);
2015 rcu_read_unlock();
2016 skb->dst = (struct dst_entry*)rth;
2017 return 0;
2018 }
2019 RT_CACHE_STAT_INC(in_hlist_search);
2020 }
2021 rcu_read_unlock();
2022
2023 /* Multicast recognition logic has moved from the route cache to here.
2024 The problem was that too many Ethernet cards have broken/missing
2025 hardware multicast filters :-( As a result, a host on a multicasting
2026 network acquires a lot of useless route cache entries, sort of
2027 SDR messages from all over the world. Now we try to get rid of them.
2028 Really, provided the software IP multicast filter is organized
2029 reasonably (at least, hashed), this does not result in a slowdown
2030 compared with route cache reject entries.
2031 Note that multicast routers are not affected, because
2032 a route cache entry is created eventually.
2033 */
2034 if (MULTICAST(daddr)) {
2035 struct in_device *in_dev;
2036
2037 rcu_read_lock();
2038 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2039 int our = ip_check_mc(in_dev, daddr, saddr,
2040 ip_hdr(skb)->protocol);
2041 if (our
2042 #ifdef CONFIG_IP_MROUTE
2043 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2044 #endif
2045 ) {
2046 rcu_read_unlock();
2047 return ip_route_input_mc(skb, daddr, saddr,
2048 tos, dev, our);
2049 }
2050 }
2051 rcu_read_unlock();
2052 return -EINVAL;
2053 }
2054 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2055 }
2056
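/*
 * A rough usage sketch of this entry point as the receive path drives
 * it (cf. ip_rcv_finish(); error handling elided, names as in this
 * kernel version):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *
 *	if (skb->dst == NULL &&
 *	    ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
 *			   skb->dev))
 *		goto drop;
 *	return dst_input(skb);
 *
 * dst_input() then invokes rth->u.dst.input, i.e. ip_local_deliver(),
 * ip_forward() or ip_mr_input(), depending on the route set up here or
 * in ip_route_input_slow()/ip_route_input_mc().
 */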
2057 static inline int __mkroute_output(struct rtable **result,
2058 struct fib_result* res,
2059 const struct flowi *fl,
2060 const struct flowi *oldflp,
2061 struct net_device *dev_out,
2062 unsigned flags)
2063 {
2064 struct rtable *rth;
2065 struct in_device *in_dev;
2066 u32 tos = RT_FL_TOS(oldflp);
2067 int err = 0;
2068
2069 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2070 return -EINVAL;
2071
2072 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2073 res->type = RTN_BROADCAST;
2074 else if (MULTICAST(fl->fl4_dst))
2075 res->type = RTN_MULTICAST;
2076 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2077 return -EINVAL;
2078
2079 if (dev_out->flags & IFF_LOOPBACK)
2080 flags |= RTCF_LOCAL;
2081
2082 /* get a work reference to the inet device */
2083 in_dev = in_dev_get(dev_out);
2084 if (!in_dev)
2085 return -EINVAL;
2086
2087 if (res->type == RTN_BROADCAST) {
2088 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2089 if (res->fi) {
2090 fib_info_put(res->fi);
2091 res->fi = NULL;
2092 }
2093 } else if (res->type == RTN_MULTICAST) {
2094 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2095 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2096 oldflp->proto))
2097 flags &= ~RTCF_LOCAL;
2098 /* If a multicast route does not exist, use the
2099 default one, but do not gateway in this case.
2100 Yes, it is a hack.
2101 */
2102 if (res->fi && res->prefixlen < 4) {
2103 fib_info_put(res->fi);
2104 res->fi = NULL;
2105 }
2106 }
2107
2108
2109 rth = dst_alloc(&ipv4_dst_ops);
2110 if (!rth) {
2111 err = -ENOBUFS;
2112 goto cleanup;
2113 }
2114
2115 atomic_set(&rth->u.dst.__refcnt, 1);
2116 rth->u.dst.flags= DST_HOST;
2117 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2118 rth->u.dst.flags |= DST_NOXFRM;
2119 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2120 rth->u.dst.flags |= DST_NOPOLICY;
2121
2122 rth->fl.fl4_dst = oldflp->fl4_dst;
2123 rth->fl.fl4_tos = tos;
2124 rth->fl.fl4_src = oldflp->fl4_src;
2125 rth->fl.oif = oldflp->oif;
2126 rth->fl.mark = oldflp->mark;
2127 rth->rt_dst = fl->fl4_dst;
2128 rth->rt_src = fl->fl4_src;
2129 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2130 /* get references to the devices that are to be held by the routing
2131 cache entry */
2132 rth->u.dst.dev = dev_out;
2133 dev_hold(dev_out);
2134 rth->idev = in_dev_get(dev_out);
2135 rth->rt_gateway = fl->fl4_dst;
2136 rth->rt_spec_dst= fl->fl4_src;
2137
2138 rth->u.dst.output=ip_output;
2139
2140 RT_CACHE_STAT_INC(out_slow_tot);
2141
2142 if (flags & RTCF_LOCAL) {
2143 rth->u.dst.input = ip_local_deliver;
2144 rth->rt_spec_dst = fl->fl4_dst;
2145 }
2146 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2147 rth->rt_spec_dst = fl->fl4_src;
2148 if (flags & RTCF_LOCAL &&
2149 !(dev_out->flags & IFF_LOOPBACK)) {
2150 rth->u.dst.output = ip_mc_output;
2151 RT_CACHE_STAT_INC(out_slow_mc);
2152 }
2153 #ifdef CONFIG_IP_MROUTE
2154 if (res->type == RTN_MULTICAST) {
2155 if (IN_DEV_MFORWARD(in_dev) &&
2156 !LOCAL_MCAST(oldflp->fl4_dst)) {
2157 rth->u.dst.input = ip_mr_input;
2158 rth->u.dst.output = ip_mc_output;
2159 }
2160 }
2161 #endif
2162 }
2163
2164 rt_set_nexthop(rth, res, 0);
2165
2166 rth->rt_flags = flags;
2167
2168 *result = rth;
2169 cleanup:
2170 /* release the work reference to the inet device */
2171 in_dev_put(in_dev);
2172
2173 return err;
2174 }
2175
2176 static inline int ip_mkroute_output(struct rtable **rp,
2177 struct fib_result* res,
2178 const struct flowi *fl,
2179 const struct flowi *oldflp,
2180 struct net_device *dev_out,
2181 unsigned flags)
2182 {
2183 struct rtable *rth = NULL;
2184 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2185 unsigned hash;
2186 if (err == 0) {
2187 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2188 err = rt_intern_hash(hash, rth, rp);
2189 }
2190
2191 return err;
2192 }
2193
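/*
 * Note on the cache key: output routes are hashed on
 * (oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif) above, mirroring the
 * fast-path match in __ip_route_output_key() further down, which in
 * sketch form requires
 *
 *	rth->fl.fl4_dst == flp->fl4_dst &&
 *	rth->fl.fl4_src == flp->fl4_src &&
 *	rth->fl.iif == 0 &&
 *	rth->fl.oif == flp->oif &&
 *	rth->fl.mark == flp->mark &&
 *	!((rth->fl.fl4_tos ^ flp->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK))
 *
 * so TOS and mark must also agree even though they are not part of the
 * hash itself.
 */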
2194 /*
2195 * Major route resolver routine.
2196 */
2197
2198 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2199 {
2200 u32 tos = RT_FL_TOS(oldflp);
2201 struct flowi fl = { .nl_u = { .ip4_u =
2202 { .daddr = oldflp->fl4_dst,
2203 .saddr = oldflp->fl4_src,
2204 .tos = tos & IPTOS_RT_MASK,
2205 .scope = ((tos & RTO_ONLINK) ?
2206 RT_SCOPE_LINK :
2207 RT_SCOPE_UNIVERSE),
2208 } },
2209 .mark = oldflp->mark,
2210 .iif = init_net.loopback_dev->ifindex,
2211 .oif = oldflp->oif };
2212 struct fib_result res;
2213 unsigned flags = 0;
2214 struct net_device *dev_out = NULL;
2215 int free_res = 0;
2216 int err;
2217
2218
2219 res.fi = NULL;
2220 #ifdef CONFIG_IP_MULTIPLE_TABLES
2221 res.r = NULL;
2222 #endif
2223
2224 if (oldflp->fl4_src) {
2225 err = -EINVAL;
2226 if (MULTICAST(oldflp->fl4_src) ||
2227 BADCLASS(oldflp->fl4_src) ||
2228 ZERONET(oldflp->fl4_src))
2229 goto out;
2230
2231 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2232 dev_out = ip_dev_find(oldflp->fl4_src);
2233 if (dev_out == NULL)
2234 goto out;
2235
2236 /* I removed the check for oif == dev_out->oif here.
2237 It was wrong for two reasons:
2238 1. ip_dev_find(saddr) can return the wrong iface, if saddr is
2239 assigned to multiple interfaces.
2240 2. Moreover, we are allowed to send packets with the saddr
2241 of another iface. --ANK
2242 */
2243
2244 if (oldflp->oif == 0
2245 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2246 /* Special hack: the user can direct multicasts
2247 and limited broadcast via the necessary interface
2248 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2249 This hack is not just for fun, it allows
2250 vic, vat and friends to work.
2251 They bind a socket to loopback, set ttl to zero
2252 and expect that it will work.
2253 From the viewpoint of the routing cache they are broken,
2254 because we are not allowed to build a multicast path
2255 with a loopback source addr (look, the routing cache
2256 cannot know that ttl is zero, so the packet
2257 will not leave this host and the route is valid).
2258 Luckily, this hack is a good workaround.
2259 */
2260
2261 fl.oif = dev_out->ifindex;
2262 goto make_route;
2263 }
2264 if (dev_out)
2265 dev_put(dev_out);
2266 dev_out = NULL;
2267 }
2268
2269
2270 if (oldflp->oif) {
2271 dev_out = dev_get_by_index(&init_net, oldflp->oif);
2272 err = -ENODEV;
2273 if (dev_out == NULL)
2274 goto out;
2275
2276 /* RACE: Check return value of inet_select_addr instead. */
2277 if (__in_dev_get_rtnl(dev_out) == NULL) {
2278 dev_put(dev_out);
2279 goto out; /* Wrong error code */
2280 }
2281
2282 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2283 if (!fl.fl4_src)
2284 fl.fl4_src = inet_select_addr(dev_out, 0,
2285 RT_SCOPE_LINK);
2286 goto make_route;
2287 }
2288 if (!fl.fl4_src) {
2289 if (MULTICAST(oldflp->fl4_dst))
2290 fl.fl4_src = inet_select_addr(dev_out, 0,
2291 fl.fl4_scope);
2292 else if (!oldflp->fl4_dst)
2293 fl.fl4_src = inet_select_addr(dev_out, 0,
2294 RT_SCOPE_HOST);
2295 }
2296 }
2297
2298 if (!fl.fl4_dst) {
2299 fl.fl4_dst = fl.fl4_src;
2300 if (!fl.fl4_dst)
2301 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2302 if (dev_out)
2303 dev_put(dev_out);
2304 dev_out = init_net.loopback_dev;
2305 dev_hold(dev_out);
2306 fl.oif = init_net.loopback_dev->ifindex;
2307 res.type = RTN_LOCAL;
2308 flags |= RTCF_LOCAL;
2309 goto make_route;
2310 }
2311
2312 if (fib_lookup(&fl, &res)) {
2313 res.fi = NULL;
2314 if (oldflp->oif) {
2315 /* Apparently, the routing tables are wrong. Assume
2316 that the destination is on-link.
2317
2318 WHY? DW.
2319 Because we are allowed to send to an iface
2320 even if it has NO routes and NO assigned
2321 addresses. When oif is specified, the routing
2322 tables are looked up with only one purpose:
2323 to check whether the destination is gatewayed, rather than
2324 direct. Moreover, if MSG_DONTROUTE is set,
2325 we send the packet, ignoring both routing tables
2326 and ifaddr state. --ANK
2327
2328
2329 We could do this even if oif is unknown
2330 (as IPv6 likely does), but we do not.
2331 */
2332
2333 if (fl.fl4_src == 0)
2334 fl.fl4_src = inet_select_addr(dev_out, 0,
2335 RT_SCOPE_LINK);
2336 res.type = RTN_UNICAST;
2337 goto make_route;
2338 }
2339 if (dev_out)
2340 dev_put(dev_out);
2341 err = -ENETUNREACH;
2342 goto out;
2343 }
2344 free_res = 1;
2345
2346 if (res.type == RTN_LOCAL) {
2347 if (!fl.fl4_src)
2348 fl.fl4_src = fl.fl4_dst;
2349 if (dev_out)
2350 dev_put(dev_out);
2351 dev_out = init_net.loopback_dev;
2352 dev_hold(dev_out);
2353 fl.oif = dev_out->ifindex;
2354 if (res.fi)
2355 fib_info_put(res.fi);
2356 res.fi = NULL;
2357 flags |= RTCF_LOCAL;
2358 goto make_route;
2359 }
2360
2361 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2362 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2363 fib_select_multipath(&fl, &res);
2364 else
2365 #endif
2366 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2367 fib_select_default(&fl, &res);
2368
2369 if (!fl.fl4_src)
2370 fl.fl4_src = FIB_RES_PREFSRC(res);
2371
2372 if (dev_out)
2373 dev_put(dev_out);
2374 dev_out = FIB_RES_DEV(res);
2375 dev_hold(dev_out);
2376 fl.oif = dev_out->ifindex;
2377
2378
2379 make_route:
2380 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2381
2382
2383 if (free_res)
2384 fib_res_put(&res);
2385 if (dev_out)
2386 dev_put(dev_out);
2387 out: return err;
2388 }
2389
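/*
 * In short, the slow path above resolves an output route in stages:
 * validate an explicit source address (if any), honour an explicit
 * oif, fall back to a loopback/local route when no destination is
 * given, consult the FIB, special-case RTN_LOCAL, multipath and
 * default routes, pick a preferred source address, and finally hand
 * the result to ip_mkroute_output() for cache insertion.
 */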
2390 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2391 {
2392 unsigned hash;
2393 struct rtable *rth;
2394
2395 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2396
2397 rcu_read_lock_bh();
2398 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2399 rth = rcu_dereference(rth->u.dst.rt_next)) {
2400 if (rth->fl.fl4_dst == flp->fl4_dst &&
2401 rth->fl.fl4_src == flp->fl4_src &&
2402 rth->fl.iif == 0 &&
2403 rth->fl.oif == flp->oif &&
2404 rth->fl.mark == flp->mark &&
2405 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2406 (IPTOS_RT_MASK | RTO_ONLINK))) {
2407 dst_use(&rth->u.dst, jiffies);
2408 RT_CACHE_STAT_INC(out_hit);
2409 rcu_read_unlock_bh();
2410 *rp = rth;
2411 return 0;
2412 }
2413 RT_CACHE_STAT_INC(out_hlist_search);
2414 }
2415 rcu_read_unlock_bh();
2416
2417 return ip_route_output_slow(rp, flp);
2418 }
2419
2420 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2421
2422 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2423 {
2424 }
2425
2426 static struct dst_ops ipv4_dst_blackhole_ops = {
2427 .family = AF_INET,
2428 .protocol = __constant_htons(ETH_P_IP),
2429 .destroy = ipv4_dst_destroy,
2430 .check = ipv4_dst_check,
2431 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2432 .entry_size = sizeof(struct rtable),
2433 };
2434
2435
2436 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2437 {
2438 struct rtable *ort = *rp;
2439 struct rtable *rt = (struct rtable *)
2440 dst_alloc(&ipv4_dst_blackhole_ops);
2441
2442 if (rt) {
2443 struct dst_entry *new = &rt->u.dst;
2444
2445 atomic_set(&new->__refcnt, 1);
2446 new->__use = 1;
2447 new->input = dst_discard;
2448 new->output = dst_discard;
2449 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2450
2451 new->dev = ort->u.dst.dev;
2452 if (new->dev)
2453 dev_hold(new->dev);
2454
2455 rt->fl = ort->fl;
2456
2457 rt->idev = ort->idev;
2458 if (rt->idev)
2459 in_dev_hold(rt->idev);
2460 rt->rt_flags = ort->rt_flags;
2461 rt->rt_type = ort->rt_type;
2462 rt->rt_dst = ort->rt_dst;
2463 rt->rt_src = ort->rt_src;
2464 rt->rt_iif = ort->rt_iif;
2465 rt->rt_gateway = ort->rt_gateway;
2466 rt->rt_spec_dst = ort->rt_spec_dst;
2467 rt->peer = ort->peer;
2468 if (rt->peer)
2469 atomic_inc(&rt->peer->refcnt);
2470
2471 dst_free(new);
2472 }
2473
2474 dst_release(&(*rp)->u.dst);
2475 *rp = rt;
2476 return (rt ? 0 : -ENOMEM);
2477 }
2478
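/*
 * ipv4_dst_blackhole() swaps a freshly resolved route for a dummy one
 * whose input/output handlers are dst_discard, while keeping the
 * original metrics and addressing.  It is used below when
 * __xfrm_lookup() returns -EREMOTE, which (roughly) means an IPsec
 * state for the flow is still being negotiated: packets are then
 * quietly dropped instead of failing the caller outright.
 */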
2479 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2480 {
2481 int err;
2482
2483 if ((err = __ip_route_output_key(rp, flp)) != 0)
2484 return err;
2485
2486 if (flp->proto) {
2487 if (!flp->fl4_src)
2488 flp->fl4_src = (*rp)->rt_src;
2489 if (!flp->fl4_dst)
2490 flp->fl4_dst = (*rp)->rt_dst;
2491 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2492 if (err == -EREMOTE)
2493 err = ipv4_dst_blackhole(rp, flp, sk);
2494
2495 return err;
2496 }
2497
2498 return 0;
2499 }
2500
2501 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2502
2503 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2504 {
2505 return ip_route_output_flow(rp, flp, NULL, 0);
2506 }
2507
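/*
 * Minimal caller sketch (all values illustrative):
 *
 *	struct rtable *rt;
 *	struct flowi fl = { .nl_u = { .ip4_u =
 *				{ .daddr = dst_ip,
 *				  .saddr = 0,
 *				  .tos   = RT_TOS(tos) } },
 *			    .oif   = 0,
 *			    .proto = IPPROTO_UDP };
 *	int err = ip_route_output_flow(&rt, &fl, sk, 0);
 *
 *	if (err)
 *		return err;
 *	...	use rt->u.dst as the packet's dst, then ip_rt_put(rt)
 *
 * ip_route_output_key() is simply the same call with a NULL socket
 * and no flags, as the wrapper above shows.
 */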
2508 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2509 int nowait, unsigned int flags)
2510 {
2511 struct rtable *rt = (struct rtable*)skb->dst;
2512 struct rtmsg *r;
2513 struct nlmsghdr *nlh;
2514 long expires;
2515 u32 id = 0, ts = 0, tsage = 0, error;
2516
2517 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2518 if (nlh == NULL)
2519 return -EMSGSIZE;
2520
2521 r = nlmsg_data(nlh);
2522 r->rtm_family = AF_INET;
2523 r->rtm_dst_len = 32;
2524 r->rtm_src_len = 0;
2525 r->rtm_tos = rt->fl.fl4_tos;
2526 r->rtm_table = RT_TABLE_MAIN;
2527 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2528 r->rtm_type = rt->rt_type;
2529 r->rtm_scope = RT_SCOPE_UNIVERSE;
2530 r->rtm_protocol = RTPROT_UNSPEC;
2531 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2532 if (rt->rt_flags & RTCF_NOTIFY)
2533 r->rtm_flags |= RTM_F_NOTIFY;
2534
2535 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2536
2537 if (rt->fl.fl4_src) {
2538 r->rtm_src_len = 32;
2539 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2540 }
2541 if (rt->u.dst.dev)
2542 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2543 #ifdef CONFIG_NET_CLS_ROUTE
2544 if (rt->u.dst.tclassid)
2545 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2546 #endif
2547 if (rt->fl.iif)
2548 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2549 else if (rt->rt_src != rt->fl.fl4_src)
2550 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2551
2552 if (rt->rt_dst != rt->rt_gateway)
2553 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2554
2555 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2556 goto nla_put_failure;
2557
2558 error = rt->u.dst.error;
2559 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2560 if (rt->peer) {
2561 id = rt->peer->ip_id_count;
2562 if (rt->peer->tcp_ts_stamp) {
2563 ts = rt->peer->tcp_ts;
2564 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2565 }
2566 }
2567
2568 if (rt->fl.iif) {
2569 #ifdef CONFIG_IP_MROUTE
2570 __be32 dst = rt->rt_dst;
2571
2572 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2573 IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2574 int err = ipmr_get_route(skb, r, nowait);
2575 if (err <= 0) {
2576 if (!nowait) {
2577 if (err == 0)
2578 return 0;
2579 goto nla_put_failure;
2580 } else {
2581 if (err == -EMSGSIZE)
2582 goto nla_put_failure;
2583 error = err;
2584 }
2585 }
2586 } else
2587 #endif
2588 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2589 }
2590
2591 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2592 expires, error) < 0)
2593 goto nla_put_failure;
2594
2595 return nlmsg_end(skb, nlh);
2596
2597 nla_put_failure:
2598 nlmsg_cancel(skb, nlh);
2599 return -EMSGSIZE;
2600 }
2601
2602 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2603 {
2604 struct net *net = in_skb->sk->sk_net;
2605 struct rtmsg *rtm;
2606 struct nlattr *tb[RTA_MAX+1];
2607 struct rtable *rt = NULL;
2608 __be32 dst = 0;
2609 __be32 src = 0;
2610 u32 iif;
2611 int err;
2612 struct sk_buff *skb;
2613
2614 if (net != &init_net)
2615 return -EINVAL;
2616
2617 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2618 if (err < 0)
2619 goto errout;
2620
2621 rtm = nlmsg_data(nlh);
2622
2623 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2624 if (skb == NULL) {
2625 err = -ENOBUFS;
2626 goto errout;
2627 }
2628
2629 /* Reserve room for dummy headers; this skb can pass
2630 through a good chunk of the routing engine.
2631 */
2632 skb_reset_mac_header(skb);
2633 skb_reset_network_header(skb);
2634
2635 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2636 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2637 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2638
2639 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2640 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2641 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2642
2643 if (iif) {
2644 struct net_device *dev;
2645
2646 dev = __dev_get_by_index(&init_net, iif);
2647 if (dev == NULL) {
2648 err = -ENODEV;
2649 goto errout_free;
2650 }
2651
2652 skb->protocol = htons(ETH_P_IP);
2653 skb->dev = dev;
2654 local_bh_disable();
2655 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2656 local_bh_enable();
2657
2658 rt = (struct rtable*) skb->dst;
2659 if (err == 0 && rt->u.dst.error)
2660 err = -rt->u.dst.error;
2661 } else {
2662 struct flowi fl = {
2663 .nl_u = {
2664 .ip4_u = {
2665 .daddr = dst,
2666 .saddr = src,
2667 .tos = rtm->rtm_tos,
2668 },
2669 },
2670 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2671 };
2672 err = ip_route_output_key(&rt, &fl);
2673 }
2674
2675 if (err)
2676 goto errout_free;
2677
2678 skb->dst = &rt->u.dst;
2679 if (rtm->rtm_flags & RTM_F_NOTIFY)
2680 rt->rt_flags |= RTCF_NOTIFY;
2681
2682 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2683 RTM_NEWROUTE, 0, 0);
2684 if (err <= 0)
2685 goto errout_free;
2686
2687 err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2688 errout:
2689 return err;
2690
2691 errout_free:
2692 kfree_skb(skb);
2693 goto errout;
2694 }
2695
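/*
 * This handler is what services RTM_GETROUTE requests such as the one
 * iproute2 emits for
 *
 *	# ip route get 10.1.2.3 from 10.0.0.1 iif eth0
 *
 * (addresses and interface are only examples): the attributes become
 * RTA_DST/RTA_SRC/RTA_IIF above, the route is resolved exactly as for
 * a real packet, and the answer is encoded by rt_fill_info().
 */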
2696 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2697 {
2698 struct rtable *rt;
2699 int h, s_h;
2700 int idx, s_idx;
2701
2702 s_h = cb->args[0];
2703 if (s_h < 0)
2704 s_h = 0;
2705 s_idx = idx = cb->args[1];
2706 for (h = s_h; h <= rt_hash_mask; h++) {
2707 rcu_read_lock_bh();
2708 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2709 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2710 if (idx < s_idx)
2711 continue;
2712 skb->dst = dst_clone(&rt->u.dst);
2713 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2714 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2715 1, NLM_F_MULTI) <= 0) {
2716 dst_release(xchg(&skb->dst, NULL));
2717 rcu_read_unlock_bh();
2718 goto done;
2719 }
2720 dst_release(xchg(&skb->dst, NULL));
2721 }
2722 rcu_read_unlock_bh();
2723 s_idx = 0;
2724 }
2725
2726 done:
2727 cb->args[0] = h;
2728 cb->args[1] = idx;
2729 return skb->len;
2730 }
2731
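/*
 * ip_rt_dump() walks every bucket of rt_hash_table under RCU and emits
 * each cached entry through rt_fill_info() as an RTM_NEWROUTE message;
 * cb->args[0]/cb->args[1] remember the bucket and chain index so a
 * dump that fills its skb can resume on the next netlink callback.
 */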
2732 void ip_rt_multicast_event(struct in_device *in_dev)
2733 {
2734 rt_cache_flush(0);
2735 }
2736
2737 #ifdef CONFIG_SYSCTL
2738 static int flush_delay;
2739
2740 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2741 struct file *filp, void __user *buffer,
2742 size_t *lenp, loff_t *ppos)
2743 {
2744 if (write) {
2745 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2746 rt_cache_flush(flush_delay);
2747 return 0;
2748 }
2749
2750 return -EINVAL;
2751 }
2752
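/*
 * Example: the cache can be flushed by hand by writing to the sysctl
 * this handler backs, e.g.
 *
 *	# echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * The value written is handed to rt_cache_flush() as the flush delay;
 * reading the file is not supported (the handler returns -EINVAL).
 */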
2753 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2754 int __user *name,
2755 int nlen,
2756 void __user *oldval,
2757 size_t __user *oldlenp,
2758 void __user *newval,
2759 size_t newlen)
2760 {
2761 int delay;
2762 if (newlen != sizeof(int))
2763 return -EINVAL;
2764 if (get_user(delay, (int __user *)newval))
2765 return -EFAULT;
2766 rt_cache_flush(delay);
2767 return 0;
2768 }
2769
2770 ctl_table ipv4_route_table[] = {
2771 {
2772 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2773 .procname = "flush",
2774 .data = &flush_delay,
2775 .maxlen = sizeof(int),
2776 .mode = 0200,
2777 .proc_handler = &ipv4_sysctl_rtcache_flush,
2778 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2779 },
2780 {
2781 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2782 .procname = "min_delay",
2783 .data = &ip_rt_min_delay,
2784 .maxlen = sizeof(int),
2785 .mode = 0644,
2786 .proc_handler = &proc_dointvec_jiffies,
2787 .strategy = &sysctl_jiffies,
2788 },
2789 {
2790 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2791 .procname = "max_delay",
2792 .data = &ip_rt_max_delay,
2793 .maxlen = sizeof(int),
2794 .mode = 0644,
2795 .proc_handler = &proc_dointvec_jiffies,
2796 .strategy = &sysctl_jiffies,
2797 },
2798 {
2799 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2800 .procname = "gc_thresh",
2801 .data = &ipv4_dst_ops.gc_thresh,
2802 .maxlen = sizeof(int),
2803 .mode = 0644,
2804 .proc_handler = &proc_dointvec,
2805 },
2806 {
2807 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2808 .procname = "max_size",
2809 .data = &ip_rt_max_size,
2810 .maxlen = sizeof(int),
2811 .mode = 0644,
2812 .proc_handler = &proc_dointvec,
2813 },
2814 {
2815 /* Deprecated. Use gc_min_interval_ms */
2816
2817 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2818 .procname = "gc_min_interval",
2819 .data = &ip_rt_gc_min_interval,
2820 .maxlen = sizeof(int),
2821 .mode = 0644,
2822 .proc_handler = &proc_dointvec_jiffies,
2823 .strategy = &sysctl_jiffies,
2824 },
2825 {
2826 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2827 .procname = "gc_min_interval_ms",
2828 .data = &ip_rt_gc_min_interval,
2829 .maxlen = sizeof(int),
2830 .mode = 0644,
2831 .proc_handler = &proc_dointvec_ms_jiffies,
2832 .strategy = &sysctl_ms_jiffies,
2833 },
2834 {
2835 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2836 .procname = "gc_timeout",
2837 .data = &ip_rt_gc_timeout,
2838 .maxlen = sizeof(int),
2839 .mode = 0644,
2840 .proc_handler = &proc_dointvec_jiffies,
2841 .strategy = &sysctl_jiffies,
2842 },
2843 {
2844 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2845 .procname = "gc_interval",
2846 .data = &ip_rt_gc_interval,
2847 .maxlen = sizeof(int),
2848 .mode = 0644,
2849 .proc_handler = &proc_dointvec_jiffies,
2850 .strategy = &sysctl_jiffies,
2851 },
2852 {
2853 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2854 .procname = "redirect_load",
2855 .data = &ip_rt_redirect_load,
2856 .maxlen = sizeof(int),
2857 .mode = 0644,
2858 .proc_handler = &proc_dointvec,
2859 },
2860 {
2861 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2862 .procname = "redirect_number",
2863 .data = &ip_rt_redirect_number,
2864 .maxlen = sizeof(int),
2865 .mode = 0644,
2866 .proc_handler = &proc_dointvec,
2867 },
2868 {
2869 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2870 .procname = "redirect_silence",
2871 .data = &ip_rt_redirect_silence,
2872 .maxlen = sizeof(int),
2873 .mode = 0644,
2874 .proc_handler = &proc_dointvec,
2875 },
2876 {
2877 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2878 .procname = "error_cost",
2879 .data = &ip_rt_error_cost,
2880 .maxlen = sizeof(int),
2881 .mode = 0644,
2882 .proc_handler = &proc_dointvec,
2883 },
2884 {
2885 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2886 .procname = "error_burst",
2887 .data = &ip_rt_error_burst,
2888 .maxlen = sizeof(int),
2889 .mode = 0644,
2890 .proc_handler = &proc_dointvec,
2891 },
2892 {
2893 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2894 .procname = "gc_elasticity",
2895 .data = &ip_rt_gc_elasticity,
2896 .maxlen = sizeof(int),
2897 .mode = 0644,
2898 .proc_handler = &proc_dointvec,
2899 },
2900 {
2901 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2902 .procname = "mtu_expires",
2903 .data = &ip_rt_mtu_expires,
2904 .maxlen = sizeof(int),
2905 .mode = 0644,
2906 .proc_handler = &proc_dointvec_jiffies,
2907 .strategy = &sysctl_jiffies,
2908 },
2909 {
2910 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2911 .procname = "min_pmtu",
2912 .data = &ip_rt_min_pmtu,
2913 .maxlen = sizeof(int),
2914 .mode = 0644,
2915 .proc_handler = &proc_dointvec,
2916 },
2917 {
2918 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2919 .procname = "min_adv_mss",
2920 .data = &ip_rt_min_advmss,
2921 .maxlen = sizeof(int),
2922 .mode = 0644,
2923 .proc_handler = &proc_dointvec,
2924 },
2925 {
2926 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
2927 .procname = "secret_interval",
2928 .data = &ip_rt_secret_interval,
2929 .maxlen = sizeof(int),
2930 .mode = 0644,
2931 .proc_handler = &proc_dointvec_jiffies,
2932 .strategy = &sysctl_jiffies,
2933 },
2934 { .ctl_name = 0 }
2935 };
2936 #endif
2937
2938 #ifdef CONFIG_NET_CLS_ROUTE
2939 struct ip_rt_acct *ip_rt_acct __read_mostly;
2940 #endif /* CONFIG_NET_CLS_ROUTE */
2941
2942 static __initdata unsigned long rhash_entries;
2943 static int __init set_rhash_entries(char *str)
2944 {
2945 if (!str)
2946 return 0;
2947 rhash_entries = simple_strtoul(str, &str, 0);
2948 return 1;
2949 }
2950 __setup("rhash_entries=", set_rhash_entries);
2951
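/*
 * The route cache hash table is normally sized from available memory
 * in ip_rt_init() below; it can be overridden at boot time, e.g. by
 * passing "rhash_entries=65536" on the kernel command line.
 */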
2952 int __init ip_rt_init(void)
2953 {
2954 int rc = 0;
2955
2956 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2957 (jiffies ^ (jiffies >> 7)));
2958
2959 #ifdef CONFIG_NET_CLS_ROUTE
2960 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
2961 if (!ip_rt_acct)
2962 panic("IP: failed to allocate ip_rt_acct\n");
2963 #endif
2964
2965 ipv4_dst_ops.kmem_cachep =
2966 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2967 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2968
2969 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2970
2971 rt_hash_table = (struct rt_hash_bucket *)
2972 alloc_large_system_hash("IP route cache",
2973 sizeof(struct rt_hash_bucket),
2974 rhash_entries,
2975 (num_physpages >= 128 * 1024) ?
2976 15 : 17,
2977 0,
2978 &rt_hash_log,
2979 &rt_hash_mask,
2980 0);
2981 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
2982 rt_hash_lock_init();
2983
2984 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2985 ip_rt_max_size = (rt_hash_mask + 1) * 16;
2986
2987 devinet_init();
2988 ip_fib_init();
2989
2990 setup_timer(&rt_flush_timer, rt_run_flush, 0);
2991 setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
2992
2993 /* All the timers started at system startup tend
2994 to synchronize. Perturb them a bit.
2995 */
2996 schedule_delayed_work(&expires_work,
2997 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
2998
2999 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3000 ip_rt_secret_interval;
3001 add_timer(&rt_secret_timer);
3002
3003 #ifdef CONFIG_PROC_FS
3004 {
3005 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3006 if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3007 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3008 init_net.proc_net_stat))) {
3009 return -ENOMEM;
3010 }
3011 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3012 }
3013 #ifdef CONFIG_NET_CLS_ROUTE
3014 create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
3015 #endif
3016 #endif
3017 #ifdef CONFIG_XFRM
3018 xfrm_init();
3019 xfrm4_init();
3020 #endif
3021 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3022
3023 return rc;
3024 }
3025
3026 EXPORT_SYMBOL(__ip_select_ident);
3027 EXPORT_SYMBOL(ip_route_input);
3028 EXPORT_SYMBOL(ip_route_output_key);