1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
39 *
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <net/dst.h>
93 #include <net/net_namespace.h>
94 #include <net/protocol.h>
95 #include <net/ip.h>
96 #include <net/route.h>
97 #include <net/inetpeer.h>
98 #include <net/sock.h>
99 #include <net/ip_fib.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #include <linux/kmemleak.h>
109 #endif
110 #include <net/secure_seq.h>
111
112 #define RT_FL_TOS(oldflp4) \
113 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
114
115 #define IP_MAX_MTU 0xFFF0
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly = 9;
124 static int ip_rt_redirect_load __read_mostly = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly = HZ;
127 static int ip_rt_error_burst __read_mostly = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly = 8;
129 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly = 256;
132
133 /*
134 * Interface to generic destination cache.
135 */
136
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void ipv4_link_failure(struct sk_buff *skb);
142 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143 struct sk_buff *skb, u32 mtu);
144 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145 struct sk_buff *skb);
146 static void ipv4_dst_destroy(struct dst_entry *dst);
147
148 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
149 int how)
150 {
151 }
152
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 {
155 WARN_ON(1);
156 return NULL;
157 }
158
159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
160 struct sk_buff *skb,
161 const void *daddr);
162
163 static struct dst_ops ipv4_dst_ops = {
164 .family = AF_INET,
165 .protocol = cpu_to_be16(ETH_P_IP),
166 .check = ipv4_dst_check,
167 .default_advmss = ipv4_default_advmss,
168 .mtu = ipv4_mtu,
169 .cow_metrics = ipv4_cow_metrics,
170 .destroy = ipv4_dst_destroy,
171 .ifdown = ipv4_dst_ifdown,
172 .negative_advice = ipv4_negative_advice,
173 .link_failure = ipv4_link_failure,
174 .update_pmtu = ip_rt_update_pmtu,
175 .redirect = ip_do_redirect,
176 .local_out = __ip_local_out,
177 .neigh_lookup = ipv4_neigh_lookup,
178 };
179
180 #define ECN_OR_COST(class) TC_PRIO_##class
181
182 const __u8 ip_tos2prio[16] = {
183 TC_PRIO_BESTEFFORT,
184 ECN_OR_COST(BESTEFFORT),
185 TC_PRIO_BESTEFFORT,
186 ECN_OR_COST(BESTEFFORT),
187 TC_PRIO_BULK,
188 ECN_OR_COST(BULK),
189 TC_PRIO_BULK,
190 ECN_OR_COST(BULK),
191 TC_PRIO_INTERACTIVE,
192 ECN_OR_COST(INTERACTIVE),
193 TC_PRIO_INTERACTIVE,
194 ECN_OR_COST(INTERACTIVE),
195 TC_PRIO_INTERACTIVE_BULK,
196 ECN_OR_COST(INTERACTIVE_BULK),
197 TC_PRIO_INTERACTIVE_BULK,
198 ECN_OR_COST(INTERACTIVE_BULK)
199 };
200 EXPORT_SYMBOL(ip_tos2prio);
201
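/* Per-cpu route cache statistics, exported below as /proc/net/stat/rt_cache. */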
202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
203 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
204
205 #ifdef CONFIG_PROC_FS
206 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207 {
208 if (*pos)
209 return NULL;
210 return SEQ_START_TOKEN;
211 }
212
213 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214 {
215 ++*pos;
216 return NULL;
217 }
218
219 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
220 {
221 }
222
223 static int rt_cache_seq_show(struct seq_file *seq, void *v)
224 {
225 if (v == SEQ_START_TOKEN)
226 seq_printf(seq, "%-127s\n",
227 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229 "HHUptod\tSpecDst");
230 return 0;
231 }
232
233 static const struct seq_operations rt_cache_seq_ops = {
234 .start = rt_cache_seq_start,
235 .next = rt_cache_seq_next,
236 .stop = rt_cache_seq_stop,
237 .show = rt_cache_seq_show,
238 };
239
240 static int rt_cache_seq_open(struct inode *inode, struct file *file)
241 {
242 return seq_open(file, &rt_cache_seq_ops);
243 }
244
245 static const struct file_operations rt_cache_seq_fops = {
246 .owner = THIS_MODULE,
247 .open = rt_cache_seq_open,
248 .read = seq_read,
249 .llseek = seq_lseek,
250 .release = seq_release,
251 };
252
253
254 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
255 {
256 int cpu;
257
258 if (*pos == 0)
259 return SEQ_START_TOKEN;
260
261 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
262 if (!cpu_possible(cpu))
263 continue;
264 *pos = cpu+1;
265 return &per_cpu(rt_cache_stat, cpu);
266 }
267 return NULL;
268 }
269
270 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
271 {
272 int cpu;
273
274 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
275 if (!cpu_possible(cpu))
276 continue;
277 *pos = cpu+1;
278 return &per_cpu(rt_cache_stat, cpu);
279 }
280 return NULL;
281
282 }
283
284 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
285 {
286
287 }
288
289 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
290 {
291 struct rt_cache_stat *st = v;
292
293 if (v == SEQ_START_TOKEN) {
294 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
295 return 0;
296 }
297
298 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
299 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
300 dst_entries_get_slow(&ipv4_dst_ops),
301 st->in_hit,
302 st->in_slow_tot,
303 st->in_slow_mc,
304 st->in_no_route,
305 st->in_brd,
306 st->in_martian_dst,
307 st->in_martian_src,
308
309 st->out_hit,
310 st->out_slow_tot,
311 st->out_slow_mc,
312
313 st->gc_total,
314 st->gc_ignored,
315 st->gc_goal_miss,
316 st->gc_dst_overflow,
317 st->in_hlist_search,
318 st->out_hlist_search
319 );
320 return 0;
321 }
322
323 static const struct seq_operations rt_cpu_seq_ops = {
324 .start = rt_cpu_seq_start,
325 .next = rt_cpu_seq_next,
326 .stop = rt_cpu_seq_stop,
327 .show = rt_cpu_seq_show,
328 };
329
330
331 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
332 {
333 return seq_open(file, &rt_cpu_seq_ops);
334 }
335
336 static const struct file_operations rt_cpu_seq_fops = {
337 .owner = THIS_MODULE,
338 .open = rt_cpu_seq_open,
339 .read = seq_read,
340 .llseek = seq_lseek,
341 .release = seq_release,
342 };
343
344 #ifdef CONFIG_IP_ROUTE_CLASSID
345 static int rt_acct_proc_show(struct seq_file *m, void *v)
346 {
347 struct ip_rt_acct *dst, *src;
348 unsigned int i, j;
349
350 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
351 if (!dst)
352 return -ENOMEM;
353
354 for_each_possible_cpu(i) {
355 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
356 for (j = 0; j < 256; j++) {
357 dst[j].o_bytes += src[j].o_bytes;
358 dst[j].o_packets += src[j].o_packets;
359 dst[j].i_bytes += src[j].i_bytes;
360 dst[j].i_packets += src[j].i_packets;
361 }
362 }
363
364 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
365 kfree(dst);
366 return 0;
367 }
368
369 static int rt_acct_proc_open(struct inode *inode, struct file *file)
370 {
371 return single_open(file, rt_acct_proc_show, NULL);
372 }
373
374 static const struct file_operations rt_acct_proc_fops = {
375 .owner = THIS_MODULE,
376 .open = rt_acct_proc_open,
377 .read = seq_read,
378 .llseek = seq_lseek,
379 .release = single_release,
380 };
381 #endif
382
383 static int __net_init ip_rt_do_proc_init(struct net *net)
384 {
385 struct proc_dir_entry *pde;
386
387 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
388 &rt_cache_seq_fops);
389 if (!pde)
390 goto err1;
391
392 pde = proc_create("rt_cache", S_IRUGO,
393 net->proc_net_stat, &rt_cpu_seq_fops);
394 if (!pde)
395 goto err2;
396
397 #ifdef CONFIG_IP_ROUTE_CLASSID
398 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
399 if (!pde)
400 goto err3;
401 #endif
402 return 0;
403
404 #ifdef CONFIG_IP_ROUTE_CLASSID
405 err3:
406 remove_proc_entry("rt_cache", net->proc_net_stat);
407 #endif
408 err2:
409 remove_proc_entry("rt_cache", net->proc_net);
410 err1:
411 return -ENOMEM;
412 }
413
414 static void __net_exit ip_rt_do_proc_exit(struct net *net)
415 {
416 remove_proc_entry("rt_cache", net->proc_net_stat);
417 remove_proc_entry("rt_cache", net->proc_net);
418 #ifdef CONFIG_IP_ROUTE_CLASSID
419 remove_proc_entry("rt_acct", net->proc_net);
420 #endif
421 }
422
423 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
424 .init = ip_rt_do_proc_init,
425 .exit = ip_rt_do_proc_exit,
426 };
427
428 static int __init ip_rt_proc_init(void)
429 {
430 return register_pernet_subsys(&ip_rt_proc_ops);
431 }
432
433 #else
434 static inline int ip_rt_proc_init(void)
435 {
436 return 0;
437 }
438 #endif /* CONFIG_PROC_FS */
439
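/* A cached route is stale once the namespace generation counter has moved on;
 * rt_cache_flush() invalidates every cached route simply by bumping it.
 */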
440 static inline bool rt_is_expired(const struct rtable *rth)
441 {
442 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
443 }
444
445 void rt_cache_flush(struct net *net)
446 {
447 rt_genid_bump(net);
448 }
449
450 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
451 struct sk_buff *skb,
452 const void *daddr)
453 {
454 struct net_device *dev = dst->dev;
455 const __be32 *pkey = daddr;
456 const struct rtable *rt;
457 struct neighbour *n;
458
459 rt = (const struct rtable *) dst;
460 if (rt->rt_gateway)
461 pkey = (const __be32 *) &rt->rt_gateway;
462 else if (skb)
463 pkey = &ip_hdr(skb)->daddr;
464
465 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
466 if (n)
467 return n;
468 return neigh_create(&arp_tbl, pkey, dev);
469 }
470
471 /*
472  * Peer allocation may fail only in serious out-of-memory conditions.  However,
473  * we can still generate some output.
474  * Random ID selection looks a bit dangerous because we have no chance of
475  * selecting an ID that is unique within a reasonable period of time.
476  * But a broken packet identifier may be better than no packet at all.
477 */
478 static void ip_select_fb_ident(struct iphdr *iph)
479 {
480 static DEFINE_SPINLOCK(ip_fb_id_lock);
481 static u32 ip_fallback_id;
482 u32 salt;
483
484 spin_lock_bh(&ip_fb_id_lock);
485 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
486 iph->id = htons(salt & 0xFFFF);
487 ip_fallback_id = salt;
488 spin_unlock_bh(&ip_fb_id_lock);
489 }
490
491 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
492 {
493 struct net *net = dev_net(dst->dev);
494 struct inet_peer *peer;
495
496 peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
497 if (peer) {
498 iph->id = htons(inet_getid(peer, more));
499 inet_putpeer(peer);
500 return;
501 }
502
503 ip_select_fb_ident(iph);
504 }
505 EXPORT_SYMBOL(__ip_select_ident);
506
507 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
508 const struct iphdr *iph,
509 int oif, u8 tos,
510 u8 prot, u32 mark, int flow_flags)
511 {
512 if (sk) {
513 const struct inet_sock *inet = inet_sk(sk);
514
515 oif = sk->sk_bound_dev_if;
516 mark = sk->sk_mark;
517 tos = RT_CONN_FLAGS(sk);
518 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
519 }
520 flowi4_init_output(fl4, oif, mark, tos,
521 RT_SCOPE_UNIVERSE, prot,
522 flow_flags,
523 iph->daddr, iph->saddr, 0, 0);
524 }
525
526 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
527 const struct sock *sk)
528 {
529 const struct iphdr *iph = ip_hdr(skb);
530 int oif = skb->dev->ifindex;
531 u8 tos = RT_TOS(iph->tos);
532 u8 prot = iph->protocol;
533 u32 mark = skb->mark;
534
535 __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
536 }
537
538 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
539 {
540 const struct inet_sock *inet = inet_sk(sk);
541 const struct ip_options_rcu *inet_opt;
542 __be32 daddr = inet->inet_daddr;
543
544 rcu_read_lock();
545 inet_opt = rcu_dereference(inet->inet_opt);
546 if (inet_opt && inet_opt->opt.srr)
547 daddr = inet_opt->opt.faddr;
548 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
549 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
550 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
551 inet_sk_flowi_flags(sk),
552 daddr, inet->inet_saddr, 0, 0);
553 rcu_read_unlock();
554 }
555
556 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
557 const struct sk_buff *skb)
558 {
559 if (skb)
560 build_skb_flow_key(fl4, skb, sk);
561 else
562 build_sk_flow_key(fl4, sk);
563 }
564
565 static inline void rt_free(struct rtable *rt)
566 {
567 call_rcu(&rt->dst.rcu_head, dst_rcu_free);
568 }
569
570 static DEFINE_SPINLOCK(fnhe_lock);
571
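/* Pick the oldest (by fnhe_stamp) exception in a bucket for reuse and drop any
 * route still cached on it.  Called with fnhe_lock held.
 */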
572 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
573 {
574 struct fib_nh_exception *fnhe, *oldest;
575 struct rtable *orig;
576
577 oldest = rcu_dereference(hash->chain);
578 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
579 fnhe = rcu_dereference(fnhe->fnhe_next)) {
580 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
581 oldest = fnhe;
582 }
583 orig = rcu_dereference(oldest->fnhe_rth);
584 if (orig) {
585 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
586 rt_free(orig);
587 }
588 return oldest;
589 }
590
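/* Fold the destination address into an index into the FNHE_HASH_SIZE buckets. */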
591 static inline u32 fnhe_hashfun(__be32 daddr)
592 {
593 u32 hval;
594
595 hval = (__force u32) daddr;
596 hval ^= (hval >> 11) ^ (hval >> 22);
597
598 return hval & (FNHE_HASH_SIZE - 1);
599 }
600
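/* Record (or refresh) a per-nexthop exception for daddr: a gateway learned from
 * an ICMP redirect and/or a PMTU value with an expiry time.  The exception hash
 * is allocated lazily; when a chain grows beyond FNHE_RECLAIM_DEPTH the oldest
 * entry in the bucket is recycled instead of allocating a new one.
 */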
601 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
602 u32 pmtu, unsigned long expires)
603 {
604 struct fnhe_hash_bucket *hash;
605 struct fib_nh_exception *fnhe;
606 int depth;
607 u32 hval = fnhe_hashfun(daddr);
608
609 spin_lock_bh(&fnhe_lock);
610
611 hash = nh->nh_exceptions;
612 if (!hash) {
613 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
614 if (!hash)
615 goto out_unlock;
616 nh->nh_exceptions = hash;
617 }
618
619 hash += hval;
620
621 depth = 0;
622 for (fnhe = rcu_dereference(hash->chain); fnhe;
623 fnhe = rcu_dereference(fnhe->fnhe_next)) {
624 if (fnhe->fnhe_daddr == daddr)
625 break;
626 depth++;
627 }
628
629 if (fnhe) {
630 if (gw)
631 fnhe->fnhe_gw = gw;
632 if (pmtu) {
633 fnhe->fnhe_pmtu = pmtu;
634 fnhe->fnhe_expires = expires;
635 }
636 } else {
637 if (depth > FNHE_RECLAIM_DEPTH)
638 fnhe = fnhe_oldest(hash);
639 else {
640 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
641 if (!fnhe)
642 goto out_unlock;
643
644 fnhe->fnhe_next = hash->chain;
645 rcu_assign_pointer(hash->chain, fnhe);
646 }
647 fnhe->fnhe_daddr = daddr;
648 fnhe->fnhe_gw = gw;
649 fnhe->fnhe_pmtu = pmtu;
650 fnhe->fnhe_expires = expires;
651 }
652
653 fnhe->fnhe_stamp = jiffies;
654
655 out_unlock:
656 spin_unlock_bh(&fnhe_lock);
657 return;
658 }
659
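/* Handle an ICMP redirect: sanity-check the redirect code, the advertising
 * gateway and the new gateway, then record the new gateway as a nexthop
 * exception and, if requested, mark the current route obsolete.
 */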
660 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
661 bool kill_route)
662 {
663 __be32 new_gw = icmp_hdr(skb)->un.gateway;
664 __be32 old_gw = ip_hdr(skb)->saddr;
665 struct net_device *dev = skb->dev;
666 struct in_device *in_dev;
667 struct fib_result res;
668 struct neighbour *n;
669 struct net *net;
670
671 switch (icmp_hdr(skb)->code & 7) {
672 case ICMP_REDIR_NET:
673 case ICMP_REDIR_NETTOS:
674 case ICMP_REDIR_HOST:
675 case ICMP_REDIR_HOSTTOS:
676 break;
677
678 default:
679 return;
680 }
681
682 if (rt->rt_gateway != old_gw)
683 return;
684
685 in_dev = __in_dev_get_rcu(dev);
686 if (!in_dev)
687 return;
688
689 net = dev_net(dev);
690 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
691 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
692 ipv4_is_zeronet(new_gw))
693 goto reject_redirect;
694
695 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
696 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
697 goto reject_redirect;
698 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
699 goto reject_redirect;
700 } else {
701 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
702 goto reject_redirect;
703 }
704
705 n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
706 if (n) {
707 if (!(n->nud_state & NUD_VALID)) {
708 neigh_event_send(n, NULL);
709 } else {
710 if (fib_lookup(net, fl4, &res) == 0) {
711 struct fib_nh *nh = &FIB_RES_NH(res);
712
713 update_or_create_fnhe(nh, fl4->daddr, new_gw,
714 0, 0);
715 }
716 if (kill_route)
717 rt->dst.obsolete = DST_OBSOLETE_KILL;
718 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
719 }
720 neigh_release(n);
721 }
722 return;
723
724 reject_redirect:
725 #ifdef CONFIG_IP_ROUTE_VERBOSE
726 if (IN_DEV_LOG_MARTIANS(in_dev)) {
727 const struct iphdr *iph = (const struct iphdr *) skb->data;
728 __be32 daddr = iph->daddr;
729 __be32 saddr = iph->saddr;
730
731 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
732 " Advised path = %pI4 -> %pI4\n",
733 &old_gw, dev->name, &new_gw,
734 &saddr, &daddr);
735 }
736 #endif
737 ;
738 }
739
740 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
741 {
742 struct rtable *rt;
743 struct flowi4 fl4;
744
745 rt = (struct rtable *) dst;
746
747 ip_rt_build_flow_key(&fl4, sk, skb);
748 __ip_do_redirect(rt, skb, &fl4, true);
749 }
750
751 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
752 {
753 struct rtable *rt = (struct rtable *)dst;
754 struct dst_entry *ret = dst;
755
756 if (rt) {
757 if (dst->obsolete > 0) {
758 ip_rt_put(rt);
759 ret = NULL;
760 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
761 rt->dst.expires) {
762 ip_rt_put(rt);
763 ret = NULL;
764 }
765 }
766 return ret;
767 }
768
769 /*
770 * Algorithm:
771 * 1. The first ip_rt_redirect_number redirects are sent
772  *	   with exponential backoff, then we stop sending them altogether,
773  *	   assuming that the host ignores our redirects.
774  * 2.	If we did not see packets requiring redirects
775  *	   during ip_rt_redirect_silence, we assume that the host
776  *	   has forgotten the redirected route and start sending redirects again.
777 *
778 * This algorithm is much cheaper and more intelligent than dumb load limiting
779 * in icmp.c.
780 *
781 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
782 * and "frag. need" (breaks PMTU discovery) in icmp.c.
783 */
784
785 void ip_rt_send_redirect(struct sk_buff *skb)
786 {
787 struct rtable *rt = skb_rtable(skb);
788 struct in_device *in_dev;
789 struct inet_peer *peer;
790 struct net *net;
791 int log_martians;
792
793 rcu_read_lock();
794 in_dev = __in_dev_get_rcu(rt->dst.dev);
795 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
796 rcu_read_unlock();
797 return;
798 }
799 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
800 rcu_read_unlock();
801
802 net = dev_net(rt->dst.dev);
803 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
804 if (!peer) {
805 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
806 return;
807 }
808
809 /* No redirected packets during ip_rt_redirect_silence;
810 * reset the algorithm.
811 */
812 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
813 peer->rate_tokens = 0;
814
815 	/* Too many ignored redirects; do not send anything and
816 	 * set peer->rate_last to the time of the last seen redirected packet.
817 */
818 if (peer->rate_tokens >= ip_rt_redirect_number) {
819 peer->rate_last = jiffies;
820 goto out_put_peer;
821 }
822
823 /* Check for load limit; set rate_last to the latest sent
824 * redirect.
825 */
826 if (peer->rate_tokens == 0 ||
827 time_after(jiffies,
828 (peer->rate_last +
829 (ip_rt_redirect_load << peer->rate_tokens)))) {
830 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
831 peer->rate_last = jiffies;
832 ++peer->rate_tokens;
833 #ifdef CONFIG_IP_ROUTE_VERBOSE
834 if (log_martians &&
835 peer->rate_tokens == ip_rt_redirect_number)
836 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
837 &ip_hdr(skb)->saddr, inet_iif(skb),
838 &ip_hdr(skb)->daddr, &rt->rt_gateway);
839 #endif
840 }
841 out_put_peer:
842 inet_putpeer(peer);
843 }
844
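/* dst.input handler for error routes: update SNMP counters and, subject to a
 * token-bucket rate limit kept in the inet_peer cache, send an ICMP
 * destination-unreachable error back to the source.
 */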
845 static int ip_error(struct sk_buff *skb)
846 {
847 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
848 struct rtable *rt = skb_rtable(skb);
849 struct inet_peer *peer;
850 unsigned long now;
851 struct net *net;
852 bool send;
853 int code;
854
855 net = dev_net(rt->dst.dev);
856 if (!IN_DEV_FORWARD(in_dev)) {
857 switch (rt->dst.error) {
858 case EHOSTUNREACH:
859 IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
860 break;
861
862 case ENETUNREACH:
863 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
864 break;
865 }
866 goto out;
867 }
868
869 switch (rt->dst.error) {
870 case EINVAL:
871 default:
872 goto out;
873 case EHOSTUNREACH:
874 code = ICMP_HOST_UNREACH;
875 break;
876 case ENETUNREACH:
877 code = ICMP_NET_UNREACH;
878 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
879 break;
880 case EACCES:
881 code = ICMP_PKT_FILTERED;
882 break;
883 }
884
885 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
886
887 send = true;
888 if (peer) {
889 now = jiffies;
890 peer->rate_tokens += now - peer->rate_last;
891 if (peer->rate_tokens > ip_rt_error_burst)
892 peer->rate_tokens = ip_rt_error_burst;
893 peer->rate_last = now;
894 if (peer->rate_tokens >= ip_rt_error_cost)
895 peer->rate_tokens -= ip_rt_error_cost;
896 else
897 send = false;
898 inet_putpeer(peer);
899 }
900 if (send)
901 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
902
903 out: kfree_skb(skb);
904 return 0;
905 }
906
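/* Clamp the learned path MTU to ip_rt_min_pmtu and store it as a nexthop
 * exception that expires after ip_rt_mtu_expires.
 */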
907 static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
908 {
909 struct fib_result res;
910
911 if (mtu < ip_rt_min_pmtu)
912 mtu = ip_rt_min_pmtu;
913
914 rcu_read_lock();
915 if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
916 struct fib_nh *nh = &FIB_RES_NH(res);
917
918 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
919 jiffies + ip_rt_mtu_expires);
920 }
921 rcu_read_unlock();
922 return mtu;
923 }
924
925 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
926 struct sk_buff *skb, u32 mtu)
927 {
928 struct rtable *rt = (struct rtable *) dst;
929 struct flowi4 fl4;
930
931 ip_rt_build_flow_key(&fl4, sk, skb);
932 mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
933
934 if (!rt->rt_pmtu) {
935 dst->obsolete = DST_OBSOLETE_KILL;
936 } else {
937 rt->rt_pmtu = mtu;
938 rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires);
939 }
940 }
941
942 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
943 int oif, u32 mark, u8 protocol, int flow_flags)
944 {
945 const struct iphdr *iph = (const struct iphdr *) skb->data;
946 struct flowi4 fl4;
947 struct rtable *rt;
948
949 __build_flow_key(&fl4, NULL, iph, oif,
950 RT_TOS(iph->tos), protocol, mark, flow_flags);
951 rt = __ip_route_output_key(net, &fl4);
952 if (!IS_ERR(rt)) {
953 __ip_rt_update_pmtu(rt, &fl4, mtu);
954 ip_rt_put(rt);
955 }
956 }
957 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
958
959 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
960 {
961 const struct iphdr *iph = (const struct iphdr *) skb->data;
962 struct flowi4 fl4;
963 struct rtable *rt;
964
965 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
966 rt = __ip_route_output_key(sock_net(sk), &fl4);
967 if (!IS_ERR(rt)) {
968 __ip_rt_update_pmtu(rt, &fl4, mtu);
969 ip_rt_put(rt);
970 }
971 }
972 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
973
974 void ipv4_redirect(struct sk_buff *skb, struct net *net,
975 int oif, u32 mark, u8 protocol, int flow_flags)
976 {
977 const struct iphdr *iph = (const struct iphdr *) skb->data;
978 struct flowi4 fl4;
979 struct rtable *rt;
980
981 __build_flow_key(&fl4, NULL, iph, oif,
982 RT_TOS(iph->tos), protocol, mark, flow_flags);
983 rt = __ip_route_output_key(net, &fl4);
984 if (!IS_ERR(rt)) {
985 __ip_do_redirect(rt, skb, &fl4, false);
986 ip_rt_put(rt);
987 }
988 }
989 EXPORT_SYMBOL_GPL(ipv4_redirect);
990
991 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
992 {
993 const struct iphdr *iph = (const struct iphdr *) skb->data;
994 struct flowi4 fl4;
995 struct rtable *rt;
996
997 __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
998 rt = __ip_route_output_key(sock_net(sk), &fl4);
999 if (!IS_ERR(rt)) {
1000 __ip_do_redirect(rt, skb, &fl4, false);
1001 ip_rt_put(rt);
1002 }
1003 }
1004 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1005
1006 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1007 {
1008 struct rtable *rt = (struct rtable *) dst;
1009
1010 /* All IPV4 dsts are created with ->obsolete set to the value
1011 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1012 * into this function always.
1013 *
1014 * When a PMTU/redirect information update invalidates a
1015 * route, this is indicated by setting obsolete to
1016 * DST_OBSOLETE_KILL.
1017 */
1018 if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1019 return NULL;
1020 return dst;
1021 }
1022
1023 static void ipv4_link_failure(struct sk_buff *skb)
1024 {
1025 struct rtable *rt;
1026
1027 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1028
1029 rt = skb_rtable(skb);
1030 if (rt)
1031 dst_set_expires(&rt->dst, 0);
1032 }
1033
1034 static int ip_rt_bug(struct sk_buff *skb)
1035 {
1036 pr_debug("%s: %pI4 -> %pI4, %s\n",
1037 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1038 skb->dev ? skb->dev->name : "?");
1039 kfree_skb(skb);
1040 WARN_ON(1);
1041 return 0;
1042 }
1043
1044 /*
1045 	We do not cache the source address of the outgoing interface,
1046 	because it is used only by the IP RR, TS and SRR options,
1047 	so it is out of the fast path.
1048 
1049 	BTW remember: "addr" is allowed to be unaligned
1050 	in IP options!
1051 */
1052
1053 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1054 {
1055 __be32 src;
1056
1057 if (rt_is_output_route(rt))
1058 src = ip_hdr(skb)->saddr;
1059 else {
1060 struct fib_result res;
1061 struct flowi4 fl4;
1062 struct iphdr *iph;
1063
1064 iph = ip_hdr(skb);
1065
1066 memset(&fl4, 0, sizeof(fl4));
1067 fl4.daddr = iph->daddr;
1068 fl4.saddr = iph->saddr;
1069 fl4.flowi4_tos = RT_TOS(iph->tos);
1070 fl4.flowi4_oif = rt->dst.dev->ifindex;
1071 fl4.flowi4_iif = skb->dev->ifindex;
1072 fl4.flowi4_mark = skb->mark;
1073
1074 rcu_read_lock();
1075 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1076 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1077 else
1078 src = inet_select_addr(rt->dst.dev,
1079 rt_nexthop(rt, iph->daddr),
1080 RT_SCOPE_UNIVERSE);
1081 rcu_read_unlock();
1082 }
1083 memcpy(addr, &src, 4);
1084 }
1085
1086 #ifdef CONFIG_IP_ROUTE_CLASSID
1087 static void set_class_tag(struct rtable *rt, u32 tag)
1088 {
1089 if (!(rt->dst.tclassid & 0xFFFF))
1090 rt->dst.tclassid |= tag & 0xFFFF;
1091 if (!(rt->dst.tclassid & 0xFFFF0000))
1092 rt->dst.tclassid |= tag & 0xFFFF0000;
1093 }
1094 #endif
1095
1096 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1097 {
1098 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1099
1100 if (advmss == 0) {
1101 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1102 ip_rt_min_advmss);
1103 if (advmss > 65535 - 40)
1104 advmss = 65535 - 40;
1105 }
1106 return advmss;
1107 }
1108
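/* Effective MTU for a route: a still-valid learned PMTU if we have one,
 * otherwise the RTAX_MTU metric or the device MTU, clamped to IP_MAX_MTU
 * (and to 576 on gatewayed routes whose MTU metric is locked).
 */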
1109 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1110 {
1111 const struct rtable *rt = (const struct rtable *) dst;
1112 unsigned int mtu = rt->rt_pmtu;
1113
1114 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1115 mtu = dst_metric_raw(dst, RTAX_MTU);
1116
1117 if (mtu && rt_is_output_route(rt))
1118 return mtu;
1119
1120 mtu = dst->dev->mtu;
1121
1122 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1123 if (rt->rt_gateway && mtu > 576)
1124 mtu = 576;
1125 }
1126
1127 if (mtu > IP_MAX_MTU)
1128 mtu = IP_MAX_MTU;
1129
1130 return mtu;
1131 }
1132
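/* Look up the nexthop exception, if any, recorded for daddr on this nexthop. */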
1133 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1134 {
1135 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1136 struct fib_nh_exception *fnhe;
1137 u32 hval;
1138
1139 if (!hash)
1140 return NULL;
1141
1142 hval = fnhe_hashfun(daddr);
1143
1144 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1145 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1146 if (fnhe->fnhe_daddr == daddr)
1147 return fnhe;
1148 }
1149 return NULL;
1150 }
1151
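/* Bind a freshly built route to a nexthop exception: apply the learned PMTU
 * and/or gateway and cache the route in fnhe_rth, replacing any previous one.
 */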
1152 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1153 __be32 daddr)
1154 {
1155 bool ret = false;
1156
1157 spin_lock_bh(&fnhe_lock);
1158
1159 if (daddr == fnhe->fnhe_daddr) {
1160 struct rtable *orig;
1161
1162 if (fnhe->fnhe_pmtu) {
1163 unsigned long expires = fnhe->fnhe_expires;
1164 unsigned long diff = expires - jiffies;
1165
1166 if (time_before(jiffies, expires)) {
1167 rt->rt_pmtu = fnhe->fnhe_pmtu;
1168 dst_set_expires(&rt->dst, diff);
1169 }
1170 }
1171 if (fnhe->fnhe_gw) {
1172 rt->rt_flags |= RTCF_REDIRECTED;
1173 rt->rt_gateway = fnhe->fnhe_gw;
1174 }
1175
1176 orig = rcu_dereference(fnhe->fnhe_rth);
1177 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1178 if (orig)
1179 rt_free(orig);
1180
1181 fnhe->fnhe_stamp = jiffies;
1182 ret = true;
1183 } else {
1184 /* Routes we intend to cache in nexthop exception have
1185 * the DST_NOCACHE bit clear. However, if we are
1186 * unsuccessful at storing this route into the cache
1187 * we really need to set it.
1188 */
1189 rt->dst.flags |= DST_NOCACHE;
1190 }
1191 spin_unlock_bh(&fnhe_lock);
1192
1193 return ret;
1194 }
1195
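/* Cache a route in the nexthop: the shared input slot for input routes, or
 * the per-cpu output slot otherwise.  The swap is done with cmpxchg(); if it
 * fails the route falls back to DST_NOCACHE.
 */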
1196 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1197 {
1198 struct rtable *orig, *prev, **p;
1199 bool ret = true;
1200
1201 if (rt_is_input_route(rt)) {
1202 p = (struct rtable **)&nh->nh_rth_input;
1203 } else {
1204 if (!nh->nh_pcpu_rth_output)
1205 goto nocache;
1206 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1207 }
1208 orig = *p;
1209
1210 prev = cmpxchg(p, orig, rt);
1211 if (prev == orig) {
1212 if (orig)
1213 rt_free(orig);
1214 } else {
1215 /* Routes we intend to cache in the FIB nexthop have
1216 * the DST_NOCACHE bit clear. However, if we are
1217 * unsuccessful at storing this route into the cache
1218 * we really need to set it.
1219 */
1220 nocache:
1221 rt->dst.flags |= DST_NOCACHE;
1222 ret = false;
1223 }
1224
1225 return ret;
1226 }
1227
1228 static DEFINE_SPINLOCK(rt_uncached_lock);
1229 static LIST_HEAD(rt_uncached_list);
1230
1231 static void rt_add_uncached_list(struct rtable *rt)
1232 {
1233 spin_lock_bh(&rt_uncached_lock);
1234 list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1235 spin_unlock_bh(&rt_uncached_lock);
1236 }
1237
1238 static void ipv4_dst_destroy(struct dst_entry *dst)
1239 {
1240 struct rtable *rt = (struct rtable *) dst;
1241
1242 if (!list_empty(&rt->rt_uncached)) {
1243 spin_lock_bh(&rt_uncached_lock);
1244 list_del(&rt->rt_uncached);
1245 spin_unlock_bh(&rt_uncached_lock);
1246 }
1247 }
1248
1249 void rt_flush_dev(struct net_device *dev)
1250 {
1251 if (!list_empty(&rt_uncached_list)) {
1252 struct net *net = dev_net(dev);
1253 struct rtable *rt;
1254
1255 spin_lock_bh(&rt_uncached_lock);
1256 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1257 if (rt->dst.dev != dev)
1258 continue;
1259 rt->dst.dev = net->loopback_dev;
1260 dev_hold(rt->dst.dev);
1261 dev_put(dev);
1262 }
1263 spin_unlock_bh(&rt_uncached_lock);
1264 }
1265 }
1266
1267 static bool rt_cache_valid(const struct rtable *rt)
1268 {
1269 return rt &&
1270 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1271 !rt_is_expired(rt);
1272 }
1273
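/* Fill in the nexthop-derived fields of a new route (gateway, metrics,
 * classid) and try to cache it, either in a matching nexthop exception or in
 * the nexthop itself; routes that cannot be cached go on the uncached list.
 */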
1274 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1275 const struct fib_result *res,
1276 struct fib_nh_exception *fnhe,
1277 struct fib_info *fi, u16 type, u32 itag)
1278 {
1279 bool cached = false;
1280
1281 if (fi) {
1282 struct fib_nh *nh = &FIB_RES_NH(*res);
1283
1284 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1285 rt->rt_gateway = nh->nh_gw;
1286 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1287 #ifdef CONFIG_IP_ROUTE_CLASSID
1288 rt->dst.tclassid = nh->nh_tclassid;
1289 #endif
1290 if (unlikely(fnhe))
1291 cached = rt_bind_exception(rt, fnhe, daddr);
1292 else if (!(rt->dst.flags & DST_NOCACHE))
1293 cached = rt_cache_route(nh, rt);
1294 }
1295 if (unlikely(!cached))
1296 rt_add_uncached_list(rt);
1297
1298 #ifdef CONFIG_IP_ROUTE_CLASSID
1299 #ifdef CONFIG_IP_MULTIPLE_TABLES
1300 set_class_tag(rt, res->tclassid);
1301 #endif
1302 set_class_tag(rt, itag);
1303 #endif
1304 }
1305
1306 static struct rtable *rt_dst_alloc(struct net_device *dev,
1307 bool nopolicy, bool noxfrm, bool will_cache)
1308 {
1309 return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1310 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1311 (nopolicy ? DST_NOPOLICY : 0) |
1312 (noxfrm ? DST_NOXFRM : 0));
1313 }
1314
1315 /* called in rcu_read_lock() section */
1316 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1317 u8 tos, struct net_device *dev, int our)
1318 {
1319 struct rtable *rth;
1320 struct in_device *in_dev = __in_dev_get_rcu(dev);
1321 u32 itag = 0;
1322 int err;
1323
1324 /* Primary sanity checks. */
1325
1326 if (in_dev == NULL)
1327 return -EINVAL;
1328
1329 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1330 skb->protocol != htons(ETH_P_IP))
1331 goto e_inval;
1332
1333 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1334 if (ipv4_is_loopback(saddr))
1335 goto e_inval;
1336
1337 if (ipv4_is_zeronet(saddr)) {
1338 if (!ipv4_is_local_multicast(daddr))
1339 goto e_inval;
1340 } else {
1341 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1342 in_dev, &itag);
1343 if (err < 0)
1344 goto e_err;
1345 }
1346 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1347 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1348 if (!rth)
1349 goto e_nobufs;
1350
1351 #ifdef CONFIG_IP_ROUTE_CLASSID
1352 rth->dst.tclassid = itag;
1353 #endif
1354 rth->dst.output = ip_rt_bug;
1355
1356 rth->rt_genid = rt_genid(dev_net(dev));
1357 rth->rt_flags = RTCF_MULTICAST;
1358 rth->rt_type = RTN_MULTICAST;
1359 rth->rt_is_input= 1;
1360 rth->rt_iif = 0;
1361 rth->rt_pmtu = 0;
1362 rth->rt_gateway = 0;
1363 INIT_LIST_HEAD(&rth->rt_uncached);
1364 if (our) {
1365 rth->dst.input= ip_local_deliver;
1366 rth->rt_flags |= RTCF_LOCAL;
1367 }
1368
1369 #ifdef CONFIG_IP_MROUTE
1370 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1371 rth->dst.input = ip_mr_input;
1372 #endif
1373 RT_CACHE_STAT_INC(in_slow_mc);
1374
1375 skb_dst_set(skb, &rth->dst);
1376 return 0;
1377
1378 e_nobufs:
1379 return -ENOBUFS;
1380 e_inval:
1381 return -EINVAL;
1382 e_err:
1383 return err;
1384 }
1385
1386
1387 static void ip_handle_martian_source(struct net_device *dev,
1388 struct in_device *in_dev,
1389 struct sk_buff *skb,
1390 __be32 daddr,
1391 __be32 saddr)
1392 {
1393 RT_CACHE_STAT_INC(in_martian_src);
1394 #ifdef CONFIG_IP_ROUTE_VERBOSE
1395 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1396 /*
1397 		 *	RFC1812 recommendation: if the source is martian,
1398 		 *	the only hint is the MAC header.
1399 */
1400 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1401 &daddr, &saddr, dev->name);
1402 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1403 print_hex_dump(KERN_WARNING, "ll header: ",
1404 DUMP_PREFIX_OFFSET, 16, 1,
1405 skb_mac_header(skb),
1406 dev->hard_header_len, true);
1407 }
1408 }
1409 #endif
1410 }
1411
1412 /* called in rcu_read_lock() section */
1413 static int __mkroute_input(struct sk_buff *skb,
1414 const struct fib_result *res,
1415 struct in_device *in_dev,
1416 __be32 daddr, __be32 saddr, u32 tos)
1417 {
1418 struct rtable *rth;
1419 int err;
1420 struct in_device *out_dev;
1421 unsigned int flags = 0;
1422 bool do_cache;
1423 u32 itag;
1424
1425 /* get a working reference to the output device */
1426 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1427 if (out_dev == NULL) {
1428 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1429 return -EINVAL;
1430 }
1431
1432
1433 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1434 in_dev->dev, in_dev, &itag);
1435 if (err < 0) {
1436 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1437 saddr);
1438
1439 goto cleanup;
1440 }
1441
1442 if (out_dev == in_dev && err &&
1443 (IN_DEV_SHARED_MEDIA(out_dev) ||
1444 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1445 flags |= RTCF_DOREDIRECT;
1446
1447 if (skb->protocol != htons(ETH_P_IP)) {
1448 		/* Not IP (i.e. ARP). Do not create a route if it is
1449 		 * invalid for proxy arp. DNAT routes are always valid.
1450 		 *
1451 		 * The proxy arp feature has been extended to allow ARP
1452 		 * replies back to the same interface, to support
1453 		 * Private VLAN switch technologies. See arp.c.
1454 */
1455 if (out_dev == in_dev &&
1456 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1457 err = -EINVAL;
1458 goto cleanup;
1459 }
1460 }
1461
1462 do_cache = false;
1463 if (res->fi) {
1464 if (!itag) {
1465 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1466 if (rt_cache_valid(rth)) {
1467 skb_dst_set_noref(skb, &rth->dst);
1468 goto out;
1469 }
1470 do_cache = true;
1471 }
1472 }
1473
1474 rth = rt_dst_alloc(out_dev->dev,
1475 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1476 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1477 if (!rth) {
1478 err = -ENOBUFS;
1479 goto cleanup;
1480 }
1481
1482 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1483 rth->rt_flags = flags;
1484 rth->rt_type = res->type;
1485 rth->rt_is_input = 1;
1486 rth->rt_iif = 0;
1487 rth->rt_pmtu = 0;
1488 rth->rt_gateway = 0;
1489 INIT_LIST_HEAD(&rth->rt_uncached);
1490
1491 rth->dst.input = ip_forward;
1492 rth->dst.output = ip_output;
1493
1494 rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1495 skb_dst_set(skb, &rth->dst);
1496 out:
1497 err = 0;
1498 cleanup:
1499 return err;
1500 }
1501
1502 static int ip_mkroute_input(struct sk_buff *skb,
1503 struct fib_result *res,
1504 const struct flowi4 *fl4,
1505 struct in_device *in_dev,
1506 __be32 daddr, __be32 saddr, u32 tos)
1507 {
1508 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1509 if (res->fi && res->fi->fib_nhs > 1)
1510 fib_select_multipath(res);
1511 #endif
1512
1513 /* create a routing cache entry */
1514 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1515 }
1516
1517 /*
1518  *	NOTE. We drop all packets that have a local source
1519  *	address, because every properly looped back packet
1520  *	must already have the correct destination attached by the output routine.
1521  *
1522  *	This approach solves two big problems:
1523  *	1. Non-simplex devices are handled properly.
1524  *	2. IP spoofing attempts are filtered with a 100% guarantee.
1525 * called with rcu_read_lock()
1526 */
1527
1528 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1529 u8 tos, struct net_device *dev)
1530 {
1531 struct fib_result res;
1532 struct in_device *in_dev = __in_dev_get_rcu(dev);
1533 struct flowi4 fl4;
1534 unsigned int flags = 0;
1535 u32 itag = 0;
1536 struct rtable *rth;
1537 int err = -EINVAL;
1538 struct net *net = dev_net(dev);
1539 bool do_cache;
1540
1541 /* IP on this device is disabled. */
1542
1543 if (!in_dev)
1544 goto out;
1545
1546 	/* Check for the weirdest martians, which cannot be detected
1547 	   by fib_lookup.
1548 */
1549
1550 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1551 goto martian_source;
1552
1553 res.fi = NULL;
1554 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1555 goto brd_input;
1556
1557 	/* Accept zero addresses only for limited broadcast;
1558 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
1559 */
1560 if (ipv4_is_zeronet(saddr))
1561 goto martian_source;
1562
1563 if (ipv4_is_zeronet(daddr))
1564 goto martian_destination;
1565
1566 	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1567 	 * calling it at most once when daddr and/or saddr is a loopback address.
1568 */
1569 if (ipv4_is_loopback(daddr)) {
1570 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1571 goto martian_destination;
1572 } else if (ipv4_is_loopback(saddr)) {
1573 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1574 goto martian_source;
1575 }
1576
1577 /*
1578 * Now we are ready to route packet.
1579 */
1580 fl4.flowi4_oif = 0;
1581 fl4.flowi4_iif = dev->ifindex;
1582 fl4.flowi4_mark = skb->mark;
1583 fl4.flowi4_tos = tos;
1584 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1585 fl4.daddr = daddr;
1586 fl4.saddr = saddr;
1587 err = fib_lookup(net, &fl4, &res);
1588 if (err != 0)
1589 goto no_route;
1590
1591 RT_CACHE_STAT_INC(in_slow_tot);
1592
1593 if (res.type == RTN_BROADCAST)
1594 goto brd_input;
1595
1596 if (res.type == RTN_LOCAL) {
1597 err = fib_validate_source(skb, saddr, daddr, tos,
1598 LOOPBACK_IFINDEX,
1599 dev, in_dev, &itag);
1600 if (err < 0)
1601 goto martian_source_keep_err;
1602 goto local_input;
1603 }
1604
1605 if (!IN_DEV_FORWARD(in_dev))
1606 goto no_route;
1607 if (res.type != RTN_UNICAST)
1608 goto martian_destination;
1609
1610 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1611 out: return err;
1612
1613 brd_input:
1614 if (skb->protocol != htons(ETH_P_IP))
1615 goto e_inval;
1616
1617 if (!ipv4_is_zeronet(saddr)) {
1618 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1619 in_dev, &itag);
1620 if (err < 0)
1621 goto martian_source_keep_err;
1622 }
1623 flags |= RTCF_BROADCAST;
1624 res.type = RTN_BROADCAST;
1625 RT_CACHE_STAT_INC(in_brd);
1626
1627 local_input:
1628 do_cache = false;
1629 if (res.fi) {
1630 if (!itag) {
1631 rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1632 if (rt_cache_valid(rth)) {
1633 skb_dst_set_noref(skb, &rth->dst);
1634 err = 0;
1635 goto out;
1636 }
1637 do_cache = true;
1638 }
1639 }
1640
1641 rth = rt_dst_alloc(net->loopback_dev,
1642 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1643 if (!rth)
1644 goto e_nobufs;
1645
1646 rth->dst.input= ip_local_deliver;
1647 rth->dst.output= ip_rt_bug;
1648 #ifdef CONFIG_IP_ROUTE_CLASSID
1649 rth->dst.tclassid = itag;
1650 #endif
1651
1652 rth->rt_genid = rt_genid(net);
1653 rth->rt_flags = flags|RTCF_LOCAL;
1654 rth->rt_type = res.type;
1655 rth->rt_is_input = 1;
1656 rth->rt_iif = 0;
1657 rth->rt_pmtu = 0;
1658 rth->rt_gateway = 0;
1659 INIT_LIST_HEAD(&rth->rt_uncached);
1660 if (res.type == RTN_UNREACHABLE) {
1661 rth->dst.input= ip_error;
1662 rth->dst.error= -err;
1663 rth->rt_flags &= ~RTCF_LOCAL;
1664 }
1665 if (do_cache)
1666 rt_cache_route(&FIB_RES_NH(res), rth);
1667 skb_dst_set(skb, &rth->dst);
1668 err = 0;
1669 goto out;
1670
1671 no_route:
1672 RT_CACHE_STAT_INC(in_no_route);
1673 res.type = RTN_UNREACHABLE;
1674 if (err == -ESRCH)
1675 err = -ENETUNREACH;
1676 goto local_input;
1677
1678 /*
1679 * Do not cache martian addresses: they should be logged (RFC1812)
1680 */
1681 martian_destination:
1682 RT_CACHE_STAT_INC(in_martian_dst);
1683 #ifdef CONFIG_IP_ROUTE_VERBOSE
1684 if (IN_DEV_LOG_MARTIANS(in_dev))
1685 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1686 &daddr, &saddr, dev->name);
1687 #endif
1688
1689 e_inval:
1690 err = -EINVAL;
1691 goto out;
1692
1693 e_nobufs:
1694 err = -ENOBUFS;
1695 goto out;
1696
1697 martian_source:
1698 err = -EINVAL;
1699 martian_source_keep_err:
1700 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1701 goto out;
1702 }
1703
1704 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1705 u8 tos, struct net_device *dev)
1706 {
1707 int res;
1708
1709 rcu_read_lock();
1710
1711 	/* Multicast recognition logic is moved from the route cache to here.
1712 	   The problem was that too many Ethernet cards have broken/missing
1713 	   hardware multicast filters :-( As a result, a host on a multicast
1714 	   network acquires a lot of useless route cache entries, a sort of
1715 	   SDR messages from all over the world. Now we try to get rid of them.
1716 	   Really, provided the software IP multicast filter is organized
1717 	   reasonably (at least, hashed), it does not result in a slowdown
1718 	   compared with route cache reject entries.
1719 	   Note that multicast routers are not affected, because a
1720 	   route cache entry is created eventually.
1721 */
1722 if (ipv4_is_multicast(daddr)) {
1723 struct in_device *in_dev = __in_dev_get_rcu(dev);
1724
1725 if (in_dev) {
1726 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1727 ip_hdr(skb)->protocol);
1728 if (our
1729 #ifdef CONFIG_IP_MROUTE
1730 ||
1731 (!ipv4_is_local_multicast(daddr) &&
1732 IN_DEV_MFORWARD(in_dev))
1733 #endif
1734 ) {
1735 int res = ip_route_input_mc(skb, daddr, saddr,
1736 tos, dev, our);
1737 rcu_read_unlock();
1738 return res;
1739 }
1740 }
1741 rcu_read_unlock();
1742 return -EINVAL;
1743 }
1744 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1745 rcu_read_unlock();
1746 return res;
1747 }
1748 EXPORT_SYMBOL(ip_route_input_noref);
1749
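/* Build and initialise the rtable for an output route once the FIB lookup
 * result is known, reusing a cached per-nexthop (or per-exception) route when
 * one is still valid.
 */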
1750 /* called with rcu_read_lock() */
1751 static struct rtable *__mkroute_output(const struct fib_result *res,
1752 const struct flowi4 *fl4, int orig_oif,
1753 struct net_device *dev_out,
1754 unsigned int flags)
1755 {
1756 struct fib_info *fi = res->fi;
1757 struct fib_nh_exception *fnhe;
1758 struct in_device *in_dev;
1759 u16 type = res->type;
1760 struct rtable *rth;
1761
1762 in_dev = __in_dev_get_rcu(dev_out);
1763 if (!in_dev)
1764 return ERR_PTR(-EINVAL);
1765
1766 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1767 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1768 return ERR_PTR(-EINVAL);
1769
1770 if (ipv4_is_lbcast(fl4->daddr))
1771 type = RTN_BROADCAST;
1772 else if (ipv4_is_multicast(fl4->daddr))
1773 type = RTN_MULTICAST;
1774 else if (ipv4_is_zeronet(fl4->daddr))
1775 return ERR_PTR(-EINVAL);
1776
1777 if (dev_out->flags & IFF_LOOPBACK)
1778 flags |= RTCF_LOCAL;
1779
1780 if (type == RTN_BROADCAST) {
1781 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1782 fi = NULL;
1783 } else if (type == RTN_MULTICAST) {
1784 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1785 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1786 fl4->flowi4_proto))
1787 flags &= ~RTCF_LOCAL;
1788 		/* If a multicast route does not exist, use the
1789 		 * default one, but do not use a gateway in this case.
1790 		 * Yes, it is a hack.
1791 */
1792 if (fi && res->prefixlen < 4)
1793 fi = NULL;
1794 }
1795
1796 fnhe = NULL;
1797 if (fi) {
1798 struct rtable __rcu **prth;
1799
1800 fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1801 if (fnhe)
1802 prth = &fnhe->fnhe_rth;
1803 else
1804 prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
1805 rth = rcu_dereference(*prth);
1806 if (rt_cache_valid(rth)) {
1807 dst_hold(&rth->dst);
1808 return rth;
1809 }
1810 }
1811 rth = rt_dst_alloc(dev_out,
1812 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1813 IN_DEV_CONF_GET(in_dev, NOXFRM),
1814 fi);
1815 if (!rth)
1816 return ERR_PTR(-ENOBUFS);
1817
1818 rth->dst.output = ip_output;
1819
1820 rth->rt_genid = rt_genid(dev_net(dev_out));
1821 rth->rt_flags = flags;
1822 rth->rt_type = type;
1823 rth->rt_is_input = 0;
1824 rth->rt_iif = orig_oif ? : 0;
1825 rth->rt_pmtu = 0;
1826 rth->rt_gateway = 0;
1827 INIT_LIST_HEAD(&rth->rt_uncached);
1828
1829 RT_CACHE_STAT_INC(out_slow_tot);
1830
1831 if (flags & RTCF_LOCAL)
1832 rth->dst.input = ip_local_deliver;
1833 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1834 if (flags & RTCF_LOCAL &&
1835 !(dev_out->flags & IFF_LOOPBACK)) {
1836 rth->dst.output = ip_mc_output;
1837 RT_CACHE_STAT_INC(out_slow_mc);
1838 }
1839 #ifdef CONFIG_IP_MROUTE
1840 if (type == RTN_MULTICAST) {
1841 if (IN_DEV_MFORWARD(in_dev) &&
1842 !ipv4_is_local_multicast(fl4->daddr)) {
1843 rth->dst.input = ip_mr_input;
1844 rth->dst.output = ip_mc_output;
1845 }
1846 }
1847 #endif
1848 }
1849
1850 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1851
1852 return rth;
1853 }
1854
1855 /*
1856 * Major route resolver routine.
1857 */
1858
1859 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1860 {
1861 struct net_device *dev_out = NULL;
1862 __u8 tos = RT_FL_TOS(fl4);
1863 unsigned int flags = 0;
1864 struct fib_result res;
1865 struct rtable *rth;
1866 int orig_oif;
1867
1868 res.tclassid = 0;
1869 res.fi = NULL;
1870 res.table = NULL;
1871
1872 orig_oif = fl4->flowi4_oif;
1873
1874 fl4->flowi4_iif = LOOPBACK_IFINDEX;
1875 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1876 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1877 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1878
1879 rcu_read_lock();
1880 if (fl4->saddr) {
1881 rth = ERR_PTR(-EINVAL);
1882 if (ipv4_is_multicast(fl4->saddr) ||
1883 ipv4_is_lbcast(fl4->saddr) ||
1884 ipv4_is_zeronet(fl4->saddr))
1885 goto out;
1886
1887 		/* I removed the check for oif == dev_out->oif here.
1888 		   It was wrong for two reasons:
1889 		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
1890 		      is assigned to multiple interfaces.
1891 		   2. Moreover, we are allowed to send packets with the saddr
1892 		      of another iface. --ANK
1893 */
1894
1895 if (fl4->flowi4_oif == 0 &&
1896 (ipv4_is_multicast(fl4->daddr) ||
1897 ipv4_is_lbcast(fl4->daddr))) {
1898 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1899 dev_out = __ip_dev_find(net, fl4->saddr, false);
1900 if (dev_out == NULL)
1901 goto out;
1902
1903 			/* Special hack: the user can direct multicasts
1904 			   and limited broadcast via the necessary interface
1905 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1906 			   This hack is not just for fun, it allows
1907 			   vic, vat and friends to work.
1908 			   They bind a socket to loopback, set ttl to zero
1909 			   and expect that it will work.
1910 			   From the viewpoint of the routing cache they are broken,
1911 			   because we are not allowed to build a multicast path
1912 			   with a loopback source addr (look, the routing cache
1913 			   cannot know that ttl is zero, so the packet
1914 			   will not leave this host and the route is valid).
1915 			   Luckily, this hack is a good workaround.
1916 */
1917
1918 fl4->flowi4_oif = dev_out->ifindex;
1919 goto make_route;
1920 }
1921
1922 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1923 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1924 if (!__ip_dev_find(net, fl4->saddr, false))
1925 goto out;
1926 }
1927 }
1928
1929
1930 if (fl4->flowi4_oif) {
1931 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1932 rth = ERR_PTR(-ENODEV);
1933 if (dev_out == NULL)
1934 goto out;
1935
1936 /* RACE: Check return value of inet_select_addr instead. */
1937 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1938 rth = ERR_PTR(-ENETUNREACH);
1939 goto out;
1940 }
1941 if (ipv4_is_local_multicast(fl4->daddr) ||
1942 ipv4_is_lbcast(fl4->daddr)) {
1943 if (!fl4->saddr)
1944 fl4->saddr = inet_select_addr(dev_out, 0,
1945 RT_SCOPE_LINK);
1946 goto make_route;
1947 }
1948 if (fl4->saddr) {
1949 if (ipv4_is_multicast(fl4->daddr))
1950 fl4->saddr = inet_select_addr(dev_out, 0,
1951 fl4->flowi4_scope);
1952 else if (!fl4->daddr)
1953 fl4->saddr = inet_select_addr(dev_out, 0,
1954 RT_SCOPE_HOST);
1955 }
1956 }
1957
1958 if (!fl4->daddr) {
1959 fl4->daddr = fl4->saddr;
1960 if (!fl4->daddr)
1961 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1962 dev_out = net->loopback_dev;
1963 fl4->flowi4_oif = LOOPBACK_IFINDEX;
1964 res.type = RTN_LOCAL;
1965 flags |= RTCF_LOCAL;
1966 goto make_route;
1967 }
1968
1969 if (fib_lookup(net, fl4, &res)) {
1970 res.fi = NULL;
1971 res.table = NULL;
1972 if (fl4->flowi4_oif) {
1973 			/* Apparently, the routing tables are wrong. Assume
1974 			   that the destination is on-link.
1975
1976 WHY? DW.
1977 Because we are allowed to send to iface
1978 even if it has NO routes and NO assigned
1979 addresses. When oif is specified, routing
1980 tables are looked up with only one purpose:
1981 to catch if destination is gatewayed, rather than
1982 direct. Moreover, if MSG_DONTROUTE is set,
1983 we send packet, ignoring both routing tables
1984 and ifaddr state. --ANK
1985
1986
1987 We could make it even if oif is unknown,
1988 likely IPv6, but we do not.
1989 */
1990
1991 if (fl4->saddr == 0)
1992 fl4->saddr = inet_select_addr(dev_out, 0,
1993 RT_SCOPE_LINK);
1994 res.type = RTN_UNICAST;
1995 goto make_route;
1996 }
1997 rth = ERR_PTR(-ENETUNREACH);
1998 goto out;
1999 }
2000
2001 if (res.type == RTN_LOCAL) {
2002 if (!fl4->saddr) {
2003 if (res.fi->fib_prefsrc)
2004 fl4->saddr = res.fi->fib_prefsrc;
2005 else
2006 fl4->saddr = fl4->daddr;
2007 }
2008 dev_out = net->loopback_dev;
2009 fl4->flowi4_oif = dev_out->ifindex;
2010 flags |= RTCF_LOCAL;
2011 goto make_route;
2012 }
2013
2014 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2015 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2016 fib_select_multipath(&res);
2017 else
2018 #endif
2019 if (!res.prefixlen &&
2020 res.table->tb_num_default > 1 &&
2021 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2022 fib_select_default(&res);
2023
2024 if (!fl4->saddr)
2025 fl4->saddr = FIB_RES_PREFSRC(net, res);
2026
2027 dev_out = FIB_RES_DEV(res);
2028 fl4->flowi4_oif = dev_out->ifindex;
2029
2030
2031 make_route:
2032 rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2033
2034 out:
2035 rcu_read_unlock();
2036 return rth;
2037 }
2038 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2039
2040 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2041 {
2042 return NULL;
2043 }
2044
2045 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2046 {
2047 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2048
2049 return mtu ? : dst->dev->mtu;
2050 }
2051
2052 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2053 struct sk_buff *skb, u32 mtu)
2054 {
2055 }
2056
2057 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2058 struct sk_buff *skb)
2059 {
2060 }
2061
2062 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2063 unsigned long old)
2064 {
2065 return NULL;
2066 }
2067
2068 static struct dst_ops ipv4_dst_blackhole_ops = {
2069 .family = AF_INET,
2070 .protocol = cpu_to_be16(ETH_P_IP),
2071 .check = ipv4_blackhole_dst_check,
2072 .mtu = ipv4_blackhole_mtu,
2073 .default_advmss = ipv4_default_advmss,
2074 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2075 .redirect = ipv4_rt_blackhole_redirect,
2076 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2077 .neigh_lookup = ipv4_neigh_lookup,
2078 };
2079
2080 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2081 {
2082 struct rtable *ort = (struct rtable *) dst_orig;
2083 struct rtable *rt;
2084
2085 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2086 if (rt) {
2087 struct dst_entry *new = &rt->dst;
2088
2089 new->__use = 1;
2090 new->input = dst_discard;
2091 new->output = dst_discard;
2092
2093 new->dev = ort->dst.dev;
2094 if (new->dev)
2095 dev_hold(new->dev);
2096
2097 rt->rt_is_input = ort->rt_is_input;
2098 rt->rt_iif = ort->rt_iif;
2099 rt->rt_pmtu = ort->rt_pmtu;
2100
2101 rt->rt_genid = rt_genid(net);
2102 rt->rt_flags = ort->rt_flags;
2103 rt->rt_type = ort->rt_type;
2104 rt->rt_gateway = ort->rt_gateway;
2105
2106 INIT_LIST_HEAD(&rt->rt_uncached);
2107
2108 dst_free(new);
2109 }
2110
2111 dst_release(dst_orig);
2112
2113 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2114 }
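
/* Illustrative sketch, hedged and not from the original source: how a caller
 * might trade an ordinary route for a blackhole copy so that packets are
 * silently discarded while the skb still carries a valid dst. The helper
 * name is hypothetical; ipv4_blackhole_route() consumes one reference on the
 * dst it is given and returns either a new, referenced dst or an ERR_PTR.
 */
static void example_blackhole_skb_dst(struct net *net, struct sk_buff *skb)
{
	struct dst_entry *old = skb_dst(skb);
	struct dst_entry *bh;

	dst_hold(old);			/* reference consumed by the call below */
	bh = ipv4_blackhole_route(net, old);
	if (IS_ERR(bh))
		return;			/* skb keeps its original dst */

	skb_dst_drop(skb);		/* drop the skb's reference on old */
	skb_dst_set(skb, bh);		/* skb now owns the discarding copy */
}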
2115
2116 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2117 struct sock *sk)
2118 {
2119 struct rtable *rt = __ip_route_output_key(net, flp4);
2120
2121 if (IS_ERR(rt))
2122 return rt;
2123
2124 if (flp4->flowi4_proto)
2125 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2126 flowi4_to_flowi(flp4),
2127 sk, 0);
2128
2129 return rt;
2130 }
2131 EXPORT_SYMBOL_GPL(ip_route_output_flow);
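
/* Illustrative sketch, not part of the original file: how a datagram-style
 * sender might use ip_route_output_flow(). Because flowi4_proto is non-zero,
 * the xfrm_lookup() call above gets a chance to wrap the route in an IPsec
 * bundle. The function name and its address/port parameters are hypothetical.
 */
static struct rtable *example_udp_style_route(struct net *net, struct sock *sk,
					      __be32 daddr, __be16 dport)
{
	struct flowi4 fl4;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.flowi4_proto = IPPROTO_UDP;
	fl4.fl4_dport = dport;

	/* Returns the route or an ERR_PTR; the caller owns one reference
	 * and releases it with ip_rt_put() when done.
	 */
	return ip_route_output_flow(net, &fl4, sk);
}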
2132
2133 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2134 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2135 u32 seq, int event, int nowait, unsigned int flags)
2136 {
2137 struct rtable *rt = skb_rtable(skb);
2138 struct rtmsg *r;
2139 struct nlmsghdr *nlh;
2140 unsigned long expires = 0;
2141 u32 error;
2142 u32 metrics[RTAX_MAX];
2143
2144 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2145 if (nlh == NULL)
2146 return -EMSGSIZE;
2147
2148 r = nlmsg_data(nlh);
2149 r->rtm_family = AF_INET;
2150 r->rtm_dst_len = 32;
2151 r->rtm_src_len = 0;
2152 r->rtm_tos = fl4->flowi4_tos;
2153 r->rtm_table = RT_TABLE_MAIN;
2154 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2155 goto nla_put_failure;
2156 r->rtm_type = rt->rt_type;
2157 r->rtm_scope = RT_SCOPE_UNIVERSE;
2158 r->rtm_protocol = RTPROT_UNSPEC;
2159 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2160 if (rt->rt_flags & RTCF_NOTIFY)
2161 r->rtm_flags |= RTM_F_NOTIFY;
2162
2163 if (nla_put_be32(skb, RTA_DST, dst))
2164 goto nla_put_failure;
2165 if (src) {
2166 r->rtm_src_len = 32;
2167 if (nla_put_be32(skb, RTA_SRC, src))
2168 goto nla_put_failure;
2169 }
2170 if (rt->dst.dev &&
2171 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2172 goto nla_put_failure;
2173 #ifdef CONFIG_IP_ROUTE_CLASSID
2174 if (rt->dst.tclassid &&
2175 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2176 goto nla_put_failure;
2177 #endif
2178 if (!rt_is_input_route(rt) &&
2179 fl4->saddr != src) {
2180 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2181 goto nla_put_failure;
2182 }
2183 if (rt->rt_gateway &&
2184 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2185 goto nla_put_failure;
2186
2187 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2188 if (rt->rt_pmtu)
2189 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2190 if (rtnetlink_put_metrics(skb, metrics) < 0)
2191 goto nla_put_failure;
2192
2193 if (fl4->flowi4_mark &&
2194 nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
2195 goto nla_put_failure;
2196
2197 error = rt->dst.error;
2198 expires = rt->dst.expires;
2199 if (expires) {
2200 if (time_before(jiffies, expires))
2201 expires -= jiffies;
2202 else
2203 expires = 0;
2204 }
2205
2206 if (rt_is_input_route(rt)) {
2207 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2208 goto nla_put_failure;
2209 }
2210
2211 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2212 goto nla_put_failure;
2213
2214 return nlmsg_end(skb, nlh);
2215
2216 nla_put_failure:
2217 nlmsg_cancel(skb, nlh);
2218 return -EMSGSIZE;
2219 }
2220
2221 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2222 {
2223 struct net *net = sock_net(in_skb->sk);
2224 struct rtmsg *rtm;
2225 struct nlattr *tb[RTA_MAX+1];
2226 struct rtable *rt = NULL;
2227 struct flowi4 fl4;
2228 __be32 dst = 0;
2229 __be32 src = 0;
2230 u32 iif;
2231 int err;
2232 int mark;
2233 struct sk_buff *skb;
2234
2235 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2236 if (err < 0)
2237 goto errout;
2238
2239 rtm = nlmsg_data(nlh);
2240
2241 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2242 if (skb == NULL) {
2243 err = -ENOBUFS;
2244 goto errout;
2245 }
2246
2247 /* Reserve room for dummy headers; this skb can pass
2248 through a good chunk of the routing engine.
2249 */
2250 skb_reset_mac_header(skb);
2251 skb_reset_network_header(skb);
2252
2253 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2254 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2255 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2256
2257 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2258 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2259 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2260 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2261
2262 memset(&fl4, 0, sizeof(fl4));
2263 fl4.daddr = dst;
2264 fl4.saddr = src;
2265 fl4.flowi4_tos = rtm->rtm_tos;
2266 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2267 fl4.flowi4_mark = mark;
2268
2269 if (iif) {
2270 struct net_device *dev;
2271
2272 dev = __dev_get_by_index(net, iif);
2273 if (dev == NULL) {
2274 err = -ENODEV;
2275 goto errout_free;
2276 }
2277
2278 skb->protocol = htons(ETH_P_IP);
2279 skb->dev = dev;
2280 skb->mark = mark;
2281 local_bh_disable();
2282 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2283 local_bh_enable();
2284
2285 rt = skb_rtable(skb);
2286 if (err == 0 && rt->dst.error)
2287 err = -rt->dst.error;
2288 } else {
2289 rt = ip_route_output_key(net, &fl4);
2290
2291 err = 0;
2292 if (IS_ERR(rt))
2293 err = PTR_ERR(rt);
2294 }
2295
2296 if (err)
2297 goto errout_free;
2298
2299 skb_dst_set(skb, &rt->dst);
2300 if (rtm->rtm_flags & RTM_F_NOTIFY)
2301 rt->rt_flags |= RTCF_NOTIFY;
2302
2303 err = rt_fill_info(net, dst, src, &fl4, skb,
2304 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2305 RTM_NEWROUTE, 0, 0);
2306 if (err <= 0)
2307 goto errout_free;
2308
2309 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2310 errout:
2311 return err;
2312
2313 errout_free:
2314 kfree_skb(skb);
2315 goto errout;
2316 }
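
/* Illustrative sketch of the other side of this interface (userspace C, shown
 * here only as a comment and not part of this file): a minimal RTM_GETROUTE
 * request, roughly what "ip route get <addr>" sends. inet_rtm_getroute()
 * above answers it with the RTM_NEWROUTE message built by rt_fill_info().
 * The buffer size and error handling are kept deliberately small.
 *
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <sys/socket.h>
 *	#include <linux/netlink.h>
 *	#include <linux/rtnetlink.h>
 *
 *	static int get_route(__be32 daddr)
 *	{
 *		struct {
 *			struct nlmsghdr	nlh;
 *			struct rtmsg	rtm;
 *			char		attrs[64];
 *		} req;
 *		struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
 *		struct rtattr *rta;
 *		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 *
 *		if (fd < 0)
 *			return -1;
 *
 *		memset(&req, 0, sizeof(req));
 *		req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
 *		req.nlh.nlmsg_type = RTM_GETROUTE;
 *		req.nlh.nlmsg_flags = NLM_F_REQUEST;
 *		req.rtm.rtm_family = AF_INET;
 *
 *		rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
 *		rta->rta_type = RTA_DST;
 *		rta->rta_len = RTA_LENGTH(sizeof(daddr));
 *		memcpy(RTA_DATA(rta), &daddr, sizeof(daddr));
 *		req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + rta->rta_len;
 *
 *		if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
 *			   (struct sockaddr *)&sa, sizeof(sa)) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		... recv() the RTM_NEWROUTE reply and walk its attributes ...
 *		close(fd);
 *		return 0;
 *	}
 */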
2317
2318 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2319 {
2320 return skb->len;
2321 }
2322
2323 void ip_rt_multicast_event(struct in_device *in_dev)
2324 {
2325 rt_cache_flush(dev_net(in_dev->dev));
2326 }
2327
2328 #ifdef CONFIG_SYSCTL
2329 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2330 void __user *buffer,
2331 size_t *lenp, loff_t *ppos)
2332 {
2333 if (write) {
2334 rt_cache_flush((struct net *)__ctl->extra1);
2335 return 0;
2336 }
2337
2338 return -EINVAL;
2339 }
2340
2341 static ctl_table ipv4_route_table[] = {
2342 {
2343 .procname = "gc_thresh",
2344 .data = &ipv4_dst_ops.gc_thresh,
2345 .maxlen = sizeof(int),
2346 .mode = 0644,
2347 .proc_handler = proc_dointvec,
2348 },
2349 {
2350 .procname = "max_size",
2351 .data = &ip_rt_max_size,
2352 .maxlen = sizeof(int),
2353 .mode = 0644,
2354 .proc_handler = proc_dointvec,
2355 },
2356 {
2357 /* Deprecated. Use gc_min_interval_ms */
2358
2359 .procname = "gc_min_interval",
2360 .data = &ip_rt_gc_min_interval,
2361 .maxlen = sizeof(int),
2362 .mode = 0644,
2363 .proc_handler = proc_dointvec_jiffies,
2364 },
2365 {
2366 .procname = "gc_min_interval_ms",
2367 .data = &ip_rt_gc_min_interval,
2368 .maxlen = sizeof(int),
2369 .mode = 0644,
2370 .proc_handler = proc_dointvec_ms_jiffies,
2371 },
2372 {
2373 .procname = "gc_timeout",
2374 .data = &ip_rt_gc_timeout,
2375 .maxlen = sizeof(int),
2376 .mode = 0644,
2377 .proc_handler = proc_dointvec_jiffies,
2378 },
2379 {
2380 .procname = "gc_interval",
2381 .data = &ip_rt_gc_interval,
2382 .maxlen = sizeof(int),
2383 .mode = 0644,
2384 .proc_handler = proc_dointvec_jiffies,
2385 },
2386 {
2387 .procname = "redirect_load",
2388 .data = &ip_rt_redirect_load,
2389 .maxlen = sizeof(int),
2390 .mode = 0644,
2391 .proc_handler = proc_dointvec,
2392 },
2393 {
2394 .procname = "redirect_number",
2395 .data = &ip_rt_redirect_number,
2396 .maxlen = sizeof(int),
2397 .mode = 0644,
2398 .proc_handler = proc_dointvec,
2399 },
2400 {
2401 .procname = "redirect_silence",
2402 .data = &ip_rt_redirect_silence,
2403 .maxlen = sizeof(int),
2404 .mode = 0644,
2405 .proc_handler = proc_dointvec,
2406 },
2407 {
2408 .procname = "error_cost",
2409 .data = &ip_rt_error_cost,
2410 .maxlen = sizeof(int),
2411 .mode = 0644,
2412 .proc_handler = proc_dointvec,
2413 },
2414 {
2415 .procname = "error_burst",
2416 .data = &ip_rt_error_burst,
2417 .maxlen = sizeof(int),
2418 .mode = 0644,
2419 .proc_handler = proc_dointvec,
2420 },
2421 {
2422 .procname = "gc_elasticity",
2423 .data = &ip_rt_gc_elasticity,
2424 .maxlen = sizeof(int),
2425 .mode = 0644,
2426 .proc_handler = proc_dointvec,
2427 },
2428 {
2429 .procname = "mtu_expires",
2430 .data = &ip_rt_mtu_expires,
2431 .maxlen = sizeof(int),
2432 .mode = 0644,
2433 .proc_handler = proc_dointvec_jiffies,
2434 },
2435 {
2436 .procname = "min_pmtu",
2437 .data = &ip_rt_min_pmtu,
2438 .maxlen = sizeof(int),
2439 .mode = 0644,
2440 .proc_handler = proc_dointvec,
2441 },
2442 {
2443 .procname = "min_adv_mss",
2444 .data = &ip_rt_min_advmss,
2445 .maxlen = sizeof(int),
2446 .mode = 0644,
2447 .proc_handler = proc_dointvec,
2448 },
2449 { }
2450 };
2451
2452 static struct ctl_table ipv4_route_flush_table[] = {
2453 {
2454 .procname = "flush",
2455 .maxlen = sizeof(int),
2456 .mode = 0200,
2457 .proc_handler = ipv4_sysctl_rtcache_flush,
2458 },
2459 { },
2460 };
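
/* Illustrative note, not part of the original file: the two tables above are
 * exposed under /proc/sys/net/ipv4/route/<procname>. The per-namespace
 * "flush" entry is write-only; any write invokes ipv4_sysctl_rtcache_flush()
 * and flushes the routing cache, e.g. "echo 1 > /proc/sys/net/ipv4/route/flush".
 * A minimal userspace equivalent (shown only as a comment):
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static int flush_route_cache(void)
 *	{
 *		int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);
 *
 *		if (fd < 0)
 *			return -1;
 *		if (write(fd, "1\n", 2) != 2) {
 *			close(fd);
 *			return -1;
 *		}
 *		return close(fd);
 *	}
 */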
2461
2462 static __net_init int sysctl_route_net_init(struct net *net)
2463 {
2464 struct ctl_table *tbl;
2465
2466 tbl = ipv4_route_flush_table;
2467 if (!net_eq(net, &init_net)) {
2468 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2469 if (tbl == NULL)
2470 goto err_dup;
2471 }
2472 tbl[0].extra1 = net;
2473
2474 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2475 if (net->ipv4.route_hdr == NULL)
2476 goto err_reg;
2477 return 0;
2478
2479 err_reg:
2480 if (tbl != ipv4_route_flush_table)
2481 kfree(tbl);
2482 err_dup:
2483 return -ENOMEM;
2484 }
2485
2486 static __net_exit void sysctl_route_net_exit(struct net *net)
2487 {
2488 struct ctl_table *tbl;
2489
2490 tbl = net->ipv4.route_hdr->ctl_table_arg;
2491 unregister_net_sysctl_table(net->ipv4.route_hdr);
2492 BUG_ON(tbl == ipv4_route_flush_table);
2493 kfree(tbl);
2494 }
2495
2496 static __net_initdata struct pernet_operations sysctl_route_ops = {
2497 .init = sysctl_route_net_init,
2498 .exit = sysctl_route_net_exit,
2499 };
2500 #endif
2501
2502 static __net_init int rt_genid_init(struct net *net)
2503 {
2504 atomic_set(&net->rt_genid, 0);
2505 get_random_bytes(&net->ipv4.dev_addr_genid,
2506 sizeof(net->ipv4.dev_addr_genid));
2507 return 0;
2508 }
2509
2510 static __net_initdata struct pernet_operations rt_genid_ops = {
2511 .init = rt_genid_init,
2512 };
2513
2514 static int __net_init ipv4_inetpeer_init(struct net *net)
2515 {
2516 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2517
2518 if (!bp)
2519 return -ENOMEM;
2520 inet_peer_base_init(bp);
2521 net->ipv4.peers = bp;
2522 return 0;
2523 }
2524
2525 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2526 {
2527 struct inet_peer_base *bp = net->ipv4.peers;
2528
2529 net->ipv4.peers = NULL;
2530 inetpeer_invalidate_tree(bp);
2531 kfree(bp);
2532 }
2533
2534 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2535 .init = ipv4_inetpeer_init,
2536 .exit = ipv4_inetpeer_exit,
2537 };
2538
2539 #ifdef CONFIG_IP_ROUTE_CLASSID
2540 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2541 #endif /* CONFIG_IP_ROUTE_CLASSID */
2542
2543 int __init ip_rt_init(void)
2544 {
2545 int rc = 0;
2546
2547 #ifdef CONFIG_IP_ROUTE_CLASSID
2548 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2549 if (!ip_rt_acct)
2550 panic("IP: failed to allocate ip_rt_acct\n");
2551 #endif
2552
2553 ipv4_dst_ops.kmem_cachep =
2554 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2555 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2556
2557 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2558
2559 if (dst_entries_init(&ipv4_dst_ops) < 0)
2560 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2561
2562 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2563 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2564
2565 ipv4_dst_ops.gc_thresh = ~0;
2566 ip_rt_max_size = INT_MAX;
2567
2568 devinet_init();
2569 ip_fib_init();
2570
2571 if (ip_rt_proc_init())
2572 pr_err("Unable to create route proc files\n");
2573 #ifdef CONFIG_XFRM
2574 xfrm_init();
2575 xfrm4_init(ip_rt_max_size);
2576 #endif
2577 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2578
2579 #ifdef CONFIG_SYSCTL
2580 register_pernet_subsys(&sysctl_route_ops);
2581 #endif
2582 register_pernet_subsys(&rt_genid_ops);
2583 register_pernet_subsys(&ipv4_inetpeer_ops);
2584 return rc;
2585 }
2586
2587 #ifdef CONFIG_SYSCTL
2588 /*
2589 * We really need to sanitize the damn ipv4 init order, then all
2590 * this nonsense will go away.
2591 */
2592 void __init ip_static_sysctl_init(void)
2593 {
2594 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2595 }
2596 #endif