/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window clamping.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#include <linux/sysctl.h>
#define RT_FL_TOS(oldflp) \
	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT	(300*HZ)
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly	= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;
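/*
 * Note: the tunables above are in jiffies (hence the HZ arithmetic) and, as
 * a sketch of the usual arrangement (assuming the net.ipv4.route.* sysctl
 * table registered elsewhere in this file), correspond to gc_timeout,
 * gc_interval, redirect_number, redirect_load, redirect_silence, error_cost,
 * error_burst, gc_elasticity, mtu_expires, min_pmtu and min_adv_mss.
 */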
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_default_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);

	if (!p)
		return NULL;

	if (old) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);
		if (prev != old) {
			kfree(p);
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			struct rtable *rt = (struct rtable *) dst;

			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.default_mtu =		ipv4_default_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
};
#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
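/*
 * Lookup sketch (assuming the rt_tos2priority() helper in <net/route.h>,
 * which indexes this table as ip_tos2prio[IPTOS_TOS(tos) >> 1]): a TOS of
 * 0x10 (low delay) gives index 8, i.e. TC_PRIO_INTERACTIVE, while the low
 * (ECN) bit selects the ECN_OR_COST() variant of the same class.
 */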
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_ATOMIC);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
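/*
 * Folding the per-netns generation id into the hash key means a cache flush
 * never has to walk the table synchronously: rt_cache_invalidate() just bumps
 * rt_genid, so old entries stop matching lookups (see rt_is_expired()) and
 * are reaped lazily by the garbage collector and rt_check_expire().
 */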
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}
static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}
static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;

	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->dst.dev ? r->dst.dev->name : "*",
			   (__force u32)r->rt_dst,
			   (__force u32)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->dst.__refcnt),
			   r->dst.__use, 0, (__force u32)r->rt_src,
			   dst_metric_advmss(&r->dst) + 40,
			   dst_metric(&r->dst, RTAX_WINDOW),
			   (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
				 dst_metric(&r->dst, RTAX_RTTVAR)),
			   r->fl.fl4_tos,
			   r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
			   r->dst.hh ? (r->dst.hh->hh_output ==
					dev_queue_xmit) : 0,
			   r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			    sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,
		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
				   &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->dst.expires;
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	if (rth->dst.expires &&
	    time_after_eq(jiffies, rth->dst.expires))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}
/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
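/*
 * In other words, a route scores higher the more recently it was used (the
 * negated age in the low 30 bits), with bit 30 marking output/unicast routes
 * and bit 31 marking "valuable" ones; rt_intern_hash() uses this to pick the
 * lowest-scoring entry as the eviction candidate on an overlong chain.
 */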
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}
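/*
 * Caching turns itself off for a namespace once the emergency hash rebuild
 * counter passes the rt_cache_rebuild_count sysctl; see the "rebuilds is
 * over limit" warning in rt_intern_hash() below.
 */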
static inline bool compare_hash_inputs(const struct flowi *fl1,
				       const struct flowi *fl2)
{
	return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
		((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
		(fl1->iif ^ fl2->iif)) == 0);
}

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
		((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}
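/*
 * Both comparisons above are branchless on purpose: the XOR of each field
 * pair is zero iff the fields are equal, and OR-ing the results gives a
 * single "all fields equal" test without short-circuit branches in the
 * cache lookup fast path.
 */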
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;
	struct rtable *list;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_dereference_raw(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimation of rt_chain_length_max:
 *   rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */
#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
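/*
 * Worked example of the fixed-point encoding: with FRACT_BITS = 3, ONE is 8,
 * so a measured average chain length of 2.5 entries is carried as 20
 * (2.5 * 8); the final ">> FRACT_BITS" in rt_check_expire() converts the
 * AVG + 4*SD bound back to whole entries.
 */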
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif).
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(&aux->fl, &rth->fl))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving a recent rt_genid.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previously cache-invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the newly generated ones.

   Current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expire stays large enough to keep enough warm entries,
   and when load increases it reduces to limit cache size.
 */
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
				    !rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - expire is reduced to zero. Otherwise, expire is halved.
		   - table is not full.
		   - we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		     We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				dst_entries_get_fast(&ipv4_dst_ops), goal, i);
#endif

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
#endif
out:	return 0;
}
/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}
static int rt_intern_hash(unsigned hash, struct rtable *rt,
			  struct rtable **rp, struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it.
		 *
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = arp_bind_neighbour(&rt->dst);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					       "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return err;
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (rp)
				*rp = rth;
			else
				skb_dst_set(skb, &rth->dst);
			return 0;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
				       ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = arp_bind_neighbour(&rt->dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2
	if (rt->dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
		       hash, &rt->rt_dst);
		for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
			printk(" . %pI4", &trt->rt_dst);
		printk("\n");
	}
#endif
	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (rp)
		*rp = rt;
	else
		skb_dst_set(skb, &rt->dst);
	return 0;
}
void rt_bind_peer(struct rtable *rt, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(rt->rt_dst, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
}
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chances to
 * select an ID being unique in a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If peer is attached to destination, it is never detached,
		   so that we need not to grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct rtable *rth;
	struct rtable __rcu **rthp;
	__be32 skeys[2] = { saddr, 0 };
	int ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!rt_caching(net))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rt_is_input_route(rth) ||
				    rt_is_expired(rth) ||
				    !net_eq(dev_net(rth->dst.dev), net)) {
					rthp = &rth->dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->rt_gateway != old_gw ||
				    rth->dst.dev != dev)
					break;

				dst_hold(&rth->dst);

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				rt->dst.__use		= 1;
				atomic_set(&rt->dst.__refcnt, 1);
				rt->dst.child		= NULL;
				if (rt->dst.dev)
					dev_hold(rt->dst.dev);
				rt->dst.obsolete	= -1;
				rt->dst.lastuse		= jiffies;
				rt->dst.path		= &rt->dst;
				rt->dst.neighbour	= NULL;
#ifdef CONFIG_XFRM
				rt->dst.xfrm		= NULL;
#endif
				rt->rt_genid		= rt_genid(net);
				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);
				if (rt->fi)
					atomic_inc(&rt->fi->fib_clntref);

				if (arp_bind_neighbour(&rt->dst) ||
				    !(rt->dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->dst.neighbour)
						neigh_event_send(rt->dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->dst;
				netevent.new = &rt->dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
					ip_rt_put(rt);
				goto do_next;
			}
		do_next:
			;
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   (rt->dst.expires &&
			    time_after_eq(jiffies, rt->dst.expires))) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif,
						rt_genid(dev_net(dst->dev)));
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
				&rt->rt_dst, rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
		rt->dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything,
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
		rt->dst.rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->dst.rate_last +
			(ip_rt_redirect_load << rt->dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->dst.rate_last = jiffies;
		++rt->dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    rt->dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
				&rt->rt_src, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
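/*
 * Backoff sketch with the defaults above: the k-th redirect to a stubborn
 * host may be sent once jiffies passes rate_last + (ip_rt_redirect_load << k)
 * (HZ/50, doubling each time); after ip_rt_redirect_number (9) unanswered
 * redirects we go quiet until ip_rt_redirect_silence (~20s) has elapsed.
 */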
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	unsigned long now;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	now = jiffies;
	rt->dst.rate_tokens += now - rt->dst.rate_last;
	if (rt->dst.rate_tokens > ip_rt_error_burst)
		rt->dst.rate_tokens = ip_rt_error_burst;
	rt->dst.rate_last = now;
	if (rt->dst.rate_tokens >= ip_rt_error_cost) {
		rt->dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
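/*
 * Classic RFC 1191 plateau search: given a bogus or missing next-hop MTU in
 * an ICMP "fragmentation needed", fall back to the next plateau strictly
 * below the failed packet size, e.g. old_mtu = 1500 guesses 1492 (Ethernet
 * over PPPoE), old_mtu = 1006 guesses 576.
 */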
unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	int i, k;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	int ikeys[2] = { dev->ifindex, 0 };
	__be32 skeys[2] = { iph->saddr, 0, };
	__be32 daddr = iph->daddr;
	unsigned short est_mtu = 0;

	for (k = 0; k < 2; k++) {
		for (i = 0; i < 2; i++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rcu_read_lock();
			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
			     rth = rcu_dereference(rth->dst.rt_next)) {
				unsigned short mtu = new_mtu;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->rt_dst != daddr ||
				    rth->rt_src != iph->saddr ||
				    rth->fl.oif != ikeys[k] ||
				    rt_is_input_route(rth) ||
				    dst_metric_locked(&rth->dst, RTAX_MTU) ||
				    !net_eq(dev_net(rth->dst.dev), net) ||
				    rt_is_expired(rth))
					continue;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= dst_mtu(&rth->dst) &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= dst_mtu(&rth->dst)) {
					if (mtu < dst_mtu(&rth->dst)) {
						dst_confirm(&rth->dst);
						if (mtu < ip_rt_min_pmtu) {
							u32 lock = dst_metric(&rth->dst,
									      RTAX_LOCK);
							mtu = ip_rt_min_pmtu;
							lock |= (1 << RTAX_MTU);
							dst_metric_set(&rth->dst, RTAX_LOCK,
								       lock);
						}
						dst_metric_set(&rth->dst, RTAX_MTU, mtu);
						dst_set_expires(&rth->dst,
								ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
			rcu_read_unlock();
		}
	}
	return est_mtu ? : new_mtu;
}
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst_mtu(dst) > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			u32 lock = dst_metric(dst, RTAX_LOCK);
			mtu = ip_rt_min_pmtu;
			dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
		}
		dst_metric_set(dst, RTAX_MTU, mtu);
		dst_set_expires(dst, ip_rt_mtu_expires);
		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
	}
}
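/*
 * A learned PMTU below ip_rt_min_pmtu (512 + 20 + 20 = 552 bytes by default)
 * is clamped and the MTU metric locked, so a forged ICMP cannot shrink the
 * path MTU arbitrarily; the learned value ages out after ip_rt_mtu_expires
 * (10 minutes) and the route falls back to the device MTU.
 */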
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	if (rt_is_expired((struct rtable *)dst))
		return NULL;
	return dst;
}
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	dst_destroy_metrics_generic(dst);
	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}
static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt_is_output_route(rt))
		src = rt->rt_src;
	else {
		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
			src = FIB_RES_PREFSRC(res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}
static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		const struct rtable *rt = (const struct rtable *) dst;

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
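/*
 * The 576-byte fallback above is the classic minimum IPv4 datagram size
 * hosts must accept (RFC 1122): when the MTU metric is locked and the
 * destination is behind a gateway, we conservatively assume no more than
 * 576 bytes end-to-end.
 */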
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt->fi = fi;
		atomic_inc(&fi->fib_clntref);
		dst_init_metrics(dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag, 0);
		if (err < 0)
			goto e_err;
	}
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->dst.output = ip_rt_bug;
	rth->dst.obsolete = -1;

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags	= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->dst.dev	= init_net.loopback_dev;
	dev_hold(rth->dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	if (our) {
		rth->dst.input	= ip_local_deliver;
		rth->rt_flags	|= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag, skb->mark);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags	= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif	=
	rth->fl.iif	= in_dev->dev->ifindex;
	rth->dst.dev	= (out_dev)->dev;
	dev_hold(rth->dst.dev);
	rth->fl.oif	= 0;
	rth->rt_spec_dst = spec_dst;

	rth->dst.obsolete = -1;
	rth->dst.input	= ip_forward;
	rth->dst.output	= ip_output;
	rth->rt_genid	= rt_genid(dev_net(rth->dst.dev));

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi *fl,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif,
		       rt_genid(dev_net(rth->dst.dev)));
	return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
}
/*
 *	NOTE. We drop all the packets that have a local source
 *	address, because every properly looped back packet
 *	must have the correct destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% guarantee.
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi fl = { .fl4_dst	= daddr,
			    .fl4_src	= saddr,
			    .fl4_tos	= tos,
			    .fl4_scope	= RT_SCOPE_UNIVERSE,
			    .mark	= skb->mark,
			    .iif	= dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net	*net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	err = fib_lookup(net, &fl, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag, skb->mark);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag, skb->mark);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->dst.output	= ip_rt_bug;
	rth->dst.obsolete = -1;
	rth->rt_genid = rt_genid(net);

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags	= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->dst.dev	= net->loopback_dev;
	dev_hold(rth->dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst = spec_dst;
	rth->dst.input	= ip_local_deliver;
	rth->rt_flags	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input	= ip_error;
		rth->dst.error	= -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
	err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable *rth;
	unsigned hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
		     (rth->fl.iif ^ iif) |
		     rth->fl.oif |
		     (rth->fl.fl4_tos ^ tos)) == 0 &&
		    rth->fl.mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result a host on a multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
/* called with rcu_read_lock() */
static int __mkroute_output(struct rtable **result,
			    struct fib_result *res,
			    const struct flowi *fl,
			    const struct flowi *oldflp,
			    struct net_device *dev_out,
			    unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);

	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
		return -EINVAL;

	if (ipv4_is_lbcast(fl->fl4_dst))
		res->type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		res->fi = NULL;
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If the multicast route does not exist use
		 * the default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (res->fi && res->prefixlen < 4)
			res->fi = NULL;
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		return -ENOBUFS;

	atomic_set(&rth->dst.__refcnt, 1);
	rth->dst.flags	= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
		rth->dst.flags |= DST_NOXFRM;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->dst.flags |= DST_NOPOLICY;

	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
	rth->fl.mark	= oldflp->mark;
	rth->rt_dst	= fl->fl4_dst;
	rth->rt_src	= fl->fl4_src;
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the routing
	   cache entry */
	rth->dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst = fl->fl4_src;

	rth->dst.output = ip_output;
	rth->dst.obsolete = -1;
	rth->rt_genid = rt_genid(dev_net(dev_out));

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;
	*result = rth;
	return 0;
}
/* called with rcu_read_lock() */
static int ip_mkroute_output(struct rtable **rp,
			     struct fib_result *res,
			     const struct flowi *fl,
			     const struct flowi *oldflp,
			     struct net_device *dev_out,
			     unsigned int flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned int hash;

	if (err == 0) {
		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
			       rt_genid(dev_net(dev_out)));
		err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
	}

	return err;
}
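
/*
 * Note: besides inserting the new entry into the bucket chosen above,
 * rt_intern_hash() may instead hand back an equivalent cached route
 * through *rp if one raced in first, freeing our candidate.
 */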
/*
 * Major route resolver routine.
 * called with rcu_read_lock();
 */

static int ip_route_output_slow(struct net *net, struct rtable **rp,
				const struct flowi *oldflp)
{
	u32 tos	= RT_FL_TOS(oldflp);
	struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
			    .fl4_src = oldflp->fl4_src,
			    .fl4_tos = tos & IPTOS_RT_MASK,
			    .fl4_scope = ((tos & RTO_ONLINK) ?
					  RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
			    .mark = oldflp->mark,
			    .iif = net->loopback_dev->ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned int flags = 0;
	struct net_device *dev_out = NULL;
	int err;

	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (ipv4_is_multicast(oldflp->fl4_src) ||
		    ipv4_is_lbcast(oldflp->fl4_src) ||
		    ipv4_is_zeronet(oldflp->fl4_src))
			goto out;
		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface, if
		      saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with the saddr
		      of another iface. --ANK
		 */

		if (oldflp->oif == 0 &&
		    (ipv4_is_multicast(oldflp->fl4_dst) ||
		     ipv4_is_lbcast(oldflp->fl4_dst))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source addr (the
			   routing cache cannot know that ttl is zero, so
			   the packet will not leave this host and the route
			   is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}
		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, oldflp->fl4_src, false))
				goto out;
		}
	}


	if (oldflp->oif) {
		dev_out = dev_get_by_index_rcu(net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			err = -ENETUNREACH;
			goto out;
		}
		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
		    ipv4_is_lbcast(oldflp->fl4_dst)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (ipv4_is_multicast(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl.oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}
	if (fib_lookup(net, &fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, the routing tables are wrong. Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, the routing
			   tables are looked up with only one purpose:
			   to catch if the destination is gatewayed, rather
			   than direct. Moreover, if MSG_DONTROUTE is set,
			   we send the packet, ignoring both the routing
			   tables and the ifaddr state. --ANK


			   We could do this even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		err = -ENETUNREACH;
		goto out;
	}
	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src) {
			if (res.fi->fib_prefsrc)
				fl.fl4_src = res.fi->fib_prefsrc;
			else
				fl.fl4_src = fl.fl4_dst;
		}
		dev_out = net->loopback_dev;
		fl.oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(net, &fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	dev_out = FIB_RES_DEV(res);
	fl.oif = dev_out->ifindex;


make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);

out:	return err;
}
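
/*
 * Resolution order above: validate any caller-supplied source address,
 * honour an explicit output interface, synthesize a loopback route when
 * no destination is given, and only then consult the FIB; multipath and
 * default-route selection run after a successful lookup.
 */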
int __ip_route_output_key(struct net *net, struct rtable **rp,
			  const struct flowi *flp)
{
	unsigned int hash;
	int res;
	struct rtable *rth;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rt_is_output_route(rth) &&
		    rth->fl.oif == flp->oif &&
		    rth->fl.mark == flp->mark &&
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
		      (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	rcu_read_lock();
	res = ip_route_output_slow(net, rp, flp);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family		= AF_INET,
	.protocol	= cpu_to_be16(ETH_P_IP),
	.destroy	= ipv4_dst_destroy,
	.check		= ipv4_blackhole_dst_check,
	.update_pmtu	= ipv4_rt_blackhole_update_pmtu,
};

static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(&(*rp)->dst);
	*rp = rt;
	return rt ? 0 : -ENOMEM;
}
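
/*
 * A blackhole dst stands in for the real route when __xfrm_lookup()
 * returns -EREMOTE (e.g. a non-blocking lookup hitting unresolved IPsec
 * state): it copies the original route's addressing but discards every
 * packet, so the caller keeps a valid dst while negotiation completes.
 */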
int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
			 struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
				    flags ? XFRM_LOOKUP_WAIT : 0);
		if (err == -EREMOTE)
			err = ipv4_dst_blackhole(net, rp, flp);

		return err;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);

int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}
EXPORT_SYMBOL(ip_route_output_key);
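
/*
 * Minimal usage sketch (daddr/saddr stand for the caller's addresses
 * and are not defined here); the flowi fields below are the ones the
 * cache lookup above actually keys on:
 *
 *	struct rtable *rt;
 *	struct flowi fl = { .fl4_dst = daddr,
 *			    .fl4_src = saddr,
 *			    .oif = sk->sk_bound_dev_if,
 *			    .mark = sk->sk_mark };
 *	if (ip_route_output_key(sock_net(sk), &rt, &fl))
 *		return -EHOSTUNREACH;
 */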
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	= AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->fl.mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);

	error = rt->dst.error;
	expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
	if (rt->peer) {
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
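
/*
 * The NLA_PUT*() macros used above jump to nla_put_failure when the skb
 * runs out of tailroom; nlmsg_cancel() then trims the partially built
 * message so the caller can retry with a larger buffer.
 */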
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi fl = {
			.fl4_dst = dst,
			.fl4_src = src,
			.fl4_tos = rtm->rtm_tos,
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.mark = mark,
		};
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
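
/*
 * This handler serves RTM_GETROUTE requests (e.g. "ip route get"):
 * with RTA_IIF present it simulates input routing on a dummy skb,
 * otherwise it performs an ordinary output lookup, then echoes the
 * resolved route back to the requester through rt_fill_info().
 */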
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
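
/*
 * cb->args[0] and cb->args[1] persist the (hash bucket, chain index)
 * position between dump callbacks, so a multi-part dump resumes where
 * the previous skb filled up.
 */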
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
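
/*
 * The value written is forwarded as the delay argument of
 * rt_cache_flush(); e.g. from userspace:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * flushes the routing cache immediately.
 */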
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
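
/*
 * These knobs appear under /proc/sys/net/ipv4/route/ (e.g.
 * .../route/gc_thresh); the *_jiffies handlers translate between the
 * second-granularity values seen by userspace and kernel jiffies.
 */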
static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};
static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
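
/*
 * The route cache hash can be sized explicitly from the boot command
 * line, e.g. "rhash_entries=65536"; when unset, ip_rt_init() below lets
 * alloc_large_system_hash() pick a size based on available memory.
 */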
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct),
				    __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* All the timers started at system startup tend
	   to synchronize. Perturb this a bit.
	 */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);

	return rc;
}
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif