/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/atmclip.h>
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
	((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
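
/*
 * Illustrative example (assumes the rt_tos2priority() lookup helper):
 * the table is indexed by the four TOS bits, prio = ip_tos2prio[IPTOS_TOS(tos) >> 1].
 * A TOS of 0x10 (IPTOS_LOWDELAY) gives index 8 and thus TC_PRIO_INTERACTIVE,
 * while 0x08 (IPTOS_THROUGHPUT) gives index 4 and TC_PRIO_BULK.
 */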

/*
 *	Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
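
/*
 * For illustration, a reader-side lookup over one bucket follows this
 * pattern (a sketch of what rt_cache_get_first() below actually does):
 *
 *	rcu_read_lock_bh();
 *	for (r = rcu_dereference_bh(rt_hash_table[hash].chain); r;
 *	     r = rcu_dereference_bh(r->dst.rt_next)) {
 *		if (entry matches)
 *			break;
 *	}
 *	rcu_read_unlock_bh();
 *
 * No bucket spinlock is taken on this path; only writers take
 * rt_hash_lock_addr(hash).
 */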

struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len;

		n = dst_get_neighbour(&r->dst);
		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->dst.dev ? r->dst.dev->name : "*",
			   (__force u32)r->rt_dst,
			   (__force u32)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->dst.__refcnt),
			   r->dst.__use, 0, (__force u32)r->rt_src,
			   dst_metric_advmss(&r->dst) + 40,
			   dst_metric(&r->dst, RTAX_WINDOW),
			   (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
				 dst_metric(&r->dst, RTAX_RTTVAR)),
			   r->rt_key_tos,
			   -1,
			   (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0,
			   r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			    sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
				   &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in the hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
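
/*
 * Worked example (illustrative): since the low bits hold ~age, an entry
 * last used 100 jiffies ago scores lower than one used 10 jiffies ago,
 * and entries lacking the "valuable" (bit 31) or output/unicast (bit 30)
 * flags score lower still.  The insertion code in rt_intern_hash() keeps
 * the entry with the *minimum* score as its eviction candidate, so older,
 * less valuable entries are reaped first.
 */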

static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/*
 * Perform a full scan of the hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}

/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimation of rt_chain_length_max:
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
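
/*
 * Example of the fixed-point convention (illustrative): with FRACT_BITS
 * = 3, the constant ONE is 8, so a real value x is stored as x << 3.
 * An average chain length of 2.5 is represented as 20, and converting
 * back to an integer is simply "value >> FRACT_BITS".
 */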

/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}

/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without repeating a recent rt_genid.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}
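
/*
 * Arithmetic behind the 2^24 figure above (illustrative): rt_genid is a
 * 32-bit counter and each invalidation advances it by at least 1 and at
 * most 256 (2^8), so at least 2^32 / 2^8 = 2^24 invalidations must occur
 * before the counter can wrap back to a previously used value.
 */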

/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle, expire is large enough to keep enough warm entries,
   and when load increases it shrinks to limit the cache size.
 */
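
/*
 * Worked example of the goal computation below (illustrative numbers):
 * with a hash table of 2^rt_hash_log = 1024 buckets and the default
 * ip_rt_gc_elasticity of 8, GC starts trimming once the cache holds more
 * than 8 * 1024 = 8192 entries; "goal" is the excess over that product.
 */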

static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache
		 * really aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
				    !rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire has been reduced to zero; otherwise expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}

/*
 * Returns the number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}
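
/*
 * Illustration: has_noalias() contributes ONE (8) for each entry whose
 * hash inputs were not seen earlier in the chain, so a chain of ten
 * rtables that are all aliases of the same flow sums to 8 and, after
 * the ">> FRACT_BITS", counts as a slow chain length of 1.
 */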

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	struct neigh_table *tbl = &arp_tbl;
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	struct neighbour *n;

#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
	if (dev->type == ARPHRD_ATM)
		tbl = clip_tbl_hook;
#endif
	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;

	n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(tbl, pkey, dev);
}

static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}

static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting for a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					       "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * when it is exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
				       rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
				       ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind the route to arp only if it is an output
	   route or a unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink the route cache;
			   it most likely holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}

static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If the peer is attached to the destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!n || !(n->nud_state & NUD_VALID)) {
		if (n)
			neigh_event_send(n, NULL);
		rt->rt_gateway = orig_gw;
		return -EAGAIN;
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
	return 0;
}

/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw) {
						peer->redirect_learned.a4 = new_gw;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}

static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
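
/*
 * Backoff timeline (illustrative, with the defaults above): each
 * successive redirect is only sent once jiffies pass
 * rate_last + (ip_rt_redirect_load << rate_tokens), i.e. the required
 * gap doubles with every redirect sent, until ip_rt_redirect_number (9)
 * redirects have gone out and the host is assumed deaf.  After
 * ip_rt_redirect_silence ((HZ/50) << 10 jiffies, about 20 seconds) of
 * quiet, rate_tokens is reset and the cycle starts over.
 */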

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
			       &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
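
/*
 * Example (illustrative): guess_mtu(1500) returns 1492, the first
 * plateau strictly below 1500; guess_mtu(576) returns 296; and anything
 * at or below the smallest plateau (128) falls back to the IPv4 minimum
 * of 68.
 */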

unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
			atomic_inc(&__rt_peer_genid);
		}

		inet_putpeer(peer);
	}
	return est_mtu ? : new_mtu;
}

static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}


static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(dst, peer);

			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway) {
				if (check_peer_redir(dst, peer))
					return NULL;
			}
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
	return dst;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}


static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
	       &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
	       skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}

static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
	if (peer) {
		rt->rt_peer_genid = rt_peer_genid();
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);
		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}

static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
			 DST_HOST |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark	= skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
2000 #ifdef CONFIG_IP_ROUTE_VERBOSE
2001 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2002 /*
2003 * RFC1812 recommendation, if source is martian,
2004 * the only hint is MAC header.
2005 */
2006 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2007 &daddr, &saddr, dev->name);
2008 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2009 int i;
2010 const unsigned char *p = skb_mac_header(skb);
2011 printk(KERN_WARNING "ll header: ");
2012 for (i = 0; i < dev->hard_header_len; i++, p++) {
2013 printk("%02x", *p);
2014 if (i < (dev->hard_header_len - 1))
2015 printk(":");
2016 }
2017 printk("\n");
2018 }
2019 }
2020 #endif
2021 }
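/*
 * Example of the resulting log output (a sketch; addresses and device
 * are illustrative, and note that the printk above prints daddr first):
 *
 *   martian source 224.0.0.1 from 10.0.0.7, on dev eth0
 *   ll header: ff:ff:ff:ff:ff:ff:00:11:22:33:44:55:08:00
 *
 * On Ethernet (hard_header_len == 14) the dump is the destination MAC,
 * the source MAC and the 2-byte ethertype.
 */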
2022
2023 /* called in rcu_read_lock() section */
2024 static int __mkroute_input(struct sk_buff *skb,
2025 const struct fib_result *res,
2026 struct in_device *in_dev,
2027 __be32 daddr, __be32 saddr, u32 tos,
2028 struct rtable **result)
2029 {
2030 struct rtable *rth;
2031 int err;
2032 struct in_device *out_dev;
2033 unsigned int flags = 0;
2034 __be32 spec_dst;
2035 u32 itag;
2036
2037 /* get a working reference to the output device */
2038 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2039 if (out_dev == NULL) {
2040 if (net_ratelimit())
2041 			printk(KERN_CRIT "Bug in ip_route_input"
2042 			       "_slow(). Please, report\n");
2043 return -EINVAL;
2044 }
2045
2046
2047 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2048 in_dev->dev, &spec_dst, &itag);
2049 if (err < 0) {
2050 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2051 saddr);
2052
2053 goto cleanup;
2054 }
2055
2056 if (err)
2057 flags |= RTCF_DIRECTSRC;
2058
2059 if (out_dev == in_dev && err &&
2060 (IN_DEV_SHARED_MEDIA(out_dev) ||
2061 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2062 flags |= RTCF_DOREDIRECT;
2063
2064 if (skb->protocol != htons(ETH_P_IP)) {
2065 		/* Not IP (i.e. ARP). Do not create a route if it is
2066 		 * invalid for proxy arp. DNAT routes are always valid.
2067 		 *
2068 		 * The proxy arp feature has been extended to allow ARP
2069 		 * replies back out the same interface, to support
2070 		 * Private VLAN switch technologies. See arp.c.
2071 		 */
2072 if (out_dev == in_dev &&
2073 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2074 err = -EINVAL;
2075 goto cleanup;
2076 }
2077 }
2078
2079 rth = rt_dst_alloc(out_dev->dev,
2080 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2081 IN_DEV_CONF_GET(out_dev, NOXFRM));
2082 if (!rth) {
2083 err = -ENOBUFS;
2084 goto cleanup;
2085 }
2086
2087 rth->rt_key_dst = daddr;
2088 rth->rt_key_src = saddr;
2089 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2090 rth->rt_flags = flags;
2091 rth->rt_type = res->type;
2092 rth->rt_key_tos = tos;
2093 rth->rt_dst = daddr;
2094 rth->rt_src = saddr;
2095 rth->rt_route_iif = in_dev->dev->ifindex;
2096 rth->rt_iif = in_dev->dev->ifindex;
2097 rth->rt_oif = 0;
2098 rth->rt_mark = skb->mark;
2099 rth->rt_gateway = daddr;
2100 	rth->rt_spec_dst = spec_dst;
2101 rth->rt_peer_genid = 0;
2102 rth->peer = NULL;
2103 rth->fi = NULL;
2104
2105 rth->dst.input = ip_forward;
2106 rth->dst.output = ip_output;
2107
2108 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2109
2110 *result = rth;
2111 err = 0;
2112 cleanup:
2113 return err;
2114 }
2115
2116 static int ip_mkroute_input(struct sk_buff *skb,
2117 struct fib_result *res,
2118 const struct flowi4 *fl4,
2119 struct in_device *in_dev,
2120 __be32 daddr, __be32 saddr, u32 tos)
2121 {
2122 	struct rtable *rth = NULL;
2123 int err;
2124 unsigned hash;
2125
2126 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2127 if (res->fi && res->fi->fib_nhs > 1)
2128 fib_select_multipath(res);
2129 #endif
2130
2131 /* create a routing cache entry */
2132 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2133 if (err)
2134 return err;
2135
2136 /* put it into the cache */
2137 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2138 rt_genid(dev_net(rth->dst.dev)));
2139 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2140 if (IS_ERR(rth))
2141 return PTR_ERR(rth);
2142 return 0;
2143 }
2144
2145 /*
2146  *	NOTE. We drop all packets that have local source
2147  *	addresses, because every properly looped-back packet
2148  *	must already have the correct destination attached by the output routine.
2149  *
2150  *	This approach solves two big problems:
2151  *	1. Non-simplex devices are handled properly.
2152  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2153  *	Called with rcu_read_lock().
2154  */
2155
2156 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2157 u8 tos, struct net_device *dev)
2158 {
2159 struct fib_result res;
2160 struct in_device *in_dev = __in_dev_get_rcu(dev);
2161 struct flowi4 fl4;
2162 unsigned flags = 0;
2163 u32 itag = 0;
2164 	struct rtable *rth;
2165 unsigned hash;
2166 __be32 spec_dst;
2167 int err = -EINVAL;
2168 	struct net *net = dev_net(dev);
2169
2170 /* IP on this device is disabled. */
2171
2172 if (!in_dev)
2173 goto out;
2174
2175 	/* Check for the weirdest martians, which cannot be detected
2176 	   by fib_lookup.
2177 	 */
2178
2179 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2180 ipv4_is_loopback(saddr))
2181 goto martian_source;
2182
2183 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2184 goto brd_input;
2185
2186 	/* Accept zero addresses only for limited broadcast;
2187 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
2188 	 */
2189 if (ipv4_is_zeronet(saddr))
2190 goto martian_source;
2191
2192 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2193 goto martian_destination;
2194
2195 /*
2196 * Now we are ready to route packet.
2197 */
2198 fl4.flowi4_oif = 0;
2199 fl4.flowi4_iif = dev->ifindex;
2200 fl4.flowi4_mark = skb->mark;
2201 fl4.flowi4_tos = tos;
2202 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2203 fl4.daddr = daddr;
2204 fl4.saddr = saddr;
2205 err = fib_lookup(net, &fl4, &res);
2206 if (err != 0) {
2207 if (!IN_DEV_FORWARD(in_dev))
2208 goto e_hostunreach;
2209 goto no_route;
2210 }
2211
2212 RT_CACHE_STAT_INC(in_slow_tot);
2213
2214 if (res.type == RTN_BROADCAST)
2215 goto brd_input;
2216
2217 if (res.type == RTN_LOCAL) {
2218 err = fib_validate_source(skb, saddr, daddr, tos,
2219 net->loopback_dev->ifindex,
2220 dev, &spec_dst, &itag);
2221 if (err < 0)
2222 goto martian_source_keep_err;
2223 if (err)
2224 flags |= RTCF_DIRECTSRC;
2225 spec_dst = daddr;
2226 goto local_input;
2227 }
2228
2229 if (!IN_DEV_FORWARD(in_dev))
2230 goto e_hostunreach;
2231 if (res.type != RTN_UNICAST)
2232 goto martian_destination;
2233
2234 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2235 out: return err;
2236
2237 brd_input:
2238 if (skb->protocol != htons(ETH_P_IP))
2239 goto e_inval;
2240
2241 if (ipv4_is_zeronet(saddr))
2242 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2243 else {
2244 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2245 &itag);
2246 if (err < 0)
2247 goto martian_source_keep_err;
2248 if (err)
2249 flags |= RTCF_DIRECTSRC;
2250 }
2251 flags |= RTCF_BROADCAST;
2252 res.type = RTN_BROADCAST;
2253 RT_CACHE_STAT_INC(in_brd);
2254
2255 local_input:
2256 rth = rt_dst_alloc(net->loopback_dev,
2257 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2258 if (!rth)
2259 goto e_nobufs;
2260
2261 	rth->dst.input = ip_local_deliver;
2262 	rth->dst.output = ip_rt_bug;
2263 #ifdef CONFIG_IP_ROUTE_CLASSID
2264 rth->dst.tclassid = itag;
2265 #endif
2266
2267 rth->rt_key_dst = daddr;
2268 rth->rt_key_src = saddr;
2269 rth->rt_genid = rt_genid(net);
2270 rth->rt_flags = flags|RTCF_LOCAL;
2271 rth->rt_type = res.type;
2272 rth->rt_key_tos = tos;
2273 rth->rt_dst = daddr;
2274 rth->rt_src = saddr;
2278 rth->rt_route_iif = dev->ifindex;
2279 rth->rt_iif = dev->ifindex;
2280 rth->rt_oif = 0;
2281 rth->rt_mark = skb->mark;
2282 rth->rt_gateway = daddr;
2283 	rth->rt_spec_dst = spec_dst;
2284 rth->rt_peer_genid = 0;
2285 rth->peer = NULL;
2286 rth->fi = NULL;
2287 if (res.type == RTN_UNREACHABLE) {
2288 		rth->dst.input = ip_error;
2289 		rth->dst.error = -err;
2290 rth->rt_flags &= ~RTCF_LOCAL;
2291 }
2292 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2293 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2294 err = 0;
2295 if (IS_ERR(rth))
2296 err = PTR_ERR(rth);
2297 goto out;
2298
2299 no_route:
2300 RT_CACHE_STAT_INC(in_no_route);
2301 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2302 res.type = RTN_UNREACHABLE;
2303 if (err == -ESRCH)
2304 err = -ENETUNREACH;
2305 goto local_input;
2306
2307 /*
2308 * Do not cache martian addresses: they should be logged (RFC1812)
2309 */
2310 martian_destination:
2311 RT_CACHE_STAT_INC(in_martian_dst);
2312 #ifdef CONFIG_IP_ROUTE_VERBOSE
2313 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2314 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2315 &daddr, &saddr, dev->name);
2316 #endif
2317
2318 e_hostunreach:
2319 err = -EHOSTUNREACH;
2320 goto out;
2321
2322 e_inval:
2323 err = -EINVAL;
2324 goto out;
2325
2326 e_nobufs:
2327 err = -ENOBUFS;
2328 goto out;
2329
2330 martian_source:
2331 err = -EINVAL;
2332 martian_source_keep_err:
2333 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2334 goto out;
2335 }
2336
2337 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2338 u8 tos, struct net_device *dev, bool noref)
2339 {
2340 	struct rtable *rth;
2341 unsigned hash;
2342 int iif = dev->ifindex;
2343 struct net *net;
2344 int res;
2345
2346 net = dev_net(dev);
2347
2348 rcu_read_lock();
2349
2350 if (!rt_caching(net))
2351 goto skip_cache;
2352
2353 tos &= IPTOS_RT_MASK;
2354 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2355
2356 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2357 rth = rcu_dereference(rth->dst.rt_next)) {
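		/* Fold all key comparisons into one branch: the OR of the
		 * XOR deltas is zero iff dst, src, iif and tos all match. */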
2358 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2359 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2360 (rth->rt_route_iif ^ iif) |
2361 (rth->rt_key_tos ^ tos)) == 0 &&
2362 rth->rt_mark == skb->mark &&
2363 net_eq(dev_net(rth->dst.dev), net) &&
2364 !rt_is_expired(rth)) {
2365 if (noref) {
2366 dst_use_noref(&rth->dst, jiffies);
2367 skb_dst_set_noref(skb, &rth->dst);
2368 } else {
2369 dst_use(&rth->dst, jiffies);
2370 skb_dst_set(skb, &rth->dst);
2371 }
2372 RT_CACHE_STAT_INC(in_hit);
2373 rcu_read_unlock();
2374 return 0;
2375 }
2376 RT_CACHE_STAT_INC(in_hlist_search);
2377 }
2378
2379 skip_cache:
2380 	/* Multicast recognition logic was moved from the route cache to here.
2381 	   The problem was that too many Ethernet cards have broken/missing
2382 	   hardware multicast filters :-( As a result, a host on a multicast
2383 	   network acquires a lot of useless route cache entries, e.g. for
2384 	   SDR messages from all over the world. Now we try to get rid of them.
2385 	   Really, provided the software IP multicast filter is organized
2386 	   reasonably (at least, hashed), this does not cause a slowdown
2387 	   compared with route cache reject entries.
2388 	   Note that multicast routers are not affected, because a
2389 	   route cache entry is created eventually.
2390 	 */
2391 if (ipv4_is_multicast(daddr)) {
2392 struct in_device *in_dev = __in_dev_get_rcu(dev);
2393
2394 if (in_dev) {
2395 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2396 ip_hdr(skb)->protocol);
2397 if (our
2398 #ifdef CONFIG_IP_MROUTE
2399 ||
2400 (!ipv4_is_local_multicast(daddr) &&
2401 IN_DEV_MFORWARD(in_dev))
2402 #endif
2403 ) {
2404 int res = ip_route_input_mc(skb, daddr, saddr,
2405 tos, dev, our);
2406 rcu_read_unlock();
2407 return res;
2408 }
2409 }
2410 rcu_read_unlock();
2411 return -EINVAL;
2412 }
2413 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2414 rcu_read_unlock();
2415 return res;
2416 }
2417 EXPORT_SYMBOL(ip_route_input_common);
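/*
 * Illustrative sketch (not part of the upstream file): how an ingress
 * path typically consumes the lookup above, via the ip_route_input()
 * wrapper from include/net/route.h (which passes noref == false, as the
 * RTM_GETROUTE handler below also does).  The function name is
 * hypothetical.
 */
static int __maybe_unused example_route_ingress(struct sk_buff *skb,
						struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);

	/* On success, skb_dst(skb) holds the cached or newly built route. */
	return ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev);
}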
2418
2419 /* called with rcu_read_lock() */
2420 static struct rtable *__mkroute_output(const struct fib_result *res,
2421 const struct flowi4 *fl4,
2422 __be32 orig_daddr, __be32 orig_saddr,
2423 int orig_oif, struct net_device *dev_out,
2424 unsigned int flags)
2425 {
2426 struct fib_info *fi = res->fi;
2427 u32 tos = RT_FL_TOS(fl4);
2428 struct in_device *in_dev;
2429 u16 type = res->type;
2430 struct rtable *rth;
2431
2432 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2433 return ERR_PTR(-EINVAL);
2434
2435 if (ipv4_is_lbcast(fl4->daddr))
2436 type = RTN_BROADCAST;
2437 else if (ipv4_is_multicast(fl4->daddr))
2438 type = RTN_MULTICAST;
2439 else if (ipv4_is_zeronet(fl4->daddr))
2440 return ERR_PTR(-EINVAL);
2441
2442 if (dev_out->flags & IFF_LOOPBACK)
2443 flags |= RTCF_LOCAL;
2444
2445 in_dev = __in_dev_get_rcu(dev_out);
2446 if (!in_dev)
2447 return ERR_PTR(-EINVAL);
2448
2449 if (type == RTN_BROADCAST) {
2450 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2451 fi = NULL;
2452 } else if (type == RTN_MULTICAST) {
2453 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2454 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2455 fl4->flowi4_proto))
2456 flags &= ~RTCF_LOCAL;
2457 		/* If a multicast route does not exist, use
2458 		 * the default one, but do not gateway in this case.
2459 		 * Yes, it is a hack.
2460 		 */
2461 if (fi && res->prefixlen < 4)
2462 fi = NULL;
2463 }
2464
2465 rth = rt_dst_alloc(dev_out,
2466 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2467 IN_DEV_CONF_GET(in_dev, NOXFRM));
2468 if (!rth)
2469 return ERR_PTR(-ENOBUFS);
2470
2471 rth->dst.output = ip_output;
2472
2473 rth->rt_key_dst = orig_daddr;
2474 rth->rt_key_src = orig_saddr;
2475 rth->rt_genid = rt_genid(dev_net(dev_out));
2476 rth->rt_flags = flags;
2477 rth->rt_type = type;
2478 rth->rt_key_tos = tos;
2479 rth->rt_dst = fl4->daddr;
2480 rth->rt_src = fl4->saddr;
2481 rth->rt_route_iif = 0;
2482 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2483 rth->rt_oif = orig_oif;
2484 rth->rt_mark = fl4->flowi4_mark;
2485 rth->rt_gateway = fl4->daddr;
2486 	rth->rt_spec_dst = fl4->saddr;
2487 rth->rt_peer_genid = 0;
2488 rth->peer = NULL;
2489 rth->fi = NULL;
2490
2491 RT_CACHE_STAT_INC(out_slow_tot);
2492
2493 if (flags & RTCF_LOCAL) {
2494 rth->dst.input = ip_local_deliver;
2495 rth->rt_spec_dst = fl4->daddr;
2496 }
2497 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2498 rth->rt_spec_dst = fl4->saddr;
2499 if (flags & RTCF_LOCAL &&
2500 !(dev_out->flags & IFF_LOOPBACK)) {
2501 rth->dst.output = ip_mc_output;
2502 RT_CACHE_STAT_INC(out_slow_mc);
2503 }
2504 #ifdef CONFIG_IP_MROUTE
2505 if (type == RTN_MULTICAST) {
2506 if (IN_DEV_MFORWARD(in_dev) &&
2507 !ipv4_is_local_multicast(fl4->daddr)) {
2508 rth->dst.input = ip_mr_input;
2509 rth->dst.output = ip_mc_output;
2510 }
2511 }
2512 #endif
2513 }
2514
2515 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2516
2517 return rth;
2518 }
2519
2520 /*
2521 * Major route resolver routine.
2522 * called with rcu_read_lock();
2523 */
2524
2525 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2526 {
2527 struct net_device *dev_out = NULL;
2528 u32 tos = RT_FL_TOS(fl4);
2529 unsigned int flags = 0;
2530 struct fib_result res;
2531 struct rtable *rth;
2532 __be32 orig_daddr;
2533 __be32 orig_saddr;
2534 int orig_oif;
2535
2536 res.fi = NULL;
2537 #ifdef CONFIG_IP_MULTIPLE_TABLES
2538 res.r = NULL;
2539 #endif
2540
2541 orig_daddr = fl4->daddr;
2542 orig_saddr = fl4->saddr;
2543 orig_oif = fl4->flowi4_oif;
2544
2545 fl4->flowi4_iif = net->loopback_dev->ifindex;
2546 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2547 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2548 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2549
2550 rcu_read_lock();
2551 if (fl4->saddr) {
2552 rth = ERR_PTR(-EINVAL);
2553 if (ipv4_is_multicast(fl4->saddr) ||
2554 ipv4_is_lbcast(fl4->saddr) ||
2555 ipv4_is_zeronet(fl4->saddr))
2556 goto out;
2557
2558 		/* I removed the check for oif == dev_out->oif here.
2559 		   It was wrong for two reasons:
2560 		   1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2561 		      is assigned to multiple interfaces.
2562 		   2. Moreover, we are allowed to send packets with the saddr
2563 		      of another iface. --ANK
2564 		 */
2565
2566 if (fl4->flowi4_oif == 0 &&
2567 (ipv4_is_multicast(fl4->daddr) ||
2568 ipv4_is_lbcast(fl4->daddr))) {
2569 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2570 dev_out = __ip_dev_find(net, fl4->saddr, false);
2571 if (dev_out == NULL)
2572 goto out;
2573
2574 			/* Special hack: a user can direct multicasts
2575 			   and limited broadcast via the necessary interface
2576 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2577 			   This hack is not just for fun, it allows
2578 			   vic, vat and friends to work.
2579 			   They bind a socket to loopback, set ttl to zero
2580 			   and expect that it will work.
2581 			   From the viewpoint of the routing cache they are broken,
2582 			   because we are not allowed to build a multicast path
2583 			   with a loopback source addr (look, the routing cache
2584 			   cannot know that ttl is zero, so that the packet
2585 			   will not leave this host and the route is valid).
2586 			   Luckily, this hack is a good workaround.
2587 			 */
2588
2589 fl4->flowi4_oif = dev_out->ifindex;
2590 goto make_route;
2591 }
2592
2593 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2594 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2595 if (!__ip_dev_find(net, fl4->saddr, false))
2596 goto out;
2597 }
2598 }
2599
2600
2601 if (fl4->flowi4_oif) {
2602 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2603 rth = ERR_PTR(-ENODEV);
2604 if (dev_out == NULL)
2605 goto out;
2606
2607 /* RACE: Check return value of inet_select_addr instead. */
2608 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2609 rth = ERR_PTR(-ENETUNREACH);
2610 goto out;
2611 }
2612 if (ipv4_is_local_multicast(fl4->daddr) ||
2613 ipv4_is_lbcast(fl4->daddr)) {
2614 if (!fl4->saddr)
2615 fl4->saddr = inet_select_addr(dev_out, 0,
2616 RT_SCOPE_LINK);
2617 goto make_route;
2618 }
2619 if (fl4->saddr) {
2620 if (ipv4_is_multicast(fl4->daddr))
2621 fl4->saddr = inet_select_addr(dev_out, 0,
2622 fl4->flowi4_scope);
2623 else if (!fl4->daddr)
2624 fl4->saddr = inet_select_addr(dev_out, 0,
2625 RT_SCOPE_HOST);
2626 }
2627 }
2628
2629 if (!fl4->daddr) {
2630 fl4->daddr = fl4->saddr;
2631 if (!fl4->daddr)
2632 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2633 dev_out = net->loopback_dev;
2634 fl4->flowi4_oif = net->loopback_dev->ifindex;
2635 res.type = RTN_LOCAL;
2636 flags |= RTCF_LOCAL;
2637 goto make_route;
2638 }
2639
2640 if (fib_lookup(net, fl4, &res)) {
2641 res.fi = NULL;
2642 if (fl4->flowi4_oif) {
2643 			/* Apparently, the routing tables are wrong. Assume
2644 			   that the destination is on-link.
2645 
2646 			   WHY? DW.
2647 			   Because we are allowed to send to an iface
2648 			   even if it has NO routes and NO assigned
2649 			   addresses. When oif is specified, routing
2650 			   tables are looked up with only one purpose:
2651 			   to catch whether the destination is gatewayed, rather than
2652 			   direct. Moreover, if MSG_DONTROUTE is set,
2653 			   we send the packet, ignoring both routing tables
2654 			   and ifaddr state. --ANK
2655 
2656 
2657 			   We could do the same even if oif is unknown
2658 			   (as IPv6 likely does), but we do not.
2659 			 */
2660
2661 if (fl4->saddr == 0)
2662 fl4->saddr = inet_select_addr(dev_out, 0,
2663 RT_SCOPE_LINK);
2664 res.type = RTN_UNICAST;
2665 goto make_route;
2666 }
2667 rth = ERR_PTR(-ENETUNREACH);
2668 goto out;
2669 }
2670
2671 if (res.type == RTN_LOCAL) {
2672 if (!fl4->saddr) {
2673 if (res.fi->fib_prefsrc)
2674 fl4->saddr = res.fi->fib_prefsrc;
2675 else
2676 fl4->saddr = fl4->daddr;
2677 }
2678 dev_out = net->loopback_dev;
2679 fl4->flowi4_oif = dev_out->ifindex;
2680 res.fi = NULL;
2681 flags |= RTCF_LOCAL;
2682 goto make_route;
2683 }
2684
2685 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2686 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2687 fib_select_multipath(&res);
2688 else
2689 #endif
2690 if (!res.prefixlen &&
2691 res.table->tb_num_default > 1 &&
2692 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2693 fib_select_default(&res);
2694
2695 if (!fl4->saddr)
2696 fl4->saddr = FIB_RES_PREFSRC(net, res);
2697
2698 dev_out = FIB_RES_DEV(res);
2699 fl4->flowi4_oif = dev_out->ifindex;
2700
2701
2702 make_route:
2703 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2704 dev_out, flags);
2705 if (!IS_ERR(rth)) {
2706 unsigned int hash;
2707
2708 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2709 rt_genid(dev_net(dev_out)));
2710 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2711 }
2712
2713 out:
2714 rcu_read_unlock();
2715 return rth;
2716 }
2717
2718 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2719 {
2720 struct rtable *rth;
2721 unsigned int hash;
2722
2723 if (!rt_caching(net))
2724 goto slow_output;
2725
2726 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2727
2728 rcu_read_lock_bh();
2729 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2730 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2731 if (rth->rt_key_dst == flp4->daddr &&
2732 rth->rt_key_src == flp4->saddr &&
2733 rt_is_output_route(rth) &&
2734 rth->rt_oif == flp4->flowi4_oif &&
2735 rth->rt_mark == flp4->flowi4_mark &&
2736 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2737 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2738 net_eq(dev_net(rth->dst.dev), net) &&
2739 !rt_is_expired(rth)) {
2740 dst_use(&rth->dst, jiffies);
2741 RT_CACHE_STAT_INC(out_hit);
2742 rcu_read_unlock_bh();
2743 if (!flp4->saddr)
2744 flp4->saddr = rth->rt_src;
2745 if (!flp4->daddr)
2746 flp4->daddr = rth->rt_dst;
2747 return rth;
2748 }
2749 RT_CACHE_STAT_INC(out_hlist_search);
2750 }
2751 rcu_read_unlock_bh();
2752
2753 slow_output:
2754 return ip_route_output_slow(net, flp4);
2755 }
2756 EXPORT_SYMBOL_GPL(__ip_route_output_key);
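/*
 * Illustrative sketch (hypothetical, not part of the upstream file):
 * a minimal egress lookup through the fast path above.  The flowi4
 * values are examples only; callers must check IS_ERR() and release
 * the route with ip_rt_put() when done.
 */
static struct rtable *__maybe_unused example_route_egress(struct net *net,
							  __be32 daddr)
{
	struct flowi4 fl4 = {
		.daddr = daddr,
		.flowi4_tos = 0,
		.flowi4_oif = 0,	/* let the resolver pick the device */
	};

	return __ip_route_output_key(net, &fl4);
}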
2757
2758 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2759 {
2760 return NULL;
2761 }
2762
2763 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2764 {
2765 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2766
2767 return mtu ? : dst->dev->mtu;
2768 }
2769
2770 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2771 {
2772 }
2773
2774 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2775 unsigned long old)
2776 {
2777 return NULL;
2778 }
2779
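/*
 * dst_ops used by ipv4_blackhole_route() below to build entries that
 * discard all traffic: ->check() always fails, PMTU updates are
 * ignored and metrics are never COWed, so the clone stays frozen.
 */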
2780 static struct dst_ops ipv4_dst_blackhole_ops = {
2781 .family = AF_INET,
2782 .protocol = cpu_to_be16(ETH_P_IP),
2783 .destroy = ipv4_dst_destroy,
2784 .check = ipv4_blackhole_dst_check,
2785 .mtu = ipv4_blackhole_mtu,
2786 .default_advmss = ipv4_default_advmss,
2787 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2788 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2789 .neigh_lookup = ipv4_neigh_lookup,
2790 };
2791
2792 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2793 {
2794 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2795 struct rtable *ort = (struct rtable *) dst_orig;
2796
2797 if (rt) {
2798 struct dst_entry *new = &rt->dst;
2799
2800 new->__use = 1;
2801 new->input = dst_discard;
2802 new->output = dst_discard;
2803 dst_copy_metrics(new, &ort->dst);
2804
2805 new->dev = ort->dst.dev;
2806 if (new->dev)
2807 dev_hold(new->dev);
2808
2809 rt->rt_key_dst = ort->rt_key_dst;
2810 rt->rt_key_src = ort->rt_key_src;
2811 rt->rt_key_tos = ort->rt_key_tos;
2812 rt->rt_route_iif = ort->rt_route_iif;
2813 rt->rt_iif = ort->rt_iif;
2814 rt->rt_oif = ort->rt_oif;
2815 rt->rt_mark = ort->rt_mark;
2816
2817 rt->rt_genid = rt_genid(net);
2818 rt->rt_flags = ort->rt_flags;
2819 rt->rt_type = ort->rt_type;
2820 rt->rt_dst = ort->rt_dst;
2821 rt->rt_src = ort->rt_src;
2822 rt->rt_gateway = ort->rt_gateway;
2823 rt->rt_spec_dst = ort->rt_spec_dst;
2824 rt->peer = ort->peer;
2825 if (rt->peer)
2826 atomic_inc(&rt->peer->refcnt);
2827 rt->fi = ort->fi;
2828 if (rt->fi)
2829 atomic_inc(&rt->fi->fib_clntref);
2830
2831 dst_free(new);
2832 }
2833
2834 dst_release(dst_orig);
2835
2836 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2837 }
2838
2839 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2840 struct sock *sk)
2841 {
2842 struct rtable *rt = __ip_route_output_key(net, flp4);
2843
2844 if (IS_ERR(rt))
2845 return rt;
2846
2847 if (flp4->flowi4_proto)
2848 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2849 flowi4_to_flowi(flp4),
2850 sk, 0);
2851
2852 return rt;
2853 }
2854 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2855
2856 static int rt_fill_info(struct net *net,
2857 struct sk_buff *skb, u32 pid, u32 seq, int event,
2858 int nowait, unsigned int flags)
2859 {
2860 struct rtable *rt = skb_rtable(skb);
2861 struct rtmsg *r;
2862 struct nlmsghdr *nlh;
2863 unsigned long expires = 0;
2864 const struct inet_peer *peer = rt->peer;
2865 u32 id = 0, ts = 0, tsage = 0, error;
2866
2867 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2868 if (nlh == NULL)
2869 return -EMSGSIZE;
2870
2871 r = nlmsg_data(nlh);
2872 r->rtm_family = AF_INET;
2873 r->rtm_dst_len = 32;
2874 r->rtm_src_len = 0;
2875 r->rtm_tos = rt->rt_key_tos;
2876 r->rtm_table = RT_TABLE_MAIN;
2877 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2878 r->rtm_type = rt->rt_type;
2879 r->rtm_scope = RT_SCOPE_UNIVERSE;
2880 r->rtm_protocol = RTPROT_UNSPEC;
2881 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2882 if (rt->rt_flags & RTCF_NOTIFY)
2883 r->rtm_flags |= RTM_F_NOTIFY;
2884
2885 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2886
2887 if (rt->rt_key_src) {
2888 r->rtm_src_len = 32;
2889 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2890 }
2891 if (rt->dst.dev)
2892 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2893 #ifdef CONFIG_IP_ROUTE_CLASSID
2894 if (rt->dst.tclassid)
2895 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2896 #endif
2897 if (rt_is_input_route(rt))
2898 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2899 else if (rt->rt_src != rt->rt_key_src)
2900 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2901
2902 if (rt->rt_dst != rt->rt_gateway)
2903 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2904
2905 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2906 goto nla_put_failure;
2907
2908 if (rt->rt_mark)
2909 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2910
2911 error = rt->dst.error;
2912 if (peer) {
2913 inet_peer_refcheck(rt->peer);
2914 id = atomic_read(&peer->ip_id_count) & 0xffff;
2915 if (peer->tcp_ts_stamp) {
2916 ts = peer->tcp_ts;
2917 tsage = get_seconds() - peer->tcp_ts_stamp;
2918 }
2919 expires = ACCESS_ONCE(peer->pmtu_expires);
2920 if (expires) {
2921 if (time_before(jiffies, expires))
2922 expires -= jiffies;
2923 else
2924 expires = 0;
2925 }
2926 }
2927
2928 if (rt_is_input_route(rt)) {
2929 #ifdef CONFIG_IP_MROUTE
2930 __be32 dst = rt->rt_dst;
2931
2932 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2933 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2934 int err = ipmr_get_route(net, skb,
2935 rt->rt_src, rt->rt_dst,
2936 r, nowait);
2937 if (err <= 0) {
2938 if (!nowait) {
2939 if (err == 0)
2940 return 0;
2941 goto nla_put_failure;
2942 } else {
2943 if (err == -EMSGSIZE)
2944 goto nla_put_failure;
2945 error = err;
2946 }
2947 }
2948 } else
2949 #endif
2950 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2951 }
2952
2953 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2954 expires, error) < 0)
2955 goto nla_put_failure;
2956
2957 return nlmsg_end(skb, nlh);
2958
2959 nla_put_failure:
2960 nlmsg_cancel(skb, nlh);
2961 return -EMSGSIZE;
2962 }
2963
2964 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2965 {
2966 struct net *net = sock_net(in_skb->sk);
2967 struct rtmsg *rtm;
2968 struct nlattr *tb[RTA_MAX+1];
2969 struct rtable *rt = NULL;
2970 __be32 dst = 0;
2971 __be32 src = 0;
2972 u32 iif;
2973 int err;
2974 int mark;
2975 struct sk_buff *skb;
2976
2977 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2978 if (err < 0)
2979 goto errout;
2980
2981 rtm = nlmsg_data(nlh);
2982
2983 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2984 if (skb == NULL) {
2985 err = -ENOBUFS;
2986 goto errout;
2987 }
2988
2989 	/* Reserve room for dummy headers; this skb can pass
2990 	   through a good chunk of the routing engine.
2991 	 */
2992 skb_reset_mac_header(skb);
2993 skb_reset_network_header(skb);
2994
2995 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2996 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2997 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2998
2999 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3000 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3001 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3002 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3003
3004 if (iif) {
3005 struct net_device *dev;
3006
3007 dev = __dev_get_by_index(net, iif);
3008 if (dev == NULL) {
3009 err = -ENODEV;
3010 goto errout_free;
3011 }
3012
3013 skb->protocol = htons(ETH_P_IP);
3014 skb->dev = dev;
3015 skb->mark = mark;
3016 local_bh_disable();
3017 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3018 local_bh_enable();
3019
3020 rt = skb_rtable(skb);
3021 if (err == 0 && rt->dst.error)
3022 err = -rt->dst.error;
3023 } else {
3024 struct flowi4 fl4 = {
3025 .daddr = dst,
3026 .saddr = src,
3027 .flowi4_tos = rtm->rtm_tos,
3028 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3029 .flowi4_mark = mark,
3030 };
3031 rt = ip_route_output_key(net, &fl4);
3032
3033 err = 0;
3034 if (IS_ERR(rt))
3035 err = PTR_ERR(rt);
3036 }
3037
3038 if (err)
3039 goto errout_free;
3040
3041 skb_dst_set(skb, &rt->dst);
3042 if (rtm->rtm_flags & RTM_F_NOTIFY)
3043 rt->rt_flags |= RTCF_NOTIFY;
3044
3045 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3046 RTM_NEWROUTE, 0, 0);
3047 if (err <= 0)
3048 goto errout_free;
3049
3050 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3051 errout:
3052 return err;
3053
3054 errout_free:
3055 kfree_skb(skb);
3056 goto errout;
3057 }
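/*
 * The handler above answers RTM_GETROUTE requests; for example,
 * iproute2's "ip route get 10.0.0.7" (address illustrative) sends such
 * a request and prints the rt_fill_info() reply.
 */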
3058
3059 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3060 {
3061 struct rtable *rt;
3062 int h, s_h;
3063 int idx, s_idx;
3064 struct net *net;
3065
3066 net = sock_net(skb->sk);
3067
3068 s_h = cb->args[0];
3069 if (s_h < 0)
3070 s_h = 0;
3071 s_idx = idx = cb->args[1];
3072 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3073 if (!rt_hash_table[h].chain)
3074 continue;
3075 rcu_read_lock_bh();
3076 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3077 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3078 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3079 continue;
3080 if (rt_is_expired(rt))
3081 continue;
3082 skb_dst_set_noref(skb, &rt->dst);
3083 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3084 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3085 1, NLM_F_MULTI) <= 0) {
3086 skb_dst_drop(skb);
3087 rcu_read_unlock_bh();
3088 goto done;
3089 }
3090 skb_dst_drop(skb);
3091 }
3092 rcu_read_unlock_bh();
3093 }
3094
3095 done:
3096 cb->args[0] = h;
3097 cb->args[1] = idx;
3098 return skb->len;
3099 }
3100
3101 void ip_rt_multicast_event(struct in_device *in_dev)
3102 {
3103 rt_cache_flush(dev_net(in_dev->dev), 0);
3104 }
3105
3106 #ifdef CONFIG_SYSCTL
3107 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3108 void __user *buffer,
3109 size_t *lenp, loff_t *ppos)
3110 {
3111 if (write) {
3112 int flush_delay;
3113 ctl_table ctl;
3114 struct net *net;
3115
3116 memcpy(&ctl, __ctl, sizeof(ctl));
3117 ctl.data = &flush_delay;
3118 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3119
3120 net = (struct net *)__ctl->extra1;
3121 rt_cache_flush(net, flush_delay);
3122 return 0;
3123 }
3124
3125 return -EINVAL;
3126 }
3127
3128 static ctl_table ipv4_route_table[] = {
3129 {
3130 .procname = "gc_thresh",
3131 .data = &ipv4_dst_ops.gc_thresh,
3132 .maxlen = sizeof(int),
3133 .mode = 0644,
3134 .proc_handler = proc_dointvec,
3135 },
3136 {
3137 .procname = "max_size",
3138 .data = &ip_rt_max_size,
3139 .maxlen = sizeof(int),
3140 .mode = 0644,
3141 .proc_handler = proc_dointvec,
3142 },
3143 {
3144 /* Deprecated. Use gc_min_interval_ms */
3145
3146 .procname = "gc_min_interval",
3147 .data = &ip_rt_gc_min_interval,
3148 .maxlen = sizeof(int),
3149 .mode = 0644,
3150 .proc_handler = proc_dointvec_jiffies,
3151 },
3152 {
3153 .procname = "gc_min_interval_ms",
3154 .data = &ip_rt_gc_min_interval,
3155 .maxlen = sizeof(int),
3156 .mode = 0644,
3157 .proc_handler = proc_dointvec_ms_jiffies,
3158 },
3159 {
3160 .procname = "gc_timeout",
3161 .data = &ip_rt_gc_timeout,
3162 .maxlen = sizeof(int),
3163 .mode = 0644,
3164 .proc_handler = proc_dointvec_jiffies,
3165 },
3166 {
3167 .procname = "redirect_load",
3168 .data = &ip_rt_redirect_load,
3169 .maxlen = sizeof(int),
3170 .mode = 0644,
3171 .proc_handler = proc_dointvec,
3172 },
3173 {
3174 .procname = "redirect_number",
3175 .data = &ip_rt_redirect_number,
3176 .maxlen = sizeof(int),
3177 .mode = 0644,
3178 .proc_handler = proc_dointvec,
3179 },
3180 {
3181 .procname = "redirect_silence",
3182 .data = &ip_rt_redirect_silence,
3183 .maxlen = sizeof(int),
3184 .mode = 0644,
3185 .proc_handler = proc_dointvec,
3186 },
3187 {
3188 .procname = "error_cost",
3189 .data = &ip_rt_error_cost,
3190 .maxlen = sizeof(int),
3191 .mode = 0644,
3192 .proc_handler = proc_dointvec,
3193 },
3194 {
3195 .procname = "error_burst",
3196 .data = &ip_rt_error_burst,
3197 .maxlen = sizeof(int),
3198 .mode = 0644,
3199 .proc_handler = proc_dointvec,
3200 },
3201 {
3202 .procname = "gc_elasticity",
3203 .data = &ip_rt_gc_elasticity,
3204 .maxlen = sizeof(int),
3205 .mode = 0644,
3206 .proc_handler = proc_dointvec,
3207 },
3208 {
3209 .procname = "mtu_expires",
3210 .data = &ip_rt_mtu_expires,
3211 .maxlen = sizeof(int),
3212 .mode = 0644,
3213 .proc_handler = proc_dointvec_jiffies,
3214 },
3215 {
3216 .procname = "min_pmtu",
3217 .data = &ip_rt_min_pmtu,
3218 .maxlen = sizeof(int),
3219 .mode = 0644,
3220 .proc_handler = proc_dointvec,
3221 },
3222 {
3223 .procname = "min_adv_mss",
3224 .data = &ip_rt_min_advmss,
3225 .maxlen = sizeof(int),
3226 .mode = 0644,
3227 .proc_handler = proc_dointvec,
3228 },
3229 { }
3230 };
3231
3232 static struct ctl_table empty[1];
3233
3234 static struct ctl_table ipv4_skeleton[] =
3235 {
3236 { .procname = "route",
3237 .mode = 0555, .child = ipv4_route_table},
3238 { .procname = "neigh",
3239 .mode = 0555, .child = empty},
3240 { }
3241 };
3242
3243 static __net_initdata struct ctl_path ipv4_path[] = {
3244 { .procname = "net", },
3245 { .procname = "ipv4", },
3246 { },
3247 };
3248
3249 static struct ctl_table ipv4_route_flush_table[] = {
3250 {
3251 .procname = "flush",
3252 .maxlen = sizeof(int),
3253 .mode = 0200,
3254 .proc_handler = ipv4_sysctl_rtcache_flush,
3255 },
3256 { },
3257 };
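/*
 * The table above backs /proc/sys/net/ipv4/route/flush (write-only).
 * Usage sketch from userspace: writing an integer flushes the cache,
 * with the value handed to rt_cache_flush() as the flush delay, e.g.:
 *
 *   echo -1 > /proc/sys/net/ipv4/route/flush
 */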
3258
3259 static __net_initdata struct ctl_path ipv4_route_path[] = {
3260 { .procname = "net", },
3261 { .procname = "ipv4", },
3262 { .procname = "route", },
3263 { },
3264 };
3265
3266 static __net_init int sysctl_route_net_init(struct net *net)
3267 {
3268 struct ctl_table *tbl;
3269
3270 tbl = ipv4_route_flush_table;
3271 if (!net_eq(net, &init_net)) {
3272 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3273 if (tbl == NULL)
3274 goto err_dup;
3275 }
3276 tbl[0].extra1 = net;
3277
3278 net->ipv4.route_hdr =
3279 register_net_sysctl_table(net, ipv4_route_path, tbl);
3280 if (net->ipv4.route_hdr == NULL)
3281 goto err_reg;
3282 return 0;
3283
3284 err_reg:
3285 if (tbl != ipv4_route_flush_table)
3286 kfree(tbl);
3287 err_dup:
3288 return -ENOMEM;
3289 }
3290
3291 static __net_exit void sysctl_route_net_exit(struct net *net)
3292 {
3293 struct ctl_table *tbl;
3294
3295 tbl = net->ipv4.route_hdr->ctl_table_arg;
3296 unregister_net_sysctl_table(net->ipv4.route_hdr);
3297 BUG_ON(tbl == ipv4_route_flush_table);
3298 kfree(tbl);
3299 }
3300
3301 static __net_initdata struct pernet_operations sysctl_route_ops = {
3302 .init = sysctl_route_net_init,
3303 .exit = sysctl_route_net_exit,
3304 };
3305 #endif
3306
3307 static __net_init int rt_genid_init(struct net *net)
3308 {
3309 get_random_bytes(&net->ipv4.rt_genid,
3310 sizeof(net->ipv4.rt_genid));
3311 get_random_bytes(&net->ipv4.dev_addr_genid,
3312 sizeof(net->ipv4.dev_addr_genid));
3313 return 0;
3314 }
3315
3316 static __net_initdata struct pernet_operations rt_genid_ops = {
3317 .init = rt_genid_init,
3318 };
3319
3320
3321 #ifdef CONFIG_IP_ROUTE_CLASSID
3322 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3323 #endif /* CONFIG_IP_ROUTE_CLASSID */
3324
3325 static __initdata unsigned long rhash_entries;
3326 static int __init set_rhash_entries(char *str)
3327 {
3328 if (!str)
3329 return 0;
3330 rhash_entries = simple_strtoul(str, &str, 0);
3331 return 1;
3332 }
3333 __setup("rhash_entries=", set_rhash_entries);
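/*
 * The boot parameter above overrides the auto-sized route-cache hash;
 * e.g. booting with "rhash_entries=65536" (an illustrative value)
 * requests that many buckets from alloc_large_system_hash() below.
 */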
3334
3335 int __init ip_rt_init(void)
3336 {
3337 int rc = 0;
3338
3339 #ifdef CONFIG_IP_ROUTE_CLASSID
3340 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3341 if (!ip_rt_acct)
3342 panic("IP: failed to allocate ip_rt_acct\n");
3343 #endif
3344
3345 ipv4_dst_ops.kmem_cachep =
3346 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3347 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3348
3349 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3350
3351 if (dst_entries_init(&ipv4_dst_ops) < 0)
3352 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3353
3354 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3355 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3356
3357 rt_hash_table = (struct rt_hash_bucket *)
3358 alloc_large_system_hash("IP route cache",
3359 sizeof(struct rt_hash_bucket),
3360 rhash_entries,
3361 (totalram_pages >= 128 * 1024) ?
3362 15 : 17,
3363 0,
3364 &rt_hash_log,
3365 &rt_hash_mask,
3366 rhash_entries ? 0 : 512 * 1024);
3367 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3368 rt_hash_lock_init();
3369
3370 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3371 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3372
3373 devinet_init();
3374 ip_fib_init();
3375
3376 if (ip_rt_proc_init())
3377 printk(KERN_ERR "Unable to create route proc files\n");
3378 #ifdef CONFIG_XFRM
3379 xfrm_init();
3380 xfrm4_init(ip_rt_max_size);
3381 #endif
3382 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3383
3384 #ifdef CONFIG_SYSCTL
3385 register_pernet_subsys(&sysctl_route_ops);
3386 #endif
3387 register_pernet_subsys(&rt_genid_ops);
3388 return rc;
3389 }
3390
3391 #ifdef CONFIG_SYSCTL
3392 /*
3393 * We really need to sanitize the damn ipv4 init order, then all
3394 * this nonsense will go away.
3395 */
3396 void __init ip_static_sysctl_init(void)
3397 {
3398 register_sysctl_paths(ipv4_path, ipv4_skeleton);
3399 }
3400 #endif