1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD,
35 * though our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
39 *
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/bootmem.h>
74 #include <linux/string.h>
75 #include <linux/socket.h>
76 #include <linux/sockios.h>
77 #include <linux/errno.h>
78 #include <linux/in.h>
79 #include <linux/inet.h>
80 #include <linux/netdevice.h>
81 #include <linux/proc_fs.h>
82 #include <linux/init.h>
83 #include <linux/workqueue.h>
84 #include <linux/skbuff.h>
85 #include <linux/inetdevice.h>
86 #include <linux/igmp.h>
87 #include <linux/pkt_sched.h>
88 #include <linux/mroute.h>
89 #include <linux/netfilter_ipv4.h>
90 #include <linux/random.h>
91 #include <linux/jhash.h>
92 #include <linux/rcupdate.h>
93 #include <linux/times.h>
94 #include <linux/slab.h>
95 #include <linux/prefetch.h>
96 #include <net/dst.h>
97 #include <net/net_namespace.h>
98 #include <net/protocol.h>
99 #include <net/ip.h>
100 #include <net/route.h>
101 #include <net/inetpeer.h>
102 #include <net/sock.h>
103 #include <net/ip_fib.h>
104 #include <net/arp.h>
105 #include <net/tcp.h>
106 #include <net/icmp.h>
107 #include <net/xfrm.h>
108 #include <net/netevent.h>
109 #include <net/rtnetlink.h>
110 #ifdef CONFIG_SYSCTL
111 #include <linux/sysctl.h>
112 #include <linux/kmemleak.h>
113 #endif
114 #include <net/secure_seq.h>
115
116 #define RT_FL_TOS(oldflp4) \
117 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118
119 #define IP_MAX_MTU 0xFFF0
120
121 #define RT_GC_TIMEOUT (300*HZ)
122
123 static int ip_rt_max_size;
124 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
125 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
126 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
127 static int ip_rt_redirect_number __read_mostly = 9;
128 static int ip_rt_redirect_load __read_mostly = HZ / 50;
129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
130 static int ip_rt_error_cost __read_mostly = HZ;
131 static int ip_rt_error_burst __read_mostly = 5 * HZ;
132 static int ip_rt_gc_elasticity __read_mostly = 8;
133 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
134 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
135 static int ip_rt_min_advmss __read_mostly = 256;
136 static int rt_chain_length_max __read_mostly = 20;
137
138 static struct delayed_work expires_work;
139 static unsigned long expires_ljiffies;
140
141 /*
142 * Interface to generic destination cache.
143 */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
147 static unsigned int ipv4_mtu(const struct dst_entry *dst);
148 static void ipv4_dst_destroy(struct dst_entry *dst);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void ipv4_link_failure(struct sk_buff *skb);
151 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
153
154 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
155 int how)
156 {
157 }
158
159 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
160 {
161 struct rtable *rt = (struct rtable *) dst;
162 struct inet_peer *peer;
163 u32 *p = NULL;
164
165 if (!rt->peer)
166 rt_bind_peer(rt, rt->rt_dst, 1);
167
168 peer = rt->peer;
169 if (peer) {
170 u32 *old_p = __DST_METRICS_PTR(old);
171 unsigned long prev, new;
172
173 p = peer->metrics;
174 if (inet_metrics_new(peer))
175 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
176
177 new = (unsigned long) p;
178 prev = cmpxchg(&dst->_metrics, old, new);
179
180 if (prev != old) {
181 p = __DST_METRICS_PTR(prev);
182 if (prev & DST_METRICS_READ_ONLY)
183 p = NULL;
184 } else {
185 if (rt->fi) {
186 fib_info_put(rt->fi);
187 rt->fi = NULL;
188 }
189 }
190 }
191 return p;
192 }
193
194 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
195
196 static struct dst_ops ipv4_dst_ops = {
197 .family = AF_INET,
198 .protocol = cpu_to_be16(ETH_P_IP),
199 .gc = rt_garbage_collect,
200 .check = ipv4_dst_check,
201 .default_advmss = ipv4_default_advmss,
202 .mtu = ipv4_mtu,
203 .cow_metrics = ipv4_cow_metrics,
204 .destroy = ipv4_dst_destroy,
205 .ifdown = ipv4_dst_ifdown,
206 .negative_advice = ipv4_negative_advice,
207 .link_failure = ipv4_link_failure,
208 .update_pmtu = ip_rt_update_pmtu,
209 .local_out = __ip_local_out,
210 .neigh_lookup = ipv4_neigh_lookup,
211 };
212
213 #define ECN_OR_COST(class) TC_PRIO_##class
214
215 const __u8 ip_tos2prio[16] = {
216 TC_PRIO_BESTEFFORT,
217 ECN_OR_COST(BESTEFFORT),
218 TC_PRIO_BESTEFFORT,
219 ECN_OR_COST(BESTEFFORT),
220 TC_PRIO_BULK,
221 ECN_OR_COST(BULK),
222 TC_PRIO_BULK,
223 ECN_OR_COST(BULK),
224 TC_PRIO_INTERACTIVE,
225 ECN_OR_COST(INTERACTIVE),
226 TC_PRIO_INTERACTIVE,
227 ECN_OR_COST(INTERACTIVE),
228 TC_PRIO_INTERACTIVE_BULK,
229 ECN_OR_COST(INTERACTIVE_BULK),
230 TC_PRIO_INTERACTIVE_BULK,
231 ECN_OR_COST(INTERACTIVE_BULK)
232 };
233 EXPORT_SYMBOL(ip_tos2prio);
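/*
 * Editorial note (not in the original file): callers map the IPv4 TOS
 * byte to a queueing priority by indexing this table. A minimal sketch,
 * modelled on the rt_tos2priority() helper declared in
 * include/net/route.h:
 */
static inline char example_tos2priority(u8 tos)
{
	/* IPTOS_TOS() keeps TOS bits 1..4; the >> 1 turns them into a
	 * 0..15 index. Odd indexes select the ECN_OR_COST() entries
	 * (identical to their TC_PRIO_* neighbours with the definition
	 * above). */
	return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}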
234
235 /*
236 * Route cache.
237 */
238
239 /* The locking scheme is rather straightforward:
240 *
241 * 1) Read-Copy Update protects the buckets of the central route hash.
242 * 2) Only writers remove entries, and they hold the lock
243 * as they look at rtable reference counts.
244 * 3) Only readers acquire references to rtable entries,
245 * they do so with atomic increments and with the
246 * lock held.
247 */
248
249 struct rt_hash_bucket {
250 struct rtable __rcu *chain;
251 };
252
253 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
254 defined(CONFIG_PROVE_LOCKING)
255 /*
256 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
257 * The size of this table is a power of two and depends on the number of CPUs.
258 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
259 */
260 #ifdef CONFIG_LOCKDEP
261 # define RT_HASH_LOCK_SZ 256
262 #else
263 # if NR_CPUS >= 32
264 # define RT_HASH_LOCK_SZ 4096
265 # elif NR_CPUS >= 16
266 # define RT_HASH_LOCK_SZ 2048
267 # elif NR_CPUS >= 8
268 # define RT_HASH_LOCK_SZ 1024
269 # elif NR_CPUS >= 4
270 # define RT_HASH_LOCK_SZ 512
271 # else
272 # define RT_HASH_LOCK_SZ 256
273 # endif
274 #endif
275
276 static spinlock_t *rt_hash_locks;
277 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
278
279 static __init void rt_hash_lock_init(void)
280 {
281 int i;
282
283 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
284 GFP_KERNEL);
285 if (!rt_hash_locks)
286 panic("IP: failed to allocate rt_hash_locks\n");
287
288 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
289 spin_lock_init(&rt_hash_locks[i]);
290 }
291 #else
292 # define rt_hash_lock_addr(slot) NULL
293
294 static inline void rt_hash_lock_init(void)
295 {
296 }
297 #endif
298
299 static struct rt_hash_bucket *rt_hash_table __read_mostly;
300 static unsigned int rt_hash_mask __read_mostly;
301 static unsigned int rt_hash_log __read_mostly;
302
303 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
304 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
305
306 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
307 int genid)
308 {
309 return jhash_3words((__force u32)daddr, (__force u32)saddr,
310 idx, genid)
311 & rt_hash_mask;
312 }
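/*
 * Editorial sketch (not in the original file): the lockless reader
 * pattern implied by the locking scheme above. The real lookup paths
 * later in this file follow this shape.
 */
static struct rtable *rt_hash_walk_sketch(unsigned int hash)
{
	struct rtable *r;

	rcu_read_lock_bh();
	for (r = rcu_dereference_bh(rt_hash_table[hash].chain); r;
	     r = rcu_dereference_bh(r->dst.rt_next)) {
		/* compare keys here; on a match, take a reference
		 * (e.g. via dst_use()) before leaving the RCU section */
	}
	rcu_read_unlock_bh();
	return NULL;
}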
313
314 static inline int rt_genid(struct net *net)
315 {
316 return atomic_read(&net->ipv4.rt_genid);
317 }
318
319 #ifdef CONFIG_PROC_FS
320 struct rt_cache_iter_state {
321 struct seq_net_private p;
322 int bucket;
323 int genid;
324 };
325
326 static struct rtable *rt_cache_get_first(struct seq_file *seq)
327 {
328 struct rt_cache_iter_state *st = seq->private;
329 struct rtable *r = NULL;
330
331 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
332 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
333 continue;
334 rcu_read_lock_bh();
335 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
336 while (r) {
337 if (dev_net(r->dst.dev) == seq_file_net(seq) &&
338 r->rt_genid == st->genid)
339 return r;
340 r = rcu_dereference_bh(r->dst.rt_next);
341 }
342 rcu_read_unlock_bh();
343 }
344 return r;
345 }
346
347 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
348 struct rtable *r)
349 {
350 struct rt_cache_iter_state *st = seq->private;
351
352 r = rcu_dereference_bh(r->dst.rt_next);
353 while (!r) {
354 rcu_read_unlock_bh();
355 do {
356 if (--st->bucket < 0)
357 return NULL;
358 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
359 rcu_read_lock_bh();
360 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
361 }
362 return r;
363 }
364
365 static struct rtable *rt_cache_get_next(struct seq_file *seq,
366 struct rtable *r)
367 {
368 struct rt_cache_iter_state *st = seq->private;
369 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
370 if (dev_net(r->dst.dev) != seq_file_net(seq))
371 continue;
372 if (r->rt_genid == st->genid)
373 break;
374 }
375 return r;
376 }
377
378 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
379 {
380 struct rtable *r = rt_cache_get_first(seq);
381
382 if (r)
383 while (pos && (r = rt_cache_get_next(seq, r)))
384 --pos;
385 return pos ? NULL : r;
386 }
387
388 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
389 {
390 struct rt_cache_iter_state *st = seq->private;
391 if (*pos)
392 return rt_cache_get_idx(seq, *pos - 1);
393 st->genid = rt_genid(seq_file_net(seq));
394 return SEQ_START_TOKEN;
395 }
396
397 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
398 {
399 struct rtable *r;
400
401 if (v == SEQ_START_TOKEN)
402 r = rt_cache_get_first(seq);
403 else
404 r = rt_cache_get_next(seq, v);
405 ++*pos;
406 return r;
407 }
408
409 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
410 {
411 if (v && v != SEQ_START_TOKEN)
412 rcu_read_unlock_bh();
413 }
414
415 static int rt_cache_seq_show(struct seq_file *seq, void *v)
416 {
417 if (v == SEQ_START_TOKEN)
418 seq_printf(seq, "%-127s\n",
419 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
420 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
421 "HHUptod\tSpecDst");
422 else {
423 struct rtable *r = v;
424 struct neighbour *n;
425 int len, HHUptod;
426
427 rcu_read_lock();
428 n = dst_get_neighbour_noref(&r->dst);
429 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
430 rcu_read_unlock();
431
432 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
433 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
434 r->dst.dev ? r->dst.dev->name : "*",
435 (__force u32)r->rt_dst,
436 (__force u32)r->rt_gateway,
437 r->rt_flags, atomic_read(&r->dst.__refcnt),
438 r->dst.__use, 0, (__force u32)r->rt_src,
439 dst_metric_advmss(&r->dst) + 40,
440 dst_metric(&r->dst, RTAX_WINDOW),
441 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
442 dst_metric(&r->dst, RTAX_RTTVAR)),
443 r->rt_key_tos,
444 -1,
445 HHUptod,
446 r->rt_spec_dst, &len);
447
448 seq_printf(seq, "%*s\n", 127 - len, "");
449 }
450 return 0;
451 }
452
453 static const struct seq_operations rt_cache_seq_ops = {
454 .start = rt_cache_seq_start,
455 .next = rt_cache_seq_next,
456 .stop = rt_cache_seq_stop,
457 .show = rt_cache_seq_show,
458 };
459
460 static int rt_cache_seq_open(struct inode *inode, struct file *file)
461 {
462 return seq_open_net(inode, file, &rt_cache_seq_ops,
463 sizeof(struct rt_cache_iter_state));
464 }
465
466 static const struct file_operations rt_cache_seq_fops = {
467 .owner = THIS_MODULE,
468 .open = rt_cache_seq_open,
469 .read = seq_read,
470 .llseek = seq_lseek,
471 .release = seq_release_net,
472 };
473
474
475 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
476 {
477 int cpu;
478
479 if (*pos == 0)
480 return SEQ_START_TOKEN;
481
482 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
483 if (!cpu_possible(cpu))
484 continue;
485 *pos = cpu+1;
486 return &per_cpu(rt_cache_stat, cpu);
487 }
488 return NULL;
489 }
490
491 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
492 {
493 int cpu;
494
495 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
496 if (!cpu_possible(cpu))
497 continue;
498 *pos = cpu+1;
499 return &per_cpu(rt_cache_stat, cpu);
500 }
501 return NULL;
502
503 }
504
505 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
506 {
507
508 }
509
510 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
511 {
512 struct rt_cache_stat *st = v;
513
514 if (v == SEQ_START_TOKEN) {
515 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
516 return 0;
517 }
518
519 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
520 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
521 dst_entries_get_slow(&ipv4_dst_ops),
522 st->in_hit,
523 st->in_slow_tot,
524 st->in_slow_mc,
525 st->in_no_route,
526 st->in_brd,
527 st->in_martian_dst,
528 st->in_martian_src,
529
530 st->out_hit,
531 st->out_slow_tot,
532 st->out_slow_mc,
533
534 st->gc_total,
535 st->gc_ignored,
536 st->gc_goal_miss,
537 st->gc_dst_overflow,
538 st->in_hlist_search,
539 st->out_hlist_search
540 );
541 return 0;
542 }
543
544 static const struct seq_operations rt_cpu_seq_ops = {
545 .start = rt_cpu_seq_start,
546 .next = rt_cpu_seq_next,
547 .stop = rt_cpu_seq_stop,
548 .show = rt_cpu_seq_show,
549 };
550
551
552 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
553 {
554 return seq_open(file, &rt_cpu_seq_ops);
555 }
556
557 static const struct file_operations rt_cpu_seq_fops = {
558 .owner = THIS_MODULE,
559 .open = rt_cpu_seq_open,
560 .read = seq_read,
561 .llseek = seq_lseek,
562 .release = seq_release,
563 };
564
565 #ifdef CONFIG_IP_ROUTE_CLASSID
566 static int rt_acct_proc_show(struct seq_file *m, void *v)
567 {
568 struct ip_rt_acct *dst, *src;
569 unsigned int i, j;
570
571 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
572 if (!dst)
573 return -ENOMEM;
574
575 for_each_possible_cpu(i) {
576 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
577 for (j = 0; j < 256; j++) {
578 dst[j].o_bytes += src[j].o_bytes;
579 dst[j].o_packets += src[j].o_packets;
580 dst[j].i_bytes += src[j].i_bytes;
581 dst[j].i_packets += src[j].i_packets;
582 }
583 }
584
585 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
586 kfree(dst);
587 return 0;
588 }
589
590 static int rt_acct_proc_open(struct inode *inode, struct file *file)
591 {
592 return single_open(file, rt_acct_proc_show, NULL);
593 }
594
595 static const struct file_operations rt_acct_proc_fops = {
596 .owner = THIS_MODULE,
597 .open = rt_acct_proc_open,
598 .read = seq_read,
599 .llseek = seq_lseek,
600 .release = single_release,
601 };
602 #endif
603
604 static int __net_init ip_rt_do_proc_init(struct net *net)
605 {
606 struct proc_dir_entry *pde;
607
608 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
609 &rt_cache_seq_fops);
610 if (!pde)
611 goto err1;
612
613 pde = proc_create("rt_cache", S_IRUGO,
614 net->proc_net_stat, &rt_cpu_seq_fops);
615 if (!pde)
616 goto err2;
617
618 #ifdef CONFIG_IP_ROUTE_CLASSID
619 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
620 if (!pde)
621 goto err3;
622 #endif
623 return 0;
624
625 #ifdef CONFIG_IP_ROUTE_CLASSID
626 err3:
627 remove_proc_entry("rt_cache", net->proc_net_stat);
628 #endif
629 err2:
630 remove_proc_entry("rt_cache", net->proc_net);
631 err1:
632 return -ENOMEM;
633 }
634
635 static void __net_exit ip_rt_do_proc_exit(struct net *net)
636 {
637 remove_proc_entry("rt_cache", net->proc_net_stat);
638 remove_proc_entry("rt_cache", net->proc_net);
639 #ifdef CONFIG_IP_ROUTE_CLASSID
640 remove_proc_entry("rt_acct", net->proc_net);
641 #endif
642 }
643
644 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
645 .init = ip_rt_do_proc_init,
646 .exit = ip_rt_do_proc_exit,
647 };
648
649 static int __init ip_rt_proc_init(void)
650 {
651 return register_pernet_subsys(&ip_rt_proc_ops);
652 }
653
654 #else
655 static inline int ip_rt_proc_init(void)
656 {
657 return 0;
658 }
659 #endif /* CONFIG_PROC_FS */
660
661 static inline void rt_free(struct rtable *rt)
662 {
663 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
664 }
665
666 static inline void rt_drop(struct rtable *rt)
667 {
668 ip_rt_put(rt);
669 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
670 }
671
672 static inline int rt_fast_clean(struct rtable *rth)
673 {
674 /* Kill broadcast/multicast entries very aggressively, if they
675 collide in the hash table with more useful entries */
676 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
677 rt_is_input_route(rth) && rth->dst.rt_next;
678 }
679
680 static inline int rt_valuable(struct rtable *rth)
681 {
682 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
683 (rth->peer && rth->peer->pmtu_expires);
684 }
685
686 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
687 {
688 unsigned long age;
689 int ret = 0;
690
691 if (atomic_read(&rth->dst.__refcnt))
692 goto out;
693
694 age = jiffies - rth->dst.lastuse;
695 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
696 (age <= tmo2 && rt_valuable(rth)))
697 goto out;
698 ret = 1;
699 out: return ret;
700 }
701
702 /* Bits of score are:
703 * 31: very valuable
704 * 30: not quite useless
705 * 29..0: usage counter
706 */
707 static inline u32 rt_score(struct rtable *rt)
708 {
709 u32 score = jiffies - rt->dst.lastuse;
710
711 score = ~score & ~(3<<30);
712
713 if (rt_valuable(rt))
714 score |= (1<<31);
715
716 if (rt_is_output_route(rt) ||
717 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
718 score |= (1<<30);
719
720 return score;
721 }
722
723 static inline bool rt_caching(const struct net *net)
724 {
725 return net->ipv4.current_rt_cache_rebuild_count <=
726 net->ipv4.sysctl_rt_cache_rebuild_count;
727 }
728
729 static inline bool compare_hash_inputs(const struct rtable *rt1,
730 const struct rtable *rt2)
731 {
732 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
733 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
734 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
735 }
736
737 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
738 {
739 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
740 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
741 (rt1->rt_mark ^ rt2->rt_mark) |
742 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
743 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
744 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
745 }
746
747 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
748 {
749 return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
750 }
751
752 static inline int rt_is_expired(struct rtable *rth)
753 {
754 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
755 }
756
757 /*
758 * Perform a full scan of the hash table and free all entries.
759 * Can be called by a softirq or a process.
760 * In the latter case, we want to reschedule if necessary.
761 */
762 static void rt_do_flush(struct net *net, int process_context)
763 {
764 unsigned int i;
765 struct rtable *rth, *next;
766
767 for (i = 0; i <= rt_hash_mask; i++) {
768 struct rtable __rcu **pprev;
769 struct rtable *list;
770
771 if (process_context && need_resched())
772 cond_resched();
773 rth = rcu_access_pointer(rt_hash_table[i].chain);
774 if (!rth)
775 continue;
776
777 spin_lock_bh(rt_hash_lock_addr(i));
778
779 list = NULL;
780 pprev = &rt_hash_table[i].chain;
781 rth = rcu_dereference_protected(*pprev,
782 lockdep_is_held(rt_hash_lock_addr(i)));
783
784 while (rth) {
785 next = rcu_dereference_protected(rth->dst.rt_next,
786 lockdep_is_held(rt_hash_lock_addr(i)));
787
788 if (!net ||
789 net_eq(dev_net(rth->dst.dev), net)) {
790 rcu_assign_pointer(*pprev, next);
791 rcu_assign_pointer(rth->dst.rt_next, list);
792 list = rth;
793 } else {
794 pprev = &rth->dst.rt_next;
795 }
796 rth = next;
797 }
798
799 spin_unlock_bh(rt_hash_lock_addr(i));
800
801 for (; list; list = next) {
802 next = rcu_dereference_protected(list->dst.rt_next, 1);
803 rt_free(list);
804 }
805 }
806 }
807
808 /*
809 * While freeing expired entries, we compute average chain length
810 * and standard deviation, using fixed-point arithmetic.
811 * This gives an estimate of rt_chain_length_max:
812 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
813 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
814 */
815
816 #define FRACT_BITS 3
817 #define ONE (1UL << FRACT_BITS)
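/*
 * Editorial sketch (not in the original file): the fixed-point estimate
 * computed at the end of rt_check_expire() below. sum and sum2 accumulate
 * chain lengths in units of ONE (has_noalias() returns 0 or ONE), so the
 * final shift by FRACT_BITS converts back to whole entries.
 */
static inline unsigned long chain_length_max_sketch(unsigned long sum,
						    unsigned long sum2,
						    unsigned long samples,
						    int elasticity)
{
	unsigned long avg = sum / samples;
	unsigned long sd = int_sqrt(sum2 / samples - avg * avg);

	/* rt_chain_length_max = max(elasticity, (AVG + 4*SD) >> FRACT_BITS) */
	return max_t(unsigned long, elasticity,
		     (avg + 4 * sd) >> FRACT_BITS);
}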
818
819 /*
820 * Given a hash chain and an item in this hash chain,
821 * find if a previous entry has the same hash_inputs
822 * (but differs on tos, mark or oif)
823 * Returns 0 if an alias is found.
824 * Returns ONE if rth has no alias before itself.
825 */
826 static int has_noalias(const struct rtable *head, const struct rtable *rth)
827 {
828 const struct rtable *aux = head;
829
830 while (aux != rth) {
831 if (compare_hash_inputs(aux, rth))
832 return 0;
833 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
834 }
835 return ONE;
836 }
837
838 static void rt_check_expire(void)
839 {
840 static unsigned int rover;
841 unsigned int i = rover, goal;
842 struct rtable *rth;
843 struct rtable __rcu **rthp;
844 unsigned long samples = 0;
845 unsigned long sum = 0, sum2 = 0;
846 unsigned long delta;
847 u64 mult;
848
849 delta = jiffies - expires_ljiffies;
850 expires_ljiffies = jiffies;
851 mult = ((u64)delta) << rt_hash_log;
852 if (ip_rt_gc_timeout > 1)
853 do_div(mult, ip_rt_gc_timeout);
854 goal = (unsigned int)mult;
855 if (goal > rt_hash_mask)
856 goal = rt_hash_mask + 1;
857 for (; goal > 0; goal--) {
858 unsigned long tmo = ip_rt_gc_timeout;
859 unsigned long length;
860
861 i = (i + 1) & rt_hash_mask;
862 rthp = &rt_hash_table[i].chain;
863
864 if (need_resched())
865 cond_resched();
866
867 samples++;
868
869 if (rcu_dereference_raw(*rthp) == NULL)
870 continue;
871 length = 0;
872 spin_lock_bh(rt_hash_lock_addr(i));
873 while ((rth = rcu_dereference_protected(*rthp,
874 lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
875 prefetch(rth->dst.rt_next);
876 if (rt_is_expired(rth)) {
877 *rthp = rth->dst.rt_next;
878 rt_free(rth);
879 continue;
880 }
881 if (rth->dst.expires) {
882 /* Entry is expired even if it is in use */
883 if (time_before_eq(jiffies, rth->dst.expires)) {
884 nofree:
885 tmo >>= 1;
886 rthp = &rth->dst.rt_next;
887 /*
888 * We only count entries on
889 * a chain with equal hash inputs once,
890 * so that entries for different QoS
891 * levels and other non-hash-input
892 * attributes don't unfairly skew
893 * the length computation.
894 */
895 length += has_noalias(rt_hash_table[i].chain, rth);
896 continue;
897 }
898 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
899 goto nofree;
900
901 /* Cleanup aged off entries. */
902 *rthp = rth->dst.rt_next;
903 rt_free(rth);
904 }
905 spin_unlock_bh(rt_hash_lock_addr(i));
906 sum += length;
907 sum2 += length*length;
908 }
909 if (samples) {
910 unsigned long avg = sum / samples;
911 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
912 rt_chain_length_max = max_t(unsigned long,
913 ip_rt_gc_elasticity,
914 (avg + 4*sd) >> FRACT_BITS);
915 }
916 rover = i;
917 }
918
919 /*
920 * rt_worker_func() is run in process context.
921 * We call rt_check_expire() to scan part of the hash table.
922 */
923 static void rt_worker_func(struct work_struct *work)
924 {
925 rt_check_expire();
926 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
927 }
928
929 /*
930 * Perturbation of rt_genid by a small quantity [1..256]
931 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
932 * many times (2^24) without reusing a recent rt_genid.
933 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
934 */
935 static void rt_cache_invalidate(struct net *net)
936 {
937 unsigned char shuffle;
938
939 get_random_bytes(&shuffle, sizeof(shuffle));
940 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
941 inetpeer_invalidate_tree(AF_INET);
942 }
943
944 /*
945 * delay < 0 : invalidate cache (fast : entries will be deleted later)
946 * delay >= 0 : invalidate & flush cache (can be long)
947 */
948 void rt_cache_flush(struct net *net, int delay)
949 {
950 rt_cache_invalidate(net);
951 if (delay >= 0)
952 rt_do_flush(net, !in_softirq());
953 }
954
955 /* Flush previously invalidated entries from the cache */
956 void rt_cache_flush_batch(struct net *net)
957 {
958 rt_do_flush(net, !in_softirq());
959 }
960
961 static void rt_emergency_hash_rebuild(struct net *net)
962 {
963 if (net_ratelimit())
964 pr_warn("Route hash chain too long!\n");
965 rt_cache_invalidate(net);
966 }
967
968 /*
969 Short description of GC goals.
970
971 We want to build an algorithm which will keep the routing cache
972 at some equilibrium point, where the number of aged-off entries
973 is kept approximately equal to the number of newly generated ones.
974
975 The current expiration strength is the variable "expire".
976 We try to adjust it dynamically, so that when the network
977 is idle, expire is large enough to keep enough warm entries,
978 and when load increases it shrinks to limit the cache size.
979 */
980
981 static int rt_garbage_collect(struct dst_ops *ops)
982 {
983 static unsigned long expire = RT_GC_TIMEOUT;
984 static unsigned long last_gc;
985 static int rover;
986 static int equilibrium;
987 struct rtable *rth;
988 struct rtable __rcu **rthp;
989 unsigned long now = jiffies;
990 int goal;
991 int entries = dst_entries_get_fast(&ipv4_dst_ops);
992
993 /*
994 * Garbage collection is pretty expensive,
995 * so do not run it too frequently.
996 */
997
998 RT_CACHE_STAT_INC(gc_total);
999
1000 if (now - last_gc < ip_rt_gc_min_interval &&
1001 entries < ip_rt_max_size) {
1002 RT_CACHE_STAT_INC(gc_ignored);
1003 goto out;
1004 }
1005
1006 entries = dst_entries_get_slow(&ipv4_dst_ops);
1007 /* Calculate the number of entries we want to expire now. */
1008 goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1009 if (goal <= 0) {
1010 if (equilibrium < ipv4_dst_ops.gc_thresh)
1011 equilibrium = ipv4_dst_ops.gc_thresh;
1012 goal = entries - equilibrium;
1013 if (goal > 0) {
1014 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1015 goal = entries - equilibrium;
1016 }
1017 } else {
1018 /* We are in a dangerous area. Try to reduce the cache really
1019 * aggressively.
1020 */
1021 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1022 equilibrium = entries - goal;
1023 }
1024
1025 if (now - last_gc >= ip_rt_gc_min_interval)
1026 last_gc = now;
1027
1028 if (goal <= 0) {
1029 equilibrium += goal;
1030 goto work_done;
1031 }
1032
1033 do {
1034 int i, k;
1035
1036 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1037 unsigned long tmo = expire;
1038
1039 k = (k + 1) & rt_hash_mask;
1040 rthp = &rt_hash_table[k].chain;
1041 spin_lock_bh(rt_hash_lock_addr(k));
1042 while ((rth = rcu_dereference_protected(*rthp,
1043 lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1044 if (!rt_is_expired(rth) &&
1045 !rt_may_expire(rth, tmo, expire)) {
1046 tmo >>= 1;
1047 rthp = &rth->dst.rt_next;
1048 continue;
1049 }
1050 *rthp = rth->dst.rt_next;
1051 rt_free(rth);
1052 goal--;
1053 }
1054 spin_unlock_bh(rt_hash_lock_addr(k));
1055 if (goal <= 0)
1056 break;
1057 }
1058 rover = k;
1059
1060 if (goal <= 0)
1061 goto work_done;
1062
1063 /* Goal is not achieved. We stop the process if:
1064
1065 - expire has been reduced to zero; otherwise, expire is halved.
1066 - the table is not full.
1067 - we are called from interrupt context.
1068 - the jiffies check is just a fallback/debug loop breaker;
1069 we will not spin here for a long time in any case.
1070 */
1071
1072 RT_CACHE_STAT_INC(gc_goal_miss);
1073
1074 if (expire == 0)
1075 break;
1076
1077 expire >>= 1;
1078
1079 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1080 goto out;
1081 } while (!in_softirq() && time_before_eq(jiffies, now));
1082
1083 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1084 goto out;
1085 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1086 goto out;
1087 if (net_ratelimit())
1088 pr_warn("dst cache overflow\n");
1089 RT_CACHE_STAT_INC(gc_dst_overflow);
1090 return 1;
1091
1092 work_done:
1093 expire += ip_rt_gc_min_interval;
1094 if (expire > ip_rt_gc_timeout ||
1095 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1096 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1097 expire = ip_rt_gc_timeout;
1098 out: return 0;
1099 }
1100
1101 /*
1102 * Returns the number of entries in a hash chain that have different hash_inputs
1103 */
1104 static int slow_chain_length(const struct rtable *head)
1105 {
1106 int length = 0;
1107 const struct rtable *rth = head;
1108
1109 while (rth) {
1110 length += has_noalias(head, rth);
1111 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1112 }
1113 return length >> FRACT_BITS;
1114 }
1115
1116 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1117 {
1118 static const __be32 inaddr_any = 0;
1119 struct net_device *dev = dst->dev;
1120 const __be32 *pkey = daddr;
1121 const struct rtable *rt;
1122 struct neighbour *n;
1123
1124 rt = (const struct rtable *) dst;
1125
1126 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1127 pkey = &inaddr_any;
1128 else if (rt->rt_gateway)
1129 pkey = (const __be32 *) &rt->rt_gateway;
1130
1131 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1132 if (n)
1133 return n;
1134 return neigh_create(&arp_tbl, pkey, dev);
1135 }
1136
1137 static int rt_bind_neighbour(struct rtable *rt)
1138 {
1139 struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1140 if (IS_ERR(n))
1141 return PTR_ERR(n);
1142 dst_set_neighbour(&rt->dst, n);
1143
1144 return 0;
1145 }
1146
1147 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1148 struct sk_buff *skb, int ifindex)
1149 {
1150 struct rtable *rth, *cand;
1151 struct rtable __rcu **rthp, **candp;
1152 unsigned long now;
1153 u32 min_score;
1154 int chain_length;
1155 int attempts = !in_softirq();
1156
1157 restart:
1158 chain_length = 0;
1159 min_score = ~(u32)0;
1160 cand = NULL;
1161 candp = NULL;
1162 now = jiffies;
1163
1164 if (!rt_caching(dev_net(rt->dst.dev))) {
1165 /*
1166 * If we're not caching, just tell the caller we
1167 * were successful and don't touch the route. The
1168 * caller holds the sole reference to the cache entry, and
1169 * it will be released when the caller is done with it.
1170 * If we drop it here, the callers have no way to resolve routes
1171 * when we're not caching. Instead, just point *rp at rt, so
1172 * the caller gets a single use out of the route.
1173 * Note that we do rt_free on this new route entry, so that
1174 * once its refcount hits zero, we are still able to reap it
1175 * (Thanks Alexey).
1176 * Note: To avoid expensive RCU machinery for this uncached dst,
1177 * we set DST_NOCACHE so that dst_release() can free the dst without
1178 * waiting for a grace period.
1179 */
1180
1181 rt->dst.flags |= DST_NOCACHE;
1182 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1183 int err = rt_bind_neighbour(rt);
1184 if (err) {
1185 if (net_ratelimit())
1186 pr_warn("Neighbour table failure & not caching routes\n");
1187 ip_rt_put(rt);
1188 return ERR_PTR(err);
1189 }
1190 }
1191
1192 goto skip_hashing;
1193 }
1194
1195 rthp = &rt_hash_table[hash].chain;
1196
1197 spin_lock_bh(rt_hash_lock_addr(hash));
1198 while ((rth = rcu_dereference_protected(*rthp,
1199 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1200 if (rt_is_expired(rth)) {
1201 *rthp = rth->dst.rt_next;
1202 rt_free(rth);
1203 continue;
1204 }
1205 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1206 /* Put it first */
1207 *rthp = rth->dst.rt_next;
1208 /*
1209 * Since lookup is lockfree, the deletion
1210 * must be visible to another weakly ordered CPU before
1211 * the insertion at the start of the hash chain.
1212 */
1213 rcu_assign_pointer(rth->dst.rt_next,
1214 rt_hash_table[hash].chain);
1215 /*
1216 * Since lookup is lockfree, the update writes
1217 * must be ordered for consistency on SMP.
1218 */
1219 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1220
1221 dst_use(&rth->dst, now);
1222 spin_unlock_bh(rt_hash_lock_addr(hash));
1223
1224 rt_drop(rt);
1225 if (skb)
1226 skb_dst_set(skb, &rth->dst);
1227 return rth;
1228 }
1229
1230 if (!atomic_read(&rth->dst.__refcnt)) {
1231 u32 score = rt_score(rth);
1232
1233 if (score <= min_score) {
1234 cand = rth;
1235 candp = rthp;
1236 min_score = score;
1237 }
1238 }
1239
1240 chain_length++;
1241
1242 rthp = &rth->dst.rt_next;
1243 }
1244
1245 if (cand) {
1246 /* ip_rt_gc_elasticity used to be the average chain
1247 * length; when exceeded, gc becomes really aggressive.
1248 *
1249 * The second limit is less certain. At the moment it allows
1250 * only 2 entries per bucket. We will see.
1251 */
1252 if (chain_length > ip_rt_gc_elasticity) {
1253 *candp = cand->dst.rt_next;
1254 rt_free(cand);
1255 }
1256 } else {
1257 if (chain_length > rt_chain_length_max &&
1258 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1259 struct net *net = dev_net(rt->dst.dev);
1260 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1261 if (!rt_caching(net)) {
1262 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1263 rt->dst.dev->name, num);
1264 }
1265 rt_emergency_hash_rebuild(net);
1266 spin_unlock_bh(rt_hash_lock_addr(hash));
1267
1268 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1269 ifindex, rt_genid(net));
1270 goto restart;
1271 }
1272 }
1273
1274 /* Try to bind the route to ARP only if it is an output
1275 route or on the unicast forwarding path.
1276 */
1277 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1278 int err = rt_bind_neighbour(rt);
1279 if (err) {
1280 spin_unlock_bh(rt_hash_lock_addr(hash));
1281
1282 if (err != -ENOBUFS) {
1283 rt_drop(rt);
1284 return ERR_PTR(err);
1285 }
1286
1287 /* Neighbour tables are full and nothing
1288 can be released. Try to shrink the route cache;
1289 most likely it holds some neighbour records.
1290 */
1291 if (attempts-- > 0) {
1292 int saved_elasticity = ip_rt_gc_elasticity;
1293 int saved_int = ip_rt_gc_min_interval;
1294 ip_rt_gc_elasticity = 1;
1295 ip_rt_gc_min_interval = 0;
1296 rt_garbage_collect(&ipv4_dst_ops);
1297 ip_rt_gc_min_interval = saved_int;
1298 ip_rt_gc_elasticity = saved_elasticity;
1299 goto restart;
1300 }
1301
1302 if (net_ratelimit())
1303 pr_warn("Neighbour table overflow\n");
1304 rt_drop(rt);
1305 return ERR_PTR(-ENOBUFS);
1306 }
1307 }
1308
1309 rt->dst.rt_next = rt_hash_table[hash].chain;
1310
1311 /*
1312 * Since lookup is lockfree, we must make sure
1313 * previous writes to rt are committed to memory
1314 * before making rt visible to other CPUs.
1315 */
1316 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1317
1318 spin_unlock_bh(rt_hash_lock_addr(hash));
1319
1320 skip_hashing:
1321 if (skb)
1322 skb_dst_set(skb, &rt->dst);
1323 return rt;
1324 }
1325
1326 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1327
1328 static u32 rt_peer_genid(void)
1329 {
1330 return atomic_read(&__rt_peer_genid);
1331 }
1332
1333 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1334 {
1335 struct inet_peer *peer;
1336
1337 peer = inet_getpeer_v4(daddr, create);
1338
1339 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1340 inet_putpeer(peer);
1341 else
1342 rt->rt_peer_genid = rt_peer_genid();
1343 }
1344
1345 /*
1346 * Peer allocation may fail only in serious out-of-memory conditions. However
1347 * we can still generate some output.
1348 * Random ID selection looks a bit dangerous because we have no chance of
1349 * selecting an ID that is unique within a reasonable period of time.
1350 * But a broken packet identifier may be better than no packet at all.
1351 */
1352 static void ip_select_fb_ident(struct iphdr *iph)
1353 {
1354 static DEFINE_SPINLOCK(ip_fb_id_lock);
1355 static u32 ip_fallback_id;
1356 u32 salt;
1357
1358 spin_lock_bh(&ip_fb_id_lock);
1359 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1360 iph->id = htons(salt & 0xFFFF);
1361 ip_fallback_id = salt;
1362 spin_unlock_bh(&ip_fb_id_lock);
1363 }
1364
1365 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1366 {
1367 struct rtable *rt = (struct rtable *) dst;
1368
1369 if (rt && !(rt->dst.flags & DST_NOPEER)) {
1370 if (rt->peer == NULL)
1371 rt_bind_peer(rt, rt->rt_dst, 1);
1372
1373 /* If peer is attached to destination, it is never detached,
1374 so we need not grab a lock to dereference it.
1375 */
1376 if (rt->peer) {
1377 iph->id = htons(inet_getid(rt->peer, more));
1378 return;
1379 }
1380 } else if (!rt)
1381 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1382 __builtin_return_address(0));
1383
1384 ip_select_fb_ident(iph);
1385 }
1386 EXPORT_SYMBOL(__ip_select_ident);
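/*
 * Editorial sketch (not in the original file): how callers of this era
 * typically reach __ip_select_ident(). Modelled on the ip_select_ident()
 * helper in include/net/ip.h: atomic (DF) packets can reuse the socket's
 * inet_id counter, everything else takes the peer-based path above.
 */
static inline void ip_select_ident_sketch(struct iphdr *iph,
					  struct dst_entry *dst,
					  struct sock *sk)
{
	if (iph->frag_off & htons(IP_DF))
		iph->id = (sk && inet_sk(sk)->inet_daddr) ?
			  htons(inet_sk(sk)->inet_id++) : 0;
	else
		__ip_select_ident(iph, dst, 0);
}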
1387
1388 static void rt_del(unsigned int hash, struct rtable *rt)
1389 {
1390 struct rtable __rcu **rthp;
1391 struct rtable *aux;
1392
1393 rthp = &rt_hash_table[hash].chain;
1394 spin_lock_bh(rt_hash_lock_addr(hash));
1395 ip_rt_put(rt);
1396 while ((aux = rcu_dereference_protected(*rthp,
1397 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1398 if (aux == rt || rt_is_expired(aux)) {
1399 *rthp = aux->dst.rt_next;
1400 rt_free(aux);
1401 continue;
1402 }
1403 rthp = &aux->dst.rt_next;
1404 }
1405 spin_unlock_bh(rt_hash_lock_addr(hash));
1406 }
1407
1408 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1409 {
1410 struct rtable *rt = (struct rtable *) dst;
1411 __be32 orig_gw = rt->rt_gateway;
1412 struct neighbour *n, *old_n;
1413
1414 dst_confirm(&rt->dst);
1415
1416 rt->rt_gateway = peer->redirect_learned.a4;
1417
1418 n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1419 if (IS_ERR(n)) {
1420 rt->rt_gateway = orig_gw;
1421 return;
1422 }
1423 old_n = xchg(&rt->dst._neighbour, n);
1424 if (old_n)
1425 neigh_release(old_n);
1426 if (!(n->nud_state & NUD_VALID)) {
1427 neigh_event_send(n, NULL);
1428 } else {
1429 rt->rt_flags |= RTCF_REDIRECTED;
1430 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1431 }
1432 }
1433
1434 /* called in rcu_read_lock() section */
1435 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1436 __be32 saddr, struct net_device *dev)
1437 {
1438 int s, i;
1439 struct in_device *in_dev = __in_dev_get_rcu(dev);
1440 __be32 skeys[2] = { saddr, 0 };
1441 int ikeys[2] = { dev->ifindex, 0 };
1442 struct inet_peer *peer;
1443 struct net *net;
1444
1445 if (!in_dev)
1446 return;
1447
1448 net = dev_net(dev);
1449 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1450 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1451 ipv4_is_zeronet(new_gw))
1452 goto reject_redirect;
1453
1454 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1455 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1456 goto reject_redirect;
1457 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1458 goto reject_redirect;
1459 } else {
1460 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1461 goto reject_redirect;
1462 }
1463
1464 for (s = 0; s < 2; s++) {
1465 for (i = 0; i < 2; i++) {
1466 unsigned int hash;
1467 struct rtable __rcu **rthp;
1468 struct rtable *rt;
1469
1470 hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1471
1472 rthp = &rt_hash_table[hash].chain;
1473
1474 while ((rt = rcu_dereference(*rthp)) != NULL) {
1475 rthp = &rt->dst.rt_next;
1476
1477 if (rt->rt_key_dst != daddr ||
1478 rt->rt_key_src != skeys[s] ||
1479 rt->rt_oif != ikeys[i] ||
1480 rt_is_input_route(rt) ||
1481 rt_is_expired(rt) ||
1482 !net_eq(dev_net(rt->dst.dev), net) ||
1483 rt->dst.error ||
1484 rt->dst.dev != dev ||
1485 rt->rt_gateway != old_gw)
1486 continue;
1487
1488 if (!rt->peer)
1489 rt_bind_peer(rt, rt->rt_dst, 1);
1490
1491 peer = rt->peer;
1492 if (peer) {
1493 if (peer->redirect_learned.a4 != new_gw) {
1494 peer->redirect_learned.a4 = new_gw;
1495 atomic_inc(&__rt_peer_genid);
1496 }
1497 check_peer_redir(&rt->dst, peer);
1498 }
1499 }
1500 }
1501 }
1502 return;
1503
1504 reject_redirect:
1505 #ifdef CONFIG_IP_ROUTE_VERBOSE
1506 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1507 pr_info("Redirect from %pI4 on %s about %pI4 ignored\n"
1508 " Advised path = %pI4 -> %pI4\n",
1509 &old_gw, dev->name, &new_gw,
1510 &saddr, &daddr);
1511 #endif
1512 ;
1513 }
1514
1515 static bool peer_pmtu_expired(struct inet_peer *peer)
1516 {
1517 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1518
1519 return orig &&
1520 time_after_eq(jiffies, orig) &&
1521 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1522 }
1523
1524 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1525 {
1526 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1527
1528 return orig &&
1529 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1530 }
1531
1532 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1533 {
1534 struct rtable *rt = (struct rtable *)dst;
1535 struct dst_entry *ret = dst;
1536
1537 if (rt) {
1538 if (dst->obsolete > 0) {
1539 ip_rt_put(rt);
1540 ret = NULL;
1541 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1542 unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1543 rt->rt_oif,
1544 rt_genid(dev_net(dst->dev)));
1545 rt_del(hash, rt);
1546 ret = NULL;
1547 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1548 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1549 }
1550 }
1551 return ret;
1552 }
1553
1554 /*
1555 * Algorithm:
1556 * 1. The first ip_rt_redirect_number redirects are sent
1557 * with exponential backoff, then we stop sending them at all,
1558 * assuming that the host ignores our redirects.
1559 * 2. If we did not see packets requiring redirects
1560 * during ip_rt_redirect_silence, we assume that the host
1561 * forgot the redirected route and start to send redirects again.
1562 *
1563 * This algorithm is much cheaper and more intelligent than dumb load limiting
1564 * in icmp.c.
1565 *
1566 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1567 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1568 */
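/*
 * Worked example (editorial): with the defaults above, after the k-th
 * redirect the next one is not sent before
 *     rate_last + (ip_rt_redirect_load << k)
 * i.e. delays of HZ/50, 2*HZ/50, 4*HZ/50, ... doubling each time. After
 * ip_rt_redirect_number (9) unanswered redirects we go silent until
 * ip_rt_redirect_silence ((HZ/50) << 10) passes and rate_tokens resets.
 */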
1569
1570 void ip_rt_send_redirect(struct sk_buff *skb)
1571 {
1572 struct rtable *rt = skb_rtable(skb);
1573 struct in_device *in_dev;
1574 struct inet_peer *peer;
1575 int log_martians;
1576
1577 rcu_read_lock();
1578 in_dev = __in_dev_get_rcu(rt->dst.dev);
1579 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1580 rcu_read_unlock();
1581 return;
1582 }
1583 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1584 rcu_read_unlock();
1585
1586 if (!rt->peer)
1587 rt_bind_peer(rt, rt->rt_dst, 1);
1588 peer = rt->peer;
1589 if (!peer) {
1590 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1591 return;
1592 }
1593
1594 /* No redirected packets during ip_rt_redirect_silence;
1595 * reset the algorithm.
1596 */
1597 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1598 peer->rate_tokens = 0;
1599
1600 /* Too many ignored redirects; do not send anything.
1601 * Set peer->rate_last to the last seen redirected packet.
1602 */
1603 if (peer->rate_tokens >= ip_rt_redirect_number) {
1604 peer->rate_last = jiffies;
1605 return;
1606 }
1607
1608 /* Check for load limit; set rate_last to the latest sent
1609 * redirect.
1610 */
1611 if (peer->rate_tokens == 0 ||
1612 time_after(jiffies,
1613 (peer->rate_last +
1614 (ip_rt_redirect_load << peer->rate_tokens)))) {
1615 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1616 peer->rate_last = jiffies;
1617 ++peer->rate_tokens;
1618 #ifdef CONFIG_IP_ROUTE_VERBOSE
1619 if (log_martians &&
1620 peer->rate_tokens == ip_rt_redirect_number &&
1621 net_ratelimit())
1622 pr_warn("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1623 &ip_hdr(skb)->saddr, rt->rt_iif,
1624 &rt->rt_dst, &rt->rt_gateway);
1625 #endif
1626 }
1627 }
1628
1629 static int ip_error(struct sk_buff *skb)
1630 {
1631 struct rtable *rt = skb_rtable(skb);
1632 struct inet_peer *peer;
1633 unsigned long now;
1634 bool send;
1635 int code;
1636
1637 switch (rt->dst.error) {
1638 case EINVAL:
1639 default:
1640 goto out;
1641 case EHOSTUNREACH:
1642 code = ICMP_HOST_UNREACH;
1643 break;
1644 case ENETUNREACH:
1645 code = ICMP_NET_UNREACH;
1646 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1647 IPSTATS_MIB_INNOROUTES);
1648 break;
1649 case EACCES:
1650 code = ICMP_PKT_FILTERED;
1651 break;
1652 }
1653
1654 if (!rt->peer)
1655 rt_bind_peer(rt, rt->rt_dst, 1);
1656 peer = rt->peer;
1657
1658 send = true;
1659 if (peer) {
1660 now = jiffies;
1661 peer->rate_tokens += now - peer->rate_last;
1662 if (peer->rate_tokens > ip_rt_error_burst)
1663 peer->rate_tokens = ip_rt_error_burst;
1664 peer->rate_last = now;
1665 if (peer->rate_tokens >= ip_rt_error_cost)
1666 peer->rate_tokens -= ip_rt_error_cost;
1667 else
1668 send = false;
1669 }
1670 if (send)
1671 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1672
1673 out: kfree_skb(skb);
1674 return 0;
1675 }
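/*
 * Editorial note: the peer fields above implement a classic token bucket.
 * With the defaults (ip_rt_error_cost = HZ, ip_rt_error_burst = 5*HZ),
 * tokens accrue one per jiffy up to a five-second burst and each ICMP
 * error costs one second's worth, i.e. roughly one error per second
 * sustained, with bursts of up to five.
 */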
1676
1677 /*
1678 * The last two values are not from the RFC but
1679 * are needed for AMPRnet AX.25 paths.
1680 */
1681
1682 static const unsigned short mtu_plateau[] =
1683 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1684
1685 static inline unsigned short guess_mtu(unsigned short old_mtu)
1686 {
1687 int i;
1688
1689 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1690 if (old_mtu > mtu_plateau[i])
1691 return mtu_plateau[i];
1692 return 68;
1693 }
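/*
 * Worked example (editorial): guess_mtu() steps down to the next lower
 * plateau:
 *     guess_mtu(1500) -> 1492
 *     guess_mtu(1492) -> 576
 *     guess_mtu(100)  -> 68   (the IPv4 minimum)
 */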
1694
1695 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1696 unsigned short new_mtu,
1697 struct net_device *dev)
1698 {
1699 unsigned short old_mtu = ntohs(iph->tot_len);
1700 unsigned short est_mtu = 0;
1701 struct inet_peer *peer;
1702
1703 peer = inet_getpeer_v4(iph->daddr, 1);
1704 if (peer) {
1705 unsigned short mtu = new_mtu;
1706
1707 if (new_mtu < 68 || new_mtu >= old_mtu) {
1708 /* BSD 4.2 derived systems incorrectly adjust
1709 * tot_len by the IP header length, and report
1710 * a zero MTU in the ICMP message.
1711 */
1712 if (mtu == 0 &&
1713 old_mtu >= 68 + (iph->ihl << 2))
1714 old_mtu -= iph->ihl << 2;
1715 mtu = guess_mtu(old_mtu);
1716 }
1717
1718 if (mtu < ip_rt_min_pmtu)
1719 mtu = ip_rt_min_pmtu;
1720 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1721 unsigned long pmtu_expires;
1722
1723 pmtu_expires = jiffies + ip_rt_mtu_expires;
1724 if (!pmtu_expires)
1725 pmtu_expires = 1UL;
1726
1727 est_mtu = mtu;
1728 peer->pmtu_learned = mtu;
1729 peer->pmtu_expires = pmtu_expires;
1730 atomic_inc(&__rt_peer_genid);
1731 }
1732
1733 inet_putpeer(peer);
1734 }
1735 return est_mtu ? : new_mtu;
1736 }
1737
1738 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1739 {
1740 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1741
1742 if (!expires)
1743 return;
1744 if (time_before(jiffies, expires)) {
1745 u32 orig_dst_mtu = dst_mtu(dst);
1746 if (peer->pmtu_learned < orig_dst_mtu) {
1747 if (!peer->pmtu_orig)
1748 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1749 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1750 }
1751 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1752 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1753 }
1754
1755 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1756 {
1757 struct rtable *rt = (struct rtable *) dst;
1758 struct inet_peer *peer;
1759
1760 dst_confirm(dst);
1761
1762 if (!rt->peer)
1763 rt_bind_peer(rt, rt->rt_dst, 1);
1764 peer = rt->peer;
1765 if (peer) {
1766 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1767
1768 if (mtu < ip_rt_min_pmtu)
1769 mtu = ip_rt_min_pmtu;
1770 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1771
1772 pmtu_expires = jiffies + ip_rt_mtu_expires;
1773 if (!pmtu_expires)
1774 pmtu_expires = 1UL;
1775
1776 peer->pmtu_learned = mtu;
1777 peer->pmtu_expires = pmtu_expires;
1778
1779 atomic_inc(&__rt_peer_genid);
1780 rt->rt_peer_genid = rt_peer_genid();
1781 }
1782 check_peer_pmtu(dst, peer);
1783 }
1784 }
1785
1786
1787 static void ipv4_validate_peer(struct rtable *rt)
1788 {
1789 if (rt->rt_peer_genid != rt_peer_genid()) {
1790 struct inet_peer *peer;
1791
1792 if (!rt->peer)
1793 rt_bind_peer(rt, rt->rt_dst, 0);
1794
1795 peer = rt->peer;
1796 if (peer) {
1797 check_peer_pmtu(&rt->dst, peer);
1798
1799 if (peer->redirect_learned.a4 &&
1800 peer->redirect_learned.a4 != rt->rt_gateway)
1801 check_peer_redir(&rt->dst, peer);
1802 }
1803
1804 rt->rt_peer_genid = rt_peer_genid();
1805 }
1806 }
1807
1808 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1809 {
1810 struct rtable *rt = (struct rtable *) dst;
1811
1812 if (rt_is_expired(rt))
1813 return NULL;
1814 ipv4_validate_peer(rt);
1815 return dst;
1816 }
1817
1818 static void ipv4_dst_destroy(struct dst_entry *dst)
1819 {
1820 struct rtable *rt = (struct rtable *) dst;
1821 struct inet_peer *peer = rt->peer;
1822
1823 if (rt->fi) {
1824 fib_info_put(rt->fi);
1825 rt->fi = NULL;
1826 }
1827 if (peer) {
1828 rt->peer = NULL;
1829 inet_putpeer(peer);
1830 }
1831 }
1832
1833
1834 static void ipv4_link_failure(struct sk_buff *skb)
1835 {
1836 struct rtable *rt;
1837
1838 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1839
1840 rt = skb_rtable(skb);
1841 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1842 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1843 }
1844
1845 static int ip_rt_bug(struct sk_buff *skb)
1846 {
1847 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1848 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1849 skb->dev ? skb->dev->name : "?");
1850 kfree_skb(skb);
1851 WARN_ON(1);
1852 return 0;
1853 }
1854
1855 /*
1856 We do not cache the source address of the outgoing interface,
1857 because it is used only by the IP RR, TS and SRR options,
1858 so it is out of the fast path.
1859
1860 BTW remember: "addr" is allowed to be unaligned
1861 in IP options!
1862 */
1863
1864 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1865 {
1866 __be32 src;
1867
1868 if (rt_is_output_route(rt))
1869 src = ip_hdr(skb)->saddr;
1870 else {
1871 struct fib_result res;
1872 struct flowi4 fl4;
1873 struct iphdr *iph;
1874
1875 iph = ip_hdr(skb);
1876
1877 memset(&fl4, 0, sizeof(fl4));
1878 fl4.daddr = iph->daddr;
1879 fl4.saddr = iph->saddr;
1880 fl4.flowi4_tos = RT_TOS(iph->tos);
1881 fl4.flowi4_oif = rt->dst.dev->ifindex;
1882 fl4.flowi4_iif = skb->dev->ifindex;
1883 fl4.flowi4_mark = skb->mark;
1884
1885 rcu_read_lock();
1886 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1887 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1888 else
1889 src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1890 RT_SCOPE_UNIVERSE);
1891 rcu_read_unlock();
1892 }
1893 memcpy(addr, &src, 4);
1894 }
1895
1896 #ifdef CONFIG_IP_ROUTE_CLASSID
1897 static void set_class_tag(struct rtable *rt, u32 tag)
1898 {
1899 if (!(rt->dst.tclassid & 0xFFFF))
1900 rt->dst.tclassid |= tag & 0xFFFF;
1901 if (!(rt->dst.tclassid & 0xFFFF0000))
1902 rt->dst.tclassid |= tag & 0xFFFF0000;
1903 }
1904 #endif
1905
1906 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1907 {
1908 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1909
1910 if (advmss == 0) {
1911 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1912 ip_rt_min_advmss);
1913 if (advmss > 65535 - 40)
1914 advmss = 65535 - 40;
1915 }
1916 return advmss;
1917 }
1918
1919 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1920 {
1921 const struct rtable *rt = (const struct rtable *) dst;
1922 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1923
1924 if (mtu && rt_is_output_route(rt))
1925 return mtu;
1926
1927 mtu = dst->dev->mtu;
1928
1929 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1930
1931 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1932 mtu = 576;
1933 }
1934
1935 if (mtu > IP_MAX_MTU)
1936 mtu = IP_MAX_MTU;
1937
1938 return mtu;
1939 }
1940
1941 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1942 struct fib_info *fi)
1943 {
1944 struct inet_peer *peer;
1945 int create = 0;
1946
1947 /* If a peer entry exists for this destination, we must hook
1948 * it up in order to get at cached metrics.
1949 */
1950 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1951 create = 1;
1952
1953 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1954 if (peer) {
1955 rt->rt_peer_genid = rt_peer_genid();
1956 if (inet_metrics_new(peer))
1957 memcpy(peer->metrics, fi->fib_metrics,
1958 sizeof(u32) * RTAX_MAX);
1959 dst_init_metrics(&rt->dst, peer->metrics, false);
1960
1961 check_peer_pmtu(&rt->dst, peer);
1962
1963 if (peer->redirect_learned.a4 &&
1964 peer->redirect_learned.a4 != rt->rt_gateway) {
1965 rt->rt_gateway = peer->redirect_learned.a4;
1966 rt->rt_flags |= RTCF_REDIRECTED;
1967 }
1968 } else {
1969 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1970 rt->fi = fi;
1971 atomic_inc(&fi->fib_clntref);
1972 }
1973 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1974 }
1975 }
1976
1977 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1978 const struct fib_result *res,
1979 struct fib_info *fi, u16 type, u32 itag)
1980 {
1981 struct dst_entry *dst = &rt->dst;
1982
1983 if (fi) {
1984 if (FIB_RES_GW(*res) &&
1985 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1986 rt->rt_gateway = FIB_RES_GW(*res);
1987 rt_init_metrics(rt, fl4, fi);
1988 #ifdef CONFIG_IP_ROUTE_CLASSID
1989 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1990 #endif
1991 }
1992
1993 if (dst_mtu(dst) > IP_MAX_MTU)
1994 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1995 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1996 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1997
1998 #ifdef CONFIG_IP_ROUTE_CLASSID
1999 #ifdef CONFIG_IP_MULTIPLE_TABLES
2000 set_class_tag(rt, fib_rules_tclass(res));
2001 #endif
2002 set_class_tag(rt, itag);
2003 #endif
2004 }
2005
2006 static struct rtable *rt_dst_alloc(struct net_device *dev,
2007 bool nopolicy, bool noxfrm)
2008 {
2009 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2010 DST_HOST |
2011 (nopolicy ? DST_NOPOLICY : 0) |
2012 (noxfrm ? DST_NOXFRM : 0));
2013 }
2014
2015 /* called in rcu_read_lock() section */
2016 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2017 u8 tos, struct net_device *dev, int our)
2018 {
2019 unsigned int hash;
2020 struct rtable *rth;
2021 __be32 spec_dst;
2022 struct in_device *in_dev = __in_dev_get_rcu(dev);
2023 u32 itag = 0;
2024 int err;
2025
2026 /* Primary sanity checks. */
2027
2028 if (in_dev == NULL)
2029 return -EINVAL;
2030
2031 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2032 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2033 goto e_inval;
2034
2035 if (ipv4_is_zeronet(saddr)) {
2036 if (!ipv4_is_local_multicast(daddr))
2037 goto e_inval;
2038 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2039 } else {
2040 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2041 &itag);
2042 if (err < 0)
2043 goto e_err;
2044 }
2045 rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
2046 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2047 if (!rth)
2048 goto e_nobufs;
2049
2050 #ifdef CONFIG_IP_ROUTE_CLASSID
2051 rth->dst.tclassid = itag;
2052 #endif
2053 rth->dst.output = ip_rt_bug;
2054
2055 rth->rt_key_dst = daddr;
2056 rth->rt_key_src = saddr;
2057 rth->rt_genid = rt_genid(dev_net(dev));
2058 rth->rt_flags = RTCF_MULTICAST;
2059 rth->rt_type = RTN_MULTICAST;
2060 rth->rt_key_tos = tos;
2061 rth->rt_dst = daddr;
2062 rth->rt_src = saddr;
2063 rth->rt_route_iif = dev->ifindex;
2064 rth->rt_iif = dev->ifindex;
2065 rth->rt_oif = 0;
2066 rth->rt_mark = skb->mark;
2067 rth->rt_gateway = daddr;
2068 rth->rt_spec_dst= spec_dst;
2069 rth->rt_peer_genid = 0;
2070 rth->peer = NULL;
2071 rth->fi = NULL;
2072 if (our) {
2073 rth->dst.input= ip_local_deliver;
2074 rth->rt_flags |= RTCF_LOCAL;
2075 }
2076
2077 #ifdef CONFIG_IP_MROUTE
2078 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2079 rth->dst.input = ip_mr_input;
2080 #endif
2081 RT_CACHE_STAT_INC(in_slow_mc);
2082
2083 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2084 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2085 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2086
2087 e_nobufs:
2088 return -ENOBUFS;
2089 e_inval:
2090 return -EINVAL;
2091 e_err:
2092 return err;
2093 }
2094
2095
2096 static void ip_handle_martian_source(struct net_device *dev,
2097 struct in_device *in_dev,
2098 struct sk_buff *skb,
2099 __be32 daddr,
2100 __be32 saddr)
2101 {
2102 RT_CACHE_STAT_INC(in_martian_src);
2103 #ifdef CONFIG_IP_ROUTE_VERBOSE
2104 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2105 /*
2106 * RFC 1812 recommendation: if the source is martian,
2107 * the only hint we can log is the MAC header.
2108 */

2109 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2110 &daddr, &saddr, dev->name);
2111 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2112 print_hex_dump(KERN_WARNING, "ll header: ",
2113 DUMP_PREFIX_OFFSET, 16, 1,
2114 skb_mac_header(skb),
2115 dev->hard_header_len, true);
2116 }
2117 }
2118 #endif
2119 }
2120
2121 /* called in rcu_read_lock() section */
2122 static int __mkroute_input(struct sk_buff *skb,
2123 const struct fib_result *res,
2124 struct in_device *in_dev,
2125 __be32 daddr, __be32 saddr, u32 tos,
2126 struct rtable **result)
2127 {
2128 struct rtable *rth;
2129 int err;
2130 struct in_device *out_dev;
2131 unsigned int flags = 0;
2132 __be32 spec_dst;
2133 u32 itag;
2134
2135 /* get a working reference to the output device */
2136 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2137 if (out_dev == NULL) {
2138 if (net_ratelimit())
2139 pr_crit("Bug in ip_route_input_slow(). Please report.\n");
2140 return -EINVAL;
2141 }
2142
2143
2144 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2145 in_dev->dev, &spec_dst, &itag);
2146 if (err < 0) {
2147 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2148 saddr);
2149
2150 goto cleanup;
2151 }
2152
2153 if (err)
2154 flags |= RTCF_DIRECTSRC;
2155
2156 if (out_dev == in_dev && err &&
2157 (IN_DEV_SHARED_MEDIA(out_dev) ||
2158 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2159 flags |= RTCF_DOREDIRECT;
2160
2161 if (skb->protocol != htons(ETH_P_IP)) {
2162 /* Not IP (i.e. ARP). Do not create a route if it is
2163 * invalid for proxy ARP. DNAT routes are always valid.
2164 *
2165 * The proxy ARP feature has been extended to allow ARP
2166 * replies back on the same interface, to support
2167 * Private VLAN switch technologies. See arp.c.
2168 */
2169 if (out_dev == in_dev &&
2170 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2171 err = -EINVAL;
2172 goto cleanup;
2173 }
2174 }
2175
2176 rth = rt_dst_alloc(out_dev->dev,
2177 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2178 IN_DEV_CONF_GET(out_dev, NOXFRM));
2179 if (!rth) {
2180 err = -ENOBUFS;
2181 goto cleanup;
2182 }
2183
2184 rth->rt_key_dst = daddr;
2185 rth->rt_key_src = saddr;
2186 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2187 rth->rt_flags = flags;
2188 rth->rt_type = res->type;
2189 rth->rt_key_tos = tos;
2190 rth->rt_dst = daddr;
2191 rth->rt_src = saddr;
2192 rth->rt_route_iif = in_dev->dev->ifindex;
2193 rth->rt_iif = in_dev->dev->ifindex;
2194 rth->rt_oif = 0;
2195 rth->rt_mark = skb->mark;
2196 rth->rt_gateway = daddr;
2197 rth->rt_spec_dst= spec_dst;
2198 rth->rt_peer_genid = 0;
2199 rth->peer = NULL;
2200 rth->fi = NULL;
2201
2202 rth->dst.input = ip_forward;
2203 rth->dst.output = ip_output;
2204
2205 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2206
2207 *result = rth;
2208 err = 0;
2209 cleanup:
2210 return err;
2211 }
2212
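/* Build and cache the forwarding route for an input packet: pick a
 * multipath nexthop if needed, create the rtable via
 * __mkroute_input() and insert it into the route cache hash.
 */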
2213 static int ip_mkroute_input(struct sk_buff *skb,
2214 struct fib_result *res,
2215 const struct flowi4 *fl4,
2216 struct in_device *in_dev,
2217 __be32 daddr, __be32 saddr, u32 tos)
2218 {
2219 struct rtable *rth = NULL;
2220 int err;
2221 unsigned int hash;
2222
2223 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2224 if (res->fi && res->fi->fib_nhs > 1)
2225 fib_select_multipath(res);
2226 #endif
2227
2228 /* create a routing cache entry */
2229 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2230 if (err)
2231 return err;
2232
2233 /* put it into the cache */
2234 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2235 rt_genid(dev_net(rth->dst.dev)));
2236 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2237 if (IS_ERR(rth))
2238 return PTR_ERR(rth);
2239 return 0;
2240 }
2241
2242 /*
2243 * NOTE. We drop all packets that have a local source
2244 * address, because every properly looped-back packet
2245 * must already have the correct destination attached by the output routine.
2246 *
2247 * This approach solves two big problems:
2248 * 1. Non-simplex devices are handled properly.
2249 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2250 * Called with rcu_read_lock().
2251 */
2252
2253 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2254 u8 tos, struct net_device *dev)
2255 {
2256 struct fib_result res;
2257 struct in_device *in_dev = __in_dev_get_rcu(dev);
2258 struct flowi4 fl4;
2259 unsigned int flags = 0;
2260 u32 itag = 0;
2261 struct rtable *rth;
2262 unsigned int hash;
2263 __be32 spec_dst;
2264 int err = -EINVAL;
2265 struct net *net = dev_net(dev);
2266
2267 /* IP on this device is disabled. */
2268
2269 if (!in_dev)
2270 goto out;
2271
2272 /* Check for the weirdest martians, which cannot be detected
2273 by fib_lookup.
2274 */
2275
2276 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2277 ipv4_is_loopback(saddr))
2278 goto martian_source;
2279
2280 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2281 goto brd_input;
2282
2283 /* Accept zero addresses only for limited broadcast;
2284 * I do not even know whether to fix this or not. Waiting for complaints :-)
2285 */
2286 if (ipv4_is_zeronet(saddr))
2287 goto martian_source;
2288
2289 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2290 goto martian_destination;
2291
2292 /*
2293 * Now we are ready to route the packet.
2294 */
2295 fl4.flowi4_oif = 0;
2296 fl4.flowi4_iif = dev->ifindex;
2297 fl4.flowi4_mark = skb->mark;
2298 fl4.flowi4_tos = tos;
2299 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2300 fl4.daddr = daddr;
2301 fl4.saddr = saddr;
2302 err = fib_lookup(net, &fl4, &res);
2303 if (err != 0) {
2304 if (!IN_DEV_FORWARD(in_dev))
2305 goto e_hostunreach;
2306 goto no_route;
2307 }
2308
2309 RT_CACHE_STAT_INC(in_slow_tot);
2310
2311 if (res.type == RTN_BROADCAST)
2312 goto brd_input;
2313
2314 if (res.type == RTN_LOCAL) {
2315 err = fib_validate_source(skb, saddr, daddr, tos,
2316 net->loopback_dev->ifindex,
2317 dev, &spec_dst, &itag);
2318 if (err < 0)
2319 goto martian_source_keep_err;
2320 if (err)
2321 flags |= RTCF_DIRECTSRC;
2322 spec_dst = daddr;
2323 goto local_input;
2324 }
2325
2326 if (!IN_DEV_FORWARD(in_dev))
2327 goto e_hostunreach;
2328 if (res.type != RTN_UNICAST)
2329 goto martian_destination;
2330
2331 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2332 out: return err;
2333
2334 brd_input:
2335 if (skb->protocol != htons(ETH_P_IP))
2336 goto e_inval;
2337
2338 if (ipv4_is_zeronet(saddr))
2339 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2340 else {
2341 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2342 &itag);
2343 if (err < 0)
2344 goto martian_source_keep_err;
2345 if (err)
2346 flags |= RTCF_DIRECTSRC;
2347 }
2348 flags |= RTCF_BROADCAST;
2349 res.type = RTN_BROADCAST;
2350 RT_CACHE_STAT_INC(in_brd);
2351
2352 local_input:
2353 rth = rt_dst_alloc(net->loopback_dev,
2354 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2355 if (!rth)
2356 goto e_nobufs;
2357
2358 rth->dst.input= ip_local_deliver;
2359 rth->dst.output= ip_rt_bug;
2360 #ifdef CONFIG_IP_ROUTE_CLASSID
2361 rth->dst.tclassid = itag;
2362 #endif
2363
2364 rth->rt_key_dst = daddr;
2365 rth->rt_key_src = saddr;
2366 rth->rt_genid = rt_genid(net);
2367 rth->rt_flags = flags|RTCF_LOCAL;
2368 rth->rt_type = res.type;
2369 rth->rt_key_tos = tos;
2370 rth->rt_dst = daddr;
2371 rth->rt_src = saddr;
2375 rth->rt_route_iif = dev->ifindex;
2376 rth->rt_iif = dev->ifindex;
2377 rth->rt_oif = 0;
2378 rth->rt_mark = skb->mark;
2379 rth->rt_gateway = daddr;
2380 rth->rt_spec_dst= spec_dst;
2381 rth->rt_peer_genid = 0;
2382 rth->peer = NULL;
2383 rth->fi = NULL;
2384 if (res.type == RTN_UNREACHABLE) {
2385 rth->dst.input= ip_error;
2386 rth->dst.error= -err;
2387 rth->rt_flags &= ~RTCF_LOCAL;
2388 }
2389 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2390 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2391 err = 0;
2392 if (IS_ERR(rth))
2393 err = PTR_ERR(rth);
2394 goto out;
2395
2396 no_route:
2397 RT_CACHE_STAT_INC(in_no_route);
2398 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2399 res.type = RTN_UNREACHABLE;
2400 if (err == -ESRCH)
2401 err = -ENETUNREACH;
2402 goto local_input;
2403
2404 /*
2405 * Do not cache martian addresses: they should be logged (RFC1812)
2406 */
2407 martian_destination:
2408 RT_CACHE_STAT_INC(in_martian_dst);
2409 #ifdef CONFIG_IP_ROUTE_VERBOSE
2410 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2411 pr_warn("martian destination %pI4 from %pI4, dev %s\n",
2412 &daddr, &saddr, dev->name);
2413 #endif
2414
2415 e_hostunreach:
2416 err = -EHOSTUNREACH;
2417 goto out;
2418
2419 e_inval:
2420 err = -EINVAL;
2421 goto out;
2422
2423 e_nobufs:
2424 err = -ENOBUFS;
2425 goto out;
2426
2427 martian_source:
2428 err = -EINVAL;
2429 martian_source_keep_err:
2430 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2431 goto out;
2432 }
2433
2434 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2435 u8 tos, struct net_device *dev, bool noref)
2436 {
2437 struct rtable *rth;
2438 unsigned int hash;
2439 int iif = dev->ifindex;
2440 struct net *net;
2441 int res;
2442
2443 net = dev_net(dev);
2444
2445 rcu_read_lock();
2446
2447 if (!rt_caching(net))
2448 goto skip_cache;
2449
2450 tos &= IPTOS_RT_MASK;
2451 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2452
2453 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2454 rth = rcu_dereference(rth->dst.rt_next)) {
2455 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2456 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2457 (rth->rt_route_iif ^ iif) |
2458 (rth->rt_key_tos ^ tos)) == 0 &&
2459 rth->rt_mark == skb->mark &&
2460 net_eq(dev_net(rth->dst.dev), net) &&
2461 !rt_is_expired(rth)) {
2462 ipv4_validate_peer(rth);
2463 if (noref) {
2464 dst_use_noref(&rth->dst, jiffies);
2465 skb_dst_set_noref(skb, &rth->dst);
2466 } else {
2467 dst_use(&rth->dst, jiffies);
2468 skb_dst_set(skb, &rth->dst);
2469 }
2470 RT_CACHE_STAT_INC(in_hit);
2471 rcu_read_unlock();
2472 return 0;
2473 }
2474 RT_CACHE_STAT_INC(in_hlist_search);
2475 }
2476
2477 skip_cache:
2478 /* Multicast recognition logic was moved from the route cache to here.
2479 The problem was that too many Ethernet cards have broken/missing
2480 hardware multicast filters :-( As a result, a host on a multicast
2481 network acquires a lot of useless route cache entries, e.g. for
2482 SDR messages from all over the world. Now we try to get rid of them.
2483 Really, provided the software IP multicast filter is organized
2484 reasonably (at least, hashed), it does not result in a slowdown
2485 compared with route cache reject entries.
2486 Note that multicast routers are not affected, because a
2487 route cache entry is created eventually.
2488 */
2489 if (ipv4_is_multicast(daddr)) {
2490 struct in_device *in_dev = __in_dev_get_rcu(dev);
2491
2492 if (in_dev) {
2493 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2494 ip_hdr(skb)->protocol);
2495 if (our
2496 #ifdef CONFIG_IP_MROUTE
2497 ||
2498 (!ipv4_is_local_multicast(daddr) &&
2499 IN_DEV_MFORWARD(in_dev))
2500 #endif
2501 ) {
2502 int res = ip_route_input_mc(skb, daddr, saddr,
2503 tos, dev, our);
2504 rcu_read_unlock();
2505 return res;
2506 }
2507 }
2508 rcu_read_unlock();
2509 return -EINVAL;
2510 }
2511 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2512 rcu_read_unlock();
2513 return res;
2514 }
2515 EXPORT_SYMBOL(ip_route_input_common);
2516
2517 /* called with rcu_read_lock() */
2518 static struct rtable *__mkroute_output(const struct fib_result *res,
2519 const struct flowi4 *fl4,
2520 __be32 orig_daddr, __be32 orig_saddr,
2521 int orig_oif, __u8 orig_rtos,
2522 struct net_device *dev_out,
2523 unsigned int flags)
2524 {
2525 struct fib_info *fi = res->fi;
2526 struct in_device *in_dev;
2527 u16 type = res->type;
2528 struct rtable *rth;
2529
2530 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2531 return ERR_PTR(-EINVAL);
2532
2533 if (ipv4_is_lbcast(fl4->daddr))
2534 type = RTN_BROADCAST;
2535 else if (ipv4_is_multicast(fl4->daddr))
2536 type = RTN_MULTICAST;
2537 else if (ipv4_is_zeronet(fl4->daddr))
2538 return ERR_PTR(-EINVAL);
2539
2540 if (dev_out->flags & IFF_LOOPBACK)
2541 flags |= RTCF_LOCAL;
2542
2543 in_dev = __in_dev_get_rcu(dev_out);
2544 if (!in_dev)
2545 return ERR_PTR(-EINVAL);
2546
2547 if (type == RTN_BROADCAST) {
2548 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2549 fi = NULL;
2550 } else if (type == RTN_MULTICAST) {
2551 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2552 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2553 fl4->flowi4_proto))
2554 flags &= ~RTCF_LOCAL;
2555 /* If a multicast route does not exist, use
2556 * the default one, but do not use a gateway in this case.
2557 * Yes, it is a hack.
2558 */
2559 if (fi && res->prefixlen < 4)
2560 fi = NULL;
2561 }
2562
2563 rth = rt_dst_alloc(dev_out,
2564 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2565 IN_DEV_CONF_GET(in_dev, NOXFRM));
2566 if (!rth)
2567 return ERR_PTR(-ENOBUFS);
2568
2569 rth->dst.output = ip_output;
2570
2571 rth->rt_key_dst = orig_daddr;
2572 rth->rt_key_src = orig_saddr;
2573 rth->rt_genid = rt_genid(dev_net(dev_out));
2574 rth->rt_flags = flags;
2575 rth->rt_type = type;
2576 rth->rt_key_tos = orig_rtos;
2577 rth->rt_dst = fl4->daddr;
2578 rth->rt_src = fl4->saddr;
2579 rth->rt_route_iif = 0;
2580 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2581 rth->rt_oif = orig_oif;
2582 rth->rt_mark = fl4->flowi4_mark;
2583 rth->rt_gateway = fl4->daddr;
2584 rth->rt_spec_dst= fl4->saddr;
2585 rth->rt_peer_genid = 0;
2586 rth->peer = NULL;
2587 rth->fi = NULL;
2588
2589 RT_CACHE_STAT_INC(out_slow_tot);
2590
2591 if (flags & RTCF_LOCAL) {
2592 rth->dst.input = ip_local_deliver;
2593 rth->rt_spec_dst = fl4->daddr;
2594 }
2595 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2596 rth->rt_spec_dst = fl4->saddr;
2597 if (flags & RTCF_LOCAL &&
2598 !(dev_out->flags & IFF_LOOPBACK)) {
2599 rth->dst.output = ip_mc_output;
2600 RT_CACHE_STAT_INC(out_slow_mc);
2601 }
2602 #ifdef CONFIG_IP_MROUTE
2603 if (type == RTN_MULTICAST) {
2604 if (IN_DEV_MFORWARD(in_dev) &&
2605 !ipv4_is_local_multicast(fl4->daddr)) {
2606 rth->dst.input = ip_mr_input;
2607 rth->dst.output = ip_mc_output;
2608 }
2609 }
2610 #endif
2611 }
2612
2613 rt_set_nexthop(rth, fl4, res, fi, type, 0);
2614
2615 return rth;
2616 }
2617
2618 /*
2619 * Major route resolver routine.
2620 * called with rcu_read_lock();
2621 */
2622
2623 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2624 {
2625 struct net_device *dev_out = NULL;
2626 __u8 tos = RT_FL_TOS(fl4);
2627 unsigned int flags = 0;
2628 struct fib_result res;
2629 struct rtable *rth;
2630 __be32 orig_daddr;
2631 __be32 orig_saddr;
2632 int orig_oif;
2633
2634 res.fi = NULL;
2635 #ifdef CONFIG_IP_MULTIPLE_TABLES
2636 res.r = NULL;
2637 #endif
2638
2639 orig_daddr = fl4->daddr;
2640 orig_saddr = fl4->saddr;
2641 orig_oif = fl4->flowi4_oif;
2642
2643 fl4->flowi4_iif = net->loopback_dev->ifindex;
2644 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2645 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2646 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2647
2648 rcu_read_lock();
2649 if (fl4->saddr) {
2650 rth = ERR_PTR(-EINVAL);
2651 if (ipv4_is_multicast(fl4->saddr) ||
2652 ipv4_is_lbcast(fl4->saddr) ||
2653 ipv4_is_zeronet(fl4->saddr))
2654 goto out;
2655
2656 /* I removed the check for oif == dev_out->oif here.
2657 It was wrong for two reasons:
2658 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2659 is assigned to multiple interfaces.
2660 2. Moreover, we are allowed to send packets with the saddr
2661 of another iface. --ANK
2662 */
2663
2664 if (fl4->flowi4_oif == 0 &&
2665 (ipv4_is_multicast(fl4->daddr) ||
2666 ipv4_is_lbcast(fl4->daddr))) {
2667 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2668 dev_out = __ip_dev_find(net, fl4->saddr, false);
2669 if (dev_out == NULL)
2670 goto out;
2671
2672 /* Special hack: the user can direct multicasts
2673 and limited broadcast via the necessary interface
2674 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2675 This hack is not just for fun, it allows
2676 vic, vat and friends to work.
2677 They bind a socket to loopback, set ttl to zero
2678 and expect that it will work.
2679 From the viewpoint of the routing cache they are broken,
2680 because we are not allowed to build a multicast path
2681 with a loopback source addr (look, the routing cache
2682 cannot know that ttl is zero, so that the packet
2683 will not leave this host and the route is valid).
2684 Luckily, this hack is a good workaround.
2685 */
2686
2687 fl4->flowi4_oif = dev_out->ifindex;
2688 goto make_route;
2689 }
2690
2691 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2692 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2693 if (!__ip_dev_find(net, fl4->saddr, false))
2694 goto out;
2695 }
2696 }
2697
2698
2699 if (fl4->flowi4_oif) {
2700 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2701 rth = ERR_PTR(-ENODEV);
2702 if (dev_out == NULL)
2703 goto out;
2704
2705 /* RACE: Check return value of inet_select_addr instead. */
2706 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2707 rth = ERR_PTR(-ENETUNREACH);
2708 goto out;
2709 }
2710 if (ipv4_is_local_multicast(fl4->daddr) ||
2711 ipv4_is_lbcast(fl4->daddr)) {
2712 if (!fl4->saddr)
2713 fl4->saddr = inet_select_addr(dev_out, 0,
2714 RT_SCOPE_LINK);
2715 goto make_route;
2716 }
2717 if (fl4->saddr) {
2718 if (ipv4_is_multicast(fl4->daddr))
2719 fl4->saddr = inet_select_addr(dev_out, 0,
2720 fl4->flowi4_scope);
2721 else if (!fl4->daddr)
2722 fl4->saddr = inet_select_addr(dev_out, 0,
2723 RT_SCOPE_HOST);
2724 }
2725 }
2726
2727 if (!fl4->daddr) {
2728 fl4->daddr = fl4->saddr;
2729 if (!fl4->daddr)
2730 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2731 dev_out = net->loopback_dev;
2732 fl4->flowi4_oif = net->loopback_dev->ifindex;
2733 res.type = RTN_LOCAL;
2734 flags |= RTCF_LOCAL;
2735 goto make_route;
2736 }
2737
2738 if (fib_lookup(net, fl4, &res)) {
2739 res.fi = NULL;
2740 if (fl4->flowi4_oif) {
2741 /* Apparently, the routing tables are wrong. Assume
2742 that the destination is on-link.
2743
2744 WHY? DW.
2745 Because we are allowed to send to an iface
2746 even if it has NO routes and NO assigned
2747 addresses. When oif is specified, the routing
2748 tables are looked up with only one purpose:
2749 to catch whether the destination is gatewayed rather
2750 than direct. Moreover, if MSG_DONTROUTE is set,
2751 we send the packet, ignoring both routing tables
2752 and ifaddr state. --ANK
2753
2754
2755 We could do this even if oif is unknown,
2756 as IPv6 likely does, but we do not.
2757 */
2758
2759 if (fl4->saddr == 0)
2760 fl4->saddr = inet_select_addr(dev_out, 0,
2761 RT_SCOPE_LINK);
2762 res.type = RTN_UNICAST;
2763 goto make_route;
2764 }
2765 rth = ERR_PTR(-ENETUNREACH);
2766 goto out;
2767 }
2768
2769 if (res.type == RTN_LOCAL) {
2770 if (!fl4->saddr) {
2771 if (res.fi->fib_prefsrc)
2772 fl4->saddr = res.fi->fib_prefsrc;
2773 else
2774 fl4->saddr = fl4->daddr;
2775 }
2776 dev_out = net->loopback_dev;
2777 fl4->flowi4_oif = dev_out->ifindex;
2778 res.fi = NULL;
2779 flags |= RTCF_LOCAL;
2780 goto make_route;
2781 }
2782
2783 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2784 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2785 fib_select_multipath(&res);
2786 else
2787 #endif
2788 if (!res.prefixlen &&
2789 res.table->tb_num_default > 1 &&
2790 res.type == RTN_UNICAST && !fl4->flowi4_oif)
2791 fib_select_default(&res);
2792
2793 if (!fl4->saddr)
2794 fl4->saddr = FIB_RES_PREFSRC(net, res);
2795
2796 dev_out = FIB_RES_DEV(res);
2797 fl4->flowi4_oif = dev_out->ifindex;
2798
2799
2800 make_route:
2801 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2802 tos, dev_out, flags);
2803 if (!IS_ERR(rth)) {
2804 unsigned int hash;
2805
2806 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2807 rt_genid(dev_net(dev_out)));
2808 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2809 }
2810
2811 out:
2812 rcu_read_unlock();
2813 return rth;
2814 }
2815
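/* Output route lookup: scan the route cache under rcu_read_lock_bh()
 * and fall back to ip_route_output_slow() on a miss. On a cache hit,
 * any wildcard saddr/daddr in the flow is filled in from the cached
 * route. Returns the rtable or an ERR_PTR(); callers that also need
 * xfrm handling go through ip_route_output_flow() instead.
 */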
2816 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2817 {
2818 struct rtable *rth;
2819 unsigned int hash;
2820
2821 if (!rt_caching(net))
2822 goto slow_output;
2823
2824 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2825
2826 rcu_read_lock_bh();
2827 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2828 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2829 if (rth->rt_key_dst == flp4->daddr &&
2830 rth->rt_key_src == flp4->saddr &&
2831 rt_is_output_route(rth) &&
2832 rth->rt_oif == flp4->flowi4_oif &&
2833 rth->rt_mark == flp4->flowi4_mark &&
2834 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2835 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2836 net_eq(dev_net(rth->dst.dev), net) &&
2837 !rt_is_expired(rth)) {
2838 ipv4_validate_peer(rth);
2839 dst_use(&rth->dst, jiffies);
2840 RT_CACHE_STAT_INC(out_hit);
2841 rcu_read_unlock_bh();
2842 if (!flp4->saddr)
2843 flp4->saddr = rth->rt_src;
2844 if (!flp4->daddr)
2845 flp4->daddr = rth->rt_dst;
2846 return rth;
2847 }
2848 RT_CACHE_STAT_INC(out_hlist_search);
2849 }
2850 rcu_read_unlock_bh();
2851
2852 slow_output:
2853 return ip_route_output_slow(net, flp4);
2854 }
2855 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2856
2857 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2858 {
2859 return NULL;
2860 }
2861
2862 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2863 {
2864 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2865
2866 return mtu ? : dst->dev->mtu;
2867 }
2868
2869 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2870 {
2871 }
2872
2873 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2874 unsigned long old)
2875 {
2876 return NULL;
2877 }
2878
2879 static struct dst_ops ipv4_dst_blackhole_ops = {
2880 .family = AF_INET,
2881 .protocol = cpu_to_be16(ETH_P_IP),
2882 .destroy = ipv4_dst_destroy,
2883 .check = ipv4_blackhole_dst_check,
2884 .mtu = ipv4_blackhole_mtu,
2885 .default_advmss = ipv4_default_advmss,
2886 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2887 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2888 .neigh_lookup = ipv4_neigh_lookup,
2889 };
2890
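/* Clone dst_orig into a "blackhole" route backed by
 * ipv4_dst_blackhole_ops: it keeps the routing fields of the
 * original but discards every packet and fails every cache check, so
 * it can be handed out where a dst is required but must never emit
 * traffic.
 */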
2891 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2892 {
2893 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2894 struct rtable *ort = (struct rtable *) dst_orig;
2895
2896 if (rt) {
2897 struct dst_entry *new = &rt->dst;
2898
2899 new->__use = 1;
2900 new->input = dst_discard;
2901 new->output = dst_discard;
2902 dst_copy_metrics(new, &ort->dst);
2903
2904 new->dev = ort->dst.dev;
2905 if (new->dev)
2906 dev_hold(new->dev);
2907
2908 rt->rt_key_dst = ort->rt_key_dst;
2909 rt->rt_key_src = ort->rt_key_src;
2910 rt->rt_key_tos = ort->rt_key_tos;
2911 rt->rt_route_iif = ort->rt_route_iif;
2912 rt->rt_iif = ort->rt_iif;
2913 rt->rt_oif = ort->rt_oif;
2914 rt->rt_mark = ort->rt_mark;
2915
2916 rt->rt_genid = rt_genid(net);
2917 rt->rt_flags = ort->rt_flags;
2918 rt->rt_type = ort->rt_type;
2919 rt->rt_dst = ort->rt_dst;
2920 rt->rt_src = ort->rt_src;
2921 rt->rt_gateway = ort->rt_gateway;
2922 rt->rt_spec_dst = ort->rt_spec_dst;
2923 rt->peer = ort->peer;
2924 if (rt->peer)
2925 atomic_inc(&rt->peer->refcnt);
2926 rt->fi = ort->fi;
2927 if (rt->fi)
2928 atomic_inc(&rt->fi->fib_clntref);
2929
2930 dst_free(new);
2931 }
2932
2933 dst_release(dst_orig);
2934
2935 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2936 }
2937
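/* Like __ip_route_output_key(), but when the flow carries a
 * transport protocol the result is additionally passed through
 * xfrm_lookup() so that IPsec policy is applied before the route is
 * returned.
 *
 * For illustration only (values are made up):
 *
 *	struct flowi4 fl4 = { .daddr = dst, .flowi4_proto = IPPROTO_UDP };
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */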
2938 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2939 struct sock *sk)
2940 {
2941 struct rtable *rt = __ip_route_output_key(net, flp4);
2942
2943 if (IS_ERR(rt))
2944 return rt;
2945
2946 if (flp4->flowi4_proto)
2947 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2948 flowi4_to_flowi(flp4),
2949 sk, 0);
2950
2951 return rt;
2952 }
2953 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2954
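/* Fill one RTM_NEWROUTE message for the route attached to the skb:
 * rtmsg header, address, metric and mark attributes, plus the
 * cacheinfo block derived from the inet_peer (IP id, TCP timestamps,
 * PMTU expiry). Cancels the message and returns -EMSGSIZE when the
 * skb has no room left.
 */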
2955 static int rt_fill_info(struct net *net,
2956 struct sk_buff *skb, u32 pid, u32 seq, int event,
2957 int nowait, unsigned int flags)
2958 {
2959 struct rtable *rt = skb_rtable(skb);
2960 struct rtmsg *r;
2961 struct nlmsghdr *nlh;
2962 unsigned long expires = 0;
2963 const struct inet_peer *peer = rt->peer;
2964 u32 id = 0, ts = 0, tsage = 0, error;
2965
2966 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2967 if (nlh == NULL)
2968 return -EMSGSIZE;
2969
2970 r = nlmsg_data(nlh);
2971 r->rtm_family = AF_INET;
2972 r->rtm_dst_len = 32;
2973 r->rtm_src_len = 0;
2974 r->rtm_tos = rt->rt_key_tos;
2975 r->rtm_table = RT_TABLE_MAIN;
2976 if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2977 goto nla_put_failure;
2978 r->rtm_type = rt->rt_type;
2979 r->rtm_scope = RT_SCOPE_UNIVERSE;
2980 r->rtm_protocol = RTPROT_UNSPEC;
2981 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2982 if (rt->rt_flags & RTCF_NOTIFY)
2983 r->rtm_flags |= RTM_F_NOTIFY;
2984
2985 if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2986 goto nla_put_failure;
2987 if (rt->rt_key_src) {
2988 r->rtm_src_len = 32;
2989 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2990 goto nla_put_failure;
2991 }
2992 if (rt->dst.dev &&
2993 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2994 goto nla_put_failure;
2995 #ifdef CONFIG_IP_ROUTE_CLASSID
2996 if (rt->dst.tclassid &&
2997 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2998 goto nla_put_failure;
2999 #endif
3000 if (rt_is_input_route(rt)) {
3001 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst))
3002 goto nla_put_failure;
3003 } else if (rt->rt_src != rt->rt_key_src) {
3004 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
3005 goto nla_put_failure;
3006 }
3007 if (rt->rt_dst != rt->rt_gateway &&
3008 nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
3009 goto nla_put_failure;
3010
3011 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3012 goto nla_put_failure;
3013
3014 if (rt->rt_mark &&
3015 nla_put_be32(skb, RTA_MARK, rt->rt_mark))
3016 goto nla_put_failure;
3017
3018 error = rt->dst.error;
3019 if (peer) {
3020 inet_peer_refcheck(rt->peer);
3021 id = atomic_read(&peer->ip_id_count) & 0xffff;
3022 if (peer->tcp_ts_stamp) {
3023 ts = peer->tcp_ts;
3024 tsage = get_seconds() - peer->tcp_ts_stamp;
3025 }
3026 expires = ACCESS_ONCE(peer->pmtu_expires);
3027 if (expires) {
3028 if (time_before(jiffies, expires))
3029 expires -= jiffies;
3030 else
3031 expires = 0;
3032 }
3033 }
3034
3035 if (rt_is_input_route(rt)) {
3036 #ifdef CONFIG_IP_MROUTE
3037 __be32 dst = rt->rt_dst;
3038
3039 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3040 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3041 int err = ipmr_get_route(net, skb,
3042 rt->rt_src, rt->rt_dst,
3043 r, nowait);
3044 if (err <= 0) {
3045 if (!nowait) {
3046 if (err == 0)
3047 return 0;
3048 goto nla_put_failure;
3049 } else {
3050 if (err == -EMSGSIZE)
3051 goto nla_put_failure;
3052 error = err;
3053 }
3054 }
3055 } else
3056 #endif
3057 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
3058 goto nla_put_failure;
3059 }
3060
3061 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3062 expires, error) < 0)
3063 goto nla_put_failure;
3064
3065 return nlmsg_end(skb, nlh);
3066
3067 nla_put_failure:
3068 nlmsg_cancel(skb, nlh);
3069 return -EMSGSIZE;
3070 }
3071
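/* RTM_GETROUTE handler (e.g. "ip route get"): build a dummy skb, run
 * it through either the input path (when RTA_IIF is given) or the
 * output path, and unicast the resulting route back to the requester.
 */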
3072 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3073 {
3074 struct net *net = sock_net(in_skb->sk);
3075 struct rtmsg *rtm;
3076 struct nlattr *tb[RTA_MAX+1];
3077 struct rtable *rt = NULL;
3078 __be32 dst = 0;
3079 __be32 src = 0;
3080 u32 iif;
3081 int err;
3082 int mark;
3083 struct sk_buff *skb;
3084
3085 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3086 if (err < 0)
3087 goto errout;
3088
3089 rtm = nlmsg_data(nlh);
3090
3091 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3092 if (skb == NULL) {
3093 err = -ENOBUFS;
3094 goto errout;
3095 }
3096
3097 /* Reserve room for dummy headers; this skb can pass
3098 through a good chunk of the routing engine.
3099 */
3100 skb_reset_mac_header(skb);
3101 skb_reset_network_header(skb);
3102
3103 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3104 ip_hdr(skb)->protocol = IPPROTO_ICMP;
3105 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3106
3107 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3108 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3109 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3110 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3111
3112 if (iif) {
3113 struct net_device *dev;
3114
3115 dev = __dev_get_by_index(net, iif);
3116 if (dev == NULL) {
3117 err = -ENODEV;
3118 goto errout_free;
3119 }
3120
3121 skb->protocol = htons(ETH_P_IP);
3122 skb->dev = dev;
3123 skb->mark = mark;
3124 local_bh_disable();
3125 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3126 local_bh_enable();
3127
3128 rt = skb_rtable(skb);
3129 if (err == 0 && rt->dst.error)
3130 err = -rt->dst.error;
3131 } else {
3132 struct flowi4 fl4 = {
3133 .daddr = dst,
3134 .saddr = src,
3135 .flowi4_tos = rtm->rtm_tos,
3136 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3137 .flowi4_mark = mark,
3138 };
3139 rt = ip_route_output_key(net, &fl4);
3140
3141 err = 0;
3142 if (IS_ERR(rt))
3143 err = PTR_ERR(rt);
3144 }
3145
3146 if (err)
3147 goto errout_free;
3148
3149 skb_dst_set(skb, &rt->dst);
3150 if (rtm->rtm_flags & RTM_F_NOTIFY)
3151 rt->rt_flags |= RTCF_NOTIFY;
3152
3153 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3154 RTM_NEWROUTE, 0, 0);
3155 if (err <= 0)
3156 goto errout_free;
3157
3158 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3159 errout:
3160 return err;
3161
3162 errout_free:
3163 kfree_skb(skb);
3164 goto errout;
3165 }
3166
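/* Walk the whole route cache hash and emit one RTM_NEWROUTE message
 * per live entry, resuming from the position saved in cb->args[] so
 * a dump can span multiple netlink recvmsg() calls.
 */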
3167 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3168 {
3169 struct rtable *rt;
3170 int h, s_h;
3171 int idx, s_idx;
3172 struct net *net;
3173
3174 net = sock_net(skb->sk);
3175
3176 s_h = cb->args[0];
3177 if (s_h < 0)
3178 s_h = 0;
3179 s_idx = idx = cb->args[1];
3180 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3181 if (!rt_hash_table[h].chain)
3182 continue;
3183 rcu_read_lock_bh();
3184 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3185 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3186 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3187 continue;
3188 if (rt_is_expired(rt))
3189 continue;
3190 skb_dst_set_noref(skb, &rt->dst);
3191 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3192 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3193 1, NLM_F_MULTI) <= 0) {
3194 skb_dst_drop(skb);
3195 rcu_read_unlock_bh();
3196 goto done;
3197 }
3198 skb_dst_drop(skb);
3199 }
3200 rcu_read_unlock_bh();
3201 }
3202
3203 done:
3204 cb->args[0] = h;
3205 cb->args[1] = idx;
3206 return skb->len;
3207 }
3208
3209 void ip_rt_multicast_event(struct in_device *in_dev)
3210 {
3211 rt_cache_flush(dev_net(in_dev->dev), 0);
3212 }
3213
3214 #ifdef CONFIG_SYSCTL
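/* Handler for the write-only net.ipv4.route.flush sysctl: the value
 * written is parsed as a flush delay and handed to rt_cache_flush()
 * for the owning netns (stashed in ->extra1). Reads fail with
 * -EINVAL.
 */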
3215 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3216 void __user *buffer,
3217 size_t *lenp, loff_t *ppos)
3218 {
3219 if (write) {
3220 int flush_delay;
3221 ctl_table ctl;
3222 struct net *net;
3223
3224 memcpy(&ctl, __ctl, sizeof(ctl));
3225 ctl.data = &flush_delay;
3226 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3227
3228 net = (struct net *)__ctl->extra1;
3229 rt_cache_flush(net, flush_delay);
3230 return 0;
3231 }
3232
3233 return -EINVAL;
3234 }
3235
3236 static ctl_table ipv4_route_table[] = {
3237 {
3238 .procname = "gc_thresh",
3239 .data = &ipv4_dst_ops.gc_thresh,
3240 .maxlen = sizeof(int),
3241 .mode = 0644,
3242 .proc_handler = proc_dointvec,
3243 },
3244 {
3245 .procname = "max_size",
3246 .data = &ip_rt_max_size,
3247 .maxlen = sizeof(int),
3248 .mode = 0644,
3249 .proc_handler = proc_dointvec,
3250 },
3251 {
3252 /* Deprecated. Use gc_min_interval_ms */
3253
3254 .procname = "gc_min_interval",
3255 .data = &ip_rt_gc_min_interval,
3256 .maxlen = sizeof(int),
3257 .mode = 0644,
3258 .proc_handler = proc_dointvec_jiffies,
3259 },
3260 {
3261 .procname = "gc_min_interval_ms",
3262 .data = &ip_rt_gc_min_interval,
3263 .maxlen = sizeof(int),
3264 .mode = 0644,
3265 .proc_handler = proc_dointvec_ms_jiffies,
3266 },
3267 {
3268 .procname = "gc_timeout",
3269 .data = &ip_rt_gc_timeout,
3270 .maxlen = sizeof(int),
3271 .mode = 0644,
3272 .proc_handler = proc_dointvec_jiffies,
3273 },
3274 {
3275 .procname = "gc_interval",
3276 .data = &ip_rt_gc_interval,
3277 .maxlen = sizeof(int),
3278 .mode = 0644,
3279 .proc_handler = proc_dointvec_jiffies,
3280 },
3281 {
3282 .procname = "redirect_load",
3283 .data = &ip_rt_redirect_load,
3284 .maxlen = sizeof(int),
3285 .mode = 0644,
3286 .proc_handler = proc_dointvec,
3287 },
3288 {
3289 .procname = "redirect_number",
3290 .data = &ip_rt_redirect_number,
3291 .maxlen = sizeof(int),
3292 .mode = 0644,
3293 .proc_handler = proc_dointvec,
3294 },
3295 {
3296 .procname = "redirect_silence",
3297 .data = &ip_rt_redirect_silence,
3298 .maxlen = sizeof(int),
3299 .mode = 0644,
3300 .proc_handler = proc_dointvec,
3301 },
3302 {
3303 .procname = "error_cost",
3304 .data = &ip_rt_error_cost,
3305 .maxlen = sizeof(int),
3306 .mode = 0644,
3307 .proc_handler = proc_dointvec,
3308 },
3309 {
3310 .procname = "error_burst",
3311 .data = &ip_rt_error_burst,
3312 .maxlen = sizeof(int),
3313 .mode = 0644,
3314 .proc_handler = proc_dointvec,
3315 },
3316 {
3317 .procname = "gc_elasticity",
3318 .data = &ip_rt_gc_elasticity,
3319 .maxlen = sizeof(int),
3320 .mode = 0644,
3321 .proc_handler = proc_dointvec,
3322 },
3323 {
3324 .procname = "mtu_expires",
3325 .data = &ip_rt_mtu_expires,
3326 .maxlen = sizeof(int),
3327 .mode = 0644,
3328 .proc_handler = proc_dointvec_jiffies,
3329 },
3330 {
3331 .procname = "min_pmtu",
3332 .data = &ip_rt_min_pmtu,
3333 .maxlen = sizeof(int),
3334 .mode = 0644,
3335 .proc_handler = proc_dointvec,
3336 },
3337 {
3338 .procname = "min_adv_mss",
3339 .data = &ip_rt_min_advmss,
3340 .maxlen = sizeof(int),
3341 .mode = 0644,
3342 .proc_handler = proc_dointvec,
3343 },
3344 { }
3345 };
3346
3347 static struct ctl_table ipv4_route_flush_table[] = {
3348 {
3349 .procname = "flush",
3350 .maxlen = sizeof(int),
3351 .mode = 0200,
3352 .proc_handler = ipv4_sysctl_rtcache_flush,
3353 },
3354 { },
3355 };
3356
3357 static __net_initdata struct ctl_path ipv4_route_path[] = {
3358 { .procname = "net", },
3359 { .procname = "ipv4", },
3360 { .procname = "route", },
3361 { },
3362 };
3363
3364 static __net_init int sysctl_route_net_init(struct net *net)
3365 {
3366 struct ctl_table *tbl;
3367
3368 tbl = ipv4_route_flush_table;
3369 if (!net_eq(net, &init_net)) {
3370 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3371 if (tbl == NULL)
3372 goto err_dup;
3373 }
3374 tbl[0].extra1 = net;
3375
3376 net->ipv4.route_hdr =
3377 register_net_sysctl_table(net, ipv4_route_path, tbl);
3378 if (net->ipv4.route_hdr == NULL)
3379 goto err_reg;
3380 return 0;
3381
3382 err_reg:
3383 if (tbl != ipv4_route_flush_table)
3384 kfree(tbl);
3385 err_dup:
3386 return -ENOMEM;
3387 }
3388
3389 static __net_exit void sysctl_route_net_exit(struct net *net)
3390 {
3391 struct ctl_table *tbl;
3392
3393 tbl = net->ipv4.route_hdr->ctl_table_arg;
3394 unregister_net_sysctl_table(net->ipv4.route_hdr);
3395 BUG_ON(tbl == ipv4_route_flush_table);
3396 kfree(tbl);
3397 }
3398
3399 static __net_initdata struct pernet_operations sysctl_route_ops = {
3400 .init = sysctl_route_net_init,
3401 .exit = sysctl_route_net_exit,
3402 };
3403 #endif
3404
3405 static __net_init int rt_genid_init(struct net *net)
3406 {
3407 get_random_bytes(&net->ipv4.rt_genid,
3408 sizeof(net->ipv4.rt_genid));
3409 get_random_bytes(&net->ipv4.dev_addr_genid,
3410 sizeof(net->ipv4.dev_addr_genid));
3411 return 0;
3412 }
3413
3414 static __net_initdata struct pernet_operations rt_genid_ops = {
3415 .init = rt_genid_init,
3416 };
3417
3418
3419 #ifdef CONFIG_IP_ROUTE_CLASSID
3420 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3421 #endif /* CONFIG_IP_ROUTE_CLASSID */
3422
3423 static __initdata unsigned long rhash_entries;
3424 static int __init set_rhash_entries(char *str)
3425 {
3426 if (!str)
3427 return 0;
3428 rhash_entries = simple_strtoul(str, &str, 0);
3429 return 1;
3430 }
3431 __setup("rhash_entries=", set_rhash_entries);
3432
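/* Boot-time initialization of the IPv4 routing layer: percpu
 * accounting, dst slab caches, the route cache hash (sized by the
 * rhash_entries= boot parameter or by available memory), devinet and
 * fib setup, the periodic expiry work, proc and sysctl registration,
 * and the RTM_GETROUTE rtnetlink handler.
 */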
3433 int __init ip_rt_init(void)
3434 {
3435 int rc = 0;
3436
3437 #ifdef CONFIG_IP_ROUTE_CLASSID
3438 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3439 if (!ip_rt_acct)
3440 panic("IP: failed to allocate ip_rt_acct\n");
3441 #endif
3442
3443 ipv4_dst_ops.kmem_cachep =
3444 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3445 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3446
3447 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3448
3449 if (dst_entries_init(&ipv4_dst_ops) < 0)
3450 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3451
3452 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3453 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3454
3455 rt_hash_table = (struct rt_hash_bucket *)
3456 alloc_large_system_hash("IP route cache",
3457 sizeof(struct rt_hash_bucket),
3458 rhash_entries,
3459 (totalram_pages >= 128 * 1024) ?
3460 15 : 17,
3461 0,
3462 &rt_hash_log,
3463 &rt_hash_mask,
3464 rhash_entries ? 0 : 512 * 1024);
3465 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3466 rt_hash_lock_init();
3467
3468 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3469 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3470
3471 devinet_init();
3472 ip_fib_init();
3473
3474 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3475 expires_ljiffies = jiffies;
3476 schedule_delayed_work(&expires_work,
3477 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3478
3479 if (ip_rt_proc_init())
3480 pr_err("Unable to create route proc files\n");
3481 #ifdef CONFIG_XFRM
3482 xfrm_init();
3483 xfrm4_init(ip_rt_max_size);
3484 #endif
3485 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3486
3487 #ifdef CONFIG_SYSCTL
3488 register_pernet_subsys(&sysctl_route_ops);
3489 #endif
3490 register_pernet_subsys(&rt_genid_ops);
3491 return rc;
3492 }
3493
3494 #ifdef CONFIG_SYSCTL
3495 /*
3496 * We really need to sanitize the damn ipv4 init order, then all
3497 * this nonsense will go away.
3498 */
3499 void __init ip_static_sysctl_init(void)
3500 {
3501 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3502 }
3503 #endif