net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/bootmem.h>
  74 #include <linux/string.h>
  75 #include <linux/socket.h>
  76 #include <linux/sockios.h>
  77 #include <linux/errno.h>
  78 #include <linux/in.h>
  79 #include <linux/inet.h>
  80 #include <linux/netdevice.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/init.h>
  83 #include <linux/workqueue.h>
  84 #include <linux/skbuff.h>
  85 #include <linux/inetdevice.h>
  86 #include <linux/igmp.h>
  87 #include <linux/pkt_sched.h>
  88 #include <linux/mroute.h>
  89 #include <linux/netfilter_ipv4.h>
  90 #include <linux/random.h>
  91 #include <linux/jhash.h>
  92 #include <linux/rcupdate.h>
  93 #include <linux/times.h>
  94 #include <linux/slab.h>
  95 #include <linux/prefetch.h>
  96 #include <net/dst.h>
  97 #include <net/net_namespace.h>
  98 #include <net/protocol.h>
  99 #include <net/ip.h>
 100 #include <net/route.h>
 101 #include <net/inetpeer.h>
 102 #include <net/sock.h>
 103 #include <net/ip_fib.h>
 104 #include <net/arp.h>
 105 #include <net/tcp.h>
 106 #include <net/icmp.h>
 107 #include <net/xfrm.h>
 108 #include <net/netevent.h>
 109 #include <net/rtnetlink.h>
 110 #ifdef CONFIG_SYSCTL
 111 #include <linux/sysctl.h>
 112 #include <linux/kmemleak.h>
 113 #endif
 114 #include <net/secure_seq.h>
 115
/*
 * Routing-relevant part of a flow's TOS: flowi4_tos masked down to
 * IPTOS_RT_MASK plus the RTO_ONLINK flag bit.
 */
  116 #define RT_FL_TOS(oldflp4) \
  117         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
  118
/* 0xFFF0 == 65520.  NOTE(review): users are outside this excerpt. */
  119 #define IP_MAX_MTU      0xFFF0
  120
/* Default for ip_rt_gc_timeout below: 300 * HZ jiffies. */
  121 #define RT_GC_TIMEOUT (300*HZ)
  122
/*
 * Route cache tunables.  Defaults written in terms of HZ are jiffies counts.
 * NOTE(review): most of these are consumed outside this excerpt (presumably
 * exported through sysctl) -- confirm before changing a default.
 * ip_rt_gc_timeout is used by rt_check_expire() both to size each scan and
 * as the starting per-chain timeout.
 */
  123 static int ip_rt_max_size;
  124 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
  125 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
  126 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
  127 static int ip_rt_redirect_number __read_mostly  = 9;
  128 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
/*
 * With the defaults above this equals
 * ip_rt_redirect_load << (ip_rt_redirect_number + 1).
 */
  129 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
  130 static int ip_rt_error_cost __read_mostly       = HZ;
  131 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
  132 static int ip_rt_gc_elasticity __read_mostly    = 8;
  133 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
  134 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
  135 static int ip_rt_min_advmss __read_mostly       = 256;
  136 static int rt_chain_length_max __read_mostly    = 20;
  137
/*
 * expires_ljiffies holds the jiffies value recorded by the previous
 * rt_check_expire() run.  NOTE(review): expires_work presumably schedules
 * those runs; its handler is outside this excerpt.
 */
  138 static struct delayed_work expires_work;
  139 static unsigned long expires_ljiffies;
  140
  141 /*
  142  *      Interface to generic destination cache.
  143  */
  144
/* Forward declarations of the callbacks installed in ipv4_dst_ops below. */
  145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
  146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
  147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
  148 static void              ipv4_dst_destroy(struct dst_entry *dst);
  149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
  150 static void              ipv4_link_failure(struct sk_buff *skb);
  151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
  152 static int rt_garbage_collect(struct dst_ops *ops);
 153
 154 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 155                             int how)
 156 {
 157 }
 158
 159 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 160 {
 161         struct rtable *rt = (struct rtable *) dst;
 162         struct inet_peer *peer;
 163         u32 *p = NULL;
 164
 165         peer = rt_get_peer_create(rt, rt->rt_dst);
 166         if (peer) {
 167                 u32 *old_p = __DST_METRICS_PTR(old);
 168                 unsigned long prev, new;
 169
 170                 p = peer->metrics;
 171                 if (inet_metrics_new(peer))
 172                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 173
 174                 new = (unsigned long) p;
 175                 prev = cmpxchg(&dst->_metrics, old, new);
 176
 177                 if (prev != old) {
 178                         p = __DST_METRICS_PTR(prev);
 179                         if (prev & DST_METRICS_READ_ONLY)
 180                                 p = NULL;
 181                 } else {
 182                         if (rt->fi) {
 183                                 fib_info_put(rt->fi);
 184                                 rt->fi = NULL;
 185                         }
 186                 }
 187         }
 188         return p;
 189 }
 190
 191 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
 192
 193 static struct dst_ops ipv4_dst_ops = {
 194         .family =               AF_INET,
 195         .protocol =             cpu_to_be16(ETH_P_IP),
 196         .gc =                   rt_garbage_collect,
 197         .check =                ipv4_dst_check,
 198         .default_advmss =       ipv4_default_advmss,
 199         .mtu =                  ipv4_mtu,
 200         .cow_metrics =          ipv4_cow_metrics,
 201         .destroy =              ipv4_dst_destroy,
 202         .ifdown =               ipv4_dst_ifdown,
 203         .negative_advice =      ipv4_negative_advice,
 204         .link_failure =         ipv4_link_failure,
 205         .update_pmtu =          ip_rt_update_pmtu,
 206         .local_out =            __ip_local_out,
 207         .neigh_lookup =         ipv4_neigh_lookup,
 208 };
 209
 210 #define ECN_OR_COST(class)      TC_PRIO_##class
 211
 212 const __u8 ip_tos2prio[16] = {
 213         TC_PRIO_BESTEFFORT,
 214         ECN_OR_COST(BESTEFFORT),
 215         TC_PRIO_BESTEFFORT,
 216         ECN_OR_COST(BESTEFFORT),
 217         TC_PRIO_BULK,
 218         ECN_OR_COST(BULK),
 219         TC_PRIO_BULK,
 220         ECN_OR_COST(BULK),
 221         TC_PRIO_INTERACTIVE,
 222         ECN_OR_COST(INTERACTIVE),
 223         TC_PRIO_INTERACTIVE,
 224         ECN_OR_COST(INTERACTIVE),
 225         TC_PRIO_INTERACTIVE_BULK,
 226         ECN_OR_COST(INTERACTIVE_BULK),
 227         TC_PRIO_INTERACTIVE_BULK,
 228         ECN_OR_COST(INTERACTIVE_BULK)
 229 };
 230 EXPORT_SYMBOL(ip_tos2prio);
 231
 232 /*
 233  * Route cache.
 234  */
 235
 236 /* The locking scheme is rather straight forward:
 237  *
 238  * 1) Read-Copy Update protects the buckets of the central route hash.
 239  * 2) Only writers remove entries, and they hold the lock
 240  *    as they look at rtable reference counts.
 241  * 3) Only readers acquire references to rtable entries,
 242  *    they do so with atomic increments and with the
 243  *    lock held.
 244  */
 245
/*
 * One slot of the route cache hash table: the head of a chain of rtable
 * entries linked through dst.rt_next and read under RCU.
 */
  246 struct rt_hash_bucket {
  247         struct rtable __rcu     *chain;
  248 };
 249
 250 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 251         defined(CONFIG_PROVE_LOCKING)
 252 /*
 253  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 254  * The size of this table is a power of two and depends on the number of CPUS.
 255  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 256  */
 257 #ifdef CONFIG_LOCKDEP
 258 # define RT_HASH_LOCK_SZ        256
 259 #else
 260 # if NR_CPUS >= 32
 261 #  define RT_HASH_LOCK_SZ       4096
 262 # elif NR_CPUS >= 16
 263 #  define RT_HASH_LOCK_SZ       2048
 264 # elif NR_CPUS >= 8
 265 #  define RT_HASH_LOCK_SZ       1024
 266 # elif NR_CPUS >= 4
 267 #  define RT_HASH_LOCK_SZ       512
 268 # else
 269 #  define RT_HASH_LOCK_SZ       256
 270 # endif
 271 #endif
 272
 273 static spinlock_t       *rt_hash_locks;
 274 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 275
 276 static __init void rt_hash_lock_init(void)
 277 {
 278         int i;
 279
 280         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 281                         GFP_KERNEL);
 282         if (!rt_hash_locks)
 283                 panic("IP: failed to allocate rt_hash_locks\n");
 284
 285         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 286                 spin_lock_init(&rt_hash_locks[i]);
 287 }
 288 #else
 289 # define rt_hash_lock_addr(slot) NULL
 290
 291 static inline void rt_hash_lock_init(void)
 292 {
 293 }
 294 #endif
 295
/*
 * The route cache hash table.  Valid bucket indexes are 0..rt_hash_mask
 * (rt_hash() masks with rt_hash_mask and the scans loop up to it).
 * rt_hash_log is used as a shift count by rt_check_expire();
 * NOTE(review): presumably log2 of the bucket count -- the code that sets
 * these is outside this excerpt.
 */
  296 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
  297 static unsigned int             rt_hash_mask __read_mostly;
  298 static unsigned int             rt_hash_log  __read_mostly;
  299
/* Per-CPU route cache counters, reported by rt_cpu_seq_show(). */
  300 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
/* Bump one counter of the current CPU's rt_cache_stat. */
  301 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 302
 303 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 304                                    int genid)
 305 {
 306         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 307                             idx, genid)
 308                 & rt_hash_mask;
 309 }
 310
 311 static inline int rt_genid(struct net *net)
 312 {
 313         return atomic_read(&net->ipv4.rt_genid);
 314 }
 315
 316 #ifdef CONFIG_PROC_FS
/*
 * Private seq_file state for walking the route cache.
 * bucket: hash bucket currently being walked; the walk starts at
 *         rt_hash_mask and counts down, going negative once exhausted.
 * genid:  generation id sampled in rt_cache_seq_start() at position 0;
 *         only entries carrying this genid are reported.
 */
  317 struct rt_cache_iter_state {
  318         struct seq_net_private p;
  319         int bucket;
  320         int genid;
  321 };
 322
 323 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 324 {
 325         struct rt_cache_iter_state *st = seq->private;
 326         struct rtable *r = NULL;
 327
 328         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 329                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
 330                         continue;
 331                 rcu_read_lock_bh();
 332                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 333                 while (r) {
 334                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 335                             r->rt_genid == st->genid)
 336                                 return r;
 337                         r = rcu_dereference_bh(r->dst.rt_next);
 338                 }
 339                 rcu_read_unlock_bh();
 340         }
 341         return r;
 342 }
 343
 344 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 345                                           struct rtable *r)
 346 {
 347         struct rt_cache_iter_state *st = seq->private;
 348
 349         r = rcu_dereference_bh(r->dst.rt_next);
 350         while (!r) {
 351                 rcu_read_unlock_bh();
 352                 do {
 353                         if (--st->bucket < 0)
 354                                 return NULL;
 355                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
 356                 rcu_read_lock_bh();
 357                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 358         }
 359         return r;
 360 }
 361
 362 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 363                                         struct rtable *r)
 364 {
 365         struct rt_cache_iter_state *st = seq->private;
 366         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 367                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 368                         continue;
 369                 if (r->rt_genid == st->genid)
 370                         break;
 371         }
 372         return r;
 373 }
 374
 375 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 376 {
 377         struct rtable *r = rt_cache_get_first(seq);
 378
 379         if (r)
 380                 while (pos && (r = rt_cache_get_next(seq, r)))
 381                         --pos;
 382         return pos ? NULL : r;
 383 }
 384
 385 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 386 {
 387         struct rt_cache_iter_state *st = seq->private;
 388         if (*pos)
 389                 return rt_cache_get_idx(seq, *pos - 1);
 390         st->genid = rt_genid(seq_file_net(seq));
 391         return SEQ_START_TOKEN;
 392 }
 393
 394 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 395 {
 396         struct rtable *r;
 397
 398         if (v == SEQ_START_TOKEN)
 399                 r = rt_cache_get_first(seq);
 400         else
 401                 r = rt_cache_get_next(seq, v);
 402         ++*pos;
 403         return r;
 404 }
 405
 406 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 407 {
 408         if (v && v != SEQ_START_TOKEN)
 409                 rcu_read_unlock_bh();
 410 }
 411
 412 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 413 {
 414         if (v == SEQ_START_TOKEN)
 415                 seq_printf(seq, "%-127s\n",
 416                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 417                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 418                            "HHUptod\tSpecDst");
 419         else {
 420                 struct rtable *r = v;
 421                 struct neighbour *n;
 422                 int len, HHUptod;
 423
 424                 rcu_read_lock();
 425                 n = dst_get_neighbour_noref(&r->dst);
 426                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
 427                 rcu_read_unlock();
 428
 429                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 430                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 431                         r->dst.dev ? r->dst.dev->name : "*",
 432                         (__force u32)r->rt_dst,
 433                         (__force u32)r->rt_gateway,
 434                         r->rt_flags, atomic_read(&r->dst.__refcnt),
 435                         r->dst.__use, 0, (__force u32)r->rt_src,
 436                         dst_metric_advmss(&r->dst) + 40,
 437                         dst_metric(&r->dst, RTAX_WINDOW),
 438                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 439                               dst_metric(&r->dst, RTAX_RTTVAR)),
 440                         r->rt_key_tos,
 441                         -1,
 442                         HHUptod,
 443                         r->rt_spec_dst, &len);
 444
 445                 seq_printf(seq, "%*s\n", 127 - len, "");
 446         }
 447         return 0;
 448 }
 449
 450 static const struct seq_operations rt_cache_seq_ops = {
 451         .start  = rt_cache_seq_start,
 452         .next   = rt_cache_seq_next,
 453         .stop   = rt_cache_seq_stop,
 454         .show   = rt_cache_seq_show,
 455 };
 456
 457 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 458 {
 459         return seq_open_net(inode, file, &rt_cache_seq_ops,
 460                         sizeof(struct rt_cache_iter_state));
 461 }
 462
 463 static const struct file_operations rt_cache_seq_fops = {
 464         .owner   = THIS_MODULE,
 465         .open    = rt_cache_seq_open,
 466         .read    = seq_read,
 467         .llseek  = seq_lseek,
 468         .release = seq_release_net,
 469 };
 470
 471
 472 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 473 {
 474         int cpu;
 475
 476         if (*pos == 0)
 477                 return SEQ_START_TOKEN;
 478
 479         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 480                 if (!cpu_possible(cpu))
 481                         continue;
 482                 *pos = cpu+1;
 483                 return &per_cpu(rt_cache_stat, cpu);
 484         }
 485         return NULL;
 486 }
 487
 488 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 489 {
 490         int cpu;
 491
 492         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 493                 if (!cpu_possible(cpu))
 494                         continue;
 495                 *pos = cpu+1;
 496                 return &per_cpu(rt_cache_stat, cpu);
 497         }
 498         return NULL;
 499
 500 }
 501
 502 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 503 {
 504
 505 }
 506
 507 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 508 {
 509         struct rt_cache_stat *st = v;
 510
 511         if (v == SEQ_START_TOKEN) {
 512                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 513                 return 0;
 514         }
 515
 516         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 517                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 518                    dst_entries_get_slow(&ipv4_dst_ops),
 519                    st->in_hit,
 520                    st->in_slow_tot,
 521                    st->in_slow_mc,
 522                    st->in_no_route,
 523                    st->in_brd,
 524                    st->in_martian_dst,
 525                    st->in_martian_src,
 526
 527                    st->out_hit,
 528                    st->out_slow_tot,
 529                    st->out_slow_mc,
 530
 531                    st->gc_total,
 532                    st->gc_ignored,
 533                    st->gc_goal_miss,
 534                    st->gc_dst_overflow,
 535                    st->in_hlist_search,
 536                    st->out_hlist_search
 537                 );
 538         return 0;
 539 }
 540
 541 static const struct seq_operations rt_cpu_seq_ops = {
 542         .start  = rt_cpu_seq_start,
 543         .next   = rt_cpu_seq_next,
 544         .stop   = rt_cpu_seq_stop,
 545         .show   = rt_cpu_seq_show,
 546 };
 547
 548
 549 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 550 {
 551         return seq_open(file, &rt_cpu_seq_ops);
 552 }
 553
 554 static const struct file_operations rt_cpu_seq_fops = {
 555         .owner   = THIS_MODULE,
 556         .open    = rt_cpu_seq_open,
 557         .read    = seq_read,
 558         .llseek  = seq_lseek,
 559         .release = seq_release,
 560 };
 561
 562 #ifdef CONFIG_IP_ROUTE_CLASSID
 563 static int rt_acct_proc_show(struct seq_file *m, void *v)
 564 {
 565         struct ip_rt_acct *dst, *src;
 566         unsigned int i, j;
 567
 568         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 569         if (!dst)
 570                 return -ENOMEM;
 571
 572         for_each_possible_cpu(i) {
 573                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 574                 for (j = 0; j < 256; j++) {
 575                         dst[j].o_bytes   += src[j].o_bytes;
 576                         dst[j].o_packets += src[j].o_packets;
 577                         dst[j].i_bytes   += src[j].i_bytes;
 578                         dst[j].i_packets += src[j].i_packets;
 579                 }
 580         }
 581
 582         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 583         kfree(dst);
 584         return 0;
 585 }
 586
 587 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 588 {
 589         return single_open(file, rt_acct_proc_show, NULL);
 590 }
 591
 592 static const struct file_operations rt_acct_proc_fops = {
 593         .owner          = THIS_MODULE,
 594         .open           = rt_acct_proc_open,
 595         .read           = seq_read,
 596         .llseek         = seq_lseek,
 597         .release        = single_release,
 598 };
 599 #endif
 600
 601 static int __net_init ip_rt_do_proc_init(struct net *net)
 602 {
 603         struct proc_dir_entry *pde;
 604
 605         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 606                         &rt_cache_seq_fops);
 607         if (!pde)
 608                 goto err1;
 609
 610         pde = proc_create("rt_cache", S_IRUGO,
 611                           net->proc_net_stat, &rt_cpu_seq_fops);
 612         if (!pde)
 613                 goto err2;
 614
 615 #ifdef CONFIG_IP_ROUTE_CLASSID
 616         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 617         if (!pde)
 618                 goto err3;
 619 #endif
 620         return 0;
 621
 622 #ifdef CONFIG_IP_ROUTE_CLASSID
 623 err3:
 624         remove_proc_entry("rt_cache", net->proc_net_stat);
 625 #endif
 626 err2:
 627         remove_proc_entry("rt_cache", net->proc_net);
 628 err1:
 629         return -ENOMEM;
 630 }
 631
 632 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 633 {
 634         remove_proc_entry("rt_cache", net->proc_net_stat);
 635         remove_proc_entry("rt_cache", net->proc_net);
 636 #ifdef CONFIG_IP_ROUTE_CLASSID
 637         remove_proc_entry("rt_acct", net->proc_net);
 638 #endif
 639 }
 640
 641 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 642         .init = ip_rt_do_proc_init,
 643         .exit = ip_rt_do_proc_exit,
 644 };
 645
 646 static int __init ip_rt_proc_init(void)
 647 {
 648         return register_pernet_subsys(&ip_rt_proc_ops);
 649 }
 650
 651 #else
 652 static inline int ip_rt_proc_init(void)
 653 {
 654         return 0;
 655 }
 656 #endif /* CONFIG_PROC_FS */
 657
 658 static inline void rt_free(struct rtable *rt)
 659 {
 660         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 661 }
 662
 663 static inline void rt_drop(struct rtable *rt)
 664 {
 665         ip_rt_put(rt);
 666         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 667 }
 668
 669 static inline int rt_fast_clean(struct rtable *rth)
 670 {
 671         /* Kill broadcast/multicast entries very aggresively, if they
 672            collide in hash table with more useful entries */
 673         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 674                 rt_is_input_route(rth) && rth->dst.rt_next;
 675 }
 676
 677 static inline int rt_valuable(struct rtable *rth)
 678 {
 679         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 680                 (rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires);
 681 }
 682
 683 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 684 {
 685         unsigned long age;
 686         int ret = 0;
 687
 688         if (atomic_read(&rth->dst.__refcnt))
 689                 goto out;
 690
 691         age = jiffies - rth->dst.lastuse;
 692         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 693             (age <= tmo2 && rt_valuable(rth)))
 694                 goto out;
 695         ret = 1;
 696 out:    return ret;
 697 }
 698
 699 /* Bits of score are:
 700  * 31: very valuable
 701  * 30: not quite useless
 702  * 29..0: usage counter
 703  */
 704 static inline u32 rt_score(struct rtable *rt)
 705 {
 706         u32 score = jiffies - rt->dst.lastuse;
 707
 708         score = ~score & ~(3<<30);
 709
 710         if (rt_valuable(rt))
 711                 score |= (1<<31);
 712
 713         if (rt_is_output_route(rt) ||
 714             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 715                 score |= (1<<30);
 716
 717         return score;
 718 }
 719
 720 static inline bool rt_caching(const struct net *net)
 721 {
 722         return net->ipv4.current_rt_cache_rebuild_count <=
 723                 net->ipv4.sysctl_rt_cache_rebuild_count;
 724 }
 725
 726 static inline bool compare_hash_inputs(const struct rtable *rt1,
 727                                        const struct rtable *rt2)
 728 {
 729         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 730                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 731                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 732 }
 733
 734 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 735 {
 736         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 737                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 738                 (rt1->rt_mark ^ rt2->rt_mark) |
 739                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 740                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
 741                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
 742 }
 743
 744 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 745 {
 746         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 747 }
 748
 749 static inline int rt_is_expired(struct rtable *rth)
 750 {
 751         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 752 }
 753
  754 /*
  755  * Perform a full scan of the hash table and free all entries (only those
  756  * belonging to @net when @net is non-NULL).  Can be called by a softirq
  757  * or a process; in the latter case we reschedule when necessary.
  758  */
  759 static void rt_do_flush(struct net *net, int process_context)
  760 {
  761         unsigned int i;
  762         struct rtable *rth, *next;
  763
  764         for (i = 0; i <= rt_hash_mask; i++) {
  765                 struct rtable __rcu **pprev;
  766                 struct rtable *list;
  767
  768                 if (process_context && need_resched())
  769                         cond_resched();
/* Lockless peek: do not take the bucket lock for an empty chain. */
  770                 rth = rcu_access_pointer(rt_hash_table[i].chain);
  771                 if (!rth)
  772                         continue;
  773
  774                 spin_lock_bh(rt_hash_lock_addr(i));
  775
/*
 * Walk the chain under the bucket lock.  Matching entries are unlinked
 * and pushed onto the private 'list' (reusing their rt_next pointer);
 * all others stay in place and pprev moves past them.
 */
  776                 list = NULL;
  777                 pprev = &rt_hash_table[i].chain;
  778                 rth = rcu_dereference_protected(*pprev,
  779                         lockdep_is_held(rt_hash_lock_addr(i)));
  780
  781                 while (rth) {
  782                         next = rcu_dereference_protected(rth->dst.rt_next,
  783                                 lockdep_is_held(rt_hash_lock_addr(i)));
  784
  785                         if (!net ||
  786                             net_eq(dev_net(rth->dst.dev), net)) {
  787                                 rcu_assign_pointer(*pprev, next);
  788                                 rcu_assign_pointer(rth->dst.rt_next, list);
  789                                 list = rth;
  790                         } else {
  791                                 pprev = &rth->dst.rt_next;
  792                         }
  793                         rth = next;
  794                 }
  795
  796                 spin_unlock_bh(rt_hash_lock_addr(i));
  797
/* The unlinked entries are handed to rt_free() after the lock is dropped. */
  798                 for (; list; list = next) {
  799                         next = rcu_dereference_protected(list->dst.rt_next, 1);
  800                         rt_free(list);
  801                 }
  802         }
  803 }
 804
  805 /*
  806  * While freeing expired entries, we compute the average chain length
  807  * and its standard deviation, using fixed-point arithmetic.
  808  * This gives an estimate for rt_chain_length_max:
  809  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
  810  * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
  811  */
  812
/* ONE is the fixed-point representation of 1.0 (1 << FRACT_BITS). */
  813 #define FRACT_BITS 3
  814 #define ONE (1UL << FRACT_BITS)
 815
 816 /*
 817  * Given a hash chain and an item in this hash chain,
 818  * find if a previous entry has the same hash_inputs
 819  * (but differs on tos, mark or oif)
 820  * Returns 0 if an alias is found.
 821  * Returns ONE if rth has no alias before itself.
 822  */
 823 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 824 {
 825         const struct rtable *aux = head;
 826
 827         while (aux != rth) {
 828                 if (compare_hash_inputs(aux, rth))
 829                         return 0;
 830                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 831         }
 832         return ONE;
 833 }
 834
     /*
      * Scan a slice of the route cache hash table and free aged entries.
      *
      * The number of buckets visited ("goal") is the time elapsed since the
      * previous call scaled by the table size (delta << rt_hash_log), divided
      * by ip_rt_gc_timeout when that is > 1, and capped at rt_hash_mask + 1.
      * The scan resumes after the bucket remembered in the static "rover".
      *
      * In each non-empty bucket, taken under its hash lock, an entry is freed
      * when rt_is_expired() is true, when dst.expires is set and already in
      * the past, or when dst.expires is unset and rt_may_expire() allows it.
      * Entries that stay are added to a fixed-point chain length (aliases are
      * counted once); after the scan rt_chain_length_max is recomputed from
      * the average and standard deviation of the sampled lengths.
      */
 835 static void rt_check_expire(void)
 836 {
 837         static unsigned int rover;
 838         unsigned int i = rover, goal;
 839         struct rtable *rth;
 840         struct rtable __rcu **rthp;
 841         unsigned long samples = 0;
 842         unsigned long sum = 0, sum2 = 0;
 843         unsigned long delta;
 844         u64 mult;
 845
 846         delta = jiffies - expires_ljiffies;
 847         expires_ljiffies = jiffies;
 848         mult = ((u64)delta) << rt_hash_log;
 849         if (ip_rt_gc_timeout > 1)
 850                 do_div(mult, ip_rt_gc_timeout);
 851         goal = (unsigned int)mult;
 852         if (goal > rt_hash_mask)
 853                 goal = rt_hash_mask + 1;
 854         for (; goal > 0; goal--) {
 855                 unsigned long tmo = ip_rt_gc_timeout;
 856                 unsigned long length;
 857
 858                 i = (i + 1) & rt_hash_mask;
 859                 rthp = &rt_hash_table[i].chain;
 860
 861                 if (need_resched())
 862                         cond_resched();
 863
                     /* Every visited bucket is a sample, empty ones included */
 864                 samples++;
 865
 866                 if (rcu_dereference_raw(*rthp) == NULL)
 867                         continue;
 868                 length = 0;
 869                 spin_lock_bh(rt_hash_lock_addr(i));
 870                 while ((rth = rcu_dereference_protected(*rthp,
 871                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
 872                         prefetch(rth->dst.rt_next);
 873                         if (rt_is_expired(rth)) {
 874                                 *rthp = rth->dst.rt_next;
 875                                 rt_free(rth);
 876                                 continue;
 877                         }
 878                         if (rth->dst.expires) {
 879                                 /* Entry is expired even if it is in use */
 880                                 if (time_before_eq(jiffies, rth->dst.expires)) {
 881 nofree:
                                             /*
                                              * Entry is kept: halve the timeout
                                              * handed to rt_may_expire() for the
                                              * entries further down this chain.
                                              */
 882                                         tmo >>= 1;
 883                                         rthp = &rth->dst.rt_next;
 884                                         /*
 885                                          * We only count entries on
 886                                          * a chain with equal hash inputs once
 887                                          * so that entries for different QOS
 888                                          * levels, and other non-hash input
 889                                          * attributes don't unfairly skew
 890                                          * the length computation
 891                                          */
 892                                         length += has_noalias(rt_hash_table[i].chain, rth);
 893                                         continue;
 894                                 }
 895                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 896                                 goto nofree;
 897
 898                         /* Cleanup aged off entries. */
 899                         *rthp = rth->dst.rt_next;
 900                         rt_free(rth);
 901                 }
 902                 spin_unlock_bh(rt_hash_lock_addr(i));
 903                 sum += length;
 904                 sum2 += length*length;
 905         }
 906         if (samples) {
                     /* avg and sd are still in fixed point; shifted back below */
 907                 unsigned long avg = sum / samples;
 908                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 909                 rt_chain_length_max = max_t(unsigned long,
 910                                         ip_rt_gc_elasticity,
 911                                         (avg + 4*sd) >> FRACT_BITS);
 912         }
 913         rover = i;
 914 }
 915
 916 /*
 917  * rt_worker_func() is run in process context.
 918  * We call rt_check_expire() to scan part of the hash table, then
      * queue expires_work again so that it runs after ip_rt_gc_interval.
      * The work argument itself is not used.
 919  */
 920 static void rt_worker_func(struct work_struct *work)
 921 {
 922         rt_check_expire();
 923         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 924 }
 925
 926 /*
 927  * Perturbation of rt_genid by a small quantity [1..256]
 928  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 929  * many times (2^24) without giving recent rt_genid.
 930  * Jenkins hash is strong enough that little changes of rt_genid are OK.
      * inetpeer_invalidate_family() is also called for AF_INET.
 931  */
 932 static void rt_cache_invalidate(struct net *net)
 933 {
 934         unsigned char shuffle;
 935
 936         get_random_bytes(&shuffle, sizeof(shuffle));
             /* shuffle is one random byte (0..255), so the increment is 1..256 */
 937         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 938         inetpeer_invalidate_family(AF_INET);
 939 }
 940
 941 /*
 942  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 943  * delay >= 0 : invalidate & flush cache (can be long)
      *
      * Only the sign of delay is examined here.  The flush itself is done by
      * rt_do_flush(), which receives !in_softirq() as its second argument.
 944  */
 945 void rt_cache_flush(struct net *net, int delay)
 946 {
 947         rt_cache_invalidate(net);
 948         if (delay >= 0)
 949                 rt_do_flush(net, !in_softirq());
 950 }
 951
 952 /*
      * Flush entries left behind by earlier cache invalidations.  Unlike
      * rt_cache_flush(), this does not call rt_cache_invalidate() first.
      */
 953 void rt_cache_flush_batch(struct net *net)
 954 {
 955         rt_do_flush(net, !in_softirq());
 956 }
 957
     /*
      * Log a rate-limited "Route hash chain too long!" warning and invalidate
      * the route cache of net through rt_cache_invalidate().
      */
 958 static void rt_emergency_hash_rebuild(struct net *net)
 959 {
 960         net_warn_ratelimited("Route hash chain too long!\n");
 961         rt_cache_invalidate(net);
 962 }
 963
 964 /*
 965    Short description of GC goals.
 966
 967    We want to build an algorithm which will keep the routing cache
 968    at some equilibrium point, where the number of aged off entries
 969    is kept approximately equal to newly generated ones.
 970
 971    Current expiration strength is variable "expire".
 972    We try to adjust it dynamically, so that if networking
 973    is idle expire is large enough to keep enough warm entries,
 974    and when load increases it reduces to limit cache size.
 975  */
 976
     /*
      * Route cache garbage collector.
      *
      * Returns 0 in all cases except one: when the expiry goal was missed and
      * both the fast and the slow entry counts are still >= ip_rt_max_size,
      * "dst cache overflow" is logged (rate-limited) and 1 is returned.
      *
      * The ops argument is not referenced; ipv4_dst_ops is used directly.
      * expire, last_gc, rover and equilibrium are static and therefore carry
      * over from one invocation to the next.
      */
 977 static int rt_garbage_collect(struct dst_ops *ops)
 978 {
 979         static unsigned long expire = RT_GC_TIMEOUT;
 980         static unsigned long last_gc;
 981         static int rover;
 982         static int equilibrium;
 983         struct rtable *rth;
 984         struct rtable __rcu **rthp;
 985         unsigned long now = jiffies;
 986         int goal;
 987         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 988
 989         /*
 990          * Garbage collection is pretty expensive,
 991          * do not make it too frequently.
 992          */
 993
 994         RT_CACHE_STAT_INC(gc_total);
 995
 996         if (now - last_gc < ip_rt_gc_min_interval &&
 997             entries < ip_rt_max_size) {
 998                 RT_CACHE_STAT_INC(gc_ignored);
 999                 goto out;
1000         }
1001
1002         entries = dst_entries_get_slow(&ipv4_dst_ops);
1003         /* Calculate number of entries, which we want to expire now. */
1004         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1005         if (goal <= 0) {
                     /* Below elasticity * table size: aim at the equilibrium */
1006                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1007                         equilibrium = ipv4_dst_ops.gc_thresh;
1008                 goal = entries - equilibrium;
1009                 if (goal > 0) {
1010                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1011                         goal = entries - equilibrium;
1012                 }
1013         } else {
1014                 /* We are in dangerous area. Try to reduce cache really
1015                  * aggressively.
1016                  */
1017                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1018                 equilibrium = entries - goal;
1019         }
1020
1021         if (now - last_gc >= ip_rt_gc_min_interval)
1022                 last_gc = now;
1023
1024         if (goal <= 0) {
1025                 equilibrium += goal;
1026                 goto work_done;
1027         }
1028
1029         do {
1030                 int i, k;
1031
                     /* One pass over at most the whole table, starting after rover */
1032                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1033                         unsigned long tmo = expire;
1034
1035                         k = (k + 1) & rt_hash_mask;
1036                         rthp = &rt_hash_table[k].chain;
1037                         spin_lock_bh(rt_hash_lock_addr(k));
1038                         while ((rth = rcu_dereference_protected(*rthp,
1039                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1040                                 if (!rt_is_expired(rth) &&
1041                                         !rt_may_expire(rth, tmo, expire)) {
                                             /* Kept: later entries get half the timeout */
1042                                         tmo >>= 1;
1043                                         rthp = &rth->dst.rt_next;
1044                                         continue;
1045                                 }
1046                                 *rthp = rth->dst.rt_next;
1047                                 rt_free(rth);
1048                                 goal--;
1049                         }
1050                         spin_unlock_bh(rt_hash_lock_addr(k));
1051                         if (goal <= 0)
1052                                 break;
1053                 }
1054                 rover = k;
1055
1056                 if (goal <= 0)
1057                         goto work_done;
1058
1059                 /* Goal is not achieved. We stop process if:
1060
1061                    - if expire reduced to zero. Otherwise, expire is halved.
1062                    - if table is not full.
1063                    - if we are called from interrupt.
1064                    - jiffies check is just fallback/debug loop breaker.
1065                      We will not spin here for long time in any case.
1066                  */
1067
1068                 RT_CACHE_STAT_INC(gc_goal_miss);
1069
1070                 if (expire == 0)
1071                         break;
1072
1073                 expire >>= 1;
1074
1075                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1076                         goto out;
1077         } while (!in_softirq() && time_before_eq(jiffies, now));
1078
1079         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1080                 goto out;
1081         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1082                 goto out;
1083         net_warn_ratelimited("dst cache overflow\n");
1084         RT_CACHE_STAT_INC(gc_dst_overflow);
1085         return 1;
1086
1087 work_done:
             /* Goal reached: relax expire again, up to ip_rt_gc_timeout */
1088         expire += ip_rt_gc_min_interval;
1089         if (expire > ip_rt_gc_timeout ||
1090             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1091             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1092                 expire = ip_rt_gc_timeout;
1093 out:    return 0;
1094 }
1095
1096 /*
1097  * Counts the chain entries whose hash_inputs no earlier entry shares
1098  */
1099 static int slow_chain_length(const struct rtable *head)
1100 {
1101         const struct rtable *entry;
1102         int distinct = 0;
1103
1104         /* has_noalias() yields ONE per such entry, i.e. a fixed-point 1 */
1105         for (entry = head; entry;
1106              entry = rcu_dereference_protected(entry->dst.rt_next, 1))
1107                 distinct += has_noalias(head, entry);
1108         return distinct >> FRACT_BITS;
1109 }
1110
     /*
      * Find, or create, the neighbour entry to use for dst on dst->dev.
      *
      * The lookup key is chosen as follows:
      *  - the all-zero address when the device has IFF_LOOPBACK or
      *    IFF_POINTOPOINT set,
      *  - otherwise the route's gateway when rt_gateway is non-zero,
      *  - otherwise the address passed in daddr.
      *
      * Returns the entry found by __ipv4_neigh_lookup(), or else whatever
      * neigh_create() on arp_tbl returns (rt_bind_neighbour() checks that
      * result with IS_ERR()).
      */
1111 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1112 {
1113         static const __be32 inaddr_any = 0;
1114         struct net_device *dev = dst->dev;
1115         const __be32 *pkey = daddr;
1116         const struct rtable *rt;
1117         struct neighbour *n;
1118
1119         rt = (const struct rtable *) dst;
1120
1121         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1122                 pkey = &inaddr_any;
1123         else if (rt->rt_gateway)
1124                 pkey = (const __be32 *) &rt->rt_gateway;
1125
1126         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1127         if (n)
1128                 return n;
1129         return neigh_create(&arp_tbl, pkey, dev);
1130 }
1131
     /*
      * Look up the neighbour for rt's gateway and attach it to rt's dst.
      * Returns 0 on success, or the error carried by the lookup result.
      */
1132 static int rt_bind_neighbour(struct rtable *rt)
1133 {
1134         struct neighbour *neigh = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1135         if (!IS_ERR(neigh)) {
1136                 dst_set_neighbour(&rt->dst, neigh);
1137                 return 0;
1138         }
1139         return PTR_ERR(neigh);
1140 }
1141
     /*
      * Insert rt into bucket "hash" of the route cache, or reuse an entry
      * that is already there.
      *
      * - If caching is off for rt's namespace, rt gets DST_NOCACHE, is bound
      *   to a neighbour where applicable, and is returned without hashing.
      * - Otherwise the bucket is walked under its lock.  Expired entries are
      *   freed.  If an entry with the same keys and namespace exists it is
      *   moved to the front, rt is dropped and that entry is returned.
      * - While walking, the unreferenced entry with the lowest rt_score() is
      *   remembered; it is freed when the chain is longer than
      *   ip_rt_gc_elasticity.
      * - With no such candidate and an over-long chain, the cache is
      *   invalidated, the hash recomputed and the whole procedure restarted.
      * - If binding the neighbour fails with -ENOBUFS, one aggressive
      *   rt_garbage_collect() run is attempted (only when not in softirq,
      *   see "attempts") before giving up.
      *
      * When skb is non-NULL its dst is set to the returned route.
      * Returns rt, the already cached equivalent, or an ERR_PTR() value.
      */
1142 static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1143                                      struct sk_buff *skb, int ifindex)
1144 {
1145         struct rtable   *rth, *cand;
1146         struct rtable __rcu **rthp, **candp;
1147         unsigned long   now;
1148         u32             min_score;
1149         int             chain_length;
             /* 1 when not in softirq, else 0: number of GC retries allowed */
1150         int attempts = !in_softirq();
1151
1152 restart:
1153         chain_length = 0;
1154         min_score = ~(u32)0;
1155         cand = NULL;
1156         candp = NULL;
1157         now = jiffies;
1158
1159         if (!rt_caching(dev_net(rt->dst.dev))) {
1160                 /*
1161                  * If we're not caching, just tell the caller we
1162                  * were successful and don't touch the route.  The
1163                  * caller hold the sole reference to the cache entry, and
1164                  * it will be released when the caller is done with it.
1165                  * If we drop it here, the callers have no way to resolve routes
1166                  * when we're not caching.  Instead, just hand rt back, so
1167                  * the caller gets a single use out of the route
1168                  * Note that we do rt_free on this new route entry, so that
1169                  * once its refcount hits zero, we are still able to reap it
1170                  * (Thanks Alexey)
1171                  * Note: To avoid expensive rcu stuff for this uncached dst,
1172                  * we set DST_NOCACHE so that dst_release() can free dst without
1173                  * waiting a grace period.
1174                  */
1175
1176                 rt->dst.flags |= DST_NOCACHE;
1177                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1178                         int err = rt_bind_neighbour(rt);
1179                         if (err) {
1180                                 net_warn_ratelimited("Neighbour table failure & not caching routes\n");
1181                                 ip_rt_put(rt);
1182                                 return ERR_PTR(err);
1183                         }
1184                 }
1185
1186                 goto skip_hashing;
1187         }
1188
1189         rthp = &rt_hash_table[hash].chain;
1190
1191         spin_lock_bh(rt_hash_lock_addr(hash));
1192         while ((rth = rcu_dereference_protected(*rthp,
1193                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1194                 if (rt_is_expired(rth)) {
1195                         *rthp = rth->dst.rt_next;
1196                         rt_free(rth);
1197                         continue;
1198                 }
1199                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1200                         /* Put it first */
1201                         *rthp = rth->dst.rt_next;
1202                         /*
1203                          * Since lookup is lockfree, the deletion
1204                          * must be visible to another weakly ordered CPU before
1205                          * the insertion at the start of the hash chain.
1206                          */
1207                         rcu_assign_pointer(rth->dst.rt_next,
1208                                            rt_hash_table[hash].chain);
1209                         /*
1210                          * Since lookup is lockfree, the update writes
1211                          * must be ordered for consistency on SMP.
1212                          */
1213                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1214
1215                         dst_use(&rth->dst, now);
1216                         spin_unlock_bh(rt_hash_lock_addr(hash));
1217
1218                         rt_drop(rt);
1219                         if (skb)
1220                                 skb_dst_set(skb, &rth->dst);
1221                         return rth;
1222                 }
1223
                     /* Unreferenced entry: remember the lowest-scoring one */
1224                 if (!atomic_read(&rth->dst.__refcnt)) {
1225                         u32 score = rt_score(rth);
1226
1227                         if (score <= min_score) {
1228                                 cand = rth;
1229                                 candp = rthp;
1230                                 min_score = score;
1231                         }
1232                 }
1233
1234                 chain_length++;
1235
1236                 rthp = &rth->dst.rt_next;
1237         }
1238
1239         if (cand) {
1240                 /* ip_rt_gc_elasticity used to be average length of chain
1241                  * length, when exceeded gc becomes really aggressive.
1242                  *
1243                  * The second limit is less certain. At the moment it allows
1244                  * only 2 entries per bucket. We will see.
1245                  */
1246                 if (chain_length > ip_rt_gc_elasticity) {
1247                         *candp = cand->dst.rt_next;
1248                         rt_free(cand);
1249                 }
1250         } else {
                     /*
                      * Nothing can be evicted.  If the chain is too long even
                      * when aliases are counted once, invalidate the cache,
                      * rehash rt with the new genid and start over.
                      */
1251                 if (chain_length > rt_chain_length_max &&
1252                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1253                         struct net *net = dev_net(rt->dst.dev);
1254                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1255                         if (!rt_caching(net)) {
1256                                 pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1257                                         rt->dst.dev->name, num);
1258                         }
1259                         rt_emergency_hash_rebuild(net);
1260                         spin_unlock_bh(rt_hash_lock_addr(hash));
1261
1262                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1263                                         ifindex, rt_genid(net));
1264                         goto restart;
1265                 }
1266         }
1267
1268         /* Try to bind route to arp only if it is output
1269            route or unicast forwarding path.
1270          */
1271         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1272                 int err = rt_bind_neighbour(rt);
1273                 if (err) {
1274                         spin_unlock_bh(rt_hash_lock_addr(hash));
1275
1276                         if (err != -ENOBUFS) {
1277                                 rt_drop(rt);
1278                                 return ERR_PTR(err);
1279                         }
1280
1281                         /* Neighbour tables are full and nothing
1282                            can be released. Try to shrink route cache,
1283                            it is most likely it holds some neighbour records.
1284                          */
1285                         if (attempts-- > 0) {
                                     /*
                                      * Run the collector with elasticity 1 and
                                      * no minimum interval, then restore both.
                                      */
1286                                 int saved_elasticity = ip_rt_gc_elasticity;
1287                                 int saved_int = ip_rt_gc_min_interval;
1288                                 ip_rt_gc_elasticity     = 1;
1289                                 ip_rt_gc_min_interval   = 0;
1290                                 rt_garbage_collect(&ipv4_dst_ops);
1291                                 ip_rt_gc_min_interval   = saved_int;
1292                                 ip_rt_gc_elasticity     = saved_elasticity;
1293                                 goto restart;
1294                         }
1295
1296                         net_warn_ratelimited("Neighbour table overflow\n");
1297                         rt_drop(rt);
1298                         return ERR_PTR(-ENOBUFS);
1299                 }
1300         }
1301
1302         rt->dst.rt_next = rt_hash_table[hash].chain;
1303
1304         /*
1305          * Since lookup is lockfree, we must make sure
1306          * previous writes to rt are committed to memory
1307          * before making rt visible to other CPUS.
1308          */
1309         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1310
1311         spin_unlock_bh(rt_hash_lock_addr(hash));
1312
1313 skip_hashing:
1314         if (skb)
1315                 skb_dst_set(skb, &rt->dst);
1316         return rt;
1317 }
1318
1319 static atomic_t __rt_peer_genid = ATOMIC_INIT(0); /* incremented in ip_rt_redirect() */
1320
     /* Return the current value of the __rt_peer_genid counter. */
1321 static u32 rt_peer_genid(void)
1322 {
1323         return atomic_read(&__rt_peer_genid);
1324 }
1325
     /*
      * Attach an inet_peer for daddr to rt.
      *
      * Nothing is done when rt has no peer base or when inet_getpeer_v4()
      * (called with the create flag) yields no peer.  If rt_set_peer()
      * reports failure the peer is handed back with inet_putpeer();
      * otherwise rt->rt_peer_genid is set to the current rt_peer_genid().
      */
1326 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1327 {
1328         struct inet_peer_base *peer_base = inetpeer_base_ptr(rt->_peer);
1329         struct inet_peer *found;
1330
1331         if (!peer_base)
1332                 return;
1333
1334         found = inet_getpeer_v4(peer_base, daddr, create);
1335         if (!found)
1336                 return;
1337
1338         if (rt_set_peer(rt, found))
1339                 rt->rt_peer_genid = rt_peer_genid();
1340         else
1341                 inet_putpeer(found);
1342 }
1343
1344 /*
1345  * Peer allocation may fail only in serious out-of-memory conditions.  However
1346  * we still can generate some output.
1347  * Random ID selection looks a bit dangerous because we have no chances to
1348  * select ID being unique in a reasonable period of time.
1349  * But broken packet identifier may be better than no packet at all.
      *
      * The fallback ID is the low 16 bits of secure_ip_id() applied to the
      * previous fallback value XORed with the destination address.  The
      * previous value is static and is read and updated under ip_fb_id_lock.
1350  */
1351 static void ip_select_fb_ident(struct iphdr *iph)
1352 {
1353         static DEFINE_SPINLOCK(ip_fb_id_lock);
1354         static u32 ip_fallback_id;
1355         u32 salt;
1356
1357         spin_lock_bh(&ip_fb_id_lock);
1358         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1359         iph->id = htons(salt & 0xFFFF);
             /* Keep the full 32-bit value as input for the next call */
1360         ip_fallback_id = salt;
1361         spin_unlock_bh(&ip_fb_id_lock);
1362 }
1363
     /*
      * Fill in iph->id for a packet routed through dst.
      *
      * When dst is non-NULL and does not have DST_NOPEER set, the ID comes
      * from inet_getid() on the route's peer (obtained, or created, with
      * rt_get_peer_create()).  In every other case, including a failed peer
      * lookup, ip_select_fb_ident() supplies the ID.  A NULL dst is reported
      * with pr_debug() first.
      */
1364 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1365 {
1366         struct rtable *rt = (struct rtable *) dst;
1367
1368         if (rt && !(rt->dst.flags & DST_NOPEER)) {
1369                 struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst);
1370
1371                 /* If peer is attached to destination, it is never detached,
1372                    so that we need not to grab a lock to dereference it.
1373                  */
1374                 if (peer) {
1375                         iph->id = htons(inet_getid(peer, more));
1376                         return;
1377                 }
1378         } else if (!rt)
1379                 pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
1380
1381         ip_select_fb_ident(iph);
1382 }
1383 EXPORT_SYMBOL(__ip_select_ident);
1384
     /*
      * Remove rt from bucket "hash" of the route cache.
      *
      * With the bucket lock held, ip_rt_put(rt) is called and the chain is
      * then walked: rt itself, and any entry for which rt_is_expired() is
      * true, is unlinked and passed to rt_free().
      */
1385 static void rt_del(unsigned int hash, struct rtable *rt)
1386 {
1387         struct rtable __rcu **rthp;
1388         struct rtable *aux;
1389
1390         rthp = &rt_hash_table[hash].chain;
1391         spin_lock_bh(rt_hash_lock_addr(hash));
1392         ip_rt_put(rt);
1393         while ((aux = rcu_dereference_protected(*rthp,
1394                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1395                 if (aux == rt || rt_is_expired(aux)) {
1396                         *rthp = aux->dst.rt_next;
1397                         rt_free(aux);
1398                         continue;
1399                 }
1400                 rthp = &aux->dst.rt_next;
1401         }
1402         spin_unlock_bh(rt_hash_lock_addr(hash));
1403 }
1404
     /*
      * Apply the redirect gateway stored in peer to the route behind dst.
      *
      * The route's gateway is switched to peer->redirect_learned.a4 and a
      * neighbour is looked up for it.  If that lookup fails the old gateway
      * is restored and nothing else changes.  Otherwise the new neighbour
      * replaces dst._neighbour (the old one, if any, is released).  When the
      * new neighbour is not yet NUD_VALID, neigh_event_send() is called on
      * it; once it is valid the route is flagged RTCF_REDIRECTED and a
      * NETEVENT_NEIGH_UPDATE notification is sent.
      */
1405 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1406 {
1407         struct rtable *rt = (struct rtable *) dst;
1408         __be32 orig_gw = rt->rt_gateway;
1409         struct neighbour *n, *old_n;
1410
1411         dst_confirm(&rt->dst);
1412
             /* Set the new gateway first: the lookup below is keyed on it */
1413         rt->rt_gateway = peer->redirect_learned.a4;
1414
1415         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1416         if (IS_ERR(n)) {
1417                 rt->rt_gateway = orig_gw;
1418                 return;
1419         }
1420         old_n = xchg(&rt->dst._neighbour, n);
1421         if (old_n)
1422                 neigh_release(old_n);
1423         if (!(n->nud_state & NUD_VALID)) {
1424                 neigh_event_send(n, NULL);
1425         } else {
1426                 rt->rt_flags |= RTCF_REDIRECTED;
1427                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1428         }
1429 }
1430
1431 /* called in rcu_read_lock() section */
     /*
      * Handle a redirect that says: for daddr, use new_gw instead of old_gw.
      *
      * The redirect is rejected when new_gw equals old_gw, when the device
      * does not accept redirects, or when new_gw is a multicast, limited
      * broadcast or zeronet address.  On non-shared media new_gw must be
      * on-link relative to old_gw and, with secure redirects enabled, pass
      * ip_fib_check_default(); on shared media new_gw must be of type
      * RTN_UNICAST.
      *
      * For an accepted redirect the cache is searched with every combination
      * of source key {saddr, 0} and interface key {dev->ifindex, 0}.  Each
      * matching, unexpired output route on dev that currently goes through
      * old_gw has new_gw recorded in its peer (bumping __rt_peer_genid when
      * the value changed) and is then updated by check_peer_redir().
      *
      * Rejected redirects are only logged, and only when
      * CONFIG_IP_ROUTE_VERBOSE is set and the device logs martians.
      */
1432 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1433                     __be32 saddr, struct net_device *dev)
1434 {
1435         int s, i;
1436         struct in_device *in_dev = __in_dev_get_rcu(dev);
1437         __be32 skeys[2] = { saddr, 0 };
1438         int    ikeys[2] = { dev->ifindex, 0 };
1439         struct inet_peer *peer;
1440         struct net *net;
1441
1442         if (!in_dev)
1443                 return;
1444
1445         net = dev_net(dev);
1446         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1447             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1448             ipv4_is_zeronet(new_gw))
1449                 goto reject_redirect;
1450
1451         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1452                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1453                         goto reject_redirect;
1454                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1455                         goto reject_redirect;
1456         } else {
1457                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1458                         goto reject_redirect;
1459         }
1460
1461         for (s = 0; s < 2; s++) {
1462                 for (i = 0; i < 2; i++) {
1463                         unsigned int hash;
1464                         struct rtable __rcu **rthp;
1465                         struct rtable *rt;
1466
1467                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1468
1469                         rthp = &rt_hash_table[hash].chain;
1470
1471                         while ((rt = rcu_dereference(*rthp)) != NULL) {
                                     /* Advance first so "continue" moves on */
1472                                 rthp = &rt->dst.rt_next;
1473
1474                                 if (rt->rt_key_dst != daddr ||
1475                                     rt->rt_key_src != skeys[s] ||
1476                                     rt->rt_oif != ikeys[i] ||
1477                                     rt_is_input_route(rt) ||
1478                                     rt_is_expired(rt) ||
1479                                     !net_eq(dev_net(rt->dst.dev), net) ||
1480                                     rt->dst.error ||
1481                                     rt->dst.dev != dev ||
1482                                     rt->rt_gateway != old_gw)
1483                                         continue;
1484
1485                                 peer = rt_get_peer_create(rt, rt->rt_dst);
1486                                 if (peer) {
1487                                         if (peer->redirect_learned.a4 != new_gw) {
1488                                                 peer->redirect_learned.a4 = new_gw;
1489                                                 atomic_inc(&__rt_peer_genid);
1490                                         }
1491                                         check_peer_redir(&rt->dst, peer);
1492                                 }
1493                         }
1494                 }
1495         }
1496         return;
1497
1498 reject_redirect:
1499 #ifdef CONFIG_IP_ROUTE_VERBOSE
1500         if (IN_DEV_LOG_MARTIANS(in_dev))
1501                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1502                                      "  Advised path = %pI4 -> %pI4\n",
1503                                      &old_gw, dev->name, &new_gw,
1504                                      &saddr, &daddr);
1505 #endif
1506         ;
1507 }
1508
     /*
      * Returns true only when peer->pmtu_expires was non-zero, that time has
      * been reached (time_after_eq() against jiffies), and the cmpxchg() here
      * replaced the sampled value with 0.
      */
1509 static bool peer_pmtu_expired(struct inet_peer *peer)
1510 {
1511         unsigned long snap = ACCESS_ONCE(peer->pmtu_expires);
1512
1513         if (!snap || !time_after_eq(jiffies, snap))
1514                 return false;
1515         return cmpxchg(&peer->pmtu_expires, snap, 0) == snap;
1516 }
1517
     /*
      * Returns true only when peer->pmtu_expires was non-zero and the
      * cmpxchg() here replaced the sampled value with 0.
      */
1518 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1519 {
1520         unsigned long snap = ACCESS_ONCE(peer->pmtu_expires);
1521
1522         return snap ?
1523                cmpxchg(&peer->pmtu_expires, snap, 0) == snap : false;
1524 }
1525
     /*
      * React to negative advice about dst.
      *
      * - NULL dst: returned unchanged.
      * - dst->obsolete > 0: ip_rt_put() is called and NULL is returned.
      * - Route flagged RTCF_REDIRECTED: it is removed from the cache with
      *   rt_del() and NULL is returned.
      * - Otherwise, if the route has a peer whose PMTU has expired, the MTU
      *   metric is reset to peer->pmtu_orig; dst is returned.
      */
1526 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1527 {
1528         struct rtable *route = (struct rtable *)dst;
1529         struct inet_peer *peer;
1530         if (!route)
1531                 return dst;
1532         if (dst->obsolete > 0) {
1533                 ip_rt_put(route);
1534                 return NULL;
1535         }
1536         if (route->rt_flags & RTCF_REDIRECTED) {
1537                 rt_del(rt_hash(route->rt_key_dst, route->rt_key_src,
1538                                route->rt_oif, rt_genid(dev_net(dst->dev))),
1539                        route);
1540                 return NULL;
1541         }
1542         if (!rt_has_peer(route))
1543                 return dst;
1544         peer = rt_peer_ptr(route);
1545         if (peer_pmtu_expired(peer))
1546                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1547         return dst;
1548 }
1549
1550 /*
1551  * Algorithm:
1552  *      1. The first ip_rt_redirect_number redirects are sent
1553  *         with exponential backoff, then we stop sending them at all,
1554  *         assuming that the host ignores our redirects.
1555  *      2. If we did not see packets requiring redirects
1556  *         during ip_rt_redirect_silence, we assume that the host
1557  *         forgot redirected route and start to send redirects again.
1558  *
1559  * This algorithm is much cheaper and more intelligent than dumb load limiting
1560  * in icmp.c.
1561  *
1562  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1563  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1564  */
1565
     /*
      * Send an ICMP host redirect for skb, subject to the rate limiting
      * described above.
      *
      * Nothing is sent when the route's device has no in_device or has
      * sending of redirects disabled.  When no peer can be obtained for the
      * route the redirect is sent without any rate limiting.  Otherwise
      * peer->rate_tokens counts the redirects sent so far and
      * peer->rate_last records the time of the last one.
      */
1566 void ip_rt_send_redirect(struct sk_buff *skb)
1567 {
1568         struct rtable *rt = skb_rtable(skb);
1569         struct in_device *in_dev;
1570         struct inet_peer *peer;
1571         int log_martians;
1572
1573         rcu_read_lock();
1574         in_dev = __in_dev_get_rcu(rt->dst.dev);
1575         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1576                 rcu_read_unlock();
1577                 return;
1578         }
             /* Sample the setting now; in_dev is not used after the unlock */
1579         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1580         rcu_read_unlock();
1581
1582         peer = rt_get_peer_create(rt, rt->rt_dst);
1583         if (!peer) {
1584                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1585                 return;
1586         }
1587
1588         /* No redirected packets during ip_rt_redirect_silence;
1589          * reset the algorithm.
1590          */
1591         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1592                 peer->rate_tokens = 0;
1593
1594         /* Too many ignored redirects; do not send anything
1595          * set dst.rate_last to the last seen redirected packet.
1596          */
1597         if (peer->rate_tokens >= ip_rt_redirect_number) {
1598                 peer->rate_last = jiffies;
1599                 return;
1600         }
1601
1602         /* Check for load limit; set rate_last to the latest sent
1603          * redirect.
1604          */
1605         if (peer->rate_tokens == 0 ||
1606             time_after(jiffies,
1607                        (peer->rate_last +
1608                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1609                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1610                 peer->rate_last = jiffies;
1611                 ++peer->rate_tokens;
1612 #ifdef CONFIG_IP_ROUTE_VERBOSE
1613                 if (log_martians &&
1614                     peer->rate_tokens == ip_rt_redirect_number)
1615                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1616                                              &ip_hdr(skb)->saddr, rt->rt_iif,
1617                                              &rt->rt_dst, &rt->rt_gateway);
1618 #endif
1619         }
1620 }
1621
1622 static int ip_error(struct sk_buff *skb)
1623 {
1624         struct rtable *rt = skb_rtable(skb);
1625         struct inet_peer *peer;
1626         unsigned long now;
1627         bool send;
1628         int code;
1629
1630         switch (rt->dst.error) {
1631         case EINVAL:
1632         default:
1633                 goto out;
1634         case EHOSTUNREACH:
1635                 code = ICMP_HOST_UNREACH;
1636                 break;
1637         case ENETUNREACH:
1638                 code = ICMP_NET_UNREACH;
1639                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1640                                 IPSTATS_MIB_INNOROUTES);
1641                 break;
1642         case EACCES:
1643                 code = ICMP_PKT_FILTERED;
1644                 break;
1645         }
1646
1647         peer = rt_get_peer_create(rt, rt->rt_dst);
1648
1649         send = true;
1650         if (peer) {
1651                 now = jiffies;
1652                 peer->rate_tokens += now - peer->rate_last;
1653                 if (peer->rate_tokens > ip_rt_error_burst)
1654                         peer->rate_tokens = ip_rt_error_burst;
1655                 peer->rate_last = now;
1656                 if (peer->rate_tokens >= ip_rt_error_cost)
1657                         peer->rate_tokens -= ip_rt_error_cost;
1658                 else
1659                         send = false;
1660         }
1661         if (send)
1662                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1663
1664 out:    kfree_skb(skb);
1665         return 0;
1666 }
1667
1668 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1669 {
1670         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1671
1672         if (!expires)
1673                 return;
1674         if (time_before(jiffies, expires)) {
1675                 u32 orig_dst_mtu = dst_mtu(dst);
1676                 if (peer->pmtu_learned < orig_dst_mtu) {
1677                         if (!peer->pmtu_orig)
1678                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1679                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1680                 }
1681         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1682                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1683 }
1684
1685 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1686 {
1687         struct rtable *rt = (struct rtable *) dst;
1688         struct inet_peer *peer;
1689
1690         dst_confirm(dst);
1691
1692         peer = rt_get_peer_create(rt, rt->rt_dst);
1693         if (peer) {
1694                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1695
1696                 if (mtu < ip_rt_min_pmtu)
1697                         mtu = ip_rt_min_pmtu;
1698                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1699
1700                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1701                         if (!pmtu_expires)
1702                                 pmtu_expires = 1UL;
1703
1704                         peer->pmtu_learned = mtu;
1705                         peer->pmtu_expires = pmtu_expires;
1706
1707                         atomic_inc(&__rt_peer_genid);
1708                         rt->rt_peer_genid = rt_peer_genid();
1709                 }
1710                 check_peer_pmtu(dst, peer);
1711         }
1712 }
1713
1714
1715 static void ipv4_validate_peer(struct rtable *rt)
1716 {
1717         if (rt->rt_peer_genid != rt_peer_genid()) {
1718                 struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst);
1719
1720                 if (peer) {
1721                         check_peer_pmtu(&rt->dst, peer);
1722
1723                         if (peer->redirect_learned.a4 &&
1724                             peer->redirect_learned.a4 != rt->rt_gateway)
1725                                 check_peer_redir(&rt->dst, peer);
1726                 }
1727
1728                 rt->rt_peer_genid = rt_peer_genid();
1729         }
1730 }
1731
1732 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1733 {
1734         struct rtable *rt = (struct rtable *) dst;
1735
1736         if (rt_is_expired(rt))
1737                 return NULL;
1738         ipv4_validate_peer(rt);
1739         return dst;
1740 }
1741
1742 static void ipv4_dst_destroy(struct dst_entry *dst)
1743 {
1744         struct rtable *rt = (struct rtable *) dst;
1745
1746         if (rt->fi) {
1747                 fib_info_put(rt->fi);
1748                 rt->fi = NULL;
1749         }
1750         if (rt_has_peer(rt)) {
1751                 struct inet_peer *peer = rt_peer_ptr(rt);
1752                 inet_putpeer(peer);
1753         }
1754 }
1755
1756
1757 static void ipv4_link_failure(struct sk_buff *skb)
1758 {
1759         struct rtable *rt;
1760
1761         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1762
1763         rt = skb_rtable(skb);
1764         if (rt && rt_has_peer(rt)) {
1765                 struct inet_peer *peer = rt_peer_ptr(rt);
1766                 if (peer_pmtu_cleaned(peer))
1767                         dst_metric_set(&rt->dst, RTAX_MTU, peer->pmtu_orig);
1768         }
1769 }
1770
1771 static int ip_rt_bug(struct sk_buff *skb)
1772 {
1773         pr_debug("%s: %pI4 -> %pI4, %s\n",
1774                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1775                  skb->dev ? skb->dev->name : "?");
1776         kfree_skb(skb);
1777         WARN_ON(1);
1778         return 0;
1779 }
1780
1781 /*
1782    We do not cache source address of outgoing interface,
1783    because it is used only by IP RR, TS and SRR options,
1784    so that it out of fast path.
1785
1786    BTW remember: "addr" is allowed to be not aligned
1787    in IP options!
1788  */
1789
1790 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1791 {
1792         __be32 src;
1793
1794         if (rt_is_output_route(rt))
1795                 src = ip_hdr(skb)->saddr;
1796         else {
1797                 struct fib_result res;
1798                 struct flowi4 fl4;
1799                 struct iphdr *iph;
1800
1801                 iph = ip_hdr(skb);
1802
1803                 memset(&fl4, 0, sizeof(fl4));
1804                 fl4.daddr = iph->daddr;
1805                 fl4.saddr = iph->saddr;
1806                 fl4.flowi4_tos = RT_TOS(iph->tos);
1807                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1808                 fl4.flowi4_iif = skb->dev->ifindex;
1809                 fl4.flowi4_mark = skb->mark;
1810
1811                 rcu_read_lock();
1812                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1813                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1814                 else
1815                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1816                                         RT_SCOPE_UNIVERSE);
1817                 rcu_read_unlock();
1818         }
1819         memcpy(addr, &src, 4);
1820 }
1821
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid one 16-bit half at a time: a
 * half of tclassid that is still zero is filled from the matching half
 * of @tag, a half that is already set is left untouched.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if ((rt->dst.tclassid & low_half) == 0)
                rt->dst.tclassid |= tag & low_half;
        if ((rt->dst.tclassid & high_half) == 0)
                rt->dst.tclassid |= tag & high_half;
}
#endif
1831
1832 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1833 {
1834         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1835
1836         if (advmss == 0) {
1837                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1838                                ip_rt_min_advmss);
1839                 if (advmss > 65535 - 40)
1840                         advmss = 65535 - 40;
1841         }
1842         return advmss;
1843 }
1844
1845 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1846 {
1847         const struct rtable *rt = (const struct rtable *) dst;
1848         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1849
1850         if (mtu && rt_is_output_route(rt))
1851                 return mtu;
1852
1853         mtu = dst->dev->mtu;
1854
1855         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1856
1857                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1858                         mtu = 576;
1859         }
1860
1861         if (mtu > IP_MAX_MTU)
1862                 mtu = IP_MAX_MTU;
1863
1864         return mtu;
1865 }
1866
1867 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1868                             struct fib_info *fi)
1869 {
1870         struct inet_peer_base *base;
1871         struct inet_peer *peer;
1872         int create = 0;
1873
1874         /* If a peer entry exists for this destination, we must hook
1875          * it up in order to get at cached metrics.
1876          */
1877         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1878                 create = 1;
1879
1880         base = inetpeer_base_ptr(rt->_peer);
1881         BUG_ON(!base);
1882
1883         peer = inet_getpeer_v4(base, rt->rt_dst, create);
1884         if (peer) {
1885                 __rt_set_peer(rt, peer);
1886                 rt->rt_peer_genid = rt_peer_genid();
1887                 if (inet_metrics_new(peer))
1888                         memcpy(peer->metrics, fi->fib_metrics,
1889                                sizeof(u32) * RTAX_MAX);
1890                 dst_init_metrics(&rt->dst, peer->metrics, false);
1891
1892                 check_peer_pmtu(&rt->dst, peer);
1893
1894                 if (peer->redirect_learned.a4 &&
1895                     peer->redirect_learned.a4 != rt->rt_gateway) {
1896                         rt->rt_gateway = peer->redirect_learned.a4;
1897                         rt->rt_flags |= RTCF_REDIRECTED;
1898                 }
1899         } else {
1900                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1901                         rt->fi = fi;
1902                         atomic_inc(&fi->fib_clntref);
1903                 }
1904                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1905         }
1906 }
1907
1908 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1909                            const struct fib_result *res,
1910                            struct fib_info *fi, u16 type, u32 itag)
1911 {
1912         struct dst_entry *dst = &rt->dst;
1913
1914         if (fi) {
1915                 if (FIB_RES_GW(*res) &&
1916                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1917                         rt->rt_gateway = FIB_RES_GW(*res);
1918                 rt_init_metrics(rt, fl4, fi);
1919 #ifdef CONFIG_IP_ROUTE_CLASSID
1920                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1921 #endif
1922         }
1923
1924         if (dst_mtu(dst) > IP_MAX_MTU)
1925                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1926         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1927                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1928
1929 #ifdef CONFIG_IP_ROUTE_CLASSID
1930 #ifdef CONFIG_IP_MULTIPLE_TABLES
1931         set_class_tag(rt, fib_rules_tclass(res));
1932 #endif
1933         set_class_tag(rt, itag);
1934 #endif
1935 }
1936
1937 static struct rtable *rt_dst_alloc(struct net_device *dev,
1938                                    bool nopolicy, bool noxfrm)
1939 {
1940         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1941                          DST_HOST |
1942                          (nopolicy ? DST_NOPOLICY : 0) |
1943                          (noxfrm ? DST_NOXFRM : 0));
1944 }
1945
1946 /* called in rcu_read_lock() section */
1947 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1948                                 u8 tos, struct net_device *dev, int our)
1949 {
1950         unsigned int hash;
1951         struct rtable *rth;
1952         __be32 spec_dst;
1953         struct in_device *in_dev = __in_dev_get_rcu(dev);
1954         u32 itag = 0;
1955         int err;
1956
1957         /* Primary sanity checks. */
1958
1959         if (in_dev == NULL)
1960                 return -EINVAL;
1961
1962         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1963             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1964                 goto e_inval;
1965
1966         if (ipv4_is_zeronet(saddr)) {
1967                 if (!ipv4_is_local_multicast(daddr))
1968                         goto e_inval;
1969                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1970         } else {
1971                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1972                                           &itag);
1973                 if (err < 0)
1974                         goto e_err;
1975         }
1976         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1977                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1978         if (!rth)
1979                 goto e_nobufs;
1980
1981 #ifdef CONFIG_IP_ROUTE_CLASSID
1982         rth->dst.tclassid = itag;
1983 #endif
1984         rth->dst.output = ip_rt_bug;
1985
1986         rth->rt_key_dst = daddr;
1987         rth->rt_key_src = saddr;
1988         rth->rt_genid   = rt_genid(dev_net(dev));
1989         rth->rt_flags   = RTCF_MULTICAST;
1990         rth->rt_type    = RTN_MULTICAST;
1991         rth->rt_key_tos = tos;
1992         rth->rt_dst     = daddr;
1993         rth->rt_src     = saddr;
1994         rth->rt_route_iif = dev->ifindex;
1995         rth->rt_iif     = dev->ifindex;
1996         rth->rt_oif     = 0;
1997         rth->rt_mark    = skb->mark;
1998         rth->rt_gateway = daddr;
1999         rth->rt_spec_dst= spec_dst;
2000         rth->rt_peer_genid = 0;
2001         rt_init_peer(rth, dev_net(dev)->ipv4.peers);
2002         rth->fi = NULL;
2003         if (our) {
2004                 rth->dst.input= ip_local_deliver;
2005                 rth->rt_flags |= RTCF_LOCAL;
2006         }
2007
2008 #ifdef CONFIG_IP_MROUTE
2009         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2010                 rth->dst.input = ip_mr_input;
2011 #endif
2012         RT_CACHE_STAT_INC(in_slow_mc);
2013
2014         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2015         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2016         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2017
2018 e_nobufs:
2019         return -ENOBUFS;
2020 e_inval:
2021         return -EINVAL;
2022 e_err:
2023         return err;
2024 }
2025
2026
2027 static void ip_handle_martian_source(struct net_device *dev,
2028                                      struct in_device *in_dev,
2029                                      struct sk_buff *skb,
2030                                      __be32 daddr,
2031                                      __be32 saddr)
2032 {
2033         RT_CACHE_STAT_INC(in_martian_src);
2034 #ifdef CONFIG_IP_ROUTE_VERBOSE
2035         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2036                 /*
2037                  *      RFC1812 recommendation, if source is martian,
2038                  *      the only hint is MAC header.
2039                  */
2040                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2041                         &daddr, &saddr, dev->name);
2042                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2043                         print_hex_dump(KERN_WARNING, "ll header: ",
2044                                        DUMP_PREFIX_OFFSET, 16, 1,
2045                                        skb_mac_header(skb),
2046                                        dev->hard_header_len, true);
2047                 }
2048         }
2049 #endif
2050 }
2051
2052 /* called in rcu_read_lock() section */
2053 static int __mkroute_input(struct sk_buff *skb,
2054                            const struct fib_result *res,
2055                            struct in_device *in_dev,
2056                            __be32 daddr, __be32 saddr, u32 tos,
2057                            struct rtable **result)
2058 {
2059         struct rtable *rth;
2060         int err;
2061         struct in_device *out_dev;
2062         unsigned int flags = 0;
2063         __be32 spec_dst;
2064         u32 itag;
2065
2066         /* get a working reference to the output device */
2067         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2068         if (out_dev == NULL) {
2069                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2070                 return -EINVAL;
2071         }
2072
2073
2074         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2075                                   in_dev->dev, &spec_dst, &itag);
2076         if (err < 0) {
2077                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2078                                          saddr);
2079
2080                 goto cleanup;
2081         }
2082
2083         if (err)
2084                 flags |= RTCF_DIRECTSRC;
2085
2086         if (out_dev == in_dev && err &&
2087             (IN_DEV_SHARED_MEDIA(out_dev) ||
2088              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2089                 flags |= RTCF_DOREDIRECT;
2090
2091         if (skb->protocol != htons(ETH_P_IP)) {
2092                 /* Not IP (i.e. ARP). Do not create route, if it is
2093                  * invalid for proxy arp. DNAT routes are always valid.
2094                  *
2095                  * Proxy arp feature have been extended to allow, ARP
2096                  * replies back to the same interface, to support
2097                  * Private VLAN switch technologies. See arp.c.
2098                  */
2099                 if (out_dev == in_dev &&
2100                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2101                         err = -EINVAL;
2102                         goto cleanup;
2103                 }
2104         }
2105
2106         rth = rt_dst_alloc(out_dev->dev,
2107                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2108                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2109         if (!rth) {
2110                 err = -ENOBUFS;
2111                 goto cleanup;
2112         }
2113
2114         rth->rt_key_dst = daddr;
2115         rth->rt_key_src = saddr;
2116         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2117         rth->rt_flags = flags;
2118         rth->rt_type = res->type;
2119         rth->rt_key_tos = tos;
2120         rth->rt_dst     = daddr;
2121         rth->rt_src     = saddr;
2122         rth->rt_route_iif = in_dev->dev->ifindex;
2123         rth->rt_iif     = in_dev->dev->ifindex;
2124         rth->rt_oif     = 0;
2125         rth->rt_mark    = skb->mark;
2126         rth->rt_gateway = daddr;
2127         rth->rt_spec_dst= spec_dst;
2128         rth->rt_peer_genid = 0;
2129         rt_init_peer(rth, &res->table->tb_peers);
2130         rth->fi = NULL;
2131
2132         rth->dst.input = ip_forward;
2133         rth->dst.output = ip_output;
2134
2135         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2136
2137         *result = rth;
2138         err = 0;
2139  cleanup:
2140         return err;
2141 }
2142
2143 static int ip_mkroute_input(struct sk_buff *skb,
2144                             struct fib_result *res,
2145                             const struct flowi4 *fl4,
2146                             struct in_device *in_dev,
2147                             __be32 daddr, __be32 saddr, u32 tos)
2148 {
2149         struct rtable *rth = NULL;
2150         int err;
2151         unsigned int hash;
2152
2153 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2154         if (res->fi && res->fi->fib_nhs > 1)
2155                 fib_select_multipath(res);
2156 #endif
2157
2158         /* create a routing cache entry */
2159         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2160         if (err)
2161                 return err;
2162
2163         /* put it into the cache */
2164         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2165                        rt_genid(dev_net(rth->dst.dev)));
2166         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2167         if (IS_ERR(rth))
2168                 return PTR_ERR(rth);
2169         return 0;
2170 }
2171
2172 /*
2173  *      NOTE. We drop all the packets that has local source
2174  *      addresses, because every properly looped back packet
2175  *      must have correct destination already attached by output routine.
2176  *
2177  *      Such approach solves two big problems:
2178  *      1. Not simplex devices are handled properly.
2179  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2180  *      called with rcu_read_lock()
2181  */
2182
2183 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2184                                u8 tos, struct net_device *dev)
2185 {
2186         struct fib_result res;
2187         struct in_device *in_dev = __in_dev_get_rcu(dev);
2188         struct flowi4   fl4;
2189         unsigned int    flags = 0;
2190         u32             itag = 0;
2191         struct rtable   *rth;
2192         unsigned int    hash;
2193         __be32          spec_dst;
2194         int             err = -EINVAL;
2195         struct net    *net = dev_net(dev);
2196
2197         /* IP on this device is disabled. */
2198
2199         if (!in_dev)
2200                 goto out;
2201
2202         /* Check for the most weird martians, which can be not detected
2203            by fib_lookup.
2204          */
2205
2206         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2207             ipv4_is_loopback(saddr))
2208                 goto martian_source;
2209
2210         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2211                 goto brd_input;
2212
2213         /* Accept zero addresses only to limited broadcast;
2214          * I even do not know to fix it or not. Waiting for complains :-)
2215          */
2216         if (ipv4_is_zeronet(saddr))
2217                 goto martian_source;
2218
2219         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2220                 goto martian_destination;
2221
2222         /*
2223          *      Now we are ready to route packet.
2224          */
2225         fl4.flowi4_oif = 0;
2226         fl4.flowi4_iif = dev->ifindex;
2227         fl4.flowi4_mark = skb->mark;
2228         fl4.flowi4_tos = tos;
2229         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2230         fl4.daddr = daddr;
2231         fl4.saddr = saddr;
2232         err = fib_lookup(net, &fl4, &res);
2233         if (err != 0) {
2234                 if (!IN_DEV_FORWARD(in_dev))
2235                         goto e_hostunreach;
2236                 goto no_route;
2237         }
2238
2239         RT_CACHE_STAT_INC(in_slow_tot);
2240
2241         if (res.type == RTN_BROADCAST)
2242                 goto brd_input;
2243
2244         if (res.type == RTN_LOCAL) {
2245                 err = fib_validate_source(skb, saddr, daddr, tos,
2246                                           net->loopback_dev->ifindex,
2247                                           dev, &spec_dst, &itag);
2248                 if (err < 0)
2249                         goto martian_source_keep_err;
2250                 if (err)
2251                         flags |= RTCF_DIRECTSRC;
2252                 spec_dst = daddr;
2253                 goto local_input;
2254         }
2255
2256         if (!IN_DEV_FORWARD(in_dev))
2257                 goto e_hostunreach;
2258         if (res.type != RTN_UNICAST)
2259                 goto martian_destination;
2260
2261         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2262 out:    return err;
2263
2264 brd_input:
2265         if (skb->protocol != htons(ETH_P_IP))
2266                 goto e_inval;
2267
2268         if (ipv4_is_zeronet(saddr))
2269                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2270         else {
2271                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2272                                           &itag);
2273                 if (err < 0)
2274                         goto martian_source_keep_err;
2275                 if (err)
2276                         flags |= RTCF_DIRECTSRC;
2277         }
2278         flags |= RTCF_BROADCAST;
2279         res.type = RTN_BROADCAST;
2280         RT_CACHE_STAT_INC(in_brd);
2281
2282 local_input:
2283         rth = rt_dst_alloc(net->loopback_dev,
2284                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2285         if (!rth)
2286                 goto e_nobufs;
2287
2288         rth->dst.input= ip_local_deliver;
2289         rth->dst.output= ip_rt_bug;
2290 #ifdef CONFIG_IP_ROUTE_CLASSID
2291         rth->dst.tclassid = itag;
2292 #endif
2293
2294         rth->rt_key_dst = daddr;
2295         rth->rt_key_src = saddr;
2296         rth->rt_genid = rt_genid(net);
2297         rth->rt_flags   = flags|RTCF_LOCAL;
2298         rth->rt_type    = res.type;
2299         rth->rt_key_tos = tos;
2300         rth->rt_dst     = daddr;
2301         rth->rt_src     = saddr;
2302 #ifdef CONFIG_IP_ROUTE_CLASSID
2303         rth->dst.tclassid = itag;
2304 #endif
2305         rth->rt_route_iif = dev->ifindex;
2306         rth->rt_iif     = dev->ifindex;
2307         rth->rt_oif     = 0;
2308         rth->rt_mark    = skb->mark;
2309         rth->rt_gateway = daddr;
2310         rth->rt_spec_dst= spec_dst;
2311         rth->rt_peer_genid = 0;
2312         rt_init_peer(rth, net->ipv4.peers);
2313         rth->fi = NULL;
2314         if (res.type == RTN_UNREACHABLE) {
2315                 rth->dst.input= ip_error;
2316                 rth->dst.error= -err;
2317                 rth->rt_flags   &= ~RTCF_LOCAL;
2318         }
2319         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2320         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2321         err = 0;
2322         if (IS_ERR(rth))
2323                 err = PTR_ERR(rth);
2324         goto out;
2325
2326 no_route:
2327         RT_CACHE_STAT_INC(in_no_route);
2328         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2329         res.type = RTN_UNREACHABLE;
2330         if (err == -ESRCH)
2331                 err = -ENETUNREACH;
2332         goto local_input;
2333
2334         /*
2335          *      Do not cache martian addresses: they should be logged (RFC1812)
2336          */
2337 martian_destination:
2338         RT_CACHE_STAT_INC(in_martian_dst);
2339 #ifdef CONFIG_IP_ROUTE_VERBOSE
2340         if (IN_DEV_LOG_MARTIANS(in_dev))
2341                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2342                                      &daddr, &saddr, dev->name);
2343 #endif
2344
2345 e_hostunreach:
2346         err = -EHOSTUNREACH;
2347         goto out;
2348
2349 e_inval:
2350         err = -EINVAL;
2351         goto out;
2352
2353 e_nobufs:
2354         err = -ENOBUFS;
2355         goto out;
2356
2357 martian_source:
2358         err = -EINVAL;
2359 martian_source_keep_err:
2360         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2361         goto out;
2362 }
2363
2364 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2365                            u8 tos, struct net_device *dev, bool noref)
2366 {
2367         struct rtable   *rth;
2368         unsigned int    hash;
2369         int iif = dev->ifindex;
2370         struct net *net;
2371         int res;
2372
2373         net = dev_net(dev);
2374
2375         rcu_read_lock();
2376
2377         if (!rt_caching(net))
2378                 goto skip_cache;
2379
2380         tos &= IPTOS_RT_MASK;
2381         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2382
2383         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2384              rth = rcu_dereference(rth->dst.rt_next)) {
2385                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2386                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2387                      (rth->rt_route_iif ^ iif) |
2388                      (rth->rt_key_tos ^ tos)) == 0 &&
2389                     rth->rt_mark == skb->mark &&
2390                     net_eq(dev_net(rth->dst.dev), net) &&
2391                     !rt_is_expired(rth)) {
2392                         ipv4_validate_peer(rth);
2393                         if (noref) {
2394                                 dst_use_noref(&rth->dst, jiffies);
2395                                 skb_dst_set_noref(skb, &rth->dst);
2396                         } else {
2397                                 dst_use(&rth->dst, jiffies);
2398                                 skb_dst_set(skb, &rth->dst);
2399                         }
2400                         RT_CACHE_STAT_INC(in_hit);
2401                         rcu_read_unlock();
2402                         return 0;
2403                 }
2404                 RT_CACHE_STAT_INC(in_hlist_search);
2405         }
2406
2407 skip_cache:
2408         /* Multicast recognition logic is moved from route cache to here.
2409            The problem was that too many Ethernet cards have broken/missing
2410            hardware multicast filters :-( As result the host on multicasting
2411            network acquires a lot of useless route cache entries, sort of
2412            SDR messages from all the world. Now we try to get rid of them.
2413            Really, provided software IP multicast filter is organized
2414            reasonably (at least, hashed), it does not result in a slowdown
2415            comparing with route cache reject entries.
2416            Note, that multicast routers are not affected, because
2417            route cache entry is created eventually.
2418          */
2419         if (ipv4_is_multicast(daddr)) {
2420                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2421
2422                 if (in_dev) {
2423                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2424                                                   ip_hdr(skb)->protocol);
2425                         if (our
2426 #ifdef CONFIG_IP_MROUTE
2427                                 ||
2428                             (!ipv4_is_local_multicast(daddr) &&
2429                              IN_DEV_MFORWARD(in_dev))
2430 #endif
2431                            ) {
2432                                 int res = ip_route_input_mc(skb, daddr, saddr,
2433                                                             tos, dev, our);
2434                                 rcu_read_unlock();
2435                                 return res;
2436                         }
2437                 }
2438                 rcu_read_unlock();
2439                 return -EINVAL;
2440         }
2441         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2442         rcu_read_unlock();
2443         return res;
2444 }
2445 EXPORT_SYMBOL(ip_route_input_common);
2446
2447 /*
      * Build the route cache entry (struct rtable) for an output flow.
      *
      * res:         FIB lookup result; its fi, type, prefixlen and table
      *              members are read here.
      * fl4:         flow being routed; supplies daddr, saddr, mark and proto.
      * orig_daddr, orig_saddr, orig_oif, orig_rtos:
      *              key values stored in the entry's rt_key_dst, rt_key_src,
      *              rt_oif and rt_key_tos fields.
      * dev_out:     output device the entry is allocated for.
      * flags:       initial RTCF_* flags; further flags are OR-ed in below.
      *
      * Returns the new entry, ERR_PTR(-EINVAL) when the source/destination/
      * device combination is rejected, or ERR_PTR(-ENOBUFS) when
      * rt_dst_alloc() fails.
      *
      * called with rcu_read_lock()
      */
2448 static struct rtable *__mkroute_output(const struct fib_result *res,
2449                                        const struct flowi4 *fl4,
2450                                        __be32 orig_daddr, __be32 orig_saddr,
2451                                        int orig_oif, __u8 orig_rtos,
2452                                        struct net_device *dev_out,
2453                                        unsigned int flags)
2454 {
2455         struct fib_info *fi = res->fi;
2456         struct in_device *in_dev;
2457         u16 type = res->type;
2458         struct rtable *rth;
2459
             /* A loopback source address is only accepted on a loopback device. */
2460         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2461                 return ERR_PTR(-EINVAL);
2462
             /* The class of the destination address overrides the type taken
              * from the FIB result; a zeronet destination is rejected.
              */
2463         if (ipv4_is_lbcast(fl4->daddr))
2464                 type = RTN_BROADCAST;
2465         else if (ipv4_is_multicast(fl4->daddr))
2466                 type = RTN_MULTICAST;
2467         else if (ipv4_is_zeronet(fl4->daddr))
2468                 return ERR_PTR(-EINVAL);
2469
2470         if (dev_out->flags & IFF_LOOPBACK)
2471                 flags |= RTCF_LOCAL;
2472
2473         in_dev = __in_dev_get_rcu(dev_out);
2474         if (!in_dev)
2475                 return ERR_PTR(-EINVAL);
2476
2477         if (type == RTN_BROADCAST) {
2478                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
                     /* fi is cleared, so rt_set_nexthop() below gets no fib_info. */
2479                 fi = NULL;
2480         } else if (type == RTN_MULTICAST) {
2481                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
                     /* RTCF_LOCAL is dropped again when ip_check_mc_rcu()
                      * returns zero for this device/group/source/protocol.
                      */
2482                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2483                                      fl4->flowi4_proto))
2484                         flags &= ~RTCF_LOCAL;
2485                 /* If multicast route do not exist use
2486                  * default one, but do not gateway in this case.
2487                  * Yes, it is hack.
2488                  */
2489                 if (fi && res->prefixlen < 4)
2490                         fi = NULL;
2491         }
2492
2493         rth = rt_dst_alloc(dev_out,
2494                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2495                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2496         if (!rth)
2497                 return ERR_PTR(-ENOBUFS);
2498
2499         rth->dst.output = ip_output;
2500
             /* rt_key_* and rt_oif hold the caller's original key values, while
              * rt_dst/rt_src hold the addresses from the resolved flow.
              */
2501         rth->rt_key_dst = orig_daddr;
2502         rth->rt_key_src = orig_saddr;
2503         rth->rt_genid = rt_genid(dev_net(dev_out));
2504         rth->rt_flags   = flags;
2505         rth->rt_type    = type;
2506         rth->rt_key_tos = orig_rtos;
2507         rth->rt_dst     = fl4->daddr;
2508         rth->rt_src     = fl4->saddr;
2509         rth->rt_route_iif = 0;
2510         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2511         rth->rt_oif     = orig_oif;
2512         rth->rt_mark    = fl4->flowi4_mark;
2513         rth->rt_gateway = fl4->daddr;
2514         rth->rt_spec_dst= fl4->saddr;
2515         rth->rt_peer_genid = 0;
             /* Peer base: the FIB table's tb_peers when the result names a
              * table, otherwise the per-namespace ipv4.peers.
              */
2516         rt_init_peer(rth, (res->table ?
2517                            &res->table->tb_peers :
2518                            dev_net(dev_out)->ipv4.peers));
2519         rth->fi = NULL;
2520
2521         RT_CACHE_STAT_INC(out_slow_tot);
2522
             /* rt_spec_dst ends up as fl4->daddr for local routes, but is set
              * back to fl4->saddr below for broadcast/multicast routes.
              */
2523         if (flags & RTCF_LOCAL) {
2524                 rth->dst.input = ip_local_deliver;
2525                 rth->rt_spec_dst = fl4->daddr;
2526         }
2527         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2528                 rth->rt_spec_dst = fl4->saddr;
2529                 if (flags & RTCF_LOCAL &&
2530                     !(dev_out->flags & IFF_LOOPBACK)) {
2531                         rth->dst.output = ip_mc_output;
2532                         RT_CACHE_STAT_INC(out_slow_mc);
2533                 }
2534 #ifdef CONFIG_IP_MROUTE
2535                 if (type == RTN_MULTICAST) {
2536                         if (IN_DEV_MFORWARD(in_dev) &&
2537                             !ipv4_is_local_multicast(fl4->daddr)) {
2538                                 rth->dst.input = ip_mr_input;
2539                                 rth->dst.output = ip_mc_output;
2540                         }
2541                 }
2542 #endif
2543         }
2544
2545         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2546
2547         return rth;
2548 }
2549
2550 /*
2551  * Major route resolver routine.
      *
      * Resolves the output route for flow fl4 in namespace net without
      * consulting the route cache, builds the entry with __mkroute_output()
      * and inserts it with rt_intern_hash().
      *
      * fl4 is updated in place: flowi4_iif, flowi4_tos and flowi4_scope are
      * always set, and saddr, daddr and flowi4_oif are filled in as they
      * get resolved.  The caller's original daddr/saddr/oif are saved first
      * and used as the cache key.
      *
      * Returns the route from rt_intern_hash(), or an ERR_PTR:
      *   -EINVAL       multicast/limited-broadcast/zeronet source, or a
      *                 source for which __ip_dev_find() finds no device
      *   -ENODEV       flowi4_oif does not name a device
      *   -ENETUNREACH  output device is down or has no in_device, or the
      *                 FIB lookup failed and no output interface was given
      * plus whatever __mkroute_output() returns.
      *
2552  * Takes rcu_read_lock() itself for the duration of the lookup.
2553  */
2554
2555 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2556 {
2557         struct net_device *dev_out = NULL;
2558         __u8 tos = RT_FL_TOS(fl4);
2559         unsigned int flags = 0;
2560         struct fib_result res;
2561         struct rtable *rth;
2562         __be32 orig_daddr;
2563         __be32 orig_saddr;
2564         int orig_oif;
2565
2566         res.fi          = NULL;
2567         res.table       = NULL;
2568 #ifdef CONFIG_IP_MULTIPLE_TABLES
2569         res.r           = NULL;
2570 #endif
2571
             /* Remember the caller's key before fl4 is modified below. */
2572         orig_daddr = fl4->daddr;
2573         orig_saddr = fl4->saddr;
2574         orig_oif = fl4->flowi4_oif;
2575
2576         fl4->flowi4_iif = net->loopback_dev->ifindex;
2577         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2578         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2579                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2580
2581         rcu_read_lock();
2582         if (fl4->saddr) {
2583                 rth = ERR_PTR(-EINVAL);
2584                 if (ipv4_is_multicast(fl4->saddr) ||
2585                     ipv4_is_lbcast(fl4->saddr) ||
2586                     ipv4_is_zeronet(fl4->saddr))
2587                         goto out;
2588
2589                 /* I removed check for oif == dev_out->oif here.
2590                    It was wrong for two reasons:
2591                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2592                       is assigned to multiple interfaces.
2593                    2. Moreover, we are allowed to send packets with saddr
2594                       of another iface. --ANK
2595                  */
2596
2597                 if (fl4->flowi4_oif == 0 &&
2598                     (ipv4_is_multicast(fl4->daddr) ||
2599                      ipv4_is_lbcast(fl4->daddr))) {
2600                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2601                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2602                         if (dev_out == NULL)
2603                                 goto out;
2604
2605                         /* Special hack: user can direct multicasts
2606                            and limited broadcast via necessary interface
2607                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2608                            This hack is not just for fun, it allows
2609                            vic,vat and friends to work.
2610                            They bind socket to loopback, set ttl to zero
2611                            and expect that it will work.
2612                            From the viewpoint of routing cache they are broken,
2613                            because we are not allowed to build multicast path
2614                            with loopback source addr (look, routing cache
2615                            cannot know, that ttl is zero, so that packet
2616                            will not leave this host and route is valid).
2617                            Luckily, this hack is good workaround.
2618                          */
2619
2620                         fl4->flowi4_oif = dev_out->ifindex;
2621                         goto make_route;
2622                 }
2623
2624                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2625                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2626                         if (!__ip_dev_find(net, fl4->saddr, false))
2627                                 goto out;
2628                 }
2629         }
2630
2631
2632         if (fl4->flowi4_oif) {
2633                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2634                 rth = ERR_PTR(-ENODEV);
2635                 if (dev_out == NULL)
2636                         goto out;
2637
2638                 /* RACE: Check return value of inet_select_addr instead. */
2639                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2640                         rth = ERR_PTR(-ENETUNREACH);
2641                         goto out;
2642                 }
2643                 if (ipv4_is_local_multicast(fl4->daddr) ||
2644                     ipv4_is_lbcast(fl4->daddr)) {
2645                         if (!fl4->saddr)
2646                                 fl4->saddr = inet_select_addr(dev_out, 0,
2647                                                               RT_SCOPE_LINK);
2648                         goto make_route;
2649                 }
                     /* Pick a source address from the output device only when
                      * the caller did not supply one.  A caller-provided
                      * saddr (already validated above) must not be replaced,
                      * matching every other source selection in this function.
                      */
2650                 if (!fl4->saddr) {
2651                         if (ipv4_is_multicast(fl4->daddr))
2652                                 fl4->saddr = inet_select_addr(dev_out, 0,
2653                                                               fl4->flowi4_scope);
2654                         else if (!fl4->daddr)
2655                                 fl4->saddr = inet_select_addr(dev_out, 0,
2656                                                               RT_SCOPE_HOST);
2657                 }
2658         }
2659
             /* No destination: route to the source address, or to loopback
              * when neither is set, via the loopback device.
              */
2660         if (!fl4->daddr) {
2661                 fl4->daddr = fl4->saddr;
2662                 if (!fl4->daddr)
2663                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2664                 dev_out = net->loopback_dev;
2665                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2666                 res.type = RTN_LOCAL;
2667                 flags |= RTCF_LOCAL;
2668                 goto make_route;
2669         }
2670
2671         if (fib_lookup(net, fl4, &res)) {
2672                 res.fi = NULL;
2673                 res.table = NULL;
2674                 if (fl4->flowi4_oif) {
2675                         /* Apparently, routing tables are wrong. Assume,
2676                            that the destination is on link.
2677
2678                            WHY? DW.
2679                            Because we are allowed to send to iface
2680                            even if it has NO routes and NO assigned
2681                            addresses. When oif is specified, routing
2682                            tables are looked up with only one purpose:
2683                            to catch if destination is gatewayed, rather than
2684                            direct. Moreover, if MSG_DONTROUTE is set,
2685                            we send packet, ignoring both routing tables
2686                            and ifaddr state. --ANK
2687
2688
2689                            We could make it even if oif is unknown,
2690                            likely IPv6, but we do not.
2691                          */
2692
2693                         if (fl4->saddr == 0)
2694                                 fl4->saddr = inet_select_addr(dev_out, 0,
2695                                                               RT_SCOPE_LINK);
2696                         res.type = RTN_UNICAST;
2697                         goto make_route;
2698                 }
2699                 rth = ERR_PTR(-ENETUNREACH);
2700                 goto out;
2701         }
2702
             /* Locally delivered destination: output goes through loopback. */
2703         if (res.type == RTN_LOCAL) {
2704                 if (!fl4->saddr) {
2705                         if (res.fi->fib_prefsrc)
2706                                 fl4->saddr = res.fi->fib_prefsrc;
2707                         else
2708                                 fl4->saddr = fl4->daddr;
2709                 }
2710                 dev_out = net->loopback_dev;
2711                 fl4->flowi4_oif = dev_out->ifindex;
2712                 res.fi = NULL;
2713                 flags |= RTCF_LOCAL;
2714                 goto make_route;
2715         }
2716
2717 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2718         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2719                 fib_select_multipath(&res);
2720         else
2721 #endif
2722         if (!res.prefixlen &&
2723             res.table->tb_num_default > 1 &&
2724             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2725                 fib_select_default(&res);
2726
2727         if (!fl4->saddr)
2728                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2729
2730         dev_out = FIB_RES_DEV(res);
2731         fl4->flowi4_oif = dev_out->ifindex;
2732
2733
2734 make_route:
2735         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2736                                tos, dev_out, flags);
2737         if (!IS_ERR(rth)) {
2738                 unsigned int hash;
2739
                     /* Hash on the caller's original key, not the resolved one. */
2740                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2741                                rt_genid(dev_net(dev_out)));
2742                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2743         }
2744
2745 out:
2746         rcu_read_unlock();
2747         return rth;
2748 }
2749
     /*
      * Look up the output route for flow flp4 in namespace net.
      *
      * When rt_caching(net) is true the hash chain selected by
      * daddr/saddr/oif/genid is searched first; on a hit the cached entry
      * is returned and any zero saddr/daddr in flp4 is filled in from it.
      * Otherwise, or on a miss, the lookup falls through to
      * ip_route_output_slow(), whose return value is passed back unchanged.
      */
2750 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2751 {
2752         struct rtable *rth;
2753         unsigned int hash;
2754
2755         if (!rt_caching(net))
2756                 goto slow_output;
2757
2758         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2759
2760         rcu_read_lock_bh();
2761         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2762                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
                     /* A hit needs matching dst/src/oif/mark keys, an output
                      * route, TOS equal under IPTOS_RT_MASK | RTO_ONLINK, the
                      * same namespace and an entry that is not expired.
                      */
2763                 if (rth->rt_key_dst == flp4->daddr &&
2764                     rth->rt_key_src == flp4->saddr &&
2765                     rt_is_output_route(rth) &&
2766                     rth->rt_oif == flp4->flowi4_oif &&
2767                     rth->rt_mark == flp4->flowi4_mark &&
2768                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2769                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2770                     net_eq(dev_net(rth->dst.dev), net) &&
2771                     !rt_is_expired(rth)) {
2772                         ipv4_validate_peer(rth);
2773                         dst_use(&rth->dst, jiffies);
2774                         RT_CACHE_STAT_INC(out_hit);
2775                         rcu_read_unlock_bh();
                             /* Report the resolved addresses back to the caller. */
2776                         if (!flp4->saddr)
2777                                 flp4->saddr = rth->rt_src;
2778                         if (!flp4->daddr)
2779                                 flp4->daddr = rth->rt_dst;
2780                         return rth;
2781                 }
2782                 RT_CACHE_STAT_INC(out_hlist_search);
2783         }
2784         rcu_read_unlock_bh();
2785
2786 slow_output:
2787         return ip_route_output_slow(net, flp4);
2788 }
2789 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2790
     /*
      * .check callback of ipv4_dst_blackhole_ops: ignores both arguments and
      * always returns NULL.
      * NOTE(review): presumably NULL tells the caller the cached dst is no
      * longer usable - confirm against the dst_ops.check contract.
      */
2791 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2792 {
2793         return NULL;
2794 }
2795
     /*
      * .mtu callback of ipv4_dst_blackhole_ops: report the raw RTAX_MTU
      * metric of the dst when it is non-zero, else the MTU of dst->dev.
      */
2796 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2797 {
2798         unsigned int metric_mtu = dst_metric_raw(dst, RTAX_MTU);
2799
2800         return metric_mtu ? metric_mtu : dst->dev->mtu;
2801 }
2802
     /*
      * .update_pmtu callback of ipv4_dst_blackhole_ops: intentionally empty,
      * so the mtu argument is ignored and the dst is left untouched.
      */
2803 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2804 {
2805 }
2806
     /*
      * .cow_metrics callback of ipv4_dst_blackhole_ops: ignores its arguments
      * and always returns NULL.
      * NOTE(review): presumably this means no writable metrics copy is ever
      * handed out for a blackhole dst - confirm against dst_ops.cow_metrics.
      */
2807 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2808                                           unsigned long old)
2809 {
2810         return NULL;
2811 }
2812
     /*
      * dst_ops used for the entries allocated in ipv4_blackhole_route().
      * The check, update_pmtu and cow_metrics callbacks are the blackhole
      * stubs defined above (return NULL / do nothing / return NULL); the
      * remaining callbacks are the ipv4_* functions named in the initializer.
      */
2813 static struct dst_ops ipv4_dst_blackhole_ops = {
2814         .family                 =       AF_INET,
2815         .protocol               =       cpu_to_be16(ETH_P_IP),
2816         .destroy                =       ipv4_dst_destroy,
2817         .check                  =       ipv4_blackhole_dst_check,
2818         .mtu                    =       ipv4_blackhole_mtu,
2819         .default_advmss         =       ipv4_default_advmss,
2820         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2821         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2822         .neigh_lookup           =       ipv4_neigh_lookup,
2823 };
2824
     /*
      * Create a blackhole copy of the route behind dst_orig.
      *
      * A new entry is allocated with ipv4_dst_blackhole_ops; its input and
      * output handlers are dst_discard, and metrics, device, key fields,
      * flags, addresses, peer and fib_info are carried over from the
      * original route.  The rt_genid is taken from net.
      *
      * The reference on dst_orig is released in every case.
      *
      * Returns the new dst, or ERR_PTR(-ENOMEM) when dst_alloc() fails.
      */
2825 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2826 {
2827         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2828         struct rtable *ort = (struct rtable *) dst_orig;
2829
2830         if (rt) {
2831                 struct dst_entry *new = &rt->dst;
2832
2833                 new->__use = 1;
2834                 new->input = dst_discard;
2835                 new->output = dst_discard;
2836                 dst_copy_metrics(new, &ort->dst);
2837
                     /* The copy holds its own reference on the device. */
2838                 new->dev = ort->dst.dev;
2839                 if (new->dev)
2840                         dev_hold(new->dev);
2841
2842                 rt->rt_key_dst = ort->rt_key_dst;
2843                 rt->rt_key_src = ort->rt_key_src;
2844                 rt->rt_key_tos = ort->rt_key_tos;
2845                 rt->rt_route_iif = ort->rt_route_iif;
2846                 rt->rt_iif = ort->rt_iif;
2847                 rt->rt_oif = ort->rt_oif;
2848                 rt->rt_mark = ort->rt_mark;
2849
2850                 rt->rt_genid = rt_genid(net);
2851                 rt->rt_flags = ort->rt_flags;
2852                 rt->rt_type = ort->rt_type;
2853                 rt->rt_dst = ort->rt_dst;
2854                 rt->rt_src = ort->rt_src;
2855                 rt->rt_gateway = ort->rt_gateway;
2856                 rt->rt_spec_dst = ort->rt_spec_dst;
2857                 rt_transfer_peer(rt, ort);
                     /* Share the fib_info and take a client reference on it. */
2858                 rt->fi = ort->fi;
2859                 if (rt->fi)
2860                         atomic_inc(&rt->fi->fib_clntref);
2861
                     /* NOTE(review): dst_free() is called on the entry that is
                      * returned below; presumably the reference taken in
                      * dst_alloc() keeps it alive - confirm against dst_free().
                      */
2862                 dst_free(new);
2863         }
2864
2865         dst_release(dst_orig);
2866
2867         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2868 }
2869
     /*
      * Resolve the output route for flp4 and, when the flow names a
      * protocol, pass the result through xfrm_lookup().
      *
      * Returns the value of __ip_route_output_key() unchanged (including an
      * ERR_PTR) when that lookup failed or flp4->flowi4_proto is zero;
      * otherwise the result of xfrm_lookup() cast to struct rtable *.
      */
2870 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2871                                     struct sock *sk)
2872 {
2873         struct rtable *route;
2874
2875         route = __ip_route_output_key(net, flp4);
2876         /* Lookup errors and protocol-less flows bypass xfrm_lookup(). */
2877         if (IS_ERR(route) || !flp4->flowi4_proto)
2878                 return route;
2879
2880         /* Otherwise the result of xfrm_lookup() replaces the route. */
2881         return (struct rtable *) xfrm_lookup(net, &route->dst,
2882                                              flowi4_to_flowi(flp4),
2883                                              sk, 0);
2884 }
2885 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2886
     /*
      * Append a netlink route message describing the route attached to skb
      * (skb_rtable(skb)) to that same skb.
      *
      * net:     namespace, used for the multicast-forwarding check.
      * skb:     buffer the message is written into.
      * pid/seq/event/flags: passed to nlmsg_put() for the message header.
      * nowait:  passed to ipmr_get_route(); also selects how its errors
      *          are handled below.
      *
      * Returns the value of nlmsg_end() on success, -EMSGSIZE when the
      * header or an attribute does not fit (the partial message is
      * cancelled), or 0 when ipmr_get_route() returns 0 and nowait is unset.
      */
2887 static int rt_fill_info(struct net *net,
2888                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2889                         int nowait, unsigned int flags)
2890 {
2891         struct rtable *rt = skb_rtable(skb);
2892         struct rtmsg *r;
2893         struct nlmsghdr *nlh;
2894         unsigned long expires = 0;
2895         u32 id = 0, ts = 0, tsage = 0, error;
2896
2897         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2898         if (nlh == NULL)
2899                 return -EMSGSIZE;
2900
2901         r = nlmsg_data(nlh);
2902         r->rtm_family    = AF_INET;
2903         r->rtm_dst_len  = 32;
2904         r->rtm_src_len  = 0;
2905         r->rtm_tos      = rt->rt_key_tos;
2906         r->rtm_table    = RT_TABLE_MAIN;
2907         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2908                 goto nla_put_failure;
2909         r->rtm_type     = rt->rt_type;
2910         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2911         r->rtm_protocol = RTPROT_UNSPEC;
             /* Only the bits above the low 16 of rt_flags are exported. */
2912         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2913         if (rt->rt_flags & RTCF_NOTIFY)
2914                 r->rtm_flags |= RTM_F_NOTIFY;
2915
2916         if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2917                 goto nla_put_failure;
2918         if (rt->rt_key_src) {
2919                 r->rtm_src_len = 32;
2920                 if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2921                         goto nla_put_failure;
2922         }
2923         if (rt->dst.dev &&
2924             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2925                 goto nla_put_failure;
2926 #ifdef CONFIG_IP_ROUTE_CLASSID
2927         if (rt->dst.tclassid &&
2928             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2929                 goto nla_put_failure;
2930 #endif
             /* RTA_PREFSRC: rt_spec_dst for input routes; for output routes
              * rt_src, and only when it differs from the key source.
              */
2931         if (rt_is_input_route(rt)) {
2932                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst))
2933                         goto nla_put_failure;
2934         } else if (rt->rt_src != rt->rt_key_src) {
2935                 if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2936                         goto nla_put_failure;
2937         }
2938         if (rt->rt_dst != rt->rt_gateway &&
2939             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2940                 goto nla_put_failure;
2941
2942         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2943                 goto nla_put_failure;
2944
2945         if (rt->rt_mark &&
2946             nla_put_be32(skb, RTA_MARK, rt->rt_mark))
2947                 goto nla_put_failure;
2948
             /* Collect the cacheinfo values; id, ts, tsage and expires stay 0
              * when the route has no peer.
              */
2949         error = rt->dst.error;
2950         if (rt_has_peer(rt)) {
2951                 const struct inet_peer *peer = rt_peer_ptr(rt);
2952                 inet_peer_refcheck(peer);
2953                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2954                 if (peer->tcp_ts_stamp) {
2955                         ts = peer->tcp_ts;
2956                         tsage = get_seconds() - peer->tcp_ts_stamp;
2957                 }
                     /* Convert the absolute expiry into time remaining from
                      * now, or 0 when it has already passed.
                      */
2958                 expires = ACCESS_ONCE(peer->pmtu_expires);
2959                 if (expires) {
2960                         if (time_before(jiffies, expires))
2961                                 expires -= jiffies;
2962                         else
2963                                 expires = 0;
2964                 }
2965         }
2966
2967         if (rt_is_input_route(rt)) {
2968 #ifdef CONFIG_IP_MROUTE
2969                 __be32 dst = rt->rt_dst;
2970
2971                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2972                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2973                         int err = ipmr_get_route(net, skb,
2974                                                  rt->rt_src, rt->rt_dst,
2975                                                  r, nowait);
                             /* With nowait unset: 0 ends this function with 0
                              * and a negative value cancels the message.  With
                              * nowait set: -EMSGSIZE cancels, anything else is
                              * reported through the cacheinfo error field.
                              */
2976                         if (err <= 0) {
2977                                 if (!nowait) {
2978                                         if (err == 0)
2979                                                 return 0;
2980                                         goto nla_put_failure;
2981                                 } else {
2982                                         if (err == -EMSGSIZE)
2983                                                 goto nla_put_failure;
2984                                         error = err;
2985                                 }
2986                         }
2987                 } else
2988 #endif
2989                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2990                                 goto nla_put_failure;
2991         }
2992
2993         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2994                                expires, error) < 0)
2995                 goto nla_put_failure;
2996
2997         return nlmsg_end(skb, nlh);
2998
2999 nla_put_failure:
3000         nlmsg_cancel(skb, nlh);
3001         return -EMSGSIZE;
3002 }
3003
     /*
      * Netlink handler that resolves a single route and unicasts the answer
      * back to the requester.
      *
      * The request attributes RTA_SRC, RTA_DST, RTA_IIF, RTA_MARK and
      * RTA_OIF are read (missing ones default to 0).  With a non-zero
      * RTA_IIF the route is resolved through ip_route_input() as if a
      * packet had arrived on that device; otherwise through
      * ip_route_output_key().  The result is encoded with rt_fill_info()
      * as an RTM_NEWROUTE message.
      *
      * Returns the value of rtnl_unicast() on success, otherwise a negative
      * error from parsing, allocation (-ENOBUFS), device lookup (-ENODEV),
      * the route lookup or rt_fill_info(); the reply skb is freed on every
      * failure after its allocation.
      */
3004 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3005 {
3006         struct net *net = sock_net(in_skb->sk);
3007         struct rtmsg *rtm;
3008         struct nlattr *tb[RTA_MAX+1];
3009         struct rtable *rt = NULL;
3010         __be32 dst = 0;
3011         __be32 src = 0;
3012         u32 iif;
3013         int err;
3014         int mark;
3015         struct sk_buff *skb;
3016
3017         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3018         if (err < 0)
3019                 goto errout;
3020
3021         rtm = nlmsg_data(nlh);
3022
3023         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3024         if (skb == NULL) {
3025                 err = -ENOBUFS;
3026                 goto errout;
3027         }
3028
3029         /* Reserve room for dummy headers, this skb can pass
3030            through good chunk of routing engine.
3031          */
3032         skb_reset_mac_header(skb);
3033         skb_reset_network_header(skb);
3034
3035         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3036         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3037         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3038
3039         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3040         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3041         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3042         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3043
3044         if (iif) {
3045                 struct net_device *dev;
3046
3047                 dev = __dev_get_by_index(net, iif);
3048                 if (dev == NULL) {
3049                         err = -ENODEV;
3050                         goto errout_free;
3051                 }
3052
3053                 skb->protocol   = htons(ETH_P_IP);
3054                 skb->dev        = dev;
3055                 skb->mark       = mark;
                     /* The input lookup is run with bottom halves disabled. */
3056                 local_bh_disable();
3057                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3058                 local_bh_enable();
3059
                     /* A successful lookup can still yield an error route;
                      * report its dst.error (negated) as the result.
                      */
3060                 rt = skb_rtable(skb);
3061                 if (err == 0 && rt->dst.error)
3062                         err = -rt->dst.error;
3063         } else {
3064                 struct flowi4 fl4 = {
3065                         .daddr = dst,
3066                         .saddr = src,
3067                         .flowi4_tos = rtm->rtm_tos,
3068                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3069                         .flowi4_mark = mark,
3070                 };
3071                 rt = ip_route_output_key(net, &fl4);
3072
3073                 err = 0;
3074                 if (IS_ERR(rt))
3075                         err = PTR_ERR(rt);
3076         }
3077
3078         if (err)
3079                 goto errout_free;
3080
3081         skb_dst_set(skb, &rt->dst);
3082         if (rtm->rtm_flags & RTM_F_NOTIFY)
3083                 rt->rt_flags |= RTCF_NOTIFY;
3084
3085         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3086                            RTM_NEWROUTE, 0, 0);
3087         if (err <= 0)
3088                 goto errout_free;
3089
3090         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3091 errout:
3092         return err;
3093
3094 errout_free:
3095         kfree_skb(skb);
3096         goto errout;
3097 }
3098
     /*
      * Netlink dump callback: write one RTM_NEWROUTE message per route cache
      * entry of the requesting socket's namespace into skb.
      *
      * The resume position is kept in cb: args[0] is the hash bucket and
      * args[1] the index within that bucket's chain.  Entries before the
      * saved index, entries of other namespaces and expired entries are
      * skipped.  The walk stops as soon as rt_fill_info() returns <= 0.
      *
      * Returns skb->len.
      */
3099 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3100 {
3101         struct rtable *rt;
3102         int h, s_h;
3103         int idx, s_idx;
3104         struct net *net;
3105
3106         net = sock_net(skb->sk);
3107
3108         s_h = cb->args[0];
3109         if (s_h < 0)
3110                 s_h = 0;
3111         s_idx = idx = cb->args[1];
             /* s_idx only applies to the first bucket visited; it is reset
              * to 0 for every following bucket.
              */
3112         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3113                 if (!rt_hash_table[h].chain)
3114                         continue;
3115                 rcu_read_lock_bh();
3116                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3117                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3118                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3119                                 continue;
3120                         if (rt_is_expired(rt))
3121                                 continue;
                             /* Attach the route without taking a reference,
                              * just long enough for rt_fill_info() to read it.
                              */
3122                         skb_dst_set_noref(skb, &rt->dst);
3123                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3124                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3125                                          1, NLM_F_MULTI) <= 0) {
3126                                 skb_dst_drop(skb);
3127                                 rcu_read_unlock_bh();
3128                                 goto done;
3129                         }
3130                         skb_dst_drop(skb);
3131                 }
3132                 rcu_read_unlock_bh();
3133         }
3134
3135 done:
             /* Save where to resume on the next call. */
3136         cb->args[0] = h;
3137         cb->args[1] = idx;
3138         return skb->len;
3139 }
3140
     /*
      * Flush the route cache of the namespace that owns in_dev's device,
      * passing a delay of 0 to rt_cache_flush().
      */
3141 void ip_rt_multicast_event(struct in_device *in_dev)
3142 {
             struct net *net = dev_net(in_dev->dev);

3143         rt_cache_flush(net, 0);
3144 }
3145
3146 #ifdef CONFIG_SYSCTL
     /*
      * proc handler for the write-only "flush" sysctl.
      *
      * On write, the integer written by the user is parsed by
      * proc_dointvec() into a local variable (through a private copy of the
      * table entry whose data pointer is redirected), and the route cache
      * of the namespace stored in __ctl->extra1 is flushed with that value
      * as the delay argument of rt_cache_flush().
      *
      * Returns 0 for any write and -EINVAL for a read.
      */
3147 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3148                                         void __user *buffer,
3149                                         size_t *lenp, loff_t *ppos)
3150 {
3151         if (write) {
                     /* Start from 0: the result of proc_dointvec() is not
                      * checked, so if it stores nothing (for example a
                      * zero-length write) the delay would otherwise be read
                      * uninitialised.
                      */
3152                 int flush_delay = 0;
3153                 ctl_table ctl;
3154                 struct net *net;
3155
                     /* Work on a copy so the shared table entry is not modified. */
3156                 memcpy(&ctl, __ctl, sizeof(ctl));
3157                 ctl.data = &flush_delay;
3158                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3159
3160                 net = (struct net *)__ctl->extra1;
3161                 rt_cache_flush(net, flush_delay);
3162                 return 0;
3163         }
3164
3165         return -EINVAL;
3166 }
3167
     /*
      * sysctl entries for the IPv4 route tunables.  Each entry maps a proc
      * name to one of the ip_rt_* variables (or ipv4_dst_ops.gc_thresh) as
      * an int with mode 0644.  Entries using proc_dointvec_jiffies or
      * proc_dointvec_ms_jiffies go through those conversion handlers, the
      * rest use plain proc_dointvec.  gc_min_interval and
      * gc_min_interval_ms are two views of the same variable,
      * ip_rt_gc_min_interval.  The empty entry terminates the table.
      */
3168 static ctl_table ipv4_route_table[] = {
3169         {
3170                 .procname       = "gc_thresh",
3171                 .data           = &ipv4_dst_ops.gc_thresh,
3172                 .maxlen         = sizeof(int),
3173                 .mode           = 0644,
3174                 .proc_handler   = proc_dointvec,
3175         },
3176         {
3177                 .procname       = "max_size",
3178                 .data           = &ip_rt_max_size,
3179                 .maxlen         = sizeof(int),
3180                 .mode           = 0644,
3181                 .proc_handler   = proc_dointvec,
3182         },
3183         {
3184                 /*  Deprecated. Use gc_min_interval_ms */
3185
3186                 .procname       = "gc_min_interval",
3187                 .data           = &ip_rt_gc_min_interval,
3188                 .maxlen         = sizeof(int),
3189                 .mode           = 0644,
3190                 .proc_handler   = proc_dointvec_jiffies,
3191         },
3192         {
3193                 .procname       = "gc_min_interval_ms",
3194                 .data           = &ip_rt_gc_min_interval,
3195                 .maxlen         = sizeof(int),
3196                 .mode           = 0644,
3197                 .proc_handler   = proc_dointvec_ms_jiffies,
3198         },
3199         {
3200                 .procname       = "gc_timeout",
3201                 .data           = &ip_rt_gc_timeout,
3202                 .maxlen         = sizeof(int),
3203                 .mode           = 0644,
3204                 .proc_handler   = proc_dointvec_jiffies,
3205         },
3206         {
3207                 .procname       = "gc_interval",
3208                 .data           = &ip_rt_gc_interval,
3209                 .maxlen         = sizeof(int),
3210                 .mode           = 0644,
3211                 .proc_handler   = proc_dointvec_jiffies,
3212         },
3213         {
3214                 .procname       = "redirect_load",
3215                 .data           = &ip_rt_redirect_load,
3216                 .maxlen         = sizeof(int),
3217                 .mode           = 0644,
3218                 .proc_handler   = proc_dointvec,
3219         },
3220         {
3221                 .procname       = "redirect_number",
3222                 .data           = &ip_rt_redirect_number,
3223                 .maxlen         = sizeof(int),
3224                 .mode           = 0644,
3225                 .proc_handler   = proc_dointvec,
3226         },
3227         {
3228                 .procname       = "redirect_silence",
3229                 .data           = &ip_rt_redirect_silence,
3230                 .maxlen         = sizeof(int),
3231                 .mode           = 0644,
3232                 .proc_handler   = proc_dointvec,
3233         },
3234         {
3235                 .procname       = "error_cost",
3236                 .data           = &ip_rt_error_cost,
3237                 .maxlen         = sizeof(int),
3238                 .mode           = 0644,
3239                 .proc_handler   = proc_dointvec,
3240         },
3241         {
3242                 .procname       = "error_burst",
3243                 .data           = &ip_rt_error_burst,
3244                 .maxlen         = sizeof(int),
3245                 .mode           = 0644,
3246                 .proc_handler   = proc_dointvec,
3247         },
3248         {
3249                 .procname       = "gc_elasticity",
3250                 .data           = &ip_rt_gc_elasticity,
3251                 .maxlen         = sizeof(int),
3252                 .mode           = 0644,
3253                 .proc_handler   = proc_dointvec,
3254         },
3255         {
3256                 .procname       = "mtu_expires",
3257                 .data           = &ip_rt_mtu_expires,
3258                 .maxlen         = sizeof(int),
3259                 .mode           = 0644,
3260                 .proc_handler   = proc_dointvec_jiffies,
3261         },
3262         {
3263                 .procname       = "min_pmtu",
3264                 .data           = &ip_rt_min_pmtu,
3265                 .maxlen         = sizeof(int),
3266                 .mode           = 0644,
3267                 .proc_handler   = proc_dointvec,
3268         },
3269         {
3270                 .procname       = "min_adv_mss",
3271                 .data           = &ip_rt_min_advmss,
3272                 .maxlen         = sizeof(int),
3273                 .mode           = 0644,
3274                 .proc_handler   = proc_dointvec,
3275         },
3276         { }
3277 };
3278
3279 static struct ctl_table ipv4_route_flush_table[] = {
3280         {
3281                 .procname       = "flush",
3282                 .maxlen         = sizeof(int),
3283                 .mode           = 0200,
3284                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3285         },
3286         { },
3287 };
3288
3289 static __net_init int sysctl_route_net_init(struct net *net)
3290 {
3291         struct ctl_table *tbl;
3292
3293         tbl = ipv4_route_flush_table;
3294         if (!net_eq(net, &init_net)) {
3295                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3296                 if (tbl == NULL)
3297                         goto err_dup;
3298         }
3299         tbl[0].extra1 = net;
3300
3301         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3302         if (net->ipv4.route_hdr == NULL)
3303                 goto err_reg;
3304         return 0;
3305
3306 err_reg:
3307         if (tbl != ipv4_route_flush_table)
3308                 kfree(tbl);
3309 err_dup:
3310         return -ENOMEM;
3311 }
3312
3313 static __net_exit void sysctl_route_net_exit(struct net *net)
3314 {
3315         struct ctl_table *tbl;
3316
3317         tbl = net->ipv4.route_hdr->ctl_table_arg;
3318         unregister_net_sysctl_table(net->ipv4.route_hdr);
3319         BUG_ON(tbl == ipv4_route_flush_table);
3320         kfree(tbl);
3321 }
3322
3323 static __net_initdata struct pernet_operations sysctl_route_ops = {
3324         .init = sysctl_route_net_init,
3325         .exit = sysctl_route_net_exit,
3326 };
3327 #endif
3328
3329 static __net_init int rt_genid_init(struct net *net)
3330 {
3331         get_random_bytes(&net->ipv4.rt_genid,
3332                          sizeof(net->ipv4.rt_genid));
3333         get_random_bytes(&net->ipv4.dev_addr_genid,
3334                          sizeof(net->ipv4.dev_addr_genid));
3335         return 0;
3336 }
3337
3338 static __net_initdata struct pernet_operations rt_genid_ops = {
3339         .init = rt_genid_init,
3340 };
3341
3342 static int __net_init ipv4_inetpeer_init(struct net *net)
3343 {
3344         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3345
3346         if (!bp)
3347                 return -ENOMEM;
3348         inet_peer_base_init(bp);
3349         net->ipv4.peers = bp;
3350         return 0;
3351 }
3352
3353 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3354 {
3355         struct inet_peer_base *bp = net->ipv4.peers;
3356
3357         net->ipv4.peers = NULL;
3358         inetpeer_invalidate_tree(bp);
3359         kfree(bp);
3360 }
3361
3362 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3363         .init   =       ipv4_inetpeer_init,
3364         .exit   =       ipv4_inetpeer_exit,
3365 };
3366
#if defined(CONFIG_IP_ROUTE_CLASSID)
/* Per-CPU accounting area; ip_rt_init() allocates 256 entries for it. */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3370
3371 static __initdata unsigned long rhash_entries;
3372 static int __init set_rhash_entries(char *str)
3373 {
3374         ssize_t ret;
3375
3376         if (!str)
3377                 return 0;
3378
3379         ret = kstrtoul(str, 0, &rhash_entries);
3380         if (ret)
3381                 return 0;
3382
3383         return 1;
3384 }
3385 __setup("rhash_entries=", set_rhash_entries);
3386
3387 int __init ip_rt_init(void)
3388 {
3389         int rc = 0;
3390
3391 #ifdef CONFIG_IP_ROUTE_CLASSID
3392         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3393         if (!ip_rt_acct)
3394                 panic("IP: failed to allocate ip_rt_acct\n");
3395 #endif
3396
3397         ipv4_dst_ops.kmem_cachep =
3398                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3399                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3400
3401         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3402
3403         if (dst_entries_init(&ipv4_dst_ops) < 0)
3404                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3405
3406         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3407                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3408
3409         rt_hash_table = (struct rt_hash_bucket *)
3410                 alloc_large_system_hash("IP route cache",
3411                                         sizeof(struct rt_hash_bucket),
3412                                         rhash_entries,
3413                                         (totalram_pages >= 128 * 1024) ?
3414                                         15 : 17,
3415                                         0,
3416                                         &rt_hash_log,
3417                                         &rt_hash_mask,
3418                                         0,
3419                                         rhash_entries ? 0 : 512 * 1024);
3420         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3421         rt_hash_lock_init();
3422
3423         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3424         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3425
3426         devinet_init();
3427         ip_fib_init();
3428
3429         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3430         expires_ljiffies = jiffies;
3431         schedule_delayed_work(&expires_work,
3432                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3433
3434         if (ip_rt_proc_init())
3435                 pr_err("Unable to create route proc files\n");
3436 #ifdef CONFIG_XFRM
3437         xfrm_init();
3438         xfrm4_init(ip_rt_max_size);
3439 #endif
3440         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3441
3442 #ifdef CONFIG_SYSCTL
3443         register_pernet_subsys(&sysctl_route_ops);
3444 #endif
3445         register_pernet_subsys(&rt_genid_ops);
3446         register_pernet_subsys(&ipv4_inetpeer_ops);
3447         return rc;
3448 }
3449
3450 #ifdef CONFIG_SYSCTL
3451 /*
3452  * We really need to sanitize the damn ipv4 init order, then all
3453  * this nonsense will go away.
3454  */
3455 void __init ip_static_sysctl_init(void)
3456 {
3457         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3458 }
3459 #endif