ndisc: Remove tbl argument for __ipv6_neigh_lookup().

[deliverable/linux.git] / net / ipv6 / route.c
diff --git a/net/ipv6/route.c b/net/ipv6/route.c

index b1e6cf0b95fd9bf3546420e8cb48578d1a7d2190..5d9ca274d149cb5e8f3a469b7089292a6c3ec665 100644 (file)
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -57,6 +57,7 @@
  #include <net/xfrm.h>
  #include <net/netevent.h>
  #include <net/netlink.h>
+#include <net/nexthop.h>
  
  #include <asm/uaccess.h>
  
@@ -144,7 +145,7 @@ static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
         struct neighbour *n;
  
         daddr = choose_neigh_daddr(rt, skb, daddr);
-       n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
+       n = __ipv6_neigh_lookup(dst->dev, daddr);
         if (n)
                 return n;
         return neigh_create(&nd_tbl, daddr, dst->dev);
@@ -152,7 +153,7 @@ static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
  
  static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
  {
-       struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
+       struct neighbour *n = __ipv6_neigh_lookup(dev, &rt->rt6i_gateway);
         if (!n) {
                 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
                 if (IS_ERR(n))
@@ -289,6 +290,8 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net,
                 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
                 rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
                 rt->rt6i_genid = rt_genid(net);
+               INIT_LIST_HEAD(&rt->rt6i_siblings);
+               rt->rt6i_nsiblings = 0;
         }
         return rt;
  }
@@ -318,13 +321,6 @@ static void ip6_dst_destroy(struct dst_entry *dst)
         }
  }
  
-static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
-
-static u32 rt6_peer_genid(void)
-{
-       return atomic_read(&__rt6_peer_genid);
-}
-
  void rt6_bind_peer(struct rt6_info *rt, int create)
  {
         struct inet_peer_base *base;
@@ -338,8 +334,6 @@ void rt6_bind_peer(struct rt6_info *rt, int create)
         if (peer) {
                 if (!rt6_set_peer(rt, peer))
                         inet_putpeer(peer);
-               else
-                       rt->rt6i_peer_genid = rt6_peer_genid();
         }
  }
  
@@ -385,6 +379,62 @@ static bool rt6_need_strict(const struct in6_addr *daddr)
                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
  }
  
+/* Multipath route selection:
+ *   Hash-based function using packet header fields and the flow label; returns the hash modulo candidate_count.
+ * Adapted from fib_info_hashfn().
+ */
+static int rt6_info_hash_nhsfn(unsigned int candidate_count,
+                              const struct flowi6 *fl6)
+{
+       unsigned int val = fl6->flowi6_proto;
+
+       val ^= ipv6_addr_hash(&fl6->daddr);
+       val ^= ipv6_addr_hash(&fl6->saddr);
+
+       /* This works only if the packet is not encapsulated */
+       switch (fl6->flowi6_proto) {
+       case IPPROTO_UDP:
+       case IPPROTO_TCP:
+       case IPPROTO_SCTP:
+               val ^= (__force u16)fl6->fl6_sport;
+               val ^= (__force u16)fl6->fl6_dport;
+               break;
+
+       case IPPROTO_ICMPV6:
+               val ^= (__force u16)fl6->fl6_icmp_type;
+               val ^= (__force u16)fl6->fl6_icmp_code;
+               break;
+       }
+       /* RFC 6438 recommends using the flow label */
+       val ^= (__force u32)fl6->flowlabel;
+
+       /* Perhaps we need to tune this function? */
+       val = val ^ (val >> 7) ^ (val >> 12);
+       return val % candidate_count;
+}
+
+static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
+                                            struct flowi6 *fl6)
+{
+       struct rt6_info *sibling, *next_sibling;
+       int route_choosen;
+
+       route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
+       /* Keep the current route if route_choosen == 0: the siblings list
+        * does not include match itself, so index 0 stands for match.
+        */
+       if (route_choosen)
+               list_for_each_entry_safe(sibling, next_sibling,
+                               &match->rt6i_siblings, rt6i_siblings) {
+                       route_choosen--;
+                       if (route_choosen == 0) {
+                               match = sibling;
+                               break;
+                       }
+               }
+       return match;
+}
+
  /*
   *     Route lookup. Any table->tb6_lock is implied.
   */
@@ -449,22 +499,26 @@ static void rt6_probe(struct rt6_info *rt)
          * to no more than one per minute.
          */
         neigh = rt ? rt->n : NULL;
-       if (!neigh || (neigh->nud_state & NUD_VALID))
+       if (!neigh)
+               return;
+       write_lock_bh(&neigh->lock);
+       if (neigh->nud_state & NUD_VALID) {
+               write_unlock_bh(&neigh->lock);
                 return;
-       read_lock_bh(&neigh->lock);
+       }
         if (!(neigh->nud_state & NUD_VALID) &&
             time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                 struct in6_addr mcaddr;
                 struct in6_addr *target;
  
                 neigh->updated = jiffies;
-               read_unlock_bh(&neigh->lock);
+               write_unlock_bh(&neigh->lock);
  
                 target = (struct in6_addr *)&neigh->primary_key;
                 addrconf_addr_solict_mult(target, &mcaddr);
                 ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
         } else {
-               read_unlock_bh(&neigh->lock);
+               write_unlock_bh(&neigh->lock);
         }
  }
  #else
@@ -487,35 +541,32 @@ static inline int rt6_check_dev(struct rt6_info *rt, int oif)
         return 0;
  }
  
-static inline int rt6_check_neigh(struct rt6_info *rt)
+static inline bool rt6_check_neigh(struct rt6_info *rt)
  {
         struct neighbour *neigh;
-       int m;
+       bool ret = false;
  
         neigh = rt->n;
         if (rt->rt6i_flags & RTF_NONEXTHOP ||
             !(rt->rt6i_flags & RTF_GATEWAY))
-               m = 1;
+               ret = true;
         else if (neigh) {
                 read_lock_bh(&neigh->lock);
                 if (neigh->nud_state & NUD_VALID)
-                       m = 2;
+                       ret = true;
  #ifdef CONFIG_IPV6_ROUTER_PREF
-               else if (neigh->nud_state & NUD_FAILED)
-                       m = 0;
+               else if (!(neigh->nud_state & NUD_FAILED))
+                       ret = true;
  #endif
-               else
-                       m = 1;
                 read_unlock_bh(&neigh->lock);
-       } else
-               m = 0;
-       return m;
+       }
+       return ret;
  }
  
  static int rt6_score_route(struct rt6_info *rt, int oif,
                            int strict)
  {
-       int m, n;
+       int m;
  
         m = rt6_check_dev(rt, oif);
         if (!m && (strict & RT6_LOOKUP_F_IFACE))
@@ -523,8 +574,7 @@ static int rt6_score_route(struct rt6_info *rt, int oif,
  #ifdef CONFIG_IPV6_ROUTER_PREF
         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
  #endif
-       n = rt6_check_neigh(rt);
-       if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
+       if (!rt6_check_neigh(rt) && (strict & RT6_LOOKUP_F_REACHABLE))
                 return -1;
         return m;
  }
@@ -666,7 +716,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                 else
                         rt6_set_expires(rt, jiffies + HZ * lifetime);
  
-               dst_release(&rt->dst);
+               ip6_rt_put(rt);
         }
         return 0;
  }
@@ -702,6 +752,8 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
  restart:
         rt = fn->leaf;
         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
+       if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
+               rt = rt6_multipath_select(rt, fl6);
         BACKTRACK(net, &fl6->saddr);
  out:
         dst_use(&rt->dst, jiffies);
@@ -863,7 +915,8 @@ restart_2:
  
  restart:
         rt = rt6_select(fn, oif, strict | reachable);
-
+       if (rt->rt6i_nsiblings && oif == 0)
+               rt = rt6_multipath_select(rt, fl6);
         BACKTRACK(net, &fl6->saddr);
         if (rt == net->ipv6.ip6_null_entry ||
             rt->rt6i_flags & RTF_CACHE)
@@ -879,7 +932,7 @@ restart:
         else
                 goto out2;
  
-       dst_release(&rt->dst);
+       ip6_rt_put(rt);
         rt = nrt ? : net->ipv6.ip6_null_entry;
  
         dst_hold(&rt->dst);
@@ -896,7 +949,7 @@ restart:
          * Race condition! In the gap, when table->tb6_lock was
          * released someone could insert this route.  Relookup.
          */
-       dst_release(&rt->dst);
+       ip6_rt_put(rt);
         goto relookup;
  
  out:
@@ -938,7 +991,7 @@ void ip6_route_input(struct sk_buff *skb)
                 .flowi6_iif = skb->dev->ifindex,
                 .daddr = iph->daddr,
                 .saddr = iph->saddr,
-               .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
+               .flowlabel = ip6_flowinfo(iph),
                 .flowi6_mark = skb->mark,
                 .flowi6_proto = iph->nexthdr,
         };
@@ -1030,14 +1083,9 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
         if (rt->rt6i_genid != rt_genid(dev_net(rt->dst.dev)))
                 return NULL;
  
-       if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
-               if (rt->rt6i_peer_genid != rt6_peer_genid()) {
-                       if (!rt6_has_peer(rt))
-                               rt6_bind_peer(rt, 0);
-                       rt->rt6i_peer_genid = rt6_peer_genid();
-               }
+       if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
                 return dst;
-       }
+
         return NULL;
  }
  
@@ -1108,7 +1156,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
         fl6.flowi6_flags = 0;
         fl6.daddr = iph->daddr;
         fl6.saddr = iph->saddr;
-       fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
+       fl6.flowlabel = ip6_flowinfo(iph);
  
         dst = ip6_route_output(net, NULL, &fl6);
         if (!dst->error)
@@ -1136,7 +1184,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
         fl6.flowi6_flags = 0;
         fl6.daddr = iph->daddr;
         fl6.saddr = iph->saddr;
-       fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
+       fl6.flowlabel = ip6_flowinfo(iph);
  
         dst = ip6_route_output(net, NULL, &fl6);
         if (!dst->error)
@@ -1316,12 +1364,6 @@ out:
         return entries > rt_max_size;
  }
  
-/* Clean host part of a prefix. Not necessary in radix tree,
-   but results in cleaner routing tables.
-
-   Remove it only when all the things will work!
- */
-
  int ip6_dst_hoplimit(struct dst_entry *dst)
  {
         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
@@ -1507,7 +1549,7 @@ int ip6_route_add(struct fib6_config *cfg)
                                 goto out;
                         if (dev) {
                                 if (dev != grt->dst.dev) {
-                                       dst_release(&grt->dst);
+                                       ip6_rt_put(grt);
                                         goto out;
                                 }
                         } else {
@@ -1518,7 +1560,7 @@ int ip6_route_add(struct fib6_config *cfg)
                         }
                         if (!(grt->rt6i_flags & RTF_GATEWAY))
                                 err = 0;
-                       dst_release(&grt->dst);
+                       ip6_rt_put(grt);
  
                         if (err)
                                 goto out;
@@ -1604,7 +1646,7 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
         write_unlock_bh(&table->tb6_lock);
  
  out:
-       dst_release(&rt->dst);
+       ip6_rt_put(rt);
         return err;
  }
  
@@ -1660,37 +1702,33 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
         struct net *net = dev_net(skb->dev);
         struct netevent_redirect netevent;
         struct rt6_info *rt, *nrt = NULL;
-       const struct in6_addr *target;
         struct ndisc_options ndopts;
-       const struct in6_addr *dest;
         struct neighbour *old_neigh;
         struct inet6_dev *in6_dev;
         struct neighbour *neigh;
-       struct icmp6hdr *icmph;
+       struct rd_msg *msg;
         int optlen, on_link;
         u8 *lladdr;
  
         optlen = skb->tail - skb->transport_header;
-       optlen -= sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr);
+       optlen -= sizeof(*msg);
  
         if (optlen < 0) {
                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
                 return;
         }
  
-       icmph = icmp6_hdr(skb);
-       target = (const struct in6_addr *) (icmph + 1);
-       dest = target + 1;
+       msg = (struct rd_msg *)icmp6_hdr(skb);
  
-       if (ipv6_addr_is_multicast(dest)) {
+       if (ipv6_addr_is_multicast(&msg->dest)) {
                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
                 return;
         }
  
         on_link = 0;
-       if (ipv6_addr_equal(dest, target)) {
+       if (ipv6_addr_equal(&msg->dest, &msg->target)) {
                 on_link = 1;
-       } else if (ipv6_addr_type(target) !=
+       } else if (ipv6_addr_type(&msg->target) !=
                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
                 return;
@@ -1707,7 +1745,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
          *      first-hop router for the specified ICMP Destination Address.
          */
  
-       if (!ndisc_parse_options((u8*)(dest + 1), optlen, &ndopts)) {
+       if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
                 return;
         }
@@ -1734,7 +1772,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
          */
         dst_confirm(&rt->dst);
  
-       neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1);
+       neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
         if (!neigh)
                 return;
  
@@ -1754,7 +1792,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
                                      NEIGH_UPDATE_F_ISROUTER))
                      );
  
-       nrt = ip6_rt_copy(rt, dest);
+       nrt = ip6_rt_copy(rt, &msg->dest);
         if (!nrt)
                 goto out;
  
@@ -1769,10 +1807,9 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
                 goto out;
  
         netevent.old = &rt->dst;
-       netevent.old_neigh = old_neigh;
         netevent.new = &nrt->dst;
-       netevent.new_neigh = neigh;
-       netevent.daddr = dest;
+       netevent.daddr = &msg->dest;
+       netevent.neigh = neigh;
         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
  
         if (rt->rt6i_flags & RTF_CACHE) {
@@ -1987,7 +2024,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
         switch(cmd) {
         case SIOCADDRT:         /* Add a route */
         case SIOCDELRT:         /* Delete a route */
-               if (!capable(CAP_NET_ADMIN))
+               if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                         return -EPERM;
                 err = copy_from_user(&rtmsg, arg,
                                      sizeof(struct in6_rtmsg));
@@ -2249,6 +2286,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
         [RTA_IIF]               = { .type = NLA_U32 },
         [RTA_PRIORITY]          = { .type = NLA_U32 },
         [RTA_METRICS]           = { .type = NLA_NESTED },
+       [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
  };
  
  static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2326,11 +2364,71 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
         if (tb[RTA_TABLE])
                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
  
+       if (tb[RTA_MULTIPATH]) {
+               cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
+               cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
+       }
+
         err = 0;
  errout:
         return err;
  }
  
+static int ip6_route_multipath(struct fib6_config *cfg, int add)
+{
+       struct fib6_config r_cfg;
+       struct rtnexthop *rtnh;
+       int remaining;
+       int attrlen;
+       int err = 0, last_err = 0;
+
+beginning:
+       rtnh = (struct rtnexthop *)cfg->fc_mp;
+       remaining = cfg->fc_mp_len;
+
+       /* Parse each multipath (rtnexthop) entry in turn */
+       while (rtnh_ok(rtnh, remaining)) {
+               memcpy(&r_cfg, cfg, sizeof(*cfg));
+               if (rtnh->rtnh_ifindex)
+                       r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
+
+               attrlen = rtnh_attrlen(rtnh);
+               if (attrlen > 0) {
+                       struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+
+                       nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+                       if (nla) {
+                               nla_memcpy(&r_cfg.fc_gateway, nla, 16);
+                               r_cfg.fc_flags |= RTF_GATEWAY;
+                       }
+               }
+               err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
+               if (err) {
+                       last_err = err;
+                       /* When removing a route, do not stop the loop if
+                        * ip6_route_del() fails (e.g. because the next hop is
+                        * already gone); we should try to remove all next hops.
+                        */
+                       if (add) {
+                               /* If an add fails, switch to delete mode and restart from the
+                                * first entry to remove the next hops that were already added.
+                                */
+                               add = 0;
+                               goto beginning;
+                       }
+               }
+               /* Because each nexthop is added as a single route, we clear
+                * NLM_F_EXCL after the first nexthop (if there is a collision,
+                * we have already failed to add the first nexthop:
+                * fib6_add_rt2node() has rejected it).
+                */
+               cfg->fc_nlinfo.nlh->nlmsg_flags &= ~NLM_F_EXCL;
+               rtnh = rtnh_next(rtnh, &remaining);
+       }
+
+       return last_err;
+}
+
  static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
  {
         struct fib6_config cfg;
@@ -2340,7 +2438,10 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a
         if (err < 0)
                 return err;
  
-       return ip6_route_del(&cfg);
+       if (cfg.fc_mp)
+               return ip6_route_multipath(&cfg, 0);
+       else
+               return ip6_route_del(&cfg);
  }
  
  static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
@@ -2352,7 +2453,10 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a
         if (err < 0)
                 return err;
  
-       return ip6_route_add(&cfg);
+       if (cfg.fc_mp)
+               return ip6_route_multipath(&cfg, 1);
+       else
+               return ip6_route_add(&cfg);
  }
  
  static inline size_t rt6_nlmsg_size(void)
@@ -2596,7 +2700,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
  
         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
         if (!skb) {
-               dst_release(&rt->dst);
+               ip6_rt_put(rt);
                 err = -ENOBUFS;
                 goto errout;
         }
@@ -2873,6 +2977,10 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
+
+               /* Don't export sysctls to unprivileged users */
+               if (net->user_ns != &init_user_ns)
+                       table[0].procname = NULL;
         }
  
         return table;