net/netfilter/ipvs/ip_vs_core.c
1 /*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 * Julian Anastasov <ja@ssi.bg>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19 * and others.
20 *
21 * Changes:
22 * Paul `Rusty' Russell properly handle non-linear skbs
23 * Harald Welte don't use nfcache
24 *
25 */
26
27 #define KMSG_COMPONENT "IPVS"
28 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
29
30 #include <linux/module.h>
31 #include <linux/kernel.h>
32 #include <linux/ip.h>
33 #include <linux/tcp.h>
34 #include <linux/sctp.h>
35 #include <linux/icmp.h>
36
37 #include <net/ip.h>
38 #include <net/tcp.h>
39 #include <net/udp.h>
40 #include <net/icmp.h> /* for icmp_send */
41 #include <net/route.h>
42
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv4.h>
45
46 #ifdef CONFIG_IP_VS_IPV6
47 #include <net/ipv6.h>
48 #include <linux/netfilter_ipv6.h>
49 #endif
50
51 #include <net/ip_vs.h>
52
53
54 EXPORT_SYMBOL(register_ip_vs_scheduler);
55 EXPORT_SYMBOL(unregister_ip_vs_scheduler);
56 EXPORT_SYMBOL(ip_vs_skb_replace);
57 EXPORT_SYMBOL(ip_vs_proto_name);
58 EXPORT_SYMBOL(ip_vs_conn_new);
59 EXPORT_SYMBOL(ip_vs_conn_in_get);
60 EXPORT_SYMBOL(ip_vs_conn_out_get);
61 #ifdef CONFIG_IP_VS_PROTO_TCP
62 EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
63 #endif
64 EXPORT_SYMBOL(ip_vs_conn_put);
65 #ifdef CONFIG_IP_VS_DEBUG
66 EXPORT_SYMBOL(ip_vs_get_debug_level);
67 #endif
68
69
70 /* ID used in ICMP lookups */
71 #define icmp_id(icmph) (((icmph)->un).echo.id)
72 #define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier)
73
74 const char *ip_vs_proto_name(unsigned proto)
75 {
76 static char buf[20];
77
78 switch (proto) {
79 case IPPROTO_IP:
80 return "IP";
81 case IPPROTO_UDP:
82 return "UDP";
83 case IPPROTO_TCP:
84 return "TCP";
85 case IPPROTO_SCTP:
86 return "SCTP";
87 case IPPROTO_ICMP:
88 return "ICMP";
89 #ifdef CONFIG_IP_VS_IPV6
90 case IPPROTO_ICMPV6:
91 return "ICMPv6";
92 #endif
93 default:
94 sprintf(buf, "IP_%d", proto);
95 return buf;
96 }
97 }
98
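/*
 * Initialize each of the 'rows' list heads in the table to an empty list.
 */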
99 void ip_vs_init_hash_table(struct list_head *table, int rows)
100 {
101 while (--rows >= 0)
102 INIT_LIST_HEAD(&table[rows]);
103 }
104
105 static inline void
106 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
107 {
108 struct ip_vs_dest *dest = cp->dest;
109 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
110 spin_lock(&dest->stats.lock);
111 dest->stats.ustats.inpkts++;
112 dest->stats.ustats.inbytes += skb->len;
113 spin_unlock(&dest->stats.lock);
114
115 spin_lock(&dest->svc->stats.lock);
116 dest->svc->stats.ustats.inpkts++;
117 dest->svc->stats.ustats.inbytes += skb->len;
118 spin_unlock(&dest->svc->stats.lock);
119
120 spin_lock(&ip_vs_stats.lock);
121 ip_vs_stats.ustats.inpkts++;
122 ip_vs_stats.ustats.inbytes += skb->len;
123 spin_unlock(&ip_vs_stats.lock);
124 }
125 }
126
127
128 static inline void
129 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
130 {
131 struct ip_vs_dest *dest = cp->dest;
132 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
133 spin_lock(&dest->stats.lock);
134 dest->stats.ustats.outpkts++;
135 dest->stats.ustats.outbytes += skb->len;
136 spin_unlock(&dest->stats.lock);
137
138 spin_lock(&dest->svc->stats.lock);
139 dest->svc->stats.ustats.outpkts++;
140 dest->svc->stats.ustats.outbytes += skb->len;
141 spin_unlock(&dest->svc->stats.lock);
142
143 spin_lock(&ip_vs_stats.lock);
144 ip_vs_stats.ustats.outpkts++;
145 ip_vs_stats.ustats.outbytes += skb->len;
146 spin_unlock(&ip_vs_stats.lock);
147 }
148 }
149
150
151 static inline void
152 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
153 {
154 spin_lock(&cp->dest->stats.lock);
155 cp->dest->stats.ustats.conns++;
156 spin_unlock(&cp->dest->stats.lock);
157
158 spin_lock(&svc->stats.lock);
159 svc->stats.ustats.conns++;
160 spin_unlock(&svc->stats.lock);
161
162 spin_lock(&ip_vs_stats.lock);
163 ip_vs_stats.ustats.conns++;
164 spin_unlock(&ip_vs_stats.lock);
165 }
166
167
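/*
 * Let the protocol's state machine update the connection state for a
 * packet seen in the given direction.  Returns 0 when the protocol has
 * no state_transition handler, otherwise the handler's return value.
 */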
168 static inline int
169 ip_vs_set_state(struct ip_vs_conn *cp, int direction,
170 const struct sk_buff *skb,
171 struct ip_vs_protocol *pp)
172 {
173 if (unlikely(!pp->state_transition))
174 return 0;
175 return pp->state_transition(cp, direction, skb, pp);
176 }
177
178
179 /*
180 * IPVS persistent scheduling function
181 * It creates a connection entry according to its template if exists,
182 * or selects a server and creates a connection entry plus a template.
183 * Locking: we are svc user (svc->refcnt), so we hold all dests too
184 * Protocols supported: TCP, UDP
185 */
186 static struct ip_vs_conn *
187 ip_vs_sched_persist(struct ip_vs_service *svc,
188 const struct sk_buff *skb,
189 __be16 ports[2])
190 {
191 struct ip_vs_conn *cp = NULL;
192 struct ip_vs_iphdr iph;
193 struct ip_vs_dest *dest;
194 struct ip_vs_conn *ct;
195 __be16 dport; /* destination port to forward */
196 union nf_inet_addr snet; /* source network of the client,
197 after masking */
198
199 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
200
201 /* Mask saddr with the netmask to adjust template granularity */
202 #ifdef CONFIG_IP_VS_IPV6
203 if (svc->af == AF_INET6)
204 ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
205 else
206 #endif
207 snet.ip = iph.saddr.ip & svc->netmask;
208
209 IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
210 "mnet %s\n",
211 IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
212 IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
213 IP_VS_DBG_ADDR(svc->af, &snet));
214
215 /*
216 * As far as we know, FTP is a very complicated network protocol, and
217 * it uses a control connection and data connections. For active FTP,
218 * the FTP server initiates the data connection to the client, and its
219 * source port is often 20. For passive FTP, the FTP server tells the
220 * client the port that it passively listens on, and the client issues
221 * the data connection. In the tunneling or direct routing mode, the
222 * load balancer only sees the client-to-server half of the connection,
223 * so the port number is unknown to the load balancer. A conn template like
224 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
225 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
226 * is created for other persistent services.
227 */
228 if (ports[1] == svc->port) {
229 /* Check if a template already exists */
230 if (svc->port != FTPPORT)
231 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
232 &iph.daddr, ports[1]);
233 else
234 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
235 &iph.daddr, 0);
236
237 if (!ct || !ip_vs_check_template(ct)) {
238 /*
239 * No template found or the dest of the connection
240 * template is not available.
241 */
242 dest = svc->scheduler->schedule(svc, skb);
243 if (dest == NULL) {
244 IP_VS_DBG(1, "p-schedule: no dest found.\n");
245 return NULL;
246 }
247
248 /*
249 * Create a template like <protocol,caddr,0,
250 * vaddr,vport,daddr,dport> for non-ftp service,
251 * and <protocol,caddr,0,vaddr,0,daddr,0>
252 * for ftp service.
253 */
254 if (svc->port != FTPPORT)
255 ct = ip_vs_conn_new(svc->af, iph.protocol,
256 &snet, 0,
257 &iph.daddr,
258 ports[1],
259 &dest->addr, dest->port,
260 IP_VS_CONN_F_TEMPLATE,
261 dest);
262 else
263 ct = ip_vs_conn_new(svc->af, iph.protocol,
264 &snet, 0,
265 &iph.daddr, 0,
266 &dest->addr, 0,
267 IP_VS_CONN_F_TEMPLATE,
268 dest);
269 if (ct == NULL)
270 return NULL;
271
272 ct->timeout = svc->timeout;
273 } else {
274 /* set destination with the found template */
275 dest = ct->dest;
276 }
277 dport = dest->port;
278 } else {
279 /*
280 * Note: persistent fwmark-based services and persistent
281 * port zero services are handled here.
282 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
283 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
284 */
285 if (svc->fwmark) {
286 union nf_inet_addr fwmark = {
287 .ip = htonl(svc->fwmark)
288 };
289
290 ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
291 &fwmark, 0);
292 } else
293 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
294 &iph.daddr, 0);
295
296 if (!ct || !ip_vs_check_template(ct)) {
297 /*
298 * If it is not persistent port zero, return NULL,
299 * otherwise create a connection template.
300 */
301 if (svc->port)
302 return NULL;
303
304 dest = svc->scheduler->schedule(svc, skb);
305 if (dest == NULL) {
306 IP_VS_DBG(1, "p-schedule: no dest found.\n");
307 return NULL;
308 }
309
310 /*
311 * Create a template according to the service
312 */
313 if (svc->fwmark) {
314 union nf_inet_addr fwmark = {
315 .ip = htonl(svc->fwmark)
316 };
317
318 ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
319 &snet, 0,
320 &fwmark, 0,
321 &dest->addr, 0,
322 IP_VS_CONN_F_TEMPLATE,
323 dest);
324 } else
325 ct = ip_vs_conn_new(svc->af, iph.protocol,
326 &snet, 0,
327 &iph.daddr, 0,
328 &dest->addr, 0,
329 IP_VS_CONN_F_TEMPLATE,
330 dest);
331 if (ct == NULL)
332 return NULL;
333
334 ct->timeout = svc->timeout;
335 } else {
336 /* set destination with the found template */
337 dest = ct->dest;
338 }
339 dport = ports[1];
340 }
341
342 /*
343 * Create a new connection according to the template
344 */
345 cp = ip_vs_conn_new(svc->af, iph.protocol,
346 &iph.saddr, ports[0],
347 &iph.daddr, ports[1],
348 &dest->addr, dport,
349 0,
350 dest);
351 if (cp == NULL) {
352 ip_vs_conn_put(ct);
353 return NULL;
354 }
355
356 /*
357 * Add its control
358 */
359 ip_vs_control_add(cp, ct);
360 ip_vs_conn_put(ct);
361
362 ip_vs_conn_stats(cp, svc);
363 return cp;
364 }
365
366
367 /*
368 * IPVS main scheduling function
369 * It selects a server according to the virtual service, and
370 * creates a connection entry.
371 * Protocols supported: TCP, UDP
372 */
373 struct ip_vs_conn *
374 ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
375 {
376 struct ip_vs_conn *cp = NULL;
377 struct ip_vs_iphdr iph;
378 struct ip_vs_dest *dest;
379 __be16 _ports[2], *pptr;
380
381 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
382 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
383 if (pptr == NULL)
384 return NULL;
385
386 /*
387 * Persistent service
388 */
389 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
390 return ip_vs_sched_persist(svc, skb, pptr);
391
392 /*
393 * Non-persistent service
394 */
395 if (!svc->fwmark && pptr[1] != svc->port) {
396 if (!svc->port)
397 pr_err("Schedule: port zero only supported "
398 "in persistent services, "
399 "check your ipvs configuration\n");
400 return NULL;
401 }
402
403 dest = svc->scheduler->schedule(svc, skb);
404 if (dest == NULL) {
405 IP_VS_DBG(1, "Schedule: no dest found.\n");
406 return NULL;
407 }
408
409 /*
410 * Create a connection entry.
411 */
412 cp = ip_vs_conn_new(svc->af, iph.protocol,
413 &iph.saddr, pptr[0],
414 &iph.daddr, pptr[1],
415 &dest->addr, dest->port ? dest->port : pptr[1],
416 0,
417 dest);
418 if (cp == NULL)
419 return NULL;
420
421 IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
422 "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
423 ip_vs_fwd_tag(cp),
424 IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
425 IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
426 IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
427 cp->flags, atomic_read(&cp->refcnt));
428
429 ip_vs_conn_stats(cp, svc);
430 return cp;
431 }
432
433
434 /*
435 * Pass or drop the packet.
436 * Called by ip_vs_in when the virtual service is available but
437 * no destination is available for a new connection.
438 */
439 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
440 struct ip_vs_protocol *pp)
441 {
442 __be16 _ports[2], *pptr;
443 struct ip_vs_iphdr iph;
444 int unicast;
445 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
446
447 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
448 if (pptr == NULL) {
449 ip_vs_service_put(svc);
450 return NF_DROP;
451 }
452
453 #ifdef CONFIG_IP_VS_IPV6
454 if (svc->af == AF_INET6)
455 unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
456 else
457 #endif
458 unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
459
460 /* if it is a fwmark-based service, the cache_bypass sysctl is up
461 and the destination is a non-local unicast, then create
462 a cache_bypass connection entry */
463 if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
464 int ret, cs;
465 struct ip_vs_conn *cp;
466 union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
467
468 ip_vs_service_put(svc);
469
470 /* create a new connection entry */
471 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
472 cp = ip_vs_conn_new(svc->af, iph.protocol,
473 &iph.saddr, pptr[0],
474 &iph.daddr, pptr[1],
475 &daddr, 0,
476 IP_VS_CONN_F_BYPASS,
477 NULL);
478 if (cp == NULL)
479 return NF_DROP;
480
481 /* statistics */
482 ip_vs_in_stats(cp, skb);
483
484 /* set state */
485 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
486
487 /* transmit the first SYN packet */
488 ret = cp->packet_xmit(skb, cp, pp);
489 /* do not touch skb anymore */
490
491 atomic_inc(&cp->in_pkts);
492 ip_vs_conn_put(cp);
493 return ret;
494 }
495
496 /*
497 * When a virtual ftp service is present, packets destined for
498 * other services on the VIP may get here (except for services
499 * listed in the ipvs table); pass them through, because it is
500 * not ipvs's job to decide to drop them.
501 */
502 if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
503 ip_vs_service_put(svc);
504 return NF_ACCEPT;
505 }
506
507 ip_vs_service_put(svc);
508
509 /*
510 * Notify the client that the destination is unreachable, and
511 * release the socket buffer.
512 * Since this is at the IP layer, the TCP socket is not actually
513 * created and a TCP RST packet cannot be sent; instead,
514 * ICMP_PORT_UNREACH is sent here whether the packet is TCP or UDP. --WZ
515 */
516 #ifdef CONFIG_IP_VS_IPV6
517 if (svc->af == AF_INET6)
518 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
519 else
520 #endif
521 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
522
523 return NF_DROP;
524 }
525
526
527 /*
528 * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
529 * chain, and is used for VS/NAT.
530 * It detects packets for VS/NAT connections and sends them out
531 * immediately. This prevents iptable_nat from mangling packets
532 * that belong to VS/NAT.
533 */
534 static unsigned int ip_vs_post_routing(unsigned int hooknum,
535 struct sk_buff *skb,
536 const struct net_device *in,
537 const struct net_device *out,
538 int (*okfn)(struct sk_buff *))
539 {
540 if (!skb->ipvs_property)
541 return NF_ACCEPT;
542 /* The packet was sent from IPVS, exit this chain */
543 return NF_STOP;
544 }
545
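/*
 * Fold the checksum over the skb data starting at 'offset'.  The result
 * is 0 when a checksum embedded in that range is valid; with the
 * checksum field zeroed beforehand, it yields a fresh checksum value.
 */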
546 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
547 {
548 return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
549 }
550
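/*
 * Reassemble IPv4 fragments for the given defragmentation user.  When
 * ip_defrag() reports completion (returns 0), refresh the IP header
 * checksum of the rebuilt packet.
 */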
551 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
552 {
553 int err = ip_defrag(skb, user);
554
555 if (!err)
556 ip_send_check(ip_hdr(skb));
557
558 return err;
559 }
560
561 #ifdef CONFIG_IP_VS_IPV6
562 static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
563 {
564 /* TODO IPv6: Find out what to do here for IPv6 */
565 return 0;
566 }
567 #endif
568
569 /*
570 * Packet has been made sufficiently writable in caller
571 * - inout: 1=in->out, 0=out->in
572 */
573 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
574 struct ip_vs_conn *cp, int inout)
575 {
576 struct iphdr *iph = ip_hdr(skb);
577 unsigned int icmp_offset = iph->ihl*4;
578 struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) +
579 icmp_offset);
580 struct iphdr *ciph = (struct iphdr *)(icmph + 1);
581
582 if (inout) {
583 iph->saddr = cp->vaddr.ip;
584 ip_send_check(iph);
585 ciph->daddr = cp->vaddr.ip;
586 ip_send_check(ciph);
587 } else {
588 iph->daddr = cp->daddr.ip;
589 ip_send_check(iph);
590 ciph->saddr = cp->daddr.ip;
591 ip_send_check(ciph);
592 }
593
594 /* the TCP/UDP/SCTP port */
595 if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
596 IPPROTO_SCTP == ciph->protocol) {
597 __be16 *ports = (void *)ciph + ciph->ihl*4;
598
599 if (inout)
600 ports[1] = cp->vport;
601 else
602 ports[0] = cp->dport;
603 }
604
605 /* And finally the ICMP checksum */
606 icmph->checksum = 0;
607 icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
608 skb->ip_summed = CHECKSUM_UNNECESSARY;
609
610 if (inout)
611 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
612 "Forwarding altered outgoing ICMP");
613 else
614 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
615 "Forwarding altered incoming ICMP");
616 }
617
618 #ifdef CONFIG_IP_VS_IPV6
619 void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
620 struct ip_vs_conn *cp, int inout)
621 {
622 struct ipv6hdr *iph = ipv6_hdr(skb);
623 unsigned int icmp_offset = sizeof(struct ipv6hdr);
624 struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) +
625 icmp_offset);
626 struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1);
627
628 if (inout) {
629 iph->saddr = cp->vaddr.in6;
630 ciph->daddr = cp->vaddr.in6;
631 } else {
632 iph->daddr = cp->daddr.in6;
633 ciph->saddr = cp->daddr.in6;
634 }
635
636 /* the TCP/UDP/SCTP port */
637 if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr ||
638 IPPROTO_SCTP == ciph->nexthdr) {
639 __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
640
641 if (inout)
642 ports[1] = cp->vport;
643 else
644 ports[0] = cp->dport;
645 }
646
647 /* And finally the ICMP checksum */
648 icmph->icmp6_cksum = 0;
649 /* TODO IPv6: is this correct for ICMPv6? */
650 ip_vs_checksum_complete(skb, icmp_offset);
651 skb->ip_summed = CHECKSUM_UNNECESSARY;
652
653 if (inout)
654 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
655 "Forwarding altered outgoing ICMPv6");
656 else
657 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
658 "Forwarding altered incoming ICMPv6");
659 }
660 #endif
661
662 /* Handle relevant response ICMP messages - forward to the right
663 * destination host. Used for NAT and local client.
664 */
665 static int handle_response_icmp(int af, struct sk_buff *skb,
666 union nf_inet_addr *snet,
667 __u8 protocol, struct ip_vs_conn *cp,
668 struct ip_vs_protocol *pp,
669 unsigned int offset, unsigned int ihl)
670 {
671 unsigned int verdict = NF_DROP;
672
673 if (IP_VS_FWD_METHOD(cp) != 0) {
674 pr_err("shouldn't reach here, because the box is on the "
675 "half connection in the tun/dr module.\n");
676 }
677
678 /* Ensure the checksum is correct */
679 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
680 /* Failed checksum! */
681 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
682 IP_VS_DBG_ADDR(af, snet));
683 goto out;
684 }
685
686 if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
687 IPPROTO_SCTP == protocol)
688 offset += 2 * sizeof(__u16);
689 if (!skb_make_writable(skb, offset))
690 goto out;
691
692 #ifdef CONFIG_IP_VS_IPV6
693 if (af == AF_INET6)
694 ip_vs_nat_icmp_v6(skb, pp, cp, 1);
695 else
696 #endif
697 ip_vs_nat_icmp(skb, pp, cp, 1);
698
699 /* do the statistics and put it back */
700 ip_vs_out_stats(cp, skb);
701
702 skb->ipvs_property = 1;
703 verdict = NF_ACCEPT;
704
705 out:
706 __ip_vs_conn_put(cp);
707
708 return verdict;
709 }
710
711 /*
712 * Handle ICMP messages in the inside-to-outside direction (outgoing).
713 * Find any that might be relevant, check against existing connections.
714 * Currently handles error types - unreachable, quench, ttl exceeded.
715 */
716 static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
717 {
718 struct iphdr *iph;
719 struct icmphdr _icmph, *ic;
720 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
721 struct ip_vs_iphdr ciph;
722 struct ip_vs_conn *cp;
723 struct ip_vs_protocol *pp;
724 unsigned int offset, ihl;
725 union nf_inet_addr snet;
726
727 *related = 1;
728
729 /* reassemble IP fragments */
730 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
731 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
732 return NF_STOLEN;
733 }
734
735 iph = ip_hdr(skb);
736 offset = ihl = iph->ihl * 4;
737 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
738 if (ic == NULL)
739 return NF_DROP;
740
741 IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
742 ic->type, ntohs(icmp_id(ic)),
743 &iph->saddr, &iph->daddr);
744
745 /*
746 * Work through seeing if this is for us.
747 * These checks are supposed to be in an order that means easy
748 * things are checked first to speed up processing.... however
749 * this means that some packets will manage to get a long way
750 * down this stack and then be rejected, but that's life.
751 */
752 if ((ic->type != ICMP_DEST_UNREACH) &&
753 (ic->type != ICMP_SOURCE_QUENCH) &&
754 (ic->type != ICMP_TIME_EXCEEDED)) {
755 *related = 0;
756 return NF_ACCEPT;
757 }
758
759 /* Now find the contained IP header */
760 offset += sizeof(_icmph);
761 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
762 if (cih == NULL)
763 return NF_ACCEPT; /* The packet looks wrong, ignore */
764
765 pp = ip_vs_proto_get(cih->protocol);
766 if (!pp)
767 return NF_ACCEPT;
768
769 /* Is the embedded protocol header present? */
770 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
771 pp->dont_defrag))
772 return NF_ACCEPT;
773
774 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
775
776 offset += cih->ihl * 4;
777
778 ip_vs_fill_iphdr(AF_INET, cih, &ciph);
779 /* The embedded headers contain source and dest in reverse order */
780 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
781 if (!cp)
782 return NF_ACCEPT;
783
784 snet.ip = iph->saddr;
785 return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
786 pp, offset, ihl);
787 }
788
789 #ifdef CONFIG_IP_VS_IPV6
790 static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
791 {
792 struct ipv6hdr *iph;
793 struct icmp6hdr _icmph, *ic;
794 struct ipv6hdr _ciph, *cih; /* The ip header contained
795 within the ICMP */
796 struct ip_vs_iphdr ciph;
797 struct ip_vs_conn *cp;
798 struct ip_vs_protocol *pp;
799 unsigned int offset;
800 union nf_inet_addr snet;
801
802 *related = 1;
803
804 /* reassemble IP fragments */
805 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
806 if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
807 return NF_STOLEN;
808 }
809
810 iph = ipv6_hdr(skb);
811 offset = sizeof(struct ipv6hdr);
812 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
813 if (ic == NULL)
814 return NF_DROP;
815
816 IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
817 ic->icmp6_type, ntohs(icmpv6_id(ic)),
818 &iph->saddr, &iph->daddr);
819
820 /*
821 * Work through seeing if this is for us.
822 * These checks are supposed to be in an order that means easy
823 * things are checked first to speed up processing.... however
824 * this means that some packets will manage to get a long way
825 * down this stack and then be rejected, but that's life.
826 */
827 if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
828 (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
829 (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
830 *related = 0;
831 return NF_ACCEPT;
832 }
833
834 /* Now find the contained IP header */
835 offset += sizeof(_icmph);
836 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
837 if (cih == NULL)
838 return NF_ACCEPT; /* The packet looks wrong, ignore */
839
840 pp = ip_vs_proto_get(cih->nexthdr);
841 if (!pp)
842 return NF_ACCEPT;
843
844 /* Is the embedded protocol header present? */
845 /* TODO: we don't support fragmentation at the moment anyways */
846 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
847 return NF_ACCEPT;
848
849 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
850
851 offset += sizeof(struct ipv6hdr);
852
853 ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
854 /* The embedded headers contain source and dest in reverse order */
855 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
856 if (!cp)
857 return NF_ACCEPT;
858
859 ipv6_addr_copy(&snet.in6, &iph->saddr);
860 return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
861 pp, offset, sizeof(struct ipv6hdr));
862 }
863 #endif
864
865 /*
867 * Check if the SCTP chunk is an ABORT chunk
867 */
868 static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
869 {
870 sctp_chunkhdr_t *sch, schunk;
871 sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t),
872 sizeof(schunk), &schunk);
873 if (sch == NULL)
874 return 0;
875 if (sch->type == SCTP_CID_ABORT)
876 return 1;
877 return 0;
878 }
879
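/*
 * Check if the TCP header following an nh_len-byte network header has
 * the RST flag set; returns 0 if the header cannot be read.
 */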
880 static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
881 {
882 struct tcphdr _tcph, *th;
883
884 th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
885 if (th == NULL)
886 return 0;
887 return th->rst;
888 }
889
890 /* Handle response packets: rewrite addresses and send away...
891 * Used for NAT and local client.
892 */
893 static unsigned int
894 handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
895 struct ip_vs_conn *cp, int ihl)
896 {
897 IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
898
899 if (!skb_make_writable(skb, ihl))
900 goto drop;
901
902 /* mangle the packet */
903 if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
904 goto drop;
905
906 #ifdef CONFIG_IP_VS_IPV6
907 if (af == AF_INET6)
908 ipv6_hdr(skb)->saddr = cp->vaddr.in6;
909 else
910 #endif
911 {
912 ip_hdr(skb)->saddr = cp->vaddr.ip;
913 ip_send_check(ip_hdr(skb));
914 }
915
916 /* For policy routing, packets originating from this
917 * machine itself may be routed differently to packets
918 * passing through. We want this packet to be routed as
919 * if it came from this machine itself. So re-compute
920 * the routing information.
921 */
922 #ifdef CONFIG_IP_VS_IPV6
923 if (af == AF_INET6) {
924 if (ip6_route_me_harder(skb) != 0)
925 goto drop;
926 } else
927 #endif
928 if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
929 goto drop;
930
931 IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
932
933 ip_vs_out_stats(cp, skb);
934 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
935 ip_vs_conn_put(cp);
936
937 skb->ipvs_property = 1;
938
939 LeaveFunction(11);
940 return NF_ACCEPT;
941
942 drop:
943 ip_vs_conn_put(cp);
944 kfree_skb(skb);
945 return NF_STOLEN;
946 }
947
948 /*
949 * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
950 * Check if outgoing packet belongs to the established ip_vs_conn.
951 */
952 static unsigned int
953 ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
954 const struct net_device *in, const struct net_device *out,
955 int (*okfn)(struct sk_buff *))
956 {
957 struct ip_vs_iphdr iph;
958 struct ip_vs_protocol *pp;
959 struct ip_vs_conn *cp;
960 int af;
961
962 EnterFunction(11);
963
964 af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
965
966 if (skb->ipvs_property)
967 return NF_ACCEPT;
968
969 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
970 #ifdef CONFIG_IP_VS_IPV6
971 if (af == AF_INET6) {
972 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
973 int related, verdict = ip_vs_out_icmp_v6(skb, &related);
974
975 if (related)
976 return verdict;
977 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
978 }
979 } else
980 #endif
981 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
982 int related, verdict = ip_vs_out_icmp(skb, &related);
983
984 if (related)
985 return verdict;
986 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
987 }
988
989 pp = ip_vs_proto_get(iph.protocol);
990 if (unlikely(!pp))
991 return NF_ACCEPT;
992
993 /* reassemble IP fragments */
994 #ifdef CONFIG_IP_VS_IPV6
995 if (af == AF_INET6) {
996 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
997 int related, verdict = ip_vs_out_icmp_v6(skb, &related);
998
999 if (related)
1000 return verdict;
1001
1002 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1003 }
1004 } else
1005 #endif
1006 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
1007 !pp->dont_defrag)) {
1008 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
1009 return NF_STOLEN;
1010
1011 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1012 }
1013
1014 /*
1015 * Check if the packet belongs to an existing entry
1016 */
1017 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1018
1019 if (unlikely(!cp)) {
1020 if (sysctl_ip_vs_nat_icmp_send &&
1021 (pp->protocol == IPPROTO_TCP ||
1022 pp->protocol == IPPROTO_UDP ||
1023 pp->protocol == IPPROTO_SCTP)) {
1024 __be16 _ports[2], *pptr;
1025
1026 pptr = skb_header_pointer(skb, iph.len,
1027 sizeof(_ports), _ports);
1028 if (pptr == NULL)
1029 return NF_ACCEPT; /* Not for me */
1030 if (ip_vs_lookup_real_service(af, iph.protocol,
1031 &iph.saddr,
1032 pptr[0])) {
1033 /*
1034 * Notify the real server that there is
1035 * no existing entry, unless the packet
1036 * is a TCP RST or an SCTP ABORT.
1037 */
1038 if ((iph.protocol != IPPROTO_TCP &&
1039 iph.protocol != IPPROTO_SCTP)
1040 || ((iph.protocol == IPPROTO_TCP
1041 && !is_tcp_reset(skb, iph.len))
1042 || (iph.protocol == IPPROTO_SCTP
1043 && !is_sctp_abort(skb,
1044 iph.len)))) {
1045 #ifdef CONFIG_IP_VS_IPV6
1046 if (af == AF_INET6)
1047 icmpv6_send(skb,
1048 ICMPV6_DEST_UNREACH,
1049 ICMPV6_PORT_UNREACH,
1050 0);
1051 else
1052 #endif
1053 icmp_send(skb,
1054 ICMP_DEST_UNREACH,
1055 ICMP_PORT_UNREACH, 0);
1056 return NF_DROP;
1057 }
1058 }
1059 }
1060 IP_VS_DBG_PKT(12, pp, skb, 0,
1061 "packet continues traversal as normal");
1062 return NF_ACCEPT;
1063 }
1064
1065 return handle_response(af, skb, pp, cp, iph.len);
1066 }
1067
1068
1069 /*
1070 * Handle ICMP messages in the outside-to-inside direction (incoming).
1071 * Find any that might be relevant, check against existing connections,
1072 * forward to the right destination host if relevant.
1073 * Currently handles error types - unreachable, quench, ttl exceeded.
1074 */
1075 static int
1076 ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1077 {
1078 struct iphdr *iph;
1079 struct icmphdr _icmph, *ic;
1080 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
1081 struct ip_vs_iphdr ciph;
1082 struct ip_vs_conn *cp;
1083 struct ip_vs_protocol *pp;
1084 unsigned int offset, ihl, verdict;
1085 union nf_inet_addr snet;
1086
1087 *related = 1;
1088
1089 /* reassemble IP fragments */
1090 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1091 if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
1092 IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
1093 return NF_STOLEN;
1094 }
1095
1096 iph = ip_hdr(skb);
1097 offset = ihl = iph->ihl * 4;
1098 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1099 if (ic == NULL)
1100 return NF_DROP;
1101
1102 IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
1103 ic->type, ntohs(icmp_id(ic)),
1104 &iph->saddr, &iph->daddr);
1105
1106 /*
1107 * Work through seeing if this is for us.
1108 * These checks are supposed to be in an order that means easy
1109 * things are checked first to speed up processing.... however
1110 * this means that some packets will manage to get a long way
1111 * down this stack and then be rejected, but that's life.
1112 */
1113 if ((ic->type != ICMP_DEST_UNREACH) &&
1114 (ic->type != ICMP_SOURCE_QUENCH) &&
1115 (ic->type != ICMP_TIME_EXCEEDED)) {
1116 *related = 0;
1117 return NF_ACCEPT;
1118 }
1119
1120 /* Now find the contained IP header */
1121 offset += sizeof(_icmph);
1122 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1123 if (cih == NULL)
1124 return NF_ACCEPT; /* The packet looks wrong, ignore */
1125
1126 pp = ip_vs_proto_get(cih->protocol);
1127 if (!pp)
1128 return NF_ACCEPT;
1129
1130 /* Is the embedded protocol header present? */
1131 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1132 pp->dont_defrag))
1133 return NF_ACCEPT;
1134
1135 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
1136
1137 offset += cih->ihl * 4;
1138
1139 ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1140 /* The embedded headers contain source and dest in reverse order */
1141 cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
1142 if (!cp) {
1143 /* The packet could also belong to a local client */
1144 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
1145 if (cp) {
1146 snet.ip = iph->saddr;
1147 return handle_response_icmp(AF_INET, skb, &snet,
1148 cih->protocol, cp, pp,
1149 offset, ihl);
1150 }
1151 return NF_ACCEPT;
1152 }
1153
1154 verdict = NF_DROP;
1155
1156 /* Ensure the checksum is correct */
1157 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1158 /* Failed checksum! */
1159 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
1160 &iph->saddr);
1161 goto out;
1162 }
1163
1164 /* do the statistics and put it back */
1165 ip_vs_in_stats(cp, skb);
1166 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1167 offset += 2 * sizeof(__u16);
1168 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
1169 /* do not touch skb anymore */
1170
1171 out:
1172 __ip_vs_conn_put(cp);
1173
1174 return verdict;
1175 }
1176
1177 #ifdef CONFIG_IP_VS_IPV6
1178 static int
1179 ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1180 {
1181 struct ipv6hdr *iph;
1182 struct icmp6hdr _icmph, *ic;
1183 struct ipv6hdr _ciph, *cih; /* The ip header contained
1184 within the ICMP */
1185 struct ip_vs_iphdr ciph;
1186 struct ip_vs_conn *cp;
1187 struct ip_vs_protocol *pp;
1188 unsigned int offset, verdict;
1189 union nf_inet_addr snet;
1190
1191 *related = 1;
1192
1193 /* reassemble IP fragments */
1194 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1195 if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
1196 IP_DEFRAG_VS_IN :
1197 IP_DEFRAG_VS_FWD))
1198 return NF_STOLEN;
1199 }
1200
1201 iph = ipv6_hdr(skb);
1202 offset = sizeof(struct ipv6hdr);
1203 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1204 if (ic == NULL)
1205 return NF_DROP;
1206
1207 IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n",
1208 ic->icmp6_type, ntohs(icmpv6_id(ic)),
1209 &iph->saddr, &iph->daddr);
1210
1211 /*
1212 * Work through seeing if this is for us.
1213 * These checks are supposed to be in an order that means easy
1214 * things are checked first to speed up processing.... however
1215 * this means that some packets will manage to get a long way
1216 * down this stack and then be rejected, but that's life.
1217 */
1218 if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
1219 (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
1220 (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
1221 *related = 0;
1222 return NF_ACCEPT;
1223 }
1224
1225 /* Now find the contained IP header */
1226 offset += sizeof(_icmph);
1227 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1228 if (cih == NULL)
1229 return NF_ACCEPT; /* The packet looks wrong, ignore */
1230
1231 pp = ip_vs_proto_get(cih->nexthdr);
1232 if (!pp)
1233 return NF_ACCEPT;
1234
1235 /* Is the embedded protocol header present? */
1236 /* TODO: we don't support fragmentation at the moment anyways */
1237 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
1238 return NF_ACCEPT;
1239
1240 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
1241
1242 offset += sizeof(struct ipv6hdr);
1243
1244 ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
1245 /* The embedded headers contain source and dest in reverse order */
1246 cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
1247 if (!cp) {
1248 /* The packet could also belong to a local client */
1249 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
1250 if (cp) {
1251 ipv6_addr_copy(&snet.in6, &iph->saddr);
1252 return handle_response_icmp(AF_INET6, skb, &snet,
1253 cih->nexthdr,
1254 cp, pp, offset,
1255 sizeof(struct ipv6hdr));
1256 }
1257 return NF_ACCEPT;
1258 }
1259
1260 verdict = NF_DROP;
1261
1262 /* do the statistics and put it back */
1263 ip_vs_in_stats(cp, skb);
1264 if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr ||
1265 IPPROTO_SCTP == cih->nexthdr)
1266 offset += 2 * sizeof(__u16);
1267 verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
1268 /* do not touch skb anymore */
1269
1270 __ip_vs_conn_put(cp);
1271
1272 return verdict;
1273 }
1274 #endif
1275
1276
1277 /*
1278 * Check if it's for virtual services, look it up,
1279 * and send it on its way...
1280 */
1281 static unsigned int
1282 ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
1283 const struct net_device *in, const struct net_device *out,
1284 int (*okfn)(struct sk_buff *))
1285 {
1286 struct ip_vs_iphdr iph;
1287 struct ip_vs_protocol *pp;
1288 struct ip_vs_conn *cp;
1289 int ret, restart, af, pkts;
1290
1291 af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
1292
1293 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1294
1295 /*
1296 * Big tappo: only PACKET_HOST, including loopback for local client
1297 * Don't handle local packets on IPv6 for now
1298 */
1299 if (unlikely(skb->pkt_type != PACKET_HOST)) {
1300 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
1301 skb->pkt_type,
1302 iph.protocol,
1303 IP_VS_DBG_ADDR(af, &iph.daddr));
1304 return NF_ACCEPT;
1305 }
1306
1307 #ifdef CONFIG_IP_VS_IPV6
1308 if (af == AF_INET6) {
1309 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1310 int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
1311
1312 if (related)
1313 return verdict;
1314 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1315 }
1316 } else
1317 #endif
1318 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1319 int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
1320
1321 if (related)
1322 return verdict;
1323 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1324 }
1325
1326 /* Protocol supported? */
1327 pp = ip_vs_proto_get(iph.protocol);
1328 if (unlikely(!pp))
1329 return NF_ACCEPT;
1330
1331 /*
1332 * Check if the packet belongs to an existing connection entry
1333 */
1334 cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
1335
1336 if (unlikely(!cp)) {
1337 int v;
1338
1339 /* For local client packets, it could be a response */
1340 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1341 if (cp)
1342 return handle_response(af, skb, pp, cp, iph.len);
1343
1344 if (!pp->conn_schedule(af, skb, pp, &v, &cp))
1345 return v;
1346 }
1347
1348 if (unlikely(!cp)) {
1349 /* sorry, all this trouble for a no-hit :) */
1350 IP_VS_DBG_PKT(12, pp, skb, 0,
1351 "packet continues traversal as normal");
1352 return NF_ACCEPT;
1353 }
1354
1355 IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
1356
1357 /* Check the server status */
1358 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1359 /* the destination server is not available */
1360
1361 if (sysctl_ip_vs_expire_nodest_conn) {
1362 /* try to expire the connection immediately */
1363 ip_vs_conn_expire_now(cp);
1364 }
1365 /* don't restart its timer, and silently
1366 drop the packet. */
1367 __ip_vs_conn_put(cp);
1368 return NF_DROP;
1369 }
1370
1371 ip_vs_in_stats(cp, skb);
1372 restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
1373 if (cp->packet_xmit)
1374 ret = cp->packet_xmit(skb, cp, pp);
1375 /* do not touch skb anymore */
1376 else {
1377 IP_VS_DBG_RL("warning: packet_xmit is null");
1378 ret = NF_ACCEPT;
1379 }
1380
1381 /* Increase its packet counter and check whether it needs
1382 * to be synchronized
1383 *
1384 * Sync the connection if it is about to close, to
1385 * encourage the standby servers to update the connection's timeout
1386 */
1387 pkts = atomic_add_return(1, &cp->in_pkts);
1388 if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1389 cp->protocol == IPPROTO_SCTP) {
1390 if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
1391 (atomic_read(&cp->in_pkts) %
1392 sysctl_ip_vs_sync_threshold[1]
1393 == sysctl_ip_vs_sync_threshold[0])) ||
1394 (cp->old_state != cp->state &&
1395 ((cp->state == IP_VS_SCTP_S_CLOSED) ||
1396 (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
1397 (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
1398 ip_vs_sync_conn(cp);
1399 goto out;
1400 }
1401 }
1402
1403 if (af == AF_INET &&
1404 (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1405 (((cp->protocol != IPPROTO_TCP ||
1406 cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1407 (pkts % sysctl_ip_vs_sync_threshold[1]
1408 == sysctl_ip_vs_sync_threshold[0])) ||
1409 ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
1410 ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
1411 (cp->state == IP_VS_TCP_S_CLOSE) ||
1412 (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
1413 (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
1414 ip_vs_sync_conn(cp);
1415 out:
1416 cp->old_state = cp->state;
1417
1418 ip_vs_conn_put(cp);
1419 return ret;
1420 }
1421
1422
1423 /*
1424 * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1425 * related packets destined for 0.0.0.0/0.
1426 * When fwmark-based virtual service is used, such as transparent
1427 * cache cluster, TCP packets can be marked and routed to ip_vs_in,
1428 * but ICMP destined for 0.0.0.0/0 cannot be easily marked and
1429 * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1430 * and send them to ip_vs_in_icmp.
1431 */
1432 static unsigned int
1433 ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1434 const struct net_device *in, const struct net_device *out,
1435 int (*okfn)(struct sk_buff *))
1436 {
1437 int r;
1438
1439 if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1440 return NF_ACCEPT;
1441
1442 return ip_vs_in_icmp(skb, &r, hooknum);
1443 }
1444
1445 #ifdef CONFIG_IP_VS_IPV6
1446 static unsigned int
1447 ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1448 const struct net_device *in, const struct net_device *out,
1449 int (*okfn)(struct sk_buff *))
1450 {
1451 int r;
1452
1453 if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
1454 return NF_ACCEPT;
1455
1456 return ip_vs_in_icmp_v6(skb, &r, hooknum);
1457 }
1458 #endif
1459
1460
1461 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1462 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1463 * or VS/NAT(change destination), so that filtering rules can be
1464 * applied to IPVS. */
1465 {
1466 .hook = ip_vs_in,
1467 .owner = THIS_MODULE,
1468 .pf = PF_INET,
1469 .hooknum = NF_INET_LOCAL_IN,
1470 .priority = 100,
1471 },
1472 /* After packet filtering, change source only for VS/NAT */
1473 {
1474 .hook = ip_vs_out,
1475 .owner = THIS_MODULE,
1476 .pf = PF_INET,
1477 .hooknum = NF_INET_FORWARD,
1478 .priority = 100,
1479 },
1480 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1481 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1482 {
1483 .hook = ip_vs_forward_icmp,
1484 .owner = THIS_MODULE,
1485 .pf = PF_INET,
1486 .hooknum = NF_INET_FORWARD,
1487 .priority = 99,
1488 },
1489 /* Before the netfilter connection tracking, exit from POST_ROUTING */
1490 {
1491 .hook = ip_vs_post_routing,
1492 .owner = THIS_MODULE,
1493 .pf = PF_INET,
1494 .hooknum = NF_INET_POST_ROUTING,
1495 .priority = NF_IP_PRI_NAT_SRC-1,
1496 },
1497 #ifdef CONFIG_IP_VS_IPV6
1498 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1499 * or VS/NAT(change destination), so that filtering rules can be
1500 * applied to IPVS. */
1501 {
1502 .hook = ip_vs_in,
1503 .owner = THIS_MODULE,
1504 .pf = PF_INET6,
1505 .hooknum = NF_INET_LOCAL_IN,
1506 .priority = 100,
1507 },
1508 /* After packet filtering, change source only for VS/NAT */
1509 {
1510 .hook = ip_vs_out,
1511 .owner = THIS_MODULE,
1512 .pf = PF_INET6,
1513 .hooknum = NF_INET_FORWARD,
1514 .priority = 100,
1515 },
1516 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1517 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1518 {
1519 .hook = ip_vs_forward_icmp_v6,
1520 .owner = THIS_MODULE,
1521 .pf = PF_INET6,
1522 .hooknum = NF_INET_FORWARD,
1523 .priority = 99,
1524 },
1525 /* Before the netfilter connection tracking, exit from POST_ROUTING */
1526 {
1527 .hook = ip_vs_post_routing,
1528 .owner = THIS_MODULE,
1529 .pf = PF_INET6,
1530 .hooknum = NF_INET_POST_ROUTING,
1531 .priority = NF_IP6_PRI_NAT_SRC-1,
1532 },
1533 #endif
1534 };
1535
1536
1537 /*
1538 * Initialize IP Virtual Server
1539 */
1540 static int __init ip_vs_init(void)
1541 {
1542 int ret;
1543
1544 ip_vs_estimator_init();
1545
1546 ret = ip_vs_control_init();
1547 if (ret < 0) {
1548 pr_err("can't setup control.\n");
1549 goto cleanup_estimator;
1550 }
1551
1552 ip_vs_protocol_init();
1553
1554 ret = ip_vs_app_init();
1555 if (ret < 0) {
1556 pr_err("can't setup application helper.\n");
1557 goto cleanup_protocol;
1558 }
1559
1560 ret = ip_vs_conn_init();
1561 if (ret < 0) {
1562 pr_err("can't setup connection table.\n");
1563 goto cleanup_app;
1564 }
1565
1566 ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1567 if (ret < 0) {
1568 pr_err("can't register hooks.\n");
1569 goto cleanup_conn;
1570 }
1571
1572 pr_info("ipvs loaded.\n");
1573 return ret;
1574
1575 cleanup_conn:
1576 ip_vs_conn_cleanup();
1577 cleanup_app:
1578 ip_vs_app_cleanup();
1579 cleanup_protocol:
1580 ip_vs_protocol_cleanup();
1581 ip_vs_control_cleanup();
1582 cleanup_estimator:
1583 ip_vs_estimator_cleanup();
1584 return ret;
1585 }
1586
1587 static void __exit ip_vs_cleanup(void)
1588 {
1589 nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1590 ip_vs_conn_cleanup();
1591 ip_vs_app_cleanup();
1592 ip_vs_protocol_cleanup();
1593 ip_vs_control_cleanup();
1594 ip_vs_estimator_cleanup();
1595 pr_info("ipvs unloaded.\n");
1596 }
1597
1598 module_init(ip_vs_init);
1599 module_exit(ip_vs_cleanup);
1600 MODULE_LICENSE("GPL");