net/netfilter/ipvs/ip_vs_core.c
1 /*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 * Julian Anastasov <ja@ssi.bg>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19 * and others.
20 *
21 * Changes:
22 * Paul `Rusty' Russell properly handle non-linear skbs
23 * Harald Welte don't use nfcache
24 *
25 */
26
27 #define KMSG_COMPONENT "IPVS"
28 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
29
30 #include <linux/module.h>
31 #include <linux/kernel.h>
32 #include <linux/ip.h>
33 #include <linux/tcp.h>
34 #include <linux/icmp.h>
35
36 #include <net/ip.h>
37 #include <net/tcp.h>
38 #include <net/udp.h>
39 #include <net/icmp.h> /* for icmp_send */
40 #include <net/route.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv4.h>
44
45 #ifdef CONFIG_IP_VS_IPV6
46 #include <net/ipv6.h>
47 #include <linux/netfilter_ipv6.h>
48 #endif
49
50 #include <net/ip_vs.h>
51
52
53 EXPORT_SYMBOL(register_ip_vs_scheduler);
54 EXPORT_SYMBOL(unregister_ip_vs_scheduler);
55 EXPORT_SYMBOL(ip_vs_skb_replace);
56 EXPORT_SYMBOL(ip_vs_proto_name);
57 EXPORT_SYMBOL(ip_vs_conn_new);
58 EXPORT_SYMBOL(ip_vs_conn_in_get);
59 EXPORT_SYMBOL(ip_vs_conn_out_get);
60 #ifdef CONFIG_IP_VS_PROTO_TCP
61 EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
62 #endif
63 EXPORT_SYMBOL(ip_vs_conn_put);
64 #ifdef CONFIG_IP_VS_DEBUG
65 EXPORT_SYMBOL(ip_vs_get_debug_level);
66 #endif
67
68
69 /* ID used in ICMP lookups */
70 #define icmp_id(icmph) (((icmph)->un).echo.id)
71 #define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier)
72
73 const char *ip_vs_proto_name(unsigned proto)
74 {
75 static char buf[20];
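 /* NB: unknown protocols are formatted into this static buffer below,
  * so the returned string is not re-entrant. */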
76
77 switch (proto) {
78 case IPPROTO_IP:
79 return "IP";
80 case IPPROTO_UDP:
81 return "UDP";
82 case IPPROTO_TCP:
83 return "TCP";
84 case IPPROTO_ICMP:
85 return "ICMP";
86 #ifdef CONFIG_IP_VS_IPV6
87 case IPPROTO_ICMPV6:
88 return "ICMPv6";
89 #endif
90 default:
91 sprintf(buf, "IP_%d", proto);
92 return buf;
93 }
94 }
95
96 void ip_vs_init_hash_table(struct list_head *table, int rows)
97 {
98 while (--rows >= 0)
99 INIT_LIST_HEAD(&table[rows]);
100 }
101
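/*
 * Traffic accounting helpers: each one updates three counter sets,
 * the real server (cp->dest), its virtual service (dest->svc) and the
 * global ip_vs_stats, each protected by its own stats lock.
 */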
102 static inline void
103 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
104 {
105 struct ip_vs_dest *dest = cp->dest;
106 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
107 spin_lock(&dest->stats.lock);
108 dest->stats.ustats.inpkts++;
109 dest->stats.ustats.inbytes += skb->len;
110 spin_unlock(&dest->stats.lock);
111
112 spin_lock(&dest->svc->stats.lock);
113 dest->svc->stats.ustats.inpkts++;
114 dest->svc->stats.ustats.inbytes += skb->len;
115 spin_unlock(&dest->svc->stats.lock);
116
117 spin_lock(&ip_vs_stats.lock);
118 ip_vs_stats.ustats.inpkts++;
119 ip_vs_stats.ustats.inbytes += skb->len;
120 spin_unlock(&ip_vs_stats.lock);
121 }
122 }
123
124
125 static inline void
126 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
127 {
128 struct ip_vs_dest *dest = cp->dest;
129 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
130 spin_lock(&dest->stats.lock);
131 dest->stats.ustats.outpkts++;
132 dest->stats.ustats.outbytes += skb->len;
133 spin_unlock(&dest->stats.lock);
134
135 spin_lock(&dest->svc->stats.lock);
136 dest->svc->stats.ustats.outpkts++;
137 dest->svc->stats.ustats.outbytes += skb->len;
138 spin_unlock(&dest->svc->stats.lock);
139
140 spin_lock(&ip_vs_stats.lock);
141 ip_vs_stats.ustats.outpkts++;
142 ip_vs_stats.ustats.outbytes += skb->len;
143 spin_unlock(&ip_vs_stats.lock);
144 }
145 }
146
147
148 static inline void
149 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
150 {
151 spin_lock(&cp->dest->stats.lock);
152 cp->dest->stats.ustats.conns++;
153 spin_unlock(&cp->dest->stats.lock);
154
155 spin_lock(&svc->stats.lock);
156 svc->stats.ustats.conns++;
157 spin_unlock(&svc->stats.lock);
158
159 spin_lock(&ip_vs_stats.lock);
160 ip_vs_stats.ustats.conns++;
161 spin_unlock(&ip_vs_stats.lock);
162 }
163
164
165 static inline int
166 ip_vs_set_state(struct ip_vs_conn *cp, int direction,
167 const struct sk_buff *skb,
168 struct ip_vs_protocol *pp)
169 {
170 if (unlikely(!pp->state_transition))
171 return 0;
172 return pp->state_transition(cp, direction, skb, pp);
173 }
174
175
176 /*
177 * IPVS persistent scheduling function
 178  * It creates a connection entry according to its template if one exists,
179 * or selects a server and creates a connection entry plus a template.
180 * Locking: we are svc user (svc->refcnt), so we hold all dests too
181 * Protocols supported: TCP, UDP
182 */
183 static struct ip_vs_conn *
184 ip_vs_sched_persist(struct ip_vs_service *svc,
185 const struct sk_buff *skb,
186 __be16 ports[2])
187 {
188 struct ip_vs_conn *cp = NULL;
189 struct ip_vs_iphdr iph;
190 struct ip_vs_dest *dest;
191 struct ip_vs_conn *ct;
192 __be16 dport; /* destination port to forward */
193 union nf_inet_addr snet; /* source network of the client,
194 after masking */
195
196 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
197
198 /* Mask saddr with the netmask to adjust template granularity */
199 #ifdef CONFIG_IP_VS_IPV6
200 if (svc->af == AF_INET6)
201 ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
202 else
203 #endif
204 snet.ip = iph.saddr.ip & svc->netmask;
205
206 IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
207 "mnet %s\n",
208 IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
209 IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
210 IP_VS_DBG_ADDR(svc->af, &snet));
211
 212 	/*
 213 	 * FTP is a complicated protocol: it uses a control connection plus
 214 	 * separate data connections. For active FTP, the server initiates the
 215 	 * data connection to the client, usually from source port 20. For
 216 	 * passive FTP, the server tells the client which port it is listening
 217 	 * on and the client opens the data connection. In tunneling or direct
 218 	 * routing mode the load balancer only sees the client-to-server half
 219 	 * of the connection, so the data port is unknown to it. Therefore a
 220 	 * conn template like <caddr, 0, vaddr, 0, daddr, 0> is created for a
 221 	 * persistent FTP service, and a template like
 222 	 * <caddr, 0, vaddr, vport, daddr, dport> is created for other
 223 	 * persistent services.
 224 	 */
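	/*
	 * Illustrative example (addresses and ports are hypothetical): with
	 * netmask 255.255.255.0, a client 192.0.2.55 hitting VIP 10.0.0.1:80
	 * yields snet 192.0.2.0, so the template <TCP, 192.0.2.0, 0,
	 * 10.0.0.1, 80, RIP, rport> pins every client in 192.0.2.0/24 to the
	 * same real server until the template expires.
	 */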
225 if (ports[1] == svc->port) {
226 /* Check if a template already exists */
227 if (svc->port != FTPPORT)
228 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
229 &iph.daddr, ports[1]);
230 else
231 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
232 &iph.daddr, 0);
233
234 if (!ct || !ip_vs_check_template(ct)) {
235 /*
236 * No template found or the dest of the connection
237 * template is not available.
238 */
239 dest = svc->scheduler->schedule(svc, skb);
240 if (dest == NULL) {
241 IP_VS_DBG(1, "p-schedule: no dest found.\n");
242 return NULL;
243 }
244
245 /*
246 * Create a template like <protocol,caddr,0,
247 * vaddr,vport,daddr,dport> for non-ftp service,
248 * and <protocol,caddr,0,vaddr,0,daddr,0>
249 * for ftp service.
250 */
251 if (svc->port != FTPPORT)
252 ct = ip_vs_conn_new(svc->af, iph.protocol,
253 &snet, 0,
254 &iph.daddr,
255 ports[1],
256 &dest->addr, dest->port,
257 IP_VS_CONN_F_TEMPLATE,
258 dest);
259 else
260 ct = ip_vs_conn_new(svc->af, iph.protocol,
261 &snet, 0,
262 &iph.daddr, 0,
263 &dest->addr, 0,
264 IP_VS_CONN_F_TEMPLATE,
265 dest);
266 if (ct == NULL)
267 return NULL;
268
269 ct->timeout = svc->timeout;
270 } else {
271 /* set destination with the found template */
272 dest = ct->dest;
273 }
274 dport = dest->port;
275 } else {
276 /*
277 * Note: persistent fwmark-based services and persistent
278 * port zero service are handled here.
279 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
280 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
281 */
282 if (svc->fwmark) {
283 union nf_inet_addr fwmark = {
284 .ip = htonl(svc->fwmark)
285 };
286
287 ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
288 &fwmark, 0);
289 } else
290 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
291 &iph.daddr, 0);
292
293 if (!ct || !ip_vs_check_template(ct)) {
294 /*
295 * If it is not persistent port zero, return NULL,
296 * otherwise create a connection template.
297 */
298 if (svc->port)
299 return NULL;
300
301 dest = svc->scheduler->schedule(svc, skb);
302 if (dest == NULL) {
303 IP_VS_DBG(1, "p-schedule: no dest found.\n");
304 return NULL;
305 }
306
307 /*
308 * Create a template according to the service
309 */
310 if (svc->fwmark) {
311 union nf_inet_addr fwmark = {
312 .ip = htonl(svc->fwmark)
313 };
314
315 ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
316 &snet, 0,
317 &fwmark, 0,
318 &dest->addr, 0,
319 IP_VS_CONN_F_TEMPLATE,
320 dest);
321 } else
322 ct = ip_vs_conn_new(svc->af, iph.protocol,
323 &snet, 0,
324 &iph.daddr, 0,
325 &dest->addr, 0,
326 IP_VS_CONN_F_TEMPLATE,
327 dest);
328 if (ct == NULL)
329 return NULL;
330
331 ct->timeout = svc->timeout;
332 } else {
333 /* set destination with the found template */
334 dest = ct->dest;
335 }
336 dport = ports[1];
337 }
338
339 /*
340 * Create a new connection according to the template
341 */
342 cp = ip_vs_conn_new(svc->af, iph.protocol,
343 &iph.saddr, ports[0],
344 &iph.daddr, ports[1],
345 &dest->addr, dport,
346 0,
347 dest);
348 if (cp == NULL) {
349 ip_vs_conn_put(ct);
350 return NULL;
351 }
352
353 /*
354 * Add its control
355 */
356 ip_vs_control_add(cp, ct);
357 ip_vs_conn_put(ct);
358
359 ip_vs_conn_stats(cp, svc);
360 return cp;
361 }
362
363
364 /*
365 * IPVS main scheduling function
366 * It selects a server according to the virtual service, and
367 * creates a connection entry.
368 * Protocols supported: TCP, UDP
369 */
370 struct ip_vs_conn *
371 ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
372 {
373 struct ip_vs_conn *cp = NULL;
374 struct ip_vs_iphdr iph;
375 struct ip_vs_dest *dest;
376 __be16 _ports[2], *pptr;
377
378 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
379 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
380 if (pptr == NULL)
381 return NULL;
382
383 /*
384 * Persistent service
385 */
386 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
387 return ip_vs_sched_persist(svc, skb, pptr);
388
389 /*
390 * Non-persistent service
391 */
392 if (!svc->fwmark && pptr[1] != svc->port) {
393 if (!svc->port)
394 pr_err("Schedule: port zero only supported "
395 "in persistent services, "
396 "check your ipvs configuration\n");
397 return NULL;
398 }
399
400 dest = svc->scheduler->schedule(svc, skb);
401 if (dest == NULL) {
402 IP_VS_DBG(1, "Schedule: no dest found.\n");
403 return NULL;
404 }
405
406 /*
407 * Create a connection entry.
408 */
409 cp = ip_vs_conn_new(svc->af, iph.protocol,
410 &iph.saddr, pptr[0],
411 &iph.daddr, pptr[1],
412 &dest->addr, dest->port ? dest->port : pptr[1],
413 0,
414 dest);
415 if (cp == NULL)
416 return NULL;
417
418 IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
419 "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
420 ip_vs_fwd_tag(cp),
421 IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
422 IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
423 IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
424 cp->flags, atomic_read(&cp->refcnt));
425
426 ip_vs_conn_stats(cp, svc);
427 return cp;
428 }
429
430
431 /*
432 * Pass or drop the packet.
433 * Called by ip_vs_in, when the virtual service is available but
434 * no destination is available for a new connection.
435 */
436 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
437 struct ip_vs_protocol *pp)
438 {
439 __be16 _ports[2], *pptr;
440 struct ip_vs_iphdr iph;
441 int unicast;
442 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
443
444 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
445 if (pptr == NULL) {
446 ip_vs_service_put(svc);
447 return NF_DROP;
448 }
449
450 #ifdef CONFIG_IP_VS_IPV6
451 if (svc->af == AF_INET6)
452 unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
453 else
454 #endif
455 unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
456
 457 	/* if it is a fwmark-based service, the cache_bypass sysctl is set
 458 	   and the destination is a non-local unicast address, then create
 459 	   a cache_bypass connection entry */
460 if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
461 int ret, cs;
462 struct ip_vs_conn *cp;
463 union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
464
465 ip_vs_service_put(svc);
466
467 /* create a new connection entry */
468 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
469 cp = ip_vs_conn_new(svc->af, iph.protocol,
470 &iph.saddr, pptr[0],
471 &iph.daddr, pptr[1],
472 &daddr, 0,
473 IP_VS_CONN_F_BYPASS,
474 NULL);
475 if (cp == NULL)
476 return NF_DROP;
477
478 /* statistics */
479 ip_vs_in_stats(cp, skb);
480
481 /* set state */
482 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
483
484 /* transmit the first SYN packet */
485 ret = cp->packet_xmit(skb, cp, pp);
486 /* do not touch skb anymore */
487
488 atomic_inc(&cp->in_pkts);
489 ip_vs_conn_put(cp);
490 return ret;
491 }
492
 493 	/*
 494 	 * When a virtual ftp service is present, packets destined
 495 	 * for other services on the VIP may get here (except services
 496 	 * listed in the ipvs table); pass them along, because it is
 497 	 * not IPVS's job to decide whether to drop them.
 498 	 */
499 if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
500 ip_vs_service_put(svc);
501 return NF_ACCEPT;
502 }
503
504 ip_vs_service_put(svc);
505
 506 	/*
 507 	 * Notify the client that the destination is unreachable, and
 508 	 * release the socket buffer.
 509 	 * Since we are at the IP layer, no TCP socket actually exists and
 510 	 * a TCP RST cannot be sent; instead, ICMP_PORT_UNREACH is sent
 511 	 * here for both TCP and UDP. --WZ
 512 	 */
513 #ifdef CONFIG_IP_VS_IPV6
514 if (svc->af == AF_INET6)
515 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0,
516 skb->dev);
517 else
518 #endif
519 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
520
521 return NF_DROP;
522 }
523
524
 525 /*
 526  * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
 527  * chain, and is used for VS/NAT.
 528  * It detects packets for VS/NAT connections and lets them exit the
 529  * chain immediately, which prevents iptable_nat from mangling
 530  * packets that belong to VS/NAT.
 531  */
532 static unsigned int ip_vs_post_routing(unsigned int hooknum,
533 struct sk_buff *skb,
534 const struct net_device *in,
535 const struct net_device *out,
536 int (*okfn)(struct sk_buff *))
537 {
538 if (!skb->ipvs_property)
539 return NF_ACCEPT;
540 /* The packet was sent from IPVS, exit this chain */
541 return NF_STOP;
542 }
543
544 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
545 {
546 return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
547 }
548
549 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
550 {
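	/*
	 * ip_defrag() returns 0 once the skb holds the complete datagram;
	 * the IP header checksum is then recomputed because reassembly
	 * rewrites tot_len and the fragment offset field.
	 */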
551 int err = ip_defrag(skb, user);
552
553 if (!err)
554 ip_send_check(ip_hdr(skb));
555
556 return err;
557 }
558
559 #ifdef CONFIG_IP_VS_IPV6
560 static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
561 {
562 /* TODO IPv6: Find out what to do here for IPv6 */
563 return 0;
564 }
565 #endif
566
567 /*
568 * Packet has been made sufficiently writable in caller
569 * - inout: 1=in->out, 0=out->in
570 */
571 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
572 struct ip_vs_conn *cp, int inout)
573 {
574 struct iphdr *iph = ip_hdr(skb);
575 unsigned int icmp_offset = iph->ihl*4;
576 struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) +
577 icmp_offset);
578 struct iphdr *ciph = (struct iphdr *)(icmph + 1);
579
580 if (inout) {
581 iph->saddr = cp->vaddr.ip;
582 ip_send_check(iph);
583 ciph->daddr = cp->vaddr.ip;
584 ip_send_check(ciph);
585 } else {
586 iph->daddr = cp->daddr.ip;
587 ip_send_check(iph);
588 ciph->saddr = cp->daddr.ip;
589 ip_send_check(ciph);
590 }
591
592 /* the TCP/UDP port */
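	/*
	 * Only the embedded packet's ports are rewritten: ports[0] is its
	 * source port, ports[1] its destination port. An ICMP error is only
	 * required to carry the first 8 bytes of the original transport
	 * header, which is exactly enough for these two fields.
	 */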
593 if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
594 __be16 *ports = (void *)ciph + ciph->ihl*4;
595
596 if (inout)
597 ports[1] = cp->vport;
598 else
599 ports[0] = cp->dport;
600 }
601
602 /* And finally the ICMP checksum */
603 icmph->checksum = 0;
604 icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
605 skb->ip_summed = CHECKSUM_UNNECESSARY;
606
607 if (inout)
608 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
609 "Forwarding altered outgoing ICMP");
610 else
611 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
612 "Forwarding altered incoming ICMP");
613 }
614
615 #ifdef CONFIG_IP_VS_IPV6
616 void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
617 struct ip_vs_conn *cp, int inout)
618 {
619 struct ipv6hdr *iph = ipv6_hdr(skb);
620 unsigned int icmp_offset = sizeof(struct ipv6hdr);
621 struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) +
622 icmp_offset);
623 struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1);
624
625 if (inout) {
626 iph->saddr = cp->vaddr.in6;
627 ciph->daddr = cp->vaddr.in6;
628 } else {
629 iph->daddr = cp->daddr.in6;
630 ciph->saddr = cp->daddr.in6;
631 }
632
633 /* the TCP/UDP port */
634 if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) {
635 __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
636
637 if (inout)
638 ports[1] = cp->vport;
639 else
640 ports[0] = cp->dport;
641 }
642
643 /* And finally the ICMP checksum */
644 icmph->icmp6_cksum = 0;
645 /* TODO IPv6: is this correct for ICMPv6? */
646 ip_vs_checksum_complete(skb, icmp_offset);
647 skb->ip_summed = CHECKSUM_UNNECESSARY;
648
649 if (inout)
650 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
651 "Forwarding altered outgoing ICMPv6");
652 else
653 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
654 "Forwarding altered incoming ICMPv6");
655 }
656 #endif
657
658 /* Handle relevant response ICMP messages - forward to the right
659 * destination host. Used for NAT and local client.
660 */
661 static int handle_response_icmp(int af, struct sk_buff *skb,
662 union nf_inet_addr *snet,
663 __u8 protocol, struct ip_vs_conn *cp,
664 struct ip_vs_protocol *pp,
665 unsigned int offset, unsigned int ihl)
666 {
667 unsigned int verdict = NF_DROP;
668
669 if (IP_VS_FWD_METHOD(cp) != 0) {
670 pr_err("shouldn't reach here, because the box is on the "
671 "half connection in the tun/dr module.\n");
672 }
673
674 /* Ensure the checksum is correct */
675 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
676 /* Failed checksum! */
677 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
678 IP_VS_DBG_ADDR(af, snet));
679 goto out;
680 }
681
682 if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol)
683 offset += 2 * sizeof(__u16);
684 if (!skb_make_writable(skb, offset))
685 goto out;
686
687 #ifdef CONFIG_IP_VS_IPV6
688 if (af == AF_INET6)
689 ip_vs_nat_icmp_v6(skb, pp, cp, 1);
690 else
691 #endif
692 ip_vs_nat_icmp(skb, pp, cp, 1);
693
694 /* do the statistics and put it back */
695 ip_vs_out_stats(cp, skb);
696
697 skb->ipvs_property = 1;
698 verdict = NF_ACCEPT;
699
700 out:
701 __ip_vs_conn_put(cp);
702
703 return verdict;
704 }
705
706 /*
707 * Handle ICMP messages in the inside-to-outside direction (outgoing).
708 * Find any that might be relevant, check against existing connections.
709 * Currently handles error types - unreachable, quench, ttl exceeded.
710 */
711 static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
712 {
713 struct iphdr *iph;
714 struct icmphdr _icmph, *ic;
715 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
716 struct ip_vs_iphdr ciph;
717 struct ip_vs_conn *cp;
718 struct ip_vs_protocol *pp;
719 unsigned int offset, ihl;
720 union nf_inet_addr snet;
721
722 *related = 1;
723
724 /* reassemble IP fragments */
725 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
726 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
727 return NF_STOLEN;
728 }
729
730 iph = ip_hdr(skb);
731 offset = ihl = iph->ihl * 4;
732 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
733 if (ic == NULL)
734 return NF_DROP;
735
736 IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
737 ic->type, ntohs(icmp_id(ic)),
738 &iph->saddr, &iph->daddr);
739
740 /*
741 * Work through seeing if this is for us.
742 * These checks are supposed to be in an order that means easy
743 * things are checked first to speed up processing.... however
744 * this means that some packets will manage to get a long way
745 * down this stack and then be rejected, but that's life.
746 */
747 if ((ic->type != ICMP_DEST_UNREACH) &&
748 (ic->type != ICMP_SOURCE_QUENCH) &&
749 (ic->type != ICMP_TIME_EXCEEDED)) {
750 *related = 0;
751 return NF_ACCEPT;
752 }
753
754 /* Now find the contained IP header */
755 offset += sizeof(_icmph);
756 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
757 if (cih == NULL)
758 return NF_ACCEPT; /* The packet looks wrong, ignore */
759
760 pp = ip_vs_proto_get(cih->protocol);
761 if (!pp)
762 return NF_ACCEPT;
763
764 /* Is the embedded protocol header present? */
765 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
766 pp->dont_defrag))
767 return NF_ACCEPT;
768
769 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
770
771 offset += cih->ihl * 4;
772
773 ip_vs_fill_iphdr(AF_INET, cih, &ciph);
774 /* The embedded headers contain source and dest in reverse order */
775 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
776 if (!cp)
777 return NF_ACCEPT;
778
779 snet.ip = iph->saddr;
780 return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
781 pp, offset, ihl);
782 }
783
784 #ifdef CONFIG_IP_VS_IPV6
785 static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
786 {
787 struct ipv6hdr *iph;
788 struct icmp6hdr _icmph, *ic;
789 struct ipv6hdr _ciph, *cih; /* The ip header contained
790 within the ICMP */
791 struct ip_vs_iphdr ciph;
792 struct ip_vs_conn *cp;
793 struct ip_vs_protocol *pp;
794 unsigned int offset;
795 union nf_inet_addr snet;
796
797 *related = 1;
798
799 /* reassemble IP fragments */
800 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
801 if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
802 return NF_STOLEN;
803 }
804
805 iph = ipv6_hdr(skb);
806 offset = sizeof(struct ipv6hdr);
807 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
808 if (ic == NULL)
809 return NF_DROP;
810
811 IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
812 ic->icmp6_type, ntohs(icmpv6_id(ic)),
813 &iph->saddr, &iph->daddr);
814
815 /*
816 * Work through seeing if this is for us.
817 * These checks are supposed to be in an order that means easy
818 * things are checked first to speed up processing.... however
819 * this means that some packets will manage to get a long way
820 * down this stack and then be rejected, but that's life.
821 */
822 if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
823 (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
824 (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
825 *related = 0;
826 return NF_ACCEPT;
827 }
828
829 /* Now find the contained IP header */
830 offset += sizeof(_icmph);
831 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
832 if (cih == NULL)
833 return NF_ACCEPT; /* The packet looks wrong, ignore */
834
835 pp = ip_vs_proto_get(cih->nexthdr);
836 if (!pp)
837 return NF_ACCEPT;
838
839 /* Is the embedded protocol header present? */
 840 	/* TODO: we don't support fragmentation at the moment anyway */
841 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
842 return NF_ACCEPT;
843
844 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
845
846 offset += sizeof(struct ipv6hdr);
847
848 ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
849 /* The embedded headers contain source and dest in reverse order */
850 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
851 if (!cp)
852 return NF_ACCEPT;
853
854 ipv6_addr_copy(&snet.in6, &iph->saddr);
855 return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
856 pp, offset, sizeof(struct ipv6hdr));
857 }
858 #endif
859
860 static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
861 {
862 struct tcphdr _tcph, *th;
863
864 th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
865 if (th == NULL)
866 return 0;
867 return th->rst;
868 }
869
870 /* Handle response packets: rewrite addresses and send away...
871 * Used for NAT and local client.
872 */
873 static unsigned int
874 handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
875 struct ip_vs_conn *cp, int ihl)
876 {
877 IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
878
879 if (!skb_make_writable(skb, ihl))
880 goto drop;
881
882 /* mangle the packet */
883 if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
884 goto drop;
885
886 #ifdef CONFIG_IP_VS_IPV6
887 if (af == AF_INET6)
888 ipv6_hdr(skb)->saddr = cp->vaddr.in6;
889 else
890 #endif
891 {
892 ip_hdr(skb)->saddr = cp->vaddr.ip;
893 ip_send_check(ip_hdr(skb));
894 }
895
896 /* For policy routing, packets originating from this
897 * machine itself may be routed differently to packets
898 * passing through. We want this packet to be routed as
899 * if it came from this machine itself. So re-compute
900 * the routing information.
901 */
902 #ifdef CONFIG_IP_VS_IPV6
903 if (af == AF_INET6) {
904 if (ip6_route_me_harder(skb) != 0)
905 goto drop;
906 } else
907 #endif
908 if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
909 goto drop;
910
911 IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
912
913 ip_vs_out_stats(cp, skb);
914 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
915 ip_vs_conn_put(cp);
916
917 skb->ipvs_property = 1;
918
919 LeaveFunction(11);
920 return NF_ACCEPT;
921
922 drop:
923 ip_vs_conn_put(cp);
924 kfree_skb(skb);
925 return NF_STOLEN;
926 }
927
928 /*
929 * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
 930  *	Check if the outgoing packet belongs to an established ip_vs_conn.
931 */
932 static unsigned int
933 ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
934 const struct net_device *in, const struct net_device *out,
935 int (*okfn)(struct sk_buff *))
936 {
937 struct ip_vs_iphdr iph;
938 struct ip_vs_protocol *pp;
939 struct ip_vs_conn *cp;
940 int af;
941
942 EnterFunction(11);
943
944 af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
945
946 if (skb->ipvs_property)
947 return NF_ACCEPT;
948
949 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
950 #ifdef CONFIG_IP_VS_IPV6
951 if (af == AF_INET6) {
952 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
953 int related, verdict = ip_vs_out_icmp_v6(skb, &related);
954
955 if (related)
956 return verdict;
957 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
958 }
959 } else
960 #endif
961 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
962 int related, verdict = ip_vs_out_icmp(skb, &related);
963
964 if (related)
965 return verdict;
966 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
967 }
968
969 pp = ip_vs_proto_get(iph.protocol);
970 if (unlikely(!pp))
971 return NF_ACCEPT;
972
973 /* reassemble IP fragments */
974 #ifdef CONFIG_IP_VS_IPV6
975 if (af == AF_INET6) {
976 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
977 int related, verdict = ip_vs_out_icmp_v6(skb, &related);
978
979 if (related)
980 return verdict;
981
982 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
983 }
984 } else
985 #endif
986 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
987 !pp->dont_defrag)) {
988 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
989 return NF_STOLEN;
990
991 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
992 }
993
994 /*
995 * Check if the packet belongs to an existing entry
996 */
997 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
998
999 if (unlikely(!cp)) {
1000 if (sysctl_ip_vs_nat_icmp_send &&
1001 (pp->protocol == IPPROTO_TCP ||
1002 pp->protocol == IPPROTO_UDP)) {
1003 __be16 _ports[2], *pptr;
1004
1005 pptr = skb_header_pointer(skb, iph.len,
1006 sizeof(_ports), _ports);
1007 if (pptr == NULL)
1008 return NF_ACCEPT; /* Not for me */
1009 if (ip_vs_lookup_real_service(af, iph.protocol,
1010 &iph.saddr,
1011 pptr[0])) {
1012 				/*
1013 				 * Notify the real server that there is no
1014 				 * existing entry, unless the packet is a
1015 				 * TCP RST (then no notification is needed).
1016 				 */
1017 if (iph.protocol != IPPROTO_TCP
1018 || !is_tcp_reset(skb, iph.len)) {
1019 #ifdef CONFIG_IP_VS_IPV6
1020 if (af == AF_INET6)
1021 icmpv6_send(skb,
1022 ICMPV6_DEST_UNREACH,
1023 ICMPV6_PORT_UNREACH,
1024 0, skb->dev);
1025 else
1026 #endif
1027 icmp_send(skb,
1028 ICMP_DEST_UNREACH,
1029 ICMP_PORT_UNREACH, 0);
1030 return NF_DROP;
1031 }
1032 }
1033 }
1034 IP_VS_DBG_PKT(12, pp, skb, 0,
1035 "packet continues traversal as normal");
1036 return NF_ACCEPT;
1037 }
1038
1039 return handle_response(af, skb, pp, cp, iph.len);
1040 }
1041
1042
1043 /*
1044 * Handle ICMP messages in the outside-to-inside direction (incoming).
1045 * Find any that might be relevant, check against existing connections,
1046 * forward to the right destination host if relevant.
1047 * Currently handles error types - unreachable, quench, ttl exceeded.
1048 */
1049 static int
1050 ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1051 {
1052 struct iphdr *iph;
1053 struct icmphdr _icmph, *ic;
1054 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
1055 struct ip_vs_iphdr ciph;
1056 struct ip_vs_conn *cp;
1057 struct ip_vs_protocol *pp;
1058 unsigned int offset, ihl, verdict;
1059 union nf_inet_addr snet;
1060
1061 *related = 1;
1062
1063 /* reassemble IP fragments */
1064 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1065 if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
1066 IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
1067 return NF_STOLEN;
1068 }
1069
1070 iph = ip_hdr(skb);
1071 offset = ihl = iph->ihl * 4;
1072 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1073 if (ic == NULL)
1074 return NF_DROP;
1075
1076 IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
1077 ic->type, ntohs(icmp_id(ic)),
1078 &iph->saddr, &iph->daddr);
1079
1080 /*
1081 * Work through seeing if this is for us.
1082 * These checks are supposed to be in an order that means easy
1083 * things are checked first to speed up processing.... however
1084 * this means that some packets will manage to get a long way
1085 * down this stack and then be rejected, but that's life.
1086 */
1087 if ((ic->type != ICMP_DEST_UNREACH) &&
1088 (ic->type != ICMP_SOURCE_QUENCH) &&
1089 (ic->type != ICMP_TIME_EXCEEDED)) {
1090 *related = 0;
1091 return NF_ACCEPT;
1092 }
1093
1094 /* Now find the contained IP header */
1095 offset += sizeof(_icmph);
1096 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1097 if (cih == NULL)
1098 return NF_ACCEPT; /* The packet looks wrong, ignore */
1099
1100 pp = ip_vs_proto_get(cih->protocol);
1101 if (!pp)
1102 return NF_ACCEPT;
1103
1104 /* Is the embedded protocol header present? */
1105 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1106 pp->dont_defrag))
1107 return NF_ACCEPT;
1108
1109 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
1110
1111 offset += cih->ihl * 4;
1112
1113 ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1114 /* The embedded headers contain source and dest in reverse order */
1115 cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
1116 if (!cp) {
1117 /* The packet could also belong to a local client */
1118 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
1119 if (cp) {
1120 snet.ip = iph->saddr;
1121 return handle_response_icmp(AF_INET, skb, &snet,
1122 cih->protocol, cp, pp,
1123 offset, ihl);
1124 }
1125 return NF_ACCEPT;
1126 }
1127
1128 verdict = NF_DROP;
1129
1130 /* Ensure the checksum is correct */
1131 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1132 /* Failed checksum! */
1133 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
1134 &iph->saddr);
1135 goto out;
1136 }
1137
1138 /* do the statistics and put it back */
1139 ip_vs_in_stats(cp, skb);
1140 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1141 offset += 2 * sizeof(__u16);
1142 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
1143 /* do not touch skb anymore */
1144
1145 out:
1146 __ip_vs_conn_put(cp);
1147
1148 return verdict;
1149 }
1150
1151 #ifdef CONFIG_IP_VS_IPV6
1152 static int
1153 ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1154 {
1155 struct ipv6hdr *iph;
1156 struct icmp6hdr _icmph, *ic;
1157 struct ipv6hdr _ciph, *cih; /* The ip header contained
1158 within the ICMP */
1159 struct ip_vs_iphdr ciph;
1160 struct ip_vs_conn *cp;
1161 struct ip_vs_protocol *pp;
1162 unsigned int offset, verdict;
1163 union nf_inet_addr snet;
1164
1165 *related = 1;
1166
1167 /* reassemble IP fragments */
1168 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1169 if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
1170 IP_DEFRAG_VS_IN :
1171 IP_DEFRAG_VS_FWD))
1172 return NF_STOLEN;
1173 }
1174
1175 iph = ipv6_hdr(skb);
1176 offset = sizeof(struct ipv6hdr);
1177 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1178 if (ic == NULL)
1179 return NF_DROP;
1180
1181 IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n",
1182 ic->icmp6_type, ntohs(icmpv6_id(ic)),
1183 &iph->saddr, &iph->daddr);
1184
1185 /*
1186 * Work through seeing if this is for us.
1187 * These checks are supposed to be in an order that means easy
1188 * things are checked first to speed up processing.... however
1189 * this means that some packets will manage to get a long way
1190 * down this stack and then be rejected, but that's life.
1191 */
1192 if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
1193 (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
1194 (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
1195 *related = 0;
1196 return NF_ACCEPT;
1197 }
1198
1199 /* Now find the contained IP header */
1200 offset += sizeof(_icmph);
1201 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1202 if (cih == NULL)
1203 return NF_ACCEPT; /* The packet looks wrong, ignore */
1204
1205 pp = ip_vs_proto_get(cih->nexthdr);
1206 if (!pp)
1207 return NF_ACCEPT;
1208
1209 /* Is the embedded protocol header present? */
1210 	/* TODO: we don't support fragmentation at the moment anyway */
1211 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
1212 return NF_ACCEPT;
1213
1214 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
1215
1216 offset += sizeof(struct ipv6hdr);
1217
1218 ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
1219 /* The embedded headers contain source and dest in reverse order */
1220 cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
1221 if (!cp) {
1222 /* The packet could also belong to a local client */
1223 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
1224 if (cp) {
1225 ipv6_addr_copy(&snet.in6, &iph->saddr);
1226 return handle_response_icmp(AF_INET6, skb, &snet,
1227 cih->nexthdr,
1228 cp, pp, offset,
1229 sizeof(struct ipv6hdr));
1230 }
1231 return NF_ACCEPT;
1232 }
1233
1234 verdict = NF_DROP;
1235
1236 /* do the statistics and put it back */
1237 ip_vs_in_stats(cp, skb);
1238 if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr)
1239 offset += 2 * sizeof(__u16);
1240 verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
1241 /* do not touch skb anymore */
1242
1243 __ip_vs_conn_put(cp);
1244
1245 return verdict;
1246 }
1247 #endif
1248
1249
1250 /*
1251 * Check if it's for virtual services, look it up,
1252 * and send it on its way...
1253 */
1254 static unsigned int
1255 ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
1256 const struct net_device *in, const struct net_device *out,
1257 int (*okfn)(struct sk_buff *))
1258 {
1259 struct ip_vs_iphdr iph;
1260 struct ip_vs_protocol *pp;
1261 struct ip_vs_conn *cp;
1262 int ret, restart, af, pkts;
1263
1264 af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
1265
1266 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1267
1268 /*
1269 * Big tappo: only PACKET_HOST, including loopback for local client
1270 * Don't handle local packets on IPv6 for now
1271 */
1272 if (unlikely(skb->pkt_type != PACKET_HOST)) {
1273 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
1274 skb->pkt_type,
1275 iph.protocol,
1276 IP_VS_DBG_ADDR(af, &iph.daddr));
1277 return NF_ACCEPT;
1278 }
1279
1280 #ifdef CONFIG_IP_VS_IPV6
1281 if (af == AF_INET6) {
1282 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1283 int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
1284
1285 if (related)
1286 return verdict;
1287 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1288 }
1289 } else
1290 #endif
1291 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1292 int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
1293
1294 if (related)
1295 return verdict;
1296 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1297 }
1298
1299 /* Protocol supported? */
1300 pp = ip_vs_proto_get(iph.protocol);
1301 if (unlikely(!pp))
1302 return NF_ACCEPT;
1303
1304 /*
1305 * Check if the packet belongs to an existing connection entry
1306 */
1307 cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
1308
1309 if (unlikely(!cp)) {
1310 int v;
1311
1312 /* For local client packets, it could be a response */
1313 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1314 if (cp)
1315 return handle_response(af, skb, pp, cp, iph.len);
1316
1317 if (!pp->conn_schedule(af, skb, pp, &v, &cp))
1318 return v;
1319 }
1320
1321 if (unlikely(!cp)) {
1322 /* sorry, all this trouble for a no-hit :) */
1323 IP_VS_DBG_PKT(12, pp, skb, 0,
1324 "packet continues traversal as normal");
1325 return NF_ACCEPT;
1326 }
1327
1328 IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
1329
1330 /* Check the server status */
1331 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1332 /* the destination server is not available */
1333
1334 if (sysctl_ip_vs_expire_nodest_conn) {
1335 /* try to expire the connection immediately */
1336 ip_vs_conn_expire_now(cp);
1337 }
1338 /* don't restart its timer, and silently
1339 drop the packet. */
1340 __ip_vs_conn_put(cp);
1341 return NF_DROP;
1342 }
1343
1344 ip_vs_in_stats(cp, skb);
1345 restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
1346 if (cp->packet_xmit)
1347 ret = cp->packet_xmit(skb, cp, pp);
1348 /* do not touch skb anymore */
1349 else {
1350 IP_VS_DBG_RL("warning: packet_xmit is null");
1351 ret = NF_ACCEPT;
1352 }
1353
1354 	/* Increase its packet counter and check whether it needs
1355 	 * to be synchronized
1356 	 *
1357 	 * Sync the connection if it is about to close, to
1358 	 * encourage the standby servers to update the connection's timeout
1359 	 */
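	/*
	 * sysctl_ip_vs_sync_threshold[0] is the threshold and [1] the period:
	 * a sync is triggered whenever the packet counter is congruent to the
	 * threshold modulo the period (with the typical defaults of 3 and 50,
	 * that is packet 3, 53, 103, ...). TCP connections are additionally
	 * synced when they transition into a closing state.
	 */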
1360 pkts = atomic_add_return(1, &cp->in_pkts);
1361 if (af == AF_INET &&
1362 (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1363 (((cp->protocol != IPPROTO_TCP ||
1364 cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1365 (pkts % sysctl_ip_vs_sync_threshold[1]
1366 == sysctl_ip_vs_sync_threshold[0])) ||
1367 ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
1368 ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
1369 (cp->state == IP_VS_TCP_S_CLOSE) ||
1370 (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
1371 (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
1372 ip_vs_sync_conn(cp);
1373 cp->old_state = cp->state;
1374
1375 ip_vs_conn_put(cp);
1376 return ret;
1377 }
1378
1379
1380 /*
1381  *	It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1382  *	packets related to connections destined for 0.0.0.0/0.
1383  *	When a fwmark-based virtual service is used, such as a transparent
1384  *	cache cluster, TCP packets can be marked and routed to ip_vs_in,
1385  *	but ICMP destined for 0.0.0.0/0 cannot be easily marked and
1386  *	sent to ip_vs_in_icmp. So, catch it at the NF_INET_FORWARD chain
1387  *	and send it to ip_vs_in_icmp.
1388  */
1389 static unsigned int
1390 ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1391 const struct net_device *in, const struct net_device *out,
1392 int (*okfn)(struct sk_buff *))
1393 {
1394 int r;
1395
1396 if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1397 return NF_ACCEPT;
1398
1399 return ip_vs_in_icmp(skb, &r, hooknum);
1400 }
1401
1402 #ifdef CONFIG_IP_VS_IPV6
1403 static unsigned int
1404 ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1405 const struct net_device *in, const struct net_device *out,
1406 int (*okfn)(struct sk_buff *))
1407 {
1408 int r;
1409
1410 if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
1411 return NF_ACCEPT;
1412
1413 return ip_vs_in_icmp_v6(skb, &r, hooknum);
1414 }
1415 #endif
1416
1417
1418 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1419 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1420 * or VS/NAT(change destination), so that filtering rules can be
1421 * applied to IPVS. */
1422 {
1423 .hook = ip_vs_in,
1424 .owner = THIS_MODULE,
1425 .pf = PF_INET,
1426 .hooknum = NF_INET_LOCAL_IN,
1427 .priority = 100,
1428 },
1429 /* After packet filtering, change source only for VS/NAT */
1430 {
1431 .hook = ip_vs_out,
1432 .owner = THIS_MODULE,
1433 .pf = PF_INET,
1434 .hooknum = NF_INET_FORWARD,
1435 .priority = 100,
1436 },
1437 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1438 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
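	/* Priority 99 makes this hook run before ip_vs_out (priority 100)
	 * on the same FORWARD chain. */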
1439 {
1440 .hook = ip_vs_forward_icmp,
1441 .owner = THIS_MODULE,
1442 .pf = PF_INET,
1443 .hooknum = NF_INET_FORWARD,
1444 .priority = 99,
1445 },
1446 /* Before the netfilter connection tracking, exit from POST_ROUTING */
1447 {
1448 .hook = ip_vs_post_routing,
1449 .owner = THIS_MODULE,
1450 .pf = PF_INET,
1451 .hooknum = NF_INET_POST_ROUTING,
1452 .priority = NF_IP_PRI_NAT_SRC-1,
1453 },
1454 #ifdef CONFIG_IP_VS_IPV6
1455 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1456 * or VS/NAT(change destination), so that filtering rules can be
1457 * applied to IPVS. */
1458 {
1459 .hook = ip_vs_in,
1460 .owner = THIS_MODULE,
1461 .pf = PF_INET6,
1462 .hooknum = NF_INET_LOCAL_IN,
1463 .priority = 100,
1464 },
1465 /* After packet filtering, change source only for VS/NAT */
1466 {
1467 .hook = ip_vs_out,
1468 .owner = THIS_MODULE,
1469 .pf = PF_INET6,
1470 .hooknum = NF_INET_FORWARD,
1471 .priority = 100,
1472 },
1473 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1474 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1475 {
1476 .hook = ip_vs_forward_icmp_v6,
1477 .owner = THIS_MODULE,
1478 .pf = PF_INET6,
1479 .hooknum = NF_INET_FORWARD,
1480 .priority = 99,
1481 },
1482 /* Before the netfilter connection tracking, exit from POST_ROUTING */
1483 {
1484 .hook = ip_vs_post_routing,
1485 .owner = THIS_MODULE,
1486 .pf = PF_INET6,
1487 .hooknum = NF_INET_POST_ROUTING,
1488 .priority = NF_IP6_PRI_NAT_SRC-1,
1489 },
1490 #endif
1491 };
1492
1493
1494 /*
1495 * Initialize IP Virtual Server
1496 */
1497 static int __init ip_vs_init(void)
1498 {
1499 int ret;
1500
1501 ip_vs_estimator_init();
1502
1503 ret = ip_vs_control_init();
1504 if (ret < 0) {
1505 pr_err("can't setup control.\n");
1506 goto cleanup_estimator;
1507 }
1508
1509 ip_vs_protocol_init();
1510
1511 ret = ip_vs_app_init();
1512 if (ret < 0) {
1513 pr_err("can't setup application helper.\n");
1514 goto cleanup_protocol;
1515 }
1516
1517 ret = ip_vs_conn_init();
1518 if (ret < 0) {
1519 pr_err("can't setup connection table.\n");
1520 goto cleanup_app;
1521 }
1522
1523 ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1524 if (ret < 0) {
1525 pr_err("can't register hooks.\n");
1526 goto cleanup_conn;
1527 }
1528
1529 pr_info("ipvs loaded.\n");
1530 return ret;
1531
1532 cleanup_conn:
1533 ip_vs_conn_cleanup();
1534 cleanup_app:
1535 ip_vs_app_cleanup();
1536 cleanup_protocol:
1537 ip_vs_protocol_cleanup();
1538 ip_vs_control_cleanup();
1539 cleanup_estimator:
1540 ip_vs_estimator_cleanup();
1541 return ret;
1542 }
1543
1544 static void __exit ip_vs_cleanup(void)
1545 {
1546 nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1547 ip_vs_conn_cleanup();
1548 ip_vs_app_cleanup();
1549 ip_vs_protocol_cleanup();
1550 ip_vs_control_cleanup();
1551 ip_vs_estimator_cleanup();
1552 pr_info("ipvs unloaded.\n");
1553 }
1554
1555 module_init(ip_vs_init);
1556 module_exit(ip_vs_cleanup);
1557 MODULE_LICENSE("GPL");