IPVS: skb defrag in L7 helpers
[deliverable/linux.git] / net / netfilter / ipvs / ip_vs_core.c
CommitLineData
1da177e4
LT
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
1da177e4
LT
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 * Julian Anastasov <ja@ssi.bg>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19 * and others.
20 *
21 * Changes:
22 * Paul `Rusty' Russell properly handle non-linear skbs
6869c4d8 23 * Harald Welte don't use nfcache
1da177e4
LT
24 *
25 */
26
9aada7ac
HE
27#define KMSG_COMPONENT "IPVS"
28#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
29
1da177e4
LT
30#include <linux/module.h>
31#include <linux/kernel.h>
32#include <linux/ip.h>
33#include <linux/tcp.h>
2906f66a 34#include <linux/sctp.h>
1da177e4 35#include <linux/icmp.h>
5a0e3ad6 36#include <linux/slab.h>
1da177e4
LT
37
38#include <net/ip.h>
39#include <net/tcp.h>
40#include <net/udp.h>
41#include <net/icmp.h> /* for icmp_send */
42#include <net/route.h>
2c70b519 43#include <net/ip6_checksum.h>
1da177e4
LT
44
45#include <linux/netfilter.h>
46#include <linux/netfilter_ipv4.h>
47
2a3b791e
JV
48#ifdef CONFIG_IP_VS_IPV6
49#include <net/ipv6.h>
50#include <linux/netfilter_ipv6.h>
489fdeda 51#include <net/ip6_route.h>
2a3b791e
JV
52#endif
53
1da177e4
LT
54#include <net/ip_vs.h>
55
56
57EXPORT_SYMBOL(register_ip_vs_scheduler);
58EXPORT_SYMBOL(unregister_ip_vs_scheduler);
1da177e4
LT
59EXPORT_SYMBOL(ip_vs_proto_name);
60EXPORT_SYMBOL(ip_vs_conn_new);
61EXPORT_SYMBOL(ip_vs_conn_in_get);
62EXPORT_SYMBOL(ip_vs_conn_out_get);
63#ifdef CONFIG_IP_VS_PROTO_TCP
64EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
65#endif
66EXPORT_SYMBOL(ip_vs_conn_put);
67#ifdef CONFIG_IP_VS_DEBUG
68EXPORT_SYMBOL(ip_vs_get_debug_level);
69#endif
1da177e4
LT
70
71
/*
 * ID used in ICMP lookups.
 * Both macros take a pointer to the ICMP/ICMPv6 header.  The argument
 * is parenthesized so that any pointer expression (e.g. "p + 1") can be
 * passed without precedence surprises.
 */
#define icmp_id(icmph)          (((icmph)->un).echo.id)
#define icmpv6_id(icmph)        ((icmph)->icmp6_dataun.u_echo.identifier)
1da177e4
LT
75
/*
 * Return a printable name for an IP protocol number.
 *
 * Known protocols map to string literals.  Any other value is formatted
 * as "IP_<number>" into a static buffer; that result is overwritten by
 * the next call that takes the default path, so it must not be kept
 * across calls or shared between concurrent callers.
 */
const char *ip_vs_proto_name(unsigned proto)
{
	static char buf[20];

	switch (proto) {
	case IPPROTO_IP:
		return "IP";
	case IPPROTO_UDP:
		return "UDP";
	case IPPROTO_TCP:
		return "TCP";
	case IPPROTO_SCTP:
		return "SCTP";
	case IPPROTO_ICMP:
		return "ICMP";
#ifdef CONFIG_IP_VS_IPV6
	case IPPROTO_ICMPV6:
		return "ICMPv6";
#endif
	default:
		/* proto is unsigned: print it with %u and bound the write */
		snprintf(buf, sizeof(buf), "IP_%u", proto);
		return buf;
	}
}
100
101void ip_vs_init_hash_table(struct list_head *table, int rows)
102{
103 while (--rows >= 0)
104 INIT_LIST_HEAD(&table[rows]);
105}
106
107static inline void
108ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
109{
110 struct ip_vs_dest *dest = cp->dest;
111 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
112 spin_lock(&dest->stats.lock);
e9c0ce23
SW
113 dest->stats.ustats.inpkts++;
114 dest->stats.ustats.inbytes += skb->len;
1da177e4
LT
115 spin_unlock(&dest->stats.lock);
116
117 spin_lock(&dest->svc->stats.lock);
e9c0ce23
SW
118 dest->svc->stats.ustats.inpkts++;
119 dest->svc->stats.ustats.inbytes += skb->len;
1da177e4
LT
120 spin_unlock(&dest->svc->stats.lock);
121
122 spin_lock(&ip_vs_stats.lock);
e9c0ce23
SW
123 ip_vs_stats.ustats.inpkts++;
124 ip_vs_stats.ustats.inbytes += skb->len;
1da177e4
LT
125 spin_unlock(&ip_vs_stats.lock);
126 }
127}
128
129
130static inline void
131ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
132{
133 struct ip_vs_dest *dest = cp->dest;
134 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
135 spin_lock(&dest->stats.lock);
e9c0ce23
SW
136 dest->stats.ustats.outpkts++;
137 dest->stats.ustats.outbytes += skb->len;
1da177e4
LT
138 spin_unlock(&dest->stats.lock);
139
140 spin_lock(&dest->svc->stats.lock);
e9c0ce23
SW
141 dest->svc->stats.ustats.outpkts++;
142 dest->svc->stats.ustats.outbytes += skb->len;
1da177e4
LT
143 spin_unlock(&dest->svc->stats.lock);
144
145 spin_lock(&ip_vs_stats.lock);
e9c0ce23
SW
146 ip_vs_stats.ustats.outpkts++;
147 ip_vs_stats.ustats.outbytes += skb->len;
1da177e4
LT
148 spin_unlock(&ip_vs_stats.lock);
149 }
150}
151
152
153static inline void
154ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
155{
156 spin_lock(&cp->dest->stats.lock);
e9c0ce23 157 cp->dest->stats.ustats.conns++;
1da177e4
LT
158 spin_unlock(&cp->dest->stats.lock);
159
160 spin_lock(&svc->stats.lock);
e9c0ce23 161 svc->stats.ustats.conns++;
1da177e4
LT
162 spin_unlock(&svc->stats.lock);
163
164 spin_lock(&ip_vs_stats.lock);
e9c0ce23 165 ip_vs_stats.ustats.conns++;
1da177e4
LT
166 spin_unlock(&ip_vs_stats.lock);
167}
168
169
170static inline int
171ip_vs_set_state(struct ip_vs_conn *cp, int direction,
172 const struct sk_buff *skb,
173 struct ip_vs_protocol *pp)
174{
175 if (unlikely(!pp->state_transition))
176 return 0;
177 return pp->state_transition(cp, direction, skb, pp);
178}
179
f71499aa 180static inline void
85999283
SH
181ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
182 struct sk_buff *skb, int protocol,
183 const union nf_inet_addr *caddr, __be16 cport,
184 const union nf_inet_addr *vaddr, __be16 vport,
185 struct ip_vs_conn_param *p)
186{
187 ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
188 p->pe = svc->pe;
189 if (p->pe && p->pe->fill_param)
f71499aa 190 p->pe->fill_param(p, skb);
85999283 191}
1da177e4 192
1da177e4
LT
193/*
194 * IPVS persistent scheduling function
195 * It creates a connection entry according to its template if exists,
196 * or selects a server and creates a connection entry plus a template.
197 * Locking: we are svc user (svc->refcnt), so we hold all dests too
198 * Protocols supported: TCP, UDP
199 */
200static struct ip_vs_conn *
201ip_vs_sched_persist(struct ip_vs_service *svc,
85999283 202 struct sk_buff *skb,
ce144f24 203 __be16 src_port, __be16 dst_port)
1da177e4
LT
204{
205 struct ip_vs_conn *cp = NULL;
28364a59 206 struct ip_vs_iphdr iph;
1da177e4
LT
207 struct ip_vs_dest *dest;
208 struct ip_vs_conn *ct;
5b57a98c 209 __be16 dport = 0; /* destination port to forward */
3575792e 210 unsigned int flags;
f11017ec 211 struct ip_vs_conn_param param;
28364a59
JV
212 union nf_inet_addr snet; /* source network of the client,
213 after masking */
cd17f9ed
JV
214
215 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
1da177e4
LT
216
217 /* Mask saddr with the netmask to adjust template granularity */
cd17f9ed
JV
218#ifdef CONFIG_IP_VS_IPV6
219 if (svc->af == AF_INET6)
220 ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
221 else
222#endif
223 snet.ip = iph.saddr.ip & svc->netmask;
1da177e4 224
cd17f9ed
JV
225 IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
226 "mnet %s\n",
ce144f24
HS
227 IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(src_port),
228 IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(dst_port),
cd17f9ed 229 IP_VS_DBG_ADDR(svc->af, &snet));
1da177e4
LT
230
231 /*
232 * As far as we know, FTP is a very complicated network protocol, and
233 * it uses control connection and data connections. For active FTP,
234 * FTP server initialize data connection to the client, its source port
235 * is often 20. For passive FTP, FTP server tells the clients the port
236 * that it passively listens to, and the client issues the data
237 * connection. In the tunneling or direct routing mode, the load
238 * balancer is on the client-to-server half of connection, the port
239 * number is unknown to the load balancer. So, a conn template like
240 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
241 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
242 * is created for other persistent services.
243 */
5b57a98c 244 {
f11017ec
SH
245 int protocol = iph.protocol;
246 const union nf_inet_addr *vaddr = &iph.daddr;
247 const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
248 __be16 vport = 0;
249
ce144f24 250 if (dst_port == svc->port) {
5b57a98c
SH
251 /* non-FTP template:
252 * <protocol, caddr, 0, vaddr, vport, daddr, dport>
253 * FTP template:
254 * <protocol, caddr, 0, vaddr, 0, daddr, 0>
1da177e4
LT
255 */
256 if (svc->port != FTPPORT)
ce144f24 257 vport = dst_port;
1da177e4 258 } else {
5b57a98c
SH
259 /* Note: persistent fwmark-based services and
260 * persistent port zero service are handled here.
261 * fwmark template:
262 * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
263 * port zero template:
264 * <protocol,caddr,0,vaddr,0,daddr,0>
1da177e4 265 */
28364a59 266 if (svc->fwmark) {
5b57a98c
SH
267 protocol = IPPROTO_IP;
268 vaddr = &fwmark;
269 }
1da177e4 270 }
f71499aa
SH
271 ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
272 vaddr, vport, &param);
1da177e4
LT
273 }
274
5b57a98c 275 /* Check if a template already exists */
f11017ec 276 ct = ip_vs_ct_in_get(&param);
5b57a98c
SH
277 if (!ct || !ip_vs_check_template(ct)) {
278 /* No template found or the dest of the connection
279 * template is not available.
280 */
281 dest = svc->scheduler->schedule(svc, skb);
282 if (!dest) {
283 IP_VS_DBG(1, "p-schedule: no dest found.\n");
85999283 284 kfree(param.pe_data);
5b57a98c
SH
285 return NULL;
286 }
287
ce144f24 288 if (dst_port == svc->port && svc->port != FTPPORT)
5b57a98c
SH
289 dport = dest->port;
290
85999283
SH
291 /* Create a template
292 * This adds param.pe_data to the template,
293 * and thus param.pe_data will be destroyed
294 * when the template expires */
f11017ec 295 ct = ip_vs_conn_new(&param, &dest->addr, dport,
0e051e68 296 IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
85999283
SH
297 if (ct == NULL) {
298 kfree(param.pe_data);
5b57a98c 299 return NULL;
85999283 300 }
5b57a98c
SH
301
302 ct->timeout = svc->timeout;
85999283 303 } else {
5b57a98c
SH
304 /* set destination with the found template */
305 dest = ct->dest;
85999283
SH
306 kfree(param.pe_data);
307 }
5b57a98c 308
ce144f24 309 dport = dst_port;
5b57a98c
SH
310 if (dport == svc->port && dest->port)
311 dport = dest->port;
312
26ec037f
NC
313 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
314 && iph.protocol == IPPROTO_UDP)?
315 IP_VS_CONN_F_ONE_PACKET : 0;
316
1da177e4
LT
317 /*
318 * Create a new connection according to the template
319 */
ce144f24
HS
320 ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, src_port,
321 &iph.daddr, dst_port, &param);
322
0e051e68 323 cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);
1da177e4
LT
324 if (cp == NULL) {
325 ip_vs_conn_put(ct);
326 return NULL;
327 }
328
329 /*
330 * Add its control
331 */
332 ip_vs_control_add(cp, ct);
333 ip_vs_conn_put(ct);
334
335 ip_vs_conn_stats(cp, svc);
336 return cp;
337}
338
339
340/*
341 * IPVS main scheduling function
342 * It selects a server according to the virtual service, and
343 * creates a connection entry.
344 * Protocols supported: TCP, UDP
345 */
346struct ip_vs_conn *
190ecd27
JA
347ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
348 struct ip_vs_protocol *pp, int *ignored)
1da177e4
LT
349{
350 struct ip_vs_conn *cp = NULL;
28364a59 351 struct ip_vs_iphdr iph;
1da177e4 352 struct ip_vs_dest *dest;
3575792e
JA
353 __be16 _ports[2], *pptr;
354 unsigned int flags;
1da177e4 355
190ecd27 356 *ignored = 1;
28364a59
JV
357 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
358 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
1da177e4
LT
359 if (pptr == NULL)
360 return NULL;
361
190ecd27
JA
362 /*
363 * FTPDATA needs this check when using local real server.
364 * Never schedule Active FTPDATA connections from real server.
365 * For LVS-NAT they must be already created. For other methods
366 * with persistence the connection is created on SYN+ACK.
367 */
368 if (pptr[0] == FTPDATA) {
0d79641a
JA
369 IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
370 "Not scheduling FTPDATA");
190ecd27
JA
371 return NULL;
372 }
373
374 /*
375 * Do not schedule replies from local real server. It is risky
376 * for fwmark services but mostly for persistent services.
377 */
378 if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
379 (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) &&
380 (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
0d79641a 381 IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
190ecd27
JA
382 "Not scheduling reply for existing connection");
383 __ip_vs_conn_put(cp);
384 return NULL;
385 }
386
1da177e4
LT
387 /*
388 * Persistent service
389 */
190ecd27
JA
390 if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
391 *ignored = 0;
ce144f24 392 return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1]);
190ecd27 393 }
1da177e4
LT
394
395 /*
396 * Non-persistent service
397 */
398 if (!svc->fwmark && pptr[1] != svc->port) {
399 if (!svc->port)
1e3e238e
HE
400 pr_err("Schedule: port zero only supported "
401 "in persistent services, "
402 "check your ipvs configuration\n");
1da177e4
LT
403 return NULL;
404 }
405
190ecd27
JA
406 *ignored = 0;
407
1da177e4
LT
408 dest = svc->scheduler->schedule(svc, skb);
409 if (dest == NULL) {
410 IP_VS_DBG(1, "Schedule: no dest found.\n");
411 return NULL;
412 }
413
26ec037f
NC
414 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
415 && iph.protocol == IPPROTO_UDP)?
416 IP_VS_CONN_F_ONE_PACKET : 0;
417
1da177e4
LT
418 /*
419 * Create a connection entry.
420 */
f11017ec
SH
421 {
422 struct ip_vs_conn_param p;
423 ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr,
424 pptr[0], &iph.daddr, pptr[1], &p);
425 cp = ip_vs_conn_new(&p, &dest->addr,
426 dest->port ? dest->port : pptr[1],
0e051e68 427 flags, dest, skb->mark);
f11017ec
SH
428 if (!cp)
429 return NULL;
430 }
1da177e4 431
cd17f9ed
JV
432 IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
433 "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
434 ip_vs_fwd_tag(cp),
435 IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
436 IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
437 IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
438 cp->flags, atomic_read(&cp->refcnt));
1da177e4
LT
439
440 ip_vs_conn_stats(cp, svc);
441 return cp;
442}
443
444
445/*
446 * Pass or drop the packet.
447 * Called by ip_vs_in, when the virtual service is available but
448 * no destination is available for a new connection.
449 */
450int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
451 struct ip_vs_protocol *pp)
452{
014d730d 453 __be16 _ports[2], *pptr;
28364a59 454 struct ip_vs_iphdr iph;
2a3b791e
JV
455 int unicast;
456 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
1da177e4 457
28364a59 458 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
1da177e4
LT
459 if (pptr == NULL) {
460 ip_vs_service_put(svc);
461 return NF_DROP;
462 }
463
2a3b791e
JV
464#ifdef CONFIG_IP_VS_IPV6
465 if (svc->af == AF_INET6)
466 unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
467 else
468#endif
469 unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
470
1da177e4 471 /* if it is fwmark-based service, the cache_bypass sysctl is up
2a3b791e 472 and the destination is a non-local unicast, then create
1da177e4 473 a cache_bypass connection entry */
2a3b791e 474 if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
1da177e4
LT
475 int ret, cs;
476 struct ip_vs_conn *cp;
3575792e
JA
477 unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
478 iph.protocol == IPPROTO_UDP)?
479 IP_VS_CONN_F_ONE_PACKET : 0;
dff630dd 480 union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
1da177e4
LT
481
482 ip_vs_service_put(svc);
483
484 /* create a new connection entry */
1e3e238e 485 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
f11017ec
SH
486 {
487 struct ip_vs_conn_param p;
488 ip_vs_conn_fill_param(svc->af, iph.protocol,
489 &iph.saddr, pptr[0],
490 &iph.daddr, pptr[1], &p);
491 cp = ip_vs_conn_new(&p, &daddr, 0,
492 IP_VS_CONN_F_BYPASS | flags,
0e051e68 493 NULL, skb->mark);
f11017ec
SH
494 if (!cp)
495 return NF_DROP;
496 }
1da177e4
LT
497
498 /* statistics */
499 ip_vs_in_stats(cp, skb);
500
501 /* set state */
502 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
503
504 /* transmit the first SYN packet */
505 ret = cp->packet_xmit(skb, cp, pp);
506 /* do not touch skb anymore */
507
508 atomic_inc(&cp->in_pkts);
509 ip_vs_conn_put(cp);
510 return ret;
511 }
512
513 /*
514 * When the virtual ftp service is presented, packets destined
515 * for other services on the VIP may get here (except services
516 * listed in the ipvs table), pass the packets, because it is
517 * not ipvs job to decide to drop the packets.
518 */
519 if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
520 ip_vs_service_put(svc);
521 return NF_ACCEPT;
522 }
523
524 ip_vs_service_put(svc);
525
526 /*
527 * Notify the client that the destination is unreachable, and
528 * release the socket buffer.
529 * Since it is in IP layer, the TCP socket is not actually
530 * created, the TCP RST packet cannot be sent, instead that
531 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
532 */
2a3b791e 533#ifdef CONFIG_IP_VS_IPV6
cb59155f
JA
534 if (svc->af == AF_INET6) {
535 if (!skb->dev) {
536 struct net *net = dev_net(skb_dst(skb)->dev);
537
538 skb->dev = net->loopback_dev;
539 }
3ffe533c 540 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
cb59155f 541 } else
2a3b791e
JV
542#endif
543 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
544
1da177e4
LT
545 return NF_DROP;
546}
547
b1550f22 548__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
1da177e4 549{
d3bc23e7 550 return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
1da177e4
LT
551}
552
1ca5bb54
JA
553static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
554{
555 if (NF_INET_LOCAL_IN == hooknum)
556 return IP_DEFRAG_VS_IN;
557 if (NF_INET_FORWARD == hooknum)
558 return IP_DEFRAG_VS_FWD;
559 return IP_DEFRAG_VS_OUT;
560}
561
776c729e 562static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
1da177e4 563{
776c729e
HX
564 int err = ip_defrag(skb, user);
565
566 if (!err)
eddc9ec5 567 ip_send_check(ip_hdr(skb));
776c729e
HX
568
569 return err;
1da177e4
LT
570}
571
2a3b791e
JV
#ifdef CONFIG_IP_VS_IPV6
/*
 * IPv6 counterpart of ip_vs_gather_frags().  Placeholder only: no
 * reassembly is attempted and 0 is always returned.
 */
static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
{
	/* TODO IPv6: Find out what to do here for IPv6 */
	return 0;
}
#endif
579
1da177e4
LT
580/*
581 * Packet has been made sufficiently writable in caller
582 * - inout: 1=in->out, 0=out->in
583 */
584void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
585 struct ip_vs_conn *cp, int inout)
586{
eddc9ec5 587 struct iphdr *iph = ip_hdr(skb);
1da177e4 588 unsigned int icmp_offset = iph->ihl*4;
d56f90a7
ACM
589 struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) +
590 icmp_offset);
1da177e4
LT
591 struct iphdr *ciph = (struct iphdr *)(icmph + 1);
592
593 if (inout) {
e7ade46a 594 iph->saddr = cp->vaddr.ip;
1da177e4 595 ip_send_check(iph);
e7ade46a 596 ciph->daddr = cp->vaddr.ip;
1da177e4
LT
597 ip_send_check(ciph);
598 } else {
e7ade46a 599 iph->daddr = cp->daddr.ip;
1da177e4 600 ip_send_check(iph);
e7ade46a 601 ciph->saddr = cp->daddr.ip;
1da177e4
LT
602 ip_send_check(ciph);
603 }
604
2906f66a
VMR
605 /* the TCP/UDP/SCTP port */
606 if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
607 IPPROTO_SCTP == ciph->protocol) {
014d730d 608 __be16 *ports = (void *)ciph + ciph->ihl*4;
1da177e4
LT
609
610 if (inout)
611 ports[1] = cp->vport;
612 else
613 ports[0] = cp->dport;
614 }
615
616 /* And finally the ICMP checksum */
617 icmph->checksum = 0;
618 icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
619 skb->ip_summed = CHECKSUM_UNNECESSARY;
620
621 if (inout)
0d79641a 622 IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
1da177e4
LT
623 "Forwarding altered outgoing ICMP");
624 else
0d79641a 625 IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
1da177e4
LT
626 "Forwarding altered incoming ICMP");
627}
628
b3cdd2a7
JV
#ifdef CONFIG_IP_VS_IPV6
/*
 *	IPv6 variant of ip_vs_nat_icmp(): rewrite the addresses (and, for
 *	TCP/UDP/SCTP, one port) of an ICMPv6 packet and of the embedded
 *	IPv6 header.  The ICMPv6 header is taken to start right after a
 *	fixed-size IPv6 header.  The checksum is left for later completion
 *	(CHECKSUM_PARTIAL) with the pseudo-header sum stored in the packet.
 *	- inout: 1=in->out, 0=out->in
 */
void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
		    struct ip_vs_conn *cp, int inout)
{
	struct ipv6hdr *outer = ipv6_hdr(skb);
	unsigned int icmp_offset = sizeof(struct ipv6hdr);
	struct icmp6hdr *icmph;
	struct ipv6hdr *inner;	/* IPv6 header carried inside the ICMPv6 */

	icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset);
	inner = (struct ipv6hdr *)(icmph + 1);

	if (inout) {
		outer->saddr = cp->vaddr.in6;
		inner->daddr = cp->vaddr.in6;
	} else {
		outer->daddr = cp->daddr.in6;
		inner->saddr = cp->daddr.in6;
	}

	/* the TCP/UDP/SCTP port */
	switch (inner->nexthdr) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_SCTP: {
		__be16 *ports = (void *)inner + sizeof(struct ipv6hdr);

		if (inout)
			ports[1] = cp->vport;
		else
			ports[0] = cp->dport;
		break;
	}
	default:
		break;
	}

	/* And finally the ICMP checksum */
	icmph->icmp6_cksum = ~csum_ipv6_magic(&outer->saddr, &outer->daddr,
					      skb->len - icmp_offset,
					      IPPROTO_ICMPV6, 0);
	skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
	skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
	skb->ip_summed = CHECKSUM_PARTIAL;

	if (inout)
		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
			      (void *)inner - (void *)outer,
			      "Forwarding altered outgoing ICMPv6");
	else
		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
			      (void *)inner - (void *)outer,
			      "Forwarding altered incoming ICMPv6");
}
#endif
676
4856c84c
MT
677/* Handle relevant response ICMP messages - forward to the right
678 * destination host. Used for NAT and local client.
679 */
f2428ed5
SH
680static int handle_response_icmp(int af, struct sk_buff *skb,
681 union nf_inet_addr *snet,
682 __u8 protocol, struct ip_vs_conn *cp,
4856c84c
MT
683 struct ip_vs_protocol *pp,
684 unsigned int offset, unsigned int ihl)
685{
686 unsigned int verdict = NF_DROP;
687
688 if (IP_VS_FWD_METHOD(cp) != 0) {
1e3e238e
HE
689 pr_err("shouldn't reach here, because the box is on the "
690 "half connection in the tun/dr module.\n");
4856c84c
MT
691 }
692
693 /* Ensure the checksum is correct */
694 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
695 /* Failed checksum! */
f2428ed5
SH
696 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
697 IP_VS_DBG_ADDR(af, snet));
4856c84c
MT
698 goto out;
699 }
700
2906f66a
VMR
701 if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
702 IPPROTO_SCTP == protocol)
4856c84c
MT
703 offset += 2 * sizeof(__u16);
704 if (!skb_make_writable(skb, offset))
705 goto out;
706
f2428ed5
SH
707#ifdef CONFIG_IP_VS_IPV6
708 if (af == AF_INET6)
709 ip_vs_nat_icmp_v6(skb, pp, cp, 1);
710 else
711#endif
712 ip_vs_nat_icmp(skb, pp, cp, 1);
4856c84c 713
f5a41847
JA
714#ifdef CONFIG_IP_VS_IPV6
715 if (af == AF_INET6) {
716 if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
717 goto out;
718 } else
719#endif
720 if ((sysctl_ip_vs_snat_reroute ||
721 skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
722 ip_route_me_harder(skb, RTN_LOCAL) != 0)
723 goto out;
724
4856c84c
MT
725 /* do the statistics and put it back */
726 ip_vs_out_stats(cp, skb);
727
cf356d69 728 skb->ipvs_property = 1;
f4bc17cd 729 if (!(cp->flags & IP_VS_CONN_F_NFCT))
cf356d69 730 ip_vs_notrack(skb);
f4bc17cd
JA
731 else
732 ip_vs_update_conntrack(skb, cp, 0);
4856c84c
MT
733 verdict = NF_ACCEPT;
734
735out:
736 __ip_vs_conn_put(cp);
737
738 return verdict;
739}
740
1da177e4
LT
741/*
742 * Handle ICMP messages in the inside-to-outside direction (outgoing).
4856c84c 743 * Find any that might be relevant, check against existing connections.
1da177e4 744 * Currently handles error types - unreachable, quench, ttl exceeded.
1da177e4 745 */
1ca5bb54
JA
746static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
747 unsigned int hooknum)
1da177e4 748{
1da177e4
LT
749 struct iphdr *iph;
750 struct icmphdr _icmph, *ic;
751 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
51ef348b 752 struct ip_vs_iphdr ciph;
1da177e4
LT
753 struct ip_vs_conn *cp;
754 struct ip_vs_protocol *pp;
4856c84c 755 unsigned int offset, ihl;
f2428ed5 756 union nf_inet_addr snet;
1da177e4
LT
757
758 *related = 1;
759
760 /* reassemble IP fragments */
eddc9ec5 761 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1ca5bb54 762 if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
1da177e4 763 return NF_STOLEN;
1da177e4
LT
764 }
765
eddc9ec5 766 iph = ip_hdr(skb);
1da177e4
LT
767 offset = ihl = iph->ihl * 4;
768 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
769 if (ic == NULL)
770 return NF_DROP;
771
14d5e834 772 IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
1da177e4 773 ic->type, ntohs(icmp_id(ic)),
14d5e834 774 &iph->saddr, &iph->daddr);
1da177e4
LT
775
776 /*
777 * Work through seeing if this is for us.
778 * These checks are supposed to be in an order that means easy
779 * things are checked first to speed up processing.... however
780 * this means that some packets will manage to get a long way
781 * down this stack and then be rejected, but that's life.
782 */
783 if ((ic->type != ICMP_DEST_UNREACH) &&
784 (ic->type != ICMP_SOURCE_QUENCH) &&
785 (ic->type != ICMP_TIME_EXCEEDED)) {
786 *related = 0;
787 return NF_ACCEPT;
788 }
789
790 /* Now find the contained IP header */
791 offset += sizeof(_icmph);
792 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
793 if (cih == NULL)
794 return NF_ACCEPT; /* The packet looks wrong, ignore */
795
796 pp = ip_vs_proto_get(cih->protocol);
797 if (!pp)
798 return NF_ACCEPT;
799
800 /* Is the embedded protocol header present? */
4412ec49 801 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1da177e4
LT
802 pp->dont_defrag))
803 return NF_ACCEPT;
804
0d79641a
JA
805 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
806 "Checking outgoing ICMP for");
1da177e4
LT
807
808 offset += cih->ihl * 4;
809
51ef348b 810 ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1da177e4 811 /* The embedded headers contain source and dest in reverse order */
51ef348b 812 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
1da177e4
LT
813 if (!cp)
814 return NF_ACCEPT;
815
f2428ed5
SH
816 snet.ip = iph->saddr;
817 return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
818 pp, offset, ihl);
1da177e4
LT
819}
820
#ifdef CONFIG_IP_VS_IPV6
/*
 *	IPv6 variant of ip_vs_out_icmp(): handle outgoing ICMPv6 error
 *	messages (unreachable, packet too big, time exceeded) that belong
 *	to an existing connection.
 *
 *	*related is set to 0 only for ICMPv6 types that are not handled
 *	here; otherwise it is 1 and the return value is the verdict.
 */
static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
			     unsigned int hooknum)
{
	struct ipv6hdr *iph;
	struct icmp6hdr icmp_buf, *ic;
	struct ipv6hdr cih_buf, *cih;	/* The ip header contained
					   within the ICMP */
	struct ip_vs_iphdr ciph;
	struct ip_vs_conn *cp;
	struct ip_vs_protocol *pp;
	unsigned int offset;
	union nf_inet_addr snet;

	*related = 1;

	/* reassemble IP fragments */
	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
		if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
			return NF_STOLEN;
	}

	iph = ipv6_hdr(skb);
	offset = sizeof(struct ipv6hdr);
	ic = skb_header_pointer(skb, offset, sizeof(icmp_buf), &icmp_buf);
	if (!ic)
		return NF_DROP;

	IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
		  &iph->saddr, &iph->daddr);

	/*
	 * Work through seeing if this is for us.
	 * These checks are supposed to be in an order that means easy
	 * things are checked first to speed up processing.... however
	 * this means that some packets will manage to get a long way
	 * down this stack and then be rejected, but that's life.
	 */
	switch (ic->icmp6_type) {
	case ICMPV6_DEST_UNREACH:
	case ICMPV6_PKT_TOOBIG:
	case ICMPV6_TIME_EXCEED:
		break;
	default:
		*related = 0;
		return NF_ACCEPT;
	}

	/* Now find the contained IP header */
	offset += sizeof(icmp_buf);
	cih = skb_header_pointer(skb, offset, sizeof(cih_buf), &cih_buf);
	if (!cih)
		return NF_ACCEPT; /* The packet looks wrong, ignore */

	pp = ip_vs_proto_get(cih->nexthdr);
	if (!pp)
		return NF_ACCEPT;

	/* Is the embedded protocol header present? */
	/* TODO: we don't support fragmentation at the moment anyways */
	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
		return NF_ACCEPT;

	IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
		      "Checking outgoing ICMPv6 for");

	/* step over the embedded IPv6 header to its transport header */
	offset += sizeof(struct ipv6hdr);

	ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
	/* The embedded headers contain source and dest in reverse order */
	cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
	if (!cp)
		return NF_ACCEPT;

	ipv6_addr_copy(&snet.in6, &iph->saddr);
	return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
				    pp, offset, sizeof(struct ipv6hdr));
}
#endif
898
2906f66a
VMR
899/*
900 * Check if sctp chunc is ABORT chunk
901 */
902static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
903{
904 sctp_chunkhdr_t *sch, schunk;
905 sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t),
906 sizeof(schunk), &schunk);
907 if (sch == NULL)
908 return 0;
909 if (sch->type == SCTP_CID_ABORT)
910 return 1;
911 return 0;
912}
913
2a3b791e 914static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
1da177e4
LT
915{
916 struct tcphdr _tcph, *th;
917
2a3b791e 918 th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
1da177e4
LT
919 if (th == NULL)
920 return 0;
921 return th->rst;
922}
923
4856c84c
MT
924/* Handle response packets: rewrite addresses and send away...
925 * Used for NAT and local client.
926 */
927static unsigned int
928handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
929 struct ip_vs_conn *cp, int ihl)
930{
0d79641a 931 IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
4856c84c
MT
932
933 if (!skb_make_writable(skb, ihl))
934 goto drop;
935
936 /* mangle the packet */
937 if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
938 goto drop;
939
940#ifdef CONFIG_IP_VS_IPV6
941 if (af == AF_INET6)
942 ipv6_hdr(skb)->saddr = cp->vaddr.in6;
943 else
944#endif
945 {
946 ip_hdr(skb)->saddr = cp->vaddr.ip;
947 ip_send_check(ip_hdr(skb));
948 }
949
8a803040
JA
950 /*
951 * nf_iterate does not expect change in the skb->dst->dev.
952 * It looks like it is not fatal to enable this code for hooks
953 * where our handlers are at the end of the chain list and
954 * when all next handlers use skb->dst->dev and not outdev.
955 * It will definitely route properly the inout NAT traffic
956 * when multiple paths are used.
957 */
958
4856c84c
MT
959 /* For policy routing, packets originating from this
960 * machine itself may be routed differently to packets
961 * passing through. We want this packet to be routed as
962 * if it came from this machine itself. So re-compute
963 * the routing information.
964 */
965#ifdef CONFIG_IP_VS_IPV6
f5a41847
JA
966 if (af == AF_INET6) {
967 if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
968 goto drop;
969 } else
4856c84c 970#endif
f5a41847
JA
971 if ((sysctl_ip_vs_snat_reroute ||
972 skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
973 ip_route_me_harder(skb, RTN_LOCAL) != 0)
974 goto drop;
4856c84c 975
0d79641a 976 IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
4856c84c
MT
977
978 ip_vs_out_stats(cp, skb);
979 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
cf356d69 980 skb->ipvs_property = 1;
f4bc17cd 981 if (!(cp->flags & IP_VS_CONN_F_NFCT))
cf356d69 982 ip_vs_notrack(skb);
f4bc17cd
JA
983 else
984 ip_vs_update_conntrack(skb, cp, 0);
4856c84c
MT
985 ip_vs_conn_put(cp);
986
4856c84c
MT
987 LeaveFunction(11);
988 return NF_ACCEPT;
989
990drop:
991 ip_vs_conn_put(cp);
992 kfree_skb(skb);
f4bc17cd 993 LeaveFunction(11);
4856c84c
MT
994 return NF_STOLEN;
995}
996
1da177e4 997/*
4856c84c 998 * Check if outgoing packet belongs to the established ip_vs_conn.
1da177e4
LT
999 */
1000static unsigned int
fc604767 1001ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
1da177e4 1002{
51ef348b 1003 struct ip_vs_iphdr iph;
1da177e4
LT
1004 struct ip_vs_protocol *pp;
1005 struct ip_vs_conn *cp;
1da177e4
LT
1006
1007 EnterFunction(11);
1008
fc604767 1009 /* Already marked as IPVS request or reply? */
6869c4d8 1010 if (skb->ipvs_property)
1da177e4
LT
1011 return NF_ACCEPT;
1012
fc604767
JA
1013 /* Bad... Do not break raw sockets */
1014 if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
1015 af == AF_INET)) {
1016 struct sock *sk = skb->sk;
1017 struct inet_sock *inet = inet_sk(skb->sk);
1018
1019 if (inet && sk->sk_family == PF_INET && inet->nodefrag)
1020 return NF_ACCEPT;
1021 }
1022
1023 if (unlikely(!skb_dst(skb)))
1024 return NF_ACCEPT;
1025
2a3b791e
JV
1026 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1027#ifdef CONFIG_IP_VS_IPV6
1028 if (af == AF_INET6) {
1029 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1ca5bb54
JA
1030 int related;
1031 int verdict = ip_vs_out_icmp_v6(skb, &related,
1032 hooknum);
1da177e4 1033
f5a41847 1034 if (related)
2a3b791e
JV
1035 return verdict;
1036 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1037 }
1038 } else
1039#endif
1040 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1ca5bb54
JA
1041 int related;
1042 int verdict = ip_vs_out_icmp(skb, &related, hooknum);
2a3b791e 1043
f5a41847 1044 if (related)
2a3b791e
JV
1045 return verdict;
1046 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1047 }
1da177e4 1048
51ef348b 1049 pp = ip_vs_proto_get(iph.protocol);
1da177e4
LT
1050 if (unlikely(!pp))
1051 return NF_ACCEPT;
1052
1053 /* reassemble IP fragments */
2a3b791e
JV
1054#ifdef CONFIG_IP_VS_IPV6
1055 if (af == AF_INET6) {
1ca5bb54
JA
1056 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1057 if (ip_vs_gather_frags_v6(skb,
1058 ip_vs_defrag_user(hooknum)))
1059 return NF_STOLEN;
2a3b791e 1060 }
1ca5bb54
JA
1061
1062 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
2a3b791e
JV
1063 } else
1064#endif
1065 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
1066 !pp->dont_defrag)) {
1ca5bb54
JA
1067 if (ip_vs_gather_frags(skb,
1068 ip_vs_defrag_user(hooknum)))
2a3b791e
JV
1069 return NF_STOLEN;
1070
1071 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1072 }
1da177e4
LT
1073
1074 /*
1075 * Check if the packet belongs to an existing entry
1076 */
2a3b791e 1077 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1da177e4 1078
cb59155f
JA
1079 if (likely(cp))
1080 return handle_response(af, skb, pp, cp, iph.len);
1081 if (sysctl_ip_vs_nat_icmp_send &&
1082 (pp->protocol == IPPROTO_TCP ||
1083 pp->protocol == IPPROTO_UDP ||
1084 pp->protocol == IPPROTO_SCTP)) {
1085 __be16 _ports[2], *pptr;
1086
1087 pptr = skb_header_pointer(skb, iph.len,
1088 sizeof(_ports), _ports);
1089 if (pptr == NULL)
1090 return NF_ACCEPT; /* Not for me */
1091 if (ip_vs_lookup_real_service(af, iph.protocol,
1092 &iph.saddr,
1093 pptr[0])) {
1094 /*
1095 * Notify the real server: there is no
1096 * existing entry if it is not RST
1097 * packet or not TCP packet.
1098 */
1099 if ((iph.protocol != IPPROTO_TCP &&
1100 iph.protocol != IPPROTO_SCTP)
1101 || ((iph.protocol == IPPROTO_TCP
1102 && !is_tcp_reset(skb, iph.len))
1103 || (iph.protocol == IPPROTO_SCTP
1104 && !is_sctp_abort(skb,
1105 iph.len)))) {
2a3b791e 1106#ifdef CONFIG_IP_VS_IPV6
cb59155f
JA
1107 if (af == AF_INET6) {
1108 struct net *net =
1109 dev_net(skb_dst(skb)->dev);
1110
1111 if (!skb->dev)
1112 skb->dev = net->loopback_dev;
1113 icmpv6_send(skb,
1114 ICMPV6_DEST_UNREACH,
1115 ICMPV6_PORT_UNREACH,
1116 0);
1117 } else
2a3b791e 1118#endif
cb59155f
JA
1119 icmp_send(skb,
1120 ICMP_DEST_UNREACH,
1121 ICMP_PORT_UNREACH, 0);
1122 return NF_DROP;
1da177e4
LT
1123 }
1124 }
1da177e4 1125 }
0d79641a 1126 IP_VS_DBG_PKT(12, af, pp, skb, 0,
cb59155f
JA
1127 "ip_vs_out: packet continues traversal as normal");
1128 return NF_ACCEPT;
1da177e4
LT
1129}
1130
fc604767 1131/*
cb59155f
JA
1132 * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1133 * used only for VS/NAT.
fc604767
JA
1134 * Check if packet is reply for established ip_vs_conn.
1135 */
1136static unsigned int
1137ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
1138 const struct net_device *in, const struct net_device *out,
1139 int (*okfn)(struct sk_buff *))
1140{
1141 return ip_vs_out(hooknum, skb, AF_INET);
1142}
1143
1144/*
1145 * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1146 * Check if packet is reply for established ip_vs_conn.
1147 */
1148static unsigned int
1149ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
1150 const struct net_device *in, const struct net_device *out,
1151 int (*okfn)(struct sk_buff *))
1152{
1153 unsigned int verdict;
1154
1155 /* Disable BH in LOCAL_OUT until all places are fixed */
1156 local_bh_disable();
1157 verdict = ip_vs_out(hooknum, skb, AF_INET);
1158 local_bh_enable();
1159 return verdict;
1160}
1161
1162#ifdef CONFIG_IP_VS_IPV6
1163
1164/*
cb59155f
JA
1165 * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1166 * used only for VS/NAT.
fc604767
JA
1167 * Check if packet is reply for established ip_vs_conn.
1168 */
1169static unsigned int
1170ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
1171 const struct net_device *in, const struct net_device *out,
1172 int (*okfn)(struct sk_buff *))
1173{
1174 return ip_vs_out(hooknum, skb, AF_INET6);
1175}
1176
1177/*
1178 * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1179 * Check if packet is reply for established ip_vs_conn.
1180 */
1181static unsigned int
1182ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
1183 const struct net_device *in, const struct net_device *out,
1184 int (*okfn)(struct sk_buff *))
1185{
1186 unsigned int verdict;
1187
1188 /* Disable BH in LOCAL_OUT until all places are fixed */
1189 local_bh_disable();
1190 verdict = ip_vs_out(hooknum, skb, AF_INET6);
1191 local_bh_enable();
1192 return verdict;
1193}
1194
1195#endif
1da177e4
LT
1196
1197/*
1198 * Handle ICMP messages in the outside-to-inside direction (incoming).
1199 * Find any that might be relevant, check against existing connections,
1200 * forward to the right destination host if relevant.
1201 * Currently handles error types - unreachable, quench, ttl exceeded.
1202 */
e905a9ed 1203static int
3db05fea 1204ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1da177e4 1205{
1da177e4
LT
1206 struct iphdr *iph;
1207 struct icmphdr _icmph, *ic;
1208 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
51ef348b 1209 struct ip_vs_iphdr ciph;
1da177e4
LT
1210 struct ip_vs_conn *cp;
1211 struct ip_vs_protocol *pp;
1212 unsigned int offset, ihl, verdict;
f2428ed5 1213 union nf_inet_addr snet;
1da177e4
LT
1214
1215 *related = 1;
1216
1217 /* reassemble IP fragments */
eddc9ec5 1218 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1ca5bb54 1219 if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
1da177e4 1220 return NF_STOLEN;
1da177e4
LT
1221 }
1222
eddc9ec5 1223 iph = ip_hdr(skb);
1da177e4
LT
1224 offset = ihl = iph->ihl * 4;
1225 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1226 if (ic == NULL)
1227 return NF_DROP;
1228
14d5e834 1229 IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
1da177e4 1230 ic->type, ntohs(icmp_id(ic)),
14d5e834 1231 &iph->saddr, &iph->daddr);
1da177e4
LT
1232
1233 /*
1234 * Work through seeing if this is for us.
1235 * These checks are supposed to be in an order that means easy
1236 * things are checked first to speed up processing.... however
1237 * this means that some packets will manage to get a long way
1238 * down this stack and then be rejected, but that's life.
1239 */
1240 if ((ic->type != ICMP_DEST_UNREACH) &&
1241 (ic->type != ICMP_SOURCE_QUENCH) &&
1242 (ic->type != ICMP_TIME_EXCEEDED)) {
1243 *related = 0;
1244 return NF_ACCEPT;
1245 }
1246
1247 /* Now find the contained IP header */
1248 offset += sizeof(_icmph);
1249 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1250 if (cih == NULL)
1251 return NF_ACCEPT; /* The packet looks wrong, ignore */
1252
1253 pp = ip_vs_proto_get(cih->protocol);
1254 if (!pp)
1255 return NF_ACCEPT;
1256
1257 /* Is the embedded protocol header present? */
4412ec49 1258 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1da177e4
LT
1259 pp->dont_defrag))
1260 return NF_ACCEPT;
1261
0d79641a
JA
1262 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
1263 "Checking incoming ICMP for");
1da177e4
LT
1264
1265 offset += cih->ihl * 4;
1266
51ef348b 1267 ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1da177e4 1268 /* The embedded headers contain source and dest in reverse order */
51ef348b 1269 cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
4856c84c
MT
1270 if (!cp) {
1271 /* The packet could also belong to a local client */
1272 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
f2428ed5
SH
1273 if (cp) {
1274 snet.ip = iph->saddr;
1275 return handle_response_icmp(AF_INET, skb, &snet,
1276 cih->protocol, cp, pp,
4856c84c 1277 offset, ihl);
f2428ed5 1278 }
1da177e4 1279 return NF_ACCEPT;
4856c84c 1280 }
1da177e4
LT
1281
1282 verdict = NF_DROP;
1283
1284 /* Ensure the checksum is correct */
60476372 1285 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1da177e4 1286 /* Failed checksum! */
14d5e834
HH
1287 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
1288 &iph->saddr);
1da177e4
LT
1289 goto out;
1290 }
1291
1292 /* do the statistics and put it back */
1293 ip_vs_in_stats(cp, skb);
1294 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1295 offset += 2 * sizeof(__u16);
1296 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
489fdeda
JA
1297 /* LOCALNODE from FORWARD hook is not supported */
1298 if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
1299 skb_rtable(skb)->rt_flags & RTCF_LOCAL) {
1300 IP_VS_DBG(1, "%s(): "
1301 "local delivery to %pI4 but in FORWARD\n",
1302 __func__, &skb_rtable(skb)->rt_dst);
1303 verdict = NF_DROP;
1304 }
1da177e4
LT
1305
1306 out:
1307 __ip_vs_conn_put(cp);
1308
1309 return verdict;
1310}
1311
2a3b791e
JV
1312#ifdef CONFIG_IP_VS_IPV6
1313static int
1314ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1315{
1316 struct ipv6hdr *iph;
1317 struct icmp6hdr _icmph, *ic;
1318 struct ipv6hdr _ciph, *cih; /* The ip header contained
1319 within the ICMP */
1320 struct ip_vs_iphdr ciph;
1321 struct ip_vs_conn *cp;
1322 struct ip_vs_protocol *pp;
1323 unsigned int offset, verdict;
f2428ed5 1324 union nf_inet_addr snet;
489fdeda 1325 struct rt6_info *rt;
2a3b791e
JV
1326
1327 *related = 1;
1328
1329 /* reassemble IP fragments */
1330 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1ca5bb54 1331 if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
2a3b791e
JV
1332 return NF_STOLEN;
1333 }
1334
1335 iph = ipv6_hdr(skb);
1336 offset = sizeof(struct ipv6hdr);
1337 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1338 if (ic == NULL)
1339 return NF_DROP;
1340
5b095d98 1341 IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n",
2a3b791e 1342 ic->icmp6_type, ntohs(icmpv6_id(ic)),
38ff4fa4 1343 &iph->saddr, &iph->daddr);
2a3b791e
JV
1344
1345 /*
1346 * Work through seeing if this is for us.
1347 * These checks are supposed to be in an order that means easy
1348 * things are checked first to speed up processing.... however
1349 * this means that some packets will manage to get a long way
1350 * down this stack and then be rejected, but that's life.
1351 */
1352 if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
1353 (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
1354 (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
1355 *related = 0;
1356 return NF_ACCEPT;
1357 }
1358
1359 /* Now find the contained IP header */
1360 offset += sizeof(_icmph);
1361 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1362 if (cih == NULL)
1363 return NF_ACCEPT; /* The packet looks wrong, ignore */
1364
1365 pp = ip_vs_proto_get(cih->nexthdr);
1366 if (!pp)
1367 return NF_ACCEPT;
1368
1369 /* Is the embedded protocol header present? */
1370 /* TODO: we don't support fragmentation at the moment anyways */
1371 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
1372 return NF_ACCEPT;
1373
0d79641a
JA
1374 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
1375 "Checking incoming ICMPv6 for");
2a3b791e
JV
1376
1377 offset += sizeof(struct ipv6hdr);
1378
1379 ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
1380 /* The embedded headers contain source and dest in reverse order */
1381 cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
f2428ed5
SH
1382 if (!cp) {
1383 /* The packet could also belong to a local client */
1384 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
1385 if (cp) {
178f5e49 1386 ipv6_addr_copy(&snet.in6, &iph->saddr);
f2428ed5
SH
1387 return handle_response_icmp(AF_INET6, skb, &snet,
1388 cih->nexthdr,
1389 cp, pp, offset,
1390 sizeof(struct ipv6hdr));
1391 }
2a3b791e 1392 return NF_ACCEPT;
f2428ed5 1393 }
2a3b791e
JV
1394
1395 verdict = NF_DROP;
1396
1397 /* do the statistics and put it back */
1398 ip_vs_in_stats(cp, skb);
2906f66a
VMR
1399 if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr ||
1400 IPPROTO_SCTP == cih->nexthdr)
2a3b791e
JV
1401 offset += 2 * sizeof(__u16);
1402 verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
489fdeda
JA
1403 /* LOCALNODE from FORWARD hook is not supported */
1404 if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
1405 (rt = (struct rt6_info *) skb_dst(skb)) &&
1406 rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK) {
1407 IP_VS_DBG(1, "%s(): "
1408 "local delivery to %pI6 but in FORWARD\n",
1409 __func__, &rt->rt6i_dst);
1410 verdict = NF_DROP;
1411 }
2a3b791e
JV
1412
1413 __ip_vs_conn_put(cp);
1414
1415 return verdict;
1416}
1417#endif
1418
1419
1da177e4
LT
1420/*
1421 * Check if it's for virtual services, look it up,
1422 * and send it on its way...
1423 */
1424static unsigned int
cb59155f 1425ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
1da177e4 1426{
51ef348b 1427 struct ip_vs_iphdr iph;
1da177e4
LT
1428 struct ip_vs_protocol *pp;
1429 struct ip_vs_conn *cp;
cb59155f 1430 int ret, restart, pkts;
2a3b791e 1431
fc604767
JA
1432 /* Already marked as IPVS request or reply? */
1433 if (skb->ipvs_property)
1434 return NF_ACCEPT;
1435
1da177e4 1436 /*
cb59155f
JA
1437 * Big tappo:
1438 * - remote client: only PACKET_HOST
1439 * - route: used for struct net when skb->dev is unset
1da177e4 1440 */
cb59155f
JA
1441 if (unlikely((skb->pkt_type != PACKET_HOST &&
1442 hooknum != NF_INET_LOCAL_OUT) ||
1443 !skb_dst(skb))) {
1444 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1445 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
1446 " ignored in hook %u\n",
1447 skb->pkt_type, iph.protocol,
1448 IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
1da177e4
LT
1449 return NF_ACCEPT;
1450 }
cb59155f
JA
1451 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1452
1453 /* Bad... Do not break raw sockets */
1454 if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
1455 af == AF_INET)) {
1456 struct sock *sk = skb->sk;
1457 struct inet_sock *inet = inet_sk(skb->sk);
1458
1459 if (inet && sk->sk_family == PF_INET && inet->nodefrag)
1460 return NF_ACCEPT;
1461 }
1da177e4 1462
94b26551
JV
1463#ifdef CONFIG_IP_VS_IPV6
1464 if (af == AF_INET6) {
1465 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1ca5bb54
JA
1466 int related;
1467 int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
1da177e4 1468
94b26551
JV
1469 if (related)
1470 return verdict;
1471 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1472 }
1473 } else
1474#endif
1475 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1ca5bb54
JA
1476 int related;
1477 int verdict = ip_vs_in_icmp(skb, &related, hooknum);
94b26551
JV
1478
1479 if (related)
1480 return verdict;
1481 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1482 }
1da177e4
LT
1483
1484 /* Protocol supported? */
51ef348b 1485 pp = ip_vs_proto_get(iph.protocol);
1da177e4
LT
1486 if (unlikely(!pp))
1487 return NF_ACCEPT;
1488
1da177e4
LT
1489 /*
1490 * Check if the packet belongs to an existing connection entry
1491 */
2a3b791e 1492 cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
1da177e4
LT
1493
1494 if (unlikely(!cp)) {
1495 int v;
1496
2a3b791e 1497 if (!pp->conn_schedule(af, skb, pp, &v, &cp))
1da177e4
LT
1498 return v;
1499 }
1500
1501 if (unlikely(!cp)) {
1502 /* sorry, all this trouble for a no-hit :) */
0d79641a 1503 IP_VS_DBG_PKT(12, af, pp, skb, 0,
cb59155f 1504 "ip_vs_in: packet continues traversal as normal");
1da177e4
LT
1505 return NF_ACCEPT;
1506 }
1507
0d79641a 1508 IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
1da177e4
LT
1509
1510 /* Check the server status */
1511 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1512 /* the destination server is not available */
1513
1514 if (sysctl_ip_vs_expire_nodest_conn) {
1515 /* try to expire the connection immediately */
1516 ip_vs_conn_expire_now(cp);
1da177e4 1517 }
dc8103f2
JA
1518 /* don't restart its timer, and silently
1519 drop the packet. */
1520 __ip_vs_conn_put(cp);
1da177e4
LT
1521 return NF_DROP;
1522 }
1523
1524 ip_vs_in_stats(cp, skb);
1525 restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
1526 if (cp->packet_xmit)
1527 ret = cp->packet_xmit(skb, cp, pp);
1528 /* do not touch skb anymore */
1529 else {
1530 IP_VS_DBG_RL("warning: packet_xmit is null");
1531 ret = NF_ACCEPT;
1532 }
1533
efac5276
RB
1534 /* Increase its packet counter and check if it is needed
1535 * to be synchronized
1536 *
1537 * Sync connection if it is about to close to
1538 * encorage the standby servers to update the connections timeout
1539 */
1e66dafc 1540 pkts = atomic_add_return(1, &cp->in_pkts);
2906f66a
VMR
1541 if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1542 cp->protocol == IPPROTO_SCTP) {
1543 if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
8ed2163f 1544 (pkts % sysctl_ip_vs_sync_threshold[1]
2906f66a
VMR
1545 == sysctl_ip_vs_sync_threshold[0])) ||
1546 (cp->old_state != cp->state &&
1547 ((cp->state == IP_VS_SCTP_S_CLOSED) ||
1548 (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
1549 (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
1550 ip_vs_sync_conn(cp);
1551 goto out;
1552 }
1553 }
1554
8ed2163f
JA
1555 /* Keep this block last: TCP and others with pp->num_states <= 1 */
1556 else if (af == AF_INET &&
c6883f58 1557 (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
efac5276
RB
1558 (((cp->protocol != IPPROTO_TCP ||
1559 cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1e66dafc 1560 (pkts % sysctl_ip_vs_sync_threshold[1]
efac5276
RB
1561 == sysctl_ip_vs_sync_threshold[0])) ||
1562 ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
1563 ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
9abfe315 1564 (cp->state == IP_VS_TCP_S_CLOSE) ||
9d3a0de7
RB
1565 (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
1566 (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
1da177e4 1567 ip_vs_sync_conn(cp);
2906f66a 1568out:
efac5276 1569 cp->old_state = cp->state;
1da177e4
LT
1570
1571 ip_vs_conn_put(cp);
1572 return ret;
1573}
1574
cb59155f
JA
1575/*
1576 * AF_INET handler in NF_INET_LOCAL_IN chain
1577 * Schedule and forward packets from remote clients
1578 */
1579static unsigned int
1580ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
1581 const struct net_device *in,
1582 const struct net_device *out,
1583 int (*okfn)(struct sk_buff *))
1584{
1585 return ip_vs_in(hooknum, skb, AF_INET);
1586}
1587
1588/*
1589 * AF_INET handler in NF_INET_LOCAL_OUT chain
1590 * Schedule and forward packets from local clients
1591 */
1592static unsigned int
1593ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
1594 const struct net_device *in, const struct net_device *out,
1595 int (*okfn)(struct sk_buff *))
1596{
1597 unsigned int verdict;
1598
1599 /* Disable BH in LOCAL_OUT until all places are fixed */
1600 local_bh_disable();
1601 verdict = ip_vs_in(hooknum, skb, AF_INET);
1602 local_bh_enable();
1603 return verdict;
1604}
1605
1606#ifdef CONFIG_IP_VS_IPV6
1607
1608/*
1609 * AF_INET6 handler in NF_INET_LOCAL_IN chain
1610 * Schedule and forward packets from remote clients
1611 */
1612static unsigned int
1613ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
1614 const struct net_device *in,
1615 const struct net_device *out,
1616 int (*okfn)(struct sk_buff *))
1617{
1618 return ip_vs_in(hooknum, skb, AF_INET6);
1619}
1620
1621/*
1622 * AF_INET6 handler in NF_INET_LOCAL_OUT chain
1623 * Schedule and forward packets from local clients
1624 */
1625static unsigned int
1626ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
1627 const struct net_device *in, const struct net_device *out,
1628 int (*okfn)(struct sk_buff *))
1629{
1630 unsigned int verdict;
1631
1632 /* Disable BH in LOCAL_OUT until all places are fixed */
1633 local_bh_disable();
1634 verdict = ip_vs_in(hooknum, skb, AF_INET6);
1635 local_bh_enable();
1636 return verdict;
1637}
1638
1639#endif
1640
1da177e4
LT
1641
1642/*
6e23ae2a 1643 * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1da177e4
LT
1644 * related packets destined for 0.0.0.0/0.
1645 * When fwmark-based virtual service is used, such as transparent
1646 * cache cluster, TCP packets can be marked and routed to ip_vs_in,
1647 * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
6e23ae2a 1648 * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1da177e4
LT
1649 * and send them to ip_vs_in_icmp.
1650 */
1651static unsigned int
3db05fea 1652ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1da177e4
LT
1653 const struct net_device *in, const struct net_device *out,
1654 int (*okfn)(struct sk_buff *))
1655{
1656 int r;
1657
3db05fea 1658 if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1da177e4
LT
1659 return NF_ACCEPT;
1660
3db05fea 1661 return ip_vs_in_icmp(skb, &r, hooknum);
1da177e4
LT
1662}
1663
2a3b791e
JV
1664#ifdef CONFIG_IP_VS_IPV6
1665static unsigned int
1666ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1667 const struct net_device *in, const struct net_device *out,
1668 int (*okfn)(struct sk_buff *))
1669{
1670 int r;
1671
1672 if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
1673 return NF_ACCEPT;
1674
1675 return ip_vs_in_icmp_v6(skb, &r, hooknum);
1676}
1677#endif
1678
1da177e4 1679
1999414a 1680static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
cb59155f
JA
1681 /* After packet filtering, change source only for VS/NAT */
1682 {
1683 .hook = ip_vs_reply4,
1684 .owner = THIS_MODULE,
1685 .pf = PF_INET,
1686 .hooknum = NF_INET_LOCAL_IN,
1687 .priority = 99,
1688 },
41c5b317
PM
1689 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1690 * or VS/NAT(change destination), so that filtering rules can be
1691 * applied to IPVS. */
1692 {
cb59155f 1693 .hook = ip_vs_remote_request4,
41c5b317
PM
1694 .owner = THIS_MODULE,
1695 .pf = PF_INET,
cb59155f
JA
1696 .hooknum = NF_INET_LOCAL_IN,
1697 .priority = 101,
41c5b317 1698 },
fc604767 1699 /* Before ip_vs_in, change source only for VS/NAT */
41c5b317 1700 {
fc604767 1701 .hook = ip_vs_local_reply4,
41c5b317
PM
1702 .owner = THIS_MODULE,
1703 .pf = PF_INET,
fc604767
JA
1704 .hooknum = NF_INET_LOCAL_OUT,
1705 .priority = -99,
41c5b317 1706 },
cb59155f
JA
1707 /* After mangle, schedule and forward local requests */
1708 {
1709 .hook = ip_vs_local_request4,
1710 .owner = THIS_MODULE,
1711 .pf = PF_INET,
1712 .hooknum = NF_INET_LOCAL_OUT,
1713 .priority = -98,
1714 },
41c5b317
PM
1715 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1716 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1717 {
1718 .hook = ip_vs_forward_icmp,
1719 .owner = THIS_MODULE,
1720 .pf = PF_INET,
cb59155f
JA
1721 .hooknum = NF_INET_FORWARD,
1722 .priority = 99,
41c5b317 1723 },
fc604767
JA
1724 /* After packet filtering, change source only for VS/NAT */
1725 {
1726 .hook = ip_vs_reply4,
1727 .owner = THIS_MODULE,
1728 .pf = PF_INET,
1729 .hooknum = NF_INET_FORWARD,
1730 .priority = 100,
1731 },
473b23d3 1732#ifdef CONFIG_IP_VS_IPV6
cb59155f
JA
1733 /* After packet filtering, change source only for VS/NAT */
1734 {
1735 .hook = ip_vs_reply6,
1736 .owner = THIS_MODULE,
1737 .pf = PF_INET6,
1738 .hooknum = NF_INET_LOCAL_IN,
1739 .priority = 99,
1740 },
473b23d3
JV
1741 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1742 * or VS/NAT(change destination), so that filtering rules can be
1743 * applied to IPVS. */
1744 {
cb59155f 1745 .hook = ip_vs_remote_request6,
473b23d3
JV
1746 .owner = THIS_MODULE,
1747 .pf = PF_INET6,
cb59155f
JA
1748 .hooknum = NF_INET_LOCAL_IN,
1749 .priority = 101,
473b23d3 1750 },
fc604767 1751 /* Before ip_vs_in, change source only for VS/NAT */
473b23d3 1752 {
fc604767 1753 .hook = ip_vs_local_reply6,
473b23d3 1754 .owner = THIS_MODULE,
fc604767
JA
1755 .pf = PF_INET,
1756 .hooknum = NF_INET_LOCAL_OUT,
1757 .priority = -99,
473b23d3 1758 },
cb59155f
JA
1759 /* After mangle, schedule and forward local requests */
1760 {
1761 .hook = ip_vs_local_request6,
1762 .owner = THIS_MODULE,
1763 .pf = PF_INET6,
1764 .hooknum = NF_INET_LOCAL_OUT,
1765 .priority = -98,
1766 },
473b23d3
JV
1767 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1768 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1769 {
1770 .hook = ip_vs_forward_icmp_v6,
1771 .owner = THIS_MODULE,
1772 .pf = PF_INET6,
cb59155f
JA
1773 .hooknum = NF_INET_FORWARD,
1774 .priority = 99,
473b23d3 1775 },
fc604767
JA
1776 /* After packet filtering, change source only for VS/NAT */
1777 {
1778 .hook = ip_vs_reply6,
1779 .owner = THIS_MODULE,
1780 .pf = PF_INET6,
1781 .hooknum = NF_INET_FORWARD,
1782 .priority = 100,
1783 },
473b23d3 1784#endif
1da177e4
LT
1785};
1786
1787
1788/*
1789 * Initialize IP Virtual Server
1790 */
1791static int __init ip_vs_init(void)
1792{
1793 int ret;
1794
a919cf4b
SW
1795 ip_vs_estimator_init();
1796
1da177e4
LT
1797 ret = ip_vs_control_init();
1798 if (ret < 0) {
1e3e238e 1799 pr_err("can't setup control.\n");
a919cf4b 1800 goto cleanup_estimator;
1da177e4
LT
1801 }
1802
1803 ip_vs_protocol_init();
1804
1805 ret = ip_vs_app_init();
1806 if (ret < 0) {
1e3e238e 1807 pr_err("can't setup application helper.\n");
1da177e4
LT
1808 goto cleanup_protocol;
1809 }
1810
1811 ret = ip_vs_conn_init();
1812 if (ret < 0) {
1e3e238e 1813 pr_err("can't setup connection table.\n");
1da177e4
LT
1814 goto cleanup_app;
1815 }
1816
41c5b317 1817 ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1da177e4 1818 if (ret < 0) {
1e3e238e 1819 pr_err("can't register hooks.\n");
1da177e4
LT
1820 goto cleanup_conn;
1821 }
1822
1e3e238e 1823 pr_info("ipvs loaded.\n");
1da177e4
LT
1824 return ret;
1825
1da177e4
LT
1826 cleanup_conn:
1827 ip_vs_conn_cleanup();
1828 cleanup_app:
1829 ip_vs_app_cleanup();
1830 cleanup_protocol:
1831 ip_vs_protocol_cleanup();
1832 ip_vs_control_cleanup();
a919cf4b
SW
1833 cleanup_estimator:
1834 ip_vs_estimator_cleanup();
1da177e4
LT
1835 return ret;
1836}
1837
1838static void __exit ip_vs_cleanup(void)
1839{
41c5b317 1840 nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1da177e4
LT
1841 ip_vs_conn_cleanup();
1842 ip_vs_app_cleanup();
1843 ip_vs_protocol_cleanup();
1844 ip_vs_control_cleanup();
a919cf4b 1845 ip_vs_estimator_cleanup();
1e3e238e 1846 pr_info("ipvs unloaded.\n");
1da177e4
LT
1847}
1848
/* Module entry/exit points and license declaration. */
module_init(ip_vs_init);
module_exit(ip_vs_cleanup);
MODULE_LICENSE("GPL");
This page took 0.947431 seconds and 5 git commands to generate.