/*
 * net/netfilter/ipvs/ip_vs_lblcr.c
 *
 * IPVS:	Locality-Based Least-Connection with Replication scheduler
 *
 * Authors:	Wensong Zhang <wensong@gnuchina.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Changes:
 *	Julian Anastasov	:	Added the missing (dest->weight > 0)
 *					condition in ip_vs_dest_set_max.
 *
 */

/*
 * The lblc/r algorithm is as follows (pseudo code):
 *
 *       if serverSet[dest_ip] is null then
 *               n, serverSet[dest_ip] <- {weighted least-conn node};
 *       else
 *               n <- {least-conn (alive) node in serverSet[dest_ip]};
 *               if (n is null) OR
 *                  (n.conns > n.weight AND
 *                   there is a node m with m.conns < m.weight/2) then
 *                   n <- {weighted least-conn node};
 *                   add n to serverSet[dest_ip];
 *       if |serverSet[dest_ip]| > 1 AND
 *           now - serverSet[dest_ip].lastMod > T then
 *           m <- {most conn node in serverSet[dest_ip]};
 *           remove m from serverSet[dest_ip];
 *       if serverSet[dest_ip] changed then
 *           serverSet[dest_ip].lastMod <- now;
 *
 *       return n;
 *
 */
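
/*
 * Illustrative note (added commentary, not part of the original file):
 * the lblcr scheduler is normally selected per virtual service from user
 * space, e.g. with ipvsadm. The addresses below are hypothetical:
 *
 *     ipvsadm -A -t 192.168.0.1:80 -s lblcr
 *     ipvsadm -a -t 192.168.0.1:80 -r 10.0.0.2:80 -m
 *     ipvsadm -a -t 192.168.0.1:80 -r 10.0.0.3:80 -m
 *
 * Each destination IP address is then mapped to a small, replicated set
 * of real servers chosen by the pseudo code above.
 */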

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/ip.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/jiffies.h>
#include <linux/list.h>
#include <linux/slab.h>

/* for sysctl */
#include <linux/fs.h>
#include <linux/sysctl.h>
#include <net/net_namespace.h>

#include <net/ip_vs.h>

/*
 * This is used for garbage collection of stale IPVS lblcr entries,
 * when the table is full.
 */
#define CHECK_EXPIRE_INTERVAL   (60*HZ)
#define ENTRY_TIMEOUT           (6*60*HZ)

/*
 * This is for the full expiration check.
 * When there has been no partial expiration check (garbage collection)
 * for half an hour, do a full expiration check to collect stale
 * entries that haven't been touched for a day.
 */
#define COUNT_FOR_FULL_EXPIRATION   30

/*
 *     for IPVS lblcr entry hash table
 */
#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
#define CONFIG_IP_VS_LBLCR_TAB_BITS      10
#endif
#define IP_VS_LBLCR_TAB_BITS     CONFIG_IP_VS_LBLCR_TAB_BITS
#define IP_VS_LBLCR_TAB_SIZE     (1 << IP_VS_LBLCR_TAB_BITS)
#define IP_VS_LBLCR_TAB_MASK     (IP_VS_LBLCR_TAB_SIZE - 1)


/*
 *      IPVS destination set structure and operations
 */
struct ip_vs_dest_set_elem {
	struct list_head	list;          /* list link */
	struct ip_vs_dest	*dest;         /* destination server */
};

struct ip_vs_dest_set {
	atomic_t                size;          /* set size */
	unsigned long           lastmod;       /* last modified time */
	struct list_head	list;          /* destination list */
	rwlock_t	        lock;          /* lock for this list */
};


static struct ip_vs_dest_set_elem *
ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
{
	struct ip_vs_dest_set_elem *e;

	list_for_each_entry(e, &set->list, list) {
		if (e->dest == dest)
			/* already existed */
			return NULL;
	}

	e = kmalloc(sizeof(*e), GFP_ATOMIC);
	if (e == NULL) {
		pr_err("%s(): no memory\n", __func__);
		return NULL;
	}

	atomic_inc(&dest->refcnt);
	e->dest = dest;

	list_add(&e->list, &set->list);
	atomic_inc(&set->size);

	set->lastmod = jiffies;
	return e;
}

static void
ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
{
	struct ip_vs_dest_set_elem *e;

	list_for_each_entry(e, &set->list, list) {
		if (e->dest == dest) {
			/* HIT */
			atomic_dec(&set->size);
			set->lastmod = jiffies;
			atomic_dec(&e->dest->refcnt);
			list_del(&e->list);
			kfree(e);
			break;
		}
	}
}

static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
{
	struct ip_vs_dest_set_elem *e, *ep;

	write_lock(&set->lock);
	list_for_each_entry_safe(e, ep, &set->list, list) {
		/*
		 * We don't kfree dest because it is referred to either
		 * by its service or by the trash dest list.
		 */
		atomic_dec(&e->dest->refcnt);
		list_del(&e->list);
		kfree(e);
	}
	write_unlock(&set->lock);
}

/* get weighted least-connection node in the destination set */
static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
{
	register struct ip_vs_dest_set_elem *e;
	struct ip_vs_dest *dest, *least;
	int loh, doh;

	if (set == NULL)
		return NULL;

	/* select the first destination server, whose weight > 0 */
	list_for_each_entry(e, &set->list, list) {
		least = e->dest;
		if (least->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		if ((atomic_read(&least->weight) > 0)
		    && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
			loh = atomic_read(&least->activeconns) * 50
				+ atomic_read(&least->inactconns);
			goto nextstage;
		}
	}
	return NULL;

	/* find the destination with the weighted least load */
  nextstage:
	list_for_each_entry(e, &set->list, list) {
		dest = e->dest;
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		doh = atomic_read(&dest->activeconns) * 50
			+ atomic_read(&dest->inactconns);
		if ((loh * atomic_read(&dest->weight) >
		     doh * atomic_read(&least->weight))
		    && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
			least = dest;
			loh = doh;
		}
	}

	IP_VS_DBG_BUF(6, "%s(): server %s:%d "
		      "activeconns %d refcnt %d weight %d overhead %d\n",
		      __func__,
		      IP_VS_DBG_ADDR(least->af, &least->addr),
		      ntohs(least->port),
		      atomic_read(&least->activeconns),
		      atomic_read(&least->refcnt),
		      atomic_read(&least->weight), loh);
	return least;
}

/* get weighted most-connection node in the destination set */
static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
{
	register struct ip_vs_dest_set_elem *e;
	struct ip_vs_dest *dest, *most;
	int moh, doh;

	if (set == NULL)
		return NULL;

	/* select the first destination server, whose weight > 0 */
	list_for_each_entry(e, &set->list, list) {
		most = e->dest;
		if (atomic_read(&most->weight) > 0) {
			moh = atomic_read(&most->activeconns) * 50
				+ atomic_read(&most->inactconns);
			goto nextstage;
		}
	}
	return NULL;

	/* find the destination with the weighted most load */
  nextstage:
	list_for_each_entry(e, &set->list, list) {
		dest = e->dest;
		doh = atomic_read(&dest->activeconns) * 50
			+ atomic_read(&dest->inactconns);
		/* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
		if ((moh * atomic_read(&dest->weight) <
		     doh * atomic_read(&most->weight))
		    && (atomic_read(&dest->weight) > 0)) {
			most = dest;
			moh = doh;
		}
	}

	IP_VS_DBG_BUF(6, "%s(): server %s:%d "
		      "activeconns %d refcnt %d weight %d overhead %d\n",
		      __func__,
		      IP_VS_DBG_ADDR(most->af, &most->addr), ntohs(most->port),
		      atomic_read(&most->activeconns),
		      atomic_read(&most->refcnt),
		      atomic_read(&most->weight), moh);
	return most;
}


/*
 *      IPVS lblcr entry represents an association between destination
 *      IP address and its destination server set
 */
struct ip_vs_lblcr_entry {
	struct list_head        list;
	int			af;		/* address family */
	union nf_inet_addr      addr;           /* destination IP address */
	struct ip_vs_dest_set   set;            /* destination server set */
	unsigned long           lastuse;        /* last used time */
};


/*
 *      IPVS lblcr hash table
 */
struct ip_vs_lblcr_table {
	struct list_head        bucket[IP_VS_LBLCR_TAB_SIZE];  /* hash bucket */
	atomic_t                entries;        /* number of entries */
	int                     max_size;       /* maximum size of entries */
	struct timer_list       periodic_timer; /* collect stale entries */
	int                     rover;          /* rover for expire check */
	int                     counter;        /* counter for no expire */
};


/*
 *      IPVS LBLCR sysctl table
 */

static ctl_table vs_vars_table[] = {
	{
		.procname	= "lblcr_expiration",
		.data		= NULL,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{ }
};
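
/*
 * Illustrative note (added commentary, not part of the original file):
 * lblcr_expiration is exported through the IPVS sysctl tree and handled
 * by proc_dointvec_jiffies, so user space reads and writes it in seconds.
 * Assuming the usual net.ipv4.vs path, lowering the per-entry expiration
 * from the one-day default to one hour would look like:
 *
 *     sysctl -w net.ipv4.vs.lblcr_expiration=3600
 */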

static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
{
	list_del(&en->list);
	ip_vs_dest_set_eraseall(&en->set);
	kfree(en);
}

/*
 *	Returns hash value for IPVS LBLCR entry
 */
static inline unsigned
ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
{
	__be32 addr_fold = addr->ip;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		addr_fold = addr->ip6[0]^addr->ip6[1]^
			    addr->ip6[2]^addr->ip6[3];
#endif
	return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
}
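
/*
 * Note (illustrative, added commentary): 2654435761 is the 32-bit golden
 * ratio constant commonly used for multiplicative hashing; multiplying
 * the folded address by it and masking with IP_VS_LBLCR_TAB_MASK spreads
 * nearby addresses across the table's IP_VS_LBLCR_TAB_SIZE buckets.
 * For example, with the default 10 table bits, a hypothetical address
 * 10.0.0.1 (0x0a000001 after ntohl) lands in bucket
 * (0x0a000001 * 2654435761) & 1023.
 */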

/*
 *	Hash an entry in the ip_vs_lblcr_table.
 */
static void
ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
{
	unsigned hash = ip_vs_lblcr_hashkey(en->af, &en->addr);

	list_add(&en->list, &tbl->bucket[hash]);
	atomic_inc(&tbl->entries);
}

/*
 *  Get ip_vs_lblcr_entry associated with supplied parameters. Called under
 *  read lock.
 */
static inline struct ip_vs_lblcr_entry *
ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
		const union nf_inet_addr *addr)
{
	unsigned hash = ip_vs_lblcr_hashkey(af, addr);
	struct ip_vs_lblcr_entry *en;

	list_for_each_entry(en, &tbl->bucket[hash], list)
		if (ip_vs_addr_equal(af, &en->addr, addr))
			return en;

	return NULL;
}

/*
 * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
 * IP address to a server. Called under write lock.
 */
static inline struct ip_vs_lblcr_entry *
ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
		struct ip_vs_dest *dest)
{
	struct ip_vs_lblcr_entry *en;

	en = ip_vs_lblcr_get(dest->af, tbl, daddr);
	if (!en) {
		en = kmalloc(sizeof(*en), GFP_ATOMIC);
		if (!en) {
			pr_err("%s(): no memory\n", __func__);
			return NULL;
		}

		en->af = dest->af;
		ip_vs_addr_copy(dest->af, &en->addr, daddr);
		en->lastuse = jiffies;

		/* initialize its dest set */
		atomic_set(&(en->set.size), 0);
		INIT_LIST_HEAD(&en->set.list);
		rwlock_init(&en->set.lock);

		ip_vs_lblcr_hash(tbl, en);
	}

	write_lock(&en->set.lock);
	ip_vs_dest_set_insert(&en->set, dest);
	write_unlock(&en->set.lock);

	return en;
}

/*
 *      Flush all the entries of the specified table.
 */
static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
{
	int i;
	struct ip_vs_lblcr_entry *en, *nxt;

	/* No locking required, only called during cleanup. */
	for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) {
		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
			ip_vs_lblcr_free(en);
		}
	}
}

static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
{
	struct ip_vs_lblcr_table *tbl = svc->sched_data;
	unsigned long now = jiffies;
	int i, j;
	struct ip_vs_lblcr_entry *en, *nxt;
	struct netns_ipvs *ipvs = net_ipvs(svc->net);

	for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;

		write_lock(&svc->sched_lock);
		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
			if (time_after(en->lastuse
					+ ipvs->sysctl_lblcr_expiration, now))
				continue;

			ip_vs_lblcr_free(en);
			atomic_dec(&tbl->entries);
		}
		write_unlock(&svc->sched_lock);
	}
	tbl->rover = j;
}

/*
 *      Periodic timer handler for the IPVS lblcr table.
 *      It is used to collect stale entries when the number of entries
 *      exceeds the maximum size of the table.
 *
 *      Fixme: we probably need a more complicated algorithm to collect
 *             entries that have not been used for a long time even
 *             if the number of entries doesn't exceed the maximum size
 *             of the table.
 *      The full expiration check is for this purpose now.
 */
static void ip_vs_lblcr_check_expire(unsigned long data)
{
	struct ip_vs_service *svc = (struct ip_vs_service *) data;
	struct ip_vs_lblcr_table *tbl = svc->sched_data;
	unsigned long now = jiffies;
	int goal;
	int i, j;
	struct ip_vs_lblcr_entry *en, *nxt;

	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
		/* do full expiration check */
		ip_vs_lblcr_full_check(svc);
		tbl->counter = 1;
		goto out;
	}

	if (atomic_read(&tbl->entries) <= tbl->max_size) {
		tbl->counter++;
		goto out;
	}

	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
	if (goal > tbl->max_size/2)
		goal = tbl->max_size/2;

	for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;

		write_lock(&svc->sched_lock);
		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
				continue;

			ip_vs_lblcr_free(en);
			atomic_dec(&tbl->entries);
			goal--;
		}
		write_unlock(&svc->sched_lock);
		if (goal <= 0)
			break;
	}
	tbl->rover = j;

  out:
	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
}

static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
{
	int i;
	struct ip_vs_lblcr_table *tbl;

	/*
	 *    Allocate the ip_vs_lblcr_table for this service
	 */
	tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
	if (tbl == NULL) {
		pr_err("%s(): no memory\n", __func__);
		return -ENOMEM;
	}
	svc->sched_data = tbl;
	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
		  "current service\n", sizeof(*tbl));

	/*
	 *    Initialize the hash buckets
	 */
	for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) {
		INIT_LIST_HEAD(&tbl->bucket[i]);
	}
	tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
	tbl->rover = 0;
	tbl->counter = 1;

	/*
	 *    Hook periodic timer for garbage collection
	 */
	setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
			(unsigned long)svc);
	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);

	return 0;
}

static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
{
	struct ip_vs_lblcr_table *tbl = svc->sched_data;

	/* remove periodic timer */
	del_timer_sync(&tbl->periodic_timer);

	/* got to clean up table entries here */
	ip_vs_lblcr_flush(tbl);

	/* release the table itself */
	kfree(tbl);
	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
		  sizeof(*tbl));

	return 0;
}

static inline struct ip_vs_dest *
__ip_vs_lblcr_schedule(struct ip_vs_service *svc)
{
	struct ip_vs_dest *dest, *least;
	int loh, doh;

	/*
	 * We think the overhead of processing active connections is fifty
	 * times higher than that of inactive connections on average. (This
	 * factor of fifty may not be accurate; we will change it later.) We
	 * use the following formula to estimate the overhead:
	 *		dest->activeconns*50 + dest->inactconns
	 * and the load:
	 *		(dest overhead) / dest->weight
	 *
	 * Remember -- no floats in kernel mode!!!
	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
	 *		h1/w1 > h2/w2
	 * if every weight is larger than zero.
	 *
	 * The server with weight=0 is quiesced and will not receive any
	 * new connection.
	 */
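
	/*
	 * Worked example (hypothetical numbers, added for illustration):
	 *   dest A: activeconns=10, inactconns=5,   weight=2 -> overhead 505
	 *   dest B: activeconns=3,  inactconns=100, weight=1 -> overhead 250
	 * Cross-multiplying, 505 * 1 > 250 * 2, i.e. A's load per unit of
	 * weight (252.5) exceeds B's (250), so B is preferred.
	 */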
	list_for_each_entry(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		if (atomic_read(&dest->weight) > 0) {
			least = dest;
			loh = atomic_read(&least->activeconns) * 50
				+ atomic_read(&least->inactconns);
			goto nextstage;
		}
	}
	return NULL;

	/*
	 *    Find the destination with the least load.
	 */
  nextstage:
	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		doh = atomic_read(&dest->activeconns) * 50
			+ atomic_read(&dest->inactconns);
		if (loh * atomic_read(&dest->weight) >
		    doh * atomic_read(&least->weight)) {
			least = dest;
			loh = doh;
		}
	}

	IP_VS_DBG_BUF(6, "LBLCR: server %s:%d "
		      "activeconns %d refcnt %d weight %d overhead %d\n",
		      IP_VS_DBG_ADDR(least->af, &least->addr),
		      ntohs(least->port),
		      atomic_read(&least->activeconns),
		      atomic_read(&least->refcnt),
		      atomic_read(&least->weight), loh);

	return least;
}

/*
 *   If this destination server is overloaded and there is a less loaded
 *   server, then return true.
 */
static inline int
is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
		struct ip_vs_dest *d;

		list_for_each_entry(d, &svc->destinations, n_list) {
			if (atomic_read(&d->activeconns)*2
			    < atomic_read(&d->weight)) {
				return 1;
			}
		}
	}
	return 0;
}
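
/*
 * Example (hypothetical numbers, added commentary): with weight 4, a
 * destination counts as overloaded once it has more than 4 active
 * connections, and it is only bypassed if some other destination in the
 * service has fewer active connections than half its own weight
 * (e.g. fewer than 2 for weight 4).
 */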

/*
 *    Locality-Based (weighted) Least-Connection scheduling
 */
static struct ip_vs_dest *
ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
	struct ip_vs_lblcr_table *tbl = svc->sched_data;
	struct ip_vs_iphdr iph;
	struct ip_vs_dest *dest = NULL;
	struct ip_vs_lblcr_entry *en;

	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);

	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);

	/* First look in our cache */
	read_lock(&svc->sched_lock);
	en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr);
	if (en) {
		struct netns_ipvs *ipvs = net_ipvs(svc->net);
		/* We only hold a read lock, but this is atomic */
		en->lastuse = jiffies;

		/* Get the least loaded destination */
		read_lock(&en->set.lock);
		dest = ip_vs_dest_set_min(&en->set);
		read_unlock(&en->set.lock);

		/* More than one destination + enough time passed by, cleanup */
		if (atomic_read(&en->set.size) > 1 &&
		    time_after(jiffies, en->set.lastmod +
				ipvs->sysctl_lblcr_expiration)) {
			struct ip_vs_dest *m;

			write_lock(&en->set.lock);
			m = ip_vs_dest_set_max(&en->set);
			if (m)
				ip_vs_dest_set_erase(&en->set, m);
			write_unlock(&en->set.lock);
		}

		/* If the destination is not overloaded, use it */
		if (dest && !is_overloaded(dest, svc)) {
			read_unlock(&svc->sched_lock);
			goto out;
		}

		/* The cache entry is invalid, time to schedule */
		dest = __ip_vs_lblcr_schedule(svc);
		if (!dest) {
			IP_VS_ERR_RL("LBLCR: no destination available\n");
			read_unlock(&svc->sched_lock);
			return NULL;
		}

		/* Update our cache entry */
		write_lock(&en->set.lock);
		ip_vs_dest_set_insert(&en->set, dest);
		write_unlock(&en->set.lock);
	}
	read_unlock(&svc->sched_lock);

	if (dest)
		goto out;

	/* No cache entry, time to schedule */
	dest = __ip_vs_lblcr_schedule(svc);
	if (!dest) {
		IP_VS_DBG(1, "no destination available\n");
		return NULL;
	}

	/* If we fail to create a cache entry, we'll just use the valid dest */
	write_lock(&svc->sched_lock);
	ip_vs_lblcr_new(tbl, &iph.daddr, dest);
	write_unlock(&svc->sched_lock);

out:
	IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n",
		      IP_VS_DBG_ADDR(svc->af, &iph.daddr),
		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));

	return dest;
}


/*
 *      IPVS LBLCR Scheduler structure
 */
static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
{
	.name =			"lblcr",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.n_list =		LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
	.init_service =		ip_vs_lblcr_init_svc,
	.done_service =		ip_vs_lblcr_done_svc,
	.schedule =		ip_vs_lblcr_schedule,
};

/*
 *  per netns init.
 */
static int __net_init __ip_vs_lblcr_init(struct net *net)
{
	struct netns_ipvs *ipvs = net_ipvs(net);

	if (!net_eq(net, &init_net)) {
		ipvs->lblcr_ctl_table = kmemdup(vs_vars_table,
						sizeof(vs_vars_table),
						GFP_KERNEL);
		if (ipvs->lblcr_ctl_table == NULL)
			return -ENOMEM;
	} else
		ipvs->lblcr_ctl_table = vs_vars_table;
	ipvs->sysctl_lblcr_expiration = 24*60*60*HZ;
	ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration;

#ifdef CONFIG_SYSCTL
	ipvs->lblcr_ctl_header =
		register_net_sysctl_table(net, net_vs_ctl_path,
					  ipvs->lblcr_ctl_table);
	if (!ipvs->lblcr_ctl_header) {
		if (!net_eq(net, &init_net))
			kfree(ipvs->lblcr_ctl_table);
		return -ENOMEM;
	}
#endif

	return 0;
}

static void __net_exit __ip_vs_lblcr_exit(struct net *net)
{
	struct netns_ipvs *ipvs = net_ipvs(net);

#ifdef CONFIG_SYSCTL
	unregister_net_sysctl_table(ipvs->lblcr_ctl_header);
#endif

	if (!net_eq(net, &init_net))
		kfree(ipvs->lblcr_ctl_table);
}

static struct pernet_operations ip_vs_lblcr_ops = {
	.init = __ip_vs_lblcr_init,
	.exit = __ip_vs_lblcr_exit,
};

static int __init ip_vs_lblcr_init(void)
{
	int ret;

	ret = register_pernet_subsys(&ip_vs_lblcr_ops);
	if (ret)
		return ret;

	ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
	if (ret)
		unregister_pernet_subsys(&ip_vs_lblcr_ops);
	return ret;
}

static void __exit ip_vs_lblcr_cleanup(void)
{
	unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
	unregister_pernet_subsys(&ip_vs_lblcr_ops);
}


module_init(ip_vs_lblcr_init);
module_exit(ip_vs_lblcr_cleanup);
MODULE_LICENSE("GPL");