1 /* Cluster IP hashmark target
2 * (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
3 * based on ideas of Fabio Olive Leite <olive@unixforge.org>
5 * Development of this code funded by SuSE Linux AG, http://www.suse.com/
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
12 #include <linux/module.h>
13 #include <linux/config.h>
14 #include <linux/proc_fs.h>
15 #include <linux/jhash.h>
16 #include <linux/bitops.h>
17 #include <linux/skbuff.h>
19 #include <linux/tcp.h>
20 #include <linux/udp.h>
21 #include <linux/icmp.h>
22 #include <linux/if_arp.h>
23 #include <linux/proc_fs.h>
24 #include <linux/seq_file.h>
26 #include <net/checksum.h>
28 #include <linux/netfilter_arp.h>
30 #include <linux/netfilter_ipv4/ip_tables.h>
31 #include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
32 #include <net/netfilter/nf_conntrack_compat.h>
/*
 * Module version string, debug switch and module metadata.
 * NOTE(review): DEBUG_CLUSTERIP is the only debug symbol defined here, yet
 * later conditionals in this file test DEBUG_CLUSTERP and CLUSTERIP_DEBUG.
 * Confirm which spelling is intended.
 * NOTE(review): the body of the conditional below is only partly visible in
 * this listing.
 */
34 #define CLUSTERIP_VERSION "0.8"
36 #define DEBUG_CLUSTERIP
38 #ifdef DEBUG_CLUSTERIP
44 #define ASSERT_READ_LOCK(x)
46 MODULE_LICENSE("GPL");
47 MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
48 MODULE_DESCRIPTION("iptables target for CLUSTERIP");
/*
 * State kept for one cluster IP address: list linkage, two atomic counters
 * (refcount and entries), the address with its MAC and device, the node
 * bookkeeping (total count plus a bitmap held in one unsigned long), the
 * proc entry and the hashing parameters.
 * NOTE(review): the closing brace is not visible in this listing and the
 * comment on the entries field is left unterminated here.
 */
50 struct clusterip_config
{
51 struct list_head list
; /* list of all configs */
52 atomic_t refcount
; /* reference count */
53 atomic_t entries
; /* number of entries/rules
56 u_int32_t clusterip
; /* the IP address */
57 u_int8_t clustermac
[ETH_ALEN
]; /* the MAC address */
58 struct net_device
*dev
; /* device */
59 u_int16_t num_total_nodes
; /* total number of nodes */
60 unsigned long local_nodes
; /* node number array */
63 struct proc_dir_entry
*pde
; /* proc dir entry */
65 enum clusterip_hashmode hash_mode
; /* which hashing mode */
66 u_int32_t hash_initval
; /* hash initialization */
/*
 * File-scope state: the list head for all configs, the rwlock guarding that
 * list, a forward declaration of the proc file operations (initialised
 * further down in the file) and the proc directory pointer that
 * ipt_clusterip_init assigns from proc_mkdir.
 */
69 static LIST_HEAD(clusterip_configs
);
71 /* clusterip_lock protects the clusterip_configs list */
72 static DEFINE_RWLOCK(clusterip_lock
);
75 static struct file_operations clusterip_proc_fops
;
76 static struct proc_dir_entry
*clusterip_procdir
;
/*
 * Take one reference on c by atomically incrementing c->refcount.
 * NOTE(review): the return type line and the braces are not visible in
 * this listing.
 */
80 clusterip_config_get(struct clusterip_config
*c
)
82 atomic_inc(&c
->refcount
);
/*
 * Drop one reference on c. atomic_dec_and_test reports when c->refcount
 * reaches zero.
 * NOTE(review): the statement executed on the zero case is not visible in
 * this listing. Presumably it releases c, to be confirmed.
 */
86 clusterip_config_put(struct clusterip_config
*c
)
88 if (atomic_dec_and_test(&c
->refcount
))
/*
 * Atomically bumps c->entries, which is a counter separate from
 * c->refcount.
 */
92 /* increase the count of entries(rules) using/referencing this config */
94 clusterip_config_entry_get(struct clusterip_config
*c
)
96 atomic_inc(&c
->entries
);
/*
 * Visible steps when c->entries drops to zero: take and release the write
 * side of clusterip_lock, remove the cluster MAC from the device multicast
 * list with dev_mc_delete, and remove the proc entry when CONFIG_PROC_FS is
 * set.
 * NOTE(review): the statement executed while the write lock is held is not
 * visible in this listing. Presumably it unlinks c from the list, to be
 * confirmed. Any release of the device reference and the closing of the
 * conditional are also not visible.
 */
99 /* decrease the count of entries using/referencing this config. If last
100 * entry(rule) is removed, remove the config from lists, but don't free it
101 * yet, since proc-files could still be holding references */
103 clusterip_config_entry_put(struct clusterip_config
*c
)
105 if (atomic_dec_and_test(&c
->entries
)) {
106 write_lock_bh(&clusterip_lock
);
108 write_unlock_bh(&clusterip_lock
);
110 dev_mc_delete(c
->dev
, c
->clustermac
, ETH_ALEN
, 0);
113 /* In case anyone still accesses the file, the open/close
114 * functions are also incrementing the refcount on their own,
115 * so it's safe to remove the entry even if it's in use. */
116 #ifdef CONFIG_PROC_FS
117 remove_proc_entry(c
->pde
->name
, c
->pde
->parent
);
/*
 * Walk clusterip_configs and compare each entry's clusterip field with the
 * requested address. ASSERT_READ_LOCK documents that the caller is expected
 * to hold clusterip_lock.
 * NOTE(review): the return statements (match and no-match) are not visible
 * in this listing.
 */
122 static struct clusterip_config
*
123 __clusterip_config_find(u_int32_t clusterip
)
125 struct list_head
*pos
;
127 ASSERT_READ_LOCK(&clusterip_lock
);
128 list_for_each(pos
, &clusterip_configs
) {
129 struct clusterip_config
*c
= list_entry(pos
,
130 struct clusterip_config
, list
);
131 if (c
->clusterip
== clusterip
) {
/*
 * Look up the config for clusterip under the read side of clusterip_lock
 * and increment counters on it before unlocking.
 * NOTE(review): two read_unlock_bh calls are visible against one
 * read_lock_bh. Presumably the first sits on a not-found early exit, to be
 * confirmed.
 * NOTE(review): the conditions guarding the two atomic_inc calls are not
 * visible. Presumably the entry argument selects whether c->entries is
 * bumped in addition to c->refcount, to be confirmed.
 */
139 static inline struct clusterip_config
*
140 clusterip_config_find_get(u_int32_t clusterip
, int entry
)
142 struct clusterip_config
*c
;
144 read_lock_bh(&clusterip_lock
);
145 c
= __clusterip_config_find(clusterip
);
147 read_unlock_bh(&clusterip_lock
);
150 atomic_inc(&c
->refcount
);
152 atomic_inc(&c
->entries
);
153 read_unlock_bh(&clusterip_lock
);
/*
 * For each of the first i->num_local_nodes entries of i->local_nodes, set
 * bit (value - 1) in c->local_nodes.
 * NOTE(review): no range check on the node numbers is visible here. A value
 * of zero, or one larger than the bit width of the unsigned long bitmap,
 * would address a bit outside c->local_nodes. Confirm that callers
 * validate the values.
 */
159 clusterip_config_init_nodelist(struct clusterip_config
*c
,
160 const struct ipt_clusterip_tgt_info
*i
)
164 for (n
= 0; n
< i
->num_local_nodes
; n
++) {
165 set_bit(i
->local_nodes
[n
] - 1, &c
->local_nodes
);
/*
 * Allocate and populate a new config. Visible steps, in order:
 *  - kmalloc with GFP_ATOMIC followed by a memset to zero
 *  - copy the MAC, total node count, local node bitmap, hash mode and hash
 *    initial value from the target info i
 *  - set both refcount and entries to 1
 *  - with CONFIG_PROC_FS, create a proc entry named after the dotted-quad
 *    form of ip below clusterip_procdir and attach clusterip_proc_fops
 *  - link the config into clusterip_configs under the write lock
 * NOTE(review): not visible in this listing are the declaration of buffer,
 * the failure checks after kmalloc and create_proc_entry, any store of the
 * ip and dev arguments into c, any store into the proc entry data pointer,
 * and the return statement. Confirm against the full file.
 */
169 static struct clusterip_config
*
170 clusterip_config_init(struct ipt_clusterip_tgt_info
*i
, u_int32_t ip
,
171 struct net_device
*dev
)
173 struct clusterip_config
*c
;
176 c
= kmalloc(sizeof(*c
), GFP_ATOMIC
);
180 memset(c
, 0, sizeof(*c
));
183 memcpy(&c
->clustermac
, &i
->clustermac
, ETH_ALEN
);
184 c
->num_total_nodes
= i
->num_total_nodes
;
185 clusterip_config_init_nodelist(c
, i
);
186 c
->hash_mode
= i
->hash_mode
;
187 c
->hash_initval
= i
->hash_initval
;
188 atomic_set(&c
->refcount
, 1);
189 atomic_set(&c
->entries
, 1);
191 #ifdef CONFIG_PROC_FS
192 /* create proc dir entry */
193 sprintf(buffer
, "%u.%u.%u.%u", NIPQUAD(ip
));
194 c
->pde
= create_proc_entry(buffer
, S_IWUSR
|S_IRUSR
, clusterip_procdir
);
199 c
->pde
->proc_fops
= &clusterip_proc_fops
;
203 write_lock_bh(&clusterip_lock
);
204 list_add(&c
->list
, &clusterip_configs
);
205 write_unlock_bh(&clusterip_lock
);
/*
 * Mark node number nodenum as local by setting bit (nodenum - 1) in
 * c->local_nodes. test_and_set_bit also reports whether the bit was
 * already set. An upper bound check against c->num_total_nodes is visible.
 * NOTE(review): the first half of the range check (presumably a lower
 * bound) and all return values are not visible in this listing.
 */
211 clusterip_add_node(struct clusterip_config
*c
, u_int16_t nodenum
)
215 nodenum
> c
->num_total_nodes
)
218 /* check if we already have this number in our bitfield */
219 if (test_and_set_bit(nodenum
- 1, &c
->local_nodes
))
/*
 * Clear bit (nodenum - 1) in c->local_nodes. test_and_clear_bit also
 * reports whether the bit had been set. An upper bound check against
 * c->num_total_nodes is visible.
 * NOTE(review): the first half of the range check and all return values
 * are not visible in this listing.
 */
226 clusterip_del_node(struct clusterip_config
*c
, u_int16_t nodenum
)
229 nodenum
> c
->num_total_nodes
)
232 if (test_and_clear_bit(nodenum
- 1, &c
->local_nodes
))
/*
 * Map a packet to a node number in the range 1..num_total_nodes.
 *
 * First switch: derive sport and dport from the transport header located at
 * iph + ihl * 4. Three header kinds are read: one through th (source and
 * dest), one through uh (source and dest) and one through ih (echo id as
 * sport, type shifted left by 8 or-ed with code as dport). A rate-limited
 * notice is printed on another path.
 * Second switch: hash the host-order source address alone, with sport, or
 * with sport and dport, using the jhash helpers seeded with
 * config->hash_initval.
 * Result: (hashval % config->num_total_nodes) + 1.
 *
 * NOTE(review): the case labels of the first switch, the break statements
 * and the declarations of th, uh and ih are not visible in this listing.
 * NOTE(review): no check that the packet is long enough to hold the
 * transport header is visible here. Confirm where that is validated.
 * NOTE(review): no guard against num_total_nodes being zero is visible
 * before the modulo.
 */
238 static inline u_int32_t
239 clusterip_hashfn(struct sk_buff
*skb
, struct clusterip_config
*config
)
241 struct iphdr
*iph
= skb
->nh
.iph
;
242 unsigned long hashval
;
243 u_int16_t sport
, dport
;
248 switch (iph
->protocol
) {
250 th
= (void *)iph
+iph
->ihl
*4;
251 sport
= ntohs(th
->source
);
252 dport
= ntohs(th
->dest
);
255 uh
= (void *)iph
+iph
->ihl
*4;
256 sport
= ntohs(uh
->source
);
257 dport
= ntohs(uh
->dest
);
260 ih
= (void *)iph
+iph
->ihl
*4;
261 sport
= ntohs(ih
->un
.echo
.id
);
262 dport
= (ih
->type
<<8)|ih
->code
;
265 if (net_ratelimit()) {
266 printk(KERN_NOTICE
"CLUSTERIP: unknown protocol `%u'\n",
272 switch (config
->hash_mode
) {
273 case CLUSTERIP_HASHMODE_SIP
:
274 hashval
= jhash_1word(ntohl(iph
->saddr
),
275 config
->hash_initval
);
277 case CLUSTERIP_HASHMODE_SIP_SPT
:
278 hashval
= jhash_2words(ntohl(iph
->saddr
), sport
,
279 config
->hash_initval
);
281 case CLUSTERIP_HASHMODE_SIP_SPT_DPT
:
282 hashval
= jhash_3words(ntohl(iph
->saddr
), sport
, dport
,
283 config
->hash_initval
);
286 /* to make gcc happy */
288 /* This cannot happen, unless the check function wasn't called
289 * at rule load time */
290 printk("CLUSTERIP: unknown mode `%u'\n", config
->hash_mode
);
295 /* node numbers are 1..n, not 0..n */
296 return ((hashval
% config
->num_total_nodes
)+1);
/*
 * Report whether this host owns the given node number: tests bit
 * (hash - 1) of config->local_nodes. The hash argument is the 1-based
 * value produced by clusterip_hashfn.
 * NOTE(review): the return type line is not visible in this listing.
 */
300 clusterip_responsible(struct clusterip_config
*config
, u_int32_t hash
)
302 return test_bit(hash
- 1, &config
->local_nodes
);
305 /***********************************************************************
307 ***********************************************************************/
/*
 * Packet hook of the target. Visible flow:
 *  - fetch the conntrack mark pointer and ctinfo with nf_ct_get_mark, and
 *    log an error on the no-conntrack path
 *  - treat ICMP packets in a RELATED state as a special case
 *  - compute the node hash with clusterip_hashfn
 *  - switch on ctinfo (only some case labels are visible)
 *  - consult clusterip_responsible and emit debug output for either outcome
 *  - force pkt_type to PACKET_HOST
 * NOTE(review): the verdicts returned on each path, the use made of the
 * mark and the remaining case labels are not visible in this listing.
 * NOTE(review): the guard DEBUG_CLUSTERP below does not match the
 * DEBUG_CLUSTERIP symbol defined near the top of the file, and the guarded
 * DUMP_TUPLE call uses ct, for which no declaration is visible here.
 * Confirm before enabling.
 */
310 target(struct sk_buff
**pskb
,
311 const struct net_device
*in
,
312 const struct net_device
*out
,
313 unsigned int hooknum
,
314 const struct xt_target
*target
,
315 const void *targinfo
,
318 const struct ipt_clusterip_tgt_info
*cipinfo
= targinfo
;
319 enum ip_conntrack_info ctinfo
;
320 u_int32_t
*mark
, hash
;
322 /* don't need to clusterip_config_get() here, since refcount
323 * is only decremented by destroy() - and ip_tables guarantees
324 * that the ->target() function isn't called after ->destroy() */
326 mark
= nf_ct_get_mark((*pskb
), &ctinfo
);
328 printk(KERN_ERR
"CLUSTERIP: no conntrack!\n");
329 /* FIXME: need to drop invalid ones, since replies
330 * to outgoing connections of other nodes will be
331 * marked as INVALID */
335 /* special case: ICMP error handling. conntrack distinguishes between
336 * error messages (RELATED) and information requests (see below) */
337 if ((*pskb
)->nh
.iph
->protocol
== IPPROTO_ICMP
338 && (ctinfo
== IP_CT_RELATED
339 || ctinfo
== IP_CT_RELATED
+IP_CT_IS_REPLY
))
342 /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
343 * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here
344 * on, which all have an ID field [relevant for hashing]. */
346 hash
= clusterip_hashfn(*pskb
, cipinfo
->config
);
353 case IP_CT_RELATED
+IP_CT_IS_REPLY
:
354 /* FIXME: we don't handle expectations at the
355 * moment. they can arrive on a different node than
356 * the master connection (e.g. FTP passive mode) */
357 case IP_CT_ESTABLISHED
:
358 case IP_CT_ESTABLISHED
+IP_CT_IS_REPLY
:
364 #ifdef DEBUG_CLUSTERP
365 DUMP_TUPLE(&ct
->tuplehash
[IP_CT_DIR_ORIGINAL
].tuple
);
367 DEBUGP("hash=%u ct_hash=%u ", hash
, *mark
);
368 if (!clusterip_responsible(cipinfo
->config
, hash
)) {
369 DEBUGP("not responsible\n");
372 DEBUGP("responsible\n");
374 /* despite being received via linklayer multicast, this is
375 * actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */
376 (*pskb
)->pkt_type
= PACKET_HOST
;
/*
 * Rule validation hook. Visible checks and actions:
 *  - reject hash modes other than the three CLUSTERIP_HASHMODE values
 *  - require a fully masked, non-zero destination address on the rule
 *  - look up an existing config for that address with
 *    clusterip_config_find_get (entry argument 1)
 *  - Case A: the rule already carries a config pointer, which must equal
 *    the one found
 *  - Case B: adopt the found config for a new rule
 *  - Case C: no config exists, so the CLUSTERIP_FLAG_NEW flag and an input
 *    interface name are required, the device is resolved with
 *    dev_get_by_name, a config is created with clusterip_config_init and
 *    the cluster MAC is added to the device multicast list
 * NOTE(review): the parameters that provide e_void and targinfo, the
 * conditions separating cases A, B and C, the failure checks after
 * dev_get_by_name and clusterip_config_init, and every return statement are
 * not visible in this listing.
 * NOTE(review): cases A and B call clusterip_config_entry_get after
 * clusterip_config_find_get was already asked to count an entry. Confirm
 * the intended counter balance against the full file.
 */
382 checkentry(const char *tablename
,
384 const struct xt_target
*target
,
386 unsigned int targinfosize
,
387 unsigned int hook_mask
)
389 struct ipt_clusterip_tgt_info
*cipinfo
= targinfo
;
390 const struct ipt_entry
*e
= e_void
;
392 struct clusterip_config
*config
;
394 if (cipinfo
->hash_mode
!= CLUSTERIP_HASHMODE_SIP
&&
395 cipinfo
->hash_mode
!= CLUSTERIP_HASHMODE_SIP_SPT
&&
396 cipinfo
->hash_mode
!= CLUSTERIP_HASHMODE_SIP_SPT_DPT
) {
397 printk(KERN_WARNING
"CLUSTERIP: unknown mode `%u'\n",
402 if (e
->ip
.dmsk
.s_addr
!= 0xffffffff
403 || e
->ip
.dst
.s_addr
== 0) {
404 printk(KERN_ERR
"CLUSTERIP: Please specify destination IP\n");
408 /* FIXME: further sanity checks */
410 config
= clusterip_config_find_get(e
->ip
.dst
.s_addr
, 1);
412 if (cipinfo
->config
!= NULL
) {
413 /* Case A: This is an entry that gets reloaded, since
414 * it still has a cipinfo->config pointer. Simply
415 * increase the entry refcount and return */
416 if (cipinfo
->config
!= config
) {
417 printk(KERN_ERR
"CLUSTERIP: Reloaded entry "
418 "has invalid config pointer!\n");
421 clusterip_config_entry_get(cipinfo
->config
);
423 /* Case B: This is a new rule referring to an existing
424 * clusterip config. */
425 cipinfo
->config
= config
;
426 clusterip_config_entry_get(cipinfo
->config
);
429 /* Case C: This is a completely new clusterip config */
430 if (!(cipinfo
->flags
& CLUSTERIP_FLAG_NEW
)) {
431 printk(KERN_WARNING
"CLUSTERIP: no config found for %u.%u.%u.%u, need 'new'\n", NIPQUAD(e
->ip
.dst
.s_addr
));
434 struct net_device
*dev
;
436 if (e
->ip
.iniface
[0] == '\0') {
437 printk(KERN_WARNING
"CLUSTERIP: Please specify an interface name\n");
441 dev
= dev_get_by_name(e
->ip
.iniface
);
443 printk(KERN_WARNING
"CLUSTERIP: no such interface %s\n", e
->ip
.iniface
);
447 config
= clusterip_config_init(cipinfo
,
448 e
->ip
.dst
.s_addr
, dev
);
450 printk(KERN_WARNING
"CLUSTERIP: cannot allocate config\n");
454 dev_mc_add(config
->dev
,config
->clustermac
, ETH_ALEN
, 0);
456 cipinfo
->config
= config
;
/*
 * Rule teardown hook: releases first the entry count and then the
 * reference count that the rule held on cipinfo->config.
 */
462 /* drop reference count of cluster config when rule is deleted */
463 static void destroy(const struct xt_target
*target
, void *targinfo
,
464 unsigned int targinfosize
)
466 struct ipt_clusterip_tgt_info
*cipinfo
= targinfo
;
468 /* if no more entries are referencing the config, remove it
469 * from the list and destroy the proc entry */
470 clusterip_config_entry_put(cipinfo
->config
);
472 clusterip_config_put(cipinfo
->config
);
/*
 * Target registration record passed to ipt_register_target by the module
 * init function.
 * NOTE(review): only the targetsize and checkentry initialisers are
 * visible in this listing. The remaining members are not shown.
 */
475 static struct ipt_target clusterip_tgt
= {
478 .targetsize
= sizeof(struct ipt_clusterip_tgt_info
),
479 .checkentry
= checkentry
,
485 /***********************************************************************
487 ***********************************************************************/
/*
 * Packed layout that arp_mangle overlays on the bytes following the ARP
 * header (see the payload pointer computed there).
 * NOTE(review): the struct header line and the src_ip and dst_ip members
 * referenced elsewhere in this file are not visible in this listing.
 */
489 /* hardcoded for 48bit ethernet and 32bit ipv4 addresses */
491 u_int8_t src_hw
[ETH_ALEN
];
493 u_int8_t dst_hw
[ETH_ALEN
];
495 } __attribute__ ((packed
));
/*
 * Debug helper: renders payload->src_hw as hex digits into hbuffer, two
 * digits per byte, then prints source IP, that hex string and destination
 * IP.
 * NOTE(review): this helper is compiled only when CLUSTERIP_DEBUG is
 * defined, which differs from the DEBUG_CLUSTERIP symbol defined near the
 * top of the file. Confirm the intended guard name.
 * NOTE(review): the declarations of j and k, any separator written between
 * bytes and the termination of hbuffer are not visible in this listing.
 */
497 #ifdef CLUSTERIP_DEBUG
498 static void arp_print(struct arp_payload
*payload
)
500 #define HBUFFERLEN 30
501 char hbuffer
[HBUFFERLEN
];
503 const char hexbuf
[]= "0123456789abcdef";
505 for (k
=0, j
=0; k
< HBUFFERLEN
-3 && j
< ETH_ALEN
; j
++) {
506 hbuffer
[k
++]=hexbuf
[(payload
->src_hw
[j
]>>4)&15];
507 hbuffer
[k
++]=hexbuf
[payload
->src_hw
[j
]&15];
512 printk("src %u.%u.%u.%u@%s, dst %u.%u.%u.%u\n",
513 NIPQUAD(payload
->src_ip
), hbuffer
,
514 NIPQUAD(payload
->dst_ip
));
/*
 * ARP output hook. Visible flow:
 *  - ignore ARP that is not Ethernet/IPv4 with 4-byte protocol and
 *    ETH_ALEN-byte hardware addresses
 *  - ignore operations other than request and reply
 *  - overlay struct arp_payload on the bytes after the ARP header
 *  - look up a config for payload->src_ip (entry argument 0, so only a
 *    plain reference is requested)
 *  - on the different-interface path, log and drop the reference
 *  - otherwise overwrite payload->src_hw with the cluster MAC and drop the
 *    reference
 * NOTE(review): the return verdicts, the not-found check after the lookup
 * and the condition that detects a different interface are not visible in
 * this listing.
 * NOTE(review): the second debug section is guarded by CLUSTERIP_DEBUG,
 * which differs from the DEBUG_CLUSTERIP symbol defined near the top of
 * the file.
 */
519 arp_mangle(unsigned int hook
,
520 struct sk_buff
**pskb
,
521 const struct net_device
*in
,
522 const struct net_device
*out
,
523 int (*okfn
)(struct sk_buff
*))
525 struct arphdr
*arp
= (*pskb
)->nh
.arph
;
526 struct arp_payload
*payload
;
527 struct clusterip_config
*c
;
529 /* we don't care about non-ethernet and non-ipv4 ARP */
530 if (arp
->ar_hrd
!= htons(ARPHRD_ETHER
)
531 || arp
->ar_pro
!= htons(ETH_P_IP
)
532 || arp
->ar_pln
!= 4 || arp
->ar_hln
!= ETH_ALEN
)
535 /* we only want to mangle arp requests and replies */
536 if (arp
->ar_op
!= htons(ARPOP_REPLY
)
537 && arp
->ar_op
!= htons(ARPOP_REQUEST
))
540 payload
= (void *)(arp
+1);
542 /* if there is no clusterip configuration for the arp reply's
543 * source ip, we don't want to mangle it */
544 c
= clusterip_config_find_get(payload
->src_ip
, 0);
548 /* normally the linux kernel always replies to arp queries of
549 * addresses on different interfaces. However, in the CLUSTERIP case
550 * this wouldn't work, since we didn't subscribe the mcast group on
551 * other interfaces */
553 DEBUGP("CLUSTERIP: not mangling arp reply on different "
554 "interface: cip'%s'-skb'%s'\n", c
->dev
->name
, out
->name
);
555 clusterip_config_put(c
);
559 /* mangle reply hardware address */
560 memcpy(payload
->src_hw
, c
->clustermac
, arp
->ar_hln
);
562 #ifdef CLUSTERIP_DEBUG
563 DEBUGP(KERN_DEBUG
"CLUSTERIP mangled arp reply: ");
567 clusterip_config_put(c
);
/*
 * Netfilter hook registration record, attached at the NF_ARP_OUT hook
 * point and registered by the module init function.
 * NOTE(review): only the hooknum initialiser is visible in this listing.
 */
572 static struct nf_hook_ops cip_arp_ops
= {
575 .hooknum
= NF_ARP_OUT
,
579 /***********************************************************************
581 ***********************************************************************/
583 #ifdef CONFIG_PROC_FS
/*
 * Iterator state for the proc seq_file: allocated in clusterip_seq_start,
 * advanced in clusterip_seq_next and read in clusterip_seq_show.
 * NOTE(review): the closing brace is not visible in this listing.
 */
585 struct clusterip_seq_position
{
586 unsigned int pos
; /* position */
587 unsigned int weight
; /* number of bits set == size */
588 unsigned int bit
; /* current bit */
589 unsigned long val
; /* current value */
/*
 * seq_file start callback. Takes a snapshot of the local node bitmap of the
 * config, counts its set bits, allocates a clusterip_seq_position and
 * primes it with the lowest set bit (as returned by ffs, so 1-based), which
 * is then cleared from the working copy in idx->val. Returns
 * ERR_PTR(-ENOMEM) on the allocation failure path.
 * NOTE(review): the snapshot variable is u_int32_t while c->local_nodes is
 * unsigned long, so on targets where long is wider than 32 bits the upper
 * bits are dropped here. Confirm this is acceptable.
 * NOTE(review): the declaration of weight, any end-of-sequence check on
 * *pos, the assignment of idx->pos and the return of idx are not visible in
 * this listing. If the bitmap were empty, ffs would yield 0 and the
 * clear_bit index would be negative unless such a check exists.
 */
592 static void *clusterip_seq_start(struct seq_file
*s
, loff_t
*pos
)
594 struct proc_dir_entry
*pde
= s
->private;
595 struct clusterip_config
*c
= pde
->data
;
597 u_int32_t local_nodes
;
598 struct clusterip_seq_position
*idx
;
600 /* FIXME: possible race */
601 local_nodes
= c
->local_nodes
;
602 weight
= hweight32(local_nodes
);
606 idx
= kmalloc(sizeof(struct clusterip_seq_position
), GFP_KERNEL
);
608 return ERR_PTR(-ENOMEM
);
611 idx
->weight
= weight
;
612 idx
->bit
= ffs(local_nodes
);
613 idx
->val
= local_nodes
;
614 clear_bit(idx
->bit
- 1, &idx
->val
);
/*
 * seq_file next callback. Once *pos has reached idx->weight the sequence is
 * over. Otherwise the next lowest set bit of idx->val is selected with ffs
 * and cleared from idx->val.
 * NOTE(review): the increment of *pos, the action taken at end of sequence
 * (presumably releasing idx and returning NULL, to be confirmed), the
 * update of idx->pos and the return value are not visible in this listing.
 */
619 static void *clusterip_seq_next(struct seq_file
*s
, void *v
, loff_t
*pos
)
621 struct clusterip_seq_position
*idx
= (struct clusterip_seq_position
*)v
;
624 if (*pos
>= idx
->weight
) {
628 idx
->bit
= ffs(idx
->val
);
629 clusterip_placeholder
/*
 * seq_file stop callback.
 * NOTE(review): the body is not visible in this listing.
 */
633 static void clusterip_seq_stop(struct seq_file
*s
, void *v
)
/*
 * seq_file show callback: prints the current node number idx->bit in
 * decimal. The last element (idx->pos equal to idx->weight - 1) is handled
 * specially.
 * NOTE(review): what is emitted between elements and after the last one,
 * and the return value, are not visible in this listing.
 */
638 static int clusterip_seq_show(struct seq_file
*s
, void *v
)
640 struct clusterip_seq_position
*idx
= (struct clusterip_seq_position
*)v
;
645 seq_printf(s
, "%u", idx
->bit
);
647 if (idx
->pos
== idx
->weight
- 1)
/*
 * seq_file operation table handed to seq_open in clusterip_proc_open. It
 * wires the four iterator callbacks defined in this file.
 */
653 static struct seq_operations clusterip_seq_ops
= {
654 .start
= clusterip_seq_start
,
655 .next
= clusterip_seq_next
,
656 .stop
= clusterip_seq_stop
,
657 .show
= clusterip_seq_show
,
/*
 * Open handler for the per-address proc file: opens a seq_file with
 * clusterip_seq_ops and takes a reference on the config stored in the proc
 * entry data pointer, so that the config outlives the open file.
 * NOTE(review): the check of ret, the store that makes the proc entry
 * reachable from the seq_file (clusterip_seq_start reads s->private) and
 * the return statement are not visible in this listing.
 */
660 static int clusterip_proc_open(struct inode
*inode
, struct file
*file
)
662 int ret
= seq_open(file
, &clusterip_seq_ops
);
665 struct seq_file
*sf
= file
->private_data
;
666 struct proc_dir_entry
*pde
= PDE(inode
);
667 struct clusterip_config
*c
= pde
->data
;
671 clusterip_config_get(c
);
/*
 * Release handler for the per-address proc file: releases the seq_file and
 * drops the config reference taken in clusterip_proc_open.
 * NOTE(review): the declaration of ret, any condition on the result of
 * seq_release and the return statement are not visible in this listing.
 */
677 static int clusterip_proc_release(struct inode
*inode
, struct file
*file
)
679 struct proc_dir_entry
*pde
= PDE(inode
);
680 struct clusterip_config
*c
= pde
->data
;
683 ret
= seq_release(inode
, file
);
686 clusterip_config_put(c
);
/*
 * Write handler for the per-address proc file. The input is copied into a
 * small stack buffer. A leading plus sign adds the decimal node number that
 * follows via clusterip_add_node, and a leading minus sign removes it via
 * clusterip_del_node.
 * NOTE(review): copy_from_user is asked for PROC_WRITELEN bytes regardless
 * of the size argument, so a shorter write reads past the data the caller
 * supplied. Confirm and bound the copy by size.
 * NOTE(review): no store of a terminating zero byte into buffer is visible
 * before simple_strtoul parses it.
 * NOTE(review): nodenum is unsigned long but the helpers take u_int16_t,
 * so larger values are narrowed at the call.
 * NOTE(review): the error codes returned on each failure path, the
 * handling of other leading characters and the final return value are not
 * visible in this listing.
 */
691 static ssize_t
clusterip_proc_write(struct file
*file
, const char __user
*input
,
692 size_t size
, loff_t
*ofs
)
694 #define PROC_WRITELEN 10
695 char buffer
[PROC_WRITELEN
+1];
696 struct proc_dir_entry
*pde
= PDE(file
->f_dentry
->d_inode
);
697 struct clusterip_config
*c
= pde
->data
;
698 unsigned long nodenum
;
700 if (copy_from_user(buffer
, input
, PROC_WRITELEN
))
703 if (*buffer
== '+') {
704 nodenum
= simple_strtoul(buffer
+1, NULL
, 10);
705 if (clusterip_add_node(c
, nodenum
))
707 } else if (*buffer
== '-') {
708 nodenum
= simple_strtoul(buffer
+1, NULL
,10);
709 if (clusterip_del_node(c
, nodenum
))
/*
 * File operations installed on each per-address proc entry by
 * clusterip_config_init.
 * NOTE(review): only owner, open, write and release are visible in this
 * listing. Other members such as read and llseek are not shown.
 */
717 static struct file_operations clusterip_proc_fops
= {
718 .owner
= THIS_MODULE
,
719 .open
= clusterip_proc_open
,
721 .write
= clusterip_proc_write
,
723 .release
= clusterip_proc_release
,
726 #endif /* CONFIG_PROC_FS */
/*
 * Module init: registers the iptables target, then the ARP hook, then (with
 * CONFIG_PROC_FS) creates the ipt_CLUSTERIP directory below proc_net, and
 * finally logs the version. The trailing unregister calls undo the ARP hook
 * and the target registration, in that order.
 * NOTE(review): the declaration of ret, the checks of each ret value, the
 * labels that the unwind calls belong to and the return statements are not
 * visible in this listing.
 */
728 static int __init
ipt_clusterip_init(void)
732 ret
= ipt_register_target(&clusterip_tgt
);
736 ret
= nf_register_hook(&cip_arp_ops
);
740 #ifdef CONFIG_PROC_FS
741 clusterip_procdir
= proc_mkdir("ipt_CLUSTERIP", proc_net
);
742 if (!clusterip_procdir
) {
743 printk(KERN_ERR
"CLUSTERIP: Unable to proc dir entry\n");
747 #endif /* CONFIG_PROC_FS */
749 printk(KERN_NOTICE
"ClusterIP Version %s loaded successfully\n",
754 nf_unregister_hook(&cip_arp_ops
);
756 ipt_unregister_target(&clusterip_tgt
);
/*
 * Module exit: logs the unload, removes the proc directory when
 * CONFIG_PROC_FS is set, then unregisters the ARP hook and the iptables
 * target (reverse of the registration order in the init function).
 * NOTE(review): the version argument of the printk and the end of the
 * conditional are not visible in this listing.
 */
760 static void __exit
ipt_clusterip_fini(void)
762 printk(KERN_NOTICE
"ClusterIP Version %s unloading\n",
764 #ifdef CONFIG_PROC_FS
765 remove_proc_entry(clusterip_procdir
->name
, clusterip_procdir
->parent
);
767 nf_unregister_hook(&cip_arp_ops
);
768 ipt_unregister_target(&clusterip_tgt
);
/* Register the module entry and exit points defined in this file. */
771 module_init(ipt_clusterip_init
);
772 module_exit(ipt_clusterip_fini
);