/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 *	- new API and handling of conntrack/nat helpers
 *	- now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 *	- add usage/reference counts to ip_conntrack_expect
 *	- export ip_conntrack[_expect]_{find_get,put} functions
 */
#include <linux/config.h>
#include <linux/types.h>
#include <linux/icmp.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/ip.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
   registrations, conntrack timers */
#define ASSERT_READ_LOCK(x)
#define ASSERT_WRITE_LOCK(x)
#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>

#define IP_CONNTRACK_VERSION	"2.3"

#define DEBUGP(format, args...)
DEFINE_RWLOCK(ip_conntrack_lock);

/* ip_conntrack_standalone needs this */
atomic_t ip_conntrack_count = ATOMIC_INIT(0);

void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
int ip_conntrack_max;
struct list_head *ip_conntrack_hash;
static kmem_cache_t *ip_conntrack_cachep;
static kmem_cache_t *ip_conntrack_expect_cachep;
struct ip_conntrack ip_conntrack_untracked;
unsigned int ip_ct_log_invalid;
static LIST_HEAD(unconfirmed);
static int ip_conntrack_vmalloc;

static unsigned int ip_conntrack_next_id = 1;
static unsigned int ip_conntrack_expect_next_id = 1;
#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
struct notifier_block *ip_conntrack_chain;
struct notifier_block *ip_conntrack_expect_chain;

DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
/* Deliver cached events for a confirmed, live conntrack, then mark the
 * cache entry as delivered. */
static inline void __deliver_cached_events(struct ip_conntrack_ecache *ecache)
{
        if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
                notifier_call_chain(&ip_conntrack_chain, ecache->events,
                                    ecache->ct);
        ecache->events = 0;
}
void __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
{
        __deliver_cached_events(ecache);
}
/* Deliver all cached events for a particular conntrack. This is called
 * by code prior to async packet handling or freeing the skb */
void
ip_conntrack_deliver_cached_events_for(const struct ip_conntrack *ct)
{
        struct ip_conntrack_ecache *ecache =
                        &__get_cpu_var(ip_conntrack_ecache);

        if (ecache->ct == ct) {
                DEBUGP("ecache: delivering event for %p\n", ct);
                __deliver_cached_events(ecache);
        } else if (net_ratelimit())
                printk(KERN_WARNING "ecache: want to deliver for %p, "
                       "but cache has %p\n", ct, ecache->ct);

        /* signal that events have already been delivered */
        ecache->ct = NULL;
}
/* Deliver cached events for old pending events, if current conntrack != old */
void ip_conntrack_event_cache_init(const struct sk_buff *skb)
{
        struct ip_conntrack *ct = (struct ip_conntrack *) skb->nfct;
        struct ip_conntrack_ecache *ecache =
                        &__get_cpu_var(ip_conntrack_ecache);

        /* take care of delivering potentially old events */
        if (ecache->ct != ct) {
                enum ip_conntrack_info ctinfo;
                /* we have to check, since at startup the cache is NULL */
                if (likely(ecache->ct)) {
                        DEBUGP("ecache: entered for different conntrack: "
                               "ecache->ct=%p, skb->nfct=%p. delivering "
                               "events\n", ecache->ct, ct);
                        __deliver_cached_events(ecache);
                        ip_conntrack_put(ecache->ct);
                } else {
                        DEBUGP("ecache: entered for conntrack %p, "
                               "cache was clean before\n", ct);
                }

                /* initialize for this conntrack/packet */
                ecache->ct = ip_conntrack_get(skb, &ctinfo);
                /* ecache->events cleared by __deliver_cached_events() */
        } else {
                DEBUGP("ecache: re-entered for conntrack %p.\n", ct);
        }
}

#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);

static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;
/* Hash a tuple into a bucket index; the random seed ip_conntrack_hash_rnd
 * makes the distribution unpredictable to remote attackers. */
static u_int32_t
hash_conntrack(const struct ip_conntrack_tuple *tuple)
{
        return (jhash_3words(tuple->src.ip,
                             (tuple->dst.ip ^ tuple->dst.protonum),
                             (tuple->src.u.all | (tuple->dst.u.all << 16)),
                             ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
}
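/* Illustrative sketch (not part of the original file): the two directions
 * of one connection hash independently, which is why insertion and
 * deletion below always touch two buckets.  example_bucket_pair() is a
 * hypothetical helper, assuming the tuplehash layout from ip_conntrack.h: */
static inline void example_bucket_pair(const struct ip_conntrack *ct,
                                       unsigned int *ho, unsigned int *hr)
{
        /* bucket index for packets flowing in the original direction */
        *ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        /* bucket index for packets flowing in the reply direction */
        *hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
}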
int
ip_ct_get_tuple(const struct iphdr *iph,
                const struct sk_buff *skb,
                unsigned int dataoff,
                struct ip_conntrack_tuple *tuple,
                const struct ip_conntrack_protocol *protocol)
{
        if (iph->frag_off & htons(IP_OFFSET)) {
                printk("ip_conntrack_core: Frag of proto %u.\n",
                       iph->protocol);
                return 0;
        }

        tuple->src.ip = iph->saddr;
        tuple->dst.ip = iph->daddr;
        tuple->dst.protonum = iph->protocol;
        tuple->dst.dir = IP_CT_DIR_ORIGINAL;

        return protocol->pkt_to_tuple(skb, dataoff, tuple);
}
int
ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig,
                   const struct ip_conntrack_protocol *protocol)
{
        inverse->src.ip = orig->dst.ip;
        inverse->dst.ip = orig->src.ip;
        inverse->dst.protonum = orig->dst.protonum;
        inverse->dst.dir = !orig->dst.dir;

        return protocol->invert_tuple(inverse, orig);
}
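/* Worked example (illustrative): inverting the TCP tuple
 * 192.168.0.1:1024 -> 10.0.0.1:80 swaps the addresses here, and the
 * protocol's invert_tuple() swaps the per-proto part (the ports),
 * yielding 10.0.0.1:80 -> 192.168.0.1:1024 with dst.dir flipped. */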
/* ip_conntrack_expect helper functions */
static void unlink_expect(struct ip_conntrack_expect *exp)
{
        ASSERT_WRITE_LOCK(&ip_conntrack_lock);
        IP_NF_ASSERT(!timer_pending(&exp->timeout));
        list_del(&exp->list);
        CONNTRACK_STAT_INC(expect_delete);
        exp->master->expecting--;
}
void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp)
{
        unlink_expect(exp);
        ip_conntrack_expect_put(exp);
}
static void expectation_timed_out(unsigned long ul_expect)
{
        struct ip_conntrack_expect *exp = (void *)ul_expect;

        write_lock_bh(&ip_conntrack_lock);
        unlink_expect(exp);
        write_unlock_bh(&ip_conntrack_lock);
        ip_conntrack_expect_put(exp);
}
struct ip_conntrack_expect *
__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *i;

        list_for_each_entry(i, &ip_conntrack_expect_list, list) {
                if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
                        atomic_inc(&i->use);
                        return i;
                }
        }
        return NULL;
}
/* Just find an expectation corresponding to a tuple. */
struct ip_conntrack_expect *
ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *i;

        read_lock_bh(&ip_conntrack_lock);
        i = __ip_conntrack_expect_find(tuple);
        read_unlock_bh(&ip_conntrack_lock);

        return i;
}
/* If an expectation for this connection is found, it is deleted from
 * the global list and returned. */
static struct ip_conntrack_expect *
find_expectation(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *i;

        list_for_each_entry(i, &ip_conntrack_expect_list, list) {
                /* If master is not in hash table yet (ie. packet hasn't left
                   this machine yet), how can other end know about expected?
                   Hence these are not the droids you are looking for (if
                   master ct never got confirmed, we'd hold a reference to it
                   and weird things would happen to future packets). */
                if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
                    && is_confirmed(i->master)
                    && del_timer(&i->timeout)) {
                        unlink_expect(i);
                        return i;
                }
        }
        return NULL;
}
/* delete all expectations for this conntrack */
void ip_ct_remove_expectations(struct ip_conntrack *ct)
{
        struct ip_conntrack_expect *i, *tmp;

        /* Optimization: most connections never expect any others. */
        if (ct->expecting == 0)
                return;

        list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
                if (i->master == ct && del_timer(&i->timeout)) {
                        unlink_expect(i);
                        ip_conntrack_expect_put(i);
                }
        }
}
static void
clean_from_lists(struct ip_conntrack *ct)
{
        unsigned int ho, hr;

        DEBUGP("clean_from_lists(%p)\n", ct);
        ASSERT_WRITE_LOCK(&ip_conntrack_lock);

        ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
        LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
        LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);

        /* Destroy all pending expectations */
        ip_ct_remove_expectations(ct);
}
static void
destroy_conntrack(struct nf_conntrack *nfct)
{
        struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
        struct ip_conntrack_protocol *proto;

        DEBUGP("destroy_conntrack(%p)\n", ct);
        IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
        IP_NF_ASSERT(!timer_pending(&ct->timeout));

        set_bit(IPS_DYING_BIT, &ct->status);

        /* To make sure we don't get any weird locking issues here:
         * destroy_conntrack() MUST NOT be called with a write lock
         * to ip_conntrack_lock!!! -HW */
        proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
        if (proto && proto->destroy)
                proto->destroy(ct);

        if (ip_conntrack_destroyed)
                ip_conntrack_destroyed(ct);

        write_lock_bh(&ip_conntrack_lock);
        /* Expectations will have been removed in clean_from_lists,
         * except TFTP can create an expectation on the first packet,
         * before connection is in the list, so we need to clean here,
         * too. */
        ip_ct_remove_expectations(ct);

        /* We overload first tuple to link into unconfirmed list. */
        if (!is_confirmed(ct)) {
                BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
                list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
        }

        CONNTRACK_STAT_INC(delete);
        write_unlock_bh(&ip_conntrack_lock);

        if (ct->master)
                ip_conntrack_put(ct->master);

        DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
        ip_conntrack_free(ct);
}
static void death_by_timeout(unsigned long ul_conntrack)
{
        struct ip_conntrack *ct = (void *)ul_conntrack;

        ip_conntrack_event(IPCT_DESTROY, ct);
        write_lock_bh(&ip_conntrack_lock);
        /* Inside lock so preempt is disabled on module removal path.
         * Otherwise we can get spurious warnings. */
        CONNTRACK_STAT_INC(delete_list);
        clean_from_lists(ct);
        write_unlock_bh(&ip_conntrack_lock);
        ip_conntrack_put(ct);
}
static inline int
conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
                    const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        ASSERT_READ_LOCK(&ip_conntrack_lock);
        return tuplehash_to_ctrack(i) != ignored_conntrack
                && ip_ct_tuple_equal(tuple, &i->tuple);
}
struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
                    const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int hash = hash_conntrack(tuple);

        ASSERT_READ_LOCK(&ip_conntrack_lock);
        list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
                if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
                        CONNTRACK_STAT_INC(found);
                        return h;
                }
                CONNTRACK_STAT_INC(searched);
        }

        return NULL;
}
/* Find a connection corresponding to a tuple. */
struct ip_conntrack_tuple_hash *
ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
                      const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        read_lock_bh(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        if (h)
                atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
        read_unlock_bh(&ip_conntrack_lock);

        return h;
}
static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
                                       unsigned int hash,
                                       unsigned int repl_hash)
{
        ct->id = ++ip_conntrack_next_id;
        list_prepend(&ip_conntrack_hash[hash],
                     &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
        list_prepend(&ip_conntrack_hash[repl_hash],
                     &ct->tuplehash[IP_CT_DIR_REPLY].list);
}
void ip_conntrack_hash_insert(struct ip_conntrack *ct)
{
        unsigned int hash, repl_hash;

        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        write_lock_bh(&ip_conntrack_lock);
        __ip_conntrack_hash_insert(ct, hash, repl_hash);
        write_unlock_bh(&ip_conntrack_lock);
}
/* Confirm a connection given skb; places it in hash table */
int
__ip_conntrack_confirm(struct sk_buff **pskb)
{
        unsigned int hash, repl_hash;
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        ct = ip_conntrack_get(*pskb, &ctinfo);

        /* ipt_REJECT uses ip_conntrack_attach to attach related
           ICMP/TCP RST packets in other direction.  Actual packet
           which created connection will be IP_CT_NEW or for an
           expected connection, IP_CT_RELATED. */
        if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
                return NF_ACCEPT;

        hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

        /* We're not in hash table, and we refuse to set up related
           connections for unconfirmed conns.  But packet copies and
           REJECT will give spurious warnings here. */
        /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

        /* No external references means no one else could have
           confirmed us. */
        IP_NF_ASSERT(!is_confirmed(ct));
        DEBUGP("Confirming conntrack %p\n", ct);

        write_lock_bh(&ip_conntrack_lock);

        /* See if there's one in the list already, including reverse:
           NAT could have grabbed it without realizing, since we're
           not in the hash.  If there is, we lost the race. */
        if (!LIST_FIND(&ip_conntrack_hash[hash],
                       conntrack_tuple_cmp,
                       struct ip_conntrack_tuple_hash *,
                       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
            && !LIST_FIND(&ip_conntrack_hash[repl_hash],
                          conntrack_tuple_cmp,
                          struct ip_conntrack_tuple_hash *,
                          &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
                /* Remove from unconfirmed list */
                list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);

                __ip_conntrack_hash_insert(ct, hash, repl_hash);
                /* Timer relative to confirmation time, not original
                   setting time, otherwise we'd get timer wrap in
                   weird delay cases. */
                ct->timeout.expires += jiffies;
                add_timer(&ct->timeout);
                atomic_inc(&ct->ct_general.use);
                set_bit(IPS_CONFIRMED_BIT, &ct->status);
                CONNTRACK_STAT_INC(insert);
                write_unlock_bh(&ip_conntrack_lock);

                ip_conntrack_event_cache(IPCT_HELPER, *pskb);
#ifdef CONFIG_IP_NF_NAT_NEEDED
                if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
                    test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
                        ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
#endif
                ip_conntrack_event_cache(master_ct(ct) ?
                                         IPCT_RELATED : IPCT_NEW, *pskb);

                return NF_ACCEPT;
        }

        CONNTRACK_STAT_INC(insert_failed);
        write_unlock_bh(&ip_conntrack_lock);

        return NF_DROP;
}
/* Returns true if a connection corresponding to the tuple exists
   (required for NAT). */
int
ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
                         const struct ip_conntrack *ignored_conntrack)
{
        struct ip_conntrack_tuple_hash *h;

        read_lock_bh(&ip_conntrack_lock);
        h = __ip_conntrack_find(tuple, ignored_conntrack);
        read_unlock_bh(&ip_conntrack_lock);

        return h != NULL;
}
/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
{
        return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
}
static int early_drop(struct list_head *chain)
{
        /* Traverse backwards: gives us oldest, which is roughly LRU */
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack *ct = NULL;
        int dropped = 0;

        read_lock_bh(&ip_conntrack_lock);
        h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
        if (h) {
                ct = tuplehash_to_ctrack(h);
                atomic_inc(&ct->ct_general.use);
        }
        read_unlock_bh(&ip_conntrack_lock);

        if (!ct)
                return dropped;

        if (del_timer(&ct->timeout)) {
                death_by_timeout((unsigned long)ct);
                dropped = 1;
                CONNTRACK_STAT_INC(early_drop);
        }
        ip_conntrack_put(ct);
        return dropped;
}
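/* Note on the eviction policy above: only conntracks not yet flagged
 * IPS_ASSURED (the unreplied() predicate) are candidates, so assured,
 * established flows survive table pressure at the expense of half-open
 * ones. */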
static inline int helper_cmp(const struct ip_conntrack_helper *i,
                             const struct ip_conntrack_tuple *rtuple)
{
        return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}
static struct ip_conntrack_helper *
__ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
{
        return LIST_FIND(&helpers, helper_cmp,
                         struct ip_conntrack_helper *,
                         tuple);
}
struct ip_conntrack_helper *
ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_helper *helper;

        /* need ip_conntrack_lock to assure that helper exists until
         * try_module_get() is called */
        read_lock_bh(&ip_conntrack_lock);

        helper = __ip_conntrack_helper_find(tuple);
        if (helper) {
                /* need to increase module usage count to assure helper will
                 * not go away while the caller is e.g. busy putting a
                 * conntrack in the hash that uses the helper */
                if (!try_module_get(helper->me))
                        helper = NULL;
        }

        read_unlock_bh(&ip_conntrack_lock);

        return helper;
}
void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
{
        module_put(helper->me);
}
struct ip_conntrack_protocol *
__ip_conntrack_proto_find(u_int8_t protocol)
{
        return ip_ct_protos[protocol];
}
/* this is guaranteed to always return a valid protocol helper, since
 * it falls back to generic_protocol */
struct ip_conntrack_protocol *
ip_conntrack_proto_find_get(u_int8_t protocol)
{
        struct ip_conntrack_protocol *p;

        preempt_disable();
        p = __ip_conntrack_proto_find(protocol);
        if (p) {
                if (!try_module_get(p->me))
                        p = &ip_conntrack_generic_protocol;
        }
        preempt_enable();

        return p;
}
void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
{
        module_put(p->me);
}
struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
                                        struct ip_conntrack_tuple *repl)
{
        struct ip_conntrack *conntrack;

        if (!ip_conntrack_hash_rnd_initted) {
                get_random_bytes(&ip_conntrack_hash_rnd, 4);
                ip_conntrack_hash_rnd_initted = 1;
        }

        if (ip_conntrack_max
            && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
                unsigned int hash = hash_conntrack(orig);
                /* Try dropping from this hash chain. */
                if (!early_drop(&ip_conntrack_hash[hash])) {
                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ip_conntrack: table full, dropping"
                                       " packet.\n");
                        return ERR_PTR(-ENOMEM);
                }
        }

        conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
        if (!conntrack) {
                DEBUGP("Can't allocate conntrack.\n");
                return ERR_PTR(-ENOMEM);
        }

        memset(conntrack, 0, sizeof(*conntrack));
        atomic_set(&conntrack->ct_general.use, 1);
        conntrack->ct_general.destroy = destroy_conntrack;
        conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
        /* Don't set timer yet: wait for confirmation */
        init_timer(&conntrack->timeout);
        conntrack->timeout.data = (unsigned long)conntrack;
        conntrack->timeout.function = death_by_timeout;

        atomic_inc(&ip_conntrack_count);

        return conntrack;
}
void
ip_conntrack_free(struct ip_conntrack *conntrack)
{
        atomic_dec(&ip_conntrack_count);
        kmem_cache_free(ip_conntrack_cachep, conntrack);
}
/* Allocate a new conntrack: we return -ENOMEM if classification
 * failed due to stress.  Otherwise it really is unclassifiable. */
static struct ip_conntrack_tuple_hash *
init_conntrack(struct ip_conntrack_tuple *tuple,
               struct ip_conntrack_protocol *protocol,
               struct sk_buff *skb)
{
        struct ip_conntrack *conntrack;
        struct ip_conntrack_tuple repl_tuple;
        struct ip_conntrack_expect *exp;

        if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
                DEBUGP("Can't invert tuple.\n");
                return NULL;
        }

        conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
        if (conntrack == NULL || IS_ERR(conntrack))
                return (struct ip_conntrack_tuple_hash *)conntrack;

        if (!protocol->new(conntrack, skb)) {
                ip_conntrack_free(conntrack);
                return NULL;
        }

        write_lock_bh(&ip_conntrack_lock);
        exp = find_expectation(tuple);

        if (exp) {
                DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
                       conntrack, exp);
                /* Welcome, Mr. Bond.  We've been expecting you... */
                __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
                conntrack->master = exp->master;
#ifdef CONFIG_IP_NF_CONNTRACK_MARK
                conntrack->mark = exp->master->mark;
#endif
#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
    defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
                /* this is ugly, but there is no other place to put it */
                conntrack->nat.masq_index = exp->master->nat.masq_index;
#endif
                nf_conntrack_get(&conntrack->master->ct_general);
                CONNTRACK_STAT_INC(expect_new);
        } else {
                conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);

                CONNTRACK_STAT_INC(new);
        }

        /* Overload tuple linked list to put us in unconfirmed list. */
        list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);

        write_unlock_bh(&ip_conntrack_lock);

        if (exp) {
                if (exp->expectfn)
                        exp->expectfn(conntrack, exp);
                ip_conntrack_expect_put(exp);
        }

        return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}
/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct ip_conntrack *
resolve_normal_ct(struct sk_buff *skb,
                  struct ip_conntrack_protocol *proto,
                  int *set_reply,
                  unsigned int hooknum,
                  enum ip_conntrack_info *ctinfo)
{
        struct ip_conntrack_tuple tuple;
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack *ct;

        IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

        if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
                             &tuple, proto))
                return NULL;

        /* look for tuple match */
        h = ip_conntrack_find_get(&tuple, NULL);
        if (!h) {
                h = init_conntrack(&tuple, proto, skb);
                if (!h)
                        return NULL;
                if (IS_ERR(h))
                        return (void *)h;
        }
        ct = tuplehash_to_ctrack(h);

        /* It exists; we have (non-exclusive) reference. */
        if (DIRECTION(h) == IP_CT_DIR_REPLY) {
                *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
                /* Please set reply bit if this packet OK */
                *set_reply = 1;
        } else {
                /* Once we've had two way comms, always ESTABLISHED. */
                if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
                        DEBUGP("ip_conntrack_in: normal packet for %p\n",
                               ct);
                        *ctinfo = IP_CT_ESTABLISHED;
                } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
                        DEBUGP("ip_conntrack_in: related packet for %p\n",
                               ct);
                        *ctinfo = IP_CT_RELATED;
                } else {
                        DEBUGP("ip_conntrack_in: new packet for %p\n",
                               ct);
                        *ctinfo = IP_CT_NEW;
                }
                *set_reply = 0;
        }
        skb->nfct = &ct->ct_general;
        skb->nfctinfo = *ctinfo;
        return ct;
}
/* Netfilter hook itself. */
unsigned int ip_conntrack_in(unsigned int hooknum,
                             struct sk_buff **pskb,
                             const struct net_device *in,
                             const struct net_device *out,
                             int (*okfn)(struct sk_buff *))
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;
        struct ip_conntrack_protocol *proto;
        int set_reply = 0;
        int ret;

        /* Previously seen (loopback or untracked)?  Ignore. */
        if ((*pskb)->nfct) {
                CONNTRACK_STAT_INC(ignore);
                return NF_ACCEPT;
        }

        if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
                if (net_ratelimit()) {
                        printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
                               (*pskb)->nh.iph->protocol, hooknum);
                }
                return NF_DROP;
        }

        /* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
        /* Ignore broadcast: no `connection'. */
        if ((*pskb)->pkt_type == PACKET_BROADCAST) {
                printk("Broadcast packet!\n");
                return NF_ACCEPT;
        } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
                   == htonl(0x000000FF)) {
                printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
                       NIPQUAD((*pskb)->nh.iph->saddr),
                       NIPQUAD((*pskb)->nh.iph->daddr),
                       (*pskb)->sk, (*pskb)->pkt_type);
        }
#endif

        proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);

        /* It may be a special packet, error, unclean...
         * the inverse of the return code tells the netfilter
         * core what to do with the packet. */
        if (proto->error != NULL
            && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
                CONNTRACK_STAT_INC(error);
                CONNTRACK_STAT_INC(invalid);
                return -ret;
        }

        if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum,
                                     &ctinfo))) {
                /* Not valid part of a connection */
                CONNTRACK_STAT_INC(invalid);
                return NF_ACCEPT;
        }

        if (IS_ERR(ct)) {
                /* Too stressed to deal. */
                CONNTRACK_STAT_INC(drop);
                return NF_DROP;
        }

        IP_NF_ASSERT((*pskb)->nfct);

        ip_conntrack_event_cache_init(*pskb);

        ret = proto->packet(ct, *pskb, ctinfo);
        if (ret < 0) {
                /* Invalid: the inverse of the return code tells
                 * the netfilter core what to do */
                nf_conntrack_put((*pskb)->nfct);
                (*pskb)->nfct = NULL;
                CONNTRACK_STAT_INC(invalid);
                return -ret;
        }

        if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
                ip_conntrack_event_cache(IPCT_STATUS, *pskb);

        return ret;
}
int invert_tuplepr(struct ip_conntrack_tuple *inverse,
                   const struct ip_conntrack_tuple *orig)
{
        return ip_ct_invert_tuple(inverse, orig,
                                  __ip_conntrack_proto_find(orig->dst.protonum));
}
/* Would two expected things clash? */
static inline int expect_clash(const struct ip_conntrack_expect *a,
                               const struct ip_conntrack_expect *b)
{
        /* Part covered by intersection of masks must be unequal,
           otherwise they clash */
        struct ip_conntrack_tuple intersect_mask
                = { { a->mask.src.ip & b->mask.src.ip,
                      { a->mask.src.u.all & b->mask.src.u.all } },
                    { a->mask.dst.ip & b->mask.dst.ip,
                      { a->mask.dst.u.all & b->mask.dst.u.all },
                      a->mask.dst.protonum & b->mask.dst.protonum } };

        return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
}
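/* Worked example (illustrative): expectation A masks out the source
 * port (mask.src.u.all == 0) while expectation B pins it.  The
 * intersected mask above then ignores the source port entirely, and if
 * the two tuples agree on every field both masks still cover, the
 * expectations clash even though their port constraints differ. */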
static inline int expect_matches(const struct ip_conntrack_expect *a,
                                 const struct ip_conntrack_expect *b)
{
        return a->master == b->master
                && ip_ct_tuple_equal(&a->tuple, &b->tuple)
                && ip_ct_tuple_equal(&a->mask, &b->mask);
}
/* Generally a bad idea to call this: could have matched already. */
void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
{
        struct ip_conntrack_expect *i;

        write_lock_bh(&ip_conntrack_lock);
        /* choose the oldest expectation to evict */
        list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
                if (expect_matches(i, exp) && del_timer(&i->timeout)) {
                        unlink_expect(i);
                        write_unlock_bh(&ip_conntrack_lock);
                        ip_conntrack_expect_put(i);
                        return;
                }
        }
        write_unlock_bh(&ip_conntrack_lock);
}
struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
{
        struct ip_conntrack_expect *new;

        new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
        if (!new) {
                DEBUGP("expect_related: OOM allocating expect\n");
                return NULL;
        }
        new->master = me;
        atomic_inc(&new->master->ct_general.use);
        atomic_set(&new->use, 1);
        return new;
}
void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
        if (atomic_dec_and_test(&exp->use)) {
                ip_conntrack_put(exp->master);
                kmem_cache_free(ip_conntrack_expect_cachep, exp);
        }
}
static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
{
        atomic_inc(&exp->use);
        exp->master->expecting++;
        list_add(&exp->list, &ip_conntrack_expect_list);

        init_timer(&exp->timeout);
        exp->timeout.data = (unsigned long)exp;
        exp->timeout.function = expectation_timed_out;
        exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
        add_timer(&exp->timeout);

        exp->id = ++ip_conntrack_expect_next_id;
        atomic_inc(&exp->use);
        CONNTRACK_STAT_INC(expect_create);
}
/* Race with expectations being used means we could have none to find; OK. */
static void evict_oldest_expect(struct ip_conntrack *master)
{
        struct ip_conntrack_expect *i;

        list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
                if (i->master == master) {
                        if (del_timer(&i->timeout)) {
                                unlink_expect(i);
                                ip_conntrack_expect_put(i);
                        }
                        break;
                }
        }
}
static inline int refresh_timer(struct ip_conntrack_expect *i)
{
        if (!del_timer(&i->timeout))
                return 0;

        i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
        add_timer(&i->timeout);
        return 1;
}
int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
{
        struct ip_conntrack_expect *i;
        int ret;

        DEBUGP("ip_conntrack_expect_related %p\n", expect);
        DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
        DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);

        write_lock_bh(&ip_conntrack_lock);
        list_for_each_entry(i, &ip_conntrack_expect_list, list) {
                if (expect_matches(i, expect)) {
                        /* Refresh timer: if it's dying, ignore.. */
                        if (refresh_timer(i)) {
                                ret = 0;
                                goto out;
                        }
                } else if (expect_clash(i, expect)) {
                        ret = -EBUSY;
                        goto out;
                }
        }

        /* Will be over limit? */
        if (expect->master->helper->max_expected &&
            expect->master->expecting >= expect->master->helper->max_expected)
                evict_oldest_expect(expect->master);

        ip_conntrack_expect_insert(expect);
        ip_conntrack_expect_event(IPEXP_NEW, expect);
        ret = 0;
out:
        write_unlock_bh(&ip_conntrack_lock);
        return ret;
}
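/* Flow sketch (illustrative): a helper such as ip_conntrack_ftp, having
 * parsed a PORT command on the control connection, fills in an
 * expectation for the announced data connection and calls
 * ip_conntrack_expect_related().  When the first data packet arrives,
 * find_expectation() above matches it, init_conntrack() marks the new
 * conntrack IPS_EXPECTED, and resolve_normal_ct() reports it as
 * IP_CT_RELATED. */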
/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
   implicitly racy: see __ip_conntrack_confirm */
void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
                              const struct ip_conntrack_tuple *newreply)
{
        write_lock_bh(&ip_conntrack_lock);
        /* Should be unconfirmed, so not in hash table yet */
        IP_NF_ASSERT(!is_confirmed(conntrack));

        DEBUGP("Altering reply tuple of %p to ", conntrack);
        DUMP_TUPLE(newreply);

        conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
        if (!conntrack->master && conntrack->expecting == 0)
                conntrack->helper = __ip_conntrack_helper_find(newreply);
        write_unlock_bh(&ip_conntrack_lock);
}
int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
        BUG_ON(me->timeout == 0);
        write_lock_bh(&ip_conntrack_lock);
        list_prepend(&helpers, me);
        write_unlock_bh(&ip_conntrack_lock);

        return 0;
}
struct ip_conntrack_helper *
__ip_conntrack_helper_find_byname(const char *name)
{
        struct ip_conntrack_helper *h;

        list_for_each_entry(h, &helpers, list) {
                if (!strcmp(h->name, name))
                        return h;
        }

        return NULL;
}
static inline int unhelp(struct ip_conntrack_tuple_hash *i,
                         const struct ip_conntrack_helper *me)
{
        if (tuplehash_to_ctrack(i)->helper == me) {
                ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
                tuplehash_to_ctrack(i)->helper = NULL;
        }
        return 0;
}
void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
        unsigned int i;
        struct ip_conntrack_expect *exp, *tmp;

        /* Need write lock here, to delete helper. */
        write_lock_bh(&ip_conntrack_lock);
        LIST_DELETE(&helpers, me);

        /* Get rid of expectations */
        list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
                if (exp->master->helper == me && del_timer(&exp->timeout)) {
                        unlink_expect(exp);
                        ip_conntrack_expect_put(exp);
                }
        }
        /* Get rid of expecteds, set helpers to NULL. */
        LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash *, me);
        for (i = 0; i < ip_conntrack_htable_size; i++)
                LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
                            struct ip_conntrack_tuple_hash *, me);
        write_unlock_bh(&ip_conntrack_lock);

        /* Someone could be still looking at the helper in a bh. */
        synchronize_net();
}
static inline void ct_add_counters(struct ip_conntrack *ct,
                                   enum ip_conntrack_info ctinfo,
                                   const struct sk_buff *skb)
{
#ifdef CONFIG_IP_NF_CT_ACCT
        if (skb) {
                ct->counters[CTINFO2DIR(ctinfo)].packets++;
                ct->counters[CTINFO2DIR(ctinfo)].bytes +=
                                        ntohs(skb->nh.iph->tot_len);
        }
#endif
}
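/* Example (illustrative): with CONFIG_IP_NF_CT_ACCT enabled, a 1500-byte
 * packet seen in the reply direction increments
 * ct->counters[IP_CT_DIR_REPLY].packets by one and adds 1500 to .bytes;
 * the totals are exported per connection via /proc/net/ip_conntrack. */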
/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
void ip_ct_refresh_acct(struct ip_conntrack *ct,
                        enum ip_conntrack_info ctinfo,
                        const struct sk_buff *skb,
                        unsigned long extra_jiffies)
{
        IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);

        /* If not in hash table, timer will not be active yet */
        if (!is_confirmed(ct)) {
                ct->timeout.expires = extra_jiffies;
                ct_add_counters(ct, ctinfo, skb);
        } else {
                write_lock_bh(&ip_conntrack_lock);
                /* Need del_timer for race avoidance (may already be dying). */
                if (del_timer(&ct->timeout)) {
                        ct->timeout.expires = jiffies + extra_jiffies;
                        add_timer(&ct->timeout);
                        ip_conntrack_event_cache(IPCT_REFRESH, skb);
                }
                ct_add_counters(ct, ctinfo, skb);
                write_unlock_bh(&ip_conntrack_lock);
        }
}
#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
 * in ip_conntrack_core, since we don't want the protocols to autoload
 * or depend on ctnetlink */
int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
                               const struct ip_conntrack_tuple *tuple)
{
        NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
                &tuple->src.u.tcp.port);
        NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
                &tuple->dst.u.tcp.port);
        return 0;

nfattr_failure:
        return -1;
}

int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
                               struct ip_conntrack_tuple *t)
{
        if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
                return -EINVAL;

        t->src.u.tcp.port =
                *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
        t->dst.u.tcp.port =
                *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);

        return 0;
}
#endif
/* Returns new sk_buff, or NULL */
struct sk_buff *
ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
{
        skb_orphan(skb);

        local_bh_disable();
        skb = ip_defrag(skb, user);
        local_bh_enable();

        if (skb)
                ip_send_check(skb->nh.iph);

        return skb;
}
/* Used by ipt_REJECT. */
static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
        struct ip_conntrack *ct;
        enum ip_conntrack_info ctinfo;

        /* This ICMP is in reverse direction to the packet which caused it */
        ct = ip_conntrack_get(skb, &ctinfo);

        if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
                ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
        else
                ctinfo = IP_CT_RELATED;

        /* Attach to new skbuff, and increment count */
        nskb->nfct = &ct->ct_general;
        nskb->nfctinfo = ctinfo;
        nf_conntrack_get(nskb->nfct);
}
static inline int
do_iter(const struct ip_conntrack_tuple_hash *i,
        int (*iter)(struct ip_conntrack *i, void *data),
        void *data)
{
        return iter(tuplehash_to_ctrack(i), data);
}
/* Bring out ya dead! */
static struct ip_conntrack_tuple_hash *
get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
                void *data, unsigned int *bucket)
{
        struct ip_conntrack_tuple_hash *h = NULL;

        write_lock_bh(&ip_conntrack_lock);
        for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
                h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
                                struct ip_conntrack_tuple_hash *, iter, data);
                if (h)
                        break;
        }
        if (!h)
                h = LIST_FIND_W(&unconfirmed, do_iter,
                                struct ip_conntrack_tuple_hash *, iter, data);
        if (h)
                atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
        write_unlock_bh(&ip_conntrack_lock);

        return h;
}
void
ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
{
        struct ip_conntrack_tuple_hash *h;
        unsigned int bucket = 0;

        while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
                struct ip_conntrack *ct = tuplehash_to_ctrack(h);
                /* Time to push up daisies... */
                if (del_timer(&ct->timeout))
                        death_by_timeout((unsigned long)ct);
                /* ... else the timer will get him soon. */

                ip_conntrack_put(ct);
        }

#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
        {
                /* we need to deliver all cached events in order to drop
                 * the reference counts */
                int cpu;

                for_each_cpu(cpu) {
                        struct ip_conntrack_ecache *ecache =
                                &per_cpu(ip_conntrack_ecache, cpu);

                        __ip_ct_deliver_cached_events(ecache);
                        if (ecache->ct != NULL) {
                                ip_conntrack_put(ecache->ct);
                                ecache->ct = NULL;
                        }
                }
        }
#endif
}
/* Fast function for those who don't want to parse /proc (and I don't
   blame them). */
/* Reversing the socket's dst/src point of view gives us the reply
   mapping. */
static int
getorigdst(struct sock *sk, int optval, void __user *user, int *len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack_tuple tuple;

        IP_CT_TUPLE_U_BLANK(&tuple);
        tuple.src.ip = inet->rcv_saddr;
        tuple.src.u.tcp.port = inet->sport;
        tuple.dst.ip = inet->daddr;
        tuple.dst.u.tcp.port = inet->dport;
        tuple.dst.protonum = IPPROTO_TCP;

        /* We only do TCP at the moment: is there a better way? */
        if (strcmp(sk->sk_prot->name, "TCP")) {
                DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
                return -ENOPROTOOPT;
        }

        if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
                DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
                       *len, sizeof(struct sockaddr_in));
                return -EINVAL;
        }

        h = ip_conntrack_find_get(&tuple, NULL);
        if (h) {
                struct sockaddr_in sin;
                struct ip_conntrack *ct = tuplehash_to_ctrack(h);

                sin.sin_family = AF_INET;
                sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.u.tcp.port;
                sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                        .tuple.dst.ip;

                DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
                       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
                ip_conntrack_put(ct);
                if (copy_to_user(user, &sin, sizeof(sin)) != 0)
                        return -EFAULT;
                else
                        return 0;
        }
        DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
               NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
               NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
        return -ENOENT;
}
static struct nf_sockopt_ops so_getorigdst = {
        .pf             = PF_INET,
        .get_optmin     = SO_ORIGINAL_DST,
        .get_optmax     = SO_ORIGINAL_DST+1,
        .get            = &getorigdst,
};
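/* Illustrative userspace sketch (assumption, not part of this file): a
 * transparent proxy recovers the pre-NAT destination of an accepted TCP
 * connection through the sockopt registered above:
 *
 *	struct sockaddr_in dst;
 *	socklen_t len = sizeof(dst);
 *
 *	if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len) == 0)
 *		printf("original dst %s:%u\n", inet_ntoa(dst.sin_addr),
 *		       ntohs(dst.sin_port));
 *
 * The kernel side of this call is getorigdst() above.
 */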
static int kill_all(struct ip_conntrack *i, void *data)
{
        return 1;
}
static void free_conntrack_hash(void)
{
        if (ip_conntrack_vmalloc)
                vfree(ip_conntrack_hash);
        else
                free_pages((unsigned long)ip_conntrack_hash,
                           get_order(sizeof(struct list_head)
                                     * ip_conntrack_htable_size));
}
void ip_conntrack_flush(void)
{
        /* This makes sure all current packets have passed through
           netfilter framework.  Roll on, two-stage module
           delete... */
        synchronize_net();

 i_see_dead_people:
        ip_ct_iterate_cleanup(kill_all, NULL);
        if (atomic_read(&ip_conntrack_count) != 0) {
                schedule();
                goto i_see_dead_people;
        }
        /* wait until all references to ip_conntrack_untracked are dropped */
        while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
                schedule();
}
/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void ip_conntrack_cleanup(void)
{
        ip_ct_attach = NULL;
        ip_conntrack_flush();
        kmem_cache_destroy(ip_conntrack_cachep);
        kmem_cache_destroy(ip_conntrack_expect_cachep);
        free_conntrack_hash();
        nf_unregister_sockopt(&so_getorigdst);
}
static int hashsize;
module_param(hashsize, int, 0400);
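/* Usage note (illustrative): the bucket count can be pinned at module
 * load time, e.g. "modprobe ip_conntrack hashsize=16384"; when the
 * parameter is absent, ip_conntrack_init() below sizes the table from
 * available memory. */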
int __init ip_conntrack_init(void)
{
        unsigned int i;
        int ret;

        /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
         * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
        if (hashsize)
                ip_conntrack_htable_size = hashsize;
        else {
                ip_conntrack_htable_size
                        = (((num_physpages << PAGE_SHIFT) / 16384)
                           / sizeof(struct list_head));
                if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
                        ip_conntrack_htable_size = 8192;
                if (ip_conntrack_htable_size < 16)
                        ip_conntrack_htable_size = 16;
        }
        ip_conntrack_max = 8 * ip_conntrack_htable_size;
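        /* Worked example (illustrative, assuming 4KB pages and an 8-byte
         * struct list_head on a 32-bit machine): with 512MB of RAM,
         * (num_physpages << PAGE_SHIFT) / 16384 = 32768 bytes of table
         * budget, / 8 = 4096 buckets, and ip_conntrack_max becomes
         * 8 * 4096 = 32768 tracked connections. */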
        printk("ip_conntrack version %s (%u buckets, %d max)"
               " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
               ip_conntrack_htable_size, ip_conntrack_max,
               sizeof(struct ip_conntrack));

        ret = nf_register_sockopt(&so_getorigdst);
        if (ret != 0) {
                printk(KERN_ERR "Unable to register netfilter socket option\n");
                return ret;
        }

        /* AK: the hash table is twice as big as needed because it
           uses list_head.  it would be much nicer for caches to use a
           single-pointer list head here. */
        ip_conntrack_vmalloc = 0;

        ip_conntrack_hash
                = (void *)__get_free_pages(GFP_KERNEL,
                                           get_order(sizeof(struct list_head)
                                                     * ip_conntrack_htable_size));
        if (!ip_conntrack_hash) {
                ip_conntrack_vmalloc = 1;
                printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
                ip_conntrack_hash = vmalloc(sizeof(struct list_head)
                                            * ip_conntrack_htable_size);
        }
        if (!ip_conntrack_hash) {
                printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
                goto err_unreg_sockopt;
        }

        ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
                                                sizeof(struct ip_conntrack), 0,
                                                0, NULL, NULL);
        if (!ip_conntrack_cachep) {
                printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
                goto err_free_hash;
        }

        ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
                                        sizeof(struct ip_conntrack_expect),
                                        0, 0, NULL, NULL);
        if (!ip_conntrack_expect_cachep) {
                printk(KERN_ERR "Unable to create ip_expect slab cache\n");
                goto err_free_conntrack_slab;
        }

        /* Don't NEED lock here, but good form anyway. */
        write_lock_bh(&ip_conntrack_lock);
        for (i = 0; i < MAX_IP_CT_PROTO; i++)
                ip_ct_protos[i] = &ip_conntrack_generic_protocol;
        /* Sew in builtin protocols. */
        ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
        ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
        ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
        write_unlock_bh(&ip_conntrack_lock);

        for (i = 0; i < ip_conntrack_htable_size; i++)
                INIT_LIST_HEAD(&ip_conntrack_hash[i]);

        /* For use by ipt_REJECT */
        ip_ct_attach = ip_conntrack_attach;

        /* Set up fake conntrack:
            - to never be deleted, not in any hashes */
        atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
        /*  - and make it look like a confirmed connection */
        set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);

        return ret;

err_free_conntrack_slab:
        kmem_cache_destroy(ip_conntrack_cachep);
err_free_hash:
        free_conntrack_hash();
err_unreg_sockopt:
        nf_unregister_sockopt(&so_getorigdst);

        return -ENOMEM;
}