1/* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
7 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 *
13 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
14 * - new API and handling of conntrack/nat helpers
15 * - now capable of multiple expectations for one master
16 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
17 * - add usage/reference counts to ip_conntrack_expect
18 * - export ip_conntrack[_expect]_{find_get,put} functions
19 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
20 * - generalize L3 protocol dependent part.
21 * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
22 * - add support for various sizes of conntrack structures.
23 *
24 * Derived from net/ipv4/netfilter/ip_conntrack_core.c
25 */
26
27#include <linux/config.h>
28#include <linux/types.h>
29#include <linux/netfilter.h>
30#include <linux/module.h>
31#include <linux/skbuff.h>
32#include <linux/proc_fs.h>
33#include <linux/vmalloc.h>
34#include <linux/stddef.h>
35#include <linux/slab.h>
36#include <linux/random.h>
37#include <linux/jhash.h>
38#include <linux/err.h>
39#include <linux/percpu.h>
40#include <linux/moduleparam.h>
41#include <linux/notifier.h>
42#include <linux/kernel.h>
43#include <linux/netdevice.h>
44#include <linux/socket.h>
45
46/* This rwlock protects the main hash table, protocol/helper/expectation
47 registrations and the conntrack timers. */
48#define ASSERT_READ_LOCK(x)
49#define ASSERT_WRITE_LOCK(x)
50
51#include <net/netfilter/nf_conntrack.h>
52#include <net/netfilter/nf_conntrack_l3proto.h>
53#include <net/netfilter/nf_conntrack_protocol.h>
54#include <net/netfilter/nf_conntrack_helper.h>
55#include <net/netfilter/nf_conntrack_core.h>
56#include <linux/netfilter_ipv4/listhelp.h>
57
58#define NF_CONNTRACK_VERSION "0.4.1"
59
60#if 0
61#define DEBUGP printk
62#else
63#define DEBUGP(format, args...)
64#endif
65
66DEFINE_RWLOCK(nf_conntrack_lock);
67
68/* nf_conntrack_standalone needs this */
69atomic_t nf_conntrack_count = ATOMIC_INIT(0);
70
71void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL;
72LIST_HEAD(nf_conntrack_expect_list);
73struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
74struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX];
75static LIST_HEAD(helpers);
76unsigned int nf_conntrack_htable_size = 0;
77int nf_conntrack_max;
78struct list_head *nf_conntrack_hash;
79static kmem_cache_t *nf_conntrack_expect_cachep;
80struct nf_conn nf_conntrack_untracked;
81unsigned int nf_ct_log_invalid;
82static LIST_HEAD(unconfirmed);
83static int nf_conntrack_vmalloc;
84
85static unsigned int nf_conntrack_next_id = 1;
86static unsigned int nf_conntrack_expect_next_id = 1;
87#ifdef CONFIG_NF_CONNTRACK_EVENTS
88struct notifier_block *nf_conntrack_chain;
89struct notifier_block *nf_conntrack_expect_chain;
90
91DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
92
93/* deliver cached events and clear cache entry - must be called with locally
94 * disabled softirqs */
95static inline void
96__nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
97{
98 DEBUGP("ecache: delivering events for %p\n", ecache->ct);
99 if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
100 && ecache->events)
101 notifier_call_chain(&nf_conntrack_chain, ecache->events,
102 ecache->ct);
103
104 ecache->events = 0;
105 nf_ct_put(ecache->ct);
106 ecache->ct = NULL;
107}
108
109/* Deliver all cached events for a particular conntrack. This is called
110 * by code prior to async packet handling for freeing the skb */
111void nf_ct_deliver_cached_events(const struct nf_conn *ct)
112{
113 struct nf_conntrack_ecache *ecache;
114
115 local_bh_disable();
116 ecache = &__get_cpu_var(nf_conntrack_ecache);
117 if (ecache->ct == ct)
118 __nf_ct_deliver_cached_events(ecache);
119 local_bh_enable();
120}
121
122/* Deliver cached events for old pending events, if current conntrack != old */
123void __nf_ct_event_cache_init(struct nf_conn *ct)
124{
125 struct nf_conntrack_ecache *ecache;
126
127 /* take care of delivering potentially old events */
128 ecache = &__get_cpu_var(nf_conntrack_ecache);
129 BUG_ON(ecache->ct == ct);
130 if (ecache->ct)
131 __nf_ct_deliver_cached_events(ecache);
132 /* initialize for this conntrack/packet */
133 ecache->ct = ct;
134 nf_conntrack_get(&ct->ct_general);
135}
136
137/* flush the event cache - touches other CPU's data and must not be called
138 * while packets are still passing through the code */
139static void nf_ct_event_cache_flush(void)
140{
141 struct nf_conntrack_ecache *ecache;
142 int cpu;
143
144 for_each_cpu(cpu) {
145 ecache = &per_cpu(nf_conntrack_ecache, cpu);
146 if (ecache->ct)
147 nf_ct_put(ecache->ct);
148 }
149}
150#else
151static inline void nf_ct_event_cache_flush(void) {}
152#endif /* CONFIG_NF_CONNTRACK_EVENTS */
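/*
 * Illustrative sketch (added comment, not part of the original file): the
 * per-CPU event cache above batches events raised while a single packet is
 * being processed.  nf_conntrack_event_cache() (declared in the header)
 * records an event against the conntrack attached to the skb, and the cached
 * events are pushed through the notifier chain in one call.
 */
#if 0
static void example_event_cache_usage(struct nf_conn *ct, struct sk_buff *skb)
{
	/* record an event for the conntrack attached to skb */
	nf_conntrack_event_cache(IPCT_STATUS, skb);

	/* ... once the packet is done, deliver whatever was cached ... */
	nf_ct_deliver_cached_events(ct);
}
#endif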
153
154DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
155EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
156
157/*
158 * This scheme offers various sizes of "struct nf_conn" depending on
159 * the features (helper, nat, ...)
160 */
161
162#define NF_CT_FEATURES_NAMELEN 256
163static struct {
164 /* name of slab cache. printed in /proc/slabinfo */
165 char *name;
166
167 /* size of slab cache */
168 size_t size;
169
170 /* slab cache pointer */
171 kmem_cache_t *cachep;
172
173 /* allocated slab cache + modules which use this slab cache */
174 int use;
175
176 /* Initialization */
177 int (*init_conntrack)(struct nf_conn *, u_int32_t);
178
179} nf_ct_cache[NF_CT_F_NUM];
180
181/* protect members of nf_ct_cache except for "use" */
182DEFINE_RWLOCK(nf_ct_cache_lock);
183
184/* This avoids calling kmem_cache_create() with the same name simultaneously */
185DECLARE_MUTEX(nf_ct_cache_mutex);
186
187extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
188struct nf_conntrack_protocol *
189__nf_ct_proto_find(u_int16_t l3proto, u_int8_t protocol)
190{
191 if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL))
192 return &nf_conntrack_generic_protocol;
193
194 return nf_ct_protos[l3proto][protocol];
195}
196
197/* this is guaranteed to always return a valid protocol helper, since
198 * it falls back to generic_protocol */
199struct nf_conntrack_protocol *
200nf_ct_proto_find_get(u_int16_t l3proto, u_int8_t protocol)
201{
202 struct nf_conntrack_protocol *p;
203
204 preempt_disable();
205 p = __nf_ct_proto_find(l3proto, protocol);
206 if (p) {
207 if (!try_module_get(p->me))
208 p = &nf_conntrack_generic_protocol;
209 }
210 preempt_enable();
211
212 return p;
213}
214
215void nf_ct_proto_put(struct nf_conntrack_protocol *p)
216{
217 module_put(p->me);
218}
219
220struct nf_conntrack_l3proto *
221nf_ct_l3proto_find_get(u_int16_t l3proto)
222{
223 struct nf_conntrack_l3proto *p;
224
225 preempt_disable();
226 p = __nf_ct_l3proto_find(l3proto);
227 if (p) {
228 if (!try_module_get(p->me))
229 p = &nf_conntrack_generic_l3proto;
230 }
231 preempt_enable();
232
233 return p;
234}
235
236void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p)
237{
238 module_put(p->me);
239}
240
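/*
 * Illustrative sketch (added comment, not part of the original file): the
 * find_get/put pairs above follow the usual module-reference pattern; the
 * returned pointer is only guaranteed to stay valid while the reference is
 * held.
 */
#if 0
static void example_proto_lookup(void)
{
	struct nf_conntrack_protocol *proto;

	proto = nf_ct_proto_find_get(PF_INET, IPPROTO_TCP);
	/* ... use proto->packet(), proto->error(), ... */
	nf_ct_proto_put(proto);
}
#endif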
241static int nf_conntrack_hash_rnd_initted;
242static unsigned int nf_conntrack_hash_rnd;
243
244static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
245 unsigned int size, unsigned int rnd)
246{
247 unsigned int a, b;
248 a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all),
249 ((tuple->src.l3num) << 16) | tuple->dst.protonum);
250 b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all),
251 (tuple->src.u.all << 16) | tuple->dst.u.all);
252
253 return jhash_2words(a, b, rnd) % size;
254}
255
256static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
257{
258 return __hash_conntrack(tuple, nf_conntrack_htable_size,
259 nf_conntrack_hash_rnd);
260}
261
262/* Initialize "struct nf_conn" which has spaces for helper */
263static int
264init_conntrack_for_helper(struct nf_conn *conntrack, u_int32_t features)
265{
266
267 conntrack->help = (union nf_conntrack_help *)
268 (((unsigned long)conntrack->data
269 + (__alignof__(union nf_conntrack_help) - 1))
270 & (~((unsigned long)(__alignof__(union nf_conntrack_help) -1))));
271 return 0;
272}
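/*
 * Worked example (added comment): the expression above is the standard
 * align-up idiom, (addr + (align - 1)) & ~(align - 1).  With an alignment
 * of 8 and conntrack->data starting at an address ending in 0x34, the
 * helper area is placed at the address ending in 0x38.
 */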
273
274int nf_conntrack_register_cache(u_int32_t features, const char *name,
275 size_t size,
276 int (*init)(struct nf_conn *, u_int32_t))
277{
278 int ret = 0;
279 char *cache_name;
280 kmem_cache_t *cachep;
281
282 DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
283 features, name, size);
284
285 if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
286 DEBUGP("nf_conntrack_register_cache: invalid features.: 0x%x\n",
287 features);
288 return -EINVAL;
289 }
290
291 down(&nf_ct_cache_mutex);
292
293 write_lock_bh(&nf_ct_cache_lock);
294 /* e.g.: multiple helpers are loaded */
295 if (nf_ct_cache[features].use > 0) {
296 DEBUGP("nf_conntrack_register_cache: already registered.\n");
297 if ((!strncmp(nf_ct_cache[features].name, name,
298 NF_CT_FEATURES_NAMELEN))
299 && nf_ct_cache[features].size == size
300 && nf_ct_cache[features].init_conntrack == init) {
301 DEBUGP("nf_conntrack_register_cache: reusing.\n");
302 nf_ct_cache[features].use++;
303 ret = 0;
304 } else
305 ret = -EBUSY;
306
307 write_unlock_bh(&nf_ct_cache_lock);
308 up(&nf_ct_cache_mutex);
309 return ret;
310 }
311 write_unlock_bh(&nf_ct_cache_lock);
312
313 /*
314 * The memory holding the slab cache name must remain valid until
315 * the cache is destroyed.
316 */
317 cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC);
318 if (cache_name == NULL) {
319 DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
320 ret = -ENOMEM;
321 goto out_up_mutex;
322 }
323
324 if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
325 >= NF_CT_FEATURES_NAMELEN) {
326 printk("nf_conntrack_register_cache: name too long\n");
327 ret = -EINVAL;
328 goto out_free_name;
329 }
330
331 cachep = kmem_cache_create(cache_name, size, 0, 0,
332 NULL, NULL);
333 if (!cachep) {
334 printk("nf_conntrack_register_cache: Can't create slab cache "
335 "for the features = 0x%x\n", features);
336 ret = -ENOMEM;
337 goto out_free_name;
338 }
339
340 write_lock_bh(&nf_ct_cache_lock);
341 nf_ct_cache[features].use = 1;
342 nf_ct_cache[features].size = size;
343 nf_ct_cache[features].init_conntrack = init;
344 nf_ct_cache[features].cachep = cachep;
345 nf_ct_cache[features].name = cache_name;
346 write_unlock_bh(&nf_ct_cache_lock);
347
348 goto out_up_mutex;
349
350out_free_name:
351 kfree(cache_name);
352out_up_mutex:
353 up(&nf_ct_cache_mutex);
354 return ret;
355}
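/*
 * Illustrative sketch (added comment, not part of the original file): a
 * feature user registers its own "struct nf_conn" layout like this; it
 * mirrors what nf_conntrack_helper_register() does further down for
 * NF_CT_F_HELP.  The cache name here is invented for the example.
 */
#if 0
static int example_register_feature_cache(void)
{
	return nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:example",
					   sizeof(struct nf_conn)
					   + sizeof(union nf_conntrack_help)
					   + __alignof__(union nf_conntrack_help),
					   init_conntrack_for_helper);
}
#endif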
356
357/* FIXME: Currently, only nf_conntrack_cleanup() can call this function. */
358void nf_conntrack_unregister_cache(u_int32_t features)
359{
360 kmem_cache_t *cachep;
361 char *name;
362
363 /*
364 * This ensures that kmem_cache_create() isn't called with the same name
365 * before the old slab cache has been destroyed.
366 */
367 DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
368 down(&nf_ct_cache_mutex);
369
370 write_lock_bh(&nf_ct_cache_lock);
371 if (--nf_ct_cache[features].use > 0) {
372 write_unlock_bh(&nf_ct_cache_lock);
373 up(&nf_ct_cache_mutex);
374 return;
375 }
376 cachep = nf_ct_cache[features].cachep;
377 name = nf_ct_cache[features].name;
378 nf_ct_cache[features].cachep = NULL;
379 nf_ct_cache[features].name = NULL;
380 nf_ct_cache[features].init_conntrack = NULL;
381 nf_ct_cache[features].size = 0;
382 write_unlock_bh(&nf_ct_cache_lock);
383
384 synchronize_net();
385
386 kmem_cache_destroy(cachep);
387 kfree(name);
388
389 up(&nf_ct_cache_mutex);
390}
391
392int
393nf_ct_get_tuple(const struct sk_buff *skb,
394 unsigned int nhoff,
395 unsigned int dataoff,
396 u_int16_t l3num,
397 u_int8_t protonum,
398 struct nf_conntrack_tuple *tuple,
399 const struct nf_conntrack_l3proto *l3proto,
400 const struct nf_conntrack_protocol *protocol)
401{
402 NF_CT_TUPLE_U_BLANK(tuple);
403
404 tuple->src.l3num = l3num;
405 if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
406 return 0;
407
408 tuple->dst.protonum = protonum;
409 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
410
411 return protocol->pkt_to_tuple(skb, dataoff, tuple);
412}
413
414int
415nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
416 const struct nf_conntrack_tuple *orig,
417 const struct nf_conntrack_l3proto *l3proto,
418 const struct nf_conntrack_protocol *protocol)
419{
420 NF_CT_TUPLE_U_BLANK(inverse);
421
422 inverse->src.l3num = orig->src.l3num;
423 if (l3proto->invert_tuple(inverse, orig) == 0)
424 return 0;
425
426 inverse->dst.dir = !orig->dst.dir;
427
428 inverse->dst.protonum = orig->dst.protonum;
429 return protocol->invert_tuple(inverse, orig);
430}
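/*
 * Illustrative sketch (added comment, not part of the original file):
 * init_conntrack() below uses this to derive the reply-direction tuple
 * (addresses/ports swapped, direction flipped) from the original tuple.
 */
#if 0
static int example_build_reply_tuple(const struct nf_conntrack_tuple *orig,
				     struct nf_conntrack_tuple *repl,
				     struct nf_conntrack_l3proto *l3proto,
				     struct nf_conntrack_protocol *proto)
{
	return nf_ct_invert_tuple(repl, orig, l3proto, proto);
}
#endif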
431
432/* nf_conntrack_expect helper functions */
433void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
434{
435 ASSERT_WRITE_LOCK(&nf_conntrack_lock);
436 NF_CT_ASSERT(!timer_pending(&exp->timeout));
437 list_del(&exp->list);
438 NF_CT_STAT_INC(expect_delete);
439 exp->master->expecting--;
440 nf_conntrack_expect_put(exp);
441}
442
443static void expectation_timed_out(unsigned long ul_expect)
444{
445 struct nf_conntrack_expect *exp = (void *)ul_expect;
446
447 write_lock_bh(&nf_conntrack_lock);
448 nf_ct_unlink_expect(exp);
449 write_unlock_bh(&nf_conntrack_lock);
450 nf_conntrack_expect_put(exp);
451}
452
453struct nf_conntrack_expect *
454__nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
455{
456 struct nf_conntrack_expect *i;
457
458 list_for_each_entry(i, &nf_conntrack_expect_list, list) {
459 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
460 atomic_inc(&i->use);
461 return i;
462 }
463 }
464 return NULL;
465}
466
467/* Just find an expectation corresponding to a tuple. */
468struct nf_conntrack_expect *
469nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
470{
471 struct nf_conntrack_expect *i;
472
473 read_lock_bh(&nf_conntrack_lock);
474 i = __nf_conntrack_expect_find(tuple);
475 read_unlock_bh(&nf_conntrack_lock);
476
477 return i;
478}
479
480/* If an expectation for this connection is found, it gets deleted from
481 * the global list and is then returned. */
482static struct nf_conntrack_expect *
483find_expectation(const struct nf_conntrack_tuple *tuple)
484{
485 struct nf_conntrack_expect *i;
486
487 list_for_each_entry(i, &nf_conntrack_expect_list, list) {
488 /* If master is not in hash table yet (ie. packet hasn't left
489 this machine yet), how can other end know about expected?
490 Hence these are not the droids you are looking for (if
491 master ct never got confirmed, we'd hold a reference to it
492 and weird things would happen to future packets). */
493 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
494 && nf_ct_is_confirmed(i->master)) {
495 if (i->flags & NF_CT_EXPECT_PERMANENT) {
496 atomic_inc(&i->use);
497 return i;
498 } else if (del_timer(&i->timeout)) {
499 nf_ct_unlink_expect(i);
500 return i;
501 }
502 }
503 }
504 return NULL;
505}
506
507/* delete all expectations for this conntrack */
508void nf_ct_remove_expectations(struct nf_conn *ct)
509{
510 struct nf_conntrack_expect *i, *tmp;
511
512 /* Optimization: most connections never expect any others. */
513 if (ct->expecting == 0)
514 return;
515
516 list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) {
517 if (i->master == ct && del_timer(&i->timeout)) {
518 nf_ct_unlink_expect(i);
519 nf_conntrack_expect_put(i);
520 }
521 }
522}
523
524static void
525clean_from_lists(struct nf_conn *ct)
526{
527 unsigned int ho, hr;
528
529 DEBUGP("clean_from_lists(%p)\n", ct);
530 ASSERT_WRITE_LOCK(&nf_conntrack_lock);
531
532 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
533 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
534 LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
535 LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
536
537 /* Destroy all pending expectations */
538 nf_ct_remove_expectations(ct);
539}
540
541static void
542destroy_conntrack(struct nf_conntrack *nfct)
543{
544 struct nf_conn *ct = (struct nf_conn *)nfct;
545 struct nf_conntrack_l3proto *l3proto;
546 struct nf_conntrack_protocol *proto;
547
548 DEBUGP("destroy_conntrack(%p)\n", ct);
549 NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
550 NF_CT_ASSERT(!timer_pending(&ct->timeout));
551
552 nf_conntrack_event(IPCT_DESTROY, ct);
553 set_bit(IPS_DYING_BIT, &ct->status);
554
555 /* To make sure we don't get any weird locking issues here:
556 * destroy_conntrack() MUST NOT be called with a write lock
557 * to nf_conntrack_lock!!! -HW */
558 l3proto = __nf_ct_l3proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
559 if (l3proto && l3proto->destroy)
560 l3proto->destroy(ct);
561
562 proto = __nf_ct_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
563 if (proto && proto->destroy)
564 proto->destroy(ct);
565
566 if (nf_conntrack_destroyed)
567 nf_conntrack_destroyed(ct);
568
569 write_lock_bh(&nf_conntrack_lock);
570 /* Expectations will have been removed in clean_from_lists,
571 * except TFTP can create an expectation on the first packet,
572 * before connection is in the list, so we need to clean here,
573 * too. */
574 nf_ct_remove_expectations(ct);
575
576 /* We overload first tuple to link into unconfirmed list. */
577 if (!nf_ct_is_confirmed(ct)) {
578 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
579 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
580 }
581
582 NF_CT_STAT_INC(delete);
583 write_unlock_bh(&nf_conntrack_lock);
584
585 if (ct->master)
586 nf_ct_put(ct->master);
587
588 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
589 nf_conntrack_free(ct);
590}
591
592static void death_by_timeout(unsigned long ul_conntrack)
593{
594 struct nf_conn *ct = (void *)ul_conntrack;
595
596 write_lock_bh(&nf_conntrack_lock);
597 /* Inside lock so preempt is disabled on module removal path.
598 * Otherwise we can get spurious warnings. */
599 NF_CT_STAT_INC(delete_list);
600 clean_from_lists(ct);
601 write_unlock_bh(&nf_conntrack_lock);
602 nf_ct_put(ct);
603}
604
605static inline int
606conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
607 const struct nf_conntrack_tuple *tuple,
608 const struct nf_conn *ignored_conntrack)
609{
610 ASSERT_READ_LOCK(&nf_conntrack_lock);
611 return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack
612 && nf_ct_tuple_equal(tuple, &i->tuple);
613}
614
615struct nf_conntrack_tuple_hash *
616__nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
617 const struct nf_conn *ignored_conntrack)
618{
619 struct nf_conntrack_tuple_hash *h;
620 unsigned int hash = hash_conntrack(tuple);
621
622 ASSERT_READ_LOCK(&nf_conntrack_lock);
623 list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
624 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
625 NF_CT_STAT_INC(found);
626 return h;
627 }
628 NF_CT_STAT_INC(searched);
629 }
630
631 return NULL;
632}
633
634/* Find a connection corresponding to a tuple. */
635struct nf_conntrack_tuple_hash *
636nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
637 const struct nf_conn *ignored_conntrack)
638{
639 struct nf_conntrack_tuple_hash *h;
640
641 read_lock_bh(&nf_conntrack_lock);
642 h = __nf_conntrack_find(tuple, ignored_conntrack);
643 if (h)
644 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
645 read_unlock_bh(&nf_conntrack_lock);
646
647 return h;
648}
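/*
 * Illustrative sketch (added comment, not part of the original file): a
 * successful nf_conntrack_find_get() lookup takes a reference on the
 * conntrack, which the caller must drop with nf_ct_put() when finished.
 */
#if 0
static void example_conntrack_lookup(const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_tuple_hash *h;

	h = nf_conntrack_find_get(tuple, NULL);
	if (h) {
		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

		/* ... inspect ct->status, ct->timeout, ... */
		nf_ct_put(ct);
	}
}
#endif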
649
650static void __nf_conntrack_hash_insert(struct nf_conn *ct,
651 unsigned int hash,
652 unsigned int repl_hash)
653{
654 ct->id = ++nf_conntrack_next_id;
655 list_prepend(&nf_conntrack_hash[hash],
656 &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
657 list_prepend(&nf_conntrack_hash[repl_hash],
658 &ct->tuplehash[IP_CT_DIR_REPLY].list);
659}
660
661void nf_conntrack_hash_insert(struct nf_conn *ct)
662{
663 unsigned int hash, repl_hash;
664
665 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
666 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
667
668 write_lock_bh(&nf_conntrack_lock);
669 __nf_conntrack_hash_insert(ct, hash, repl_hash);
670 write_unlock_bh(&nf_conntrack_lock);
671}
672
673/* Confirm a connection given skb; places it in hash table */
674int
675__nf_conntrack_confirm(struct sk_buff **pskb)
676{
677 unsigned int hash, repl_hash;
678 struct nf_conn *ct;
679 enum ip_conntrack_info ctinfo;
680
681 ct = nf_ct_get(*pskb, &ctinfo);
682
683 /* ipt_REJECT uses nf_conntrack_attach to attach related
684 ICMP/TCP RST packets in other direction. Actual packet
685 which created connection will be IP_CT_NEW or for an
686 expected connection, IP_CT_RELATED. */
687 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
688 return NF_ACCEPT;
689
690 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
691 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
692
693 /* We're not in hash table, and we refuse to set up related
694 connections for unconfirmed conns. But packet copies and
695 REJECT will give spurious warnings here. */
696 /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
697
698 /* No external references means no one else could have
699 confirmed us. */
700 NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
701 DEBUGP("Confirming conntrack %p\n", ct);
702
703 write_lock_bh(&nf_conntrack_lock);
704
705 /* See if there's one in the list already, including reverse:
706 NAT could have grabbed it without realizing, since we're
707 not in the hash. If there is, we lost race. */
708 if (!LIST_FIND(&nf_conntrack_hash[hash],
709 conntrack_tuple_cmp,
710 struct nf_conntrack_tuple_hash *,
711 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
712 && !LIST_FIND(&nf_conntrack_hash[repl_hash],
713 conntrack_tuple_cmp,
714 struct nf_conntrack_tuple_hash *,
715 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
716 /* Remove from unconfirmed list */
717 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
718
719 __nf_conntrack_hash_insert(ct, hash, repl_hash);
720 /* Timer relative to confirmation time, not original
721 setting time, otherwise we'd get timer wrap in
722 weird delay cases. */
723 ct->timeout.expires += jiffies;
724 add_timer(&ct->timeout);
725 atomic_inc(&ct->ct_general.use);
726 set_bit(IPS_CONFIRMED_BIT, &ct->status);
727 NF_CT_STAT_INC(insert);
728 write_unlock_bh(&nf_conntrack_lock);
729 if (ct->helper)
730 nf_conntrack_event_cache(IPCT_HELPER, *pskb);
731#ifdef CONFIG_NF_NAT_NEEDED
732 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
733 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
734 nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
735#endif
736 nf_conntrack_event_cache(master_ct(ct) ?
737 IPCT_RELATED : IPCT_NEW, *pskb);
738 return NF_ACCEPT;
739 }
740
741 NF_CT_STAT_INC(insert_failed);
742 write_unlock_bh(&nf_conntrack_lock);
743 return NF_DROP;
744}
745
746/* Returns true if a connection corresponds to the tuple (required
747 for NAT). */
748int
749nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
750 const struct nf_conn *ignored_conntrack)
751{
752 struct nf_conntrack_tuple_hash *h;
753
754 read_lock_bh(&nf_conntrack_lock);
755 h = __nf_conntrack_find(tuple, ignored_conntrack);
756 read_unlock_bh(&nf_conntrack_lock);
757
758 return h != NULL;
759}
760
761/* There's a small race here where we may free a just-assured
762 connection. Too bad: we're in trouble anyway. */
763static inline int unreplied(const struct nf_conntrack_tuple_hash *i)
764{
765 return !(test_bit(IPS_ASSURED_BIT,
766 &nf_ct_tuplehash_to_ctrack(i)->status));
767}
768
769static int early_drop(struct list_head *chain)
770{
771 /* Traverse backwards: gives us oldest, which is roughly LRU */
772 struct nf_conntrack_tuple_hash *h;
773 struct nf_conn *ct = NULL;
774 int dropped = 0;
775
776 read_lock_bh(&nf_conntrack_lock);
777 h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *);
778 if (h) {
779 ct = nf_ct_tuplehash_to_ctrack(h);
780 atomic_inc(&ct->ct_general.use);
781 }
782 read_unlock_bh(&nf_conntrack_lock);
783
784 if (!ct)
785 return dropped;
786
787 if (del_timer(&ct->timeout)) {
788 death_by_timeout((unsigned long)ct);
789 dropped = 1;
790 NF_CT_STAT_INC(early_drop);
791 }
792 nf_ct_put(ct);
793 return dropped;
794}
795
796static inline int helper_cmp(const struct nf_conntrack_helper *i,
797 const struct nf_conntrack_tuple *rtuple)
798{
799 return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
800}
801
802static struct nf_conntrack_helper *
803__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
804{
805 return LIST_FIND(&helpers, helper_cmp,
806 struct nf_conntrack_helper *,
807 tuple);
808}
809
810struct nf_conntrack_helper *
811nf_ct_helper_find_get(const struct nf_conntrack_tuple *tuple)
812{
813 struct nf_conntrack_helper *helper;
814
815 /* need nf_conntrack_lock to assure that helper exists until
816 * try_module_get() is called */
817 read_lock_bh(&nf_conntrack_lock);
818
819 helper = __nf_ct_helper_find(tuple);
820 if (helper) {
821 /* need to increase module usage count to assure helper will
822 * not go away while the caller is e.g. busy putting a
823 * conntrack in the hash that uses the helper */
824 if (!try_module_get(helper->me))
825 helper = NULL;
826 }
827
828 read_unlock_bh(&nf_conntrack_lock);
829
830 return helper;
831}
832
833void nf_ct_helper_put(struct nf_conntrack_helper *helper)
834{
835 module_put(helper->me);
836}
837
838static struct nf_conn *
839__nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
840 const struct nf_conntrack_tuple *repl,
841 const struct nf_conntrack_l3proto *l3proto)
842{
843 struct nf_conn *conntrack = NULL;
844 u_int32_t features = 0;
845
846 if (!nf_conntrack_hash_rnd_initted) {
847 get_random_bytes(&nf_conntrack_hash_rnd, 4);
848 nf_conntrack_hash_rnd_initted = 1;
849 }
850
851 if (nf_conntrack_max
852 && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) {
853 unsigned int hash = hash_conntrack(orig);
854 /* Try dropping from this hash chain. */
855 if (!early_drop(&nf_conntrack_hash[hash])) {
856 if (net_ratelimit())
857 printk(KERN_WARNING
858 "nf_conntrack: table full, dropping"
859 " packet.\n");
860 return ERR_PTR(-ENOMEM);
861 }
862 }
863
864 /* find features needed by this conntrack. */
865 features = l3proto->get_features(orig);
866 read_lock_bh(&nf_conntrack_lock);
867 if (__nf_ct_helper_find(repl) != NULL)
868 features |= NF_CT_F_HELP;
869 read_unlock_bh(&nf_conntrack_lock);
870
871 DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);
872
873 read_lock_bh(&nf_ct_cache_lock);
874
875 if (!nf_ct_cache[features].use) {
876 DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
877 features);
878 goto out;
879 }
880
881 conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
882 if (conntrack == NULL) {
883 DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
884 goto out;
885 }
886
887 memset(conntrack, 0, nf_ct_cache[features].size);
888 conntrack->features = features;
889 if (nf_ct_cache[features].init_conntrack &&
890 nf_ct_cache[features].init_conntrack(conntrack, features) < 0) {
891 DEBUGP("nf_conntrack_alloc: failed to init\n");
892 kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
893 conntrack = NULL;
894 goto out;
895 }
896
897 atomic_set(&conntrack->ct_general.use, 1);
898 conntrack->ct_general.destroy = destroy_conntrack;
899 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
900 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
901 /* Don't set timer yet: wait for confirmation */
902 init_timer(&conntrack->timeout);
903 conntrack->timeout.data = (unsigned long)conntrack;
904 conntrack->timeout.function = death_by_timeout;
905
906 atomic_inc(&nf_conntrack_count);
907out:
908 read_unlock_bh(&nf_ct_cache_lock);
909 return conntrack;
910}
911
912struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
913 const struct nf_conntrack_tuple *repl)
914{
915 struct nf_conntrack_l3proto *l3proto;
916
917 l3proto = __nf_ct_l3proto_find(orig->src.l3num);
918 return __nf_conntrack_alloc(orig, repl, l3proto);
919}
920
921void nf_conntrack_free(struct nf_conn *conntrack)
922{
923 u_int32_t features = conntrack->features;
924 NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
925 DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
926 conntrack);
927 kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
928 atomic_dec(&nf_conntrack_count);
929}
930
931/* Allocate a new conntrack: we return -ENOMEM if classification
932 failed due to stress. Otherwise it really is unclassifiable. */
933static struct nf_conntrack_tuple_hash *
934init_conntrack(const struct nf_conntrack_tuple *tuple,
935 struct nf_conntrack_l3proto *l3proto,
936 struct nf_conntrack_protocol *protocol,
937 struct sk_buff *skb,
938 unsigned int dataoff)
939{
940 struct nf_conn *conntrack;
941 struct nf_conntrack_tuple repl_tuple;
942 struct nf_conntrack_expect *exp;
943
944 if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) {
945 DEBUGP("Can't invert tuple.\n");
946 return NULL;
947 }
948
949 conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto);
950 if (conntrack == NULL || IS_ERR(conntrack)) {
951 DEBUGP("Can't allocate conntrack.\n");
952 return (struct nf_conntrack_tuple_hash *)conntrack;
953 }
954
955 if (!protocol->new(conntrack, skb, dataoff)) {
956 nf_conntrack_free(conntrack);
957 DEBUGP("init conntrack: can't track with proto module\n");
958 return NULL;
959 }
960
961 write_lock_bh(&nf_conntrack_lock);
962 exp = find_expectation(tuple);
963
964 if (exp) {
965 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
966 conntrack, exp);
967 /* Welcome, Mr. Bond. We've been expecting you... */
968 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
969 conntrack->master = exp->master;
970#ifdef CONFIG_NF_CONNTRACK_MARK
971 conntrack->mark = exp->master->mark;
972#endif
973 nf_conntrack_get(&conntrack->master->ct_general);
974 NF_CT_STAT_INC(expect_new);
975 } else {
976 conntrack->helper = __nf_ct_helper_find(&repl_tuple);
977
978 NF_CT_STAT_INC(new);
979 }
980
981 /* Overload tuple linked list to put us in unconfirmed list. */
982 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
983
984 write_unlock_bh(&nf_conntrack_lock);
985
986 if (exp) {
987 if (exp->expectfn)
988 exp->expectfn(conntrack, exp);
989 nf_conntrack_expect_put(exp);
990 }
991
992 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
993}
994
995/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
996static inline struct nf_conn *
997resolve_normal_ct(struct sk_buff *skb,
998 unsigned int dataoff,
999 u_int16_t l3num,
1000 u_int8_t protonum,
1001 struct nf_conntrack_l3proto *l3proto,
1002 struct nf_conntrack_protocol *proto,
1003 int *set_reply,
1004 enum ip_conntrack_info *ctinfo)
1005{
1006 struct nf_conntrack_tuple tuple;
1007 struct nf_conntrack_tuple_hash *h;
1008 struct nf_conn *ct;
1009
1010 if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data),
1011 dataoff, l3num, protonum, &tuple, l3proto,
1012 proto)) {
1013 DEBUGP("resolve_normal_ct: Can't get tuple\n");
1014 return NULL;
1015 }
1016
1017 /* look for tuple match */
1018 h = nf_conntrack_find_get(&tuple, NULL);
1019 if (!h) {
1020 h = init_conntrack(&tuple, l3proto, proto, skb, dataoff);
1021 if (!h)
1022 return NULL;
1023 if (IS_ERR(h))
1024 return (void *)h;
1025 }
1026 ct = nf_ct_tuplehash_to_ctrack(h);
1027
1028 /* It exists; we have (non-exclusive) reference. */
1029 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1030 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
1031 /* Please set reply bit if this packet is OK */
1032 *set_reply = 1;
1033 } else {
1034 /* Once we've had two way comms, always ESTABLISHED. */
1035 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1036 DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
1037 *ctinfo = IP_CT_ESTABLISHED;
1038 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1039 DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
1040 *ctinfo = IP_CT_RELATED;
1041 } else {
1042 DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
1043 *ctinfo = IP_CT_NEW;
1044 }
1045 *set_reply = 0;
1046 }
1047 skb->nfct = &ct->ct_general;
1048 skb->nfctinfo = *ctinfo;
1049 return ct;
1050}
1051
1052unsigned int
1053nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
1054{
1055 struct nf_conn *ct;
1056 enum ip_conntrack_info ctinfo;
1057 struct nf_conntrack_l3proto *l3proto;
1058 struct nf_conntrack_protocol *proto;
1059 unsigned int dataoff;
1060 u_int8_t protonum;
1061 int set_reply = 0;
1062 int ret;
1063
1064 /* Previously seen (loopback or untracked)? Ignore. */
1065 if ((*pskb)->nfct) {
1066 NF_CT_STAT_INC(ignore);
1067 return NF_ACCEPT;
1068 }
1069
1070 l3proto = __nf_ct_l3proto_find((u_int16_t)pf);
1071 if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
1072 DEBUGP("not prepared to track yet or error occurred\n");
1073 return -ret;
1074 }
1075
1076 proto = __nf_ct_proto_find((u_int16_t)pf, protonum);
1077
1078 /* It may be a special packet, error, unclean...
1079 * inverse of the return code tells the netfilter
1080 * core what to do with the packet. */
1081 if (proto->error != NULL &&
1082 (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
1083 NF_CT_STAT_INC(error);
1084 NF_CT_STAT_INC(invalid);
1085 return -ret;
1086 }
1087
1088 ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto,
1089 &set_reply, &ctinfo);
1090 if (!ct) {
1091 /* Not valid part of a connection */
1092 NF_CT_STAT_INC(invalid);
1093 return NF_ACCEPT;
1094 }
1095
1096 if (IS_ERR(ct)) {
1097 /* Too stressed to deal. */
1098 NF_CT_STAT_INC(drop);
1099 return NF_DROP;
1100 }
1101
1102 NF_CT_ASSERT((*pskb)->nfct);
1103
1104 ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum);
1105 if (ret < 0) {
1106 /* Invalid: inverse of the return code tells
1107 * the netfilter core what to do */
1108 DEBUGP("nf_conntrack_in: Can't track with proto module\n");
1109 nf_conntrack_put((*pskb)->nfct);
1110 (*pskb)->nfct = NULL;
1111 NF_CT_STAT_INC(invalid);
1112 return -ret;
1113 }
1114
1115 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1116 nf_conntrack_event_cache(IPCT_STATUS, *pskb);
1117
1118 return ret;
1119}
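/*
 * Illustrative sketch (added comment, not part of the original file):
 * nf_conntrack_in() is not hooked up in this file; the per-family modules
 * (e.g. the IPv4/IPv6 l3proto glue) call it from a netfilter hook roughly
 * like this.  The hook prototype shown is the one used by kernels of this
 * vintage and is an assumption of the sketch.
 */
#if 0
static unsigned int example_conntrack_hook(unsigned int hooknum,
					   struct sk_buff **pskb,
					   const struct net_device *in,
					   const struct net_device *out,
					   int (*okfn)(struct sk_buff *))
{
	return nf_conntrack_in(PF_INET, hooknum, pskb);
}
#endif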
1120
1121int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1122 const struct nf_conntrack_tuple *orig)
1123{
1124 return nf_ct_invert_tuple(inverse, orig,
1125 __nf_ct_l3proto_find(orig->src.l3num),
1126 __nf_ct_proto_find(orig->src.l3num,
1127 orig->dst.protonum));
1128}
1129
1130/* Would two expected things clash? */
1131static inline int expect_clash(const struct nf_conntrack_expect *a,
1132 const struct nf_conntrack_expect *b)
1133{
1134 /* Part covered by intersection of masks must be unequal,
1135 otherwise they clash */
1136 struct nf_conntrack_tuple intersect_mask;
1137 int count;
1138
1139 intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num;
1140 intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
1141 intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all;
1142 intersect_mask.dst.protonum = a->mask.dst.protonum
1143 & b->mask.dst.protonum;
1144
1145 for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1146 intersect_mask.src.u3.all[count] =
1147 a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
1148 }
1149
1150 for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1151 intersect_mask.dst.u3.all[count] =
1152 a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count];
1153 }
1154
1155 return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
1156}
1157
1158static inline int expect_matches(const struct nf_conntrack_expect *a,
1159 const struct nf_conntrack_expect *b)
1160{
1161 return a->master == b->master
1162 && nf_ct_tuple_equal(&a->tuple, &b->tuple)
1163 && nf_ct_tuple_equal(&a->mask, &b->mask);
1164}
1165
1166/* Generally a bad idea to call this: could have matched already. */
1167void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp)
1168{
1169 struct nf_conntrack_expect *i;
1170
1171 write_lock_bh(&nf_conntrack_lock);
1172 /* choose the oldest expectation to evict */
1173 list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1174 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
1175 nf_ct_unlink_expect(i);
1176 write_unlock_bh(&nf_conntrack_lock);
1177 nf_conntrack_expect_put(i);
1178 return;
1179 }
1180 }
1181 write_unlock_bh(&nf_conntrack_lock);
1182}
1183
1184/* We don't increase the master conntrack refcount for non-fulfilled
1185 * conntracks. During the conntrack destruction, the expectations are
1186 * always killed before the conntrack itself */
1187struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me)
1188{
1189 struct nf_conntrack_expect *new;
1190
1191 new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC);
1192 if (!new) {
1193 DEBUGP("expect_related: OOM allocating expect\n");
1194 return NULL;
1195 }
1196 new->master = me;
1197 atomic_set(&new->use, 1);
1198 return new;
1199}
1200
1201void nf_conntrack_expect_put(struct nf_conntrack_expect *exp)
1202{
1203 if (atomic_dec_and_test(&exp->use))
1204 kmem_cache_free(nf_conntrack_expect_cachep, exp);
1205}
1206
1207static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp)
1208{
1209 atomic_inc(&exp->use);
1210 exp->master->expecting++;
1211 list_add(&exp->list, &nf_conntrack_expect_list);
1212
1213 init_timer(&exp->timeout);
1214 exp->timeout.data = (unsigned long)exp;
1215 exp->timeout.function = expectation_timed_out;
1216 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
1217 add_timer(&exp->timeout);
1218
1219 exp->id = ++nf_conntrack_expect_next_id;
1220 atomic_inc(&exp->use);
1221 NF_CT_STAT_INC(expect_create);
1222}
1223
1224/* Race with expectations being used means we could have none to find; OK. */
1225static void evict_oldest_expect(struct nf_conn *master)
1226{
1227 struct nf_conntrack_expect *i;
1228
1229 list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1230 if (i->master == master) {
1231 if (del_timer(&i->timeout)) {
1232 nf_ct_unlink_expect(i);
1233 nf_conntrack_expect_put(i);
1234 }
1235 break;
1236 }
1237 }
1238}
1239
1240static inline int refresh_timer(struct nf_conntrack_expect *i)
1241{
1242 if (!del_timer(&i->timeout))
1243 return 0;
1244
1245 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
1246 add_timer(&i->timeout);
1247 return 1;
1248}
1249
1250int nf_conntrack_expect_related(struct nf_conntrack_expect *expect)
1251{
1252 struct nf_conntrack_expect *i;
1253 struct nf_conn *master = expect->master;
1254 int ret;
1255
1256 DEBUGP("nf_conntrack_expect_related %p\n", expect);
1257 DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple);
1258 DEBUGP("mask: "); NF_CT_DUMP_TUPLE(&expect->mask);
1259
1260 write_lock_bh(&nf_conntrack_lock);
1261 list_for_each_entry(i, &nf_conntrack_expect_list, list) {
1262 if (expect_matches(i, expect)) {
1263 /* Refresh timer: if it's dying, ignore.. */
1264 if (refresh_timer(i)) {
1265 ret = 0;
1266 goto out;
1267 }
1268 } else if (expect_clash(i, expect)) {
1269 ret = -EBUSY;
1270 goto out;
1271 }
1272 }
1273 /* Will be over limit? */
1274 if (master->helper->max_expected &&
1275 master->expecting >= master->helper->max_expected)
1276 evict_oldest_expect(master);
1277
1278 nf_conntrack_expect_insert(expect);
1279 nf_conntrack_expect_event(IPEXP_NEW, expect);
1280 ret = 0;
1281out:
1282 write_unlock_bh(&nf_conntrack_lock);
1283 return ret;
1284}
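/*
 * Illustrative sketch (added comment, not part of the original file): the
 * alloc/related/put sequence a helper goes through to expect a secondary
 * connection.  Filling in exp->tuple, exp->mask and exp->expectfn is
 * protocol specific and omitted here.
 */
#if 0
static int example_add_expectation(struct nf_conn *ct)
{
	struct nf_conntrack_expect *exp;
	int ret;

	exp = nf_conntrack_expect_alloc(ct);
	if (exp == NULL)
		return -ENOMEM;
	/* ... set up exp->tuple, exp->mask, exp->expectfn ... */
	ret = nf_conntrack_expect_related(exp);
	nf_conntrack_expect_put(exp);
	return ret;
}
#endif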
1285
1286/* Alter reply tuple (maybe alter helper). This is for NAT, and is
1287 implicitly racy: see __nf_conntrack_confirm */
1288void nf_conntrack_alter_reply(struct nf_conn *conntrack,
1289 const struct nf_conntrack_tuple *newreply)
1290{
1291 write_lock_bh(&nf_conntrack_lock);
1292 /* Should be unconfirmed, so not in hash table yet */
1293 NF_CT_ASSERT(!nf_ct_is_confirmed(conntrack));
1294
1295 DEBUGP("Altering reply tuple of %p to ", conntrack);
1296 NF_CT_DUMP_TUPLE(newreply);
1297
1298 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1299 if (!conntrack->master && conntrack->expecting == 0)
1300 conntrack->helper = __nf_ct_helper_find(newreply);
1301 write_unlock_bh(&nf_conntrack_lock);
1302}
1303
1304int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
1305{
1306 int ret;
1307 BUG_ON(me->timeout == 0);
1308
1309 ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
1310 sizeof(struct nf_conn)
1311 + sizeof(union nf_conntrack_help)
1312 + __alignof__(union nf_conntrack_help),
1313 init_conntrack_for_helper);
1314 if (ret < 0) {
1315 printk(KERN_ERR "nf_conntrack_helper_register: Unable to create slab cache for conntracks\n");
1316 return ret;
1317 }
1318 write_lock_bh(&nf_conntrack_lock);
1319 list_prepend(&helpers, me);
1320 write_unlock_bh(&nf_conntrack_lock);
1321
1322 return 0;
1323}
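/*
 * Illustrative sketch (added comment, not part of the original file): the
 * skeleton of a conntrack helper registration.  All field values are
 * invented for the example; only fields referenced elsewhere in this file
 * (name, me, max_expected, timeout, tuple, mask) are assumed to exist.
 */
#if 0
static struct nf_conntrack_helper example_helper = {
	.name		= "example",
	.me		= THIS_MODULE,
	.max_expected	= 1,
	.timeout	= 5 * 60,		/* seconds */
	/* .tuple and .mask select the connections this helper handles */
};

static int __init example_helper_init(void)
{
	return nf_conntrack_helper_register(&example_helper);
}
#endif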
1324
1325struct nf_conntrack_helper *
1326__nf_conntrack_helper_find_byname(const char *name)
1327{
1328 struct nf_conntrack_helper *h;
1329
1330 list_for_each_entry(h, &helpers, list) {
1331 if (!strcmp(h->name, name))
1332 return h;
1333 }
1334
1335 return NULL;
1336}
1337
1338static inline int unhelp(struct nf_conntrack_tuple_hash *i,
1339 const struct nf_conntrack_helper *me)
1340{
1341 if (nf_ct_tuplehash_to_ctrack(i)->helper == me) {
1342 nf_conntrack_event(IPCT_HELPER, nf_ct_tuplehash_to_ctrack(i));
1343 nf_ct_tuplehash_to_ctrack(i)->helper = NULL;
1344 }
1345 return 0;
1346}
1347
1348void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
1349{
1350 unsigned int i;
1351 struct nf_conntrack_expect *exp, *tmp;
1352
1353 /* Need write lock here, to delete helper. */
1354 write_lock_bh(&nf_conntrack_lock);
1355 LIST_DELETE(&helpers, me);
1356
1357 /* Get rid of expectations */
1358 list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) {
1359 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1360 nf_ct_unlink_expect(exp);
1361 nf_conntrack_expect_put(exp);
1362 }
1363 }
1364
1365 /* Get rid of expecteds, set helpers to NULL. */
1366 LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me);
1367 for (i = 0; i < nf_conntrack_htable_size; i++)
1368 LIST_FIND_W(&nf_conntrack_hash[i], unhelp,
1369 struct nf_conntrack_tuple_hash *, me);
1370 write_unlock_bh(&nf_conntrack_lock);
1371
1372 /* Someone could still be looking at the helper in a bh. */
1373 synchronize_net();
1374}
1375
1376/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1377void __nf_ct_refresh_acct(struct nf_conn *ct,
1378 enum ip_conntrack_info ctinfo,
1379 const struct sk_buff *skb,
1380 unsigned long extra_jiffies,
1381 int do_acct)
1382{
1383 int event = 0;
1384
1385 NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1386 NF_CT_ASSERT(skb);
1387
1388 write_lock_bh(&nf_conntrack_lock);
1389
1390 /* If not in hash table, timer will not be active yet */
1391 if (!nf_ct_is_confirmed(ct)) {
1392 ct->timeout.expires = extra_jiffies;
1393 event = IPCT_REFRESH;
1394 } else {
1395 /* Need del_timer for race avoidance (may already be dying). */
1396 if (del_timer(&ct->timeout)) {
1397 ct->timeout.expires = jiffies + extra_jiffies;
1398 add_timer(&ct->timeout);
1399 event = IPCT_REFRESH;
1400 }
1401 }
1402
1403#ifdef CONFIG_NF_CT_ACCT
1404 if (do_acct) {
1405 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1406 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1407 skb->len - (unsigned int)(skb->nh.raw - skb->data);
1408 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1409 || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1410 event |= IPCT_COUNTER_FILLING;
1411 }
1412#endif
1413
1414 write_unlock_bh(&nf_conntrack_lock);
1415
1416 /* must be unlocked when calling event cache */
1417 if (event)
1418 nf_conntrack_event_cache(event, skb);
1419}
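/*
 * Illustrative sketch (added comment, not part of the original file): a
 * protocol tracker extends the connection timeout on every valid packet;
 * the 30 second value is purely an example.
 */
#if 0
static void example_refresh(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
			    const struct sk_buff *skb)
{
	__nf_ct_refresh_acct(ct, ctinfo, skb, 30 * HZ, 1);
}
#endif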
1420
1421#if defined(CONFIG_NF_CT_NETLINK) || \
1422 defined(CONFIG_NF_CT_NETLINK_MODULE)
1423
1424#include <linux/netfilter/nfnetlink.h>
1425#include <linux/netfilter/nfnetlink_conntrack.h>
1426
1427/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
1428 * in ip_conntrack_core, since we don't want the protocols to autoload
1429 * or depend on ctnetlink */
1430int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1431 const struct nf_conntrack_tuple *tuple)
1432{
1433 NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1434 &tuple->src.u.tcp.port);
1435 NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1436 &tuple->dst.u.tcp.port);
1437 return 0;
1438
1439nfattr_failure:
1440 return -1;
1441}
1442
1443static const size_t cta_min_proto[CTA_PROTO_MAX] = {
1444 [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t),
1445 [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t)
1446};
1447
1448int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1449 struct nf_conntrack_tuple *t)
1450{
1451 if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1452 return -EINVAL;
1453
1454 if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
1455 return -EINVAL;
1456
1457 t->src.u.tcp.port =
1458 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1459 t->dst.u.tcp.port =
1460 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1461
1462 return 0;
1463}
1464#endif
1465
1466/* Used by ipt_REJECT and ip6t_REJECT. */
1467void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1468{
1469 struct nf_conn *ct;
1470 enum ip_conntrack_info ctinfo;
1471
1472 /* This ICMP is in reverse direction to the packet which caused it */
1473 ct = nf_ct_get(skb, &ctinfo);
1474 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1475 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1476 else
1477 ctinfo = IP_CT_RELATED;
1478
1479 /* Attach to new skbuff, and increment count */
1480 nskb->nfct = &ct->ct_general;
1481 nskb->nfctinfo = ctinfo;
1482 nf_conntrack_get(nskb->nfct);
1483}
1484
1485static inline int
1486do_iter(const struct nf_conntrack_tuple_hash *i,
1487 int (*iter)(struct nf_conn *i, void *data),
1488 void *data)
1489{
1490 return iter(nf_ct_tuplehash_to_ctrack(i), data);
1491}
1492
1493/* Bring out ya dead! */
1494static struct nf_conntrack_tuple_hash *
1495get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
1496 void *data, unsigned int *bucket)
1497{
1498 struct nf_conntrack_tuple_hash *h = NULL;
1499
1500 write_lock_bh(&nf_conntrack_lock);
1501 for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
1502 h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter,
1503 struct nf_conntrack_tuple_hash *, iter, data);
1504 if (h)
1505 break;
1506 }
1507 if (!h)
1508 h = LIST_FIND_W(&unconfirmed, do_iter,
1509 struct nf_conntrack_tuple_hash *, iter, data);
1510 if (h)
1511 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
1512 write_unlock_bh(&nf_conntrack_lock);
1513
1514 return h;
1515}
1516
1517void
1518nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
1519{
1520 struct nf_conntrack_tuple_hash *h;
1521 unsigned int bucket = 0;
1522
1523 while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1524 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
1525 /* Time to push up daises... */
1526 if (del_timer(&ct->timeout))
1527 death_by_timeout((unsigned long)ct);
1528 /* ... else the timer will get him soon. */
1529
1530 nf_ct_put(ct);
1531 }
1532}
1533
1534static int kill_all(struct nf_conn *i, void *data)
1535{
1536 return 1;
1537}
1538
1539static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
1540{
1541 if (vmalloced)
1542 vfree(hash);
1543 else
1544 free_pages((unsigned long)hash,
1545 get_order(sizeof(struct list_head) * size));
1546}
1547
1548void nf_conntrack_flush(void)
1549{
1550 nf_ct_iterate_cleanup(kill_all, NULL);
1551}
1552
1553/* Mishearing the voices in his head, our hero wonders how he's
1554 supposed to kill the mall. */
1555void nf_conntrack_cleanup(void)
1556{
1557 int i;
1558
1559 ip_ct_attach = NULL;
1560
1561 /* This makes sure all current packets have passed through
1562 netfilter framework. Roll on, two-stage module
1563 delete... */
1564 synchronize_net();
1565
1566 nf_ct_event_cache_flush();
1567 i_see_dead_people:
1568 nf_conntrack_flush();
1569 if (atomic_read(&nf_conntrack_count) != 0) {
1570 schedule();
1571 goto i_see_dead_people;
1572 }
1573 /* wait until all references to nf_conntrack_untracked are dropped */
1574 while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
1575 schedule();
1576
1577 for (i = 0; i < NF_CT_F_NUM; i++) {
1578 if (nf_ct_cache[i].use == 0)
1579 continue;
1580
1581 NF_CT_ASSERT(nf_ct_cache[i].use == 1);
1582 nf_ct_cache[i].use = 1;
1583 nf_conntrack_unregister_cache(i);
1584 }
1585 kmem_cache_destroy(nf_conntrack_expect_cachep);
1586 free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1587 nf_conntrack_htable_size);
1588
1589 /* free l3proto protocol tables */
1590 for (i = 0; i < PF_MAX; i++)
1591 if (nf_ct_protos[i]) {
1592 kfree(nf_ct_protos[i]);
1593 nf_ct_protos[i] = NULL;
1594 }
1595}
1596
1597static struct list_head *alloc_hashtable(int size, int *vmalloced)
1598{
1599 struct list_head *hash;
1600 unsigned int i;
1601
1602 *vmalloced = 0;
1603 hash = (void*)__get_free_pages(GFP_KERNEL,
1604 get_order(sizeof(struct list_head)
1605 * size));
1606 if (!hash) {
1607 *vmalloced = 1;
1608 printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
1609 hash = vmalloc(sizeof(struct list_head) * size);
1610 }
1611
1612 if (hash)
1613 for (i = 0; i < size; i++)
1614 INIT_LIST_HEAD(&hash[i]);
1615
1616 return hash;
1617}
1618
1619int set_hashsize(const char *val, struct kernel_param *kp)
1620{
1621 int i, bucket, hashsize, vmalloced;
1622 int old_vmalloced, old_size;
1623 int rnd;
1624 struct list_head *hash, *old_hash;
1625 struct nf_conntrack_tuple_hash *h;
1626
1627 /* On boot, we can set this without any fancy locking. */
1628 if (!nf_conntrack_htable_size)
1629 return param_set_uint(val, kp);
1630
1631 hashsize = simple_strtol(val, NULL, 0);
1632 if (!hashsize)
1633 return -EINVAL;
1634
1635 hash = alloc_hashtable(hashsize, &vmalloced);
1636 if (!hash)
1637 return -ENOMEM;
1638
1639 /* We have to rehash for the new table anyway, so we can also
1640 * use a new random seed */
1641 get_random_bytes(&rnd, 4);
1642
1643 write_lock_bh(&nf_conntrack_lock);
1644 for (i = 0; i < nf_conntrack_htable_size; i++) {
1645 while (!list_empty(&nf_conntrack_hash[i])) {
1646 h = list_entry(nf_conntrack_hash[i].next,
1647 struct nf_conntrack_tuple_hash, list);
1648 list_del(&h->list);
1649 bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1650 list_add_tail(&h->list, &hash[bucket]);
1651 }
1652 }
1653 old_size = nf_conntrack_htable_size;
1654 old_vmalloced = nf_conntrack_vmalloc;
1655 old_hash = nf_conntrack_hash;
1656
1657 nf_conntrack_htable_size = hashsize;
1658 nf_conntrack_vmalloc = vmalloced;
1659 nf_conntrack_hash = hash;
1660 nf_conntrack_hash_rnd = rnd;
1661 write_unlock_bh(&nf_conntrack_lock);
1662
1663 free_conntrack_hash(old_hash, old_vmalloced, old_size);
1664 return 0;
1665}
1666
1667module_param_call(hashsize, set_hashsize, param_get_uint,
1668 &nf_conntrack_htable_size, 0600);
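/*
 * Usage note (added comment, not part of the original file): with the 0600
 * mode above, the hash size can be given at load time or changed at runtime,
 * for example (paths and values are illustrative):
 *
 *	modprobe nf_conntrack hashsize=16384
 *	echo 16384 > /sys/module/nf_conntrack/parameters/hashsize
 */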
1669
1670int __init nf_conntrack_init(void)
1671{
1672 unsigned int i;
1673 int ret;
1674
1675 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1676 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1677 if (!nf_conntrack_htable_size) {
1678 nf_conntrack_htable_size
1679 = (((num_physpages << PAGE_SHIFT) / 16384)
1680 / sizeof(struct list_head));
1681 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1682 nf_conntrack_htable_size = 8192;
1683 if (nf_conntrack_htable_size < 16)
1684 nf_conntrack_htable_size = 16;
1685 }
1686 nf_conntrack_max = 8 * nf_conntrack_htable_size;
1687
1688 printk("nf_conntrack version %s (%u buckets, %d max)\n",
1689 NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1690 nf_conntrack_max);
1691
1692 nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size,
1693 &nf_conntrack_vmalloc);
1694 if (!nf_conntrack_hash) {
1695 printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1696 goto err_out;
1697 }
1698
1699 ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic",
1700 sizeof(struct nf_conn), NULL);
1701 if (ret < 0) {
1702 printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1703 goto err_free_hash;
1704 }
1705
1706 nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
1707 sizeof(struct nf_conntrack_expect),
1708 0, 0, NULL, NULL);
1709 if (!nf_conntrack_expect_cachep) {
1710 printk(KERN_ERR "Unable to create nf_expect slab cache\n");
1711 goto err_free_conntrack_slab;
1712 }
1713
1714 /* Don't NEED lock here, but good form anyway. */
1715 write_lock_bh(&nf_conntrack_lock);
1716 for (i = 0; i < PF_MAX; i++)
1717 nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto;
1718 write_unlock_bh(&nf_conntrack_lock);
1719
1720 /* For use by REJECT target */
1721 ip_ct_attach = __nf_conntrack_attach;
1722
1723 /* Set up fake conntrack:
1724 - to never be deleted, not in any hashes */
1725 atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
1726 /* - and make it look like a confirmed connection */
1727 set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
1728
1729 return ret;
1730
1731err_free_conntrack_slab:
1732 nf_conntrack_unregister_cache(NF_CT_F_BASIC);
1733err_free_hash:
1734 free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1735 nf_conntrack_htable_size);
1736err_out:
1737 return -ENOMEM;
1738}