[NETFILTER]: Add ctnetlink subsystem
net/ipv4/netfilter/ip_conntrack_core.c
1/* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 *
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
18 * */
19
20#include <linux/config.h>
21#include <linux/types.h>
22#include <linux/icmp.h>
23#include <linux/ip.h>
24#include <linux/netfilter.h>
25#include <linux/netfilter_ipv4.h>
26#include <linux/module.h>
27#include <linux/skbuff.h>
28#include <linux/proc_fs.h>
29#include <linux/vmalloc.h>
30#include <net/checksum.h>
31#include <net/ip.h>
32#include <linux/stddef.h>
33#include <linux/sysctl.h>
34#include <linux/slab.h>
35#include <linux/random.h>
36#include <linux/jhash.h>
37#include <linux/err.h>
38#include <linux/percpu.h>
39#include <linux/moduleparam.h>
40#include <linux/notifier.h>
41
42/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
43 registrations, conntrack timers */
44#define ASSERT_READ_LOCK(x)
45#define ASSERT_WRITE_LOCK(x)
46
47#include <linux/netfilter_ipv4/ip_conntrack.h>
48#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
49#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
50#include <linux/netfilter_ipv4/ip_conntrack_core.h>
51#include <linux/netfilter_ipv4/listhelp.h>
52
53#define IP_CONNTRACK_VERSION "2.3"
54
55#if 0
56#define DEBUGP printk
57#else
58#define DEBUGP(format, args...)
59#endif
60
61DEFINE_RWLOCK(ip_conntrack_lock);
62
63/* ip_conntrack_standalone needs this */
64atomic_t ip_conntrack_count = ATOMIC_INIT(0);
65
66void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
67LIST_HEAD(ip_conntrack_expect_list);
68struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
69static LIST_HEAD(helpers);
70unsigned int ip_conntrack_htable_size = 0;
71int ip_conntrack_max;
72struct list_head *ip_conntrack_hash;
73static kmem_cache_t *ip_conntrack_cachep;
74static kmem_cache_t *ip_conntrack_expect_cachep;
75struct ip_conntrack ip_conntrack_untracked;
76unsigned int ip_ct_log_invalid;
77static LIST_HEAD(unconfirmed);
78static int ip_conntrack_vmalloc;
79
80static unsigned int ip_conntrack_next_id = 1;
81static unsigned int ip_conntrack_expect_next_id = 1;
82#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
83struct notifier_block *ip_conntrack_chain;
84struct notifier_block *ip_conntrack_expect_chain;
85
86DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
87
88static inline void __deliver_cached_events(struct ip_conntrack_ecache *ecache)
89{
90 if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
91 notifier_call_chain(&ip_conntrack_chain, ecache->events,
92 ecache->ct);
93 ecache->events = 0;
94}
95
96void __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
97{
98 __deliver_cached_events(ecache);
99}
100
101/* Deliver all cached events for a particular conntrack. This is called
102 * by code prior to async packet handling or freeing the skb */
103void
104ip_conntrack_deliver_cached_events_for(const struct ip_conntrack *ct)
105{
106 struct ip_conntrack_ecache *ecache =
107 &__get_cpu_var(ip_conntrack_ecache);
108
109 if (!ct)
110 return;
111
112 if (ecache->ct == ct) {
113 DEBUGP("ecache: delivering event for %p\n", ct);
114 __deliver_cached_events(ecache);
115 } else {
116 if (net_ratelimit())
117 printk(KERN_WARNING "ecache: want to deliver for %p, "
118 "but cache has %p\n", ct, ecache->ct);
119 }
120
121 /* signal that events have already been delivered */
122 ecache->ct = NULL;
123}
124
125/* Deliver any old pending cached events if the current conntrack differs from the cached one */
126void ip_conntrack_event_cache_init(const struct sk_buff *skb)
127{
128 struct ip_conntrack *ct = (struct ip_conntrack *) skb->nfct;
129 struct ip_conntrack_ecache *ecache =
130 &__get_cpu_var(ip_conntrack_ecache);
131
132 /* take care of delivering potentially old events */
133 if (ecache->ct != ct) {
134 enum ip_conntrack_info ctinfo;
135 /* we have to check, since at startup the cache is NULL */
136 if (likely(ecache->ct)) {
137 DEBUGP("ecache: entered for different conntrack: "
138 "ecache->ct=%p, skb->nfct=%p. delivering "
139 "events\n", ecache->ct, ct);
140 __deliver_cached_events(ecache);
141 ip_conntrack_put(ecache->ct);
142 } else {
143 DEBUGP("ecache: entered for conntrack %p, "
144 "cache was clean before\n", ct);
145 }
146
147 /* initialize for this conntrack/packet */
148 ecache->ct = ip_conntrack_get(skb, &ctinfo);
149 /* ecache->events cleared by __deliver_cached_events() */
150 } else {
151 DEBUGP("ecache: re-entered for conntrack %p.\n", ct);
152 }
153}
154
155#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
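
/* Illustrative sketch, not part of the original file: a minimal in-kernel
 * consumer of the conntrack event chain above.  It assumes the
 * ip_conntrack_register_notifier()/ip_conntrack_unregister_notifier()
 * wrappers and IPCT_* event masks that ctnetlink attaches to
 * ip_conntrack_chain; only meaningful under CONFIG_IP_NF_CONNTRACK_EVENTS.
 */
static int example_ct_event(struct notifier_block *this,
			    unsigned long events, void *ptr)
{
	struct ip_conntrack *ct = (struct ip_conntrack *)ptr;

	if (events & IPCT_NEW)
		printk(KERN_DEBUG "example: new conntrack %p\n", ct);
	return NOTIFY_DONE;
}

static struct notifier_block example_ct_notifier = {
	.notifier_call = example_ct_event,
};

/* module init/exit would then call:
 *	ip_conntrack_register_notifier(&example_ct_notifier);
 *	ip_conntrack_unregister_notifier(&example_ct_notifier);
 */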
156
157DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
158
159static int ip_conntrack_hash_rnd_initted;
160static unsigned int ip_conntrack_hash_rnd;
161
162static u_int32_t
163hash_conntrack(const struct ip_conntrack_tuple *tuple)
164{
165#if 0
166 dump_tuple(tuple);
167#endif
168 return (jhash_3words(tuple->src.ip,
169 (tuple->dst.ip ^ tuple->dst.protonum),
170 (tuple->src.u.all | (tuple->dst.u.all << 16)),
171 ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
172}
173
174int
175ip_ct_get_tuple(const struct iphdr *iph,
176 const struct sk_buff *skb,
177 unsigned int dataoff,
178 struct ip_conntrack_tuple *tuple,
179 const struct ip_conntrack_protocol *protocol)
180{
181 /* Should never happen */
182 if (iph->frag_off & htons(IP_OFFSET)) {
183 printk("ip_conntrack_core: Frag of proto %u.\n",
184 iph->protocol);
185 return 0;
186 }
187
188 tuple->src.ip = iph->saddr;
189 tuple->dst.ip = iph->daddr;
190 tuple->dst.protonum = iph->protocol;
191 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
192
193 return protocol->pkt_to_tuple(skb, dataoff, tuple);
194}
195
196int
197ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
198 const struct ip_conntrack_tuple *orig,
199 const struct ip_conntrack_protocol *protocol)
200{
201 inverse->src.ip = orig->dst.ip;
202 inverse->dst.ip = orig->src.ip;
203 inverse->dst.protonum = orig->dst.protonum;
204 inverse->dst.dir = !orig->dst.dir;
205
206 return protocol->invert_tuple(inverse, orig);
207}
208
209
210/* ip_conntrack_expect helper functions */
211static void unlink_expect(struct ip_conntrack_expect *exp)
212{
213 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
214 IP_NF_ASSERT(!timer_pending(&exp->timeout));
215 list_del(&exp->list);
216 CONNTRACK_STAT_INC(expect_delete);
217 exp->master->expecting--;
218}
219
220void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp)
221{
222 unlink_expect(exp);
223 ip_conntrack_expect_put(exp);
224}
225
226static void expectation_timed_out(unsigned long ul_expect)
227{
228 struct ip_conntrack_expect *exp = (void *)ul_expect;
229
230 write_lock_bh(&ip_conntrack_lock);
231 unlink_expect(exp);
232 write_unlock_bh(&ip_conntrack_lock);
233 ip_conntrack_expect_put(exp);
234}
235
236struct ip_conntrack_expect *
237__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
238{
239 struct ip_conntrack_expect *i;
240
241 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
242 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
243 atomic_inc(&i->use);
244 return i;
245 }
246 }
247 return NULL;
248}
249
250/* Just find an expectation corresponding to a tuple. */
251struct ip_conntrack_expect *
252ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
253{
254 struct ip_conntrack_expect *i;
255
256 read_lock_bh(&ip_conntrack_lock);
257 i = __ip_conntrack_expect_find(tuple);
258 read_unlock_bh(&ip_conntrack_lock);
259
260 return i;
261}
262
263/* If an expectation for this connection is found, it gets deleted from
264 * the global list, then returned. */
265static struct ip_conntrack_expect *
266find_expectation(const struct ip_conntrack_tuple *tuple)
267{
268 struct ip_conntrack_expect *i;
269
270 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
271 /* If master is not in hash table yet (ie. packet hasn't left
272 this machine yet), how can other end know about expected?
273 Hence these are not the droids you are looking for (if
274 master ct never got confirmed, we'd hold a reference to it
275 and weird things would happen to future packets). */
276 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
277 && is_confirmed(i->master)
278 && del_timer(&i->timeout)) {
279 unlink_expect(i);
280 return i;
281 }
282 }
283 return NULL;
284}
285
286/* delete all expectations for this conntrack */
287void ip_ct_remove_expectations(struct ip_conntrack *ct)
288{
289 struct ip_conntrack_expect *i, *tmp;
290
291 /* Optimization: most connections never expect any others. */
292 if (ct->expecting == 0)
293 return;
294
295 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
296 if (i->master == ct && del_timer(&i->timeout)) {
297 unlink_expect(i);
298 ip_conntrack_expect_put(i);
299 }
300 }
301}
302
303static void
304clean_from_lists(struct ip_conntrack *ct)
305{
306 unsigned int ho, hr;
307
308 DEBUGP("clean_from_lists(%p)\n", ct);
309 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
310
311 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
312 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
313 LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
314 LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
315
316 /* Destroy all pending expectations */
317 ip_ct_remove_expectations(ct);
318}
319
320static void
321destroy_conntrack(struct nf_conntrack *nfct)
322{
323 struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
324 struct ip_conntrack_protocol *proto;
325
326 DEBUGP("destroy_conntrack(%p)\n", ct);
327 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
328 IP_NF_ASSERT(!timer_pending(&ct->timeout));
329
330 set_bit(IPS_DYING_BIT, &ct->status);
331
332 /* To make sure we don't get any weird locking issues here:
333 * destroy_conntrack() MUST NOT be called with a write lock
334 * to ip_conntrack_lock!!! -HW */
335 proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
336 if (proto && proto->destroy)
337 proto->destroy(ct);
338
339 if (ip_conntrack_destroyed)
340 ip_conntrack_destroyed(ct);
341
342 write_lock_bh(&ip_conntrack_lock);
343 /* Expectations will have been removed in clean_from_lists,
344 * except TFTP can create an expectation on the first packet,
345 * before connection is in the list, so we need to clean here,
346 * too. */
347 ip_ct_remove_expectations(ct);
348
349 /* We overload first tuple to link into unconfirmed list. */
350 if (!is_confirmed(ct)) {
351 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
352 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
353 }
354
355 CONNTRACK_STAT_INC(delete);
356 write_unlock_bh(&ip_conntrack_lock);
357
358 if (ct->master)
359 ip_conntrack_put(ct->master);
360
361 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
362 ip_conntrack_free(ct);
363}
364
365static void death_by_timeout(unsigned long ul_conntrack)
366{
367 struct ip_conntrack *ct = (void *)ul_conntrack;
368
369 ip_conntrack_event(IPCT_DESTROY, ct);
370 write_lock_bh(&ip_conntrack_lock);
371 /* Inside lock so preempt is disabled on module removal path.
372 * Otherwise we can get spurious warnings. */
373 CONNTRACK_STAT_INC(delete_list);
374 clean_from_lists(ct);
375 write_unlock_bh(&ip_conntrack_lock);
376 ip_conntrack_put(ct);
377}
378
379static inline int
380conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
381 const struct ip_conntrack_tuple *tuple,
382 const struct ip_conntrack *ignored_conntrack)
383{
384 ASSERT_READ_LOCK(&ip_conntrack_lock);
385 return tuplehash_to_ctrack(i) != ignored_conntrack
386 && ip_ct_tuple_equal(tuple, &i->tuple);
387}
388
389struct ip_conntrack_tuple_hash *
390__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
391 const struct ip_conntrack *ignored_conntrack)
392{
393 struct ip_conntrack_tuple_hash *h;
394 unsigned int hash = hash_conntrack(tuple);
395
396 ASSERT_READ_LOCK(&ip_conntrack_lock);
397 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
398 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
399 CONNTRACK_STAT_INC(found);
400 return h;
401 }
402 CONNTRACK_STAT_INC(searched);
403 }
404
405 return NULL;
406}
407
408/* Find a connection corresponding to a tuple. */
409struct ip_conntrack_tuple_hash *
410ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
411 const struct ip_conntrack *ignored_conntrack)
412{
413 struct ip_conntrack_tuple_hash *h;
414
415 read_lock_bh(&ip_conntrack_lock);
416 h = __ip_conntrack_find(tuple, ignored_conntrack);
417 if (h)
418 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
419 read_unlock_bh(&ip_conntrack_lock);
420
421 return h;
422}
423
424static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
425 unsigned int hash,
426 unsigned int repl_hash)
427{
428 ct->id = ++ip_conntrack_next_id;
429 list_prepend(&ip_conntrack_hash[hash],
430 &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
431 list_prepend(&ip_conntrack_hash[repl_hash],
432 &ct->tuplehash[IP_CT_DIR_REPLY].list);
433}
434
435void ip_conntrack_hash_insert(struct ip_conntrack *ct)
436{
437 unsigned int hash, repl_hash;
438
439 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
440 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
441
442 write_lock_bh(&ip_conntrack_lock);
443 __ip_conntrack_hash_insert(ct, hash, repl_hash);
444 write_unlock_bh(&ip_conntrack_lock);
445}
446
447/* Confirm a connection given skb; places it in hash table */
448int
449__ip_conntrack_confirm(struct sk_buff **pskb)
450{
451 unsigned int hash, repl_hash;
452 struct ip_conntrack *ct;
453 enum ip_conntrack_info ctinfo;
454
455 ct = ip_conntrack_get(*pskb, &ctinfo);
456
457 /* ipt_REJECT uses ip_conntrack_attach to attach related
458 ICMP/TCP RST packets in other direction. Actual packet
459 which created connection will be IP_CT_NEW or for an
460 expected connection, IP_CT_RELATED. */
461 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
462 return NF_ACCEPT;
463
464 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
465 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
466
467 /* We're not in hash table, and we refuse to set up related
468 connections for unconfirmed conns. But packet copies and
469 REJECT will give spurious warnings here. */
470 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
471
472 /* No external references means no one else could have
473 confirmed us. */
474 IP_NF_ASSERT(!is_confirmed(ct));
475 DEBUGP("Confirming conntrack %p\n", ct);
476
477 write_lock_bh(&ip_conntrack_lock);
478
479 /* See if there's one in the list already, including reverse:
480 NAT could have grabbed it without realizing, since we're
481 not in the hash. If there is, we lost race. */
482 if (!LIST_FIND(&ip_conntrack_hash[hash],
483 conntrack_tuple_cmp,
484 struct ip_conntrack_tuple_hash *,
485 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
486 && !LIST_FIND(&ip_conntrack_hash[repl_hash],
487 conntrack_tuple_cmp,
488 struct ip_conntrack_tuple_hash *,
489 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
490 /* Remove from unconfirmed list */
491 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
492
493 __ip_conntrack_hash_insert(ct, hash, repl_hash);
494 /* Timer relative to confirmation time, not original
495 setting time, otherwise we'd get timer wrap in
496 weird delay cases. */
497 ct->timeout.expires += jiffies;
498 add_timer(&ct->timeout);
499 atomic_inc(&ct->ct_general.use);
500 set_bit(IPS_CONFIRMED_BIT, &ct->status);
501 CONNTRACK_STAT_INC(insert);
502 write_unlock_bh(&ip_conntrack_lock);
503 if (ct->helper)
504 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
505#ifdef CONFIG_IP_NF_NAT_NEEDED
506 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
507 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
508 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
509#endif
510 ip_conntrack_event_cache(master_ct(ct) ?
511 IPCT_RELATED : IPCT_NEW, *pskb);
512
513 return NF_ACCEPT;
514 }
515
516 CONNTRACK_STAT_INC(insert_failed);
517 write_unlock_bh(&ip_conntrack_lock);
518
519 return NF_DROP;
520}
521
522/* Returns true if a connection corresponds to the tuple (required
523 for NAT). */
524int
525ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
526 const struct ip_conntrack *ignored_conntrack)
527{
528 struct ip_conntrack_tuple_hash *h;
529
530 read_lock_bh(&ip_conntrack_lock);
531 h = __ip_conntrack_find(tuple, ignored_conntrack);
532 read_unlock_bh(&ip_conntrack_lock);
533
534 return h != NULL;
535}
536
537/* There's a small race here where we may free a just-assured
538 connection. Too bad: we're in trouble anyway. */
539static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
540{
541 return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
542}
543
544static int early_drop(struct list_head *chain)
545{
546 /* Traverse backwards: gives us oldest, which is roughly LRU */
547 struct ip_conntrack_tuple_hash *h;
548 struct ip_conntrack *ct = NULL;
549 int dropped = 0;
550
551 read_lock_bh(&ip_conntrack_lock);
552 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
553 if (h) {
554 ct = tuplehash_to_ctrack(h);
555 atomic_inc(&ct->ct_general.use);
556 }
557 read_unlock_bh(&ip_conntrack_lock);
558
559 if (!ct)
560 return dropped;
561
562 if (del_timer(&ct->timeout)) {
563 death_by_timeout((unsigned long)ct);
564 dropped = 1;
565 CONNTRACK_STAT_INC(early_drop);
566 }
567 ip_conntrack_put(ct);
568 return dropped;
569}
570
571static inline int helper_cmp(const struct ip_conntrack_helper *i,
572 const struct ip_conntrack_tuple *rtuple)
573{
574 return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
575}
576
577static struct ip_conntrack_helper *
578__ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
579{
580 return LIST_FIND(&helpers, helper_cmp,
581 struct ip_conntrack_helper *,
582 tuple);
583}
584
585struct ip_conntrack_helper *
586ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
587{
588 struct ip_conntrack_helper *helper;
589
590 /* need ip_conntrack_lock to assure that helper exists until
591 * try_module_get() is called */
592 read_lock_bh(&ip_conntrack_lock);
593
594 helper = __ip_conntrack_helper_find(tuple);
595 if (helper) {
596 /* need to increase module usage count to assure helper will
597 * not go away while the caller is e.g. busy putting a
598 * conntrack in the hash that uses the helper */
599 if (!try_module_get(helper->me))
600 helper = NULL;
601 }
602
603 read_unlock_bh(&ip_conntrack_lock);
604
605 return helper;
606}
607
608void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
609{
610 module_put(helper->me);
611}
612
613struct ip_conntrack_protocol *
614__ip_conntrack_proto_find(u_int8_t protocol)
615{
616 return ip_ct_protos[protocol];
617}
618
619/* this is guaranteed to always return a valid protocol helper, since
620 * it falls back to generic_protocol */
621struct ip_conntrack_protocol *
622ip_conntrack_proto_find_get(u_int8_t protocol)
623{
624 struct ip_conntrack_protocol *p;
625
626 preempt_disable();
627 p = __ip_conntrack_proto_find(protocol);
628 if (p) {
629 if (!try_module_get(p->me))
630 p = &ip_conntrack_generic_protocol;
631 }
632 preempt_enable();
633
634 return p;
635}
636
637void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
638{
639 module_put(p->me);
640}
641
642struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
643 struct ip_conntrack_tuple *repl)
644{
645 struct ip_conntrack *conntrack;
646
647 if (!ip_conntrack_hash_rnd_initted) {
648 get_random_bytes(&ip_conntrack_hash_rnd, 4);
649 ip_conntrack_hash_rnd_initted = 1;
650 }
651
652 if (ip_conntrack_max
653 && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
654 unsigned int hash = hash_conntrack(orig);
655 /* Try dropping from this hash chain. */
656 if (!early_drop(&ip_conntrack_hash[hash])) {
657 if (net_ratelimit())
658 printk(KERN_WARNING
659 "ip_conntrack: table full, dropping"
660 " packet.\n");
661 return ERR_PTR(-ENOMEM);
662 }
663 }
664
665 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
666 if (!conntrack) {
667 DEBUGP("Can't allocate conntrack.\n");
668 return NULL;
669 }
670
671 memset(conntrack, 0, sizeof(*conntrack));
672 atomic_set(&conntrack->ct_general.use, 1);
673 conntrack->ct_general.destroy = destroy_conntrack;
674 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
675 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
676 /* Don't set timer yet: wait for confirmation */
677 init_timer(&conntrack->timeout);
678 conntrack->timeout.data = (unsigned long)conntrack;
679 conntrack->timeout.function = death_by_timeout;
680
681 atomic_inc(&ip_conntrack_count);
682
683 return conntrack;
684}
685
686void
687ip_conntrack_free(struct ip_conntrack *conntrack)
688{
689 atomic_dec(&ip_conntrack_count);
690 kmem_cache_free(ip_conntrack_cachep, conntrack);
691}
692
693/* Allocate a new conntrack: we return -ENOMEM if classification
694 * failed due to stress. Otherwise it really is unclassifiable */
695static struct ip_conntrack_tuple_hash *
696init_conntrack(struct ip_conntrack_tuple *tuple,
697 struct ip_conntrack_protocol *protocol,
698 struct sk_buff *skb)
699{
700 struct ip_conntrack *conntrack;
701 struct ip_conntrack_tuple repl_tuple;
702 struct ip_conntrack_expect *exp;
703
704 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
705 DEBUGP("Can't invert tuple.\n");
706 return NULL;
707 }
708
709 if (!(conntrack = ip_conntrack_alloc(tuple, &repl_tuple)))
710 return NULL;
711
712 if (!protocol->new(conntrack, skb)) {
713 ip_conntrack_free(conntrack);
714 return NULL;
715 }
716
717 write_lock_bh(&ip_conntrack_lock);
718 exp = find_expectation(tuple);
719
720 if (exp) {
721 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
722 conntrack, exp);
723 /* Welcome, Mr. Bond. We've been expecting you... */
724 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
725 conntrack->master = exp->master;
726#ifdef CONFIG_IP_NF_CONNTRACK_MARK
727 conntrack->mark = exp->master->mark;
728#endif
729#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
730 defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
731 /* this is ugly, but there is no other place where to put it */
732 conntrack->nat.masq_index = exp->master->nat.masq_index;
733#endif
734 nf_conntrack_get(&conntrack->master->ct_general);
735 CONNTRACK_STAT_INC(expect_new);
736 } else {
737 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
738
739 CONNTRACK_STAT_INC(new);
740 }
741
742 /* Overload tuple linked list to put us in unconfirmed list. */
743 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
744
745 write_unlock_bh(&ip_conntrack_lock);
746
747 if (exp) {
748 if (exp->expectfn)
749 exp->expectfn(conntrack, exp);
750 ip_conntrack_expect_put(exp);
751 }
752
753 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
754}
755
756/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
757static inline struct ip_conntrack *
758resolve_normal_ct(struct sk_buff *skb,
759 struct ip_conntrack_protocol *proto,
760 int *set_reply,
761 unsigned int hooknum,
762 enum ip_conntrack_info *ctinfo)
763{
764 struct ip_conntrack_tuple tuple;
765 struct ip_conntrack_tuple_hash *h;
766 struct ip_conntrack *ct;
767
768 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
769
770 if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
771 &tuple,proto))
772 return NULL;
773
774 /* look for tuple match */
775 h = ip_conntrack_find_get(&tuple, NULL);
776 if (!h) {
777 h = init_conntrack(&tuple, proto, skb);
778 if (!h)
779 return NULL;
780 if (IS_ERR(h))
781 return (void *)h;
782 }
783 ct = tuplehash_to_ctrack(h);
784
785 /* It exists; we have (non-exclusive) reference. */
786 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
787 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
788 /* Please set reply bit if this packet OK */
789 *set_reply = 1;
790 } else {
791 /* Once we've had two way comms, always ESTABLISHED. */
792 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
793 DEBUGP("ip_conntrack_in: normal packet for %p\n",
794 ct);
795 *ctinfo = IP_CT_ESTABLISHED;
796 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
797 DEBUGP("ip_conntrack_in: related packet for %p\n",
798 ct);
799 *ctinfo = IP_CT_RELATED;
800 } else {
801 DEBUGP("ip_conntrack_in: new packet for %p\n",
802 ct);
803 *ctinfo = IP_CT_NEW;
804 }
805 *set_reply = 0;
806 }
807 skb->nfct = &ct->ct_general;
808 skb->nfctinfo = *ctinfo;
809 return ct;
810}
811
812/* Netfilter hook itself. */
813unsigned int ip_conntrack_in(unsigned int hooknum,
814 struct sk_buff **pskb,
815 const struct net_device *in,
816 const struct net_device *out,
817 int (*okfn)(struct sk_buff *))
818{
819 struct ip_conntrack *ct;
820 enum ip_conntrack_info ctinfo;
821 struct ip_conntrack_protocol *proto;
822 int set_reply = 0;
823 int ret;
824
825 /* Previously seen (loopback or untracked)? Ignore. */
826 if ((*pskb)->nfct) {
827 CONNTRACK_STAT_INC(ignore);
828 return NF_ACCEPT;
829 }
830
831 /* Should never happen */
832 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
833 if (net_ratelimit()) {
834 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
835 (*pskb)->nh.iph->protocol, hooknum);
836 }
837 return NF_DROP;
838 }
839
840/* Doesn't cover locally-generated broadcast, so not worth it. */
841#if 0
842 /* Ignore broadcast: no `connection'. */
843 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
844 printk("Broadcast packet!\n");
845 return NF_ACCEPT;
846 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
847 == htonl(0x000000FF)) {
848 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
849 NIPQUAD((*pskb)->nh.iph->saddr),
850 NIPQUAD((*pskb)->nh.iph->daddr),
851 (*pskb)->sk, (*pskb)->pkt_type);
852 }
853#endif
854
855 proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
856
857 /* It may be a special packet, error, unclean...
858 * inverse of the return code tells the netfilter
859 * core what to do with the packet. */
860 if (proto->error != NULL
861 && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
862 CONNTRACK_STAT_INC(error);
863 CONNTRACK_STAT_INC(invalid);
864 return -ret;
865 }
866
867 if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
868 /* Not valid part of a connection */
869 CONNTRACK_STAT_INC(invalid);
870 return NF_ACCEPT;
871 }
872
873 if (IS_ERR(ct)) {
874 /* Too stressed to deal. */
875 CONNTRACK_STAT_INC(drop);
876 return NF_DROP;
877 }
878
879 IP_NF_ASSERT((*pskb)->nfct);
880
881 ip_conntrack_event_cache_init(*pskb);
882
883 ret = proto->packet(ct, *pskb, ctinfo);
884 if (ret < 0) {
885 /* Invalid: inverse of the return code tells
886 * the netfilter core what to do*/
887 nf_conntrack_put((*pskb)->nfct);
888 (*pskb)->nfct = NULL;
889 CONNTRACK_STAT_INC(invalid);
890 return -ret;
891 }
892
893 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
894 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
895
896 return ret;
897}
898
899int invert_tuplepr(struct ip_conntrack_tuple *inverse,
900 const struct ip_conntrack_tuple *orig)
901{
902 return ip_ct_invert_tuple(inverse, orig,
903 __ip_conntrack_proto_find(orig->dst.protonum));
904}
905
906/* Would two expected things clash? */
907static inline int expect_clash(const struct ip_conntrack_expect *a,
908 const struct ip_conntrack_expect *b)
909{
910 /* Part covered by intersection of masks must be unequal,
911 otherwise they clash */
912 struct ip_conntrack_tuple intersect_mask
913 = { { a->mask.src.ip & b->mask.src.ip,
914 { a->mask.src.u.all & b->mask.src.u.all } },
915 { a->mask.dst.ip & b->mask.dst.ip,
916 { a->mask.dst.u.all & b->mask.dst.u.all },
917 a->mask.dst.protonum & b->mask.dst.protonum } };
918
919 return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
920}
921
922static inline int expect_matches(const struct ip_conntrack_expect *a,
923 const struct ip_conntrack_expect *b)
924{
925 return a->master == b->master
926 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
927 && ip_ct_tuple_equal(&a->mask, &b->mask);
928}
929
930/* Generally a bad idea to call this: could have matched already. */
931void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
932{
933 struct ip_conntrack_expect *i;
934
935 write_lock_bh(&ip_conntrack_lock);
936 /* choose the oldest expectation to evict */
937 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
938 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
939 unlink_expect(i);
940 write_unlock_bh(&ip_conntrack_lock);
941 ip_conntrack_expect_put(i);
942 return;
943 }
944 }
945 write_unlock_bh(&ip_conntrack_lock);
946}
947
948struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
949{
950 struct ip_conntrack_expect *new;
951
952 new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
953 if (!new) {
954 DEBUGP("expect_related: OOM allocating expect\n");
955 return NULL;
956 }
957 new->master = me;
958 atomic_inc(&new->master->ct_general.use);
959 atomic_set(&new->use, 1);
960 return new;
961}
962
963void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
964{
965 if (atomic_dec_and_test(&exp->use)) {
966 ip_conntrack_put(exp->master);
967 kmem_cache_free(ip_conntrack_expect_cachep, exp);
968 }
969}
970
971static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
972{
973 atomic_inc(&exp->use);
974 exp->master->expecting++;
975 list_add(&exp->list, &ip_conntrack_expect_list);
976
977 init_timer(&exp->timeout);
978 exp->timeout.data = (unsigned long)exp;
979 exp->timeout.function = expectation_timed_out;
980 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
981 add_timer(&exp->timeout);
982
983 exp->id = ++ip_conntrack_expect_next_id;
984 atomic_inc(&exp->use);
985 CONNTRACK_STAT_INC(expect_create);
986}
987
988/* Race with expectations being used means we could have none to find; OK. */
989static void evict_oldest_expect(struct ip_conntrack *master)
990{
991 struct ip_conntrack_expect *i;
992
993 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
994 if (i->master == master) {
995 if (del_timer(&i->timeout)) {
996 unlink_expect(i);
997 ip_conntrack_expect_put(i);
998 }
999 break;
1000 }
1001 }
1002}
1003
1004static inline int refresh_timer(struct ip_conntrack_expect *i)
1005{
1006 if (!del_timer(&i->timeout))
1007 return 0;
1008
1009 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
1010 add_timer(&i->timeout);
1011 return 1;
1012}
1013
1014int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1015{
1016 struct ip_conntrack_expect *i;
1017 int ret;
1018
1019 DEBUGP("ip_conntrack_expect_related %p\n", expect);
1020 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1021 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
1022
1023 write_lock_bh(&ip_conntrack_lock);
1024 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1025 if (expect_matches(i, expect)) {
1026 /* Refresh timer: if it's dying, ignore.. */
1027 if (refresh_timer(i)) {
1028 ret = 0;
1029 goto out;
1030 }
1031 } else if (expect_clash(i, expect)) {
1032 ret = -EBUSY;
1033 goto out;
1034 }
1035 }
1036
1037 /* Will be over limit? */
1038 if (expect->master->helper->max_expected &&
1039 expect->master->expecting >= expect->master->helper->max_expected)
1040 evict_oldest_expect(expect->master);
1041
1042 ip_conntrack_expect_insert(expect);
1043 ip_conntrack_expect_event(IPEXP_NEW, expect);
1044 ret = 0;
1045out:
1046 write_unlock_bh(&ip_conntrack_lock);
1047 return ret;
1048}
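
/* Illustrative sketch, not part of the original file: roughly how a conntrack
 * helper (the FTP helper, for instance) is expected to use the expectation
 * API above from its help() callback.  The function name and the tuple/mask
 * values are placeholders.
 */
static int example_expect_data_conn(struct ip_conntrack *ct, u_int16_t port)
{
	struct ip_conntrack_expect *exp;
	int ret;

	exp = ip_conntrack_expect_alloc(ct);	/* takes a reference on ct */
	if (exp == NULL)
		return -ENOMEM;

	/* expect: same hosts and protocol, any source port, given dest port */
	exp->tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
	exp->tuple.src.u.tcp.port = 0;
	exp->tuple.dst.u.tcp.port = htons(port);
	memset(&exp->mask, 0, sizeof(exp->mask));
	exp->mask.src.ip = 0xFFFFFFFF;
	exp->mask.dst.ip = 0xFFFFFFFF;
	exp->mask.dst.u.tcp.port = 0xFFFF;
	exp->mask.dst.protonum = 0xFF;
	exp->expectfn = NULL;

	ret = ip_conntrack_expect_related(exp);	/* 0, or -EBUSY on a clash */
	ip_conntrack_expect_put(exp);		/* drop our own reference */
	return ret;
}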
1049
1050/* Alter reply tuple (maybe alter helper). This is for NAT, and is
1051 implicitly racy: see __ip_conntrack_confirm */
1052void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1053 const struct ip_conntrack_tuple *newreply)
1054{
1055 write_lock_bh(&ip_conntrack_lock);
1056 /* Should be unconfirmed, so not in hash table yet */
1057 IP_NF_ASSERT(!is_confirmed(conntrack));
1058
1059 DEBUGP("Altering reply tuple of %p to ", conntrack);
1060 DUMP_TUPLE(newreply);
1061
1062 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1063 if (!conntrack->master && conntrack->expecting == 0)
1064 conntrack->helper = __ip_conntrack_helper_find(newreply);
1065 write_unlock_bh(&ip_conntrack_lock);
1066}
1067
1068int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1069{
1070 BUG_ON(me->timeout == 0);
1071 write_lock_bh(&ip_conntrack_lock);
1072 list_prepend(&helpers, me);
1073 write_unlock_bh(&ip_conntrack_lock);
1074
1075 return 0;
1076}
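
/* Illustrative sketch, not part of the original file: the overall shape of a
 * helper registration as done by modules such as ip_conntrack_ftp.  Field
 * names follow struct ip_conntrack_helper; the name, port and timeout values
 * are placeholders.
 */
static int example_help(struct sk_buff **pskb,
			struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
{
	/* parse the packet, possibly set up expectations as sketched above */
	return NF_ACCEPT;
}

static struct ip_conntrack_helper example_helper = {
	.name		= "example",
	.me		= THIS_MODULE,
	.max_expected	= 1,
	.timeout	= 5 * 60,	/* seconds */
	.help		= example_help,
};

/* module init would fill in the match and register the helper:
 *	example_helper.tuple.src.u.tcp.port = htons(EXAMPLE_PORT);
 *	example_helper.tuple.dst.protonum = IPPROTO_TCP;
 *	example_helper.mask.src.u.tcp.port = 0xFFFF;
 *	example_helper.mask.dst.protonum = 0xFF;
 *	ip_conntrack_helper_register(&example_helper);
 * and module exit would call ip_conntrack_helper_unregister(&example_helper).
 */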
1077
1078struct ip_conntrack_helper *
1079__ip_conntrack_helper_find_byname(const char *name)
1080{
1081 struct ip_conntrack_helper *h;
1082
1083 list_for_each_entry(h, &helpers, list) {
1084 if (!strcmp(h->name, name))
1085 return h;
1086 }
1087
1088 return NULL;
1089}
1090
1091static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1092 const struct ip_conntrack_helper *me)
1093{
1094 if (tuplehash_to_ctrack(i)->helper == me) {
1095 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1096 tuplehash_to_ctrack(i)->helper = NULL;
1097 }
1098 return 0;
1099}
1100
1101void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1102{
1103 unsigned int i;
1104 struct ip_conntrack_expect *exp, *tmp;
1105
1106 /* Need write lock here, to delete helper. */
1107 write_lock_bh(&ip_conntrack_lock);
1108 LIST_DELETE(&helpers, me);
1109
1110 /* Get rid of expectations */
1111 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1112 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1113 unlink_expect(exp);
1114 ip_conntrack_expect_put(exp);
1115 }
1116 }
1117 /* Get rid of expecteds, set helpers to NULL. */
1118 LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
1119 for (i = 0; i < ip_conntrack_htable_size; i++)
1120 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1121 struct ip_conntrack_tuple_hash *, me);
1122 write_unlock_bh(&ip_conntrack_lock);
1123
1124 /* Someone could be still looking at the helper in a bh. */
1125 synchronize_net();
1126}
1127
1128static inline void ct_add_counters(struct ip_conntrack *ct,
1129 enum ip_conntrack_info ctinfo,
1130 const struct sk_buff *skb)
1131{
1132#ifdef CONFIG_IP_NF_CT_ACCT
1133 if (skb) {
1134 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1135 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1136 ntohs(skb->nh.iph->tot_len);
1137 }
1138#endif
1139}
1140
1141/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
1142void ip_ct_refresh_acct(struct ip_conntrack *ct,
1143 enum ip_conntrack_info ctinfo,
1144 const struct sk_buff *skb,
1145 unsigned long extra_jiffies)
1146{
1147 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1148
1149 /* If not in hash table, timer will not be active yet */
1150 if (!is_confirmed(ct)) {
1151 ct->timeout.expires = extra_jiffies;
1152 ct_add_counters(ct, ctinfo, skb);
1153 } else {
1154 write_lock_bh(&ip_conntrack_lock);
1155 /* Need del_timer for race avoidance (may already be dying). */
1156 if (del_timer(&ct->timeout)) {
1157 ct->timeout.expires = jiffies + extra_jiffies;
1158 add_timer(&ct->timeout);
1159 ip_conntrack_event_cache(IPCT_REFRESH, skb);
1160 }
1161 ct_add_counters(ct, ctinfo, skb);
1162 write_unlock_bh(&ip_conntrack_lock);
1163 }
1164}
1165
1166#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1167 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1168/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
1169 * in ip_conntrack_core, since we don't want the protocols to autoload
1170 * or depend on ctnetlink */
1171int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1172 const struct ip_conntrack_tuple *tuple)
1173{
1174 NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1175 &tuple->src.u.tcp.port);
1176 NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1177 &tuple->dst.u.tcp.port);
1178 return 0;
1179
1180nfattr_failure:
1181 return -1;
1182}
1183
1184int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1185 struct ip_conntrack_tuple *t)
1186{
1187 if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1188 return -EINVAL;
1189
1190 t->src.u.tcp.port =
1191 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1192 t->dst.u.tcp.port =
1193 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1194
1195 return 0;
1196}
1197#endif
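
/* Illustrative sketch, not part of the original file: how a port-based
 * protocol tracker is expected to hook the two generic converters above into
 * its struct ip_conntrack_protocol once ctnetlink is enabled, roughly as the
 * TCP and UDP trackers do.  Only the netlink-related members are shown; the
 * other callbacks are elided.
 */
#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
struct ip_conntrack_protocol example_port_proto = {
	.proto			= IPPROTO_UDP,
	.name			= "example",
	/* ... pkt_to_tuple, invert_tuple, packet, new, ... */
	.tuple_to_nfattr	= ip_ct_port_tuple_to_nfattr,
	.nfattr_to_tuple	= ip_ct_port_nfattr_to_tuple,
};
#endif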
1198
1199/* Returns new sk_buff, or NULL */
1200struct sk_buff *
1201ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1202{
1203 skb_orphan(skb);
1204
1205 local_bh_disable();
1206 skb = ip_defrag(skb, user);
1207 local_bh_enable();
1208
1209 if (skb)
1210 ip_send_check(skb->nh.iph);
1211 return skb;
1212}
1213
1214/* Used by ipt_REJECT. */
1215static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1216{
1217 struct ip_conntrack *ct;
1218 enum ip_conntrack_info ctinfo;
1219
1220 /* This ICMP is in reverse direction to the packet which caused it */
1221 ct = ip_conntrack_get(skb, &ctinfo);
1222
1223 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1224 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1225 else
1226 ctinfo = IP_CT_RELATED;
1227
1228 /* Attach to new skbuff, and increment count */
1229 nskb->nfct = &ct->ct_general;
1230 nskb->nfctinfo = ctinfo;
1231 nf_conntrack_get(nskb->nfct);
1232}
1233
1234static inline int
1235do_iter(const struct ip_conntrack_tuple_hash *i,
1236 int (*iter)(struct ip_conntrack *i, void *data),
1237 void *data)
1238{
1239 return iter(tuplehash_to_ctrack(i), data);
1240}
1241
1242/* Bring out ya dead! */
1243static struct ip_conntrack_tuple_hash *
1244get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1245 void *data, unsigned int *bucket)
1246{
1247 struct ip_conntrack_tuple_hash *h = NULL;
1248
1249 write_lock_bh(&ip_conntrack_lock);
1250 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1251 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1252 struct ip_conntrack_tuple_hash *, iter, data);
1253 if (h)
1254 break;
1255 }
1256 if (!h)
1257 h = LIST_FIND_W(&unconfirmed, do_iter,
1258 struct ip_conntrack_tuple_hash *, iter, data);
1259 if (h)
1260 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1261 write_unlock_bh(&ip_conntrack_lock);
1262
1263 return h;
1264}
1265
1266void
1267ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1268{
1269 struct ip_conntrack_tuple_hash *h;
1270 unsigned int bucket = 0;
1271
1272 while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1273 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1274 /* Time to push up daisies... */
1275 if (del_timer(&ct->timeout))
1276 death_by_timeout((unsigned long)ct);
1277 /* ... else the timer will get him soon. */
1278
1279 ip_conntrack_put(ct);
1280 }
1281
1282#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
1283 {
1284 /* we need to deliver all cached events in order to drop
1285 * the reference counts */
1286 int cpu;
1287 for_each_cpu(cpu) {
1288 struct ip_conntrack_ecache *ecache =
1289 &per_cpu(ip_conntrack_ecache, cpu);
1290 if (ecache->ct) {
1291 __ip_ct_deliver_cached_events(ecache);
1292 ip_conntrack_put(ecache->ct);
1293 ecache->ct = NULL;
1294 }
1295 }
1296 }
1297#endif
1298}
1299
1300/* Fast function for those who don't want to parse /proc (and I don't
1301 blame them). */
1302/* Reversing the socket's dst/src point of view gives us the reply
1303 mapping. */
1304static int
1305getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1306{
1307 struct inet_sock *inet = inet_sk(sk);
1308 struct ip_conntrack_tuple_hash *h;
1309 struct ip_conntrack_tuple tuple;
1310
1311 IP_CT_TUPLE_U_BLANK(&tuple);
1312 tuple.src.ip = inet->rcv_saddr;
1313 tuple.src.u.tcp.port = inet->sport;
1314 tuple.dst.ip = inet->daddr;
1315 tuple.dst.u.tcp.port = inet->dport;
1316 tuple.dst.protonum = IPPROTO_TCP;
1317
1318 /* We only do TCP at the moment: is there a better way? */
1319 if (strcmp(sk->sk_prot->name, "TCP")) {
1320 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1321 return -ENOPROTOOPT;
1322 }
1323
1324 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1325 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1326 *len, sizeof(struct sockaddr_in));
1327 return -EINVAL;
1328 }
1329
1330 h = ip_conntrack_find_get(&tuple, NULL);
1331 if (h) {
1332 struct sockaddr_in sin;
1333 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1334
1335 sin.sin_family = AF_INET;
1336 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1337 .tuple.dst.u.tcp.port;
1338 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1339 .tuple.dst.ip;
1340
1341 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1342 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1343 ip_conntrack_put(ct);
1344 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1345 return -EFAULT;
1346 else
1347 return 0;
1348 }
1349 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1350 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1351 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1352 return -ENOENT;
1353}
1354
1355static struct nf_sockopt_ops so_getorigdst = {
1356 .pf = PF_INET,
1357 .get_optmin = SO_ORIGINAL_DST,
1358 .get_optmax = SO_ORIGINAL_DST+1,
1359 .get = &getorigdst,
1360};
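
/* Illustrative only, not part of the original file: the matching userspace
 * side of the SO_ORIGINAL_DST sockopt registered above, as used by a
 * transparent proxy on a TCP socket accept()ed from an iptables REDIRECTed
 * connection (error handling elided):
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <linux/netfilter_ipv4.h>	// defines SO_ORIGINAL_DST
 *
 *	struct sockaddr_in dst;
 *	socklen_t len = sizeof(dst);
 *
 *	if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len) == 0) {
 *		// dst.sin_addr / dst.sin_port now hold the pre-NAT destination
 *	}
 */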
1361
1362static int kill_all(struct ip_conntrack *i, void *data)
1363{
1364 return 1;
1365}
1366
1367static void free_conntrack_hash(void)
1368{
1369 if (ip_conntrack_vmalloc)
1370 vfree(ip_conntrack_hash);
1371 else
1372 free_pages((unsigned long)ip_conntrack_hash,
1373 get_order(sizeof(struct list_head)
1374 * ip_conntrack_htable_size));
1375}
1376
1377void ip_conntrack_flush(void)
1378{
1379 /* This makes sure all current packets have passed through
1380 netfilter framework. Roll on, two-stage module
1381 delete... */
1382 synchronize_net();
1383
1384 i_see_dead_people:
1385 ip_ct_iterate_cleanup(kill_all, NULL);
1386 if (atomic_read(&ip_conntrack_count) != 0) {
1387 schedule();
1388 goto i_see_dead_people;
1389 }
1390 /* wait until all references to ip_conntrack_untracked are dropped */
1391 while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1392 schedule();
1393}
1394
1395/* Mishearing the voices in his head, our hero wonders how he's
1396 supposed to kill the mall. */
1397void ip_conntrack_cleanup(void)
1398{
1399 ip_ct_attach = NULL;
1400 ip_conntrack_flush();
1401 kmem_cache_destroy(ip_conntrack_cachep);
1402 kmem_cache_destroy(ip_conntrack_expect_cachep);
1403 free_conntrack_hash();
1404 nf_unregister_sockopt(&so_getorigdst);
1405}
1406
1407static int hashsize;
1408module_param(hashsize, int, 0400);
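/* hashsize overrides the memory-based sizing in ip_conntrack_init() below,
 * e.g. (illustrative) "modprobe ip_conntrack hashsize=16384".  It is
 * read-only (0400) since the table cannot be resized after init. */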
1409
1410int __init ip_conntrack_init(void)
1411{
1412 unsigned int i;
1413 int ret;
1414
1415 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1416 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1417 if (hashsize) {
1418 ip_conntrack_htable_size = hashsize;
1419 } else {
1420 ip_conntrack_htable_size
1421 = (((num_physpages << PAGE_SHIFT) / 16384)
1422 / sizeof(struct list_head));
1423 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1424 ip_conntrack_htable_size = 8192;
1425 if (ip_conntrack_htable_size < 16)
1426 ip_conntrack_htable_size = 16;
1427 }
1428 ip_conntrack_max = 8 * ip_conntrack_htable_size;
1429
1430 printk("ip_conntrack version %s (%u buckets, %d max)"
1431 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1432 ip_conntrack_htable_size, ip_conntrack_max,
1433 sizeof(struct ip_conntrack));
1434
1435 ret = nf_register_sockopt(&so_getorigdst);
1436 if (ret != 0) {
1437 printk(KERN_ERR "Unable to register netfilter socket option\n");
1438 return ret;
1439 }
1440
1441 /* AK: the hash table is twice as big as needed because it
1442 uses list_head. it would be much nicer for caches to use a
1443 single-pointer list head here. */
1444 ip_conntrack_vmalloc = 0;
1445 ip_conntrack_hash
1446 =(void*)__get_free_pages(GFP_KERNEL,
1447 get_order(sizeof(struct list_head)
1448 *ip_conntrack_htable_size));
1449 if (!ip_conntrack_hash) {
1450 ip_conntrack_vmalloc = 1;
1451 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1452 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1453 * ip_conntrack_htable_size);
1454 }
1455 if (!ip_conntrack_hash) {
1456 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1457 goto err_unreg_sockopt;
1458 }
1459
1460 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1461 sizeof(struct ip_conntrack), 0,
1462 0, NULL, NULL);
1463 if (!ip_conntrack_cachep) {
1464 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1465 goto err_free_hash;
1466 }
1467
1468 ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1469 sizeof(struct ip_conntrack_expect),
1470 0, 0, NULL, NULL);
1471 if (!ip_conntrack_expect_cachep) {
1472 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1473 goto err_free_conntrack_slab;
1474 }
1475
1476 /* Don't NEED lock here, but good form anyway. */
1477 write_lock_bh(&ip_conntrack_lock);
1478 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1479 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1480 /* Sew in builtin protocols. */
1481 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1482 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1483 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1484 write_unlock_bh(&ip_conntrack_lock);
1485
1486 for (i = 0; i < ip_conntrack_htable_size; i++)
1487 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1488
1489 /* For use by ipt_REJECT */
1490 ip_ct_attach = ip_conntrack_attach;
1491
1492 /* Set up fake conntrack:
1493 - to never be deleted, not in any hashes */
1494 atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1495 /* - and make it look like a confirmed connection */
1496 set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1497
1498 return ret;
1499
1500err_free_conntrack_slab:
1501 kmem_cache_destroy(ip_conntrack_cachep);
1502err_free_hash:
1503 free_conntrack_hash();
1504err_unreg_sockopt:
1505 nf_unregister_sockopt(&so_getorigdst);
1506
1507 return -ENOMEM;
1508}