[NETFILTER] Fix conntrack event cache deadlock/oops
net/ipv4/netfilter/ip_conntrack_core.c
/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 *	- new API and handling of conntrack/nat helpers
 *	- now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 *	- add usage/reference counts to ip_conntrack_expect
 *	- export ip_conntrack[_expect]_{find_get,put} functions
 */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/icmp.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/ip.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>

/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
   registrations, and conntrack timers. */
#define ASSERT_READ_LOCK(x)
#define ASSERT_WRITE_LOCK(x)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>

#define IP_CONNTRACK_VERSION	"2.3"

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DEFINE_RWLOCK(ip_conntrack_lock);

/* ip_conntrack_standalone needs this */
atomic_t ip_conntrack_count = ATOMIC_INIT(0);

void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
int ip_conntrack_max;
struct list_head *ip_conntrack_hash;
static kmem_cache_t *ip_conntrack_cachep __read_mostly;
static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
struct ip_conntrack ip_conntrack_untracked;
unsigned int ip_ct_log_invalid;
static LIST_HEAD(unconfirmed);
static int ip_conntrack_vmalloc;

static unsigned int ip_conntrack_next_id = 1;
static unsigned int ip_conntrack_expect_next_id = 1;
#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
struct notifier_block *ip_conntrack_chain;
struct notifier_block *ip_conntrack_expect_chain;

DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);

/* deliver cached events and clear cache entry - must be called with locally
 * disabled softirqs */
static inline void
__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
{
	DEBUGP("ecache: delivering events for %p\n", ecache->ct);
	if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
		notifier_call_chain(&ip_conntrack_chain, ecache->events,
				    ecache->ct);
	ecache->events = 0;
	ip_conntrack_put(ecache->ct);
	ecache->ct = NULL;
}

/* Deliver all cached events for a particular conntrack.  This is called
 * by code prior to async packet handling or freeing the skb */
void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
{
	struct ip_conntrack_ecache *ecache;

	local_bh_disable();
	ecache = &__get_cpu_var(ip_conntrack_ecache);
	if (ecache->ct == ct)
		__ip_ct_deliver_cached_events(ecache);
	local_bh_enable();
}

void __ip_ct_event_cache_init(struct ip_conntrack *ct)
{
	struct ip_conntrack_ecache *ecache;

	/* take care of delivering potentially old events */
	ecache = &__get_cpu_var(ip_conntrack_ecache);
	BUG_ON(ecache->ct == ct);
	if (ecache->ct)
		__ip_ct_deliver_cached_events(ecache);
	/* initialize for this conntrack/packet */
	ecache->ct = ct;
	nf_conntrack_get(&ct->ct_general);
}

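/*
 * Example (assumption, for illustration): the per-packet entry point that
 * feeds this cache lives as an inline in <linux/netfilter_ipv4/ip_conntrack.h>
 * and is not part of this file.  It is assumed to look roughly like the
 * sketch below: events raised while one packet traverses conntrack are OR'ed
 * into the per-CPU cache, and a cache miss first flushes the previous
 * conntrack's pending events via __ip_ct_event_cache_init() above.
 */
static inline void ip_conntrack_event_cache(enum ip_conntrack_events event,
					    const struct sk_buff *skb)
{
	struct ip_conntrack *ct = (struct ip_conntrack *)skb->nfct;
	struct ip_conntrack_ecache *ecache;

	local_bh_disable();
	ecache = &__get_cpu_var(ip_conntrack_ecache);
	if (ct != ecache->ct)
		__ip_ct_event_cache_init(ct);
	ecache->events |= event;
	local_bh_enable();
}
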
/* flush the event cache - touches other CPUs' data and must not be called
 * while packets are still passing through the code */
static void ip_ct_event_cache_flush(void)
{
	struct ip_conntrack_ecache *ecache;
	int cpu;

	for_each_cpu(cpu) {
		ecache = &per_cpu(ip_conntrack_ecache, cpu);
		if (ecache->ct)
			ip_conntrack_put(ecache->ct);
	}
}
#else
static inline void ip_ct_event_cache_flush(void) {}
#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */

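/*
 * Example (hypothetical consumer, not part of this file): delivery above is
 * a plain notifier_call_chain() on ip_conntrack_chain, so a listener is just
 * a notifier_block.  'events' carries the cached IPCT_* bitmask and 'ptr'
 * the struct ip_conntrack itself.
 */
static int my_ct_event(struct notifier_block *self, unsigned long events,
		       void *ptr)
{
	struct ip_conntrack *ct = ptr;

	if (events & IPCT_NEW)
		printk(KERN_DEBUG "conntrack %p: new\n", ct);
	if (events & IPCT_DESTROY)
		printk(KERN_DEBUG "conntrack %p: destroyed\n", ct);
	return NOTIFY_DONE;
}

static struct notifier_block my_ct_notifier = {
	.notifier_call	= my_ct_event,
};
/* registered with something like:
 *	notifier_chain_register(&ip_conntrack_chain, &my_ct_notifier); */
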
DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);

static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;

static u_int32_t
hash_conntrack(const struct ip_conntrack_tuple *tuple)
{
#if 0
	dump_tuple(tuple);
#endif
	return (jhash_3words(tuple->src.ip,
			     (tuple->dst.ip ^ tuple->dst.protonum),
			     (tuple->src.u.all | (tuple->dst.u.all << 16)),
			     ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
}

int
ip_ct_get_tuple(const struct iphdr *iph,
		const struct sk_buff *skb,
		unsigned int dataoff,
		struct ip_conntrack_tuple *tuple,
		const struct ip_conntrack_protocol *protocol)
{
	/* Never happens: fragments are reassembled before tracking. */
	if (iph->frag_off & htons(IP_OFFSET)) {
		printk("ip_conntrack_core: Frag of proto %u.\n",
		       iph->protocol);
		return 0;
	}

	tuple->src.ip = iph->saddr;
	tuple->dst.ip = iph->daddr;
	tuple->dst.protonum = iph->protocol;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	return protocol->pkt_to_tuple(skb, dataoff, tuple);
}

int
ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
		   const struct ip_conntrack_tuple *orig,
		   const struct ip_conntrack_protocol *protocol)
{
	inverse->src.ip = orig->dst.ip;
	inverse->dst.ip = orig->src.ip;
	inverse->dst.protonum = orig->dst.protonum;
	inverse->dst.dir = !orig->dst.dir;

	return protocol->invert_tuple(inverse, orig);
}


/* ip_conntrack_expect helper functions */
void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
{
	ASSERT_WRITE_LOCK(&ip_conntrack_lock);
	IP_NF_ASSERT(!timer_pending(&exp->timeout));
	list_del(&exp->list);
	CONNTRACK_STAT_INC(expect_delete);
	exp->master->expecting--;
	ip_conntrack_expect_put(exp);
}

static void expectation_timed_out(unsigned long ul_expect)
{
	struct ip_conntrack_expect *exp = (void *)ul_expect;

	write_lock_bh(&ip_conntrack_lock);
	ip_ct_unlink_expect(exp);
	write_unlock_bh(&ip_conntrack_lock);
	ip_conntrack_expect_put(exp);
}

struct ip_conntrack_expect *
__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_expect *i;

	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
		if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
			atomic_inc(&i->use);
			return i;
		}
	}
	return NULL;
}

/* Just find an expectation corresponding to a tuple. */
struct ip_conntrack_expect *
ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_expect *i;

	read_lock_bh(&ip_conntrack_lock);
	i = __ip_conntrack_expect_find(tuple);
	read_unlock_bh(&ip_conntrack_lock);

	return i;
}

/* If an expectation for this connection is found, it gets deleted from
 * the global list, then returned. */
static struct ip_conntrack_expect *
find_expectation(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_expect *i;

	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
		/* If master is not in hash table yet (ie. packet hasn't left
		   this machine yet), how can other end know about expected?
		   Hence these are not the droids you are looking for (if
		   master ct never got confirmed, we'd hold a reference to it
		   and weird things would happen to future packets). */
		if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
		    && is_confirmed(i->master)) {
			if (i->flags & IP_CT_EXPECT_PERMANENT) {
				atomic_inc(&i->use);
				return i;
			} else if (del_timer(&i->timeout)) {
				ip_ct_unlink_expect(i);
				return i;
			}
		}
	}
	return NULL;
}

/* delete all expectations for this conntrack */
void ip_ct_remove_expectations(struct ip_conntrack *ct)
{
	struct ip_conntrack_expect *i, *tmp;

	/* Optimization: most connections never expect any others. */
	if (ct->expecting == 0)
		return;

	list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
		if (i->master == ct && del_timer(&i->timeout)) {
			ip_ct_unlink_expect(i);
			ip_conntrack_expect_put(i);
		}
	}
}

static void
clean_from_lists(struct ip_conntrack *ct)
{
	unsigned int ho, hr;

	DEBUGP("clean_from_lists(%p)\n", ct);
	ASSERT_WRITE_LOCK(&ip_conntrack_lock);

	ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
	LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
	LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);

	/* Destroy all pending expectations */
	ip_ct_remove_expectations(ct);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
	struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
	struct ip_conntrack_protocol *proto;

	DEBUGP("destroy_conntrack(%p)\n", ct);
	IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
	IP_NF_ASSERT(!timer_pending(&ct->timeout));

	ip_conntrack_event(IPCT_DESTROY, ct);
	set_bit(IPS_DYING_BIT, &ct->status);

	/* To make sure we don't get any weird locking issues here:
	 * destroy_conntrack() MUST NOT be called with a write lock
	 * to ip_conntrack_lock!!! -HW */
	proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
	if (proto && proto->destroy)
		proto->destroy(ct);

	if (ip_conntrack_destroyed)
		ip_conntrack_destroyed(ct);

	write_lock_bh(&ip_conntrack_lock);
	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too. */
	ip_ct_remove_expectations(ct);

	/* We overload first tuple to link into unconfirmed list. */
	if (!is_confirmed(ct)) {
		BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
	}

	CONNTRACK_STAT_INC(delete);
	write_unlock_bh(&ip_conntrack_lock);

	if (ct->master)
		ip_conntrack_put(ct->master);

	DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
	ip_conntrack_free(ct);
}

static void death_by_timeout(unsigned long ul_conntrack)
{
	struct ip_conntrack *ct = (void *)ul_conntrack;

	write_lock_bh(&ip_conntrack_lock);
	/* Inside lock so preempt is disabled on module removal path.
	 * Otherwise we can get spurious warnings. */
	CONNTRACK_STAT_INC(delete_list);
	clean_from_lists(ct);
	write_unlock_bh(&ip_conntrack_lock);
	ip_conntrack_put(ct);
}

static inline int
conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
		    const struct ip_conntrack_tuple *tuple,
		    const struct ip_conntrack *ignored_conntrack)
{
	ASSERT_READ_LOCK(&ip_conntrack_lock);
	return tuplehash_to_ctrack(i) != ignored_conntrack
	       && ip_ct_tuple_equal(tuple, &i->tuple);
}

struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
		    const struct ip_conntrack *ignored_conntrack)
{
	struct ip_conntrack_tuple_hash *h;
	unsigned int hash = hash_conntrack(tuple);

	ASSERT_READ_LOCK(&ip_conntrack_lock);
	list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
		if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
			CONNTRACK_STAT_INC(found);
			return h;
		}
		CONNTRACK_STAT_INC(searched);
	}

	return NULL;
}

/* Find a connection corresponding to a tuple. */
struct ip_conntrack_tuple_hash *
ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
		      const struct ip_conntrack *ignored_conntrack)
{
	struct ip_conntrack_tuple_hash *h;

	read_lock_bh(&ip_conntrack_lock);
	h = __ip_conntrack_find(tuple, ignored_conntrack);
	if (h)
		atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
	read_unlock_bh(&ip_conntrack_lock);

	return h;
}

static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
				       unsigned int hash,
				       unsigned int repl_hash)
{
	ct->id = ++ip_conntrack_next_id;
	list_prepend(&ip_conntrack_hash[hash],
		     &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
	list_prepend(&ip_conntrack_hash[repl_hash],
		     &ct->tuplehash[IP_CT_DIR_REPLY].list);
}

void ip_conntrack_hash_insert(struct ip_conntrack *ct)
{
	unsigned int hash, repl_hash;

	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	write_lock_bh(&ip_conntrack_lock);
	__ip_conntrack_hash_insert(ct, hash, repl_hash);
	write_unlock_bh(&ip_conntrack_lock);
}

/* Confirm a connection given skb; places it in hash table */
int
__ip_conntrack_confirm(struct sk_buff **pskb)
{
	unsigned int hash, repl_hash;
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;

	ct = ip_conntrack_get(*pskb, &ctinfo);

	/* ipt_REJECT uses ip_conntrack_attach to attach related
	   ICMP/TCP RST packets in other direction.  Actual packet
	   which created connection will be IP_CT_NEW or for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	/* We're not in hash table, and we refuse to set up related
	   connections for unconfirmed conns.  But packet copies and
	   REJECT will give spurious warnings here. */
	/* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

	/* No external references means no one else could have
	   confirmed us. */
	IP_NF_ASSERT(!is_confirmed(ct));
	DEBUGP("Confirming conntrack %p\n", ct);

	write_lock_bh(&ip_conntrack_lock);

	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
	if (!LIST_FIND(&ip_conntrack_hash[hash],
		       conntrack_tuple_cmp,
		       struct ip_conntrack_tuple_hash *,
		       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
	    && !LIST_FIND(&ip_conntrack_hash[repl_hash],
			  conntrack_tuple_cmp,
			  struct ip_conntrack_tuple_hash *,
			  &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
		/* Remove from unconfirmed list */
		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);

		__ip_conntrack_hash_insert(ct, hash, repl_hash);
		/* Timer relative to confirmation time, not original
		   setting time, otherwise we'd get timer wrap in
		   weird delay cases. */
		ct->timeout.expires += jiffies;
		add_timer(&ct->timeout);
		atomic_inc(&ct->ct_general.use);
		set_bit(IPS_CONFIRMED_BIT, &ct->status);
		CONNTRACK_STAT_INC(insert);
		write_unlock_bh(&ip_conntrack_lock);
		if (ct->helper)
			ip_conntrack_event_cache(IPCT_HELPER, *pskb);
#ifdef CONFIG_IP_NF_NAT_NEEDED
		if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
		    test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
			ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
#endif
		ip_conntrack_event_cache(master_ct(ct) ?
					 IPCT_RELATED : IPCT_NEW, *pskb);

		return NF_ACCEPT;
	}

	CONNTRACK_STAT_INC(insert_failed);
	write_unlock_bh(&ip_conntrack_lock);

	return NF_DROP;
}

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
			 const struct ip_conntrack *ignored_conntrack)
{
	struct ip_conntrack_tuple_hash *h;

	read_lock_bh(&ip_conntrack_lock);
	h = __ip_conntrack_find(tuple, ignored_conntrack);
	read_unlock_bh(&ip_conntrack_lock);

	return h != NULL;
}

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
{
	return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
}

static int early_drop(struct list_head *chain)
{
	/* Traverse backwards: gives us oldest, which is roughly LRU */
	struct ip_conntrack_tuple_hash *h;
	struct ip_conntrack *ct = NULL;
	int dropped = 0;

	read_lock_bh(&ip_conntrack_lock);
	h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
	if (h) {
		ct = tuplehash_to_ctrack(h);
		atomic_inc(&ct->ct_general.use);
	}
	read_unlock_bh(&ip_conntrack_lock);

	if (!ct)
		return dropped;

	if (del_timer(&ct->timeout)) {
		death_by_timeout((unsigned long)ct);
		dropped = 1;
		CONNTRACK_STAT_INC(early_drop);
	}
	ip_conntrack_put(ct);
	return dropped;
}

static inline int helper_cmp(const struct ip_conntrack_helper *i,
			     const struct ip_conntrack_tuple *rtuple)
{
	return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}

static struct ip_conntrack_helper *
__ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
{
	return LIST_FIND(&helpers, helper_cmp,
			 struct ip_conntrack_helper *,
			 tuple);
}

struct ip_conntrack_helper *
ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_helper *helper;

	/* need ip_conntrack_lock to assure that helper exists until
	 * try_module_get() is called */
	read_lock_bh(&ip_conntrack_lock);

	helper = __ip_conntrack_helper_find(tuple);
	if (helper) {
		/* need to increase module usage count to assure helper will
		 * not go away while the caller is e.g. busy putting a
		 * conntrack in the hash that uses the helper */
		if (!try_module_get(helper->me))
			helper = NULL;
	}

	read_unlock_bh(&ip_conntrack_lock);

	return helper;
}

void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
{
	module_put(helper->me);
}

struct ip_conntrack_protocol *
__ip_conntrack_proto_find(u_int8_t protocol)
{
	return ip_ct_protos[protocol];
}

/* this is guaranteed to always return a valid protocol helper, since
 * it falls back to generic_protocol */
struct ip_conntrack_protocol *
ip_conntrack_proto_find_get(u_int8_t protocol)
{
	struct ip_conntrack_protocol *p;

	preempt_disable();
	p = __ip_conntrack_proto_find(protocol);
	if (p) {
		if (!try_module_get(p->me))
			p = &ip_conntrack_generic_protocol;
	}
	preempt_enable();

	return p;
}

void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
{
	module_put(p->me);
}

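/*
 * Example (hypothetical caller, for illustration): the _find_get/_put pair
 * pins the protocol tracker's module for the duration of use, falling back
 * to the generic tracker if the module is on its way out.
 */
static void example_use_tcp_tracker(void)
{
	struct ip_conntrack_protocol *proto;

	proto = ip_conntrack_proto_find_get(IPPROTO_TCP);
	printk(KERN_DEBUG "TCP is tracked by %s\n", proto->name);
	ip_conntrack_proto_put(proto);
}
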
struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
					struct ip_conntrack_tuple *repl)
{
	struct ip_conntrack *conntrack;

	if (!ip_conntrack_hash_rnd_initted) {
		get_random_bytes(&ip_conntrack_hash_rnd, 4);
		ip_conntrack_hash_rnd_initted = 1;
	}

	if (ip_conntrack_max
	    && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
		unsigned int hash = hash_conntrack(orig);
		/* Try dropping from this hash chain. */
		if (!early_drop(&ip_conntrack_hash[hash])) {
			if (net_ratelimit())
				printk(KERN_WARNING
				       "ip_conntrack: table full, dropping"
				       " packet.\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
	if (!conntrack) {
		DEBUGP("Can't allocate conntrack.\n");
		return ERR_PTR(-ENOMEM);
	}

	memset(conntrack, 0, sizeof(*conntrack));
	atomic_set(&conntrack->ct_general.use, 1);
	conntrack->ct_general.destroy = destroy_conntrack;
	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
	/* Don't set timer yet: wait for confirmation */
	init_timer(&conntrack->timeout);
	conntrack->timeout.data = (unsigned long)conntrack;
	conntrack->timeout.function = death_by_timeout;

	atomic_inc(&ip_conntrack_count);

	return conntrack;
}

void
ip_conntrack_free(struct ip_conntrack *conntrack)
{
	atomic_dec(&ip_conntrack_count);
	kmem_cache_free(ip_conntrack_cachep, conntrack);
}

/* Allocate a new conntrack: we return -ENOMEM if classification
 * failed due to stress.  Otherwise it really is unclassifiable. */
static struct ip_conntrack_tuple_hash *
init_conntrack(struct ip_conntrack_tuple *tuple,
	       struct ip_conntrack_protocol *protocol,
	       struct sk_buff *skb)
{
	struct ip_conntrack *conntrack;
	struct ip_conntrack_tuple repl_tuple;
	struct ip_conntrack_expect *exp;

	if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
		DEBUGP("Can't invert tuple.\n");
		return NULL;
	}

	conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
	if (conntrack == NULL || IS_ERR(conntrack))
		return (struct ip_conntrack_tuple_hash *)conntrack;

	if (!protocol->new(conntrack, skb)) {
		ip_conntrack_free(conntrack);
		return NULL;
	}

	write_lock_bh(&ip_conntrack_lock);
	exp = find_expectation(tuple);

	if (exp) {
		DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
		       conntrack, exp);
		/* Welcome, Mr. Bond.  We've been expecting you... */
		__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
		conntrack->master = exp->master;
#ifdef CONFIG_IP_NF_CONNTRACK_MARK
		conntrack->mark = exp->master->mark;
#endif
#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
    defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
		/* this is ugly, but there is no other place to put it */
		conntrack->nat.masq_index = exp->master->nat.masq_index;
#endif
		nf_conntrack_get(&conntrack->master->ct_general);
		CONNTRACK_STAT_INC(expect_new);
	} else {
		conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);

		CONNTRACK_STAT_INC(new);
	}

	/* Overload tuple linked list to put us in unconfirmed list. */
	list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);

	write_unlock_bh(&ip_conntrack_lock);

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(conntrack, exp);
		ip_conntrack_expect_put(exp);
	}

	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct ip_conntrack *
resolve_normal_ct(struct sk_buff *skb,
		  struct ip_conntrack_protocol *proto,
		  int *set_reply,
		  unsigned int hooknum,
		  enum ip_conntrack_info *ctinfo)
{
	struct ip_conntrack_tuple tuple;
	struct ip_conntrack_tuple_hash *h;
	struct ip_conntrack *ct;

	IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

	if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
			     &tuple, proto))
		return NULL;

	/* look for tuple match */
	h = ip_conntrack_find_get(&tuple, NULL);
	if (!h) {
		h = init_conntrack(&tuple, proto, skb);
		if (!h)
			return NULL;
		if (IS_ERR(h))
			return (void *)h;
	}
	ct = tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (DIRECTION(h) == IP_CT_DIR_REPLY) {
		*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
		/* Please set reply bit if this packet OK */
		*set_reply = 1;
	} else {
		/* Once we've had two way comms, always ESTABLISHED. */
		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
			DEBUGP("ip_conntrack_in: normal packet for %p\n",
			       ct);
			*ctinfo = IP_CT_ESTABLISHED;
		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
			DEBUGP("ip_conntrack_in: related packet for %p\n",
			       ct);
			*ctinfo = IP_CT_RELATED;
		} else {
			DEBUGP("ip_conntrack_in: new packet for %p\n",
			       ct);
			*ctinfo = IP_CT_NEW;
		}
		*set_reply = 0;
	}
	skb->nfct = &ct->ct_general;
	skb->nfctinfo = *ctinfo;
	return ct;
}

/* Netfilter hook itself. */
unsigned int ip_conntrack_in(unsigned int hooknum,
			     struct sk_buff **pskb,
			     const struct net_device *in,
			     const struct net_device *out,
			     int (*okfn)(struct sk_buff *))
{
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;
	struct ip_conntrack_protocol *proto;
	int set_reply = 0;
	int ret;

	/* Previously seen (loopback or untracked)?  Ignore. */
	if ((*pskb)->nfct) {
		CONNTRACK_STAT_INC(ignore);
		return NF_ACCEPT;
	}

	/* Never happens */
	if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
		if (net_ratelimit()) {
			printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
			       (*pskb)->nh.iph->protocol, hooknum);
		}
		return NF_DROP;
	}

/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
	/* Ignore broadcast: no `connection'. */
	if ((*pskb)->pkt_type == PACKET_BROADCAST) {
		printk("Broadcast packet!\n");
		return NF_ACCEPT;
	} else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
		   == htonl(0x000000FF)) {
		printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
		       NIPQUAD((*pskb)->nh.iph->saddr),
		       NIPQUAD((*pskb)->nh.iph->daddr),
		       (*pskb)->sk, (*pskb)->pkt_type);
	}
#endif

	proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);

	/* It may be a special packet, error, unclean...
	 * inverse of the return code tells the netfilter
	 * core what to do with the packet. */
	if (proto->error != NULL
	    && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
		CONNTRACK_STAT_INC(error);
		CONNTRACK_STAT_INC(invalid);
		return -ret;
	}

	if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum,
				     &ctinfo))) {
		/* Not valid part of a connection */
		CONNTRACK_STAT_INC(invalid);
		return NF_ACCEPT;
	}

	if (IS_ERR(ct)) {
		/* Too stressed to deal. */
		CONNTRACK_STAT_INC(drop);
		return NF_DROP;
	}

	IP_NF_ASSERT((*pskb)->nfct);

	ret = proto->packet(ct, *pskb, ctinfo);
	if (ret < 0) {
		/* Invalid: inverse of the return code tells
		 * the netfilter core what to do */
		nf_conntrack_put((*pskb)->nfct);
		(*pskb)->nfct = NULL;
		CONNTRACK_STAT_INC(invalid);
		return -ret;
	}

	if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
		ip_conntrack_event_cache(IPCT_STATUS, *pskb);

	return ret;
}

int invert_tuplepr(struct ip_conntrack_tuple *inverse,
		   const struct ip_conntrack_tuple *orig)
{
	return ip_ct_invert_tuple(inverse, orig,
				  __ip_conntrack_proto_find(orig->dst.protonum));
}

/* Would two expected things clash? */
static inline int expect_clash(const struct ip_conntrack_expect *a,
			       const struct ip_conntrack_expect *b)
{
	/* Part covered by intersection of masks must be unequal,
	   otherwise they clash */
	struct ip_conntrack_tuple intersect_mask
		= { { a->mask.src.ip & b->mask.src.ip,
		      { a->mask.src.u.all & b->mask.src.u.all } },
		    { a->mask.dst.ip & b->mask.dst.ip,
		      { a->mask.dst.u.all & b->mask.dst.u.all },
		      a->mask.dst.protonum & b->mask.dst.protonum } };

	return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
}

static inline int expect_matches(const struct ip_conntrack_expect *a,
				 const struct ip_conntrack_expect *b)
{
	return a->master == b->master
	       && ip_ct_tuple_equal(&a->tuple, &b->tuple)
	       && ip_ct_tuple_equal(&a->mask, &b->mask);
}

/* Generally a bad idea to call this: could have matched already. */
void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
{
	struct ip_conntrack_expect *i;

	write_lock_bh(&ip_conntrack_lock);
	/* choose the oldest expectation to evict */
	list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
		if (expect_matches(i, exp) && del_timer(&i->timeout)) {
			ip_ct_unlink_expect(i);
			write_unlock_bh(&ip_conntrack_lock);
			ip_conntrack_expect_put(i);
			return;
		}
	}
	write_unlock_bh(&ip_conntrack_lock);
}

/* We don't increase the master conntrack refcount for non-fulfilled
 * conntracks.  During the conntrack destruction, the expectations are
 * always killed before the conntrack itself. */
struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
{
	struct ip_conntrack_expect *new;

	new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
	if (!new) {
		DEBUGP("expect_related: OOM allocating expect\n");
		return NULL;
	}
	new->master = me;
	atomic_set(&new->use, 1);
	return new;
}

void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
	if (atomic_dec_and_test(&exp->use))
		kmem_cache_free(ip_conntrack_expect_cachep, exp);
}

static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
{
	atomic_inc(&exp->use);
	exp->master->expecting++;
	list_add(&exp->list, &ip_conntrack_expect_list);

	init_timer(&exp->timeout);
	exp->timeout.data = (unsigned long)exp;
	exp->timeout.function = expectation_timed_out;
	exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
	add_timer(&exp->timeout);

	exp->id = ++ip_conntrack_expect_next_id;
	atomic_inc(&exp->use);
	CONNTRACK_STAT_INC(expect_create);
}

/* Race with expectations being used means we could have none to find; OK. */
static void evict_oldest_expect(struct ip_conntrack *master)
{
	struct ip_conntrack_expect *i;

	list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
		if (i->master == master) {
			if (del_timer(&i->timeout)) {
				ip_ct_unlink_expect(i);
				ip_conntrack_expect_put(i);
			}
			break;
		}
	}
}

static inline int refresh_timer(struct ip_conntrack_expect *i)
{
	if (!del_timer(&i->timeout))
		return 0;

	i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
	add_timer(&i->timeout);
	return 1;
}

int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
{
	struct ip_conntrack_expect *i;
	int ret;

	DEBUGP("ip_conntrack_expect_related %p\n", expect);
	DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
	DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);

	write_lock_bh(&ip_conntrack_lock);
	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
		if (expect_matches(i, expect)) {
			/* Refresh timer: if it's dying, ignore.. */
			if (refresh_timer(i)) {
				ret = 0;
				goto out;
			}
		} else if (expect_clash(i, expect)) {
			ret = -EBUSY;
			goto out;
		}
	}

	/* Will be over limit? */
	if (expect->master->helper->max_expected &&
	    expect->master->expecting >= expect->master->helper->max_expected)
		evict_oldest_expect(expect->master);

	ip_conntrack_expect_insert(expect);
	ip_conntrack_expect_event(IPEXP_NEW, expect);
	ret = 0;
out:
	write_unlock_bh(&ip_conntrack_lock);
	return ret;
}

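/*
 * Example (hypothetical helper code, not part of this file): taken together,
 * expect_alloc/expect_related/expect_put give protocol helpers a simple
 * pattern.  The port numbers and field values below are illustrative; the
 * in-tree FTP/TFTP helpers follow the same shape.
 */
static int example_expect_data_conn(struct ip_conntrack *ct)
{
	struct ip_conntrack_expect *exp;

	exp = ip_conntrack_expect_alloc(ct);
	if (exp == NULL)
		return NF_DROP;

	/* Expect any source port from the original client to port 20 on
	 * the host that answered (illustrative, FTP-active style). */
	memset(&exp->tuple, 0, sizeof(exp->tuple));
	memset(&exp->mask, 0, sizeof(exp->mask));
	exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
	exp->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip;
	exp->tuple.dst.u.tcp.port = htons(20);
	exp->tuple.dst.protonum = IPPROTO_TCP;
	exp->mask.src.ip = 0xFFFFFFFF;
	exp->mask.dst.ip = 0xFFFFFFFF;
	exp->mask.dst.u.tcp.port = 0xFFFF;
	exp->mask.dst.protonum = 0xFF;
	exp->expectfn = NULL;
	exp->flags = 0;

	if (ip_conntrack_expect_related(exp) != 0) {
		ip_conntrack_expect_put(exp);
		return NF_DROP;
	}
	ip_conntrack_expect_put(exp);	/* drop our alloc reference */
	return NF_ACCEPT;
}
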
/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
   implicitly racy: see __ip_conntrack_confirm */
void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
			      const struct ip_conntrack_tuple *newreply)
{
	write_lock_bh(&ip_conntrack_lock);
	/* Should be unconfirmed, so not in hash table yet */
	IP_NF_ASSERT(!is_confirmed(conntrack));

	DEBUGP("Altering reply tuple of %p to ", conntrack);
	DUMP_TUPLE(newreply);

	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
	if (!conntrack->master && conntrack->expecting == 0)
		conntrack->helper = __ip_conntrack_helper_find(newreply);
	write_unlock_bh(&ip_conntrack_lock);
}

int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
	BUG_ON(me->timeout == 0);
	write_lock_bh(&ip_conntrack_lock);
	list_prepend(&helpers, me);
	write_unlock_bh(&ip_conntrack_lock);

	return 0;
}

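/*
 * Example (hypothetical helper module, not part of this file): registering
 * against TCP port 12345.  The name and port are illustrative; like the
 * in-tree FTP helper, the tuple/mask match on the source port of the reply
 * direction.  The help() signature follows what conntrack invokes for
 * helped connections in this era.
 */
static int my_help(struct sk_buff **pskb, struct ip_conntrack *ct,
		   enum ip_conntrack_info ctinfo)
{
	return NF_ACCEPT;
}

static struct ip_conntrack_helper my_helper;

static int __init my_helper_init(void)
{
	my_helper.name = "myproto";
	my_helper.me = THIS_MODULE;
	my_helper.max_expected = 1;
	my_helper.timeout = 5 * 60;			/* seconds */
	my_helper.tuple.src.u.tcp.port = htons(12345);	/* illustrative */
	my_helper.tuple.dst.protonum = IPPROTO_TCP;
	my_helper.mask.src.u.tcp.port = 0xFFFF;
	my_helper.mask.dst.protonum = 0xFF;
	my_helper.help = my_help;
	return ip_conntrack_helper_register(&my_helper);
}
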
struct ip_conntrack_helper *
__ip_conntrack_helper_find_byname(const char *name)
{
	struct ip_conntrack_helper *h;

	list_for_each_entry(h, &helpers, list) {
		if (!strcmp(h->name, name))
			return h;
	}

	return NULL;
}

static inline int unhelp(struct ip_conntrack_tuple_hash *i,
			 const struct ip_conntrack_helper *me)
{
	if (tuplehash_to_ctrack(i)->helper == me) {
		ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
		tuplehash_to_ctrack(i)->helper = NULL;
	}
	return 0;
}

void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
	unsigned int i;
	struct ip_conntrack_expect *exp, *tmp;

	/* Need write lock here, to delete helper. */
	write_lock_bh(&ip_conntrack_lock);
	LIST_DELETE(&helpers, me);

	/* Get rid of expectations */
	list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
		if (exp->master->helper == me && del_timer(&exp->timeout)) {
			ip_ct_unlink_expect(exp);
			ip_conntrack_expect_put(exp);
		}
	}
	/* Get rid of expecteds, set helpers to NULL. */
	LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
	for (i = 0; i < ip_conntrack_htable_size; i++)
		LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
			    struct ip_conntrack_tuple_hash *, me);
	write_unlock_bh(&ip_conntrack_lock);

	/* Someone could still be looking at the helper in a bh. */
	synchronize_net();
}

/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
void __ip_ct_refresh_acct(struct ip_conntrack *ct,
			  enum ip_conntrack_info ctinfo,
			  const struct sk_buff *skb,
			  unsigned long extra_jiffies,
			  int do_acct)
{
	int do_event = 0;

	IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
	IP_NF_ASSERT(skb);

	write_lock_bh(&ip_conntrack_lock);

	/* If not in hash table, timer will not be active yet */
	if (!is_confirmed(ct)) {
		ct->timeout.expires = extra_jiffies;
		do_event = 1;
	} else {
		/* Need del_timer for race avoidance (may already be dying). */
		if (del_timer(&ct->timeout)) {
			ct->timeout.expires = jiffies + extra_jiffies;
			add_timer(&ct->timeout);
			do_event = 1;
		}
	}

#ifdef CONFIG_IP_NF_CT_ACCT
	if (do_acct) {
		ct->counters[CTINFO2DIR(ctinfo)].packets++;
		ct->counters[CTINFO2DIR(ctinfo)].bytes +=
						ntohs(skb->nh.iph->tot_len);
	}
#endif

	write_unlock_bh(&ip_conntrack_lock);

	/* must be unlocked when calling event cache */
	if (do_event)
		ip_conntrack_event_cache(IPCT_REFRESH, skb);
}

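/*
 * The deferred do_event above is the crux of the deadlock fix this commit
 * makes: IPCT_REFRESH is cached only after ip_conntrack_lock is released,
 * rather than calling into the notifier chain with the lock held.  Callers
 * are assumed to reach this through thin wrappers in the ip_conntrack
 * header, sketched here for illustration (not part of this file):
 */
static inline void ip_ct_refresh_acct(struct ip_conntrack *ct,
				      enum ip_conntrack_info ctinfo,
				      const struct sk_buff *skb,
				      unsigned long extra_jiffies)
{
	__ip_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies, 1);
}

static inline void ip_ct_refresh(struct ip_conntrack *ct,
				 const struct sk_buff *skb,
				 unsigned long extra_jiffies)
{
	__ip_ct_refresh_acct(ct, 0, skb, extra_jiffies, 0);
}
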
#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
/* Generic function for tcp/udp/sctp/dccp and alike.  This needs to be
 * in ip_conntrack_core, since we don't want the protocols to autoload
 * or depend on ctnetlink */
int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
			       const struct ip_conntrack_tuple *tuple)
{
	NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
		&tuple->src.u.tcp.port);
	NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
		&tuple->dst.u.tcp.port);
	return 0;

nfattr_failure:
	return -1;
}

int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
			       struct ip_conntrack_tuple *t)
{
	if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
		return -EINVAL;

	t->src.u.tcp.port =
		*(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
	t->dst.u.tcp.port =
		*(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);

	return 0;
}
#endif

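/*
 * Example (hypothetical excerpt): a port-based tracker is assumed to wire
 * these two generic converters into its ops table instead of duplicating
 * the attribute handling, the way the in-tree TCP/UDP trackers do.
 */
struct ip_conntrack_protocol my_port_proto = {
	.proto		 = IPPROTO_UDP,
	.name		 = "myproto",
	.tuple_to_nfattr = ip_ct_port_tuple_to_nfattr,
	.nfattr_to_tuple = ip_ct_port_nfattr_to_tuple,
	/* ... pkt_to_tuple, invert_tuple, packet, new, etc. ... */
};
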
/* Returns new sk_buff, or NULL */
struct sk_buff *
ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
{
	skb_orphan(skb);

	local_bh_disable();
	skb = ip_defrag(skb, user);
	local_bh_enable();

	if (skb)
		ip_send_check(skb->nh.iph);
	return skb;
}

/* Used by ipt_REJECT. */
static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;

	/* This ICMP is in reverse direction to the packet which caused it */
	ct = ip_conntrack_get(skb, &ctinfo);

	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
		ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
	else
		ctinfo = IP_CT_RELATED;

	/* Attach to new skbuff, and increment count */
	nskb->nfct = &ct->ct_general;
	nskb->nfctinfo = ctinfo;
	nf_conntrack_get(nskb->nfct);
}

static inline int
do_iter(const struct ip_conntrack_tuple_hash *i,
	int (*iter)(struct ip_conntrack *i, void *data),
	void *data)
{
	return iter(tuplehash_to_ctrack(i), data);
}

/* Bring out ya dead! */
static struct ip_conntrack_tuple_hash *
get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
		void *data, unsigned int *bucket)
{
	struct ip_conntrack_tuple_hash *h = NULL;

	write_lock_bh(&ip_conntrack_lock);
	for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
		h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
				struct ip_conntrack_tuple_hash *, iter, data);
		if (h)
			break;
	}
	if (!h)
		h = LIST_FIND_W(&unconfirmed, do_iter,
				struct ip_conntrack_tuple_hash *, iter, data);
	if (h)
		atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
	write_unlock_bh(&ip_conntrack_lock);

	return h;
}

void
ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
{
	struct ip_conntrack_tuple_hash *h;
	unsigned int bucket = 0;

	while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
		struct ip_conntrack *ct = tuplehash_to_ctrack(h);
		/* Time to push up daisies... */
		if (del_timer(&ct->timeout))
			death_by_timeout((unsigned long)ct);
		/* ... else the timer will get him soon. */

		ip_conntrack_put(ct);
	}
}

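/*
 * Example (hypothetical iterator): the iter callback returns nonzero for
 * every conntrack that should be reaped, as kill_all() below does for the
 * flush case.  A selective variant might match on the original source
 * address passed in via 'data':
 */
static int kill_by_src_ip(struct ip_conntrack *i, void *data)
{
	u_int32_t *ip = data;

	return i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip == *ip;
}
/* usage:
 *	u_int32_t ip = htonl(0xc0a80001);	 192.168.0.1
 *	ip_ct_iterate_cleanup(kill_by_src_ip, &ip);
 */
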
/* Fast function for those who don't want to parse /proc (and I don't
   blame them). */
/* Reversing the socket's dst/src point of view gives us the reply
   mapping. */
static int
getorigdst(struct sock *sk, int optval, void __user *user, int *len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_conntrack_tuple_hash *h;
	struct ip_conntrack_tuple tuple;

	IP_CT_TUPLE_U_BLANK(&tuple);
	tuple.src.ip = inet->rcv_saddr;
	tuple.src.u.tcp.port = inet->sport;
	tuple.dst.ip = inet->daddr;
	tuple.dst.u.tcp.port = inet->dport;
	tuple.dst.protonum = IPPROTO_TCP;

	/* We only do TCP at the moment: is there a better way? */
	if (strcmp(sk->sk_prot->name, "TCP")) {
		DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
		return -ENOPROTOOPT;
	}

	if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
		DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
		       *len, sizeof(struct sockaddr_in));
		return -EINVAL;
	}

	h = ip_conntrack_find_get(&tuple, NULL);
	if (h) {
		struct sockaddr_in sin;
		struct ip_conntrack *ct = tuplehash_to_ctrack(h);

		sin.sin_family = AF_INET;
		sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
			.tuple.dst.u.tcp.port;
		sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
			.tuple.dst.ip;

		DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
		       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
		ip_conntrack_put(ct);
		if (copy_to_user(user, &sin, sizeof(sin)) != 0)
			return -EFAULT;
		else
			return 0;
	}
	DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
	       NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
	       NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
	return -ENOENT;
}

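/*
 * Example (user-space code, not kernel code): this surfaces as the
 * SO_ORIGINAL_DST getsockopt.  A transparent proxy recovers the pre-NAT
 * destination of an accepted TCP connection roughly like this
 * (illustrative; error handling trimmed):
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <arpa/inet.h>
 *	#include <linux/netfilter_ipv4.h>	// SO_ORIGINAL_DST
 *	#include <stdio.h>
 *
 *	static void print_original_dst(int fd)
 *	{
 *		struct sockaddr_in dst;
 *		socklen_t len = sizeof(dst);
 *
 *		if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len) == 0)
 *			printf("original destination: %s:%u\n",
 *			       inet_ntoa(dst.sin_addr), ntohs(dst.sin_port));
 *	}
 */
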
static struct nf_sockopt_ops so_getorigdst = {
	.pf		= PF_INET,
	.get_optmin	= SO_ORIGINAL_DST,
	.get_optmax	= SO_ORIGINAL_DST+1,
	.get		= &getorigdst,
};

static int kill_all(struct ip_conntrack *i, void *data)
{
	return 1;
}

static void free_conntrack_hash(void)
{
	if (ip_conntrack_vmalloc)
		vfree(ip_conntrack_hash);
	else
		free_pages((unsigned long)ip_conntrack_hash,
			   get_order(sizeof(struct list_head)
				     * ip_conntrack_htable_size));
}

void ip_conntrack_flush(void)
{
	/* This makes sure all current packets have passed through
	   netfilter framework.  Roll on, two-stage module
	   delete... */
	synchronize_net();

	ip_ct_event_cache_flush();
 i_see_dead_people:
	ip_ct_iterate_cleanup(kill_all, NULL);
	if (atomic_read(&ip_conntrack_count) != 0) {
		schedule();
		goto i_see_dead_people;
	}
	/* wait until all references to ip_conntrack_untracked are dropped */
	while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
		schedule();
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void ip_conntrack_cleanup(void)
{
	ip_ct_attach = NULL;
	ip_conntrack_flush();
	kmem_cache_destroy(ip_conntrack_cachep);
	kmem_cache_destroy(ip_conntrack_expect_cachep);
	free_conntrack_hash();
	nf_unregister_sockopt(&so_getorigdst);
}

static int hashsize;
module_param(hashsize, int, 0400);

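/*
 * The hash table size can therefore only be chosen at module load time
 * (the 0400 permissions make the parameter read-only afterwards), e.g.:
 *
 *	modprobe ip_conntrack hashsize=16384
 */
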
int __init ip_conntrack_init(void)
{
	unsigned int i;
	int ret;

	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
	 * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
	if (hashsize) {
		ip_conntrack_htable_size = hashsize;
	} else {
		ip_conntrack_htable_size
			= (((num_physpages << PAGE_SHIFT) / 16384)
			   / sizeof(struct list_head));
		if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
			ip_conntrack_htable_size = 8192;
		if (ip_conntrack_htable_size < 16)
			ip_conntrack_htable_size = 16;
	}
	ip_conntrack_max = 8 * ip_conntrack_htable_size;

	printk("ip_conntrack version %s (%u buckets, %d max)"
	       " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
	       ip_conntrack_htable_size, ip_conntrack_max,
	       sizeof(struct ip_conntrack));

	ret = nf_register_sockopt(&so_getorigdst);
	if (ret != 0) {
		printk(KERN_ERR "Unable to register netfilter socket option\n");
		return ret;
	}

	/* AK: the hash table is twice as big as needed because it uses
	   list_head.  It would be much nicer for the caches if we could
	   use a single-pointer list head here. */
	ip_conntrack_vmalloc = 0;
	ip_conntrack_hash
		= (void *)__get_free_pages(GFP_KERNEL,
					   get_order(sizeof(struct list_head)
						     * ip_conntrack_htable_size));
	if (!ip_conntrack_hash) {
		ip_conntrack_vmalloc = 1;
		printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
		ip_conntrack_hash = vmalloc(sizeof(struct list_head)
					    * ip_conntrack_htable_size);
	}
	if (!ip_conntrack_hash) {
		printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
		goto err_unreg_sockopt;
	}

	ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
						sizeof(struct ip_conntrack), 0,
						0, NULL, NULL);
	if (!ip_conntrack_cachep) {
		printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
		goto err_free_hash;
	}

	ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
					sizeof(struct ip_conntrack_expect),
					0, 0, NULL, NULL);
	if (!ip_conntrack_expect_cachep) {
		printk(KERN_ERR "Unable to create ip_expect slab cache\n");
		goto err_free_conntrack_slab;
	}

	/* Don't NEED lock here, but good form anyway. */
	write_lock_bh(&ip_conntrack_lock);
	for (i = 0; i < MAX_IP_CT_PROTO; i++)
		ip_ct_protos[i] = &ip_conntrack_generic_protocol;
	/* Sew in builtin protocols. */
	ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
	ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
	ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
	write_unlock_bh(&ip_conntrack_lock);

	for (i = 0; i < ip_conntrack_htable_size; i++)
		INIT_LIST_HEAD(&ip_conntrack_hash[i]);

	/* For use by ipt_REJECT */
	ip_ct_attach = ip_conntrack_attach;

	/* Set up fake conntrack:
	    - to never be deleted, not in any hashes */
	atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
	/*  - and make it look like a confirmed connection */
	set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);

	return ret;

err_free_conntrack_slab:
	kmem_cache_destroy(ip_conntrack_cachep);
err_free_hash:
	free_conntrack_hash();
err_unreg_sockopt:
	nf_unregister_sockopt(&so_getorigdst);

	return -ENOMEM;
}