[NETFILTER]: Add support for permanent expectations
net/ipv4/netfilter/ip_conntrack_core.c
1/* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 *
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
18 * */
19
20#include <linux/config.h>
21#include <linux/types.h>
22#include <linux/icmp.h>
23#include <linux/ip.h>
24#include <linux/netfilter.h>
25#include <linux/netfilter_ipv4.h>
26#include <linux/module.h>
27#include <linux/skbuff.h>
28#include <linux/proc_fs.h>
29#include <linux/vmalloc.h>
30#include <net/checksum.h>
31#include <net/ip.h>
32#include <linux/stddef.h>
33#include <linux/sysctl.h>
34#include <linux/slab.h>
35#include <linux/random.h>
36#include <linux/jhash.h>
37#include <linux/err.h>
38#include <linux/percpu.h>
39#include <linux/moduleparam.h>
40#include <linux/notifier.h>
41
42/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
43 registrations, conntrack timers*/
44#define ASSERT_READ_LOCK(x)
45#define ASSERT_WRITE_LOCK(x)
46
47#include <linux/netfilter_ipv4/ip_conntrack.h>
48#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
49#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
50#include <linux/netfilter_ipv4/ip_conntrack_core.h>
51#include <linux/netfilter_ipv4/listhelp.h>
52
53#define IP_CONNTRACK_VERSION "2.3"
54
55#if 0
56#define DEBUGP printk
57#else
58#define DEBUGP(format, args...)
59#endif
60
61DEFINE_RWLOCK(ip_conntrack_lock);
62
63/* ip_conntrack_standalone needs this */
64atomic_t ip_conntrack_count = ATOMIC_INIT(0);
65
66void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
67LIST_HEAD(ip_conntrack_expect_list);
68struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
69static LIST_HEAD(helpers);
70unsigned int ip_conntrack_htable_size = 0;
71int ip_conntrack_max;
72struct list_head *ip_conntrack_hash;
73static kmem_cache_t *ip_conntrack_cachep __read_mostly;
74static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
75struct ip_conntrack ip_conntrack_untracked;
76unsigned int ip_ct_log_invalid;
77static LIST_HEAD(unconfirmed);
78static int ip_conntrack_vmalloc;
79
80static unsigned int ip_conntrack_next_id = 1;
81static unsigned int ip_conntrack_expect_next_id = 1;
82#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
83struct notifier_block *ip_conntrack_chain;
84struct notifier_block *ip_conntrack_expect_chain;
85
86DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
87
88/* deliver cached events and clear cache entry - must be called with locally
89 * disabled softirqs */
90static inline void
91__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
92{
93 DEBUGP("ecache: delivering events for %p\n", ecache->ct);
94 if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
95 notifier_call_chain(&ip_conntrack_chain, ecache->events,
96 ecache->ct);
97 ecache->events = 0;
98 ip_conntrack_put(ecache->ct);
99 ecache->ct = NULL;
100}
101
102/* Deliver all cached events for a particular conntrack. This is called
103 * by code prior to async packet handling or freeing the skb */
104void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
105{
106 struct ip_conntrack_ecache *ecache;
107
108 local_bh_disable();
109 ecache = &__get_cpu_var(ip_conntrack_ecache);
110 if (ecache->ct == ct)
111 __ip_ct_deliver_cached_events(ecache);
112 local_bh_enable();
113}
114
115void __ip_ct_event_cache_init(struct ip_conntrack *ct)
116{
117 struct ip_conntrack_ecache *ecache;
118
119 /* take care of delivering potentially old events */
120 ecache = &__get_cpu_var(ip_conntrack_ecache);
121 BUG_ON(ecache->ct == ct);
122 if (ecache->ct)
123 __ip_ct_deliver_cached_events(ecache);
124 /* initialize for this conntrack/packet */
125 ecache->ct = ct;
126 nf_conntrack_get(&ct->ct_general);
127}
128
129/* flush the event cache - touches other CPUs' data and must not be called while
130 * packets are still passing through the code */
131static void ip_ct_event_cache_flush(void)
132{
133 struct ip_conntrack_ecache *ecache;
134 int cpu;
135
136 for_each_cpu(cpu) {
137 ecache = &per_cpu(ip_conntrack_ecache, cpu);
138 if (ecache->ct)
139 ip_conntrack_put(ecache->ct);
140 }
141}
142#else
143static inline void ip_ct_event_cache_flush(void) {}
144#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
145
146DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
147
148static int ip_conntrack_hash_rnd_initted;
149static unsigned int ip_conntrack_hash_rnd;
150
151static u_int32_t
152hash_conntrack(const struct ip_conntrack_tuple *tuple)
153{
154#if 0
155 dump_tuple(tuple);
156#endif
157 return (jhash_3words(tuple->src.ip,
158 (tuple->dst.ip ^ tuple->dst.protonum),
159 (tuple->src.u.all | (tuple->dst.u.all << 16)),
160 ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
161}
162
163int
164ip_ct_get_tuple(const struct iphdr *iph,
165 const struct sk_buff *skb,
166 unsigned int dataoff,
167 struct ip_conntrack_tuple *tuple,
168 const struct ip_conntrack_protocol *protocol)
169{
170 /* Never happen */
171 if (iph->frag_off & htons(IP_OFFSET)) {
172 printk("ip_conntrack_core: Frag of proto %u.\n",
173 iph->protocol);
174 return 0;
175 }
176
177 tuple->src.ip = iph->saddr;
178 tuple->dst.ip = iph->daddr;
179 tuple->dst.protonum = iph->protocol;
180 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
181
182 return protocol->pkt_to_tuple(skb, dataoff, tuple);
183}
184
185int
186ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
187 const struct ip_conntrack_tuple *orig,
188 const struct ip_conntrack_protocol *protocol)
189{
190 inverse->src.ip = orig->dst.ip;
191 inverse->dst.ip = orig->src.ip;
192 inverse->dst.protonum = orig->dst.protonum;
193 inverse->dst.dir = !orig->dst.dir;
194
195 return protocol->invert_tuple(inverse, orig);
196}
197
198
199/* ip_conntrack_expect helper functions */
200static void unlink_expect(struct ip_conntrack_expect *exp)
201{
202 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
203 IP_NF_ASSERT(!timer_pending(&exp->timeout));
204 list_del(&exp->list);
205 CONNTRACK_STAT_INC(expect_delete);
206 exp->master->expecting--;
207 ip_conntrack_expect_put(exp);
208}
209
210void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp)
211{
212 unlink_expect(exp);
213 ip_conntrack_expect_put(exp);
214}
215
216static void expectation_timed_out(unsigned long ul_expect)
217{
218 struct ip_conntrack_expect *exp = (void *)ul_expect;
219
220 write_lock_bh(&ip_conntrack_lock);
221 unlink_expect(exp);
222 write_unlock_bh(&ip_conntrack_lock);
223 ip_conntrack_expect_put(exp);
224}
225
226struct ip_conntrack_expect *
227__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
228{
229 struct ip_conntrack_expect *i;
230
231 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
232 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
233 atomic_inc(&i->use);
234 return i;
235 }
236 }
237 return NULL;
238}
239
240/* Just find an expectation corresponding to a tuple. */
241struct ip_conntrack_expect *
242ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
243{
244 struct ip_conntrack_expect *i;
245
246 read_lock_bh(&ip_conntrack_lock);
247 i = __ip_conntrack_expect_find(tuple);
248 read_unlock_bh(&ip_conntrack_lock);
249
250 return i;
251}
252
253/* If an expectation for this connection is found, it gets deleted from
254 * the global list, then returned. */
255static struct ip_conntrack_expect *
256find_expectation(const struct ip_conntrack_tuple *tuple)
257{
258 struct ip_conntrack_expect *i;
259
260 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
261 /* If master is not in hash table yet (ie. packet hasn't left
262 this machine yet), how can other end know about expected?
263 Hence these are not the droids you are looking for (if
264 master ct never got confirmed, we'd hold a reference to it
265 and weird things would happen to future packets). */
266 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
267 && is_confirmed(i->master)) {
268 if (i->flags & IP_CT_EXPECT_PERMANENT) {
269 atomic_inc(&i->use);
270 return i;
271 } else if (del_timer(&i->timeout)) {
272 unlink_expect(i);
273 return i;
274 }
275 }
276 }
277 return NULL;
278}
279
280/* delete all expectations for this conntrack */
281void ip_ct_remove_expectations(struct ip_conntrack *ct)
282{
283 struct ip_conntrack_expect *i, *tmp;
284
285 /* Optimization: most connections never expect any others. */
286 if (ct->expecting == 0)
287 return;
288
289 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
290 if (i->master == ct && del_timer(&i->timeout)) {
291 unlink_expect(i);
292 ip_conntrack_expect_put(i);
293 }
294 }
295}
296
297static void
298clean_from_lists(struct ip_conntrack *ct)
299{
300 unsigned int ho, hr;
301
302 DEBUGP("clean_from_lists(%p)\n", ct);
303 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
304
305 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
306 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
307 LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
308 LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
309
310 /* Destroy all pending expectations */
311 ip_ct_remove_expectations(ct);
312}
313
314static void
315destroy_conntrack(struct nf_conntrack *nfct)
316{
317 struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
318 struct ip_conntrack_protocol *proto;
319
320 DEBUGP("destroy_conntrack(%p)\n", ct);
321 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
322 IP_NF_ASSERT(!timer_pending(&ct->timeout));
323
324 ip_conntrack_event(IPCT_DESTROY, ct);
325 set_bit(IPS_DYING_BIT, &ct->status);
326
327 /* To make sure we don't get any weird locking issues here:
328 * destroy_conntrack() MUST NOT be called with a write lock
329 * to ip_conntrack_lock!!! -HW */
330 proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
331 if (proto && proto->destroy)
332 proto->destroy(ct);
333
334 if (ip_conntrack_destroyed)
335 ip_conntrack_destroyed(ct);
336
337 write_lock_bh(&ip_conntrack_lock);
338 /* Expectations will have been removed in clean_from_lists,
339 * except TFTP can create an expectation on the first packet,
340 * before connection is in the list, so we need to clean here,
341 * too. */
342 ip_ct_remove_expectations(ct);
343
344 /* We overload first tuple to link into unconfirmed list. */
345 if (!is_confirmed(ct)) {
346 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
347 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
348 }
349
350 CONNTRACK_STAT_INC(delete);
351 write_unlock_bh(&ip_conntrack_lock);
352
353 if (ct->master)
354 ip_conntrack_put(ct->master);
355
356 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
357 ip_conntrack_free(ct);
358}
359
360static void death_by_timeout(unsigned long ul_conntrack)
361{
362 struct ip_conntrack *ct = (void *)ul_conntrack;
363
364 write_lock_bh(&ip_conntrack_lock);
365 /* Inside lock so preempt is disabled on module removal path.
366 * Otherwise we can get spurious warnings. */
367 CONNTRACK_STAT_INC(delete_list);
368 clean_from_lists(ct);
369 write_unlock_bh(&ip_conntrack_lock);
370 ip_conntrack_put(ct);
371}
372
373static inline int
374conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
375 const struct ip_conntrack_tuple *tuple,
376 const struct ip_conntrack *ignored_conntrack)
377{
378 ASSERT_READ_LOCK(&ip_conntrack_lock);
379 return tuplehash_to_ctrack(i) != ignored_conntrack
380 && ip_ct_tuple_equal(tuple, &i->tuple);
381}
382
383struct ip_conntrack_tuple_hash *
384__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
385 const struct ip_conntrack *ignored_conntrack)
386{
387 struct ip_conntrack_tuple_hash *h;
388 unsigned int hash = hash_conntrack(tuple);
389
390 ASSERT_READ_LOCK(&ip_conntrack_lock);
391 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
392 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
393 CONNTRACK_STAT_INC(found);
394 return h;
395 }
396 CONNTRACK_STAT_INC(searched);
397 }
398
399 return NULL;
400}
401
402/* Find a connection corresponding to a tuple. */
403struct ip_conntrack_tuple_hash *
404ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
405 const struct ip_conntrack *ignored_conntrack)
406{
407 struct ip_conntrack_tuple_hash *h;
408
409 read_lock_bh(&ip_conntrack_lock);
410 h = __ip_conntrack_find(tuple, ignored_conntrack);
411 if (h)
412 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
413 read_unlock_bh(&ip_conntrack_lock);
414
415 return h;
416}
417
418static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
419 unsigned int hash,
420 unsigned int repl_hash)
421{
422 ct->id = ++ip_conntrack_next_id;
423 list_prepend(&ip_conntrack_hash[hash],
424 &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
425 list_prepend(&ip_conntrack_hash[repl_hash],
426 &ct->tuplehash[IP_CT_DIR_REPLY].list);
427}
428
429void ip_conntrack_hash_insert(struct ip_conntrack *ct)
430{
431 unsigned int hash, repl_hash;
432
433 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
434 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
435
436 write_lock_bh(&ip_conntrack_lock);
437 __ip_conntrack_hash_insert(ct, hash, repl_hash);
438 write_unlock_bh(&ip_conntrack_lock);
439}
440
441/* Confirm a connection given skb; places it in hash table */
442int
443__ip_conntrack_confirm(struct sk_buff **pskb)
444{
445 unsigned int hash, repl_hash;
446 struct ip_conntrack *ct;
447 enum ip_conntrack_info ctinfo;
448
449 ct = ip_conntrack_get(*pskb, &ctinfo);
450
451 /* ipt_REJECT uses ip_conntrack_attach to attach related
452 ICMP/TCP RST packets in other direction. Actual packet
453 which created connection will be IP_CT_NEW or for an
454 expected connection, IP_CT_RELATED. */
455 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
456 return NF_ACCEPT;
457
458 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
459 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
460
461 /* We're not in hash table, and we refuse to set up related
462 connections for unconfirmed conns. But packet copies and
463 REJECT will give spurious warnings here. */
464 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
465
466 /* No external references means no one else could have
467 confirmed us. */
468 IP_NF_ASSERT(!is_confirmed(ct));
469 DEBUGP("Confirming conntrack %p\n", ct);
470
471 write_lock_bh(&ip_conntrack_lock);
472
473 /* See if there's one in the list already, including reverse:
474 NAT could have grabbed it without realizing, since we're
475 not in the hash. If there is, we lost the race. */
476 if (!LIST_FIND(&ip_conntrack_hash[hash],
477 conntrack_tuple_cmp,
478 struct ip_conntrack_tuple_hash *,
479 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
480 && !LIST_FIND(&ip_conntrack_hash[repl_hash],
481 conntrack_tuple_cmp,
482 struct ip_conntrack_tuple_hash *,
483 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
484 /* Remove from unconfirmed list */
485 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
486
487 __ip_conntrack_hash_insert(ct, hash, repl_hash);
488 /* Timer relative to confirmation time, not original
489 setting time, otherwise we'd get timer wrap in
490 weird delay cases. */
491 ct->timeout.expires += jiffies;
492 add_timer(&ct->timeout);
493 atomic_inc(&ct->ct_general.use);
494 set_bit(IPS_CONFIRMED_BIT, &ct->status);
495 CONNTRACK_STAT_INC(insert);
496 write_unlock_bh(&ip_conntrack_lock);
497 if (ct->helper)
498 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
499#ifdef CONFIG_IP_NF_NAT_NEEDED
500 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
501 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
502 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
503#endif
504 ip_conntrack_event_cache(master_ct(ct) ?
505 IPCT_RELATED : IPCT_NEW, *pskb);
506
507 return NF_ACCEPT;
508 }
509
510 CONNTRACK_STAT_INC(insert_failed);
511 write_unlock_bh(&ip_conntrack_lock);
512
513 return NF_DROP;
514}
515
516/* Returns true if a connection corresponds to the tuple (required
517 for NAT). */
518int
519ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
520 const struct ip_conntrack *ignored_conntrack)
521{
522 struct ip_conntrack_tuple_hash *h;
523
524 read_lock_bh(&ip_conntrack_lock);
525 h = __ip_conntrack_find(tuple, ignored_conntrack);
526 read_unlock_bh(&ip_conntrack_lock);
527
528 return h != NULL;
529}
530
531/* There's a small race here where we may free a just-assured
532 connection. Too bad: we're in trouble anyway. */
533static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
534{
535 return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
536}
537
538static int early_drop(struct list_head *chain)
539{
540 /* Traverse backwards: gives us oldest, which is roughly LRU */
541 struct ip_conntrack_tuple_hash *h;
542 struct ip_conntrack *ct = NULL;
543 int dropped = 0;
544
545 read_lock_bh(&ip_conntrack_lock);
546 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
547 if (h) {
548 ct = tuplehash_to_ctrack(h);
549 atomic_inc(&ct->ct_general.use);
550 }
551 read_unlock_bh(&ip_conntrack_lock);
552
553 if (!ct)
554 return dropped;
555
556 if (del_timer(&ct->timeout)) {
557 death_by_timeout((unsigned long)ct);
558 dropped = 1;
559 CONNTRACK_STAT_INC(early_drop);
560 }
561 ip_conntrack_put(ct);
562 return dropped;
563}
564
565static inline int helper_cmp(const struct ip_conntrack_helper *i,
566 const struct ip_conntrack_tuple *rtuple)
567{
568 return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
569}
570
571static struct ip_conntrack_helper *
572__ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
573{
574 return LIST_FIND(&helpers, helper_cmp,
575 struct ip_conntrack_helper *,
576 tuple);
577}
578
579struct ip_conntrack_helper *
580ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
581{
582 struct ip_conntrack_helper *helper;
583
584 /* need ip_conntrack_lock to assure that helper exists until
585 * try_module_get() is called */
586 read_lock_bh(&ip_conntrack_lock);
587
588 helper = __ip_conntrack_helper_find(tuple);
589 if (helper) {
590 /* need to increase module usage count to assure helper will
591 * not go away while the caller is e.g. busy putting a
592 * conntrack in the hash that uses the helper */
593 if (!try_module_get(helper->me))
594 helper = NULL;
595 }
596
597 read_unlock_bh(&ip_conntrack_lock);
598
599 return helper;
600}
601
602void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
603{
604 module_put(helper->me);
605}
606
607struct ip_conntrack_protocol *
608__ip_conntrack_proto_find(u_int8_t protocol)
609{
610 return ip_ct_protos[protocol];
611}
612
613/* this is guaranteed to always return a valid protocol helper, since
614 * it falls back to generic_protocol */
615struct ip_conntrack_protocol *
616ip_conntrack_proto_find_get(u_int8_t protocol)
617{
618 struct ip_conntrack_protocol *p;
619
620 preempt_disable();
621 p = __ip_conntrack_proto_find(protocol);
622 if (p) {
623 if (!try_module_get(p->me))
624 p = &ip_conntrack_generic_protocol;
625 }
626 preempt_enable();
627
628 return p;
629}
630
631void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
632{
633 module_put(p->me);
634}
635
636struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
637 struct ip_conntrack_tuple *repl)
638{
639 struct ip_conntrack *conntrack;
1da177e4
LT
640
641 if (!ip_conntrack_hash_rnd_initted) {
642 get_random_bytes(&ip_conntrack_hash_rnd, 4);
643 ip_conntrack_hash_rnd_initted = 1;
644 }
645
646 if (ip_conntrack_max
647 && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
648 unsigned int hash = hash_conntrack(orig);
649 /* Try dropping from this hash chain. */
650 if (!early_drop(&ip_conntrack_hash[hash])) {
651 if (net_ratelimit())
652 printk(KERN_WARNING
653 "ip_conntrack: table full, dropping"
654 " packet.\n");
655 return ERR_PTR(-ENOMEM);
656 }
657 }
658
659 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
660 if (!conntrack) {
661 DEBUGP("Can't allocate conntrack.\n");
662 return ERR_PTR(-ENOMEM);
663 }
664
665 memset(conntrack, 0, sizeof(*conntrack));
666 atomic_set(&conntrack->ct_general.use, 1);
667 conntrack->ct_general.destroy = destroy_conntrack;
668 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
669 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
670 /* Don't set timer yet: wait for confirmation */
671 init_timer(&conntrack->timeout);
672 conntrack->timeout.data = (unsigned long)conntrack;
673 conntrack->timeout.function = death_by_timeout;
674
675 atomic_inc(&ip_conntrack_count);
676
677 return conntrack;
678}
679
680void
681ip_conntrack_free(struct ip_conntrack *conntrack)
682{
683 atomic_dec(&ip_conntrack_count);
684 kmem_cache_free(ip_conntrack_cachep, conntrack);
685}
686
687/* Allocate a new conntrack: we return -ENOMEM if classification
688 * failed due to stress. Otherwise it really is unclassifiable */
689static struct ip_conntrack_tuple_hash *
690init_conntrack(struct ip_conntrack_tuple *tuple,
691 struct ip_conntrack_protocol *protocol,
692 struct sk_buff *skb)
693{
694 struct ip_conntrack *conntrack;
695 struct ip_conntrack_tuple repl_tuple;
696 struct ip_conntrack_expect *exp;
697
698 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
699 DEBUGP("Can't invert tuple.\n");
700 return NULL;
701 }
702
703 conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
704 if (conntrack == NULL || IS_ERR(conntrack))
705 return (struct ip_conntrack_tuple_hash *)conntrack;
706
707 if (!protocol->new(conntrack, skb)) {
708 ip_conntrack_free(conntrack);
709 return NULL;
710 }
711
712 write_lock_bh(&ip_conntrack_lock);
713 exp = find_expectation(tuple);
714
715 if (exp) {
716 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
717 conntrack, exp);
718 /* Welcome, Mr. Bond. We've been expecting you... */
719 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
720 conntrack->master = exp->master;
721#ifdef CONFIG_IP_NF_CONNTRACK_MARK
722 conntrack->mark = exp->master->mark;
723#endif
724#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
725 defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
726 /* this is ugly, but there is no other place where to put it */
727 conntrack->nat.masq_index = exp->master->nat.masq_index;
728#endif
729 nf_conntrack_get(&conntrack->master->ct_general);
730 CONNTRACK_STAT_INC(expect_new);
731 } else {
732 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
733
734 CONNTRACK_STAT_INC(new);
735 }
736
737 /* Overload tuple linked list to put us in unconfirmed list. */
738 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
739
740 write_unlock_bh(&ip_conntrack_lock);
741
742 if (exp) {
743 if (exp->expectfn)
744 exp->expectfn(conntrack, exp);
745 ip_conntrack_expect_put(exp);
746 }
747
748 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
749}
750
751/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
752static inline struct ip_conntrack *
753resolve_normal_ct(struct sk_buff *skb,
754 struct ip_conntrack_protocol *proto,
755 int *set_reply,
756 unsigned int hooknum,
757 enum ip_conntrack_info *ctinfo)
758{
759 struct ip_conntrack_tuple tuple;
760 struct ip_conntrack_tuple_hash *h;
761 struct ip_conntrack *ct;
762
763 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
764
765 if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
766 &tuple,proto))
767 return NULL;
768
769 /* look for tuple match */
770 h = ip_conntrack_find_get(&tuple, NULL);
771 if (!h) {
772 h = init_conntrack(&tuple, proto, skb);
773 if (!h)
774 return NULL;
775 if (IS_ERR(h))
776 return (void *)h;
777 }
778 ct = tuplehash_to_ctrack(h);
779
780 /* It exists; we have (non-exclusive) reference. */
781 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
782 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
783 /* Please set reply bit if this packet OK */
784 *set_reply = 1;
785 } else {
786 /* Once we've had two way comms, always ESTABLISHED. */
787 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
788 DEBUGP("ip_conntrack_in: normal packet for %p\n",
789 ct);
790 *ctinfo = IP_CT_ESTABLISHED;
791 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
792 DEBUGP("ip_conntrack_in: related packet for %p\n",
793 ct);
794 *ctinfo = IP_CT_RELATED;
795 } else {
796 DEBUGP("ip_conntrack_in: new packet for %p\n",
797 ct);
798 *ctinfo = IP_CT_NEW;
799 }
800 *set_reply = 0;
801 }
802 skb->nfct = &ct->ct_general;
803 skb->nfctinfo = *ctinfo;
804 return ct;
805}
806
807/* Netfilter hook itself. */
808unsigned int ip_conntrack_in(unsigned int hooknum,
809 struct sk_buff **pskb,
810 const struct net_device *in,
811 const struct net_device *out,
812 int (*okfn)(struct sk_buff *))
813{
814 struct ip_conntrack *ct;
815 enum ip_conntrack_info ctinfo;
816 struct ip_conntrack_protocol *proto;
817 int set_reply = 0;
818 int ret;
819
820 /* Previously seen (loopback or untracked)? Ignore. */
821 if ((*pskb)->nfct) {
822 CONNTRACK_STAT_INC(ignore);
823 return NF_ACCEPT;
824 }
825
826 /* Never happen */
827 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
828 if (net_ratelimit()) {
829 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
830 (*pskb)->nh.iph->protocol, hooknum);
831 }
832 return NF_DROP;
833 }
834
835/* Doesn't cover locally-generated broadcast, so not worth it. */
836#if 0
837 /* Ignore broadcast: no `connection'. */
838 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
839 printk("Broadcast packet!\n");
840 return NF_ACCEPT;
841 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
842 == htonl(0x000000FF)) {
843 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
844 NIPQUAD((*pskb)->nh.iph->saddr),
845 NIPQUAD((*pskb)->nh.iph->daddr),
846 (*pskb)->sk, (*pskb)->pkt_type);
847 }
848#endif
849
850 proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
851
852 /* It may be a special packet, error, unclean...
853 * inverse of the return code tells the netfilter
854 * core what to do with the packet. */
855 if (proto->error != NULL
856 && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
857 CONNTRACK_STAT_INC(error);
858 CONNTRACK_STAT_INC(invalid);
859 return -ret;
860 }
861
862 if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
863 /* Not valid part of a connection */
864 CONNTRACK_STAT_INC(invalid);
865 return NF_ACCEPT;
866 }
867
868 if (IS_ERR(ct)) {
869 /* Too stressed to deal. */
870 CONNTRACK_STAT_INC(drop);
871 return NF_DROP;
872 }
873
874 IP_NF_ASSERT((*pskb)->nfct);
875
876 ret = proto->packet(ct, *pskb, ctinfo);
877 if (ret < 0) {
878 /* Invalid: inverse of the return code tells
879 * the netfilter core what to do*/
880 nf_conntrack_put((*pskb)->nfct);
881 (*pskb)->nfct = NULL;
882 CONNTRACK_STAT_INC(invalid);
883 return -ret;
884 }
885
886 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
887 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
888
889 return ret;
890}
891
892int invert_tuplepr(struct ip_conntrack_tuple *inverse,
893 const struct ip_conntrack_tuple *orig)
894{
895 return ip_ct_invert_tuple(inverse, orig,
896 __ip_conntrack_proto_find(orig->dst.protonum));
897}
898
899/* Would two expected things clash? */
900static inline int expect_clash(const struct ip_conntrack_expect *a,
901 const struct ip_conntrack_expect *b)
902{
903 /* Part covered by intersection of masks must be unequal,
904 otherwise they clash */
905 struct ip_conntrack_tuple intersect_mask
906 = { { a->mask.src.ip & b->mask.src.ip,
907 { a->mask.src.u.all & b->mask.src.u.all } },
908 { a->mask.dst.ip & b->mask.dst.ip,
909 { a->mask.dst.u.all & b->mask.dst.u.all },
910 a->mask.dst.protonum & b->mask.dst.protonum } };
911
912 return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
913}
914
915static inline int expect_matches(const struct ip_conntrack_expect *a,
916 const struct ip_conntrack_expect *b)
917{
918 return a->master == b->master
919 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
920 && ip_ct_tuple_equal(&a->mask, &b->mask);
921}
922
923/* Generally a bad idea to call this: could have matched already. */
924void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
925{
926 struct ip_conntrack_expect *i;
927
928 write_lock_bh(&ip_conntrack_lock);
929 /* choose the oldest expectation to evict */
930 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
931 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
932 unlink_expect(i);
933 write_unlock_bh(&ip_conntrack_lock);
934 ip_conntrack_expect_put(i);
935 return;
936 }
937 }
938 write_unlock_bh(&ip_conntrack_lock);
939}
940
941struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
942{
943 struct ip_conntrack_expect *new;
944
945 new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
946 if (!new) {
947 DEBUGP("expect_related: OOM allocating expect\n");
948 return NULL;
949 }
950 new->master = me;
951 atomic_inc(&new->master->ct_general.use);
952 atomic_set(&new->use, 1);
953 return new;
954}
955
956void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
957{
958 if (atomic_dec_and_test(&exp->use)) {
959 ip_conntrack_put(exp->master);
960 kmem_cache_free(ip_conntrack_expect_cachep, exp);
961 }
962}
963
964static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
965{
966 atomic_inc(&exp->use);
967 exp->master->expecting++;
968 list_add(&exp->list, &ip_conntrack_expect_list);
969
970 init_timer(&exp->timeout);
971 exp->timeout.data = (unsigned long)exp;
972 exp->timeout.function = expectation_timed_out;
973 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
974 add_timer(&exp->timeout);
975
976 exp->id = ++ip_conntrack_expect_next_id;
977 atomic_inc(&exp->use);
978 CONNTRACK_STAT_INC(expect_create);
979}
980
981/* Race with expectations being used means we could have none to find; OK. */
982static void evict_oldest_expect(struct ip_conntrack *master)
983{
984 struct ip_conntrack_expect *i;
985
986 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
987 if (i->master == master) {
988 if (del_timer(&i->timeout)) {
989 unlink_expect(i);
990 ip_conntrack_expect_put(i);
991 }
992 break;
993 }
994 }
995}
996
997static inline int refresh_timer(struct ip_conntrack_expect *i)
998{
999 if (!del_timer(&i->timeout))
1000 return 0;
1001
1002 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
1003 add_timer(&i->timeout);
1004 return 1;
1005}
1006
1007int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1008{
1009 struct ip_conntrack_expect *i;
1010 int ret;
1011
1012 DEBUGP("ip_conntrack_expect_related %p\n", expect);
1013 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1014 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
1015
1016 write_lock_bh(&ip_conntrack_lock);
1017 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1018 if (expect_matches(i, expect)) {
1019 /* Refresh timer: if it's dying, ignore.. */
1020 if (refresh_timer(i)) {
1021 ret = 0;
1022 goto out;
1023 }
1024 } else if (expect_clash(i, expect)) {
1025 ret = -EBUSY;
1026 goto out;
1027 }
1028 }
1029
1030 /* Will be over limit? */
1031 if (expect->master->helper->max_expected &&
1032 expect->master->expecting >= expect->master->helper->max_expected)
1033 evict_oldest_expect(expect->master);
1034
1035 ip_conntrack_expect_insert(expect);
1036 ip_conntrack_expect_event(IPEXP_NEW, expect);
1037 ret = 0;
1038out:
1039 write_unlock_bh(&ip_conntrack_lock);
1040 return ret;
1041}
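/* Editor's sketch, not part of the original file: roughly how a conntrack
 * helper would use the API above, including the IP_CT_EXPECT_PERMANENT flag
 * added by this patch so the expectation is not unlinked by
 * find_expectation() on its first match.  "ct", "tuple" and "mask" are
 * assumed placeholders supplied by the helper. */
#if 0
	struct ip_conntrack_expect *exp;
	int ret = NF_ACCEPT;

	exp = ip_conntrack_expect_alloc(ct);
	if (exp == NULL)
		return NF_DROP;
	exp->tuple = tuple;			/* tuple the helper expects */
	exp->mask = mask;			/* fields that must match */
	exp->expectfn = NULL;
	exp->flags = IP_CT_EXPECT_PERMANENT;	/* keep after first match */
	if (ip_conntrack_expect_related(exp) != 0)
		ret = NF_DROP;			/* clashed with an existing expectation */
	ip_conntrack_expect_put(exp);		/* drop the helper's own reference */
	return ret;
#endif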
1042
1043/* Alter reply tuple (maybe alter helper). This is for NAT, and is
1044 implicitly racy: see __ip_conntrack_confirm */
1045void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1046 const struct ip_conntrack_tuple *newreply)
1047{
1048 write_lock_bh(&ip_conntrack_lock);
1049 /* Should be unconfirmed, so not in hash table yet */
1050 IP_NF_ASSERT(!is_confirmed(conntrack));
1051
1052 DEBUGP("Altering reply tuple of %p to ", conntrack);
1053 DUMP_TUPLE(newreply);
1054
1055 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1056 if (!conntrack->master && conntrack->expecting == 0)
1057 conntrack->helper = __ip_conntrack_helper_find(newreply);
1058 write_unlock_bh(&ip_conntrack_lock);
1059}
1060
1061int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1062{
1063 BUG_ON(me->timeout == 0);
1064 write_lock_bh(&ip_conntrack_lock);
1065 list_prepend(&helpers, me);
1066 write_unlock_bh(&ip_conntrack_lock);
1067
1068 return 0;
1069}
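/* Editor's sketch, not part of the original file: the rough shape of a helper
 * a module could pass to ip_conntrack_helper_register().  tuple/mask select
 * the traffic the helper handles, timeout is in seconds (see
 * ip_conntrack_expect_insert above) and max_expected bounds outstanding
 * expectations; the name, port number and help function are made-up
 * placeholders. */
#if 0
static struct ip_conntrack_helper example_helper = {
	.name		= "example",
	.me		= THIS_MODULE,
	.max_expected	= 1,
	.timeout	= 5 * 60,
	.tuple		= { .src = { .u = { .tcp = { .port = __constant_htons(1234) } } },
			    .dst = { .protonum = IPPROTO_TCP } },
	.mask		= { .src = { .u = { .tcp = { .port = __constant_htons(0xFFFF) } } },
			    .dst = { .protonum = 0xFF } },
	.help		= example_help,
};

static int __init example_init(void)
{
	return ip_conntrack_helper_register(&example_helper);
}
#endif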
1070
1071struct ip_conntrack_helper *
1072__ip_conntrack_helper_find_byname(const char *name)
1073{
1074 struct ip_conntrack_helper *h;
1075
1076 list_for_each_entry(h, &helpers, list) {
1077 if (!strcmp(h->name, name))
1078 return h;
1079 }
1080
1081 return NULL;
1082}
1083
1084static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1085 const struct ip_conntrack_helper *me)
1086{
1087 if (tuplehash_to_ctrack(i)->helper == me) {
1088 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1089 tuplehash_to_ctrack(i)->helper = NULL;
1090 }
1091 return 0;
1092}
1093
1094void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1095{
1096 unsigned int i;
1097 struct ip_conntrack_expect *exp, *tmp;
1098
1099 /* Need write lock here, to delete helper. */
1100 write_lock_bh(&ip_conntrack_lock);
1101 LIST_DELETE(&helpers, me);
1102
1103 /* Get rid of expectations */
1104 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1105 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1106 unlink_expect(exp);
1107 ip_conntrack_expect_put(exp);
1108 }
1109 }
1110 /* Get rid of expecteds, set helpers to NULL. */
1111 LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
1112 for (i = 0; i < ip_conntrack_htable_size; i++)
1113 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1114 struct ip_conntrack_tuple_hash *, me);
1115 write_unlock_bh(&ip_conntrack_lock);
1116
1117 /* Someone could be still looking at the helper in a bh. */
1118 synchronize_net();
1119}
1120
1121static inline void ct_add_counters(struct ip_conntrack *ct,
1122 enum ip_conntrack_info ctinfo,
1123 const struct sk_buff *skb)
1124{
1125#ifdef CONFIG_IP_NF_CT_ACCT
1126 if (skb) {
1127 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1128 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1129 ntohs(skb->nh.iph->tot_len);
1130 }
1131#endif
1132}
1133
1134/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
1135void ip_ct_refresh_acct(struct ip_conntrack *ct,
1136 enum ip_conntrack_info ctinfo,
1137 const struct sk_buff *skb,
1138 unsigned long extra_jiffies)
1139{
1140 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1141
1142 /* If not in hash table, timer will not be active yet */
1143 if (!is_confirmed(ct)) {
1144 ct->timeout.expires = extra_jiffies;
1145 ct_add_counters(ct, ctinfo, skb);
1146 } else {
1147 write_lock_bh(&ip_conntrack_lock);
1148 /* Need del_timer for race avoidance (may already be dying). */
1149 if (del_timer(&ct->timeout)) {
1150 ct->timeout.expires = jiffies + extra_jiffies;
1151 add_timer(&ct->timeout);
1152 ip_conntrack_event_cache(IPCT_REFRESH, skb);
1153 }
1154 ct_add_counters(ct, ctinfo, skb);
1155 write_unlock_bh(&ip_conntrack_lock);
1156 }
1157}
1158
1159#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1160 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1161/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
1162 * in ip_conntrack_core, since we don't want the protocols to autoload
1163 * or depend on ctnetlink */
1164int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1165 const struct ip_conntrack_tuple *tuple)
1166{
1167 NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1168 &tuple->src.u.tcp.port);
1169 NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1170 &tuple->dst.u.tcp.port);
1171 return 0;
1172
1173nfattr_failure:
1174 return -1;
1175}
1176
1177int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1178 struct ip_conntrack_tuple *t)
1179{
1180 if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1181 return -EINVAL;
1182
1183 t->src.u.tcp.port =
1184 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1185 t->dst.u.tcp.port =
1186 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1187
1188 return 0;
1189}
1190#endif
1191
1192/* Returns new sk_buff, or NULL */
1193struct sk_buff *
1194ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1195{
1196 skb_orphan(skb);
1197
1198 local_bh_disable();
1199 skb = ip_defrag(skb, user);
1200 local_bh_enable();
1201
1202 if (skb)
1203 ip_send_check(skb->nh.iph);
1204 return skb;
1205}
1206
1207/* Used by ipt_REJECT. */
1208static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1209{
1210 struct ip_conntrack *ct;
1211 enum ip_conntrack_info ctinfo;
1212
1213 /* This ICMP is in reverse direction to the packet which caused it */
1214 ct = ip_conntrack_get(skb, &ctinfo);
1215
1216 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1217 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1218 else
1219 ctinfo = IP_CT_RELATED;
1220
1221 /* Attach to new skbuff, and increment count */
1222 nskb->nfct = &ct->ct_general;
1223 nskb->nfctinfo = ctinfo;
1224 nf_conntrack_get(nskb->nfct);
1225}
1226
1227static inline int
1228do_iter(const struct ip_conntrack_tuple_hash *i,
1229 int (*iter)(struct ip_conntrack *i, void *data),
1230 void *data)
1231{
1232 return iter(tuplehash_to_ctrack(i), data);
1233}
1234
1235/* Bring out ya dead! */
1236static struct ip_conntrack_tuple_hash *
1237get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1238 void *data, unsigned int *bucket)
1239{
1240 struct ip_conntrack_tuple_hash *h = NULL;
1241
1242 write_lock_bh(&ip_conntrack_lock);
1243 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1244 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1245 struct ip_conntrack_tuple_hash *, iter, data);
1246 if (h)
1247 break;
1248 }
1249 if (!h)
1250 h = LIST_FIND_W(&unconfirmed, do_iter,
1251 struct ip_conntrack_tuple_hash *, iter, data);
1252 if (h)
1253 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1254 write_unlock_bh(&ip_conntrack_lock);
1255
1256 return h;
1257}
1258
1259void
1260ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1261{
1262 struct ip_conntrack_tuple_hash *h;
1263 unsigned int bucket = 0;
1264
1265 while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1266 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1267 /* Time to push up daisies... */
1268 if (del_timer(&ct->timeout))
1269 death_by_timeout((unsigned long)ct);
1270 /* ... else the timer will get him soon. */
1271
1272 ip_conntrack_put(ct);
1273 }
1274}
1275
1276/* Fast function for those who don't want to parse /proc (and I don't
1277 blame them). */
1278/* Reversing the socket's dst/src point of view gives us the reply
1279 mapping. */
1280static int
1281getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1282{
1283 struct inet_sock *inet = inet_sk(sk);
1284 struct ip_conntrack_tuple_hash *h;
1285 struct ip_conntrack_tuple tuple;
1286
1287 IP_CT_TUPLE_U_BLANK(&tuple);
1288 tuple.src.ip = inet->rcv_saddr;
1289 tuple.src.u.tcp.port = inet->sport;
1290 tuple.dst.ip = inet->daddr;
1291 tuple.dst.u.tcp.port = inet->dport;
1292 tuple.dst.protonum = IPPROTO_TCP;
1293
1294 /* We only do TCP at the moment: is there a better way? */
1295 if (strcmp(sk->sk_prot->name, "TCP")) {
1296 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1297 return -ENOPROTOOPT;
1298 }
1299
1300 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1301 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1302 *len, sizeof(struct sockaddr_in));
1303 return -EINVAL;
1304 }
1305
1306 h = ip_conntrack_find_get(&tuple, NULL);
1307 if (h) {
1308 struct sockaddr_in sin;
1309 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1310
1311 sin.sin_family = AF_INET;
1312 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1313 .tuple.dst.u.tcp.port;
1314 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1315 .tuple.dst.ip;
1316
1317 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1318 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1319 ip_conntrack_put(ct);
1320 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1321 return -EFAULT;
1322 else
1323 return 0;
1324 }
1325 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1326 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1327 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1328 return -ENOENT;
1329}
1330
1331static struct nf_sockopt_ops so_getorigdst = {
1332 .pf = PF_INET,
1333 .get_optmin = SO_ORIGINAL_DST,
1334 .get_optmax = SO_ORIGINAL_DST+1,
1335 .get = &getorigdst,
1336};
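/* Editor's sketch, not part of the original file: how user space (e.g. a
 * transparent proxy behind an ipt_REDIRECT rule) would consume the
 * SO_ORIGINAL_DST sockopt implemented above.  A real program needs the usual
 * socket headers; "client_fd" and the missing error handling are
 * placeholders. */
#if 0
	struct sockaddr_in orig_dst;
	socklen_t olen = sizeof(orig_dst);

	if (getsockopt(client_fd, SOL_IP, SO_ORIGINAL_DST, &orig_dst, &olen) == 0)
		printf("original destination was %s:%u\n",
		       inet_ntoa(orig_dst.sin_addr), ntohs(orig_dst.sin_port));
#endif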
1337
1338static int kill_all(struct ip_conntrack *i, void *data)
1339{
1340 return 1;
1341}
1342
1343static void free_conntrack_hash(void)
1344{
1345 if (ip_conntrack_vmalloc)
1346 vfree(ip_conntrack_hash);
1347 else
1348 free_pages((unsigned long)ip_conntrack_hash,
1349 get_order(sizeof(struct list_head)
1350 * ip_conntrack_htable_size));
1351}
1352
1353void ip_conntrack_flush(void)
1354{
1355 /* This makes sure all current packets have passed through
1356 netfilter framework. Roll on, two-stage module
1357 delete... */
1358 synchronize_net();
1359
1360 ip_ct_event_cache_flush();
1361 i_see_dead_people:
1362 ip_ct_iterate_cleanup(kill_all, NULL);
1363 if (atomic_read(&ip_conntrack_count) != 0) {
1364 schedule();
1365 goto i_see_dead_people;
1366 }
1367 /* wait until all references to ip_conntrack_untracked are dropped */
1368 while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1369 schedule();
1370}
1371
1372/* Mishearing the voices in his head, our hero wonders how he's
1373 supposed to kill the mall. */
1374void ip_conntrack_cleanup(void)
1375{
1376 ip_ct_attach = NULL;
1377 ip_conntrack_flush();
1378 kmem_cache_destroy(ip_conntrack_cachep);
1379 kmem_cache_destroy(ip_conntrack_expect_cachep);
1380 free_conntrack_hash();
1381 nf_unregister_sockopt(&so_getorigdst);
1382}
1383
1384static int hashsize;
1385module_param(hashsize, int, 0400);
1386
1387int __init ip_conntrack_init(void)
1388{
1389 unsigned int i;
1390 int ret;
1391
1392 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1393 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1394 if (hashsize) {
1395 ip_conntrack_htable_size = hashsize;
1396 } else {
1397 ip_conntrack_htable_size
1398 = (((num_physpages << PAGE_SHIFT) / 16384)
1399 / sizeof(struct list_head));
1400 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1401 ip_conntrack_htable_size = 8192;
1402 if (ip_conntrack_htable_size < 16)
1403 ip_conntrack_htable_size = 16;
1404 }
1405 ip_conntrack_max = 8 * ip_conntrack_htable_size;
1406
1407 printk("ip_conntrack version %s (%u buckets, %d max)"
1408 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1409 ip_conntrack_htable_size, ip_conntrack_max,
1410 sizeof(struct ip_conntrack));
1411
1412 ret = nf_register_sockopt(&so_getorigdst);
1413 if (ret != 0) {
1414 printk(KERN_ERR "Unable to register netfilter socket option\n");
1415 return ret;
1416 }
1417
1418 /* AK: the hash table is twice as big as needed because it
1419 uses list_head. It would be much nicer for caches to use a
1420 single-pointer list head here. */
1421 ip_conntrack_vmalloc = 0;
1422 ip_conntrack_hash
1423 =(void*)__get_free_pages(GFP_KERNEL,
1424 get_order(sizeof(struct list_head)
1425 *ip_conntrack_htable_size));
1426 if (!ip_conntrack_hash) {
1427 ip_conntrack_vmalloc = 1;
1428 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1429 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1430 * ip_conntrack_htable_size);
1431 }
1432 if (!ip_conntrack_hash) {
1433 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1434 goto err_unreg_sockopt;
1435 }
1436
1437 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1438 sizeof(struct ip_conntrack), 0,
1439 0, NULL, NULL);
1440 if (!ip_conntrack_cachep) {
1441 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1442 goto err_free_hash;
1443 }
1444
1445 ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1446 sizeof(struct ip_conntrack_expect),
1447 0, 0, NULL, NULL);
1448 if (!ip_conntrack_expect_cachep) {
1449 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1450 goto err_free_conntrack_slab;
1451 }
1452
1453 /* Don't NEED lock here, but good form anyway. */
1454 write_lock_bh(&ip_conntrack_lock);
1455 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1456 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1457 /* Sew in builtin protocols. */
1458 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1459 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1460 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1461 write_unlock_bh(&ip_conntrack_lock);
1462
1463 for (i = 0; i < ip_conntrack_htable_size; i++)
1464 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1465
1466 /* For use by ipt_REJECT */
1467 ip_ct_attach = ip_conntrack_attach;
1468
1469 /* Set up fake conntrack:
1470 - to never be deleted, not in any hashes */
1471 atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1472 /* - and make it look like a confirmed connection */
1473 set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1474
1475 return ret;
1476
1477err_free_conntrack_slab:
1478 kmem_cache_destroy(ip_conntrack_cachep);
1479err_free_hash:
1480 free_conntrack_hash();
1481err_unreg_sockopt:
1482 nf_unregister_sockopt(&so_getorigdst);
1483
1484 return -ENOMEM;
1485}