net/ipv4/netfilter/ip_conntrack_core.c
1 /* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 *
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
18 * */
19
20 #include <linux/config.h>
21 #include <linux/types.h>
22 #include <linux/icmp.h>
23 #include <linux/ip.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/module.h>
27 #include <linux/skbuff.h>
28 #include <linux/proc_fs.h>
29 #include <linux/vmalloc.h>
30 #include <net/checksum.h>
31 #include <net/ip.h>
32 #include <linux/stddef.h>
33 #include <linux/sysctl.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/jhash.h>
37 #include <linux/err.h>
38 #include <linux/percpu.h>
39 #include <linux/moduleparam.h>
40 #include <linux/notifier.h>
41
42 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
43 registrations, conntrack timers. */
44 #define ASSERT_READ_LOCK(x)
45 #define ASSERT_WRITE_LOCK(x)
46
47 #include <linux/netfilter_ipv4/ip_conntrack.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
50 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
51 #include <linux/netfilter_ipv4/listhelp.h>
52
53 #define IP_CONNTRACK_VERSION "2.3"
54
55 #if 0
56 #define DEBUGP printk
57 #else
58 #define DEBUGP(format, args...)
59 #endif
60
61 DEFINE_RWLOCK(ip_conntrack_lock);
62
63 /* ip_conntrack_standalone needs this */
64 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
65
66 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
67 LIST_HEAD(ip_conntrack_expect_list);
68 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
69 static LIST_HEAD(helpers);
70 unsigned int ip_conntrack_htable_size = 0;
71 int ip_conntrack_max;
72 struct list_head *ip_conntrack_hash;
73 static kmem_cache_t *ip_conntrack_cachep __read_mostly;
74 static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
75 struct ip_conntrack ip_conntrack_untracked;
76 unsigned int ip_ct_log_invalid;
77 static LIST_HEAD(unconfirmed);
78 static int ip_conntrack_vmalloc;
79
80 static unsigned int ip_conntrack_next_id = 1;
81 static unsigned int ip_conntrack_expect_next_id = 1;
82 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
83 struct notifier_block *ip_conntrack_chain;
84 struct notifier_block *ip_conntrack_expect_chain;
85
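/* Per-CPU event cache: status changes for the conntrack currently being
 * processed are OR-ed into ecache->events and delivered through the
 * notifier chain as a single call per packet (when the packet leaves
 * conntrack, or when the next packet claims the cache slot), instead of
 * one notifier call per individual state change. */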
86 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
87
88 /* deliver cached events and clear cache entry - must be called with locally
89 * disabled softirqs */
90 static inline void
91 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
92 {
93 DEBUGP("ecache: delivering events for %p\n", ecache->ct);
94 if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
95 notifier_call_chain(&ip_conntrack_chain, ecache->events,
96 ecache->ct);
97 ecache->events = 0;
98 ip_conntrack_put(ecache->ct);
99 ecache->ct = NULL;
100 }
101
102 /* Deliver all cached events for a particular conntrack. This is called
103 * by code prior to async packet handling or freeing the skb */
104 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
105 {
106 struct ip_conntrack_ecache *ecache;
107
108 local_bh_disable();
109 ecache = &__get_cpu_var(ip_conntrack_ecache);
110 if (ecache->ct == ct)
111 __ip_ct_deliver_cached_events(ecache);
112 local_bh_enable();
113 }
114
115 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
116 {
117 struct ip_conntrack_ecache *ecache;
118
119 /* take care of delivering potentially old events */
120 ecache = &__get_cpu_var(ip_conntrack_ecache);
121 BUG_ON(ecache->ct == ct);
122 if (ecache->ct)
123 __ip_ct_deliver_cached_events(ecache);
124 /* initialize for this conntrack/packet */
125 ecache->ct = ct;
126 nf_conntrack_get(&ct->ct_general);
127 }
128
129 /* flush the event cache - touches other CPUs' data and must not be called
130 * while packets are still passing through the code */
131 static void ip_ct_event_cache_flush(void)
132 {
133 struct ip_conntrack_ecache *ecache;
134 int cpu;
135
136 for_each_cpu(cpu) {
137 ecache = &per_cpu(ip_conntrack_ecache, cpu);
138 if (ecache->ct)
139 ip_conntrack_put(ecache->ct);
140 }
141 }
142 #else
143 static inline void ip_ct_event_cache_flush(void) {}
144 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
145
146 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
147
148 static int ip_conntrack_hash_rnd_initted;
149 static unsigned int ip_conntrack_hash_rnd;
150
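/* Hash a tuple into one of ip_conntrack_htable_size buckets: jhash mixes
 * the source address, the destination address xor-ed with the protocol
 * number, and both port words, salted with the random
 * ip_conntrack_hash_rnd (initialised on first use) so remote hosts
 * cannot deliberately collide hash chains. */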
151 static u_int32_t
152 hash_conntrack(const struct ip_conntrack_tuple *tuple)
153 {
154 #if 0
155 dump_tuple(tuple);
156 #endif
157 return (jhash_3words(tuple->src.ip,
158 (tuple->dst.ip ^ tuple->dst.protonum),
159 (tuple->src.u.all | (tuple->dst.u.all << 16)),
160 ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
161 }
162
163 int
164 ip_ct_get_tuple(const struct iphdr *iph,
165 const struct sk_buff *skb,
166 unsigned int dataoff,
167 struct ip_conntrack_tuple *tuple,
168 const struct ip_conntrack_protocol *protocol)
169 {
170 /* Never happens */
171 if (iph->frag_off & htons(IP_OFFSET)) {
172 printk("ip_conntrack_core: Frag of proto %u.\n",
173 iph->protocol);
174 return 0;
175 }
176
177 tuple->src.ip = iph->saddr;
178 tuple->dst.ip = iph->daddr;
179 tuple->dst.protonum = iph->protocol;
180 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
181
182 return protocol->pkt_to_tuple(skb, dataoff, tuple);
183 }
184
185 int
186 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
187 const struct ip_conntrack_tuple *orig,
188 const struct ip_conntrack_protocol *protocol)
189 {
190 inverse->src.ip = orig->dst.ip;
191 inverse->dst.ip = orig->src.ip;
192 inverse->dst.protonum = orig->dst.protonum;
193 inverse->dst.dir = !orig->dst.dir;
194
195 return protocol->invert_tuple(inverse, orig);
196 }
197
198
199 /* ip_conntrack_expect helper functions */
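/* Reference counting convention (see ip_conntrack_expect_insert below):
 * the global expect list holds one reference, a pending timeout timer
 * holds another, and the creator keeps the reference it obtained from
 * ip_conntrack_expect_alloc().  unlink_expect() drops the list's
 * reference; whoever stops the timer drops the timer's. */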
200 static void unlink_expect(struct ip_conntrack_expect *exp)
201 {
202 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
203 IP_NF_ASSERT(!timer_pending(&exp->timeout));
204 list_del(&exp->list);
205 CONNTRACK_STAT_INC(expect_delete);
206 exp->master->expecting--;
207 ip_conntrack_expect_put(exp);
208 }
209
210 void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp)
211 {
212 unlink_expect(exp);
213 ip_conntrack_expect_put(exp);
214 }
215
216 static void expectation_timed_out(unsigned long ul_expect)
217 {
218 struct ip_conntrack_expect *exp = (void *)ul_expect;
219
220 write_lock_bh(&ip_conntrack_lock);
221 unlink_expect(exp);
222 write_unlock_bh(&ip_conntrack_lock);
223 ip_conntrack_expect_put(exp);
224 }
225
226 struct ip_conntrack_expect *
227 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
228 {
229 struct ip_conntrack_expect *i;
230
231 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
232 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
233 atomic_inc(&i->use);
234 return i;
235 }
236 }
237 return NULL;
238 }
239
240 /* Just find an expectation corresponding to a tuple. */
241 struct ip_conntrack_expect *
242 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
243 {
244 struct ip_conntrack_expect *i;
245
246 read_lock_bh(&ip_conntrack_lock);
247 i = __ip_conntrack_expect_find(tuple);
248 read_unlock_bh(&ip_conntrack_lock);
249
250 return i;
251 }
252
253 /* If an expectation for this connection is found, it gets deleted from
254 * the global list and then returned. */
255 static struct ip_conntrack_expect *
256 find_expectation(const struct ip_conntrack_tuple *tuple)
257 {
258 struct ip_conntrack_expect *i;
259
260 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
261 /* If master is not in hash table yet (ie. packet hasn't left
262 this machine yet), how can other end know about expected?
263 Hence these are not the droids you are looking for (if
264 master ct never got confirmed, we'd hold a reference to it
265 and weird things would happen to future packets). */
266 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
267 && is_confirmed(i->master)
268 && del_timer(&i->timeout)) {
269 unlink_expect(i);
270 return i;
271 }
272 }
273 return NULL;
274 }
275
276 /* delete all expectations for this conntrack */
277 void ip_ct_remove_expectations(struct ip_conntrack *ct)
278 {
279 struct ip_conntrack_expect *i, *tmp;
280
281 /* Optimization: most connections never expect any others. */
282 if (ct->expecting == 0)
283 return;
284
285 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
286 if (i->master == ct && del_timer(&i->timeout)) {
287 unlink_expect(i);
288 ip_conntrack_expect_put(i);
289 }
290 }
291 }
292
293 static void
294 clean_from_lists(struct ip_conntrack *ct)
295 {
296 unsigned int ho, hr;
297
298 DEBUGP("clean_from_lists(%p)\n", ct);
299 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
300
301 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
302 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
303 LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
304 LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
305
306 /* Destroy all pending expectations */
307 ip_ct_remove_expectations(ct);
308 }
309
310 static void
311 destroy_conntrack(struct nf_conntrack *nfct)
312 {
313 struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
314 struct ip_conntrack_protocol *proto;
315
316 DEBUGP("destroy_conntrack(%p)\n", ct);
317 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
318 IP_NF_ASSERT(!timer_pending(&ct->timeout));
319
320 ip_conntrack_event(IPCT_DESTROY, ct);
321 set_bit(IPS_DYING_BIT, &ct->status);
322
323 /* To make sure we don't get any weird locking issues here:
324 * destroy_conntrack() MUST NOT be called with a write lock
325 * to ip_conntrack_lock!!! -HW */
326 proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
327 if (proto && proto->destroy)
328 proto->destroy(ct);
329
330 if (ip_conntrack_destroyed)
331 ip_conntrack_destroyed(ct);
332
333 write_lock_bh(&ip_conntrack_lock);
334 /* Expectations will have been removed in clean_from_lists,
335 * except TFTP can create an expectation on the first packet,
336 * before connection is in the list, so we need to clean here,
337 * too. */
338 ip_ct_remove_expectations(ct);
339
340 /* We overload the first tuple to link into the unconfirmed list. */
341 if (!is_confirmed(ct)) {
342 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
343 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
344 }
345
346 CONNTRACK_STAT_INC(delete);
347 write_unlock_bh(&ip_conntrack_lock);
348
349 if (ct->master)
350 ip_conntrack_put(ct->master);
351
352 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
353 ip_conntrack_free(ct);
354 }
355
356 static void death_by_timeout(unsigned long ul_conntrack)
357 {
358 struct ip_conntrack *ct = (void *)ul_conntrack;
359
360 write_lock_bh(&ip_conntrack_lock);
361 /* Inside lock so preempt is disabled on module removal path.
362 * Otherwise we can get spurious warnings. */
363 CONNTRACK_STAT_INC(delete_list);
364 clean_from_lists(ct);
365 write_unlock_bh(&ip_conntrack_lock);
366 ip_conntrack_put(ct);
367 }
368
369 static inline int
370 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
371 const struct ip_conntrack_tuple *tuple,
372 const struct ip_conntrack *ignored_conntrack)
373 {
374 ASSERT_READ_LOCK(&ip_conntrack_lock);
375 return tuplehash_to_ctrack(i) != ignored_conntrack
376 && ip_ct_tuple_equal(tuple, &i->tuple);
377 }
378
379 struct ip_conntrack_tuple_hash *
380 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
381 const struct ip_conntrack *ignored_conntrack)
382 {
383 struct ip_conntrack_tuple_hash *h;
384 unsigned int hash = hash_conntrack(tuple);
385
386 ASSERT_READ_LOCK(&ip_conntrack_lock);
387 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
388 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
389 CONNTRACK_STAT_INC(found);
390 return h;
391 }
392 CONNTRACK_STAT_INC(searched);
393 }
394
395 return NULL;
396 }
397
398 /* Find a connection corresponding to a tuple. */
399 struct ip_conntrack_tuple_hash *
400 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
401 const struct ip_conntrack *ignored_conntrack)
402 {
403 struct ip_conntrack_tuple_hash *h;
404
405 read_lock_bh(&ip_conntrack_lock);
406 h = __ip_conntrack_find(tuple, ignored_conntrack);
407 if (h)
408 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
409 read_unlock_bh(&ip_conntrack_lock);
410
411 return h;
412 }
413
414 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
415 unsigned int hash,
416 unsigned int repl_hash)
417 {
418 ct->id = ++ip_conntrack_next_id;
419 list_prepend(&ip_conntrack_hash[hash],
420 &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
421 list_prepend(&ip_conntrack_hash[repl_hash],
422 &ct->tuplehash[IP_CT_DIR_REPLY].list);
423 }
424
425 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
426 {
427 unsigned int hash, repl_hash;
428
429 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
430 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
431
432 write_lock_bh(&ip_conntrack_lock);
433 __ip_conntrack_hash_insert(ct, hash, repl_hash);
434 write_unlock_bh(&ip_conntrack_lock);
435 }
436
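/* A new conntrack is only inserted into the hash ("confirmed") after its
 * first packet has traversed all netfilter hooks, so NAT has had the
 * chance to rewrite the reply tuple before either tuple is hashed. */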
437 /* Confirm a connection given skb; places it in hash table */
438 int
439 __ip_conntrack_confirm(struct sk_buff **pskb)
440 {
441 unsigned int hash, repl_hash;
442 struct ip_conntrack *ct;
443 enum ip_conntrack_info ctinfo;
444
445 ct = ip_conntrack_get(*pskb, &ctinfo);
446
447 /* ipt_REJECT uses ip_conntrack_attach to attach related
448 ICMP/TCP RST packets in the other direction. The actual packet
449 which created the connection will be IP_CT_NEW, or IP_CT_RELATED
450 for an expected connection. */
451 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
452 return NF_ACCEPT;
453
454 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
455 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
456
457 /* We're not in hash table, and we refuse to set up related
458 connections for unconfirmed conns. But packet copies and
459 REJECT will give spurious warnings here. */
460 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
461
462 /* No external references means no one else could have
463 confirmed us. */
464 IP_NF_ASSERT(!is_confirmed(ct));
465 DEBUGP("Confirming conntrack %p\n", ct);
466
467 write_lock_bh(&ip_conntrack_lock);
468
469 /* See if there's one in the list already, including reverse:
470 NAT could have grabbed it without realizing, since we're
471 not in the hash. If there is, we lost the race. */
472 if (!LIST_FIND(&ip_conntrack_hash[hash],
473 conntrack_tuple_cmp,
474 struct ip_conntrack_tuple_hash *,
475 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
476 && !LIST_FIND(&ip_conntrack_hash[repl_hash],
477 conntrack_tuple_cmp,
478 struct ip_conntrack_tuple_hash *,
479 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
480 /* Remove from unconfirmed list */
481 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
482
483 __ip_conntrack_hash_insert(ct, hash, repl_hash);
484 /* Timer relative to confirmation time, not original
485 setting time, otherwise we'd get timer wrap in
486 weird delay cases. */
487 ct->timeout.expires += jiffies;
488 add_timer(&ct->timeout);
489 atomic_inc(&ct->ct_general.use);
490 set_bit(IPS_CONFIRMED_BIT, &ct->status);
491 CONNTRACK_STAT_INC(insert);
492 write_unlock_bh(&ip_conntrack_lock);
493 if (ct->helper)
494 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
495 #ifdef CONFIG_IP_NF_NAT_NEEDED
496 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
497 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
498 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
499 #endif
500 ip_conntrack_event_cache(master_ct(ct) ?
501 IPCT_RELATED : IPCT_NEW, *pskb);
502
503 return NF_ACCEPT;
504 }
505
506 CONNTRACK_STAT_INC(insert_failed);
507 write_unlock_bh(&ip_conntrack_lock);
508
509 return NF_DROP;
510 }
511
512 /* Returns true if a connection corresponds to the tuple (required
513 for NAT). */
514 int
515 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
516 const struct ip_conntrack *ignored_conntrack)
517 {
518 struct ip_conntrack_tuple_hash *h;
519
520 read_lock_bh(&ip_conntrack_lock);
521 h = __ip_conntrack_find(tuple, ignored_conntrack);
522 read_unlock_bh(&ip_conntrack_lock);
523
524 return h != NULL;
525 }
526
527 /* There's a small race here where we may free a just-assured
528 connection. Too bad: we're in trouble anyway. */
529 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
530 {
531 return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
532 }
533
534 static int early_drop(struct list_head *chain)
535 {
536 /* Traverse backwards: gives us oldest, which is roughly LRU */
537 struct ip_conntrack_tuple_hash *h;
538 struct ip_conntrack *ct = NULL;
539 int dropped = 0;
540
541 read_lock_bh(&ip_conntrack_lock);
542 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
543 if (h) {
544 ct = tuplehash_to_ctrack(h);
545 atomic_inc(&ct->ct_general.use);
546 }
547 read_unlock_bh(&ip_conntrack_lock);
548
549 if (!ct)
550 return dropped;
551
552 if (del_timer(&ct->timeout)) {
553 death_by_timeout((unsigned long)ct);
554 dropped = 1;
555 CONNTRACK_STAT_INC(early_drop);
556 }
557 ip_conntrack_put(ct);
558 return dropped;
559 }
560
561 static inline int helper_cmp(const struct ip_conntrack_helper *i,
562 const struct ip_conntrack_tuple *rtuple)
563 {
564 return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
565 }
566
567 static struct ip_conntrack_helper *
568 __ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
569 {
570 return LIST_FIND(&helpers, helper_cmp,
571 struct ip_conntrack_helper *,
572 tuple);
573 }
574
575 struct ip_conntrack_helper *
576 ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
577 {
578 struct ip_conntrack_helper *helper;
579
580 /* need ip_conntrack_lock to assure that helper exists until
581 * try_module_get() is called */
582 read_lock_bh(&ip_conntrack_lock);
583
584 helper = __ip_conntrack_helper_find(tuple);
585 if (helper) {
586 /* need to increase module usage count to assure helper will
587 * not go away while the caller is e.g. busy putting a
588 * conntrack in the hash that uses the helper */
589 if (!try_module_get(helper->me))
590 helper = NULL;
591 }
592
593 read_unlock_bh(&ip_conntrack_lock);
594
595 return helper;
596 }
597
598 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
599 {
600 module_put(helper->me);
601 }
602
603 struct ip_conntrack_protocol *
604 __ip_conntrack_proto_find(u_int8_t protocol)
605 {
606 return ip_ct_protos[protocol];
607 }
608
609 /* this is guaranteed to always return a valid protocol helper, since
610 * it falls back to generic_protocol */
611 struct ip_conntrack_protocol *
612 ip_conntrack_proto_find_get(u_int8_t protocol)
613 {
614 struct ip_conntrack_protocol *p;
615
616 preempt_disable();
617 p = __ip_conntrack_proto_find(protocol);
618 if (p) {
619 if (!try_module_get(p->me))
620 p = &ip_conntrack_generic_protocol;
621 }
622 preempt_enable();
623
624 return p;
625 }
626
627 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
628 {
629 module_put(p->me);
630 }
631
632 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
633 struct ip_conntrack_tuple *repl)
634 {
635 struct ip_conntrack *conntrack;
636
637 if (!ip_conntrack_hash_rnd_initted) {
638 get_random_bytes(&ip_conntrack_hash_rnd, 4);
639 ip_conntrack_hash_rnd_initted = 1;
640 }
641
642 if (ip_conntrack_max
643 && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
644 unsigned int hash = hash_conntrack(orig);
645 /* Try dropping from this hash chain. */
646 if (!early_drop(&ip_conntrack_hash[hash])) {
647 if (net_ratelimit())
648 printk(KERN_WARNING
649 "ip_conntrack: table full, dropping"
650 " packet.\n");
651 return ERR_PTR(-ENOMEM);
652 }
653 }
654
655 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
656 if (!conntrack) {
657 DEBUGP("Can't allocate conntrack.\n");
658 return ERR_PTR(-ENOMEM);
659 }
660
661 memset(conntrack, 0, sizeof(*conntrack));
662 atomic_set(&conntrack->ct_general.use, 1);
663 conntrack->ct_general.destroy = destroy_conntrack;
664 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
665 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
666 /* Don't set timer yet: wait for confirmation */
667 init_timer(&conntrack->timeout);
668 conntrack->timeout.data = (unsigned long)conntrack;
669 conntrack->timeout.function = death_by_timeout;
670
671 atomic_inc(&ip_conntrack_count);
672
673 return conntrack;
674 }
675
676 void
677 ip_conntrack_free(struct ip_conntrack *conntrack)
678 {
679 atomic_dec(&ip_conntrack_count);
680 kmem_cache_free(ip_conntrack_cachep, conntrack);
681 }
682
683 /* Allocate a new conntrack: we return -ENOMEM if classification
684 * failed due to stress. Otherwise it really is unclassifiable */
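/* Flow: invert the tuple for the reply direction, allocate the entry, let
 * the protocol initialise its per-connection state, then either bind the
 * new conntrack to a matching expectation (inheriting master, mark and
 * masquerade context) or look up a helper for it, and finally park it on
 * the unconfirmed list until __ip_conntrack_confirm() hashes it. */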
685 static struct ip_conntrack_tuple_hash *
686 init_conntrack(struct ip_conntrack_tuple *tuple,
687 struct ip_conntrack_protocol *protocol,
688 struct sk_buff *skb)
689 {
690 struct ip_conntrack *conntrack;
691 struct ip_conntrack_tuple repl_tuple;
692 struct ip_conntrack_expect *exp;
693
694 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
695 DEBUGP("Can't invert tuple.\n");
696 return NULL;
697 }
698
699 conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
700 if (conntrack == NULL || IS_ERR(conntrack))
701 return (struct ip_conntrack_tuple_hash *)conntrack;
702
703 if (!protocol->new(conntrack, skb)) {
704 ip_conntrack_free(conntrack);
705 return NULL;
706 }
707
708 write_lock_bh(&ip_conntrack_lock);
709 exp = find_expectation(tuple);
710
711 if (exp) {
712 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
713 conntrack, exp);
714 /* Welcome, Mr. Bond. We've been expecting you... */
715 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
716 conntrack->master = exp->master;
717 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
718 conntrack->mark = exp->master->mark;
719 #endif
720 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
721 defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
722 /* this is ugly, but there is no other place where to put it */
723 conntrack->nat.masq_index = exp->master->nat.masq_index;
724 #endif
725 nf_conntrack_get(&conntrack->master->ct_general);
726 CONNTRACK_STAT_INC(expect_new);
727 } else {
728 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
729
730 CONNTRACK_STAT_INC(new);
731 }
732
733 /* Overload the tuple linked list to put us in the unconfirmed list. */
734 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
735
736 write_unlock_bh(&ip_conntrack_lock);
737
738 if (exp) {
739 if (exp->expectfn)
740 exp->expectfn(conntrack, exp);
741 ip_conntrack_expect_put(exp);
742 }
743
744 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
745 }
746
747 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
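/* ctinfo assignment: reply-direction packets become
 * IP_CT_ESTABLISHED + IP_CT_IS_REPLY; original-direction packets are
 * IP_CT_ESTABLISHED once a reply has been seen, IP_CT_RELATED for
 * expected connections, and IP_CT_NEW otherwise. */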
748 static inline struct ip_conntrack *
749 resolve_normal_ct(struct sk_buff *skb,
750 struct ip_conntrack_protocol *proto,
751 int *set_reply,
752 unsigned int hooknum,
753 enum ip_conntrack_info *ctinfo)
754 {
755 struct ip_conntrack_tuple tuple;
756 struct ip_conntrack_tuple_hash *h;
757 struct ip_conntrack *ct;
758
759 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
760
761 if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
762 &tuple, proto))
763 return NULL;
764
765 /* look for tuple match */
766 h = ip_conntrack_find_get(&tuple, NULL);
767 if (!h) {
768 h = init_conntrack(&tuple, proto, skb);
769 if (!h)
770 return NULL;
771 if (IS_ERR(h))
772 return (void *)h;
773 }
774 ct = tuplehash_to_ctrack(h);
775
776 /* It exists; we have (non-exclusive) reference. */
777 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
778 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
779 /* Please set the reply bit if this packet is OK */
780 *set_reply = 1;
781 } else {
782 /* Once we've had two way comms, always ESTABLISHED. */
783 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
784 DEBUGP("ip_conntrack_in: normal packet for %p\n",
785 ct);
786 *ctinfo = IP_CT_ESTABLISHED;
787 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
788 DEBUGP("ip_conntrack_in: related packet for %p\n",
789 ct);
790 *ctinfo = IP_CT_RELATED;
791 } else {
792 DEBUGP("ip_conntrack_in: new packet for %p\n",
793 ct);
794 *ctinfo = IP_CT_NEW;
795 }
796 *set_reply = 0;
797 }
798 skb->nfct = &ct->ct_general;
799 skb->nfctinfo = *ctinfo;
800 return ct;
801 }
802
803 /* Netfilter hook itself. */
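/* Processing order: skip skbs that already carry conntrack state, refuse
 * IP fragments (defragmentation happens before this hook), let the L4
 * protocol validate the header via ->error(), look up or create the
 * conntrack entry, then run the protocol's per-packet state machine.
 * Negative protocol return codes are negated into netfilter verdicts. */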
804 unsigned int ip_conntrack_in(unsigned int hooknum,
805 struct sk_buff **pskb,
806 const struct net_device *in,
807 const struct net_device *out,
808 int (*okfn)(struct sk_buff *))
809 {
810 struct ip_conntrack *ct;
811 enum ip_conntrack_info ctinfo;
812 struct ip_conntrack_protocol *proto;
813 int set_reply = 0;
814 int ret;
815
816 /* Previously seen (loopback or untracked)? Ignore. */
817 if ((*pskb)->nfct) {
818 CONNTRACK_STAT_INC(ignore);
819 return NF_ACCEPT;
820 }
821
822 /* Never happens */
823 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
824 if (net_ratelimit()) {
825 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
826 (*pskb)->nh.iph->protocol, hooknum);
827 }
828 return NF_DROP;
829 }
830
831 /* Doesn't cover locally-generated broadcast, so not worth it. */
832 #if 0
833 /* Ignore broadcast: no `connection'. */
834 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
835 printk("Broadcast packet!\n");
836 return NF_ACCEPT;
837 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
838 == htonl(0x000000FF)) {
839 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
840 NIPQUAD((*pskb)->nh.iph->saddr),
841 NIPQUAD((*pskb)->nh.iph->daddr),
842 (*pskb)->sk, (*pskb)->pkt_type);
843 }
844 #endif
845
846 proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
847
848 /* It may be a special packet, an error, unclean...
849 * The inverse of the return code tells the netfilter
850 * core what to do with the packet. */
851 if (proto->error != NULL
852 && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
853 CONNTRACK_STAT_INC(error);
854 CONNTRACK_STAT_INC(invalid);
855 return -ret;
856 }
857
858 if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
859 /* Not valid part of a connection */
860 CONNTRACK_STAT_INC(invalid);
861 return NF_ACCEPT;
862 }
863
864 if (IS_ERR(ct)) {
865 /* Too stressed to deal. */
866 CONNTRACK_STAT_INC(drop);
867 return NF_DROP;
868 }
869
870 IP_NF_ASSERT((*pskb)->nfct);
871
872 ret = proto->packet(ct, *pskb, ctinfo);
873 if (ret < 0) {
874 /* Invalid: the inverse of the return code tells
875 * the netfilter core what to do */
876 nf_conntrack_put((*pskb)->nfct);
877 (*pskb)->nfct = NULL;
878 CONNTRACK_STAT_INC(invalid);
879 return -ret;
880 }
881
882 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
883 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
884
885 return ret;
886 }
887
888 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
889 const struct ip_conntrack_tuple *orig)
890 {
891 return ip_ct_invert_tuple(inverse, orig,
892 __ip_conntrack_proto_find(orig->dst.protonum));
893 }
894
895 /* Would two expected things clash? */
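/* Two expectations clash when their tuples agree on every field covered
 * by the intersection of their masks.  Example (illustrative): two
 * expectations that both wildcard the source port but pin the same
 * addresses and destination port select overlapping packets, so they
 * are reported as clashing. */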
896 static inline int expect_clash(const struct ip_conntrack_expect *a,
897 const struct ip_conntrack_expect *b)
898 {
899 /* Part covered by intersection of masks must be unequal,
900 otherwise they clash */
901 struct ip_conntrack_tuple intersect_mask
902 = { { a->mask.src.ip & b->mask.src.ip,
903 { a->mask.src.u.all & b->mask.src.u.all } },
904 { a->mask.dst.ip & b->mask.dst.ip,
905 { a->mask.dst.u.all & b->mask.dst.u.all },
906 a->mask.dst.protonum & b->mask.dst.protonum } };
907
908 return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
909 }
910
911 static inline int expect_matches(const struct ip_conntrack_expect *a,
912 const struct ip_conntrack_expect *b)
913 {
914 return a->master == b->master
915 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
916 && ip_ct_tuple_equal(&a->mask, &b->mask);
917 }
918
919 /* Generally a bad idea to call this: could have matched already. */
920 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
921 {
922 struct ip_conntrack_expect *i;
923
924 write_lock_bh(&ip_conntrack_lock);
925 /* choose the oldest expectation to evict */
926 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
927 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
928 unlink_expect(i);
929 write_unlock_bh(&ip_conntrack_lock);
930 ip_conntrack_expect_put(i);
931 return;
932 }
933 }
934 write_unlock_bh(&ip_conntrack_lock);
935 }
936
937 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
938 {
939 struct ip_conntrack_expect *new;
940
941 new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
942 if (!new) {
943 DEBUGP("expect_related: OOM allocating expect\n");
944 return NULL;
945 }
946 new->master = me;
947 atomic_inc(&new->master->ct_general.use);
948 atomic_set(&new->use, 1);
949 return new;
950 }
951
952 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
953 {
954 if (atomic_dec_and_test(&exp->use)) {
955 ip_conntrack_put(exp->master);
956 kmem_cache_free(ip_conntrack_expect_cachep, exp);
957 }
958 }
959
960 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
961 {
962 atomic_inc(&exp->use);
963 exp->master->expecting++;
964 list_add(&exp->list, &ip_conntrack_expect_list);
965
966 init_timer(&exp->timeout);
967 exp->timeout.data = (unsigned long)exp;
968 exp->timeout.function = expectation_timed_out;
969 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
970 add_timer(&exp->timeout);
971
972 exp->id = ++ip_conntrack_expect_next_id;
973 atomic_inc(&exp->use);
974 CONNTRACK_STAT_INC(expect_create);
975 }
976
977 /* Race with expectations being used means we could have none to find; OK. */
978 static void evict_oldest_expect(struct ip_conntrack *master)
979 {
980 struct ip_conntrack_expect *i;
981
982 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
983 if (i->master == master) {
984 if (del_timer(&i->timeout)) {
985 unlink_expect(i);
986 ip_conntrack_expect_put(i);
987 }
988 break;
989 }
990 }
991 }
992
993 static inline int refresh_timer(struct ip_conntrack_expect *i)
994 {
995 if (!del_timer(&i->timeout))
996 return 0;
997
998 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
999 add_timer(&i->timeout);
1000 return 1;
1001 }
1002
1003 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1004 {
1005 struct ip_conntrack_expect *i;
1006 int ret;
1007
1008 DEBUGP("ip_conntrack_expect_related %p\n", expect);
1009 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1010 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
1011
1012 write_lock_bh(&ip_conntrack_lock);
1013 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1014 if (expect_matches(i, expect)) {
1015 /* Refresh timer: if it's dying, ignore.. */
1016 if (refresh_timer(i)) {
1017 ret = 0;
1018 goto out;
1019 }
1020 } else if (expect_clash(i, expect)) {
1021 ret = -EBUSY;
1022 goto out;
1023 }
1024 }
1025
1026 /* Will be over limit? */
1027 if (expect->master->helper->max_expected &&
1028 expect->master->expecting >= expect->master->helper->max_expected)
1029 evict_oldest_expect(expect->master);
1030
1031 ip_conntrack_expect_insert(expect);
1032 ip_conntrack_expect_event(IPEXP_NEW, expect);
1033 ret = 0;
1034 out:
1035 write_unlock_bh(&ip_conntrack_lock);
1036 return ret;
1037 }
1038
1039 /* Alter reply tuple (maybe alter helper). This is for NAT, and is
1040 implicitly racy: see __ip_conntrack_confirm */
1041 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1042 const struct ip_conntrack_tuple *newreply)
1043 {
1044 write_lock_bh(&ip_conntrack_lock);
1045 /* Should be unconfirmed, so not in hash table yet */
1046 IP_NF_ASSERT(!is_confirmed(conntrack));
1047
1048 DEBUGP("Altering reply tuple of %p to ", conntrack);
1049 DUMP_TUPLE(newreply);
1050
1051 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1052 if (!conntrack->master && conntrack->expecting == 0)
1053 conntrack->helper = __ip_conntrack_helper_find(newreply);
1054 write_unlock_bh(&ip_conntrack_lock);
1055 }
1056
1057 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1058 {
1059 BUG_ON(me->timeout == 0);
1060 write_lock_bh(&ip_conntrack_lock);
1061 list_prepend(&helpers, me);
1062 write_unlock_bh(&ip_conntrack_lock);
1063
1064 return 0;
1065 }
1066
1067 struct ip_conntrack_helper *
1068 __ip_conntrack_helper_find_byname(const char *name)
1069 {
1070 struct ip_conntrack_helper *h;
1071
1072 list_for_each_entry(h, &helpers, list) {
1073 if (!strcmp(h->name, name))
1074 return h;
1075 }
1076
1077 return NULL;
1078 }
1079
1080 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1081 const struct ip_conntrack_helper *me)
1082 {
1083 if (tuplehash_to_ctrack(i)->helper == me) {
1084 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1085 tuplehash_to_ctrack(i)->helper = NULL;
1086 }
1087 return 0;
1088 }
1089
1090 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1091 {
1092 unsigned int i;
1093 struct ip_conntrack_expect *exp, *tmp;
1094
1095 /* Need write lock here, to delete helper. */
1096 write_lock_bh(&ip_conntrack_lock);
1097 LIST_DELETE(&helpers, me);
1098
1099 /* Get rid of expectations */
1100 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1101 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1102 unlink_expect(exp);
1103 ip_conntrack_expect_put(exp);
1104 }
1105 }
1106 /* Get rid of expecteds, set helpers to NULL. */
1107 LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
1108 for (i = 0; i < ip_conntrack_htable_size; i++)
1109 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1110 struct ip_conntrack_tuple_hash *, me);
1111 write_unlock_bh(&ip_conntrack_lock);
1112
1113 /* Someone could be still looking at the helper in a bh. */
1114 synchronize_net();
1115 }
1116
1117 static inline void ct_add_counters(struct ip_conntrack *ct,
1118 enum ip_conntrack_info ctinfo,
1119 const struct sk_buff *skb)
1120 {
1121 #ifdef CONFIG_IP_NF_CT_ACCT
1122 if (skb) {
1123 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1124 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1125 ntohs(skb->nh.iph->tot_len);
1126 }
1127 #endif
1128 }
1129
1130 /* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
1131 void ip_ct_refresh_acct(struct ip_conntrack *ct,
1132 enum ip_conntrack_info ctinfo,
1133 const struct sk_buff *skb,
1134 unsigned long extra_jiffies)
1135 {
1136 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1137
1138 /* If not in hash table, timer will not be active yet */
1139 if (!is_confirmed(ct)) {
1140 ct->timeout.expires = extra_jiffies;
1141 ct_add_counters(ct, ctinfo, skb);
1142 } else {
1143 write_lock_bh(&ip_conntrack_lock);
1144 /* Need del_timer for race avoidance (may already be dying). */
1145 if (del_timer(&ct->timeout)) {
1146 ct->timeout.expires = jiffies + extra_jiffies;
1147 add_timer(&ct->timeout);
1148 ip_conntrack_event_cache(IPCT_REFRESH, skb);
1149 }
1150 ct_add_counters(ct, ctinfo, skb);
1151 write_unlock_bh(&ip_conntrack_lock);
1152 }
1153 }
1154
1155 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1156 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1157 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1158 * in ip_conntrack_core, since we don't want the protocols to autoload
1159 * or depend on ctnetlink */
1160 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1161 const struct ip_conntrack_tuple *tuple)
1162 {
1163 NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1164 &tuple->src.u.tcp.port);
1165 NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1166 &tuple->dst.u.tcp.port);
1167 return 0;
1168
1169 nfattr_failure:
1170 return -1;
1171 }
1172
1173 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1174 struct ip_conntrack_tuple *t)
1175 {
1176 if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1177 return -EINVAL;
1178
1179 t->src.u.tcp.port =
1180 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1181 t->dst.u.tcp.port =
1182 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1183
1184 return 0;
1185 }
1186 #endif
1187
1188 /* Returns new sk_buff, or NULL */
1189 struct sk_buff *
1190 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1191 {
1192 skb_orphan(skb);
1193
1194 local_bh_disable();
1195 skb = ip_defrag(skb, user);
1196 local_bh_enable();
1197
1198 if (skb)
1199 ip_send_check(skb->nh.iph);
1200 return skb;
1201 }
1202
1203 /* Used by ipt_REJECT. */
1204 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1205 {
1206 struct ip_conntrack *ct;
1207 enum ip_conntrack_info ctinfo;
1208
1209 /* This ICMP is in reverse direction to the packet which caused it */
1210 ct = ip_conntrack_get(skb, &ctinfo);
1211
1212 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1213 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1214 else
1215 ctinfo = IP_CT_RELATED;
1216
1217 /* Attach to new skbuff, and increment count */
1218 nskb->nfct = &ct->ct_general;
1219 nskb->nfctinfo = ctinfo;
1220 nf_conntrack_get(nskb->nfct);
1221 }
1222
1223 static inline int
1224 do_iter(const struct ip_conntrack_tuple_hash *i,
1225 int (*iter)(struct ip_conntrack *i, void *data),
1226 void *data)
1227 {
1228 return iter(tuplehash_to_ctrack(i), data);
1229 }
1230
1231 /* Bring out ya dead! */
1232 static struct ip_conntrack_tuple_hash *
1233 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1234 void *data, unsigned int *bucket)
1235 {
1236 struct ip_conntrack_tuple_hash *h = NULL;
1237
1238 write_lock_bh(&ip_conntrack_lock);
1239 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1240 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1241 struct ip_conntrack_tuple_hash *, iter, data);
1242 if (h)
1243 break;
1244 }
1245 if (!h)
1246 h = LIST_FIND_W(&unconfirmed, do_iter,
1247 struct ip_conntrack_tuple_hash *, iter, data);
1248 if (h)
1249 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1250 write_unlock_bh(&ip_conntrack_lock);
1251
1252 return h;
1253 }
1254
1255 void
1256 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1257 {
1258 struct ip_conntrack_tuple_hash *h;
1259 unsigned int bucket = 0;
1260
1261 while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1262 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1263 /* Time to push up daisies... */
1264 if (del_timer(&ct->timeout))
1265 death_by_timeout((unsigned long)ct);
1266 /* ... else the timer will get him soon. */
1267
1268 ip_conntrack_put(ct);
1269 }
1270 }
1271
1272 /* Fast function for those who don't want to parse /proc (and I don't
1273 blame them). */
1274 /* Reversing the socket's dst/src point of view gives us the reply
1275 mapping. */
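/* Illustrative userspace call (e.g. a transparent proxy whose listening
 * socket accepted a REDIRECTed TCP connection); connect_upstream() is a
 * placeholder, not part of this file:
 *
 *	struct sockaddr_in orig;
 *	socklen_t len = sizeof(orig);
 *
 *	if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &orig, &len) == 0)
 *		connect_upstream(&orig);	// pre-NAT destination
 */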
1276 static int
1277 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1278 {
1279 struct inet_sock *inet = inet_sk(sk);
1280 struct ip_conntrack_tuple_hash *h;
1281 struct ip_conntrack_tuple tuple;
1282
1283 IP_CT_TUPLE_U_BLANK(&tuple);
1284 tuple.src.ip = inet->rcv_saddr;
1285 tuple.src.u.tcp.port = inet->sport;
1286 tuple.dst.ip = inet->daddr;
1287 tuple.dst.u.tcp.port = inet->dport;
1288 tuple.dst.protonum = IPPROTO_TCP;
1289
1290 /* We only do TCP at the moment: is there a better way? */
1291 if (strcmp(sk->sk_prot->name, "TCP")) {
1292 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1293 return -ENOPROTOOPT;
1294 }
1295
1296 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1297 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1298 *len, sizeof(struct sockaddr_in));
1299 return -EINVAL;
1300 }
1301
1302 h = ip_conntrack_find_get(&tuple, NULL);
1303 if (h) {
1304 struct sockaddr_in sin;
1305 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1306
1307 sin.sin_family = AF_INET;
1308 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1309 .tuple.dst.u.tcp.port;
1310 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1311 .tuple.dst.ip;
1312
1313 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1314 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1315 ip_conntrack_put(ct);
1316 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1317 return -EFAULT;
1318 else
1319 return 0;
1320 }
1321 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1322 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1323 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1324 return -ENOENT;
1325 }
1326
1327 static struct nf_sockopt_ops so_getorigdst = {
1328 .pf = PF_INET,
1329 .get_optmin = SO_ORIGINAL_DST,
1330 .get_optmax = SO_ORIGINAL_DST+1,
1331 .get = &getorigdst,
1332 };
1333
1334 static int kill_all(struct ip_conntrack *i, void *data)
1335 {
1336 return 1;
1337 }
1338
1339 static void free_conntrack_hash(void)
1340 {
1341 if (ip_conntrack_vmalloc)
1342 vfree(ip_conntrack_hash);
1343 else
1344 free_pages((unsigned long)ip_conntrack_hash,
1345 get_order(sizeof(struct list_head)
1346 * ip_conntrack_htable_size));
1347 }
1348
1349 void ip_conntrack_flush(void)
1350 {
1351 /* This makes sure all current packets have passed through
1352 netfilter framework. Roll on, two-stage module
1353 delete... */
1354 synchronize_net();
1355
1356 ip_ct_event_cache_flush();
1357 i_see_dead_people:
1358 ip_ct_iterate_cleanup(kill_all, NULL);
1359 if (atomic_read(&ip_conntrack_count) != 0) {
1360 schedule();
1361 goto i_see_dead_people;
1362 }
1363 /* wait until all references to ip_conntrack_untracked are dropped */
1364 while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1365 schedule();
1366 }
1367
1368 /* Mishearing the voices in his head, our hero wonders how he's
1369 supposed to kill the mall. */
1370 void ip_conntrack_cleanup(void)
1371 {
1372 ip_ct_attach = NULL;
1373 ip_conntrack_flush();
1374 kmem_cache_destroy(ip_conntrack_cachep);
1375 kmem_cache_destroy(ip_conntrack_expect_cachep);
1376 free_conntrack_hash();
1377 nf_unregister_sockopt(&so_getorigdst);
1378 }
1379
1380 static int hashsize;
1381 module_param(hashsize, int, 0400);
1382
1383 int __init ip_conntrack_init(void)
1384 {
1385 unsigned int i;
1386 int ret;
1387
1388 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1389 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
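/* Worked example (assuming a 32-bit box, sizeof(struct list_head) == 8,
 * PAGE_SIZE == 4096): 256MB of memory gives (256MB / 16384) / 8 = 2048
 * buckets and ip_conntrack_max = 8 * 2048 = 16384 entries. */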
1390 if (hashsize) {
1391 ip_conntrack_htable_size = hashsize;
1392 } else {
1393 ip_conntrack_htable_size
1394 = (((num_physpages << PAGE_SHIFT) / 16384)
1395 / sizeof(struct list_head));
1396 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1397 ip_conntrack_htable_size = 8192;
1398 if (ip_conntrack_htable_size < 16)
1399 ip_conntrack_htable_size = 16;
1400 }
1401 ip_conntrack_max = 8 * ip_conntrack_htable_size;
1402
1403 printk(KERN_INFO "ip_conntrack version %s (%u buckets, %d max)"
1404 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1405 ip_conntrack_htable_size, ip_conntrack_max,
1406 sizeof(struct ip_conntrack));
1407
1408 ret = nf_register_sockopt(&so_getorigdst);
1409 if (ret != 0) {
1410 printk(KERN_ERR "Unable to register netfilter socket option\n");
1411 return ret;
1412 }
1413
1414 /* AK: the hash table is twice as big as needed because it
1415 uses list_head. It would be much nicer for caches to use a
1416 single-pointer list head here. */
1417 ip_conntrack_vmalloc = 0;
1418 ip_conntrack_hash
1419 =(void*)__get_free_pages(GFP_KERNEL,
1420 get_order(sizeof(struct list_head)
1421 *ip_conntrack_htable_size));
1422 if (!ip_conntrack_hash) {
1423 ip_conntrack_vmalloc = 1;
1424 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1425 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1426 * ip_conntrack_htable_size);
1427 }
1428 if (!ip_conntrack_hash) {
1429 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1430 goto err_unreg_sockopt;
1431 }
1432
1433 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1434 sizeof(struct ip_conntrack), 0,
1435 0, NULL, NULL);
1436 if (!ip_conntrack_cachep) {
1437 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1438 goto err_free_hash;
1439 }
1440
1441 ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1442 sizeof(struct ip_conntrack_expect),
1443 0, 0, NULL, NULL);
1444 if (!ip_conntrack_expect_cachep) {
1445 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1446 goto err_free_conntrack_slab;
1447 }
1448
1449 /* Don't NEED lock here, but good form anyway. */
1450 write_lock_bh(&ip_conntrack_lock);
1451 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1452 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1453 /* Sew in builtin protocols. */
1454 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1455 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1456 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1457 write_unlock_bh(&ip_conntrack_lock);
1458
1459 for (i = 0; i < ip_conntrack_htable_size; i++)
1460 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1461
1462 /* For use by ipt_REJECT */
1463 ip_ct_attach = ip_conntrack_attach;
1464
1465 /* Set up fake conntrack:
1466 - to never be deleted, not in any hashes */
1467 atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1468 /* - and make it look like a confirmed connection */
1469 set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1470
1471 return ret;
1472
1473 err_free_conntrack_slab:
1474 kmem_cache_destroy(ip_conntrack_cachep);
1475 err_free_hash:
1476 free_conntrack_hash();
1477 err_unreg_sockopt:
1478 nf_unregister_sockopt(&so_getorigdst);
1479
1480 return -ENOMEM;
1481 }