[NETFILTER]: Add ctnetlink subsystem
net/ipv4/netfilter/ip_conntrack_core.c
1 /* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 *
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
18 * */
19
20 #include <linux/config.h>
21 #include <linux/types.h>
22 #include <linux/icmp.h>
23 #include <linux/ip.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/module.h>
27 #include <linux/skbuff.h>
28 #include <linux/proc_fs.h>
29 #include <linux/vmalloc.h>
30 #include <net/checksum.h>
31 #include <net/ip.h>
32 #include <linux/stddef.h>
33 #include <linux/sysctl.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/jhash.h>
37 #include <linux/err.h>
38 #include <linux/percpu.h>
39 #include <linux/moduleparam.h>
40 #include <linux/notifier.h>
41
42 /* ip_conntrack_lock protects the main hash table, protocol/helper/expectation
43    registrations and conntrack timers */
44 #define ASSERT_READ_LOCK(x)
45 #define ASSERT_WRITE_LOCK(x)
46
47 #include <linux/netfilter_ipv4/ip_conntrack.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
50 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
51 #include <linux/netfilter_ipv4/listhelp.h>
52
53 #define IP_CONNTRACK_VERSION "2.3"
54
55 #if 0
56 #define DEBUGP printk
57 #else
58 #define DEBUGP(format, args...)
59 #endif
60
61 DEFINE_RWLOCK(ip_conntrack_lock);
62
63 /* ip_conntrack_standalone needs this */
64 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
65
66 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
67 LIST_HEAD(ip_conntrack_expect_list);
68 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
69 static LIST_HEAD(helpers);
70 unsigned int ip_conntrack_htable_size = 0;
71 int ip_conntrack_max;
72 struct list_head *ip_conntrack_hash;
73 static kmem_cache_t *ip_conntrack_cachep;
74 static kmem_cache_t *ip_conntrack_expect_cachep;
75 struct ip_conntrack ip_conntrack_untracked;
76 unsigned int ip_ct_log_invalid;
77 static LIST_HEAD(unconfirmed);
78 static int ip_conntrack_vmalloc;
79
80 static unsigned int ip_conntrack_next_id = 1;
81 static unsigned int ip_conntrack_expect_next_id = 1;
82 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
83 struct notifier_block *ip_conntrack_chain;
84 struct notifier_block *ip_conntrack_expect_chain;
85
86 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
87
88 static inline void __deliver_cached_events(struct ip_conntrack_ecache *ecache)
89 {
90 if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
91 notifier_call_chain(&ip_conntrack_chain, ecache->events,
92 ecache->ct);
93 ecache->events = 0;
94 }
95
96 void __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
97 {
98 __deliver_cached_events(ecache);
99 }
100
101 /* Deliver all cached events for a particular conntrack. This is called
102 * by code prior to async packet handling or freeing the skb */
103 void
104 ip_conntrack_deliver_cached_events_for(const struct ip_conntrack *ct)
105 {
106 struct ip_conntrack_ecache *ecache =
107 &__get_cpu_var(ip_conntrack_ecache);
108
109 if (!ct)
110 return;
111
112 if (ecache->ct == ct) {
113 DEBUGP("ecache: delivering event for %p\n", ct);
114 __deliver_cached_events(ecache);
115 } else {
116 if (net_ratelimit())
117 printk(KERN_WARNING "ecache: want to deliver for %p, "
118 "but cache has %p\n", ct, ecache->ct);
119 }
120
121 	/* signal that events have already been delivered */
122 ecache->ct = NULL;
123 }
124
125 /* Deliver old pending cached events if the current conntrack differs from the cached one */
126 void ip_conntrack_event_cache_init(const struct sk_buff *skb)
127 {
128 struct ip_conntrack *ct = (struct ip_conntrack *) skb->nfct;
129 struct ip_conntrack_ecache *ecache =
130 &__get_cpu_var(ip_conntrack_ecache);
131
132 /* take care of delivering potentially old events */
133 if (ecache->ct != ct) {
134 enum ip_conntrack_info ctinfo;
135 /* we have to check, since at startup the cache is NULL */
136 if (likely(ecache->ct)) {
137 DEBUGP("ecache: entered for different conntrack: "
138 "ecache->ct=%p, skb->nfct=%p. delivering "
139 "events\n", ecache->ct, ct);
140 __deliver_cached_events(ecache);
141 ip_conntrack_put(ecache->ct);
142 } else {
143 DEBUGP("ecache: entered for conntrack %p, "
144 "cache was clean before\n", ct);
145 }
146
147 /* initialize for this conntrack/packet */
148 ecache->ct = ip_conntrack_get(skb, &ctinfo);
149 		/* ecache->events cleared by __deliver_cached_events() */
150 } else {
151 DEBUGP("ecache: re-entered for conntrack %p.\n", ct);
152 }
153 }
154
155 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
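/* Illustrative sketch (not built): how an event consumer such as ctnetlink
 * might attach to the notifier chain exported in the events section above.
 * Only ip_conntrack_chain and the IPCT_* event bits come from this code;
 * the callback, the notifier_block name and the init function are
 * hypothetical.  notifier_chain_register() is the registration counterpart
 * of the notifier_call_chain() call used above. */
#if 0
static int example_conntrack_event(struct notifier_block *this,
                                   unsigned long events, void *ptr)
{
        struct ip_conntrack *ct = (struct ip_conntrack *)ptr;

        if (events & IPCT_DESTROY)
                DEBUGP("conntrack %p destroyed\n", ct);
        return NOTIFY_DONE;
}

static struct notifier_block example_conntrack_notifier = {
        .notifier_call = example_conntrack_event,
};

static int __init example_listener_init(void)
{
        return notifier_chain_register(&ip_conntrack_chain,
                                       &example_conntrack_notifier);
}
#endif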
156
157 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
158
159 static int ip_conntrack_hash_rnd_initted;
160 static unsigned int ip_conntrack_hash_rnd;
161
162 static u_int32_t
163 hash_conntrack(const struct ip_conntrack_tuple *tuple)
164 {
165 #if 0
166 dump_tuple(tuple);
167 #endif
168 return (jhash_3words(tuple->src.ip,
169 (tuple->dst.ip ^ tuple->dst.protonum),
170 (tuple->src.u.all | (tuple->dst.u.all << 16)),
171 ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
172 }
173
174 int
175 ip_ct_get_tuple(const struct iphdr *iph,
176 const struct sk_buff *skb,
177 unsigned int dataoff,
178 struct ip_conntrack_tuple *tuple,
179 const struct ip_conntrack_protocol *protocol)
180 {
181 	/* Should never happen */
182 if (iph->frag_off & htons(IP_OFFSET)) {
183 printk("ip_conntrack_core: Frag of proto %u.\n",
184 iph->protocol);
185 return 0;
186 }
187
188 tuple->src.ip = iph->saddr;
189 tuple->dst.ip = iph->daddr;
190 tuple->dst.protonum = iph->protocol;
191 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
192
193 return protocol->pkt_to_tuple(skb, dataoff, tuple);
194 }
195
196 int
197 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
198 const struct ip_conntrack_tuple *orig,
199 const struct ip_conntrack_protocol *protocol)
200 {
201 inverse->src.ip = orig->dst.ip;
202 inverse->dst.ip = orig->src.ip;
203 inverse->dst.protonum = orig->dst.protonum;
204 inverse->dst.dir = !orig->dst.dir;
205
206 return protocol->invert_tuple(inverse, orig);
207 }
208
209
210 /* ip_conntrack_expect helper functions */
211 static void unlink_expect(struct ip_conntrack_expect *exp)
212 {
213 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
214 IP_NF_ASSERT(!timer_pending(&exp->timeout));
215 list_del(&exp->list);
216 CONNTRACK_STAT_INC(expect_delete);
217 exp->master->expecting--;
218 }
219
220 void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp)
221 {
222 unlink_expect(exp);
223 ip_conntrack_expect_put(exp);
224 }
225
226 static void expectation_timed_out(unsigned long ul_expect)
227 {
228 struct ip_conntrack_expect *exp = (void *)ul_expect;
229
230 write_lock_bh(&ip_conntrack_lock);
231 unlink_expect(exp);
232 write_unlock_bh(&ip_conntrack_lock);
233 ip_conntrack_expect_put(exp);
234 }
235
236 struct ip_conntrack_expect *
237 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
238 {
239 struct ip_conntrack_expect *i;
240
241 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
242 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
243 atomic_inc(&i->use);
244 return i;
245 }
246 }
247 return NULL;
248 }
249
250 /* Just find an expectation corresponding to a tuple. */
251 struct ip_conntrack_expect *
252 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
253 {
254 struct ip_conntrack_expect *i;
255
256 read_lock_bh(&ip_conntrack_lock);
257 i = __ip_conntrack_expect_find(tuple);
258 read_unlock_bh(&ip_conntrack_lock);
259
260 return i;
261 }
262
263 /* If an expectation for this connection is found, it is deleted from the
264  * global list and returned. */
265 static struct ip_conntrack_expect *
266 find_expectation(const struct ip_conntrack_tuple *tuple)
267 {
268 struct ip_conntrack_expect *i;
269
270 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
271 		/* If the master is not in the hash table yet (i.e. the packet hasn't left
272 		   this machine yet), how can the other end know about the expectation?
273 Hence these are not the droids you are looking for (if
274 master ct never got confirmed, we'd hold a reference to it
275 and weird things would happen to future packets). */
276 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
277 && is_confirmed(i->master)
278 && del_timer(&i->timeout)) {
279 unlink_expect(i);
280 return i;
281 }
282 }
283 return NULL;
284 }
285
286 /* delete all expectations for this conntrack */
287 void ip_ct_remove_expectations(struct ip_conntrack *ct)
288 {
289 struct ip_conntrack_expect *i, *tmp;
290
291 	/* Optimization: most connections never expect any others. */
292 if (ct->expecting == 0)
293 return;
294
295 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
296 if (i->master == ct && del_timer(&i->timeout)) {
297 unlink_expect(i);
298 ip_conntrack_expect_put(i);
299 }
300 }
301 }
302
303 static void
304 clean_from_lists(struct ip_conntrack *ct)
305 {
306 unsigned int ho, hr;
307
308 DEBUGP("clean_from_lists(%p)\n", ct);
309 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
310
311 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
312 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
313 LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
314 LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
315
316 /* Destroy all pending expectations */
317 ip_ct_remove_expectations(ct);
318 }
319
320 static void
321 destroy_conntrack(struct nf_conntrack *nfct)
322 {
323 struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
324 struct ip_conntrack_protocol *proto;
325
326 DEBUGP("destroy_conntrack(%p)\n", ct);
327 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
328 IP_NF_ASSERT(!timer_pending(&ct->timeout));
329
330 set_bit(IPS_DYING_BIT, &ct->status);
331
332 /* To make sure we don't get any weird locking issues here:
333 * destroy_conntrack() MUST NOT be called with a write lock
334 * to ip_conntrack_lock!!! -HW */
335 proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
336 if (proto && proto->destroy)
337 proto->destroy(ct);
338
339 if (ip_conntrack_destroyed)
340 ip_conntrack_destroyed(ct);
341
342 write_lock_bh(&ip_conntrack_lock);
343 /* Expectations will have been removed in clean_from_lists,
344 * except TFTP can create an expectation on the first packet,
345 * before connection is in the list, so we need to clean here,
346 * too. */
347 ip_ct_remove_expectations(ct);
348
349 /* We overload first tuple to link into unconfirmed list. */
350 if (!is_confirmed(ct)) {
351 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
352 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
353 }
354
355 CONNTRACK_STAT_INC(delete);
356 write_unlock_bh(&ip_conntrack_lock);
357
358 if (ct->master)
359 ip_conntrack_put(ct->master);
360
361 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
362 ip_conntrack_free(ct);
363 }
364
365 static void death_by_timeout(unsigned long ul_conntrack)
366 {
367 struct ip_conntrack *ct = (void *)ul_conntrack;
368
369 ip_conntrack_event(IPCT_DESTROY, ct);
370 write_lock_bh(&ip_conntrack_lock);
371 /* Inside lock so preempt is disabled on module removal path.
372 * Otherwise we can get spurious warnings. */
373 CONNTRACK_STAT_INC(delete_list);
374 clean_from_lists(ct);
375 write_unlock_bh(&ip_conntrack_lock);
376 ip_conntrack_put(ct);
377 }
378
379 static inline int
380 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
381 const struct ip_conntrack_tuple *tuple,
382 const struct ip_conntrack *ignored_conntrack)
383 {
384 ASSERT_READ_LOCK(&ip_conntrack_lock);
385 return tuplehash_to_ctrack(i) != ignored_conntrack
386 && ip_ct_tuple_equal(tuple, &i->tuple);
387 }
388
389 struct ip_conntrack_tuple_hash *
390 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
391 const struct ip_conntrack *ignored_conntrack)
392 {
393 struct ip_conntrack_tuple_hash *h;
394 unsigned int hash = hash_conntrack(tuple);
395
396 ASSERT_READ_LOCK(&ip_conntrack_lock);
397 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
398 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
399 CONNTRACK_STAT_INC(found);
400 return h;
401 }
402 CONNTRACK_STAT_INC(searched);
403 }
404
405 return NULL;
406 }
407
408 /* Find a connection corresponding to a tuple. */
409 struct ip_conntrack_tuple_hash *
410 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
411 const struct ip_conntrack *ignored_conntrack)
412 {
413 struct ip_conntrack_tuple_hash *h;
414
415 read_lock_bh(&ip_conntrack_lock);
416 h = __ip_conntrack_find(tuple, ignored_conntrack);
417 if (h)
418 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
419 read_unlock_bh(&ip_conntrack_lock);
420
421 return h;
422 }
423
424 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
425 unsigned int hash,
426 unsigned int repl_hash)
427 {
428 ct->id = ++ip_conntrack_next_id;
429 list_prepend(&ip_conntrack_hash[hash],
430 &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
431 list_prepend(&ip_conntrack_hash[repl_hash],
432 &ct->tuplehash[IP_CT_DIR_REPLY].list);
433 }
434
435 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
436 {
437 unsigned int hash, repl_hash;
438
439 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
440 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
441
442 write_lock_bh(&ip_conntrack_lock);
443 __ip_conntrack_hash_insert(ct, hash, repl_hash);
444 write_unlock_bh(&ip_conntrack_lock);
445 }
446
447 /* Confirm a connection given skb; places it in hash table */
448 int
449 __ip_conntrack_confirm(struct sk_buff **pskb)
450 {
451 unsigned int hash, repl_hash;
452 struct ip_conntrack *ct;
453 enum ip_conntrack_info ctinfo;
454
455 ct = ip_conntrack_get(*pskb, &ctinfo);
456
457 /* ipt_REJECT uses ip_conntrack_attach to attach related
458 	   ICMP/TCP RST packets in the other direction. The actual packet
459 	   which created the connection will be IP_CT_NEW or, for an
460 expected connection, IP_CT_RELATED. */
461 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
462 return NF_ACCEPT;
463
464 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
465 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
466
467 /* We're not in hash table, and we refuse to set up related
468 connections for unconfirmed conns. But packet copies and
469 REJECT will give spurious warnings here. */
470 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
471
472 	/* No external references means no one else could have
473 confirmed us. */
474 IP_NF_ASSERT(!is_confirmed(ct));
475 DEBUGP("Confirming conntrack %p\n", ct);
476
477 write_lock_bh(&ip_conntrack_lock);
478
479 /* See if there's one in the list already, including reverse:
480 NAT could have grabbed it without realizing, since we're
481 	   not in the hash. If there is, we lost the race. */
482 if (!LIST_FIND(&ip_conntrack_hash[hash],
483 conntrack_tuple_cmp,
484 struct ip_conntrack_tuple_hash *,
485 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
486 && !LIST_FIND(&ip_conntrack_hash[repl_hash],
487 conntrack_tuple_cmp,
488 struct ip_conntrack_tuple_hash *,
489 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
490 /* Remove from unconfirmed list */
491 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
492
493 __ip_conntrack_hash_insert(ct, hash, repl_hash);
494 /* Timer relative to confirmation time, not original
495 setting time, otherwise we'd get timer wrap in
496 weird delay cases. */
497 ct->timeout.expires += jiffies;
498 add_timer(&ct->timeout);
499 atomic_inc(&ct->ct_general.use);
500 set_bit(IPS_CONFIRMED_BIT, &ct->status);
501 CONNTRACK_STAT_INC(insert);
502 write_unlock_bh(&ip_conntrack_lock);
503 if (ct->helper)
504 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
505 #ifdef CONFIG_IP_NF_NAT_NEEDED
506 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
507 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
508 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
509 #endif
510 ip_conntrack_event_cache(master_ct(ct) ?
511 IPCT_RELATED : IPCT_NEW, *pskb);
512
513 return NF_ACCEPT;
514 }
515
516 CONNTRACK_STAT_INC(insert_failed);
517 write_unlock_bh(&ip_conntrack_lock);
518
519 return NF_DROP;
520 }
521
522 /* Returns true if a connection corresponds to the tuple (required
523 for NAT). */
524 int
525 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
526 const struct ip_conntrack *ignored_conntrack)
527 {
528 struct ip_conntrack_tuple_hash *h;
529
530 read_lock_bh(&ip_conntrack_lock);
531 h = __ip_conntrack_find(tuple, ignored_conntrack);
532 read_unlock_bh(&ip_conntrack_lock);
533
534 return h != NULL;
535 }
536
537 /* There's a small race here where we may free a just-assured
538 connection. Too bad: we're in trouble anyway. */
539 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
540 {
541 return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
542 }
543
544 static int early_drop(struct list_head *chain)
545 {
546 /* Traverse backwards: gives us oldest, which is roughly LRU */
547 struct ip_conntrack_tuple_hash *h;
548 struct ip_conntrack *ct = NULL;
549 int dropped = 0;
550
551 read_lock_bh(&ip_conntrack_lock);
552 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
553 if (h) {
554 ct = tuplehash_to_ctrack(h);
555 atomic_inc(&ct->ct_general.use);
556 }
557 read_unlock_bh(&ip_conntrack_lock);
558
559 if (!ct)
560 return dropped;
561
562 if (del_timer(&ct->timeout)) {
563 death_by_timeout((unsigned long)ct);
564 dropped = 1;
565 CONNTRACK_STAT_INC(early_drop);
566 }
567 ip_conntrack_put(ct);
568 return dropped;
569 }
570
571 static inline int helper_cmp(const struct ip_conntrack_helper *i,
572 const struct ip_conntrack_tuple *rtuple)
573 {
574 return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
575 }
576
577 static struct ip_conntrack_helper *
578 __ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple)
579 {
580 return LIST_FIND(&helpers, helper_cmp,
581 struct ip_conntrack_helper *,
582 tuple);
583 }
584
585 struct ip_conntrack_helper *
586 ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple)
587 {
588 struct ip_conntrack_helper *helper;
589
590 	/* need ip_conntrack_lock to ensure that the helper exists until
591 * try_module_get() is called */
592 read_lock_bh(&ip_conntrack_lock);
593
594 helper = __ip_conntrack_helper_find(tuple);
595 if (helper) {
596 		/* need to increase the module usage count to ensure the helper will
597 * not go away while the caller is e.g. busy putting a
598 * conntrack in the hash that uses the helper */
599 if (!try_module_get(helper->me))
600 helper = NULL;
601 }
602
603 read_unlock_bh(&ip_conntrack_lock);
604
605 return helper;
606 }
607
608 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
609 {
610 module_put(helper->me);
611 }
612
613 struct ip_conntrack_protocol *
614 __ip_conntrack_proto_find(u_int8_t protocol)
615 {
616 return ip_ct_protos[protocol];
617 }
618
619 /* this is guaranteed to always return a valid protocol helper, since
620 * it falls back to generic_protocol */
621 struct ip_conntrack_protocol *
622 ip_conntrack_proto_find_get(u_int8_t protocol)
623 {
624 struct ip_conntrack_protocol *p;
625
626 preempt_disable();
627 p = __ip_conntrack_proto_find(protocol);
628 if (p) {
629 if (!try_module_get(p->me))
630 p = &ip_conntrack_generic_protocol;
631 }
632 preempt_enable();
633
634 return p;
635 }
636
637 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
638 {
639 module_put(p->me);
640 }
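/* Illustrative sketch (not built): the get/put pairing expected of callers
 * that hold a protocol pointer outside the lock.  The wrapper function is
 * hypothetical; the non-NULL guarantee comes from the comment above
 * ip_conntrack_proto_find_get(). */
#if 0
static void example_proto_lookup(void)
{
        struct ip_conntrack_protocol *proto;

        proto = ip_conntrack_proto_find_get(IPPROTO_TCP);
        /* never NULL: falls back to ip_conntrack_generic_protocol */
        /* ... use proto->packet(), proto->error(), ... */
        ip_conntrack_proto_put(proto);
}
#endif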
641
642 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
643 struct ip_conntrack_tuple *repl)
644 {
645 struct ip_conntrack *conntrack;
646
647 if (!ip_conntrack_hash_rnd_initted) {
648 get_random_bytes(&ip_conntrack_hash_rnd, 4);
649 ip_conntrack_hash_rnd_initted = 1;
650 }
651
652 if (ip_conntrack_max
653 && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
654 unsigned int hash = hash_conntrack(orig);
655 /* Try dropping from this hash chain. */
656 if (!early_drop(&ip_conntrack_hash[hash])) {
657 if (net_ratelimit())
658 printk(KERN_WARNING
659 "ip_conntrack: table full, dropping"
660 " packet.\n");
661 return ERR_PTR(-ENOMEM);
662 }
663 }
664
665 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
666 if (!conntrack) {
667 DEBUGP("Can't allocate conntrack.\n");
668 return NULL;
669 }
670
671 memset(conntrack, 0, sizeof(*conntrack));
672 atomic_set(&conntrack->ct_general.use, 1);
673 conntrack->ct_general.destroy = destroy_conntrack;
674 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
675 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
676 /* Don't set timer yet: wait for confirmation */
677 init_timer(&conntrack->timeout);
678 conntrack->timeout.data = (unsigned long)conntrack;
679 conntrack->timeout.function = death_by_timeout;
680
681 atomic_inc(&ip_conntrack_count);
682
683 return conntrack;
684 }
685
686 void
687 ip_conntrack_free(struct ip_conntrack *conntrack)
688 {
689 atomic_dec(&ip_conntrack_count);
690 kmem_cache_free(ip_conntrack_cachep, conntrack);
691 }
692
693 /* Allocate a new conntrack: we return -ENOMEM if classification
694 * failed due to stress. Otherwise it really is unclassifiable */
695 static struct ip_conntrack_tuple_hash *
696 init_conntrack(struct ip_conntrack_tuple *tuple,
697 struct ip_conntrack_protocol *protocol,
698 struct sk_buff *skb)
699 {
700 struct ip_conntrack *conntrack;
701 struct ip_conntrack_tuple repl_tuple;
702 struct ip_conntrack_expect *exp;
703
704 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
705 DEBUGP("Can't invert tuple.\n");
706 return NULL;
707 }
708
709 if (!(conntrack = ip_conntrack_alloc(tuple, &repl_tuple)))
710 return NULL;
711
712 if (!protocol->new(conntrack, skb)) {
713 ip_conntrack_free(conntrack);
714 return NULL;
715 }
716
717 write_lock_bh(&ip_conntrack_lock);
718 exp = find_expectation(tuple);
719
720 if (exp) {
721 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
722 conntrack, exp);
723 /* Welcome, Mr. Bond. We've been expecting you... */
724 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
725 conntrack->master = exp->master;
726 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
727 conntrack->mark = exp->master->mark;
728 #endif
729 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
730 defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
731 /* this is ugly, but there is no other place where to put it */
732 conntrack->nat.masq_index = exp->master->nat.masq_index;
733 #endif
734 nf_conntrack_get(&conntrack->master->ct_general);
735 CONNTRACK_STAT_INC(expect_new);
736 } else {
737 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
738
739 CONNTRACK_STAT_INC(new);
740 }
741
742 /* Overload tuple linked list to put us in unconfirmed list. */
743 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
744
745 write_unlock_bh(&ip_conntrack_lock);
746
747 if (exp) {
748 if (exp->expectfn)
749 exp->expectfn(conntrack, exp);
750 ip_conntrack_expect_put(exp);
751 }
752
753 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
754 }
755
756 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
757 static inline struct ip_conntrack *
758 resolve_normal_ct(struct sk_buff *skb,
759 struct ip_conntrack_protocol *proto,
760 int *set_reply,
761 unsigned int hooknum,
762 enum ip_conntrack_info *ctinfo)
763 {
764 struct ip_conntrack_tuple tuple;
765 struct ip_conntrack_tuple_hash *h;
766 struct ip_conntrack *ct;
767
768 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
769
770 if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
771 &tuple,proto))
772 return NULL;
773
774 /* look for tuple match */
775 h = ip_conntrack_find_get(&tuple, NULL);
776 if (!h) {
777 h = init_conntrack(&tuple, proto, skb);
778 if (!h)
779 return NULL;
780 if (IS_ERR(h))
781 return (void *)h;
782 }
783 ct = tuplehash_to_ctrack(h);
784
785 	/* It exists; we have a (non-exclusive) reference. */
786 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
787 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
788 		/* Please set the reply bit if this packet is OK */
789 *set_reply = 1;
790 } else {
791 /* Once we've had two way comms, always ESTABLISHED. */
792 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
793 DEBUGP("ip_conntrack_in: normal packet for %p\n",
794 ct);
795 *ctinfo = IP_CT_ESTABLISHED;
796 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
797 DEBUGP("ip_conntrack_in: related packet for %p\n",
798 ct);
799 *ctinfo = IP_CT_RELATED;
800 } else {
801 DEBUGP("ip_conntrack_in: new packet for %p\n",
802 ct);
803 *ctinfo = IP_CT_NEW;
804 }
805 *set_reply = 0;
806 }
807 skb->nfct = &ct->ct_general;
808 skb->nfctinfo = *ctinfo;
809 return ct;
810 }
811
812 /* Netfilter hook itself. */
813 unsigned int ip_conntrack_in(unsigned int hooknum,
814 struct sk_buff **pskb,
815 const struct net_device *in,
816 const struct net_device *out,
817 int (*okfn)(struct sk_buff *))
818 {
819 struct ip_conntrack *ct;
820 enum ip_conntrack_info ctinfo;
821 struct ip_conntrack_protocol *proto;
822 int set_reply = 0;
823 int ret;
824
825 /* Previously seen (loopback or untracked)? Ignore. */
826 if ((*pskb)->nfct) {
827 CONNTRACK_STAT_INC(ignore);
828 return NF_ACCEPT;
829 }
830
831 	/* Should never happen */
832 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
833 if (net_ratelimit()) {
834 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
835 (*pskb)->nh.iph->protocol, hooknum);
836 }
837 return NF_DROP;
838 }
839
840 /* Doesn't cover locally-generated broadcast, so not worth it. */
841 #if 0
842 /* Ignore broadcast: no `connection'. */
843 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
844 printk("Broadcast packet!\n");
845 return NF_ACCEPT;
846 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
847 == htonl(0x000000FF)) {
848 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
849 NIPQUAD((*pskb)->nh.iph->saddr),
850 NIPQUAD((*pskb)->nh.iph->daddr),
851 (*pskb)->sk, (*pskb)->pkt_type);
852 }
853 #endif
854
855 proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
856
857 	/* It may be a special packet, error, unclean...
858 	 * the inverse of the return code tells the netfilter
859 	 * core what to do with the packet. */
860 if (proto->error != NULL
861 && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
862 CONNTRACK_STAT_INC(error);
863 CONNTRACK_STAT_INC(invalid);
864 return -ret;
865 }
866
867 if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
868 /* Not valid part of a connection */
869 CONNTRACK_STAT_INC(invalid);
870 return NF_ACCEPT;
871 }
872
873 if (IS_ERR(ct)) {
874 /* Too stressed to deal. */
875 CONNTRACK_STAT_INC(drop);
876 return NF_DROP;
877 }
878
879 IP_NF_ASSERT((*pskb)->nfct);
880
881 ip_conntrack_event_cache_init(*pskb);
882
883 ret = proto->packet(ct, *pskb, ctinfo);
884 if (ret < 0) {
885 /* Invalid: inverse of the return code tells
886 * the netfilter core what to do*/
887 nf_conntrack_put((*pskb)->nfct);
888 (*pskb)->nfct = NULL;
889 CONNTRACK_STAT_INC(invalid);
890 return -ret;
891 }
892
893 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
894 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
895
896 return ret;
897 }
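/* Illustrative sketch (not built): ip_conntrack_in() is not called directly;
 * it is registered as a netfilter hook (the real registration lives in
 * ip_conntrack_standalone.c).  The variable and function names below are
 * hypothetical. */
#if 0
static struct nf_hook_ops example_conntrack_in_ops = {
        .hook           = ip_conntrack_in,
        .owner          = THIS_MODULE,
        .pf             = PF_INET,
        .hooknum        = NF_IP_PRE_ROUTING,
        .priority       = NF_IP_PRI_CONNTRACK,
};

static int __init example_hook_init(void)
{
        return nf_register_hook(&example_conntrack_in_ops);
}
#endif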
898
899 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
900 const struct ip_conntrack_tuple *orig)
901 {
902 return ip_ct_invert_tuple(inverse, orig,
903 __ip_conntrack_proto_find(orig->dst.protonum));
904 }
905
906 /* Would two expected things clash? */
907 static inline int expect_clash(const struct ip_conntrack_expect *a,
908 const struct ip_conntrack_expect *b)
909 {
910 /* Part covered by intersection of masks must be unequal,
911 otherwise they clash */
912 struct ip_conntrack_tuple intersect_mask
913 = { { a->mask.src.ip & b->mask.src.ip,
914 { a->mask.src.u.all & b->mask.src.u.all } },
915 { a->mask.dst.ip & b->mask.dst.ip,
916 { a->mask.dst.u.all & b->mask.dst.u.all },
917 a->mask.dst.protonum & b->mask.dst.protonum } };
918
919 return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
920 }
921
922 static inline int expect_matches(const struct ip_conntrack_expect *a,
923 const struct ip_conntrack_expect *b)
924 {
925 return a->master == b->master
926 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
927 && ip_ct_tuple_equal(&a->mask, &b->mask);
928 }
929
930 /* Generally a bad idea to call this: could have matched already. */
931 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
932 {
933 struct ip_conntrack_expect *i;
934
935 write_lock_bh(&ip_conntrack_lock);
936 	/* choose the oldest expectation to evict */
937 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
938 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
939 unlink_expect(i);
940 write_unlock_bh(&ip_conntrack_lock);
941 ip_conntrack_expect_put(i);
942 return;
943 }
944 }
945 write_unlock_bh(&ip_conntrack_lock);
946 }
947
948 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
949 {
950 struct ip_conntrack_expect *new;
951
952 new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
953 if (!new) {
954 DEBUGP("expect_related: OOM allocating expect\n");
955 return NULL;
956 }
957 new->master = me;
958 atomic_inc(&new->master->ct_general.use);
959 atomic_set(&new->use, 1);
960 return new;
961 }
962
963 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
964 {
965 if (atomic_dec_and_test(&exp->use)) {
966 ip_conntrack_put(exp->master);
967 kmem_cache_free(ip_conntrack_expect_cachep, exp);
968 }
969 }
970
971 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
972 {
973 atomic_inc(&exp->use);
974 exp->master->expecting++;
975 list_add(&exp->list, &ip_conntrack_expect_list);
976
977 init_timer(&exp->timeout);
978 exp->timeout.data = (unsigned long)exp;
979 exp->timeout.function = expectation_timed_out;
980 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
981 add_timer(&exp->timeout);
982
983 exp->id = ++ip_conntrack_expect_next_id;
984 atomic_inc(&exp->use);
985 CONNTRACK_STAT_INC(expect_create);
986 }
987
988 /* Race with expectations being used means we could have none to find; OK. */
989 static void evict_oldest_expect(struct ip_conntrack *master)
990 {
991 struct ip_conntrack_expect *i;
992
993 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
994 if (i->master == master) {
995 if (del_timer(&i->timeout)) {
996 unlink_expect(i);
997 ip_conntrack_expect_put(i);
998 }
999 break;
1000 }
1001 }
1002 }
1003
1004 static inline int refresh_timer(struct ip_conntrack_expect *i)
1005 {
1006 if (!del_timer(&i->timeout))
1007 return 0;
1008
1009 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
1010 add_timer(&i->timeout);
1011 return 1;
1012 }
1013
1014 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1015 {
1016 struct ip_conntrack_expect *i;
1017 int ret;
1018
1019 	DEBUGP("ip_conntrack_expect_related %p\n", expect);
1020 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1021 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
1022
1023 write_lock_bh(&ip_conntrack_lock);
1024 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1025 if (expect_matches(i, expect)) {
1026 /* Refresh timer: if it's dying, ignore.. */
1027 if (refresh_timer(i)) {
1028 ret = 0;
1029 goto out;
1030 }
1031 } else if (expect_clash(i, expect)) {
1032 ret = -EBUSY;
1033 goto out;
1034 }
1035 }
1036
1037 	/* Would this take us over the limit? */
1038 if (expect->master->helper->max_expected &&
1039 expect->master->expecting >= expect->master->helper->max_expected)
1040 evict_oldest_expect(expect->master);
1041
1042 ip_conntrack_expect_insert(expect);
1043 ip_conntrack_expect_event(IPEXP_NEW, expect);
1044 ret = 0;
1045 out:
1046 write_unlock_bh(&ip_conntrack_lock);
1047 return ret;
1048 }
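/* Illustrative sketch (not built): the call sequence a connection tracking
 * helper uses from its help() callback to expect a related connection:
 * allocate, fill in tuple and mask, register, drop the local reference.
 * The function, the port argument and the address choices are hypothetical;
 * see the real helpers (e.g. ip_conntrack_ftp) for complete examples. */
#if 0
static int example_expect_related(struct ip_conntrack *ct, u_int16_t port)
{
        struct ip_conntrack_expect *exp;
        int ret;

        exp = ip_conntrack_expect_alloc(ct);
        if (exp == NULL)
                return -ENOMEM;

        memset(&exp->tuple, 0, sizeof(exp->tuple));
        memset(&exp->mask, 0, sizeof(exp->mask));

        /* expect a TCP connection towards the master's original source,
         * from anywhere, to the given port */
        exp->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
        exp->tuple.dst.u.tcp.port = htons(port);
        exp->tuple.dst.protonum = IPPROTO_TCP;

        exp->mask.dst.ip = 0xFFFFFFFF;
        exp->mask.dst.u.tcp.port = 0xFFFF;
        exp->mask.dst.protonum = 0xFF;

        exp->expectfn = NULL;

        ret = ip_conntrack_expect_related(exp);
        ip_conntrack_expect_put(exp);
        return ret;
}
#endif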
1049
1050 /* Alter reply tuple (maybe alter helper). This is for NAT, and is
1051 implicitly racy: see __ip_conntrack_confirm */
1052 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1053 const struct ip_conntrack_tuple *newreply)
1054 {
1055 write_lock_bh(&ip_conntrack_lock);
1056 /* Should be unconfirmed, so not in hash table yet */
1057 IP_NF_ASSERT(!is_confirmed(conntrack));
1058
1059 DEBUGP("Altering reply tuple of %p to ", conntrack);
1060 DUMP_TUPLE(newreply);
1061
1062 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1063 if (!conntrack->master && conntrack->expecting == 0)
1064 conntrack->helper = __ip_conntrack_helper_find(newreply);
1065 write_unlock_bh(&ip_conntrack_lock);
1066 }
1067
1068 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1069 {
1070 BUG_ON(me->timeout == 0);
1071 write_lock_bh(&ip_conntrack_lock);
1072 list_prepend(&helpers, me);
1073 write_unlock_bh(&ip_conntrack_lock);
1074
1075 return 0;
1076 }
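/* Illustrative sketch (not built): how a helper module describes itself and
 * registers.  The helper is matched against the reply tuple (here: TCP with
 * source port 2121).  All names and numbers are hypothetical; the field
 * assignments follow the pattern of the in-tree helpers. */
#if 0
static int example_help(struct sk_buff **pskb,
                        struct ip_conntrack *ct,
                        enum ip_conntrack_info ctinfo)
{
        /* parse the payload, possibly set up expectations ... */
        return NF_ACCEPT;
}

static struct ip_conntrack_helper example_helper;

static int __init example_helper_init(void)
{
        example_helper.tuple.src.u.tcp.port = htons(2121);
        example_helper.tuple.dst.protonum = IPPROTO_TCP;
        example_helper.mask.src.u.tcp.port = 0xFFFF;
        example_helper.mask.dst.protonum = 0xFF;
        example_helper.max_expected = 1;
        example_helper.timeout = 5 * 60;        /* expectation timeout, seconds */
        example_helper.me = THIS_MODULE;
        example_helper.name = "example";
        example_helper.help = example_help;

        return ip_conntrack_helper_register(&example_helper);
}
#endif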
1077
1078 struct ip_conntrack_helper *
1079 __ip_conntrack_helper_find_byname(const char *name)
1080 {
1081 struct ip_conntrack_helper *h;
1082
1083 list_for_each_entry(h, &helpers, list) {
1084 if (!strcmp(h->name, name))
1085 return h;
1086 }
1087
1088 return NULL;
1089 }
1090
1091 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1092 const struct ip_conntrack_helper *me)
1093 {
1094 if (tuplehash_to_ctrack(i)->helper == me) {
1095 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1096 tuplehash_to_ctrack(i)->helper = NULL;
1097 }
1098 return 0;
1099 }
1100
1101 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1102 {
1103 unsigned int i;
1104 struct ip_conntrack_expect *exp, *tmp;
1105
1106 /* Need write lock here, to delete helper. */
1107 write_lock_bh(&ip_conntrack_lock);
1108 LIST_DELETE(&helpers, me);
1109
1110 /* Get rid of expectations */
1111 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1112 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1113 unlink_expect(exp);
1114 ip_conntrack_expect_put(exp);
1115 }
1116 }
1117 /* Get rid of expecteds, set helpers to NULL. */
1118 LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
1119 for (i = 0; i < ip_conntrack_htable_size; i++)
1120 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1121 struct ip_conntrack_tuple_hash *, me);
1122 write_unlock_bh(&ip_conntrack_lock);
1123
1124 	/* Someone could still be looking at the helper in a bh. */
1125 synchronize_net();
1126 }
1127
1128 static inline void ct_add_counters(struct ip_conntrack *ct,
1129 enum ip_conntrack_info ctinfo,
1130 const struct sk_buff *skb)
1131 {
1132 #ifdef CONFIG_IP_NF_CT_ACCT
1133 if (skb) {
1134 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1135 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1136 ntohs(skb->nh.iph->tot_len);
1137 }
1138 #endif
1139 }
1140
1141 /* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
1142 void ip_ct_refresh_acct(struct ip_conntrack *ct,
1143 enum ip_conntrack_info ctinfo,
1144 const struct sk_buff *skb,
1145 unsigned long extra_jiffies)
1146 {
1147 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1148
1149 /* If not in hash table, timer will not be active yet */
1150 if (!is_confirmed(ct)) {
1151 ct->timeout.expires = extra_jiffies;
1152 ct_add_counters(ct, ctinfo, skb);
1153 } else {
1154 write_lock_bh(&ip_conntrack_lock);
1155 /* Need del_timer for race avoidance (may already be dying). */
1156 if (del_timer(&ct->timeout)) {
1157 ct->timeout.expires = jiffies + extra_jiffies;
1158 add_timer(&ct->timeout);
1159 ip_conntrack_event_cache(IPCT_REFRESH, skb);
1160 }
1161 ct_add_counters(ct, ctinfo, skb);
1162 write_unlock_bh(&ip_conntrack_lock);
1163 }
1164 }
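/* Illustrative sketch (not built): per-protocol packet() handlers normally
 * finish by refreshing the timeout (and the accounting counters) through
 * ip_ct_refresh_acct().  The handler name and the 30 second timeout are
 * hypothetical. */
#if 0
static int example_packet(struct ip_conntrack *ct,
                          const struct sk_buff *skb,
                          enum ip_conntrack_info ctinfo)
{
        ip_ct_refresh_acct(ct, ctinfo, skb, 30 * HZ);
        return NF_ACCEPT;
}
#endif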
1165
1166 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1167 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1168 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1169 * in ip_conntrack_core, since we don't want the protocols to autoload
1170 * or depend on ctnetlink */
1171 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1172 const struct ip_conntrack_tuple *tuple)
1173 {
1174 NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1175 &tuple->src.u.tcp.port);
1176 NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1177 &tuple->dst.u.tcp.port);
1178 return 0;
1179
1180 nfattr_failure:
1181 return -1;
1182 }
1183
1184 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1185 struct ip_conntrack_tuple *t)
1186 {
1187 if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1188 return -EINVAL;
1189
1190 t->src.u.tcp.port =
1191 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1192 t->dst.u.tcp.port =
1193 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1194
1195 return 0;
1196 }
1197 #endif
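/* Illustrative sketch (not built): a port-based protocol tracker can reuse
 * the two generic conversion routines above when filling in its
 * ip_conntrack_protocol structure for ctnetlink.  The member names are
 * taken from the ctnetlink interface; the remaining members are omitted
 * here and the structure name is hypothetical. */
#if 0
static struct ip_conntrack_protocol example_port_proto = {
        .proto                  = IPPROTO_UDP,
        .name                   = "example",
        /* ... pkt_to_tuple, invert_tuple, packet, new, ... */
        .tuple_to_nfattr        = ip_ct_port_tuple_to_nfattr,
        .nfattr_to_tuple        = ip_ct_port_nfattr_to_tuple,
};
#endif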
1198
1199 /* Returns new sk_buff, or NULL */
1200 struct sk_buff *
1201 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1202 {
1203 skb_orphan(skb);
1204
1205 local_bh_disable();
1206 skb = ip_defrag(skb, user);
1207 local_bh_enable();
1208
1209 if (skb)
1210 ip_send_check(skb->nh.iph);
1211 return skb;
1212 }
1213
1214 /* Used by ipt_REJECT. */
1215 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1216 {
1217 struct ip_conntrack *ct;
1218 enum ip_conntrack_info ctinfo;
1219
1220 /* This ICMP is in reverse direction to the packet which caused it */
1221 ct = ip_conntrack_get(skb, &ctinfo);
1222
1223 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1224 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1225 else
1226 ctinfo = IP_CT_RELATED;
1227
1228 /* Attach to new skbuff, and increment count */
1229 nskb->nfct = &ct->ct_general;
1230 nskb->nfctinfo = ctinfo;
1231 nf_conntrack_get(nskb->nfct);
1232 }
1233
1234 static inline int
1235 do_iter(const struct ip_conntrack_tuple_hash *i,
1236 int (*iter)(struct ip_conntrack *i, void *data),
1237 void *data)
1238 {
1239 return iter(tuplehash_to_ctrack(i), data);
1240 }
1241
1242 /* Bring out ya dead! */
1243 static struct ip_conntrack_tuple_hash *
1244 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1245 void *data, unsigned int *bucket)
1246 {
1247 struct ip_conntrack_tuple_hash *h = NULL;
1248
1249 write_lock_bh(&ip_conntrack_lock);
1250 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1251 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1252 struct ip_conntrack_tuple_hash *, iter, data);
1253 if (h)
1254 break;
1255 }
1256 if (!h)
1257 h = LIST_FIND_W(&unconfirmed, do_iter,
1258 struct ip_conntrack_tuple_hash *, iter, data);
1259 if (h)
1260 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1261 write_unlock_bh(&ip_conntrack_lock);
1262
1263 return h;
1264 }
1265
1266 void
1267 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1268 {
1269 struct ip_conntrack_tuple_hash *h;
1270 unsigned int bucket = 0;
1271
1272 while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1273 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1274 		/* Time to push up daisies... */
1275 if (del_timer(&ct->timeout))
1276 death_by_timeout((unsigned long)ct);
1277 /* ... else the timer will get him soon. */
1278
1279 ip_conntrack_put(ct);
1280 }
1281
1282 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
1283 {
1284 /* we need to deliver all cached events in order to drop
1285 * the reference counts */
1286 int cpu;
1287 for_each_cpu(cpu) {
1288 struct ip_conntrack_ecache *ecache =
1289 &per_cpu(ip_conntrack_ecache, cpu);
1290 if (ecache->ct) {
1291 __ip_ct_deliver_cached_events(ecache);
1292 ip_conntrack_put(ecache->ct);
1293 ecache->ct = NULL;
1294 }
1295 }
1296 }
1297 #endif
1298 }
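/* Illustrative sketch (not built): flushing a subset of the table with a
 * caller-supplied predicate; ip_ct_iterate_cleanup() kills every entry for
 * which the predicate returns non-zero.  The predicate and wrapper below
 * are hypothetical. */
#if 0
static int example_kill_by_saddr(struct ip_conntrack *ct, void *data)
{
        u_int32_t saddr = *(u_int32_t *)data;

        return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip == saddr;
}

static void example_flush_by_saddr(u_int32_t saddr)
{
        ip_ct_iterate_cleanup(example_kill_by_saddr, &saddr);
}
#endif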
1299
1300 /* Fast function for those who don't want to parse /proc (and I don't
1301 blame them). */
1302 /* Reversing the socket's dst/src point of view gives us the reply
1303 mapping. */
1304 static int
1305 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1306 {
1307 struct inet_sock *inet = inet_sk(sk);
1308 struct ip_conntrack_tuple_hash *h;
1309 struct ip_conntrack_tuple tuple;
1310
1311 IP_CT_TUPLE_U_BLANK(&tuple);
1312 tuple.src.ip = inet->rcv_saddr;
1313 tuple.src.u.tcp.port = inet->sport;
1314 tuple.dst.ip = inet->daddr;
1315 tuple.dst.u.tcp.port = inet->dport;
1316 tuple.dst.protonum = IPPROTO_TCP;
1317
1318 /* We only do TCP at the moment: is there a better way? */
1319 if (strcmp(sk->sk_prot->name, "TCP")) {
1320 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1321 return -ENOPROTOOPT;
1322 }
1323
1324 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1325 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1326 *len, sizeof(struct sockaddr_in));
1327 return -EINVAL;
1328 }
1329
1330 h = ip_conntrack_find_get(&tuple, NULL);
1331 if (h) {
1332 struct sockaddr_in sin;
1333 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1334
1335 sin.sin_family = AF_INET;
1336 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1337 .tuple.dst.u.tcp.port;
1338 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1339 .tuple.dst.ip;
1340
1341 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1342 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1343 ip_conntrack_put(ct);
1344 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1345 return -EFAULT;
1346 else
1347 return 0;
1348 }
1349 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1350 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1351 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1352 return -ENOENT;
1353 }
1354
1355 static struct nf_sockopt_ops so_getorigdst = {
1356 .pf = PF_INET,
1357 .get_optmin = SO_ORIGINAL_DST,
1358 .get_optmax = SO_ORIGINAL_DST+1,
1359 .get = &getorigdst,
1360 };
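/* Illustrative sketch (not built): the user-space side of the
 * SO_ORIGINAL_DST sockopt implemented above, as a transparent proxy would
 * use it on an accepted TCP socket.  This is user-space C, shown only to
 * document the interface; error handling is left to the caller. */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/netfilter_ipv4.h>       /* SO_ORIGINAL_DST */

static int get_original_dst(int fd, struct sockaddr_in *dst)
{
        socklen_t len = sizeof(*dst);

        /* returns 0 on success; dst then holds the pre-NAT destination */
        return getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, dst, &len);
}
#endif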
1361
1362 static int kill_all(struct ip_conntrack *i, void *data)
1363 {
1364 return 1;
1365 }
1366
1367 static void free_conntrack_hash(void)
1368 {
1369 if (ip_conntrack_vmalloc)
1370 vfree(ip_conntrack_hash);
1371 else
1372 free_pages((unsigned long)ip_conntrack_hash,
1373 get_order(sizeof(struct list_head)
1374 * ip_conntrack_htable_size));
1375 }
1376
1377 void ip_conntrack_flush()
1378 {
1379 /* This makes sure all current packets have passed through
1380 netfilter framework. Roll on, two-stage module
1381 delete... */
1382 synchronize_net();
1383
1384 i_see_dead_people:
1385 ip_ct_iterate_cleanup(kill_all, NULL);
1386 if (atomic_read(&ip_conntrack_count) != 0) {
1387 schedule();
1388 goto i_see_dead_people;
1389 }
1390 /* wait until all references to ip_conntrack_untracked are dropped */
1391 while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1392 schedule();
1393 }
1394
1395 /* Mishearing the voices in his head, our hero wonders how he's
1396 supposed to kill the mall. */
1397 void ip_conntrack_cleanup(void)
1398 {
1399 ip_ct_attach = NULL;
1400 ip_conntrack_flush();
1401 kmem_cache_destroy(ip_conntrack_cachep);
1402 kmem_cache_destroy(ip_conntrack_expect_cachep);
1403 free_conntrack_hash();
1404 nf_unregister_sockopt(&so_getorigdst);
1405 }
1406
1407 static int hashsize;
1408 module_param(hashsize, int, 0400);
1409
1410 int __init ip_conntrack_init(void)
1411 {
1412 unsigned int i;
1413 int ret;
1414
1415 	/* Idea from tcp.c: use 1/16384 of memory. On i386, a 32MB
1416 	 * machine has 256 buckets; machines with >= 1GB have 8192 buckets. */
1417 if (hashsize) {
1418 ip_conntrack_htable_size = hashsize;
1419 } else {
1420 ip_conntrack_htable_size
1421 = (((num_physpages << PAGE_SHIFT) / 16384)
1422 / sizeof(struct list_head));
1423 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1424 ip_conntrack_htable_size = 8192;
1425 if (ip_conntrack_htable_size < 16)
1426 ip_conntrack_htable_size = 16;
1427 }
1428 ip_conntrack_max = 8 * ip_conntrack_htable_size;
1429
1430 printk("ip_conntrack version %s (%u buckets, %d max)"
1431 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1432 ip_conntrack_htable_size, ip_conntrack_max,
1433 sizeof(struct ip_conntrack));
1434
1435 ret = nf_register_sockopt(&so_getorigdst);
1436 if (ret != 0) {
1437 printk(KERN_ERR "Unable to register netfilter socket option\n");
1438 return ret;
1439 }
1440
1441 	/* AK: the hash table is twice as big as needed because it
1442 	   uses list_head. It would be much nicer for caches to use a
1443 	   single-pointer list head here. */
1444 ip_conntrack_vmalloc = 0;
1445 ip_conntrack_hash
1446 =(void*)__get_free_pages(GFP_KERNEL,
1447 get_order(sizeof(struct list_head)
1448 *ip_conntrack_htable_size));
1449 if (!ip_conntrack_hash) {
1450 ip_conntrack_vmalloc = 1;
1451 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1452 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1453 * ip_conntrack_htable_size);
1454 }
1455 if (!ip_conntrack_hash) {
1456 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1457 goto err_unreg_sockopt;
1458 }
1459
1460 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1461 sizeof(struct ip_conntrack), 0,
1462 0, NULL, NULL);
1463 if (!ip_conntrack_cachep) {
1464 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1465 goto err_free_hash;
1466 }
1467
1468 ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1469 sizeof(struct ip_conntrack_expect),
1470 0, 0, NULL, NULL);
1471 if (!ip_conntrack_expect_cachep) {
1472 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1473 goto err_free_conntrack_slab;
1474 }
1475
1476 /* Don't NEED lock here, but good form anyway. */
1477 write_lock_bh(&ip_conntrack_lock);
1478 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1479 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1480 /* Sew in builtin protocols. */
1481 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1482 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1483 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1484 write_unlock_bh(&ip_conntrack_lock);
1485
1486 for (i = 0; i < ip_conntrack_htable_size; i++)
1487 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1488
1489 /* For use by ipt_REJECT */
1490 ip_ct_attach = ip_conntrack_attach;
1491
1492 /* Set up fake conntrack:
1493 - to never be deleted, not in any hashes */
1494 atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1495 	/*  - and make it look like a confirmed connection */
1496 set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1497
1498 return ret;
1499
1500 err_free_conntrack_slab:
1501 kmem_cache_destroy(ip_conntrack_cachep);
1502 err_free_hash:
1503 free_conntrack_hash();
1504 err_unreg_sockopt:
1505 nf_unregister_sockopt(&so_getorigdst);
1506
1507 return -ENOMEM;
1508 }