net/ipv4/netfilter/ip_conntrack_core.c
1 /* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 *
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
18 */
19
20 #include <linux/config.h>
21 #include <linux/types.h>
22 #include <linux/icmp.h>
23 #include <linux/ip.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/module.h>
27 #include <linux/skbuff.h>
28 #include <linux/proc_fs.h>
29 #include <linux/vmalloc.h>
30 #include <net/checksum.h>
31 #include <net/ip.h>
32 #include <linux/stddef.h>
33 #include <linux/sysctl.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/jhash.h>
37 #include <linux/err.h>
38 #include <linux/percpu.h>
39 #include <linux/moduleparam.h>
40 #include <linux/notifier.h>
41
42 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
43 registrations, conntrack timers */
44 #define ASSERT_READ_LOCK(x)
45 #define ASSERT_WRITE_LOCK(x)
46
47 #include <linux/netfilter_ipv4/ip_conntrack.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
50 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
51 #include <linux/netfilter_ipv4/listhelp.h>
52
53 #define IP_CONNTRACK_VERSION "2.4"
54
55 #if 0
56 #define DEBUGP printk
57 #else
58 #define DEBUGP(format, args...)
59 #endif
60
61 DEFINE_RWLOCK(ip_conntrack_lock);
62
63 /* ip_conntrack_standalone needs this */
64 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
65
66 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
67 LIST_HEAD(ip_conntrack_expect_list);
68 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
69 static LIST_HEAD(helpers);
70 unsigned int ip_conntrack_htable_size = 0;
71 int ip_conntrack_max;
72 struct list_head *ip_conntrack_hash;
73 static kmem_cache_t *ip_conntrack_cachep __read_mostly;
74 static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
75 struct ip_conntrack ip_conntrack_untracked;
76 unsigned int ip_ct_log_invalid;
77 static LIST_HEAD(unconfirmed);
78 static int ip_conntrack_vmalloc;
79
80 static unsigned int ip_conntrack_next_id;
81 static unsigned int ip_conntrack_expect_next_id;
82 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
83 ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
84 ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
85
86 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
87
88 /* deliver cached events and clear cache entry - must be called with locally
89 * disabled softirqs */
90 static inline void
91 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
92 {
93 DEBUGP("ecache: delivering events for %p\n", ecache->ct);
94 if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
95 atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
96 ecache->ct);
97 ecache->events = 0;
98 ip_conntrack_put(ecache->ct);
99 ecache->ct = NULL;
100 }
101
102 /* Deliver all cached events for a particular conntrack. This is called
103 * by code prior to async packet handling or freeing the skb */
104 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
105 {
106 struct ip_conntrack_ecache *ecache;
107
108 local_bh_disable();
109 ecache = &__get_cpu_var(ip_conntrack_ecache);
110 if (ecache->ct == ct)
111 __ip_ct_deliver_cached_events(ecache);
112 local_bh_enable();
113 }
114
115 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
116 {
117 struct ip_conntrack_ecache *ecache;
118
119 /* take care of delivering potentially old events */
120 ecache = &__get_cpu_var(ip_conntrack_ecache);
121 BUG_ON(ecache->ct == ct);
122 if (ecache->ct)
123 __ip_ct_deliver_cached_events(ecache);
124 /* initialize for this conntrack/packet */
125 ecache->ct = ct;
126 nf_conntrack_get(&ct->ct_general);
127 }
128
129 /* flush the event cache - touches other CPUs' data and must not be called while
130 * packets are still passing through the code */
131 static void ip_ct_event_cache_flush(void)
132 {
133 struct ip_conntrack_ecache *ecache;
134 int cpu;
135
136 for_each_possible_cpu(cpu) {
137 ecache = &per_cpu(ip_conntrack_ecache, cpu);
138 if (ecache->ct)
139 ip_conntrack_put(ecache->ct);
140 }
141 }
142 #else
143 static inline void ip_ct_event_cache_flush(void) {}
144 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
145
146 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
147
148 static int ip_conntrack_hash_rnd_initted;
149 static unsigned int ip_conntrack_hash_rnd;
150
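/* Hash a tuple into the table: mix the source address, the destination
 * address xor'ed with the protocol number, and both layer-4 port/id words
 * with a boot-time random seed, then reduce modulo the table size. The
 * random seed keeps the bucket layout unpredictable, so remote hosts cannot
 * deliberately force hash collisions. */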
151 static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
152 unsigned int size, unsigned int rnd)
153 {
154 return (jhash_3words(tuple->src.ip,
155 (tuple->dst.ip ^ tuple->dst.protonum),
156 (tuple->src.u.all | (tuple->dst.u.all << 16)),
157 rnd) % size);
158 }
159
160 static u_int32_t
161 hash_conntrack(const struct ip_conntrack_tuple *tuple)
162 {
163 return __hash_conntrack(tuple, ip_conntrack_htable_size,
164 ip_conntrack_hash_rnd);
165 }
166
167 int
168 ip_ct_get_tuple(const struct iphdr *iph,
169 const struct sk_buff *skb,
170 unsigned int dataoff,
171 struct ip_conntrack_tuple *tuple,
172 const struct ip_conntrack_protocol *protocol)
173 {
174 /* Should never happen: conntrack only sees defragmented packets. */
175 if (iph->frag_off & htons(IP_OFFSET)) {
176 printk("ip_conntrack_core: Frag of proto %u.\n",
177 iph->protocol);
178 return 0;
179 }
180
181 tuple->src.ip = iph->saddr;
182 tuple->dst.ip = iph->daddr;
183 tuple->dst.protonum = iph->protocol;
184 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
185
186 return protocol->pkt_to_tuple(skb, dataoff, tuple);
187 }
188
189 int
190 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
191 const struct ip_conntrack_tuple *orig,
192 const struct ip_conntrack_protocol *protocol)
193 {
194 inverse->src.ip = orig->dst.ip;
195 inverse->dst.ip = orig->src.ip;
196 inverse->dst.protonum = orig->dst.protonum;
197 inverse->dst.dir = !orig->dst.dir;
198
199 return protocol->invert_tuple(inverse, orig);
200 }
201
202
203 /* ip_conntrack_expect helper functions */
204 void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
205 {
206 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
207 IP_NF_ASSERT(!timer_pending(&exp->timeout));
208 list_del(&exp->list);
209 CONNTRACK_STAT_INC(expect_delete);
210 exp->master->expecting--;
211 ip_conntrack_expect_put(exp);
212 }
213
214 static void expectation_timed_out(unsigned long ul_expect)
215 {
216 struct ip_conntrack_expect *exp = (void *)ul_expect;
217
218 write_lock_bh(&ip_conntrack_lock);
219 ip_ct_unlink_expect(exp);
220 write_unlock_bh(&ip_conntrack_lock);
221 ip_conntrack_expect_put(exp);
222 }
223
224 struct ip_conntrack_expect *
225 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
226 {
227 struct ip_conntrack_expect *i;
228
229 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
230 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
231 atomic_inc(&i->use);
232 return i;
233 }
234 }
235 return NULL;
236 }
237
238 /* Just find an expectation corresponding to a tuple. */
239 struct ip_conntrack_expect *
240 ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
241 {
242 struct ip_conntrack_expect *i;
243
244 read_lock_bh(&ip_conntrack_lock);
245 i = __ip_conntrack_expect_find(tuple);
246 read_unlock_bh(&ip_conntrack_lock);
247
248 return i;
249 }
250
251 /* If an expectation for this connection is found, it is deleted from the
252 * global list and then returned. */
253 static struct ip_conntrack_expect *
254 find_expectation(const struct ip_conntrack_tuple *tuple)
255 {
256 struct ip_conntrack_expect *i;
257
258 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
259 /* If master is not in hash table yet (ie. packet hasn't left
260 this machine yet), how can other end know about expected?
261 Hence these are not the droids you are looking for (if
262 master ct never got confirmed, we'd hold a reference to it
263 and weird things would happen to future packets). */
264 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
265 && is_confirmed(i->master)) {
266 if (i->flags & IP_CT_EXPECT_PERMANENT) {
267 atomic_inc(&i->use);
268 return i;
269 } else if (del_timer(&i->timeout)) {
270 ip_ct_unlink_expect(i);
271 return i;
272 }
273 }
274 }
275 return NULL;
276 }
277
278 /* delete all expectations for this conntrack */
279 void ip_ct_remove_expectations(struct ip_conntrack *ct)
280 {
281 struct ip_conntrack_expect *i, *tmp;
282
283 /* Optimization: most connections never expect any others. */
284 if (ct->expecting == 0)
285 return;
286
287 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
288 if (i->master == ct && del_timer(&i->timeout)) {
289 ip_ct_unlink_expect(i);
290 ip_conntrack_expect_put(i);
291 }
292 }
293 }
294
295 static void
296 clean_from_lists(struct ip_conntrack *ct)
297 {
298 unsigned int ho, hr;
299
300 DEBUGP("clean_from_lists(%p)\n", ct);
301 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
302
303 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
304 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
305 LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
306 LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
307
308 /* Destroy all pending expectations */
309 ip_ct_remove_expectations(ct);
310 }
311
312 static void
313 destroy_conntrack(struct nf_conntrack *nfct)
314 {
315 struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
316 struct ip_conntrack_protocol *proto;
317
318 DEBUGP("destroy_conntrack(%p)\n", ct);
319 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
320 IP_NF_ASSERT(!timer_pending(&ct->timeout));
321
322 ip_conntrack_event(IPCT_DESTROY, ct);
323 set_bit(IPS_DYING_BIT, &ct->status);
324
325 /* To make sure we don't get any weird locking issues here:
326 * destroy_conntrack() MUST NOT be called with a write lock
327 * to ip_conntrack_lock!!! -HW */
328 proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
329 if (proto && proto->destroy)
330 proto->destroy(ct);
331
332 if (ip_conntrack_destroyed)
333 ip_conntrack_destroyed(ct);
334
335 write_lock_bh(&ip_conntrack_lock);
336 /* Expectations will have been removed in clean_from_lists,
337 * except TFTP can create an expectation on the first packet,
338 * before connection is in the list, so we need to clean here,
339 * too. */
340 ip_ct_remove_expectations(ct);
341
342 /* We overload first tuple to link into unconfirmed list. */
343 if (!is_confirmed(ct)) {
344 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
345 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
346 }
347
348 CONNTRACK_STAT_INC(delete);
349 write_unlock_bh(&ip_conntrack_lock);
350
351 if (ct->master)
352 ip_conntrack_put(ct->master);
353
354 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
355 ip_conntrack_free(ct);
356 }
357
358 static void death_by_timeout(unsigned long ul_conntrack)
359 {
360 struct ip_conntrack *ct = (void *)ul_conntrack;
361
362 write_lock_bh(&ip_conntrack_lock);
363 /* Inside lock so preempt is disabled on module removal path.
364 * Otherwise we can get spurious warnings. */
365 CONNTRACK_STAT_INC(delete_list);
366 clean_from_lists(ct);
367 write_unlock_bh(&ip_conntrack_lock);
368 ip_conntrack_put(ct);
369 }
370
371 static inline int
372 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
373 const struct ip_conntrack_tuple *tuple,
374 const struct ip_conntrack *ignored_conntrack)
375 {
376 ASSERT_READ_LOCK(&ip_conntrack_lock);
377 return tuplehash_to_ctrack(i) != ignored_conntrack
378 && ip_ct_tuple_equal(tuple, &i->tuple);
379 }
380
381 struct ip_conntrack_tuple_hash *
382 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
383 const struct ip_conntrack *ignored_conntrack)
384 {
385 struct ip_conntrack_tuple_hash *h;
386 unsigned int hash = hash_conntrack(tuple);
387
388 ASSERT_READ_LOCK(&ip_conntrack_lock);
389 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
390 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
391 CONNTRACK_STAT_INC(found);
392 return h;
393 }
394 CONNTRACK_STAT_INC(searched);
395 }
396
397 return NULL;
398 }
399
400 /* Find a connection corresponding to a tuple. */
401 struct ip_conntrack_tuple_hash *
402 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
403 const struct ip_conntrack *ignored_conntrack)
404 {
405 struct ip_conntrack_tuple_hash *h;
406
407 read_lock_bh(&ip_conntrack_lock);
408 h = __ip_conntrack_find(tuple, ignored_conntrack);
409 if (h)
410 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
411 read_unlock_bh(&ip_conntrack_lock);
412
413 return h;
414 }
415
416 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
417 unsigned int hash,
418 unsigned int repl_hash)
419 {
420 ct->id = ++ip_conntrack_next_id;
421 list_prepend(&ip_conntrack_hash[hash],
422 &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
423 list_prepend(&ip_conntrack_hash[repl_hash],
424 &ct->tuplehash[IP_CT_DIR_REPLY].list);
425 }
426
427 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
428 {
429 unsigned int hash, repl_hash;
430
431 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
432 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
433
434 write_lock_bh(&ip_conntrack_lock);
435 __ip_conntrack_hash_insert(ct, hash, repl_hash);
436 write_unlock_bh(&ip_conntrack_lock);
437 }
438
439 /* Confirm a connection given skb; places it in hash table */
440 int
441 __ip_conntrack_confirm(struct sk_buff **pskb)
442 {
443 unsigned int hash, repl_hash;
444 struct ip_conntrack *ct;
445 enum ip_conntrack_info ctinfo;
446
447 ct = ip_conntrack_get(*pskb, &ctinfo);
448
449 /* ipt_REJECT uses ip_conntrack_attach to attach related
450 ICMP/TCP RST packets in other direction. Actual packet
451 which created connection will be IP_CT_NEW or for an
452 expected connection, IP_CT_RELATED. */
453 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
454 return NF_ACCEPT;
455
456 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
457 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
458
459 /* We're not in hash table, and we refuse to set up related
460 connections for unconfirmed conns. But packet copies and
461 REJECT will give spurious warnings here. */
462 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
463
464 /* No external references means no one else could have
465 confirmed us. */
466 IP_NF_ASSERT(!is_confirmed(ct));
467 DEBUGP("Confirming conntrack %p\n", ct);
468
469 write_lock_bh(&ip_conntrack_lock);
470
471 /* See if there's one in the list already, including reverse:
472 NAT could have grabbed it without realizing, since we're
473 not in the hash. If there is, we lost race. */
474 if (!LIST_FIND(&ip_conntrack_hash[hash],
475 conntrack_tuple_cmp,
476 struct ip_conntrack_tuple_hash *,
477 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
478 && !LIST_FIND(&ip_conntrack_hash[repl_hash],
479 conntrack_tuple_cmp,
480 struct ip_conntrack_tuple_hash *,
481 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
482 /* Remove from unconfirmed list */
483 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
484
485 __ip_conntrack_hash_insert(ct, hash, repl_hash);
486 /* Timer relative to confirmation time, not original
487 setting time, otherwise we'd get timer wrap in
488 weird delay cases. */
489 ct->timeout.expires += jiffies;
490 add_timer(&ct->timeout);
491 atomic_inc(&ct->ct_general.use);
492 set_bit(IPS_CONFIRMED_BIT, &ct->status);
493 CONNTRACK_STAT_INC(insert);
494 write_unlock_bh(&ip_conntrack_lock);
495 if (ct->helper)
496 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
497 #ifdef CONFIG_IP_NF_NAT_NEEDED
498 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
499 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
500 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
501 #endif
502 ip_conntrack_event_cache(master_ct(ct) ?
503 IPCT_RELATED : IPCT_NEW, *pskb);
504
505 return NF_ACCEPT;
506 }
507
508 CONNTRACK_STAT_INC(insert_failed);
509 write_unlock_bh(&ip_conntrack_lock);
510
511 return NF_DROP;
512 }
513
514 /* Returns true if a connection corresponds to the tuple (required
515 for NAT). */
516 int
517 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
518 const struct ip_conntrack *ignored_conntrack)
519 {
520 struct ip_conntrack_tuple_hash *h;
521
522 read_lock_bh(&ip_conntrack_lock);
523 h = __ip_conntrack_find(tuple, ignored_conntrack);
524 read_unlock_bh(&ip_conntrack_lock);
525
526 return h != NULL;
527 }
528
529 /* There's a small race here where we may free a just-assured
530 connection. Too bad: we're in trouble anyway. */
531 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
532 {
533 return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
534 }
535
536 static int early_drop(struct list_head *chain)
537 {
538 /* Traverse backwards: gives us oldest, which is roughly LRU */
539 struct ip_conntrack_tuple_hash *h;
540 struct ip_conntrack *ct = NULL;
541 int dropped = 0;
542
543 read_lock_bh(&ip_conntrack_lock);
544 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
545 if (h) {
546 ct = tuplehash_to_ctrack(h);
547 atomic_inc(&ct->ct_general.use);
548 }
549 read_unlock_bh(&ip_conntrack_lock);
550
551 if (!ct)
552 return dropped;
553
554 if (del_timer(&ct->timeout)) {
555 death_by_timeout((unsigned long)ct);
556 dropped = 1;
557 CONNTRACK_STAT_INC(early_drop);
558 }
559 ip_conntrack_put(ct);
560 return dropped;
561 }
562
563 static inline int helper_cmp(const struct ip_conntrack_helper *i,
564 const struct ip_conntrack_tuple *rtuple)
565 {
566 return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
567 }
568
569 static struct ip_conntrack_helper *
570 __ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple)
571 {
572 return LIST_FIND(&helpers, helper_cmp,
573 struct ip_conntrack_helper *,
574 tuple);
575 }
576
577 struct ip_conntrack_helper *
578 ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple)
579 {
580 struct ip_conntrack_helper *helper;
581
582 /* need ip_conntrack_lock to assure that helper exists until
583 * try_module_get() is called */
584 read_lock_bh(&ip_conntrack_lock);
585
586 helper = __ip_conntrack_helper_find(tuple);
587 if (helper) {
588 /* need to increase module usage count to assure helper will
589 * not go away while the caller is e.g. busy putting a
590 * conntrack in the hash that uses the helper */
591 if (!try_module_get(helper->me))
592 helper = NULL;
593 }
594
595 read_unlock_bh(&ip_conntrack_lock);
596
597 return helper;
598 }
599
600 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
601 {
602 module_put(helper->me);
603 }
604
605 struct ip_conntrack_protocol *
606 __ip_conntrack_proto_find(u_int8_t protocol)
607 {
608 return ip_ct_protos[protocol];
609 }
610
611 /* this is guaranteed to always return a valid protocol helper, since
612 * it falls back to generic_protocol */
613 struct ip_conntrack_protocol *
614 ip_conntrack_proto_find_get(u_int8_t protocol)
615 {
616 struct ip_conntrack_protocol *p;
617
618 preempt_disable();
619 p = __ip_conntrack_proto_find(protocol);
620 if (p) {
621 if (!try_module_get(p->me))
622 p = &ip_conntrack_generic_protocol;
623 }
624 preempt_enable();
625
626 return p;
627 }
628
629 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
630 {
631 module_put(p->me);
632 }
633
634 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
635 struct ip_conntrack_tuple *repl)
636 {
637 struct ip_conntrack *conntrack;
638
639 if (!ip_conntrack_hash_rnd_initted) {
640 get_random_bytes(&ip_conntrack_hash_rnd, 4);
641 ip_conntrack_hash_rnd_initted = 1;
642 }
643
644 if (ip_conntrack_max
645 && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
646 unsigned int hash = hash_conntrack(orig);
647 /* Try dropping from this hash chain. */
648 if (!early_drop(&ip_conntrack_hash[hash])) {
649 if (net_ratelimit())
650 printk(KERN_WARNING
651 "ip_conntrack: table full, dropping"
652 " packet.\n");
653 return ERR_PTR(-ENOMEM);
654 }
655 }
656
657 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
658 if (!conntrack) {
659 DEBUGP("Can't allocate conntrack.\n");
660 return ERR_PTR(-ENOMEM);
661 }
662
663 memset(conntrack, 0, sizeof(*conntrack));
664 atomic_set(&conntrack->ct_general.use, 1);
665 conntrack->ct_general.destroy = destroy_conntrack;
666 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
667 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
668 /* Don't set timer yet: wait for confirmation */
669 init_timer(&conntrack->timeout);
670 conntrack->timeout.data = (unsigned long)conntrack;
671 conntrack->timeout.function = death_by_timeout;
672
673 atomic_inc(&ip_conntrack_count);
674
675 return conntrack;
676 }
677
678 void
679 ip_conntrack_free(struct ip_conntrack *conntrack)
680 {
681 atomic_dec(&ip_conntrack_count);
682 kmem_cache_free(ip_conntrack_cachep, conntrack);
683 }
684
685 /* Allocate a new conntrack: we return -ENOMEM if classification
686 * failed due to stress. Otherwise it really is unclassifiable */
687 static struct ip_conntrack_tuple_hash *
688 init_conntrack(struct ip_conntrack_tuple *tuple,
689 struct ip_conntrack_protocol *protocol,
690 struct sk_buff *skb)
691 {
692 struct ip_conntrack *conntrack;
693 struct ip_conntrack_tuple repl_tuple;
694 struct ip_conntrack_expect *exp;
695
696 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
697 DEBUGP("Can't invert tuple.\n");
698 return NULL;
699 }
700
701 conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
702 if (conntrack == NULL || IS_ERR(conntrack))
703 return (struct ip_conntrack_tuple_hash *)conntrack;
704
705 if (!protocol->new(conntrack, skb)) {
706 ip_conntrack_free(conntrack);
707 return NULL;
708 }
709
710 write_lock_bh(&ip_conntrack_lock);
711 exp = find_expectation(tuple);
712
713 if (exp) {
714 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
715 conntrack, exp);
716 /* Welcome, Mr. Bond. We've been expecting you... */
717 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
718 conntrack->master = exp->master;
719 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
720 conntrack->mark = exp->master->mark;
721 #endif
722 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
723 defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
724 /* this is ugly, but there is no other place where to put it */
725 conntrack->nat.masq_index = exp->master->nat.masq_index;
726 #endif
727 #ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
728 conntrack->secmark = exp->master->secmark;
729 #endif
730 nf_conntrack_get(&conntrack->master->ct_general);
731 CONNTRACK_STAT_INC(expect_new);
732 } else {
733 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
734
735 CONNTRACK_STAT_INC(new);
736 }
737
738 /* Overload tuple linked list to put us in unconfirmed list. */
739 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
740
741 write_unlock_bh(&ip_conntrack_lock);
742
743 if (exp) {
744 if (exp->expectfn)
745 exp->expectfn(conntrack, exp);
746 ip_conntrack_expect_put(exp);
747 }
748
749 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
750 }
751
752 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
753 static inline struct ip_conntrack *
754 resolve_normal_ct(struct sk_buff *skb,
755 struct ip_conntrack_protocol *proto,
756 int *set_reply,
757 unsigned int hooknum,
758 enum ip_conntrack_info *ctinfo)
759 {
760 struct ip_conntrack_tuple tuple;
761 struct ip_conntrack_tuple_hash *h;
762 struct ip_conntrack *ct;
763
764 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
765
766 if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
767 &tuple,proto))
768 return NULL;
769
770 /* look for tuple match */
771 h = ip_conntrack_find_get(&tuple, NULL);
772 if (!h) {
773 h = init_conntrack(&tuple, proto, skb);
774 if (!h)
775 return NULL;
776 if (IS_ERR(h))
777 return (void *)h;
778 }
779 ct = tuplehash_to_ctrack(h);
780
781 /* It exists; we have (non-exclusive) reference. */
782 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
783 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
784 /* Please set reply bit if this packet OK */
785 *set_reply = 1;
786 } else {
787 /* Once we've had two way comms, always ESTABLISHED. */
788 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
789 DEBUGP("ip_conntrack_in: normal packet for %p\n",
790 ct);
791 *ctinfo = IP_CT_ESTABLISHED;
792 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
793 DEBUGP("ip_conntrack_in: related packet for %p\n",
794 ct);
795 *ctinfo = IP_CT_RELATED;
796 } else {
797 DEBUGP("ip_conntrack_in: new packet for %p\n",
798 ct);
799 *ctinfo = IP_CT_NEW;
800 }
801 *set_reply = 0;
802 }
803 skb->nfct = &ct->ct_general;
804 skb->nfctinfo = *ctinfo;
805 return ct;
806 }
807
808 /* Netfilter hook itself. */
809 unsigned int ip_conntrack_in(unsigned int hooknum,
810 struct sk_buff **pskb,
811 const struct net_device *in,
812 const struct net_device *out,
813 int (*okfn)(struct sk_buff *))
814 {
815 struct ip_conntrack *ct;
816 enum ip_conntrack_info ctinfo;
817 struct ip_conntrack_protocol *proto;
818 int set_reply = 0;
819 int ret;
820
821 /* Previously seen (loopback or untracked)? Ignore. */
822 if ((*pskb)->nfct) {
823 CONNTRACK_STAT_INC(ignore);
824 return NF_ACCEPT;
825 }
826
827 /* Should never happen: conntrack only sees defragmented packets. */
828 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
829 if (net_ratelimit()) {
830 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
831 (*pskb)->nh.iph->protocol, hooknum);
832 }
833 return NF_DROP;
834 }
835
836 /* Doesn't cover locally-generated broadcast, so not worth it. */
837 #if 0
838 /* Ignore broadcast: no `connection'. */
839 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
840 printk("Broadcast packet!\n");
841 return NF_ACCEPT;
842 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
843 == htonl(0x000000FF)) {
844 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
845 NIPQUAD((*pskb)->nh.iph->saddr),
846 NIPQUAD((*pskb)->nh.iph->daddr),
847 (*pskb)->sk, (*pskb)->pkt_type);
848 }
849 #endif
850
851 proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
852
853 /* It may be a special packet, error, unclean...
854 * The inverse of the return code tells the netfilter
855 * core what to do with the packet. */
856 if (proto->error != NULL
857 && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
858 CONNTRACK_STAT_INC(error);
859 CONNTRACK_STAT_INC(invalid);
860 return -ret;
861 }
862
863 if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
864 /* Not valid part of a connection */
865 CONNTRACK_STAT_INC(invalid);
866 return NF_ACCEPT;
867 }
868
869 if (IS_ERR(ct)) {
870 /* Too stressed to deal. */
871 CONNTRACK_STAT_INC(drop);
872 return NF_DROP;
873 }
874
875 IP_NF_ASSERT((*pskb)->nfct);
876
877 ret = proto->packet(ct, *pskb, ctinfo);
878 if (ret < 0) {
879 /* Invalid: inverse of the return code tells
880 * the netfilter core what to do*/
881 nf_conntrack_put((*pskb)->nfct);
882 (*pskb)->nfct = NULL;
883 CONNTRACK_STAT_INC(invalid);
884 return -ret;
885 }
886
887 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
888 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
889
890 return ret;
891 }
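
/* For orientation only: ip_conntrack_in is not registered from this file but
 * from ip_conntrack_standalone.c. The sketch below is kept out of the build
 * with #if 0 and only shows roughly what such a registration looks like; the
 * ops name is made up for illustration. */
#if 0
static struct nf_hook_ops example_conntrack_in_ops = {
        .hook           = ip_conntrack_in,
        .owner          = THIS_MODULE,
        .pf             = PF_INET,
        .hooknum        = NF_IP_PRE_ROUTING,
        .priority       = NF_IP_PRI_CONNTRACK,
};
/* ... and in module init: nf_register_hook(&example_conntrack_in_ops); */
#endif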
892
893 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
894 const struct ip_conntrack_tuple *orig)
895 {
896 return ip_ct_invert_tuple(inverse, orig,
897 __ip_conntrack_proto_find(orig->dst.protonum));
898 }
899
900 /* Would two expected things clash? */
901 static inline int expect_clash(const struct ip_conntrack_expect *a,
902 const struct ip_conntrack_expect *b)
903 {
904 /* If the parts covered by the intersection of the masks are equal,
905 the two expectations clash. */
906 struct ip_conntrack_tuple intersect_mask
907 = { { a->mask.src.ip & b->mask.src.ip,
908 { a->mask.src.u.all & b->mask.src.u.all } },
909 { a->mask.dst.ip & b->mask.dst.ip,
910 { a->mask.dst.u.all & b->mask.dst.u.all },
911 a->mask.dst.protonum & b->mask.dst.protonum } };
912
913 return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
914 }
915
916 static inline int expect_matches(const struct ip_conntrack_expect *a,
917 const struct ip_conntrack_expect *b)
918 {
919 return a->master == b->master
920 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
921 && ip_ct_tuple_equal(&a->mask, &b->mask);
922 }
923
924 /* Generally a bad idea to call this: could have matched already. */
925 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
926 {
927 struct ip_conntrack_expect *i;
928
929 write_lock_bh(&ip_conntrack_lock);
930 /* choose the oldest expectation to evict */
931 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
932 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
933 ip_ct_unlink_expect(i);
934 write_unlock_bh(&ip_conntrack_lock);
935 ip_conntrack_expect_put(i);
936 return;
937 }
938 }
939 write_unlock_bh(&ip_conntrack_lock);
940 }
941
942 /* We don't increase the master conntrack refcount for non-fulfilled
943 * expectations. During conntrack destruction, the expectations are
944 * always killed before the conntrack itself. */
945 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
946 {
947 struct ip_conntrack_expect *new;
948
949 new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
950 if (!new) {
951 DEBUGP("expect_related: OOM allocating expect\n");
952 return NULL;
953 }
954 new->master = me;
955 atomic_set(&new->use, 1);
956 return new;
957 }
958
959 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
960 {
961 if (atomic_dec_and_test(&exp->use))
962 kmem_cache_free(ip_conntrack_expect_cachep, exp);
963 }
964
965 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
966 {
967 atomic_inc(&exp->use);
968 exp->master->expecting++;
969 list_add(&exp->list, &ip_conntrack_expect_list);
970
971 init_timer(&exp->timeout);
972 exp->timeout.data = (unsigned long)exp;
973 exp->timeout.function = expectation_timed_out;
974 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
975 add_timer(&exp->timeout);
976
977 exp->id = ++ip_conntrack_expect_next_id;
978 atomic_inc(&exp->use);
979 CONNTRACK_STAT_INC(expect_create);
980 }
981
982 /* Race with expectations being used means we could have none to find; OK. */
983 static void evict_oldest_expect(struct ip_conntrack *master)
984 {
985 struct ip_conntrack_expect *i;
986
987 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
988 if (i->master == master) {
989 if (del_timer(&i->timeout)) {
990 ip_ct_unlink_expect(i);
991 ip_conntrack_expect_put(i);
992 }
993 break;
994 }
995 }
996 }
997
998 static inline int refresh_timer(struct ip_conntrack_expect *i)
999 {
1000 if (!del_timer(&i->timeout))
1001 return 0;
1002
1003 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
1004 add_timer(&i->timeout);
1005 return 1;
1006 }
1007
1008 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1009 {
1010 struct ip_conntrack_expect *i;
1011 int ret;
1012
1013 DEBUGP("ip_conntrack_expect_related %p\n", expect);
1014 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1015 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
1016
1017 write_lock_bh(&ip_conntrack_lock);
1018 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1019 if (expect_matches(i, expect)) {
1020 /* Refresh timer: if it's dying, ignore.. */
1021 if (refresh_timer(i)) {
1022 ret = 0;
1023 goto out;
1024 }
1025 } else if (expect_clash(i, expect)) {
1026 ret = -EBUSY;
1027 goto out;
1028 }
1029 }
1030
1031 /* Will be over limit? */
1032 if (expect->master->helper->max_expected &&
1033 expect->master->expecting >= expect->master->helper->max_expected)
1034 evict_oldest_expect(expect->master);
1035
1036 ip_conntrack_expect_insert(expect);
1037 ip_conntrack_expect_event(IPEXP_NEW, expect);
1038 ret = 0;
1039 out:
1040 write_unlock_bh(&ip_conntrack_lock);
1041 return ret;
1042 }
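
/* Usage sketch (not compiled): how a conntrack helper typically builds an
 * expectation with the functions above. The function name and the fixed
 * destination port are made up for illustration; real helpers such as the
 * FTP helper derive the tuple from the command-channel payload. */
#if 0
static int example_expect_data_channel(struct ip_conntrack *ct, u_int16_t port)
{
        struct ip_conntrack_expect *exp;
        int ret;

        exp = ip_conntrack_expect_alloc(ct);
        if (exp == NULL)
                return -ENOMEM;

        memset(&exp->tuple, 0, sizeof(exp->tuple));
        memset(&exp->mask, 0, sizeof(exp->mask));

        /* Expect a TCP connection from the reply-side peer back to the
         * original host on the given port; the source port stays
         * wildcarded because its mask is left zero. */
        exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip;
        exp->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip;
        exp->tuple.dst.u.tcp.port = htons(port);
        exp->tuple.dst.protonum = IPPROTO_TCP;

        exp->mask.src.ip = 0xFFFFFFFF;
        exp->mask.dst.ip = 0xFFFFFFFF;
        exp->mask.dst.u.tcp.port = 0xFFFF;
        exp->mask.dst.protonum = 0xFF;

        exp->expectfn = NULL;
        exp->flags = 0;

        ret = ip_conntrack_expect_related(exp);
        ip_conntrack_expect_put(exp);
        return ret;
}
#endif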
1043
1044 /* Alter reply tuple (maybe alter helper). This is for NAT, and is
1045 implicitly racy: see __ip_conntrack_confirm */
1046 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1047 const struct ip_conntrack_tuple *newreply)
1048 {
1049 write_lock_bh(&ip_conntrack_lock);
1050 /* Should be unconfirmed, so not in hash table yet */
1051 IP_NF_ASSERT(!is_confirmed(conntrack));
1052
1053 DEBUGP("Altering reply tuple of %p to ", conntrack);
1054 DUMP_TUPLE(newreply);
1055
1056 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1057 if (!conntrack->master && conntrack->expecting == 0)
1058 conntrack->helper = __ip_conntrack_helper_find(newreply);
1059 write_unlock_bh(&ip_conntrack_lock);
1060 }
1061
1062 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1063 {
1064 BUG_ON(me->timeout == 0);
1065 write_lock_bh(&ip_conntrack_lock);
1066 list_prepend(&helpers, me);
1067 write_unlock_bh(&ip_conntrack_lock);
1068
1069 return 0;
1070 }
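
/* Registration sketch (not compiled): the shape of a helper as registered by
 * modules such as ip_conntrack_ftp. Name, port and callback body are
 * illustrative; only the commonly used fields are filled in. */
#if 0
static int example_help(struct sk_buff **pskb,
                        struct ip_conntrack *ct,
                        enum ip_conntrack_info ctinfo)
{
        /* Inspect the payload, possibly call ip_conntrack_expect_related(). */
        return NF_ACCEPT;
}

static struct ip_conntrack_helper example_helper;

static int __init example_helper_init(void)
{
        example_helper.name = "example";
        example_helper.me = THIS_MODULE;
        example_helper.max_expected = 1;
        example_helper.timeout = 5 * 60;        /* seconds */
        /* __ip_conntrack_helper_find() is called with the reply tuple, so
         * this matches connections whose server side is TCP port 12345. */
        example_helper.tuple.src.u.tcp.port = htons(12345);
        example_helper.tuple.dst.protonum = IPPROTO_TCP;
        example_helper.mask.src.u.tcp.port = 0xFFFF;
        example_helper.mask.dst.protonum = 0xFF;
        example_helper.help = example_help;

        return ip_conntrack_helper_register(&example_helper);
}
#endif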
1071
1072 struct ip_conntrack_helper *
1073 __ip_conntrack_helper_find_byname(const char *name)
1074 {
1075 struct ip_conntrack_helper *h;
1076
1077 list_for_each_entry(h, &helpers, list) {
1078 if (!strcmp(h->name, name))
1079 return h;
1080 }
1081
1082 return NULL;
1083 }
1084
1085 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1086 const struct ip_conntrack_helper *me)
1087 {
1088 if (tuplehash_to_ctrack(i)->helper == me) {
1089 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1090 tuplehash_to_ctrack(i)->helper = NULL;
1091 }
1092 return 0;
1093 }
1094
1095 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1096 {
1097 unsigned int i;
1098 struct ip_conntrack_expect *exp, *tmp;
1099
1100 /* Need write lock here, to delete helper. */
1101 write_lock_bh(&ip_conntrack_lock);
1102 LIST_DELETE(&helpers, me);
1103
1104 /* Get rid of expectations */
1105 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1106 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1107 ip_ct_unlink_expect(exp);
1108 ip_conntrack_expect_put(exp);
1109 }
1110 }
1111 /* Clear the helper pointer on all unconfirmed and hashed conntracks that used it. */
1112 LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
1113 for (i = 0; i < ip_conntrack_htable_size; i++)
1114 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1115 struct ip_conntrack_tuple_hash *, me);
1116 write_unlock_bh(&ip_conntrack_lock);
1117
1118 /* Someone could be still looking at the helper in a bh. */
1119 synchronize_net();
1120 }
1121
1122 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1123 void __ip_ct_refresh_acct(struct ip_conntrack *ct,
1124 enum ip_conntrack_info ctinfo,
1125 const struct sk_buff *skb,
1126 unsigned long extra_jiffies,
1127 int do_acct)
1128 {
1129 int event = 0;
1130
1131 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1132 IP_NF_ASSERT(skb);
1133
1134 write_lock_bh(&ip_conntrack_lock);
1135
1136 /* Only update if this is not a fixed timeout */
1137 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1138 write_unlock_bh(&ip_conntrack_lock);
1139 return;
1140 }
1141
1142 /* If not in hash table, timer will not be active yet */
1143 if (!is_confirmed(ct)) {
1144 ct->timeout.expires = extra_jiffies;
1145 event = IPCT_REFRESH;
1146 } else {
1147 /* Need del_timer for race avoidance (may already be dying). */
1148 if (del_timer(&ct->timeout)) {
1149 ct->timeout.expires = jiffies + extra_jiffies;
1150 add_timer(&ct->timeout);
1151 event = IPCT_REFRESH;
1152 }
1153 }
1154
1155 #ifdef CONFIG_IP_NF_CT_ACCT
1156 if (do_acct) {
1157 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1158 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1159 ntohs(skb->nh.iph->tot_len);
1160 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1161 || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1162 event |= IPCT_COUNTER_FILLING;
1163 }
1164 #endif
1165
1166 write_unlock_bh(&ip_conntrack_lock);
1167
1168 /* must be unlocked when calling event cache */
1169 if (event)
1170 ip_conntrack_event_cache(event, skb);
1171 }
1172
1173 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1174 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1175 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1176 * in ip_conntrack_core, since we don't want the protocols to autoload
1177 * or depend on ctnetlink */
1178 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1179 const struct ip_conntrack_tuple *tuple)
1180 {
1181 NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1182 &tuple->src.u.tcp.port);
1183 NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1184 &tuple->dst.u.tcp.port);
1185 return 0;
1186
1187 nfattr_failure:
1188 return -1;
1189 }
1190
1191 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1192 struct ip_conntrack_tuple *t)
1193 {
1194 if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1195 return -EINVAL;
1196
1197 t->src.u.tcp.port =
1198 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1199 t->dst.u.tcp.port =
1200 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1201
1202 return 0;
1203 }
1204 #endif
1205
1206 /* Returns new sk_buff, or NULL */
1207 struct sk_buff *
1208 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1209 {
1210 skb_orphan(skb);
1211
1212 local_bh_disable();
1213 skb = ip_defrag(skb, user);
1214 local_bh_enable();
1215
1216 if (skb)
1217 ip_send_check(skb->nh.iph);
1218 return skb;
1219 }
1220
1221 /* Used by ipt_REJECT. */
1222 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1223 {
1224 struct ip_conntrack *ct;
1225 enum ip_conntrack_info ctinfo;
1226
1227 /* This ICMP is in reverse direction to the packet which caused it */
1228 ct = ip_conntrack_get(skb, &ctinfo);
1229
1230 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1231 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1232 else
1233 ctinfo = IP_CT_RELATED;
1234
1235 /* Attach to new skbuff, and increment count */
1236 nskb->nfct = &ct->ct_general;
1237 nskb->nfctinfo = ctinfo;
1238 nf_conntrack_get(nskb->nfct);
1239 }
1240
1241 static inline int
1242 do_iter(const struct ip_conntrack_tuple_hash *i,
1243 int (*iter)(struct ip_conntrack *i, void *data),
1244 void *data)
1245 {
1246 return iter(tuplehash_to_ctrack(i), data);
1247 }
1248
1249 /* Bring out ya dead! */
1250 static struct ip_conntrack_tuple_hash *
1251 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1252 void *data, unsigned int *bucket)
1253 {
1254 struct ip_conntrack_tuple_hash *h = NULL;
1255
1256 write_lock_bh(&ip_conntrack_lock);
1257 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1258 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1259 struct ip_conntrack_tuple_hash *, iter, data);
1260 if (h)
1261 break;
1262 }
1263 if (!h)
1264 h = LIST_FIND_W(&unconfirmed, do_iter,
1265 struct ip_conntrack_tuple_hash *, iter, data);
1266 if (h)
1267 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1268 write_unlock_bh(&ip_conntrack_lock);
1269
1270 return h;
1271 }
1272
1273 void
1274 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1275 {
1276 struct ip_conntrack_tuple_hash *h;
1277 unsigned int bucket = 0;
1278
1279 while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1280 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1281 /* Time to push up daisies... */
1282 if (del_timer(&ct->timeout))
1283 death_by_timeout((unsigned long)ct);
1284 /* ... else the timer will get him soon. */
1285
1286 ip_conntrack_put(ct);
1287 }
1288 }
1289
1290 /* Fast function for those who don't want to parse /proc (and I don't
1291 blame them). */
1292 /* Reversing the socket's dst/src point of view gives us the reply
1293 mapping. */
1294 static int
1295 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1296 {
1297 struct inet_sock *inet = inet_sk(sk);
1298 struct ip_conntrack_tuple_hash *h;
1299 struct ip_conntrack_tuple tuple;
1300
1301 IP_CT_TUPLE_U_BLANK(&tuple);
1302 tuple.src.ip = inet->rcv_saddr;
1303 tuple.src.u.tcp.port = inet->sport;
1304 tuple.dst.ip = inet->daddr;
1305 tuple.dst.u.tcp.port = inet->dport;
1306 tuple.dst.protonum = IPPROTO_TCP;
1307
1308 /* We only do TCP at the moment: is there a better way? */
1309 if (strcmp(sk->sk_prot->name, "TCP")) {
1310 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1311 return -ENOPROTOOPT;
1312 }
1313
1314 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1315 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1316 *len, sizeof(struct sockaddr_in));
1317 return -EINVAL;
1318 }
1319
1320 h = ip_conntrack_find_get(&tuple, NULL);
1321 if (h) {
1322 struct sockaddr_in sin;
1323 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1324
1325 sin.sin_family = AF_INET;
1326 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1327 .tuple.dst.u.tcp.port;
1328 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1329 .tuple.dst.ip;
1330 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
1331
1332 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1333 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1334 ip_conntrack_put(ct);
1335 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1336 return -EFAULT;
1337 else
1338 return 0;
1339 }
1340 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1341 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1342 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1343 return -ENOENT;
1344 }
1345
1346 static struct nf_sockopt_ops so_getorigdst = {
1347 .pf = PF_INET,
1348 .get_optmin = SO_ORIGINAL_DST,
1349 .get_optmax = SO_ORIGINAL_DST+1,
1350 .get = &getorigdst,
1351 };
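
/* Userspace illustration (never built as part of this file): a transparent
 * proxy that accepted a REDIRECTed connection can recover the pre-NAT
 * destination through the SO_ORIGINAL_DST option served by getorigdst()
 * above. The function name is illustrative. */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/netfilter_ipv4.h>       /* SO_ORIGINAL_DST */

static int example_get_original_dst(int connfd, struct sockaddr_in *dst)
{
        socklen_t len = sizeof(*dst);

        /* On success *dst holds the original destination address/port. */
        return getsockopt(connfd, SOL_IP, SO_ORIGINAL_DST, dst, &len);
}
#endif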
1352
1353 static int kill_all(struct ip_conntrack *i, void *data)
1354 {
1355 return 1;
1356 }
1357
1358 void ip_conntrack_flush(void)
1359 {
1360 ip_ct_iterate_cleanup(kill_all, NULL);
1361 }
1362
1363 static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
1364 {
1365 if (vmalloced)
1366 vfree(hash);
1367 else
1368 free_pages((unsigned long)hash,
1369 get_order(sizeof(struct list_head) * size));
1370 }
1371
1372 /* Mishearing the voices in his head, our hero wonders how he's
1373 supposed to kill the mall. */
1374 void ip_conntrack_cleanup(void)
1375 {
1376 ip_ct_attach = NULL;
1377
1378 /* This makes sure all current packets have passed through
1379 netfilter framework. Roll on, two-stage module
1380 delete... */
1381 synchronize_net();
1382
1383 ip_ct_event_cache_flush();
1384 i_see_dead_people:
1385 ip_conntrack_flush();
1386 if (atomic_read(&ip_conntrack_count) != 0) {
1387 schedule();
1388 goto i_see_dead_people;
1389 }
1390 /* wait until all references to ip_conntrack_untracked are dropped */
1391 while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1392 schedule();
1393
1394 kmem_cache_destroy(ip_conntrack_cachep);
1395 kmem_cache_destroy(ip_conntrack_expect_cachep);
1396 free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1397 ip_conntrack_htable_size);
1398 nf_unregister_sockopt(&so_getorigdst);
1399 }
1400
1401 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1402 {
1403 struct list_head *hash;
1404 unsigned int i;
1405
1406 *vmalloced = 0;
1407 hash = (void*)__get_free_pages(GFP_KERNEL,
1408 get_order(sizeof(struct list_head)
1409 * size));
1410 if (!hash) {
1411 *vmalloced = 1;
1412 printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n");
1413 hash = vmalloc(sizeof(struct list_head) * size);
1414 }
1415
1416 if (hash)
1417 for (i = 0; i < size; i++)
1418 INIT_LIST_HEAD(&hash[i]);
1419
1420 return hash;
1421 }
1422
1423 static int set_hashsize(const char *val, struct kernel_param *kp)
1424 {
1425 int i, bucket, hashsize, vmalloced;
1426 int old_vmalloced, old_size;
1427 int rnd;
1428 struct list_head *hash, *old_hash;
1429 struct ip_conntrack_tuple_hash *h;
1430
1431 /* On boot, we can set this without any fancy locking. */
1432 if (!ip_conntrack_htable_size)
1433 return param_set_int(val, kp);
1434
1435 hashsize = simple_strtol(val, NULL, 0);
1436 if (!hashsize)
1437 return -EINVAL;
1438
1439 hash = alloc_hashtable(hashsize, &vmalloced);
1440 if (!hash)
1441 return -ENOMEM;
1442
1443 /* We have to rehash for the new table anyway, so we also can
1444 * use a new random seed */
1445 get_random_bytes(&rnd, 4);
1446
1447 write_lock_bh(&ip_conntrack_lock);
1448 for (i = 0; i < ip_conntrack_htable_size; i++) {
1449 while (!list_empty(&ip_conntrack_hash[i])) {
1450 h = list_entry(ip_conntrack_hash[i].next,
1451 struct ip_conntrack_tuple_hash, list);
1452 list_del(&h->list);
1453 bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1454 list_add_tail(&h->list, &hash[bucket]);
1455 }
1456 }
1457 old_size = ip_conntrack_htable_size;
1458 old_vmalloced = ip_conntrack_vmalloc;
1459 old_hash = ip_conntrack_hash;
1460
1461 ip_conntrack_htable_size = hashsize;
1462 ip_conntrack_vmalloc = vmalloced;
1463 ip_conntrack_hash = hash;
1464 ip_conntrack_hash_rnd = rnd;
1465 write_unlock_bh(&ip_conntrack_lock);
1466
1467 free_conntrack_hash(old_hash, old_vmalloced, old_size);
1468 return 0;
1469 }
1470
1471 module_param_call(hashsize, set_hashsize, param_get_uint,
1472 &ip_conntrack_htable_size, 0600);
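
/* The 0600 permission makes the parameter writable at runtime, so the table
 * can be resized without a reload (assuming the usual sysfs layout), e.g.
 *      echo 16384 > /sys/module/ip_conntrack/parameters/hashsize
 * or set once at load time with: modprobe ip_conntrack hashsize=16384 */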
1473
1474 int __init ip_conntrack_init(void)
1475 {
1476 unsigned int i;
1477 int ret;
1478
1479 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1480 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
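/* Worked example: a 32MB i386 box has 8192 4K pages, so
 * (8192 << 12) / 16384 / sizeof(struct list_head) = 2048 / 8 = 256
 * buckets; anything >= 1GB works out to (or is capped at) 8192. */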
1481 if (!ip_conntrack_htable_size) {
1482 ip_conntrack_htable_size
1483 = (((num_physpages << PAGE_SHIFT) / 16384)
1484 / sizeof(struct list_head));
1485 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1486 ip_conntrack_htable_size = 8192;
1487 if (ip_conntrack_htable_size < 16)
1488 ip_conntrack_htable_size = 16;
1489 }
1490 ip_conntrack_max = 8 * ip_conntrack_htable_size;
1491
1492 printk("ip_conntrack version %s (%u buckets, %d max)"
1493 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1494 ip_conntrack_htable_size, ip_conntrack_max,
1495 sizeof(struct ip_conntrack));
1496
1497 ret = nf_register_sockopt(&so_getorigdst);
1498 if (ret != 0) {
1499 printk(KERN_ERR "Unable to register netfilter socket option\n");
1500 return ret;
1501 }
1502
1503 ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
1504 &ip_conntrack_vmalloc);
1505 if (!ip_conntrack_hash) {
1506 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1507 goto err_unreg_sockopt;
1508 }
1509
1510 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1511 sizeof(struct ip_conntrack), 0,
1512 0, NULL, NULL);
1513 if (!ip_conntrack_cachep) {
1514 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1515 goto err_free_hash;
1516 }
1517
1518 ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1519 sizeof(struct ip_conntrack_expect),
1520 0, 0, NULL, NULL);
1521 if (!ip_conntrack_expect_cachep) {
1522 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1523 goto err_free_conntrack_slab;
1524 }
1525
1526 /* Don't NEED lock here, but good form anyway. */
1527 write_lock_bh(&ip_conntrack_lock);
1528 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1529 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1530 /* Sew in builtin protocols. */
1531 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1532 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1533 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1534 write_unlock_bh(&ip_conntrack_lock);
1535
1536 /* For use by ipt_REJECT */
1537 ip_ct_attach = ip_conntrack_attach;
1538
1539 /* Set up fake conntrack:
1540 - to never be deleted, not in any hashes */
1541 atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1542 /* - and make it look like a confirmed connection */
1543 set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1544
1545 return ret;
1546
1547 err_free_conntrack_slab:
1548 kmem_cache_destroy(ip_conntrack_cachep);
1549 err_free_hash:
1550 free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1551 ip_conntrack_htable_size);
1552 err_unreg_sockopt:
1553 nf_unregister_sockopt(&so_getorigdst);
1554
1555 return -ENOMEM;
1556 }