[NETFILTER]: Change tunables to __read_mostly
[deliverable/linux.git] net/ipv4/netfilter/ip_conntrack_core.c
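__read_mostly groups rarely-written variables together (in the kernel's read-mostly data section) so that tunables such as ip_conntrack_htable_size, ip_conntrack_max and ip_ct_log_invalid below no longer share cache lines with frequently-written data and bounce between CPUs on every write.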
1 /* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 *
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
18 * */
19
20 #include <linux/types.h>
21 #include <linux/icmp.h>
22 #include <linux/ip.h>
23 #include <linux/netfilter.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/module.h>
26 #include <linux/skbuff.h>
27 #include <linux/proc_fs.h>
28 #include <linux/vmalloc.h>
29 #include <net/checksum.h>
30 #include <net/ip.h>
31 #include <linux/stddef.h>
32 #include <linux/sysctl.h>
33 #include <linux/slab.h>
34 #include <linux/random.h>
35 #include <linux/jhash.h>
36 #include <linux/err.h>
37 #include <linux/percpu.h>
38 #include <linux/moduleparam.h>
39 #include <linux/notifier.h>
40
41 /* ip_conntrack_lock protects the main hash table, protocol/helper/expectation
42 registrations, and conntrack timers */
43 #define ASSERT_READ_LOCK(x)
44 #define ASSERT_WRITE_LOCK(x)
45
46 #include <linux/netfilter_ipv4/ip_conntrack.h>
47 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
50 #include <linux/netfilter_ipv4/listhelp.h>
51
52 #define IP_CONNTRACK_VERSION "2.4"
53
54 #if 0
55 #define DEBUGP printk
56 #else
57 #define DEBUGP(format, args...)
58 #endif
59
60 DEFINE_RWLOCK(ip_conntrack_lock);
61
62 /* ip_conntrack_standalone needs this */
63 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
64
65 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
66 LIST_HEAD(ip_conntrack_expect_list);
67 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
68 static LIST_HEAD(helpers);
69 unsigned int ip_conntrack_htable_size __read_mostly = 0;
70 int ip_conntrack_max __read_mostly;
71 struct list_head *ip_conntrack_hash;
72 static kmem_cache_t *ip_conntrack_cachep __read_mostly;
73 static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
74 struct ip_conntrack ip_conntrack_untracked;
75 unsigned int ip_ct_log_invalid __read_mostly;
76 static LIST_HEAD(unconfirmed);
77 static int ip_conntrack_vmalloc;
78
79 static unsigned int ip_conntrack_next_id;
80 static unsigned int ip_conntrack_expect_next_id;
81 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
82 ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
83 ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
84
85 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
86
87 /* deliver cached events and clear cache entry - must be called with locally
88 * disabled softirqs */
89 static inline void
90 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
91 {
92 DEBUGP("ecache: delivering events for %p\n", ecache->ct);
93 if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
94 atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
95 ecache->ct);
96 ecache->events = 0;
97 ip_conntrack_put(ecache->ct);
98 ecache->ct = NULL;
99 }
100
101 /* Deliver all cached events for a particular conntrack. This is called
102 * by code prior to async packet handling or freeing the skb */
103 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
104 {
105 struct ip_conntrack_ecache *ecache;
106
107 local_bh_disable();
108 ecache = &__get_cpu_var(ip_conntrack_ecache);
109 if (ecache->ct == ct)
110 __ip_ct_deliver_cached_events(ecache);
111 local_bh_enable();
112 }
113
114 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
115 {
116 struct ip_conntrack_ecache *ecache;
117
118 /* take care of delivering potentially old events */
119 ecache = &__get_cpu_var(ip_conntrack_ecache);
120 BUG_ON(ecache->ct == ct);
121 if (ecache->ct)
122 __ip_ct_deliver_cached_events(ecache);
123 /* initialize for this conntrack/packet */
124 ecache->ct = ct;
125 nf_conntrack_get(&ct->ct_general);
126 }
127
128 /* flush the event cache - touches other CPUs' data and must not be called
129 * while packets are still passing through the code */
130 static void ip_ct_event_cache_flush(void)
131 {
132 struct ip_conntrack_ecache *ecache;
133 int cpu;
134
135 for_each_possible_cpu(cpu) {
136 ecache = &per_cpu(ip_conntrack_ecache, cpu);
137 if (ecache->ct)
138 ip_conntrack_put(ecache->ct);
139 }
140 }
141 #else
142 static inline void ip_ct_event_cache_flush(void) {}
143 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
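/*
 * Usage note: ip_conntrack_event_cache() (a macro from
 * <linux/netfilter_ipv4/ip_conntrack.h>) ORs event bits into the per-CPU
 * cache above; ip_ct_deliver_cached_events() later pushes them to the
 * notifier chains as a single notification per conntrack, once the packet
 * has left the conntrack code.
 */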
144
145 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
146
147 static int ip_conntrack_hash_rnd_initted;
148 static unsigned int ip_conntrack_hash_rnd;
149
150 static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
151 unsigned int size, unsigned int rnd)
152 {
153 return (jhash_3words(tuple->src.ip,
154 (tuple->dst.ip ^ tuple->dst.protonum),
155 (tuple->src.u.all | (tuple->dst.u.all << 16)),
156 rnd) % size);
157 }
158
159 static u_int32_t
160 hash_conntrack(const struct ip_conntrack_tuple *tuple)
161 {
162 return __hash_conntrack(tuple, ip_conntrack_htable_size,
163 ip_conntrack_hash_rnd);
164 }
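/*
 * Illustrative sketch (not part of the original file): how a tuple maps to
 * a hash chain.  The addresses and ports below are made up.
 */
#if 0
static void example_bucket_lookup(void)
{
	struct ip_conntrack_tuple t = {
		.src = { .ip = htonl(0x0a000001),	/* 10.0.0.1 */
			 .u = { .tcp = { .port = htons(1025) } } },
		.dst = { .ip = htonl(0x0a000002),	/* 10.0.0.2 */
			 .u = { .tcp = { .port = htons(80) } },
			 .protonum = IPPROTO_TCP },
	};
	unsigned int bucket = hash_conntrack(&t);

	/* every conntrack whose ORIGINAL or REPLY tuple hashes to this value
	 * is chained on &ip_conntrack_hash[bucket] */
	(void)bucket;
}
#endif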
165
166 int
167 ip_ct_get_tuple(const struct iphdr *iph,
168 const struct sk_buff *skb,
169 unsigned int dataoff,
170 struct ip_conntrack_tuple *tuple,
171 const struct ip_conntrack_protocol *protocol)
172 {
173 /* Should never happen */
174 if (iph->frag_off & htons(IP_OFFSET)) {
175 printk("ip_conntrack_core: Frag of proto %u.\n",
176 iph->protocol);
177 return 0;
178 }
179
180 tuple->src.ip = iph->saddr;
181 tuple->dst.ip = iph->daddr;
182 tuple->dst.protonum = iph->protocol;
183 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
184
185 return protocol->pkt_to_tuple(skb, dataoff, tuple);
186 }
187
188 int
189 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
190 const struct ip_conntrack_tuple *orig,
191 const struct ip_conntrack_protocol *protocol)
192 {
193 inverse->src.ip = orig->dst.ip;
194 inverse->dst.ip = orig->src.ip;
195 inverse->dst.protonum = orig->dst.protonum;
196 inverse->dst.dir = !orig->dst.dir;
197
198 return protocol->invert_tuple(inverse, orig);
199 }
200
201
202 /* ip_conntrack_expect helper functions */
203 void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
204 {
205 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
206 IP_NF_ASSERT(!timer_pending(&exp->timeout));
207 list_del(&exp->list);
208 CONNTRACK_STAT_INC(expect_delete);
209 exp->master->expecting--;
210 ip_conntrack_expect_put(exp);
211 }
212
213 static void expectation_timed_out(unsigned long ul_expect)
214 {
215 struct ip_conntrack_expect *exp = (void *)ul_expect;
216
217 write_lock_bh(&ip_conntrack_lock);
218 ip_ct_unlink_expect(exp);
219 write_unlock_bh(&ip_conntrack_lock);
220 ip_conntrack_expect_put(exp);
221 }
222
223 struct ip_conntrack_expect *
224 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
225 {
226 struct ip_conntrack_expect *i;
227
228 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
229 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
230 atomic_inc(&i->use);
231 return i;
232 }
233 }
234 return NULL;
235 }
236
237 /* Just find an expectation corresponding to a tuple. */
238 struct ip_conntrack_expect *
239 ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
240 {
241 struct ip_conntrack_expect *i;
242
243 read_lock_bh(&ip_conntrack_lock);
244 i = __ip_conntrack_expect_find(tuple);
245 read_unlock_bh(&ip_conntrack_lock);
246
247 return i;
248 }
249
250 /* If an expectation for this connection is found, it is deleted from the
251 * global list and then returned. */
252 static struct ip_conntrack_expect *
253 find_expectation(const struct ip_conntrack_tuple *tuple)
254 {
255 struct ip_conntrack_expect *i;
256
257 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
258 /* If the master is not in the hash table yet (i.e. the packet hasn't
259 left this machine yet), how can the other end know about the
260 expected connection? Hence these are not the droids you are
261 looking for (if the master ct never got confirmed, we'd hold a
262 reference to it and weird things would happen to future packets). */
263 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
264 && is_confirmed(i->master)) {
265 if (i->flags & IP_CT_EXPECT_PERMANENT) {
266 atomic_inc(&i->use);
267 return i;
268 } else if (del_timer(&i->timeout)) {
269 ip_ct_unlink_expect(i);
270 return i;
271 }
272 }
273 }
274 return NULL;
275 }
276
277 /* delete all expectations for this conntrack */
278 void ip_ct_remove_expectations(struct ip_conntrack *ct)
279 {
280 struct ip_conntrack_expect *i, *tmp;
281
282 /* Optimization: most connections never expect any others. */
283 if (ct->expecting == 0)
284 return;
285
286 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
287 if (i->master == ct && del_timer(&i->timeout)) {
288 ip_ct_unlink_expect(i);
289 ip_conntrack_expect_put(i);
290 }
291 }
292 }
293
294 static void
295 clean_from_lists(struct ip_conntrack *ct)
296 {
297 unsigned int ho, hr;
298
299 DEBUGP("clean_from_lists(%p)\n", ct);
300 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
301
302 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
303 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
304 LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
305 LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
306
307 /* Destroy all pending expectations */
308 ip_ct_remove_expectations(ct);
309 }
310
311 static void
312 destroy_conntrack(struct nf_conntrack *nfct)
313 {
314 struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
315 struct ip_conntrack_protocol *proto;
316
317 DEBUGP("destroy_conntrack(%p)\n", ct);
318 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
319 IP_NF_ASSERT(!timer_pending(&ct->timeout));
320
321 ip_conntrack_event(IPCT_DESTROY, ct);
322 set_bit(IPS_DYING_BIT, &ct->status);
323
324 /* To make sure we don't get any weird locking issues here:
325 * destroy_conntrack() MUST NOT be called with a write lock
326 * to ip_conntrack_lock!!! -HW */
327 proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
328 if (proto && proto->destroy)
329 proto->destroy(ct);
330
331 if (ip_conntrack_destroyed)
332 ip_conntrack_destroyed(ct);
333
334 write_lock_bh(&ip_conntrack_lock);
335 /* Expectations will have been removed in clean_from_lists,
336 * except TFTP can create an expectation on the first packet,
337 * before connection is in the list, so we need to clean here,
338 * too. */
339 ip_ct_remove_expectations(ct);
340
341 /* We overload first tuple to link into unconfirmed list. */
342 if (!is_confirmed(ct)) {
343 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
344 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
345 }
346
347 CONNTRACK_STAT_INC(delete);
348 write_unlock_bh(&ip_conntrack_lock);
349
350 if (ct->master)
351 ip_conntrack_put(ct->master);
352
353 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
354 ip_conntrack_free(ct);
355 }
356
357 static void death_by_timeout(unsigned long ul_conntrack)
358 {
359 struct ip_conntrack *ct = (void *)ul_conntrack;
360
361 write_lock_bh(&ip_conntrack_lock);
362 /* Inside lock so preempt is disabled on module removal path.
363 * Otherwise we can get spurious warnings. */
364 CONNTRACK_STAT_INC(delete_list);
365 clean_from_lists(ct);
366 write_unlock_bh(&ip_conntrack_lock);
367 ip_conntrack_put(ct);
368 }
369
370 static inline int
371 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
372 const struct ip_conntrack_tuple *tuple,
373 const struct ip_conntrack *ignored_conntrack)
374 {
375 ASSERT_READ_LOCK(&ip_conntrack_lock);
376 return tuplehash_to_ctrack(i) != ignored_conntrack
377 && ip_ct_tuple_equal(tuple, &i->tuple);
378 }
379
380 struct ip_conntrack_tuple_hash *
381 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
382 const struct ip_conntrack *ignored_conntrack)
383 {
384 struct ip_conntrack_tuple_hash *h;
385 unsigned int hash = hash_conntrack(tuple);
386
387 ASSERT_READ_LOCK(&ip_conntrack_lock);
388 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
389 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
390 CONNTRACK_STAT_INC(found);
391 return h;
392 }
393 CONNTRACK_STAT_INC(searched);
394 }
395
396 return NULL;
397 }
398
399 /* Find a connection corresponding to a tuple. */
400 struct ip_conntrack_tuple_hash *
401 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
402 const struct ip_conntrack *ignored_conntrack)
403 {
404 struct ip_conntrack_tuple_hash *h;
405
406 read_lock_bh(&ip_conntrack_lock);
407 h = __ip_conntrack_find(tuple, ignored_conntrack);
408 if (h)
409 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
410 read_unlock_bh(&ip_conntrack_lock);
411
412 return h;
413 }
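/*
 * Illustrative sketch (not part of the original file): the reference
 * discipline for lookups.  ip_conntrack_find_get() returns a referenced
 * entry, so the caller must drop it with ip_conntrack_put() when done.
 */
#if 0
static void example_lookup(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_tuple_hash *h;

	h = ip_conntrack_find_get(tuple, NULL);
	if (h) {
		struct ip_conntrack *ct = tuplehash_to_ctrack(h);

		/* ... inspect ct->status, timeouts, counters, ... */
		ip_conntrack_put(ct);	/* drop the reference from find_get */
	}
}
#endif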
414
415 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
416 unsigned int hash,
417 unsigned int repl_hash)
418 {
419 ct->id = ++ip_conntrack_next_id;
420 list_prepend(&ip_conntrack_hash[hash],
421 &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
422 list_prepend(&ip_conntrack_hash[repl_hash],
423 &ct->tuplehash[IP_CT_DIR_REPLY].list);
424 }
425
426 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
427 {
428 unsigned int hash, repl_hash;
429
430 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
431 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
432
433 write_lock_bh(&ip_conntrack_lock);
434 __ip_conntrack_hash_insert(ct, hash, repl_hash);
435 write_unlock_bh(&ip_conntrack_lock);
436 }
437
438 /* Confirm a connection given skb; places it in hash table */
439 int
440 __ip_conntrack_confirm(struct sk_buff **pskb)
441 {
442 unsigned int hash, repl_hash;
443 struct ip_conntrack *ct;
444 enum ip_conntrack_info ctinfo;
445
446 ct = ip_conntrack_get(*pskb, &ctinfo);
447
448 /* ipt_REJECT uses ip_conntrack_attach to attach related
449 ICMP/TCP RST packets in the other direction. The actual packet
450 which created the connection will be IP_CT_NEW, or IP_CT_RELATED
451 for an expected connection. */
452 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
453 return NF_ACCEPT;
454
455 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
456 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
457
458 /* We're not in the hash table, and we refuse to set up related
459 connections for unconfirmed conns. But packet copies and
460 REJECT will give spurious warnings here. */
461 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
462
463 /* No external references means no one else could have
464 confirmed us. */
465 IP_NF_ASSERT(!is_confirmed(ct));
466 DEBUGP("Confirming conntrack %p\n", ct);
467
468 write_lock_bh(&ip_conntrack_lock);
469
470 /* See if there's one in the list already, including reverse:
471 NAT could have grabbed it without realizing, since we're
472 not in the hash. If there is, we lost the race. */
473 if (!LIST_FIND(&ip_conntrack_hash[hash],
474 conntrack_tuple_cmp,
475 struct ip_conntrack_tuple_hash *,
476 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
477 && !LIST_FIND(&ip_conntrack_hash[repl_hash],
478 conntrack_tuple_cmp,
479 struct ip_conntrack_tuple_hash *,
480 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
481 /* Remove from unconfirmed list */
482 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
483
484 __ip_conntrack_hash_insert(ct, hash, repl_hash);
485 /* Timer relative to confirmation time, not original
486 setting time, otherwise we'd get timer wrap in
487 weird delay cases. */
488 ct->timeout.expires += jiffies;
489 add_timer(&ct->timeout);
490 atomic_inc(&ct->ct_general.use);
491 set_bit(IPS_CONFIRMED_BIT, &ct->status);
492 CONNTRACK_STAT_INC(insert);
493 write_unlock_bh(&ip_conntrack_lock);
494 if (ct->helper)
495 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
496 #ifdef CONFIG_IP_NF_NAT_NEEDED
497 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
498 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
499 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
500 #endif
501 ip_conntrack_event_cache(master_ct(ct) ?
502 IPCT_RELATED : IPCT_NEW, *pskb);
503
504 return NF_ACCEPT;
505 }
506
507 CONNTRACK_STAT_INC(insert_failed);
508 write_unlock_bh(&ip_conntrack_lock);
509
510 return NF_DROP;
511 }
512
513 /* Returns true if a connection corresponds to the tuple (required
514 for NAT). */
515 int
516 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
517 const struct ip_conntrack *ignored_conntrack)
518 {
519 struct ip_conntrack_tuple_hash *h;
520
521 read_lock_bh(&ip_conntrack_lock);
522 h = __ip_conntrack_find(tuple, ignored_conntrack);
523 read_unlock_bh(&ip_conntrack_lock);
524
525 return h != NULL;
526 }
527
528 /* There's a small race here where we may free a just-assured
529 connection. Too bad: we're in trouble anyway. */
530 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
531 {
532 return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
533 }
534
535 static int early_drop(struct list_head *chain)
536 {
537 /* Traverse backwards: gives us oldest, which is roughly LRU */
538 struct ip_conntrack_tuple_hash *h;
539 struct ip_conntrack *ct = NULL;
540 int dropped = 0;
541
542 read_lock_bh(&ip_conntrack_lock);
543 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
544 if (h) {
545 ct = tuplehash_to_ctrack(h);
546 atomic_inc(&ct->ct_general.use);
547 }
548 read_unlock_bh(&ip_conntrack_lock);
549
550 if (!ct)
551 return dropped;
552
553 if (del_timer(&ct->timeout)) {
554 death_by_timeout((unsigned long)ct);
555 dropped = 1;
556 CONNTRACK_STAT_INC(early_drop);
557 }
558 ip_conntrack_put(ct);
559 return dropped;
560 }
561
562 static inline int helper_cmp(const struct ip_conntrack_helper *i,
563 const struct ip_conntrack_tuple *rtuple)
564 {
565 return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
566 }
567
568 static struct ip_conntrack_helper *
569 __ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple)
570 {
571 return LIST_FIND(&helpers, helper_cmp,
572 struct ip_conntrack_helper *,
573 tuple);
574 }
575
576 struct ip_conntrack_helper *
577 ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple)
578 {
579 struct ip_conntrack_helper *helper;
580
581 /* need ip_conntrack_lock to assure that helper exists until
582 * try_module_get() is called */
583 read_lock_bh(&ip_conntrack_lock);
584
585 helper = __ip_conntrack_helper_find(tuple);
586 if (helper) {
587 /* need to increase module usage count to assure helper will
588 * not go away while the caller is e.g. busy putting a
589 * conntrack in the hash that uses the helper */
590 if (!try_module_get(helper->me))
591 helper = NULL;
592 }
593
594 read_unlock_bh(&ip_conntrack_lock);
595
596 return helper;
597 }
598
599 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
600 {
601 module_put(helper->me);
602 }
603
604 struct ip_conntrack_protocol *
605 __ip_conntrack_proto_find(u_int8_t protocol)
606 {
607 return ip_ct_protos[protocol];
608 }
609
610 /* this is guaranteed to always return a valid protocol helper, since
611 * it falls back to generic_protocol */
612 struct ip_conntrack_protocol *
613 ip_conntrack_proto_find_get(u_int8_t protocol)
614 {
615 struct ip_conntrack_protocol *p;
616
617 preempt_disable();
618 p = __ip_conntrack_proto_find(protocol);
619 if (p) {
620 if (!try_module_get(p->me))
621 p = &ip_conntrack_generic_protocol;
622 }
623 preempt_enable();
624
625 return p;
626 }
627
628 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
629 {
630 module_put(p->me);
631 }
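/*
 * Illustrative sketch (not part of the original file): the find_get/put
 * pair also takes a module reference, so the protocol tracker cannot be
 * unloaded while it is being used.  IPPROTO_UDP is just an example.
 */
#if 0
static void example_proto_usage(void)
{
	struct ip_conntrack_protocol *proto;

	proto = ip_conntrack_proto_find_get(IPPROTO_UDP);
	/* ... use proto->pkt_to_tuple(), proto->packet(), etc. ... */
	ip_conntrack_proto_put(proto);
}
#endif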
632
633 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
634 struct ip_conntrack_tuple *repl)
635 {
636 struct ip_conntrack *conntrack;
637
638 if (!ip_conntrack_hash_rnd_initted) {
639 get_random_bytes(&ip_conntrack_hash_rnd, 4);
640 ip_conntrack_hash_rnd_initted = 1;
641 }
642
643 if (ip_conntrack_max
644 && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
645 unsigned int hash = hash_conntrack(orig);
646 /* Try dropping from this hash chain. */
647 if (!early_drop(&ip_conntrack_hash[hash])) {
648 if (net_ratelimit())
649 printk(KERN_WARNING
650 "ip_conntrack: table full, dropping"
651 " packet.\n");
652 return ERR_PTR(-ENOMEM);
653 }
654 }
655
656 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
657 if (!conntrack) {
658 DEBUGP("Can't allocate conntrack.\n");
659 return ERR_PTR(-ENOMEM);
660 }
661
662 memset(conntrack, 0, sizeof(*conntrack));
663 atomic_set(&conntrack->ct_general.use, 1);
664 conntrack->ct_general.destroy = destroy_conntrack;
665 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
666 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
667 /* Don't set timer yet: wait for confirmation */
668 init_timer(&conntrack->timeout);
669 conntrack->timeout.data = (unsigned long)conntrack;
670 conntrack->timeout.function = death_by_timeout;
671
672 atomic_inc(&ip_conntrack_count);
673
674 return conntrack;
675 }
676
677 void
678 ip_conntrack_free(struct ip_conntrack *conntrack)
679 {
680 atomic_dec(&ip_conntrack_count);
681 kmem_cache_free(ip_conntrack_cachep, conntrack);
682 }
683
684 /* Allocate a new conntrack: we return -ENOMEM if classification
685 * failed due to stress. Otherwise it really is unclassifiable */
686 static struct ip_conntrack_tuple_hash *
687 init_conntrack(struct ip_conntrack_tuple *tuple,
688 struct ip_conntrack_protocol *protocol,
689 struct sk_buff *skb)
690 {
691 struct ip_conntrack *conntrack;
692 struct ip_conntrack_tuple repl_tuple;
693 struct ip_conntrack_expect *exp;
694
695 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
696 DEBUGP("Can't invert tuple.\n");
697 return NULL;
698 }
699
700 conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
701 if (conntrack == NULL || IS_ERR(conntrack))
702 return (struct ip_conntrack_tuple_hash *)conntrack;
703
704 if (!protocol->new(conntrack, skb)) {
705 ip_conntrack_free(conntrack);
706 return NULL;
707 }
708
709 write_lock_bh(&ip_conntrack_lock);
710 exp = find_expectation(tuple);
711
712 if (exp) {
713 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
714 conntrack, exp);
715 /* Welcome, Mr. Bond. We've been expecting you... */
716 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
717 conntrack->master = exp->master;
718 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
719 conntrack->mark = exp->master->mark;
720 #endif
721 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
722 defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
723 /* this is ugly, but there is no other place to put it */
724 conntrack->nat.masq_index = exp->master->nat.masq_index;
725 #endif
726 #ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
727 conntrack->secmark = exp->master->secmark;
728 #endif
729 nf_conntrack_get(&conntrack->master->ct_general);
730 CONNTRACK_STAT_INC(expect_new);
731 } else {
732 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
733
734 CONNTRACK_STAT_INC(new);
735 }
736
737 /* Overload tuple linked list to put us in unconfirmed list. */
738 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
739
740 write_unlock_bh(&ip_conntrack_lock);
741
742 if (exp) {
743 if (exp->expectfn)
744 exp->expectfn(conntrack, exp);
745 ip_conntrack_expect_put(exp);
746 }
747
748 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
749 }
750
751 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
752 static inline struct ip_conntrack *
753 resolve_normal_ct(struct sk_buff *skb,
754 struct ip_conntrack_protocol *proto,
755 int *set_reply,
756 unsigned int hooknum,
757 enum ip_conntrack_info *ctinfo)
758 {
759 struct ip_conntrack_tuple tuple;
760 struct ip_conntrack_tuple_hash *h;
761 struct ip_conntrack *ct;
762
763 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
764
765 if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
766 &tuple, proto))
767 return NULL;
768
769 /* look for tuple match */
770 h = ip_conntrack_find_get(&tuple, NULL);
771 if (!h) {
772 h = init_conntrack(&tuple, proto, skb);
773 if (!h)
774 return NULL;
775 if (IS_ERR(h))
776 return (void *)h;
777 }
778 ct = tuplehash_to_ctrack(h);
779
780 /* It exists; we have (non-exclusive) reference. */
781 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
782 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
783 /* Please set reply bit if this packet OK */
784 *set_reply = 1;
785 } else {
786 /* Once we've had two way comms, always ESTABLISHED. */
787 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
788 DEBUGP("ip_conntrack_in: normal packet for %p\n",
789 ct);
790 *ctinfo = IP_CT_ESTABLISHED;
791 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
792 DEBUGP("ip_conntrack_in: related packet for %p\n",
793 ct);
794 *ctinfo = IP_CT_RELATED;
795 } else {
796 DEBUGP("ip_conntrack_in: new packet for %p\n",
797 ct);
798 *ctinfo = IP_CT_NEW;
799 }
800 *set_reply = 0;
801 }
802 skb->nfct = &ct->ct_general;
803 skb->nfctinfo = *ctinfo;
804 return ct;
805 }
806
807 /* Netfilter hook itself. */
808 unsigned int ip_conntrack_in(unsigned int hooknum,
809 struct sk_buff **pskb,
810 const struct net_device *in,
811 const struct net_device *out,
812 int (*okfn)(struct sk_buff *))
813 {
814 struct ip_conntrack *ct;
815 enum ip_conntrack_info ctinfo;
816 struct ip_conntrack_protocol *proto;
817 int set_reply = 0;
818 int ret;
819
820 /* Previously seen (loopback or untracked)? Ignore. */
821 if ((*pskb)->nfct) {
822 CONNTRACK_STAT_INC(ignore);
823 return NF_ACCEPT;
824 }
825
826 /* Should never happen */
827 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
828 if (net_ratelimit()) {
829 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
830 (*pskb)->nh.iph->protocol, hooknum);
831 }
832 return NF_DROP;
833 }
834
835 /* Doesn't cover locally-generated broadcast, so not worth it. */
836 #if 0
837 /* Ignore broadcast: no `connection'. */
838 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
839 printk("Broadcast packet!\n");
840 return NF_ACCEPT;
841 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
842 == htonl(0x000000FF)) {
843 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
844 NIPQUAD((*pskb)->nh.iph->saddr),
845 NIPQUAD((*pskb)->nh.iph->daddr),
846 (*pskb)->sk, (*pskb)->pkt_type);
847 }
848 #endif
849
850 proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
851
852 /* It may be a special packet, error, unclean...
853 * the inverse of the return code tells the netfilter
854 * core what to do with the packet. */
855 if (proto->error != NULL
856 && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
857 CONNTRACK_STAT_INC(error);
858 CONNTRACK_STAT_INC(invalid);
859 return -ret;
860 }
861
862 if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
863 /* Not valid part of a connection */
864 CONNTRACK_STAT_INC(invalid);
865 return NF_ACCEPT;
866 }
867
868 if (IS_ERR(ct)) {
869 /* Too stressed to deal. */
870 CONNTRACK_STAT_INC(drop);
871 return NF_DROP;
872 }
873
874 IP_NF_ASSERT((*pskb)->nfct);
875
876 ret = proto->packet(ct, *pskb, ctinfo);
877 if (ret < 0) {
878 /* Invalid: the inverse of the return code tells
879 * the netfilter core what to do */
880 nf_conntrack_put((*pskb)->nfct);
881 (*pskb)->nfct = NULL;
882 CONNTRACK_STAT_INC(invalid);
883 return -ret;
884 }
885
886 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
887 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
888
889 return ret;
890 }
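/*
 * Illustrative sketch (not part of this file): ip_conntrack_in() is wired
 * into the netfilter hooks by ip_conntrack_standalone.c; a registration
 * looks roughly like this.  The hook point and priority shown are the
 * usual PRE_ROUTING ones; treat them as an assumption here.
 */
#if 0
static struct nf_hook_ops example_conntrack_in_ops = {
	.hook		= ip_conntrack_in,
	.owner		= THIS_MODULE,
	.pf		= PF_INET,
	.hooknum	= NF_IP_PRE_ROUTING,
	.priority	= NF_IP_PRI_CONNTRACK,
};
/* nf_register_hook(&example_conntrack_in_ops); */
#endif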
891
892 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
893 const struct ip_conntrack_tuple *orig)
894 {
895 return ip_ct_invert_tuple(inverse, orig,
896 __ip_conntrack_proto_find(orig->dst.protonum));
897 }
898
899 /* Would two expected things clash? */
900 static inline int expect_clash(const struct ip_conntrack_expect *a,
901 const struct ip_conntrack_expect *b)
902 {
903 /* The part covered by the intersection of the masks must differ;
904 otherwise the two expectations clash */
905 struct ip_conntrack_tuple intersect_mask
906 = { { a->mask.src.ip & b->mask.src.ip,
907 { a->mask.src.u.all & b->mask.src.u.all } },
908 { a->mask.dst.ip & b->mask.dst.ip,
909 { a->mask.dst.u.all & b->mask.dst.u.all },
910 a->mask.dst.protonum & b->mask.dst.protonum } };
911
912 return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
913 }
914
915 static inline int expect_matches(const struct ip_conntrack_expect *a,
916 const struct ip_conntrack_expect *b)
917 {
918 return a->master == b->master
919 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
920 && ip_ct_tuple_equal(&a->mask, &b->mask);
921 }
922
923 /* Generally a bad idea to call this: could have matched already. */
924 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
925 {
926 struct ip_conntrack_expect *i;
927
928 write_lock_bh(&ip_conntrack_lock);
929 /* choose the oldest expectation to evict */
930 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
931 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
932 ip_ct_unlink_expect(i);
933 write_unlock_bh(&ip_conntrack_lock);
934 ip_conntrack_expect_put(i);
935 return;
936 }
937 }
938 write_unlock_bh(&ip_conntrack_lock);
939 }
940
941 /* We don't increase the master conntrack refcount for unfulfilled
942 * expectations. During conntrack destruction, the expectations are
943 * always killed before the conntrack itself. */
944 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
945 {
946 struct ip_conntrack_expect *new;
947
948 new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
949 if (!new) {
950 DEBUGP("expect_related: OOM allocating expect\n");
951 return NULL;
952 }
953 new->master = me;
954 atomic_set(&new->use, 1);
955 return new;
956 }
957
958 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
959 {
960 if (atomic_dec_and_test(&exp->use))
961 kmem_cache_free(ip_conntrack_expect_cachep, exp);
962 }
963
964 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
965 {
966 atomic_inc(&exp->use);
967 exp->master->expecting++;
968 list_add(&exp->list, &ip_conntrack_expect_list);
969
970 init_timer(&exp->timeout);
971 exp->timeout.data = (unsigned long)exp;
972 exp->timeout.function = expectation_timed_out;
973 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
974 add_timer(&exp->timeout);
975
976 exp->id = ++ip_conntrack_expect_next_id;
977 atomic_inc(&exp->use);
978 CONNTRACK_STAT_INC(expect_create);
979 }
980
981 /* Race with expectations being used means we could have none to find; OK. */
982 static void evict_oldest_expect(struct ip_conntrack *master)
983 {
984 struct ip_conntrack_expect *i;
985
986 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
987 if (i->master == master) {
988 if (del_timer(&i->timeout)) {
989 ip_ct_unlink_expect(i);
990 ip_conntrack_expect_put(i);
991 }
992 break;
993 }
994 }
995 }
996
997 static inline int refresh_timer(struct ip_conntrack_expect *i)
998 {
999 if (!del_timer(&i->timeout))
1000 return 0;
1001
1002 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
1003 add_timer(&i->timeout);
1004 return 1;
1005 }
1006
1007 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1008 {
1009 struct ip_conntrack_expect *i;
1010 int ret;
1011
1012 DEBUGP("ip_conntrack_expect_related %p\n", expect);
1013 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1014 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
1015
1016 write_lock_bh(&ip_conntrack_lock);
1017 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1018 if (expect_matches(i, expect)) {
1019 /* Refresh timer: if it's dying, ignore.. */
1020 if (refresh_timer(i)) {
1021 ret = 0;
1022 goto out;
1023 }
1024 } else if (expect_clash(i, expect)) {
1025 ret = -EBUSY;
1026 goto out;
1027 }
1028 }
1029
1030 /* Will be over limit? */
1031 if (expect->master->helper->max_expected &&
1032 expect->master->expecting >= expect->master->helper->max_expected)
1033 evict_oldest_expect(expect->master);
1034
1035 ip_conntrack_expect_insert(expect);
1036 ip_conntrack_expect_event(IPEXP_NEW, expect);
1037 ret = 0;
1038 out:
1039 write_unlock_bh(&ip_conntrack_lock);
1040 return ret;
1041 }
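/*
 * Illustrative sketch (not part of the original file): the calling sequence
 * a helper such as the FTP helper uses to expect a related data connection.
 * The port argument and the direction choice are hypothetical.
 */
#if 0
static int example_expect_data_conn(struct ip_conntrack *ct, u_int16_t port)
{
	struct ip_conntrack_expect *exp;
	int ret;

	exp = ip_conntrack_expect_alloc(ct);
	if (exp == NULL)
		return -ENOMEM;

	memset(&exp->tuple, 0, sizeof(exp->tuple));
	memset(&exp->mask, 0, sizeof(exp->mask));

	/* expect TCP from the original destination back to the original
	 * source, to the given port, from any source port */
	exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip;
	exp->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
	exp->tuple.dst.u.tcp.port = htons(port);
	exp->tuple.dst.protonum = IPPROTO_TCP;

	exp->mask.src.ip = 0xFFFFFFFF;
	exp->mask.dst.ip = 0xFFFFFFFF;
	exp->mask.dst.u.tcp.port = 0xFFFF;
	exp->mask.dst.protonum = 0xFF;

	exp->expectfn = NULL;
	exp->flags = 0;

	ret = ip_conntrack_expect_related(exp);
	ip_conntrack_expect_put(exp);	/* drop the allocation reference */
	return ret;
}
#endif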
1042
1043 /* Alter reply tuple (maybe alter helper). This is for NAT, and is
1044 implicitly racy: see __ip_conntrack_confirm */
1045 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1046 const struct ip_conntrack_tuple *newreply)
1047 {
1048 write_lock_bh(&ip_conntrack_lock);
1049 /* Should be unconfirmed, so not in hash table yet */
1050 IP_NF_ASSERT(!is_confirmed(conntrack));
1051
1052 DEBUGP("Altering reply tuple of %p to ", conntrack);
1053 DUMP_TUPLE(newreply);
1054
1055 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1056 if (!conntrack->master && conntrack->expecting == 0)
1057 conntrack->helper = __ip_conntrack_helper_find(newreply);
1058 write_unlock_bh(&ip_conntrack_lock);
1059 }
1060
1061 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1062 {
1063 BUG_ON(me->timeout == 0);
1064 write_lock_bh(&ip_conntrack_lock);
1065 list_prepend(&helpers, me);
1066 write_unlock_bh(&ip_conntrack_lock);
1067
1068 return 0;
1069 }
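/*
 * Illustrative sketch (not part of the original file): a minimal helper
 * registration in the style of the existing helpers (FTP, TFTP, ...).
 * Port 4242 and the empty help() callback are made up.
 */
#if 0
static int example_help(struct sk_buff **pskb,
			struct ip_conntrack *ct,
			enum ip_conntrack_info ctinfo)
{
	/* parse the payload, possibly register expectations ... */
	return NF_ACCEPT;
}

static struct ip_conntrack_helper example_helper;

static int __init example_helper_init(void)
{
	example_helper.name = "example";
	example_helper.me = THIS_MODULE;
	example_helper.max_expected = 1;
	example_helper.timeout = 5 * 60;	/* expectation lifetime, seconds */
	/* matched against the REPLY tuple: TCP with server port 4242 */
	example_helper.tuple.src.u.tcp.port = htons(4242);
	example_helper.tuple.dst.protonum = IPPROTO_TCP;
	example_helper.mask.src.u.tcp.port = 0xFFFF;
	example_helper.mask.dst.protonum = 0xFF;
	example_helper.help = example_help;

	return ip_conntrack_helper_register(&example_helper);
}
#endif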
1070
1071 struct ip_conntrack_helper *
1072 __ip_conntrack_helper_find_byname(const char *name)
1073 {
1074 struct ip_conntrack_helper *h;
1075
1076 list_for_each_entry(h, &helpers, list) {
1077 if (!strcmp(h->name, name))
1078 return h;
1079 }
1080
1081 return NULL;
1082 }
1083
1084 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1085 const struct ip_conntrack_helper *me)
1086 {
1087 if (tuplehash_to_ctrack(i)->helper == me) {
1088 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1089 tuplehash_to_ctrack(i)->helper = NULL;
1090 }
1091 return 0;
1092 }
1093
1094 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1095 {
1096 unsigned int i;
1097 struct ip_conntrack_expect *exp, *tmp;
1098
1099 /* Need write lock here, to delete helper. */
1100 write_lock_bh(&ip_conntrack_lock);
1101 LIST_DELETE(&helpers, me);
1102
1103 /* Get rid of expectations */
1104 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1105 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1106 ip_ct_unlink_expect(exp);
1107 ip_conntrack_expect_put(exp);
1108 }
1109 }
1110 /* Get rid of expecteds, set helpers to NULL. */
1111 LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
1112 for (i = 0; i < ip_conntrack_htable_size; i++)
1113 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1114 struct ip_conntrack_tuple_hash *, me);
1115 write_unlock_bh(&ip_conntrack_lock);
1116
1117 /* Someone could be still looking at the helper in a bh. */
1118 synchronize_net();
1119 }
1120
1121 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1122 void __ip_ct_refresh_acct(struct ip_conntrack *ct,
1123 enum ip_conntrack_info ctinfo,
1124 const struct sk_buff *skb,
1125 unsigned long extra_jiffies,
1126 int do_acct)
1127 {
1128 int event = 0;
1129
1130 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1131 IP_NF_ASSERT(skb);
1132
1133 write_lock_bh(&ip_conntrack_lock);
1134
1135 /* Only update if this is not a fixed timeout */
1136 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1137 write_unlock_bh(&ip_conntrack_lock);
1138 return;
1139 }
1140
1141 /* If not in hash table, timer will not be active yet */
1142 if (!is_confirmed(ct)) {
1143 ct->timeout.expires = extra_jiffies;
1144 event = IPCT_REFRESH;
1145 } else {
1146 /* Need del_timer for race avoidance (may already be dying). */
1147 if (del_timer(&ct->timeout)) {
1148 ct->timeout.expires = jiffies + extra_jiffies;
1149 add_timer(&ct->timeout);
1150 event = IPCT_REFRESH;
1151 }
1152 }
1153
1154 #ifdef CONFIG_IP_NF_CT_ACCT
1155 if (do_acct) {
1156 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1157 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1158 ntohs(skb->nh.iph->tot_len);
1159 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1160 || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1161 event |= IPCT_COUNTER_FILLING;
1162 }
1163 #endif
1164
1165 write_unlock_bh(&ip_conntrack_lock);
1166
1167 /* must be unlocked when calling event cache */
1168 if (event)
1169 ip_conntrack_event_cache(event, skb);
1170 }
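/*
 * Usage note: protocol trackers normally call this through the
 * ip_ct_refresh()/ip_ct_refresh_acct() wrappers in
 * <linux/netfilter_ipv4/ip_conntrack.h>, which pass do_acct as 0 or 1
 * respectively, e.g. ip_ct_refresh_acct(ct, ctinfo, skb, timeout_jiffies).
 */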
1171
1172 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1173 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1174 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1175 * in ip_conntrack_core, since we don't want the protocols to autoload
1176 * or depend on ctnetlink */
1177 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1178 const struct ip_conntrack_tuple *tuple)
1179 {
1180 NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1181 &tuple->src.u.tcp.port);
1182 NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1183 &tuple->dst.u.tcp.port);
1184 return 0;
1185
1186 nfattr_failure:
1187 return -1;
1188 }
1189
1190 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1191 struct ip_conntrack_tuple *t)
1192 {
1193 if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1194 return -EINVAL;
1195
1196 t->src.u.tcp.port =
1197 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1198 t->dst.u.tcp.port =
1199 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1200
1201 return 0;
1202 }
1203 #endif
1204
1205 /* Returns new sk_buff, or NULL */
1206 struct sk_buff *
1207 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1208 {
1209 skb_orphan(skb);
1210
1211 local_bh_disable();
1212 skb = ip_defrag(skb, user);
1213 local_bh_enable();
1214
1215 if (skb)
1216 ip_send_check(skb->nh.iph);
1217 return skb;
1218 }
1219
1220 /* Used by ipt_REJECT. */
1221 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1222 {
1223 struct ip_conntrack *ct;
1224 enum ip_conntrack_info ctinfo;
1225
1226 /* This ICMP is in the reverse direction to the packet which caused it */
1227 ct = ip_conntrack_get(skb, &ctinfo);
1228
1229 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1230 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1231 else
1232 ctinfo = IP_CT_RELATED;
1233
1234 /* Attach to new skbuff, and increment count */
1235 nskb->nfct = &ct->ct_general;
1236 nskb->nfctinfo = ctinfo;
1237 nf_conntrack_get(nskb->nfct);
1238 }
1239
1240 static inline int
1241 do_iter(const struct ip_conntrack_tuple_hash *i,
1242 int (*iter)(struct ip_conntrack *i, void *data),
1243 void *data)
1244 {
1245 return iter(tuplehash_to_ctrack(i), data);
1246 }
1247
1248 /* Bring out ya dead! */
1249 static struct ip_conntrack_tuple_hash *
1250 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1251 void *data, unsigned int *bucket)
1252 {
1253 struct ip_conntrack_tuple_hash *h = NULL;
1254
1255 write_lock_bh(&ip_conntrack_lock);
1256 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1257 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1258 struct ip_conntrack_tuple_hash *, iter, data);
1259 if (h)
1260 break;
1261 }
1262 if (!h)
1263 h = LIST_FIND_W(&unconfirmed, do_iter,
1264 struct ip_conntrack_tuple_hash *, iter, data);
1265 if (h)
1266 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1267 write_unlock_bh(&ip_conntrack_lock);
1268
1269 return h;
1270 }
1271
1272 void
1273 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1274 {
1275 struct ip_conntrack_tuple_hash *h;
1276 unsigned int bucket = 0;
1277
1278 while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1279 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1280 /* Time to push up daisies... */
1281 if (del_timer(&ct->timeout))
1282 death_by_timeout((unsigned long)ct);
1283 /* ... else the timer will get him soon. */
1284
1285 ip_conntrack_put(ct);
1286 }
1287 }
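/*
 * Illustrative sketch (not part of the original file): killing every
 * conntrack that involves a given address; the address is made up.
 * kill_all()/ip_conntrack_flush() below are the degenerate case of this.
 */
#if 0
static int example_kill_by_addr(struct ip_conntrack *ct, void *data)
{
	u_int32_t addr = *(u_int32_t *)data;

	return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip == addr ||
	       ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip == addr;
}

/*
 *	u_int32_t addr = htonl(0x0a000001);	// 10.0.0.1
 *	ip_ct_iterate_cleanup(example_kill_by_addr, &addr);
 */
#endif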
1288
1289 /* Fast function for those who don't want to parse /proc (and I don't
1290 blame them). */
1291 /* Reversing the socket's dst/src point of view gives us the reply
1292 mapping. */
1293 static int
1294 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1295 {
1296 struct inet_sock *inet = inet_sk(sk);
1297 struct ip_conntrack_tuple_hash *h;
1298 struct ip_conntrack_tuple tuple;
1299
1300 IP_CT_TUPLE_U_BLANK(&tuple);
1301 tuple.src.ip = inet->rcv_saddr;
1302 tuple.src.u.tcp.port = inet->sport;
1303 tuple.dst.ip = inet->daddr;
1304 tuple.dst.u.tcp.port = inet->dport;
1305 tuple.dst.protonum = IPPROTO_TCP;
1306
1307 /* We only do TCP at the moment: is there a better way? */
1308 if (strcmp(sk->sk_prot->name, "TCP")) {
1309 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1310 return -ENOPROTOOPT;
1311 }
1312
1313 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1314 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1315 *len, sizeof(struct sockaddr_in));
1316 return -EINVAL;
1317 }
1318
1319 h = ip_conntrack_find_get(&tuple, NULL);
1320 if (h) {
1321 struct sockaddr_in sin;
1322 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1323
1324 sin.sin_family = AF_INET;
1325 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1326 .tuple.dst.u.tcp.port;
1327 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1328 .tuple.dst.ip;
1329 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
1330
1331 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1332 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1333 ip_conntrack_put(ct);
1334 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1335 return -EFAULT;
1336 else
1337 return 0;
1338 }
1339 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1340 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1341 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1342 return -ENOENT;
1343 }
1344
1345 static struct nf_sockopt_ops so_getorigdst = {
1346 .pf = PF_INET,
1347 .get_optmin = SO_ORIGINAL_DST,
1348 .get_optmax = SO_ORIGINAL_DST+1,
1349 .get = &getorigdst,
1350 };
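/*
 * Illustrative userspace sketch (not kernel code, not part of the original
 * file): how a transparent proxy queries the pre-NAT destination of a
 * redirected TCP connection via the sockopt registered above.
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/netfilter_ipv4.h>	/* SO_ORIGINAL_DST */

static int example_get_original_dst(int connfd, struct sockaddr_in *dst)
{
	socklen_t len = sizeof(*dst);

	return getsockopt(connfd, SOL_IP, SO_ORIGINAL_DST, dst, &len);
}
#endif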
1351
1352 static int kill_all(struct ip_conntrack *i, void *data)
1353 {
1354 return 1;
1355 }
1356
1357 void ip_conntrack_flush(void)
1358 {
1359 ip_ct_iterate_cleanup(kill_all, NULL);
1360 }
1361
1362 static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
1363 {
1364 if (vmalloced)
1365 vfree(hash);
1366 else
1367 free_pages((unsigned long)hash,
1368 get_order(sizeof(struct list_head) * size));
1369 }
1370
1371 /* Mishearing the voices in his head, our hero wonders how he's
1372 supposed to kill the mall. */
1373 void ip_conntrack_cleanup(void)
1374 {
1375 ip_ct_attach = NULL;
1376
1377 /* This makes sure all current packets have passed through
1378 netfilter framework. Roll on, two-stage module
1379 delete... */
1380 synchronize_net();
1381
1382 ip_ct_event_cache_flush();
1383 i_see_dead_people:
1384 ip_conntrack_flush();
1385 if (atomic_read(&ip_conntrack_count) != 0) {
1386 schedule();
1387 goto i_see_dead_people;
1388 }
1389 /* wait until all references to ip_conntrack_untracked are dropped */
1390 while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1391 schedule();
1392
1393 kmem_cache_destroy(ip_conntrack_cachep);
1394 kmem_cache_destroy(ip_conntrack_expect_cachep);
1395 free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1396 ip_conntrack_htable_size);
1397 nf_unregister_sockopt(&so_getorigdst);
1398 }
1399
1400 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1401 {
1402 struct list_head *hash;
1403 unsigned int i;
1404
1405 *vmalloced = 0;
1406 hash = (void*)__get_free_pages(GFP_KERNEL,
1407 get_order(sizeof(struct list_head)
1408 * size));
1409 if (!hash) {
1410 *vmalloced = 1;
1411 printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n");
1412 hash = vmalloc(sizeof(struct list_head) * size);
1413 }
1414
1415 if (hash)
1416 for (i = 0; i < size; i++)
1417 INIT_LIST_HEAD(&hash[i]);
1418
1419 return hash;
1420 }
1421
1422 static int set_hashsize(const char *val, struct kernel_param *kp)
1423 {
1424 int i, bucket, hashsize, vmalloced;
1425 int old_vmalloced, old_size;
1426 int rnd;
1427 struct list_head *hash, *old_hash;
1428 struct ip_conntrack_tuple_hash *h;
1429
1430 /* On boot, we can set this without any fancy locking. */
1431 if (!ip_conntrack_htable_size)
1432 return param_set_int(val, kp);
1433
1434 hashsize = simple_strtol(val, NULL, 0);
1435 if (!hashsize)
1436 return -EINVAL;
1437
1438 hash = alloc_hashtable(hashsize, &vmalloced);
1439 if (!hash)
1440 return -ENOMEM;
1441
1442 /* We have to rehash for the new table anyway, so we can also
1443 * use a new random seed */
1444 get_random_bytes(&rnd, 4);
1445
1446 write_lock_bh(&ip_conntrack_lock);
1447 for (i = 0; i < ip_conntrack_htable_size; i++) {
1448 while (!list_empty(&ip_conntrack_hash[i])) {
1449 h = list_entry(ip_conntrack_hash[i].next,
1450 struct ip_conntrack_tuple_hash, list);
1451 list_del(&h->list);
1452 bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1453 list_add_tail(&h->list, &hash[bucket]);
1454 }
1455 }
1456 old_size = ip_conntrack_htable_size;
1457 old_vmalloced = ip_conntrack_vmalloc;
1458 old_hash = ip_conntrack_hash;
1459
1460 ip_conntrack_htable_size = hashsize;
1461 ip_conntrack_vmalloc = vmalloced;
1462 ip_conntrack_hash = hash;
1463 ip_conntrack_hash_rnd = rnd;
1464 write_unlock_bh(&ip_conntrack_lock);
1465
1466 free_conntrack_hash(old_hash, old_vmalloced, old_size);
1467 return 0;
1468 }
1469
1470 module_param_call(hashsize, set_hashsize, param_get_uint,
1471 &ip_conntrack_htable_size, 0600);
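/*
 * Usage note: with the 0600 module parameter above, the hash size can be
 * changed at runtime, e.g. (path assumes a modular build named
 * ip_conntrack):
 *
 *	echo 16384 > /sys/module/ip_conntrack/parameters/hashsize
 *
 * set_hashsize() then rehashes every existing entry into the new table
 * under ip_conntrack_lock.
 */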
1472
1473 int __init ip_conntrack_init(void)
1474 {
1475 unsigned int i;
1476 int ret;
1477
1478 /* Idea from tcp.c: use 1/16384 of memory. On i386: a 32MB
1479 * machine has 256 buckets; >= 1GB machines have 8192 buckets. */
1480 if (!ip_conntrack_htable_size) {
1481 ip_conntrack_htable_size
1482 = (((num_physpages << PAGE_SHIFT) / 16384)
1483 / sizeof(struct list_head));
1484 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1485 ip_conntrack_htable_size = 8192;
1486 if (ip_conntrack_htable_size < 16)
1487 ip_conntrack_htable_size = 16;
1488 }
1489 ip_conntrack_max = 8 * ip_conntrack_htable_size;
1490
1491 printk("ip_conntrack version %s (%u buckets, %d max)"
1492 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1493 ip_conntrack_htable_size, ip_conntrack_max,
1494 sizeof(struct ip_conntrack));
1495
1496 ret = nf_register_sockopt(&so_getorigdst);
1497 if (ret != 0) {
1498 printk(KERN_ERR "Unable to register netfilter socket option\n");
1499 return ret;
1500 }
1501
1502 ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
1503 &ip_conntrack_vmalloc);
1504 if (!ip_conntrack_hash) {
1505 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1506 goto err_unreg_sockopt;
1507 }
1508
1509 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1510 sizeof(struct ip_conntrack), 0,
1511 0, NULL, NULL);
1512 if (!ip_conntrack_cachep) {
1513 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1514 goto err_free_hash;
1515 }
1516
1517 ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1518 sizeof(struct ip_conntrack_expect),
1519 0, 0, NULL, NULL);
1520 if (!ip_conntrack_expect_cachep) {
1521 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1522 goto err_free_conntrack_slab;
1523 }
1524
1525 /* Don't NEED lock here, but good form anyway. */
1526 write_lock_bh(&ip_conntrack_lock);
1527 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1528 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1529 /* Sew in builtin protocols. */
1530 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1531 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1532 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1533 write_unlock_bh(&ip_conntrack_lock);
1534
1535 /* For use by ipt_REJECT */
1536 ip_ct_attach = ip_conntrack_attach;
1537
1538 /* Set up fake conntrack:
1539 - to never be deleted, not in any hashes */
1540 atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1541 /* - and make it look like a confirmed connection */
1542 set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1543
1544 return ret;
1545
1546 err_free_conntrack_slab:
1547 kmem_cache_destroy(ip_conntrack_cachep);
1548 err_free_hash:
1549 free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1550 ip_conntrack_htable_size);
1551 err_unreg_sockopt:
1552 nf_unregister_sockopt(&so_getorigdst);
1553
1554 return -ENOMEM;
1555 }