Commit | Line | Data |
---|---|---|
304a1618 ACM |
1 | /* |
2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | |
3 | * operating system. INET is implemented using the BSD Socket | |
4 | * interface as the means of communication with the user level. | |
5 | * | |
6 | * Authors: Lotsa people, from code originally in tcp | |
7 | * | |
8 | * This program is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU General Public License | |
10 | * as published by the Free Software Foundation; either version | |
11 | * 2 of the License, or (at your option) any later version. | |
12 | */ | |
13 | ||
14 | #ifndef _INET_HASHTABLES_H | |
15 | #define _INET_HASHTABLES_H | |
16 | ||
77d8bf9c ACM |
17 | #include <linux/ip.h> |
18 | #include <linux/list.h> | |
19 | #include <linux/slab.h> | |
20 | #include <linux/spinlock.h> | |
304a1618 ACM |
21 | #include <linux/types.h> |
22 | ||
77d8bf9c ACM |
23 | /* This is for all connections with a full identity, no wildcards. |
24 | * New scheme, half the table is for TIME_WAIT, the other half is | |
25 | * for the rest. I'll experiment with dynamic table growth later. | |
26 | */ | |
27 | struct inet_ehash_bucket { | |
28 | rwlock_t lock; | |
29 | struct hlist_head chain; | |
30 | } __attribute__((__aligned__(8))); | |
31 | ||
32 | /* There are a few simple rules, which allow for local port reuse by | |
33 | * an application. In essence: | |
34 | * | |
35 | * 1) Sockets bound to different interfaces may share a local port. | |
36 | * Failing that, goto test 2. | |
37 | * 2) If all sockets have sk->sk_reuse set, and none of them are in | |
38 | * TCP_LISTEN state, the port may be shared. | |
39 | * Failing that, goto test 3. | |
40 | * 3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local | |
41 | * address, and none of them are the same, the port may be | |
42 | * shared. | |
43 | * Failing this, the port cannot be shared. | |
44 | * | |
45 | * The interesting point, is test #2. This is what an FTP server does | |
46 | * all day. To optimize this case we use a specific flag bit defined | |
47 | * below. As we add sockets to a bind bucket list, we perform a | |
48 | * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN)) | |
49 | * As long as all sockets added to a bind bucket pass this test, | |
50 | * the flag bit will be set. | |
51 | * The resulting situation is that tcp_v[46]_verify_bind() can just check | |
52 | * for this flag bit, if it is set and the socket trying to bind has | |
53 | * sk->sk_reuse set, we don't even have to walk the owners list at all, | |
54 | * we return that it is ok to bind this socket to the requested local port. | |
55 | * | |
56 | * Sounds like a lot of work, but it is worth it. In a more naive | |
57 | * implementation (ie. current FreeBSD etc.) the entire list of ports | |
58 | * must be walked for each data port opened by an ftp server. Needless | |
59 | * to say, this does not scale at all. With a couple thousand FTP | |
60 | * users logged onto your box, isn't it nice to know that new data | |
61 | * ports are created in O(1) time? I thought so. ;-) -DaveM | |
62 | */ | |
63 | struct inet_bind_bucket { | |
64 | unsigned short port; | |
65 | signed short fastreuse; | |
66 | struct hlist_node node; | |
67 | struct hlist_head owners; | |
68 | }; | |
69 | ||
/* Walk an hlist of struct inet_bind_bucket entries.
 *
 *	tb   - struct inet_bind_bucket * loop cursor
 *	pos  - struct hlist_node * loop cursor
 *	head - the hlist head to iterate over
 *
 * The last argument to hlist_for_each_entry() is the *member name* inside
 * struct inet_bind_bucket and must always be the literal token 'node'.
 * The cursor parameter is therefore called 'pos': if it were also called
 * 'node', the caller's cursor name would be substituted for the member
 * name as well, and the macro would only work for cursors named 'node'.
 */
#define inet_bind_bucket_for_each(tb, pos, head) \
	hlist_for_each_entry(tb, pos, head, node)
72 | ||
73 | struct inet_bind_hashbucket { | |
74 | spinlock_t lock; | |
75 | struct hlist_head chain; | |
76 | }; | |
77 | ||
/* This is for listening sockets, thus all sockets which possess wildcards.
 * The value is used as an array length and, minus one, as a hash mask, so it
 * has to stay a power of two.
 */
#define INET_LHTABLE_SIZE	32	/* Yes, really, this is all you need. */
80 | ||
81 | struct inet_hashinfo { | |
82 | /* This is for sockets with full identity only. Sockets here will | |
83 | * always be without wildcards and will have the following invariant: | |
84 | * | |
85 | * TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE | |
86 | * | |
87 | * First half of the table is for sockets not in TIME_WAIT, second half | |
88 | * is for TIME_WAIT sockets only. | |
89 | */ | |
90 | struct inet_ehash_bucket *ehash; | |
91 | ||
92 | /* Ok, let's try this, I give up, we do need a local binding | |
93 | * TCP hash as well as the others for fast bind/connect. | |
94 | */ | |
95 | struct inet_bind_hashbucket *bhash; | |
96 | ||
97 | int bhash_size; | |
98 | int ehash_size; | |
99 | ||
100 | /* All sockets in TCP_LISTEN state will be in here. This is the only | |
101 | * table where wildcard'd TCP sockets can exist. Hash function here | |
102 | * is just local port number. | |
103 | */ | |
104 | struct hlist_head listening_hash[INET_LHTABLE_SIZE]; | |
105 | ||
106 | /* All the above members are written once at bootup and | |
107 | * never written again _or_ are predominantly read-access. | |
108 | * | |
109 | * Now align to a new cache line as all the following members | |
110 | * are often dirty. | |
111 | */ | |
112 | rwlock_t lhash_lock ____cacheline_aligned; | |
113 | atomic_t lhash_users; | |
114 | wait_queue_head_t lhash_wait; | |
115 | spinlock_t portalloc_lock; | |
116 | }; | |
117 | ||
304a1618 ACM |
/* Hash a (local address, local port, foreign address, foreign port) tuple
 * to a slot index.
 *
 * The four values are XORed together, the upper bits are folded into the
 * lower ones (>> 16, then >> 8) and the result is masked with
 * (ehash_size - 1).  The mask only yields a uniform index in
 * [0, ehash_size) when ehash_size is a power of two.
 *
 * The mixing is done in an unsigned int on purpose: with a signed int a
 * tuple whose XOR has bit 31 set would depend on implementation-defined
 * conversion and on how the compiler right-shifts negative values.
 */
static inline int inet_ehashfn(const __u32 laddr, const __u16 lport,
			       const __u32 faddr, const __u16 fport,
			       const int ehash_size)
{
	unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);

	h ^= h >> 16;
	h ^= h >> 8;
	return (int)(h & (unsigned int)(ehash_size - 1));
}
127 | ||
128 | static inline int inet_sk_ehashfn(const struct sock *sk, const int ehash_size) | |
129 | { | |
130 | const struct inet_sock *inet = inet_sk(sk); | |
131 | const __u32 laddr = inet->rcv_saddr; | |
132 | const __u16 lport = inet->num; | |
133 | const __u32 faddr = inet->daddr; | |
134 | const __u16 fport = inet->dport; | |
135 | ||
136 | return inet_ehashfn(laddr, lport, faddr, fport, ehash_size); | |
137 | } | |
138 | ||
77d8bf9c ACM |
139 | extern struct inet_bind_bucket * |
140 | inet_bind_bucket_create(kmem_cache_t *cachep, | |
141 | struct inet_bind_hashbucket *head, | |
142 | const unsigned short snum); | |
143 | extern void inet_bind_bucket_destroy(kmem_cache_t *cachep, | |
144 | struct inet_bind_bucket *tb); | |
145 | ||
/* Map a local port to a slot index by masking it with (bhash_size - 1).
 * The result lies in [0, bhash_size) provided bhash_size is a power of two.
 */
static inline int inet_bhashfn(const __u16 lport, const int bhash_size)
{
	return lport & (bhash_size - 1);
}
150 | ||
151 | /* These can have wildcards, don't try too hard. */ | |
152 | static inline int inet_lhashfn(const unsigned short num) | |
153 | { | |
154 | return num & (INET_LHTABLE_SIZE - 1); | |
155 | } | |
156 | ||
157 | static inline int inet_sk_listen_hashfn(const struct sock *sk) | |
158 | { | |
159 | return inet_lhashfn(inet_sk(sk)->num); | |
160 | } | |
161 | ||
304a1618 | 162 | #endif /* _INET_HASHTABLES_H */ |